From 8216117f210e92c229f2c4069a10c07d63a7077e Mon Sep 17 00:00:00 2001 From: Michael Grady Date: Wed, 3 Feb 2010 02:59:35 +0000 Subject: [PATCH] ICU-4521 UText-based regex implementation X-SVN-Rev: 27482 --- .gitattributes | 2 + icu4c/source/common/unicode/utext.h | 160 +- icu4c/source/common/utext.cpp | 385 ++- icu4c/source/i18n/Makefile.in | 2 +- icu4c/source/i18n/i18n.vcproj | 8 + icu4c/source/i18n/regexcmp.cpp | 283 +- icu4c/source/i18n/regexcmp.h | 22 +- icu4c/source/i18n/regeximp.h | 7 +- icu4c/source/i18n/regexst.cpp | 8 +- icu4c/source/i18n/regexst.h | 5 +- icu4c/source/i18n/regextxt.cpp | 45 + icu4c/source/i18n/regextxt.h | 48 + icu4c/source/i18n/rematch.cpp | 4088 ++++++++++++++++++++--- icu4c/source/i18n/repattrn.cpp | 231 +- icu4c/source/i18n/unicode/regex.h | 502 ++- icu4c/source/i18n/unicode/uregex.h | 297 +- icu4c/source/i18n/uregex.cpp | 709 +++- icu4c/source/test/cintltst/reapits.c | 720 +++- icu4c/source/test/intltest/regextst.cpp | 2118 +++++++++++- icu4c/source/test/intltest/utxttest.cpp | 474 ++- icu4c/source/test/intltest/utxttest.h | 3 +- 21 files changed, 9262 insertions(+), 855 deletions(-) create mode 100644 icu4c/source/i18n/regextxt.cpp create mode 100644 icu4c/source/i18n/regextxt.h diff --git a/.gitattributes b/.gitattributes index 57a7188ff51..5be3af74f7b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -54,6 +54,8 @@ icu4c/source/data/in/nfkc.nrm -text icu4c/source/data/in/nfkc_cf.nrm -text icu4c/source/data/in/unorm.icu -text icu4c/source/data/locales/pool.res -text +icu4c/source/i18n/regextxt.cpp -text +icu4c/source/i18n/regextxt.h -text icu4c/source/samples/ucnv/data02.bin -text icu4c/source/test/perf/README -text icu4c/source/test/testdata/TestFont1.otf -text diff --git a/icu4c/source/common/unicode/utext.h b/icu4c/source/common/unicode/utext.h index 9f3ab5e6884..95b8dc54407 100644 --- a/icu4c/source/common/unicode/utext.h +++ b/icu4c/source/common/unicode/utext.h @@ -1,7 +1,7 @@ /* 
******************************************************************************* * -* Copyright (C) 2004-2009, International Business Machines +* Copyright (C) 2004-2010, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -136,6 +136,7 @@ #include "unicode/utypes.h" +#include "unicode/uchar.h" #if U_SHOW_CPLUSPLUS_API #include "unicode/localpointer.h" #include "unicode/rep.h" @@ -674,6 +675,148 @@ utext_extract(UText *ut, UErrorCode *status); +/** + * Compare two UTexts (binary order). The comparison begins at each source text's + * iteration position. The iteration position of each UText will be left following + * the last character compared. + * + * The comparison is done in code point order; unlike u_strCompare, you + * cannot choose to use code unit order. This is because the characters + * in a UText are accessed one code point at a time, and may not be from a UTF-16 + * context. + * + * This function works with strings of different explicitly specified lengths + * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. + * A length argument of -1 signifies that as much of the string should be used as + * is necessary to compare with the other string. If both length arguments are -1, + * the entire remaining portions of both strings are used. + * + * @param s1 First source string. + * @param length1 Length of first source string in UTF-32 code points. + * + * @param s2 Second source string. + * @param length2 Length of second source string in UTF-32 code points. + * + * @return <0 or 0 or >0 as usual for string comparisons + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL int32_t U_EXPORT2 +utext_compare(UText *s1, int32_t length1, + UText *s2, int32_t length2); + +/** + * Compare two UTexts (binary order). The comparison begins at each source text's + * iteration position. 
The iteration position of each UText will be left following + * the last character compared. This method differs from utext_compare in that + * it accepts native limits rather than lengths for each string. + * + * The comparison is done in code point order; unlike u_strCompare, you + * cannot choose to use code unit order. This is because the characters + * in a UText are accessed one code point at a time, and may not be from a UTF-16 + * context. + * + * This function works with strings of different explicitly specified lengths + * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. + * A limit argument of -1 signifies that as much of the string should be used as + * is necessary to compare with the other string. If both limit arguments are -1, + * the entire remaining portions of both strings are used. + * + * @param s1 First source string. + * @param limit1 Native index of the last character in the first source string to be considered. + * + * @param s2 Second source string. + * @param limit2 Native index of the last character in the second source string to be considered. + * + * @return <0 or 0 or >0 as usual for string comparisons + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL int32_t U_EXPORT2 +utext_compareNativeLimit(UText *s1, int64_t limit1, + UText *s2, int64_t limit2); + +/** + * Compare two UTexts case-insensitively using full case folding. The comparison + * begins at each source text's iteration position. The iteration position of each + * UText will be left following the last character compared. + * + * The comparison is done in code point order; this is because the characters + * in a UText are accessed one code point at a time, and may not be from a UTF-16 + * context. + * + * This function works with strings of different explicitly specified lengths + * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. 
+ * A length argument of -1 signifies that as much of the string should be used as + * is necessary to compare with the other string. If both length arguments are -1, + * the entire remaining portions of both strings are used. + * + * @param s1 First source string. + * @param length1 Length of first source string in UTF-32 code points. + * + * @param s2 Second source string. + * @param length2 Length of second source string in UTF-32 code points. + * + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code point order with default case folding. + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * + * @return <0 or 0 or >0 as usual for string comparisons + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL int32_t U_EXPORT2 +utext_caseCompare(UText *s1, int32_t length1, + UText *s2, int32_t length2, + uint32_t options, UErrorCode *pErrorCode); + +/** + * Compare two UTexts case-insensitively using full case folding. The comparison + * begins at each source text's iteration position. The iteration position of each + * UText will be left following the last character compared. This method differs from + * utext_caseCompare in that it accepts native limits rather than lengths for each + * string. + * + * The comparison is done in code point order; this is because the characters + * in a UText are accessed one code point at a time, and may not be from a UTF-16 + * context. + * + * This function works with strings of different explicitly specified lengths + * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. + * A limit argument of -1 signifies that as much of the string should be used as + * is necessary to compare with the other string. If both limit arguments are -1, + * the entire remaining portions of both strings are used. 
+ * + * @param s1 First source string. + * @param limit1 Native index of the last character in the first source string to be considered. + * + * @param s2 Second source string. + * @param limit2 Native index of the last character in the second source string to be considered. + * + * @param options A bit set of options: + * - U_FOLD_CASE_DEFAULT or 0 is used for default options: + * Comparison in code point order with default case folding. + * + * - U_FOLD_CASE_EXCLUDE_SPECIAL_I + * + * @param pErrorCode Must be a valid pointer to an error code value, + * which must not indicate a failure before the function call. + * + * @return <0 or 0 or >0 as usual for string comparisons + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL int32_t U_EXPORT2 +utext_caseCompareNativeLimit(UText *s1, int64_t limit1, + UText *s2, int64_t limit2, + uint32_t options, UErrorCode *pErrorCode); + + /************************************************************************************ * * #define inline versions of selected performance-critical text access functions @@ -689,6 +832,19 @@ utext_extract(UText *ut, * ************************************************************************************/ +/** + * inline version of utext_current32(), for performance-critical situations. + * + * Get the code point at the current iteration position of the UText. + * Returns U_SENTINEL (-1) if the position is at the end of the + * text. + * + * @internal ICU 4.4 technology preview + */ +#define UTEXT_CURRENT32(ut) \ + ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \ + ((ut)->chunkContents)[((ut)->chunkOffset)] : utext_current32(ut)) + /** * inline version of utext_next32(), for performance-critical situations. 
* @@ -1291,8 +1447,8 @@ struct UTextFuncs { * (private) Spare function pointer * @internal */ - UTextClose *spare1; + /** * (private) Spare function pointer * @internal diff --git a/icu4c/source/common/utext.cpp b/icu4c/source/common/utext.cpp index e1ea672bab9..5173ca4b969 100644 --- a/icu4c/source/common/utext.cpp +++ b/icu4c/source/common/utext.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2005-2009, International Business Machines +* Copyright (C) 2005-2010, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -23,6 +23,7 @@ #include "cmemory.h" #include "cstring.h" #include "uassert.h" +#include "putilimp.h" U_NAMESPACE_USE @@ -450,6 +451,361 @@ utext_equals(const UText *a, const UText *b) { return TRUE; } +U_CAPI int32_t U_EXPORT2 +utext_compare(UText *s1, int32_t length1, + UText *s2, int32_t length2) { + UChar32 c1, c2; + + if(length1<0 && length2<0) { + /* strcmp style, go until end of string */ + for(;;) { + c1 = UTEXT_NEXT32(s1); + c2 = UTEXT_NEXT32(s2); + if(c1 != c2) { + break; + } else if(c1 == U_SENTINEL) { + return 0; + } + } + } else { + if(length1 < 0) { + length1 = INT32_MIN; + } else if (length2 < 0) { + length2 = INT32_MIN; + } + + /* memcmp/UnicodeString style, both length-specified */ + while((length1 > 0 || length1 == INT32_MIN) && (length2 > 0 || length2 == INT32_MIN)) { + c1 = UTEXT_NEXT32(s1); + c2 = UTEXT_NEXT32(s2); + + if(c1 != c2) { + break; + } else if(c1 == U_SENTINEL) { + return 0; + } + + if (length1 != INT32_MIN) { + length1 -= 1; + } + if (length2 != INT32_MIN) { + length2 -= 1; + } + } + + if(length1 <= 0 && length1 != INT32_MIN) { + if(length2 <= 0) { + return 0; + } else { + return -1; + } + } else if(length2 <= 0 && length2 != INT32_MIN) { + if (length1 <= 0) { + return 0; + } else { + return 1; + } + } + } + + return 
(int32_t)c1-(int32_t)c2; +} + +U_CAPI int32_t U_EXPORT2 +utext_compareNativeLimit(UText *s1, int64_t limit1, + UText *s2, int64_t limit2) { + UChar32 c1, c2; + + if(limit1<0 && limit2<0) { + /* strcmp style, go until end of string */ + for(;;) { + c1 = UTEXT_NEXT32(s1); + c2 = UTEXT_NEXT32(s2); + if(c1 != c2) { + return (int32_t)c1-(int32_t)c2; + } else if(c1 == U_SENTINEL) { + return 0; + } + } + } else { + /* memcmp/UnicodeString style, both length-specified */ + int64_t index1 = (limit1 >= 0 ? UTEXT_GETNATIVEINDEX(s1) : 0); + int64_t index2 = (limit2 >= 0 ? UTEXT_GETNATIVEINDEX(s2) : 0); + + while((limit1 < 0 || index1 < limit1) && (limit2 < 0 || index2 < limit2)) { + c1 = UTEXT_NEXT32(s1); + c2 = UTEXT_NEXT32(s2); + + if(c1 != c2) { + return (int32_t)c1-(int32_t)c2; + } else if(c1 == U_SENTINEL) { + return 0; + } + + if (limit1 >= 0) { + index1 = UTEXT_GETNATIVEINDEX(s1); + } + if (limit2 >= 0) { + index2 = UTEXT_GETNATIVEINDEX(s2); + } + } + + if(limit1 >= 0 && index1 >= limit1) { + if(index2 >= limit2) { + return 0; + } else { + return -1; + } + } else { + if(index1 >= limit1) { + return 0; + } else { + return 1; + } + } + } +} + +U_CAPI int32_t U_EXPORT2 +utext_caseCompare(UText *s1, int32_t length1, + UText *s2, int32_t length2, + uint32_t options, UErrorCode *pErrorCode) { + const UCaseProps *csp; + + /* case folding variables */ + const UChar *p; + int32_t length; + + /* case folding buffers, only use current-level start/limit */ + UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; + int32_t foldOffset1, foldOffset2, foldLength1, foldLength2; + + /* current code points */ + UChar32 c1, c2; + uint8_t cLength1, cLength2; + + /* argument checking */ + if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { + return 0; + } + if(s1==NULL || s2==NULL) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + csp=ucase_getSingleton(pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return 0; + } + + /* for variable-length strings */ + if(length1 < 0) 
{ + length1 = INT32_MIN; + } + if (length2 < 0) { + length2 = INT32_MIN; + } + + /* initialize */ + foldOffset1 = foldOffset2 = foldLength1 = foldLength2 = 0; + + /* comparison loop */ + while((foldOffset1 < foldLength1 || length1 > 0 || length1 == INT32_MIN) && + (foldOffset2 < foldLength2 || length2 > 0 || length2 == INT32_MIN)) { + if(foldOffset1 < foldLength1) { + U16_NEXT_UNSAFE(fold1, foldOffset1, c1); + cLength1 = 0; + } else { + c1 = UTEXT_NEXT32(s1); + if (c1 != U_SENTINEL) { + cLength1 = U16_LENGTH(c1); + + length = ucase_toFullFolding(csp, c1, &p, options); + if(length >= 0) { + if(length <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle 0-length folded-case strings + u_memcpy(fold1, p, length); + foldOffset1 = 0; + foldLength1 = length; + U16_NEXT_UNSAFE(fold1, foldOffset1, c1); + } else { + c1 = length; + } + } + } + + if(length1 != INT32_MIN) { + length1 -= 1; + } + } + + if(foldOffset2 < foldLength2) { + U16_NEXT_UNSAFE(fold2, foldOffset2, c2); + cLength2 = 0; + } else { + c2 = UTEXT_NEXT32(s2); + if (c2 != U_SENTINEL) { + cLength2 = U16_LENGTH(c2); + + length = ucase_toFullFolding(csp, c2, &p, options); + if(length >= 0) { + if(length <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle 0-length folded-case strings + u_memcpy(fold2, p, length); + foldOffset2 = 0; + foldLength2 = length; + U16_NEXT_UNSAFE(fold2, foldOffset2, c2); + } else { + c2 = length; + } + } + } else if(c1 == U_SENTINEL) { + return 0; // end of both strings at once + } + + if(length2 != INT32_MIN) { + length2 -= 1; + } + } + + if(c1 != c2) { + return (int32_t)c1-(int32_t)c2; + } + } + + /* By now at least one of the strings is out of characters */ + length1 += foldLength1 - foldOffset1; + length2 += foldLength2 - foldOffset2; + + if(length1 <= 0 && length1 != INT32_MIN) { + if(length2 <= 0) { + return 0; + } else { + return -1; + } + } else { + if (length1 <= 0) { + return 0; + } else { + return 1; + } + } +} + +U_CAPI int32_t U_EXPORT2 
+utext_caseCompareNativeLimit(UText *s1, int64_t limit1, + UText *s2, int64_t limit2, + uint32_t options, UErrorCode *pErrorCode) { + const UCaseProps *csp; + + /* case folding variables */ + const UChar *p; + int32_t length; + + /* case folding buffers, only use current-level start/limit */ + UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; + int32_t foldOffset1, foldOffset2, foldLength1, foldLength2; + + /* current code points */ + UChar32 c1, c2; + + /* native indexes into s1 and s2 */ + int64_t index1, index2; + + /* argument checking */ + if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { + return 0; + } + if(s1==NULL || s2==NULL) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + csp=ucase_getSingleton(pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return 0; + } + + /* initialize */ + index1 = (limit1 >= 0 ? UTEXT_GETNATIVEINDEX(s1) : 0); + index2 = (limit2 >= 0 ? UTEXT_GETNATIVEINDEX(s2) : 0); + + foldOffset1 = foldOffset2 = foldLength1 = foldLength2 = 0; + + /* comparison loop */ + while((foldOffset1 < foldLength1 || limit1 < 0 || index1 < limit1) && + (foldOffset2 < foldLength2 || limit2 < 0 || index2 < limit2)) { + if(foldOffset1 < foldLength1) { + U16_NEXT_UNSAFE(fold1, foldOffset1, c1); + } else { + c1 = UTEXT_NEXT32(s1); + if (c1 != U_SENTINEL) { + length = ucase_toFullFolding(csp, c1, &p, options); + if(length >= 0) { + if(length <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle 0-length folded-case strings + u_memcpy(fold1, p, length); + foldOffset1 = 0; + foldLength1 = length; + U16_NEXT_UNSAFE(fold1, foldOffset1, c1); + } else { + c1 = length; + } + } + } + + if (limit1 >= 0) { + index1 = UTEXT_GETNATIVEINDEX(s1); + } + } + + if(foldOffset2 < foldLength2) { + U16_NEXT_UNSAFE(fold2, foldOffset2, c2); + } else { + c2 = UTEXT_NEXT32(s2); + if (c2 != U_SENTINEL) { + length = ucase_toFullFolding(csp, c2, &p, options); + if(length >= 0) { + if(length <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly 
handle 0-length folded-case strings + u_memcpy(fold2, p, length); + foldOffset2 = 0; + foldLength2 = length; + U16_NEXT_UNSAFE(fold2, foldOffset2, c2); + } else { + c2 = length; + } + } + } else if(c1 == U_SENTINEL) { + return 0; + } + + if (limit2 >= 0) { + index2 = UTEXT_GETNATIVEINDEX(s2); + } + } + + if(c1 != c2) { + return (int32_t)c1-(int32_t)c2; + } + } + + /* By now at least one of the strings is out of characters */ + index1 -= foldLength1 - foldOffset1; + index2 -= foldLength2 - foldOffset2; + + if(limit1 >= 0 && index1 >= limit1) { + if(index2 >= limit2) { + return 0; + } else { + return -1; + } + } else { + if(index1 >= limit1) { + return 0; + } else { + return 1; + } + } +} + + U_CAPI UBool U_EXPORT2 utext_isWritable(const UText *ut) { @@ -800,7 +1156,7 @@ shallowTextClone(UText * dest, const UText * src, UErrorCode * status) { adjustPointer(dest, &dest->p, src); adjustPointer(dest, &dest->q, src); adjustPointer(dest, &dest->r, src); - adjustPointer(dest, (const void **)&dest->chunkContents, src); + adjustPointer(dest, (const void **)&dest->chunkContents, src); return dest; } @@ -932,7 +1288,7 @@ utf8TextAccess(UText *ut, int64_t index, UBool forward) { if (ix>length) { if (length>=0) { ix=length; - } else if (ix>ut->c) { + } else if (ix>=ut->c) { // Zero terminated string, and requested index is beyond // the region that has already been scanned. 
// Scan up to either the end of the string or to the @@ -1415,7 +1771,7 @@ utext_strFromUTF8(UChar *dest, if(ch<0){ ch = 0xfffd; } - if(ch<=0xFFFF){ + if(U_IS_BMP(ch)){ *(pDest++)=(UChar)ch; }else{ *(pDest++)=UTF16_LEAD(ch); @@ -1438,7 +1794,7 @@ utext_strFromUTF8(UChar *dest, if(ch<0){ ch = 0xfffd; } - reqLength+=UTF_CHAR_LENGTH(ch); + reqLength+=U16_LENGTH(ch); } } @@ -1485,7 +1841,7 @@ utf8TextExtract(UText *ut, int i; if (start32 < ut->chunkNativeLimit) { for (i=0; i<3; i++) { - if (U8_IS_LEAD(buf[start32]) || start32==0) { + if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) { break; } start32--; @@ -1494,7 +1850,7 @@ utf8TextExtract(UText *ut, if (limit32 < ut->chunkNativeLimit) { for (i=0; i<3; i++) { - if (U8_IS_LEAD(buf[limit32]) || limit32==0) { + if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) { break; } limit32--; @@ -1506,6 +1862,7 @@ utf8TextExtract(UText *ut, utext_strFromUTF8(dest, destCapacity, &destLength, (const char *)ut->context+start32, limit32-start32, pErrorCode); + utf8TextAccess(ut, limit32, TRUE); return destLength; } @@ -1870,6 +2227,8 @@ repTextExtract(UText *ut, } UnicodeString buffer(dest, 0, destCapacity); // writable alias rep->extractBetween(start32, limit32, buffer); + repTextAccess(ut, limit32, TRUE); + return u_terminateUChars(dest, destCapacity, length, status); } @@ -2138,6 +2497,9 @@ unistrTextExtract(UText *t, trimmedLength=destCapacity; } us->extract(start32, trimmedLength, dest); + t->chunkOffset = start32+trimmedLength; + } else { + t->chunkOffset = start32; } u_terminateUChars(dest, destCapacity, length, pErrorCode); return length; @@ -2528,7 +2890,7 @@ ucstrTextExtract(UText *ut, if (strLength>=0) { // We have filled the destination buffer, and the string length is known. // Cut the loop short. There is no need to scan string termination. 
- di = strLength; + di = limit32 - start32; si = limit32; break; } @@ -2548,7 +2910,7 @@ ucstrTextExtract(UText *ut, } // Put iteration position at the point just following the extracted text - ut->chunkOffset = si; + ut->chunkOffset = uprv_min(strLength, start32 + destCapacity); // Add a terminating NUL if space in the buffer permits, // and set the error status as required. @@ -2754,21 +3116,26 @@ charIterTextExtract(UText *ut, int32_t limit32 = pinIndex(limit, length); int32_t desti = 0; int32_t srci; + int32_t copyLimit; CharacterIterator *ci = (CharacterIterator *)ut->context; ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed. srci = ci->getIndex(); + copyLimit = srci; while (srcinext32PostInc(); int32_t len = U16_LENGTH(c); if (desti+len <= destCapacity) { U16_APPEND_UNSAFE(dest, desti, c); + copyLimit = srci+len; } else { desti += len; *status = U_BUFFER_OVERFLOW_ERROR; } srci += len; } + + charIterTextAccess(ut, copyLimit, TRUE); u_terminateUChars(dest, destCapacity, desti, status); return desti; diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in index ca19b7c63d4..a23e5d9df39 100644 --- a/icu4c/source/i18n/Makefile.in +++ b/icu4c/source/i18n/Makefile.in @@ -76,7 +76,7 @@ translit.o utrans.o esctrn.o unesctrn.o funcrepl.o strrepl.o tridpars.o \ cpdtrans.o rbt.o rbt_data.o rbt_pars.o rbt_rule.o rbt_set.o \ nultrans.o remtrans.o casetrn.o titletrn.o tolowtrn.o toupptrn.o anytrans.o \ name2uni.o uni2name.o nortrans.o quant.o transreg.o brktrans.o \ -regexcmp.o rematch.o repattrn.o regexst.o udatpg.o uregex.o uregexc.o \ +regexcmp.o rematch.o repattrn.o regexst.o regextxt.o udatpg.o uregex.o uregexc.o \ ulocdata.o measfmt.o currfmt.o curramt.o currunit.o measure.o utmscale.o \ csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.o inputext.o \ wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o \ diff --git a/icu4c/source/i18n/i18n.vcproj 
b/icu4c/source/i18n/i18n.vcproj index d885f88c86d..97633b28771 100644 --- a/icu4c/source/i18n/i18n.vcproj +++ b/icu4c/source/i18n/i18n.vcproj @@ -3602,6 +3602,14 @@ RelativePath=".\regexst.h" > + + + + diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index 967d35e8546..82c1a0e2e35 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -1,7 +1,7 @@ // // file: regexcmp.cpp // -// Copyright (C) 2002-2009 International Business Machines Corporation and others. +// Copyright (C) 2002-2010 International Business Machines Corporation and others. // All Rights Reserved. // // This file contains the ICU regular expression compiler, which is responsible @@ -13,6 +13,7 @@ #if !UCONFIG_NO_REGULAR_EXPRESSIONS +#include "unicode/ustring.h" #include "unicode/unistr.h" #include "unicode/uniset.h" #include "unicode/uchar.h" @@ -21,6 +22,7 @@ #include "unicode/parseerr.h" #include "unicode/regex.h" #include "util.h" +#include "putilimp.h" #include "cmemory.h" #include "cstring.h" #include "uvectr32.h" @@ -33,6 +35,7 @@ // generated by a Perl script. #include "regexcmp.h" #include "regexst.h" +#include "regextxt.h" @@ -47,11 +50,13 @@ U_NAMESPACE_BEGIN RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status), fSetStack(status), fSetOpStack(status) { + // Lazy init of all shared global sets (needed for init()'s empty text) + RegexStaticSets::initGlobals(&status); + fStatus = &status; fRXPat = rxp; fScanIndex = 0; - fNextIndex = 0; fPeekChar = -1; fLineNum = 1; fCharNum = 0; @@ -97,6 +102,24 @@ void RegexCompile::compile( const UnicodeString &pat, // Source pat to be compiled. 
UParseError &pp, // Error position info UErrorCode &e) // Error Code +{ + UText patternText = UTEXT_INITIALIZER; + utext_openConstUnicodeString(&patternText, &pat, &e); + + if (U_SUCCESS(e)) { + compile(&patternText, pp, e); + utext_close(&patternText); + } +} + +// +// compile, UText mode +// All the work is actually done here. +// +void RegexCompile::compile( + UText *pat, // Source pat to be compiled. + UParseError &pp, // Error position info + UErrorCode &e) // Error Code { fStatus = &e; fParseErr = &pp; @@ -108,16 +131,16 @@ void RegexCompile::compile( } // There should be no pattern stuff in the RegexPattern object. They can not be reused. - U_ASSERT(fRXPat->fPattern.length() == 0); + U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0); // Prepare the RegexPattern object to receive the compiled pattern. - fRXPat->fPattern = pat; + fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus); fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; // Initialize the pattern scanning state machine - fPatternLength = pat.length(); + fPatternLength = utext_nativeLength(pat); uint16_t state = 1; const RegexTableEl *tableEl; nextChar(fC); // Fetch the first char from the pattern string. @@ -250,34 +273,13 @@ void RegexCompile::compile( // The pattern has now been read and processed, and the compiled code generated. 
// - // Back-reference fixup - // - int32_t loc; - for (loc=0; locfCompiledPat->size(); loc++) { - int32_t op = fRXPat->fCompiledPat->elementAti(loc); - int32_t opType = URX_TYPE(op); - if (opType == URX_BACKREF || opType == URX_BACKREF_I) { - int32_t where = URX_VAL(op); - if (where > fRXPat->fGroupMap->size()) { - error(U_REGEX_INVALID_BACK_REF); - break; - } - where = fRXPat->fGroupMap->elementAti(where-1); - op = URX_BUILD(opType, where); - fRXPat->fCompiledPat->setElementAt(op, loc); - } - } - - // // Compute the number of digits requried for the largest capture group number. // fRXPat->fMaxCaptureDigits = 1; int32_t n = 10; - for (;;) { - if (n > fRXPat->fGroupMap->size()) { - break; - } + int32_t groupCount = fRXPat->fGroupMap->size(); + while (n <= groupCount) { fRXPat->fMaxCaptureDigits++; n *= 10; } @@ -286,10 +288,15 @@ void RegexCompile::compile( // The pattern's fFrameSize so far has accumulated the requirements for // storage for capture parentheses, counters, etc. that are encountered // in the pattern. Add space for the two variables that are always - // present in the saved state: the input string position and the - // position in the compiled pattern. + // present in the saved state: the input string position (int64_t) and + // the position in the compiled pattern. // - fRXPat->fFrameSize+=2; + fRXPat->fFrameSize+=3; + + // + // Optimization pass 1: NOPs, back-references, and case-folding + // + stripNOPs(); // // Get bounds for the minimum and maximum length of a string that this @@ -299,10 +306,9 @@ void RegexCompile::compile( fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1); // - // Optimization passes + // Optimization pass 2: match start type // matchStartType(); - stripNOPs(); // // Set up fast latin-1 range sets @@ -426,19 +432,19 @@ UBool RegexCompile::doParseActions(int32_t action) // - NOP, which may later be replaced by a save-state if there // is an '|' alternation within the parens. 
// - // Each capture group gets three slots in the save stack frame: - // 0: Capture Group start position (in input string being matched.) - // 1: Capture Group end positino. - // 2: Start of Match-in-progress. + // Each capture group gets three double-width slots in the save stack frame: + // 0-1: Capture Group start position (in input string being matched.) + // 2-3: Capture Group end position. + // 4-5: Start of Match-in-progress. // The first two locations are for a completed capture group, and are // referred to by back references and the like. // The third location stores the capture start position when an START_CAPTURE is // encountered. This will be promoted to a completed capture when (and if) the corresponding - // END_CAPure is encountered. + // END_CAPTURE is encountered. { fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); - int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame. - fRXPat->fFrameSize += 3; + int32_t varsLoc = fRXPat->fFrameSize; // Reserve six slots in match stack frame. + fRXPat->fFrameSize += 6; int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); fRXPat->fCompiledPat->addElement(cop, *fStatus); fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); @@ -532,10 +538,10 @@ UBool RegexCompile::doParseActions(int32_t action) // 8. code for parenthesized stuff. // 9. LA_END // - // Two data slots are reserved, for saving the stack ptr and the input position. + // Three data slots are reserved, for saving the stack ptr and the (double-width) input position. { int32_t dataLoc = fRXPat->fDataSize; - fRXPat->fDataSize += 2; + fRXPat->fDataSize += 3; int32_t op = URX_BUILD(URX_LA_START, dataLoc); fRXPat->fCompiledPat->addElement(op, *fStatus); @@ -576,9 +582,10 @@ UBool RegexCompile::doParseActions(int32_t action) // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. // 7. END_LA // Restore match region, in case look-ahead was using // an alternate (transparent) region. 
+ // Three data slots are reserved, for saving the stack ptr and the (double-width) input position. { int32_t dataLoc = fRXPat->fDataSize; - fRXPat->fDataSize += 2; + fRXPat->fDataSize += 3; int32_t op = URX_BUILD(URX_LA_START, dataLoc); fRXPat->fCompiledPat->addElement(op, *fStatus); @@ -617,12 +624,12 @@ UBool RegexCompile::doParseActions(int32_t action) // Allocate a block of matcher data, to contain (when running a match) // 0: Stack ptr on entry // 1: Input Index on entry - // 2: Start index of match current match attempt. - // 3: Original Input String len. + // 2-3: Start index of match current match attempt. + // 4-5: Original Input String len. // Allocate data space int32_t dataLoc = fRXPat->fDataSize; - fRXPat->fDataSize += 4; + fRXPat->fDataSize += 6; // Emit URX_LB_START int32_t op = URX_BUILD(URX_LB_START, dataLoc); @@ -670,12 +677,12 @@ UBool RegexCompile::doParseActions(int32_t action) // Allocate a block of matcher data, to contain (when running a match) // 0: Stack ptr on entry // 1: Input Index on entry - // 2: Start index of match current match attempt. - // 3: Original Input String len. + // 2-3: Start index of match current match attempt. + // 4-5: Original Input String len. 
// Allocate data space int32_t dataLoc = fRXPat->fDataSize; - fRXPat->fDataSize += 4; + fRXPat->fDataSize += 6; // Emit URX_LB_START int32_t op = URX_BUILD(URX_LB_START, dataLoc); @@ -764,7 +771,7 @@ UBool RegexCompile::doParseActions(int32_t action) int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); frameLoc = fRXPat->fFrameSize; - fRXPat->fFrameSize++; + fRXPat->fFrameSize += 2; // double-width index int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); break; @@ -784,7 +791,7 @@ UBool RegexCompile::doParseActions(int32_t action) } fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); frameLoc = fRXPat->fFrameSize; - fRXPat->fFrameSize++; + fRXPat->fFrameSize += 2; // double-width index int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); break; @@ -801,7 +808,7 @@ UBool RegexCompile::doParseActions(int32_t action) // Emit the code sequence that can handle it. 
insertOp(topLoc); frameLoc = fRXPat->fFrameSize; - fRXPat->fFrameSize++; + fRXPat->fFrameSize += 2; // double-width index int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); fRXPat->fCompiledPat->setElementAt(op, topLoc); @@ -907,7 +914,7 @@ UBool RegexCompile::doParseActions(int32_t action) int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); dataLoc = fRXPat->fFrameSize; - fRXPat->fFrameSize++; + fRXPat->fFrameSize += 2; // double-width index int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); break; @@ -927,7 +934,7 @@ UBool RegexCompile::doParseActions(int32_t action) } fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); dataLoc = fRXPat->fFrameSize; - fRXPat->fFrameSize++; + fRXPat->fFrameSize += 2; // double-width index int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); break; @@ -945,7 +952,7 @@ UBool RegexCompile::doParseActions(int32_t action) if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { insertOp(saveStateLoc); dataLoc = fRXPat->fFrameSize; - fRXPat->fFrameSize++; + fRXPat->fFrameSize += 2; // double-width index int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); @@ -1658,7 +1665,7 @@ UBool RegexCompile::doParseActions(int32_t action) } - case doSetNegate: + case doSetNegate: // Scanned a '^' at the start of a set. // Push the negation operator onto the set op stack. // A twist for case-insensitive matching: @@ -1770,17 +1777,12 @@ void RegexCompile::literalChar(UChar32 c) { // First char of a string in the pattern. // Emit a OneChar op into the compiled pattern. 
emitONE_CHAR(c); - - // Also add it to the string pool, in case we get a second adjacent literal - // and want to change form ONE_CHAR to STRING + + // Mark that we might actually be starting a string here fStringOpStart = fRXPat->fLiteralText.length(); - fRXPat->fLiteralText.append(c); return; } - // We are adding onto an existing string - fRXPat->fLiteralText.append(c); - op = fRXPat->fCompiledPat->lastElementi(); opType = URX_TYPE(op); U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN); @@ -1797,10 +1799,12 @@ void RegexCompile::literalChar(UChar32 c) { fRXPat->fCompiledPat->setElementAt(op, patternLoc); return; } - + // The most recently emitted op is a ONECHAR. // We've now received another adjacent char. Change the ONECHAR op // to a string op. + fRXPat->fLiteralText.append(URX_VAL(op)); + if (fModeFlags & UREGEX_CASE_INSENSITIVE) { op = URX_BUILD(URX_STRING_I, fStringOpStart); } else { @@ -1811,7 +1815,10 @@ void RegexCompile::literalChar(UChar32 c) { op = URX_BUILD(URX_STRING_LEN, 0); fRXPat->fCompiledPat->addElement(op, *fStatus); } - + + // We are adding onto an existing string + fRXPat->fLiteralText.append(c); + // The pattern contains a URX_SRING / URX_STRING_LEN. Update the // string length to reflect the new char we just added to the string. stringLen = fRXPat->fLiteralText.length() - fStringOpStart; @@ -1834,7 +1841,7 @@ void RegexCompile::emitONE_CHAR(UChar32 c) { if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { // We have a cased character, and are in case insensitive matching mode. - c = u_foldCase(c, U_FOLD_CASE_DEFAULT); + //c = u_foldCase(c, U_FOLD_CASE_DEFAULT); // !!!: handled in stripNOPs() now op = URX_BUILD(URX_ONECHAR_I, c); } else { // Uncased char, or case sensitive match mode. @@ -2245,7 +2252,6 @@ void RegexCompile::compileSet(UnicodeSet *theSet) // ignored strings, that would be better.) 
theSet->removeAllStrings(); int32_t setSize = theSet->size(); - UChar32 firstSetChar = theSet->charAt(0); switch (setSize) { case 0: @@ -2261,7 +2267,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet) // The set contains only a single code point. Put it into // the compiled pattern as a single char operation rather // than a set, and discard the set itself. - literalChar(firstSetChar); + literalChar(theSet->charAt(0)); delete theSet; } break; @@ -2472,7 +2478,7 @@ void RegexCompile::matchStartType() { case URX_STO_INP_LOC: case URX_BACKREF: // BackRef. Must assume that it might be a zero length match case URX_BACKREF_I: - + case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. case URX_LD_SP: break; @@ -3378,6 +3384,14 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { // code generation to provide locations that may be patched later. // Many end up unneeded, and are removed by this function. // +// In order to minimize the number of passes through the pattern, +// back-reference fixup is also performed here (adjusting +// back-reference operands to point to the correct frame offsets). +// +// In addition, case-insensitive character and string literals are +// now case-folded here, rather than when first parsed or at match +// time. 
+// //------------------------------------------------------------------------------ void RegexCompile::stripNOPs() { @@ -3399,6 +3413,9 @@ void RegexCompile::stripNOPs() { d++; } } + + UnicodeString caseStringBuffer; + int32_t stringDelta = 0; // Make a second pass over the code, removing the NOPs by moving following // code up, and patching operands that refer to code locations that @@ -3432,12 +3449,69 @@ void RegexCompile::stripNOPs() { break; } + case URX_ONECHAR_I: + { + UChar32 c = URX_VAL(op); + if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { + // We have a cased character to fold + c = u_foldCase(c, U_FOLD_CASE_DEFAULT); + op = URX_BUILD(URX_ONECHAR_I, c); + } + + fRXPat->fCompiledPat->setElementAt(op, dst); + dst++; + break; + } + case URX_STRING_I: + { + op = URX_BUILD(URX_STRING_I, URX_VAL(op)+stringDelta); + + src++; + int32_t lengthOp = fRXPat->fCompiledPat->elementAti(src); + + caseStringBuffer.setTo(fRXPat->fLiteralText, URX_VAL(op), URX_VAL(lengthOp)); + caseStringBuffer.foldCase(U_FOLD_CASE_DEFAULT); + + int32_t newLen = caseStringBuffer.length(); + if (newLen <= URX_VAL(lengthOp)) { + // don't shift if we don't have to, take the tiny memory hit of a smaller string + fRXPat->fLiteralText.replace(URX_VAL(op), newLen, caseStringBuffer); + } else { + // shift other strings over...at least UnicodeString handles this for us! 
+ fRXPat->fLiteralText.replace(URX_VAL(op), URX_VAL(lengthOp), caseStringBuffer); + stringDelta += newLen - URX_VAL(lengthOp); + } + lengthOp = URX_BUILD(URX_STRING_LEN, newLen); + + fRXPat->fCompiledPat->setElementAt(op, dst); + fRXPat->fCompiledPat->setElementAt(lengthOp, dst+1); + dst += 2; + break; + } + case URX_BACKREF: + case URX_BACKREF_I: + { + int32_t where = URX_VAL(op); + if (where > fRXPat->fGroupMap->size()) { + error(U_REGEX_INVALID_BACK_REF); + break; + } + where = fRXPat->fGroupMap->elementAti(where-1); + op = URX_BUILD(opType, where); + fRXPat->fCompiledPat->setElementAt(op, dst); + dst++; + + fRXPat->fNeedsAltInput = TRUE; + break; + } + case URX_STRING: + op = URX_BUILD(URX_STRING, URX_VAL(op)+stringDelta); + // continue case URX_RESERVED_OP: case URX_RESERVED_OP_N: case URX_BACKTRACK: case URX_END: case URX_ONECHAR: - case URX_STRING: case URX_STRING_LEN: case URX_START_CAPTURE: case URX_END_CAPTURE: @@ -3460,13 +3534,9 @@ void RegexCompile::stripNOPs() { case URX_DOTANY_UNIX: case URX_STO_SP: case URX_LD_SP: - case URX_BACKREF: case URX_STO_INP_LOC: case URX_LA_START: case URX_LA_END: - case URX_ONECHAR_I: - case URX_STRING_I: - case URX_BACKREF_I: case URX_DOLLAR_M: case URX_CARET_M: case URX_CARET_M_UNIX: @@ -3509,15 +3579,15 @@ void RegexCompile::error(UErrorCode e) { *fStatus = e; fParseErr->line = fLineNum; fParseErr->offset = fCharNum; + + UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context // Fill in the context. // Note: extractBetween() pins supplied indicies to the string bounds. 
uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); - fRXPat->fPattern.extractBetween(fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, - fParseErr->preContext, 0); - fRXPat->fPattern.extractBetween(fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, - fParseErr->postContext, 0); + utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); + utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); } } @@ -3557,18 +3627,18 @@ static const UChar chLS = 0x2028; // Unicode Line Separator //------------------------------------------------------------------------------ UChar32 RegexCompile::nextCharLL() { UChar32 ch; - UnicodeString &pattern = fRXPat->fPattern; if (fPeekChar != -1) { ch = fPeekChar; fPeekChar = -1; return ch; } - if (fPatternLength==0 || fNextIndex >= fPatternLength) { - return (UChar32)-1; + + // assume we're already in the right place + ch = UTEXT_NEXT32(fRXPat->fPattern); + if (ch == U_SENTINEL) { + return ch; } - ch = pattern.char32At(fNextIndex); - fNextIndex = pattern.moveIndex32(fNextIndex, 1); if (ch == chCR || ch == chNEL || @@ -3613,7 +3683,7 @@ UChar32 RegexCompile::peekCharLL() { //------------------------------------------------------------------------------ void RegexCompile::nextChar(RegexPatternChar &c) { - fScanIndex = fNextIndex; + fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); c.fChar = nextCharLL(); c.fQuoted = FALSE; @@ -3670,8 +3740,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) { // check for backslash escaped characters. 
// if (c.fChar == chBackSlash) { - int32_t startX = fNextIndex; // start and end positions of the - int32_t endX = fNextIndex; // sequence following the '\' + int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) { // // A '\' sequence that is handled by ICU's standard unescapeAt function. @@ -3680,19 +3749,39 @@ void RegexCompile::nextChar(RegexPatternChar &c) { // nextCharLL(); // get & discard the peeked char. c.fQuoted = TRUE; - c.fChar = fRXPat->fPattern.unescapeAt(endX); - if (startX == endX) { - error(U_REGEX_BAD_ESCAPE_SEQUENCE); + + if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) { + int32_t endIndex = pos; + c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, fPatternLength, (void *)fRXPat->fPattern->chunkContents); + + if (endIndex == pos) { + error(U_REGEX_BAD_ESCAPE_SEQUENCE); + } + fCharNum += endIndex - pos; + UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); + } else { + int32_t offset = 0; + struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern); + + UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); + c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); + + if (offset == 0) { + error(U_REGEX_BAD_ESCAPE_SEQUENCE); + } else if (context.lastOffset == offset) { + UTEXT_PREVIOUS32(fRXPat->fPattern); + } else if (context.lastOffset != offset-1) { + utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1); + } + fCharNum += offset; } - fCharNum += endX - startX; - fNextIndex = endX; } else if (peekCharLL() == chDigit0) { // Octal Escape, using Java Regexp Conventions // which are \0 followed by 1-3 octal digits. // Different from ICU Unescape handling of Octal, which does not // require the leading 0. 
- // Java also has the convention of only consuning 2 octal digits if + // Java also has the convention of only consuming 2 octal digits if // the three digit number would be > 0xff // c.fChar = 0; @@ -3873,13 +3962,13 @@ UnicodeSet *RegexCompile::scanPosixProp() { // Save the scanner state. // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062 - int32_t savedScanIndex = fScanIndex; - int32_t savedNextIndex = fNextIndex; + int64_t savedScanIndex = fScanIndex; + int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); UBool savedQuoteMode = fQuoteMode; UBool savedInBackslashQuote = fInBackslashQuote; UBool savedEOLComments = fEOLComments; - int32_t savedLineNum = fLineNum; - int32_t savedCharNum = fCharNum; + int64_t savedLineNum = fLineNum; + int64_t savedCharNum = fCharNum; UChar32 savedLastChar = fLastChar; UChar32 savedPeekChar = fPeekChar; RegexPatternChar savedfC = fC; @@ -3926,7 +4015,6 @@ UnicodeSet *RegexCompile::scanPosixProp() { // The main scanner will retry the input as a normal set expression, // not a [:Property:] expression. fScanIndex = savedScanIndex; - fNextIndex = savedNextIndex; fQuoteMode = savedQuoteMode; fInBackslashQuote = savedInBackslashQuote; fEOLComments = savedEOLComments; @@ -3935,6 +4023,7 @@ UnicodeSet *RegexCompile::scanPosixProp() { fLastChar = savedLastChar; fPeekChar = savedPeekChar; fC = savedfC; + UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex); } return uset; } diff --git a/icu4c/source/i18n/regexcmp.h b/icu4c/source/i18n/regexcmp.h index 94c83b63088..032fdb7f9cc 100644 --- a/icu4c/source/i18n/regexcmp.h +++ b/icu4c/source/i18n/regexcmp.h @@ -1,7 +1,7 @@ // // regexcmp.h // -// Copyright (C) 2002-2008, International Business Machines Corporation and others. +// Copyright (C) 2002-2010, International Business Machines Corporation and others. // All Rights Reserved. 
// // This file contains declarations for the class RegexCompile @@ -54,7 +54,8 @@ public: RegexCompile(RegexPattern *rp, UErrorCode &e); void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); - + void compile(UText *pat, UParseError &pp, UErrorCode &e); + virtual ~RegexCompile(); @@ -102,7 +103,7 @@ private: void fixLiterals(UBool split=FALSE); // Fix literal strings. void insertOp(int32_t where); // Open up a slot for a new op in the // generated code at the specified location. - void emitONE_CHAR(UChar32 c); // EMit a ONE_CHAR op into the compiled code, + void emitONE_CHAR(UChar32 c); // Emit a ONE_CHAR op into the compiled code, // taking case mode into account. int32_t minMatchLength(int32_t start, int32_t end); @@ -124,16 +125,14 @@ private: // // Data associated with low level character scanning // - int32_t fScanIndex; // Index of current character being processed + int64_t fScanIndex; // Index of current character being processed // in the rule input string. - int32_t fNextIndex; // Index of the next character, which - // is the first character not yet scanned. UBool fQuoteMode; // Scan is in a \Q...\E quoted region UBool fInBackslashQuote; // Scan is between a '\' and the following char. UBool fEOLComments; // When scan is just after '(?', inhibit #... to // end of line comments, in favor of (?#...) comments. - int32_t fLineNum; // Line number in input file. - int32_t fCharNum; // Char position within the line. + int64_t fLineNum; // Line number in input file. + int64_t fCharNum; // Char position within the line. UChar32 fLastChar; // Previous char, needed to count CR-LF // as a single line, not two. UChar32 fPeekChar; // Saved char, if we've scanned ahead. @@ -168,8 +167,8 @@ private: // holds the start index within RegexPattern. // fLiteralText where the string is being stored. - int32_t fPatternLength; // Length of the input pattern string. - + int64_t fPatternLength; // Length of the input pattern string. 
+ UVector32 fParenStack; // parentheses stack. Each frame consists of // the positions of compiled pattern operations // needing fixup, followed by negative value. The @@ -196,7 +195,7 @@ private: // -1 for the upper interval value means none // was specified (unlimited occurences.) - int32_t fNameStartPos; // Starting position of a \N{NAME} name in a + int64_t fNameStartPos; // Starting position of a \N{NAME} name in a // pattern, valid while remainder of name is // scanned. @@ -208,7 +207,6 @@ private: UChar32 fLastSetLiteral; // The last single code point added to a set. // needed when "-y" is scanned, and we need // to turn "x-y" into a range. - }; // Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions] diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index 07206e6448d..57a4576cc20 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2002-2007 International Business Machines Corporation +// Copyright (C) 2002-2010 International Business Machines Corporation // and others. All rights reserved. // // file: regeximp.h @@ -279,11 +279,12 @@ enum { // Match Engine State Stack Frame Layout. // struct REStackFrame { - int32_t fInputIdx; // Position of next character in the input string + int64_t fInputIdx; // Position of next character in the input string int32_t fPatIdx; // Position of next Op in the compiled pattern int32_t fExtra[2]; // Extra state, for capture group start/ends // atomic parentheses, repeat counts, etc. // Locations assigned at pattern compile time. + // Note that this will likely end up longer than 64 bits. }; // @@ -307,7 +308,6 @@ enum StartOfMatch { (v)==START_STRING? "START_STRING" : \ "ILLEGAL") - // // 8 bit set, to fast-path latin-1 set membership tests. 
// @@ -347,7 +347,6 @@ inline void Regex8BitSet::operator = (const Regex8BitSet &s) { uprv_memcpy(d, s.d, sizeof(d)); } - U_NAMESPACE_END #endif diff --git a/icu4c/source/i18n/regexst.cpp b/icu4c/source/i18n/regexst.cpp index cfc1f7d80bb..c2a8933a8cc 100644 --- a/icu4c/source/i18n/regexst.cpp +++ b/icu4c/source/i18n/regexst.cpp @@ -1,7 +1,7 @@ // // regexst.h // -// Copyright (C) 2004-2008, International Business Machines Corporation and others. +// Copyright (C) 2004-2010, International Business Machines Corporation and others. // All Rights Reserved. // // This file contains class RegexStaticSets @@ -214,6 +214,10 @@ fRuleDigitsAlias(NULL) for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) { fRuleSets[i].compact(); } + + // Finally, initialize an empty string for utility purposes + fEmptyText = utext_openUChars(NULL, NULL, 0, status); + return; // If we reached this point, everything is fine so just exit ExitConstrDeleteAll: // Remove fPropSets and fRuleSets and return error @@ -233,6 +237,8 @@ RegexStaticSets::~RegexStaticSets() { fPropSets[i] = NULL; } fRuleDigitsAlias = NULL; + + utext_close(fEmptyText); } diff --git a/icu4c/source/i18n/regexst.h b/icu4c/source/i18n/regexst.h index 639ca65c504..07034d4450a 100644 --- a/icu4c/source/i18n/regexst.h +++ b/icu4c/source/i18n/regexst.h @@ -1,7 +1,7 @@ // // regexst.h // -// Copyright (C) 2003-2008, International Business Machines Corporation and others. +// Copyright (C) 2003-2010, International Business Machines Corporation and others. // All Rights Reserved. // // This file contains declarations for the class RegexStaticSets @@ -19,6 +19,7 @@ #define REGEXST_H #include "unicode/utypes.h" +#include "unicode/utext.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS #include "regeximp.h" @@ -45,7 +46,7 @@ public: UnicodeSet fUnescapeCharSet; // Set of chars handled by unescape when // encountered with a \ in a pattern. 
UnicodeSet *fRuleDigitsAlias; - UnicodeString fEmptyString; // An empty string, to be used when a matcher + UText *fEmptyText; // An empty string, to be used when a matcher // is created with no input. }; diff --git a/icu4c/source/i18n/regextxt.cpp b/icu4c/source/i18n/regextxt.cpp new file mode 100644 index 00000000000..6960dac1b74 --- /dev/null +++ b/icu4c/source/i18n/regextxt.cpp @@ -0,0 +1,45 @@ +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2008-2010, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ +// +// file: regextxt.cpp +// +// This file contains utility code for supporting UText in the regular expression engine. +// + +#include "regextxt.h" + +U_NAMESPACE_BEGIN + +U_CFUNC UChar U_CALLCONV +uregex_utext_unescape_charAt(int32_t offset, void *ct) { + struct URegexUTextUnescapeCharContext *context = (struct URegexUTextUnescapeCharContext *)ct; + UChar32 c; + if (offset == context->lastOffset + 1) { + c = UTEXT_NEXT32(context->text); + context->lastOffset++; + } else if (offset == context->lastOffset) { + c = UTEXT_PREVIOUS32(context->text); + UTEXT_NEXT32(context->text); + } else { + utext_moveIndex32(context->text, offset - context->lastOffset - 1); + c = UTEXT_NEXT32(context->text); + context->lastOffset = offset; + } + + // !!!: Doesn't handle characters outside BMP + if (U_IS_BMP(c)) { + return (UChar)c; + } else { + return 0; + } +} + +U_CFUNC UChar U_CALLCONV +uregex_ucstr_unescape_charAt(int32_t offset, void *context) { + return ((UChar *)context)[offset]; +} + +U_NAMESPACE_END diff --git a/icu4c/source/i18n/regextxt.h b/icu4c/source/i18n/regextxt.h new file mode 100644 index 00000000000..b82e0aa3c04 --- /dev/null +++ b/icu4c/source/i18n/regextxt.h @@ -0,0 +1,48 @@ +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2008-2010, International 
Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ +// +// file: regextxt.h +// +// This file contains utility code for supporting UText in the regular expression engine. +// +// This class is internal to the regular expression implementation. +// For the public Regular Expression API, see the file "unicode/regex.h" +// + +#ifndef _REGEXTXT_H +#define _REGEXTXT_H + +#include <unicode/utypes.h> +#include <unicode/utext.h> + +U_NAMESPACE_BEGIN + +#define UTEXT_USES_U16(ut) (NULL==((ut)->pFuncs->mapNativeIndexToUTF16)) + +#if 0 +#define REGEX_DISABLE_CHUNK_MODE 1 +#endif + +#ifdef REGEX_DISABLE_CHUNK_MODE +# define UTEXT_FULL_TEXT_IN_CHUNK(ut,len) (FALSE) +#else +# define UTEXT_FULL_TEXT_IN_CHUNK(ut,len) ((0==((ut)->chunkNativeStart))&&((len)==((ut)->chunkNativeLimit))&&((len)==((ut)->nativeIndexingLimit))) +#endif + +struct URegexUTextUnescapeCharContext { + UText *text; + int32_t lastOffset; +}; +#define U_REGEX_UTEXT_UNESCAPE_CONTEXT(text) { (text), -1 } + +U_CFUNC UChar U_CALLCONV +uregex_utext_unescape_charAt(int32_t offset, void * /* struct URegexUTextUnescapeCharContext* */ context); +U_CFUNC UChar U_CALLCONV +uregex_ucstr_unescape_charAt(int32_t offset, void * /* UChar* */ context); + +U_NAMESPACE_END + +#endif diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index 52190e268ab..86558c6642c 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -1,6 +1,6 @@ /* ************************************************************************** -* Copyright (C) 2002-2008 International Business Machines Corporation * +* Copyright (C) 2002-2010 International Business Machines Corporation * * and others. All rights reserved.
* ************************************************************************** */ @@ -25,9 +25,26 @@ #include "uvectr32.h" #include "regeximp.h" #include "regexst.h" +#include "regextxt.h" +#include "ucase.h" // #include <malloc.h> // Needed for heapcheck testing + + +// Smart Backtracking +// ------------------ +// When a failure would go back to a LOOP_C instruction, +// strings, characters, and setrefs scan backwards for a valid start +// character themselves, pop the stack, and save state, emulating the +// LOOP_C's effect but assured that the next character of input is a +// possible matching character. +// +// Good idea in theory; unfortunately it only helps out a few specific +// cases and slows the engine down a little in the rest. + +//#define REGEX_SMART_BACKTRACKING 1 + U_NAMESPACE_BEGIN // Default limit for the size of the back track stack, to avoid system @@ -60,7 +77,7 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) { return; } fPattern = pat; - init2(RegexStaticSets::gStaticSets->fEmptyString, fDeferredStatus); + init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); } @@ -74,6 +91,25 @@ RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &inp UParseError pe; fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); fPattern = fPatternOwned; + + UText inputText = UTEXT_INITIALIZER; + utext_openConstUnicodeString(&inputText, &input, &status); + init2(&inputText, status); + utext_close(&inputText); + + fInputUniStrMaybeMutable = TRUE; +} + + +RegexMatcher::RegexMatcher(UText *regexp, UText *input, + uint32_t flags, UErrorCode &status) { + init(status); + if (U_FAILURE(status)) { + return; + } + UParseError pe; + fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); + fPattern = fPatternOwned; init2(input, status); } @@ -87,7 +123,19 @@ RegexMatcher::RegexMatcher(const UnicodeString &regexp, UParseError pe; fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); fPattern = fPatternOwned; -
init2(RegexStaticSets::gStaticSets->fEmptyString, status); + init2(RegexStaticSets::gStaticSets->fEmptyText, status); +} + +RegexMatcher::RegexMatcher(UText *regexp, + uint32_t flags, UErrorCode &status) { + init(status); + if (U_FAILURE(status)) { + return; + } + UParseError pe; + fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); + fPattern = fPatternOwned; + init2(RegexStaticSets::gStaticSets->fEmptyText, status); } @@ -104,6 +152,17 @@ RegexMatcher::~RegexMatcher() { fPatternOwned = NULL; fPattern = NULL; } + + if (fInput) { + delete fInput; + } + if (fInputText) { + utext_close(fInputText); + } + if (fAltInputText) { + utext_close(fAltInputText); + } + #if UCONFIG_NO_BREAK_ITERATION==0 delete fWordBreakItr; #endif @@ -119,7 +178,6 @@ RegexMatcher::~RegexMatcher() { void RegexMatcher::init(UErrorCode &status) { fPattern = NULL; fPatternOwned = NULL; - fInput = NULL; fFrameSize = 0; fRegionStart = 0; fRegionLimit = 0; @@ -152,6 +210,12 @@ void RegexMatcher::init(UErrorCode &status) { fWordBreakItr = NULL; fStack = new UVector32(status); + fInputText = NULL; + fAltInputText = NULL; + fInput = NULL; + fInputLength = 0; + fInputUniStrMaybeMutable = FALSE; + if (U_FAILURE(status)) { fDeferredStatus = status; } @@ -161,7 +225,7 @@ void RegexMatcher::init(UErrorCode &status) { // init2() Common initialization for use by RegexMatcher constructors, part 2. // This handles the common setup to be done after the Pattern is available. 
// -void RegexMatcher::init2(const UnicodeString &input, UErrorCode &status) { +void RegexMatcher::init2(UText *input, UErrorCode &status) { if (U_FAILURE(status)) { fDeferredStatus = status; return; @@ -194,6 +258,29 @@ static const UChar DOLLARSIGN = 0x24; RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, const UnicodeString &replacement, UErrorCode &status) { + UText replacementText = UTEXT_INITIALIZER; + + utext_openConstUnicodeString(&replacementText, &replacement, &status); + if (U_SUCCESS(status)) { + UText resultText = UTEXT_INITIALIZER; + utext_openUnicodeString(&resultText, &dest, &status); + + if (U_SUCCESS(status)) { + appendReplacement(&resultText, &replacementText, status); + utext_close(&resultText); + } + utext_close(&replacementText); + } + + return *this; +} + +// +// appendReplacement, UText mode +// +RegexMatcher &RegexMatcher::appendReplacement(UText *dest, + UText *replacement, + UErrorCode &status) { if (U_FAILURE(status)) { return *this; } @@ -205,102 +292,150 @@ RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, status = U_REGEX_INVALID_STATE; return *this; } - + // Copy input string from the end of previous match to start of current match - int32_t len = fMatchStart-fAppendPosition; - if (len > 0) { - dest.append(*fInput, fAppendPosition, len); + int64_t destLen = utext_nativeLength(dest); + if (fMatchStart > fAppendPosition) { + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, fMatchStart-fAppendPosition, &status); + } else { + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = fMatchStart-fAppendPosition; + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); + } + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); + utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, 
len16+1, &status); + destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); + uprv_free(inputChars); + } } fAppendPosition = fMatchEnd; - + // scan the replacement text, looking for substitutions ($n) and \escapes. // TODO: optimize this loop by efficiently scanning for '$' or '\', // move entire ranges not containing substitutions. - int32_t replLen = replacement.length(); - int32_t replIdx = 0; - while (replIdx= replLen) { + c = UTEXT_CURRENT32(replacement); + if (c == U_SENTINEL) { break; } - c = replacement.charAt(replIdx); - + if (c==0x55/*U*/ || c==0x75/*u*/) { // We have a \udddd or \Udddddddd escape sequence. - UChar32 escapedChar = replacement.unescapeAt(replIdx); + int32_t offset = 0; + struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); + UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); if (escapedChar != (UChar32)0xFFFFFFFF) { - dest.append(escapedChar); + if (U_IS_BMP(escapedChar)) { + UChar c16 = (UChar)escapedChar; + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); + } else { + UChar surrogate[2]; + surrogate[0] = U16_LEAD(escapedChar); + surrogate[1] = U16_TRAIL(escapedChar); + if (U_SUCCESS(status)) { + destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); + } + } // TODO: Report errors for mal-formed \u escapes? // As this is, the original sequence is output, which may be OK. - continue; + if (context.lastOffset == offset) { + UTEXT_PREVIOUS32(replacement); + } else if (context.lastOffset != offset-1) { + utext_moveIndex32(replacement, offset - context.lastOffset - 1); + } + } + } else { + UTEXT_NEXT32(replacement); + // Plain backslash escape. Just put out the escaped character. 
+ if (U_IS_BMP(c)) { + UChar c16 = (UChar)c; + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); + } else { + UChar surrogate[2]; + surrogate[0] = U16_LEAD(c); + surrogate[1] = U16_TRAIL(c); + if (U_SUCCESS(status)) { + destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); + } } } - - // Plain backslash escape. Just put out the escaped character. - dest.append(c); - replIdx++; - continue; - } - - if (c != DOLLARSIGN) { + } else if (c != DOLLARSIGN) { // Normal char, not a $. Copy it out without further checks. - dest.append(c); - continue; - } - - // We've got a $. Pick up a capture group number if one follows. - // Consume at most the number of digits necessary for the largest capture - // number that is valid for this pattern. - - int32_t numDigits = 0; - int32_t groupNum = 0; - UChar32 digitC; - for (;;) { - if (replIdx >= replLen) { - break; + if (U_IS_BMP(c)) { + UChar c16 = (UChar)c; + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); + } else { + UChar surrogate[2]; + surrogate[0] = U16_LEAD(c); + surrogate[1] = U16_TRAIL(c); + if (U_SUCCESS(status)) { + destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); + } } - digitC = replacement.char32At(replIdx); - if (u_isdigit(digitC) == FALSE) { - break; + } else { + // We've got a $. Pick up a capture group number if one follows. + // Consume at most the number of digits necessary for the largest capture + // number that is valid for this pattern. 
+ + int32_t numDigits = 0; + int32_t groupNum = 0; + UChar32 digitC; + for (;;) { + digitC = UTEXT_CURRENT32(replacement); + if (digitC == U_SENTINEL) { + break; + } + if (u_isdigit(digitC) == FALSE) { + break; + } + UTEXT_NEXT32(replacement); + groupNum=groupNum*10 + u_charDigitValue(digitC); + numDigits++; + if (numDigits >= fPattern->fMaxCaptureDigits) { + break; + } } - replIdx = replacement.moveIndex32(replIdx, 1); - groupNum=groupNum*10 + u_charDigitValue(digitC); - numDigits++; - if (numDigits >= fPattern->fMaxCaptureDigits) { - break; + + + if (numDigits == 0) { + // The $ didn't introduce a group number at all. + // Treat it as just part of the substitution text. + UChar c16 = DOLLARSIGN; + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); + } else { + // Finally, append the capture group data to the destination. + destLen += appendGroup(groupNum, dest, status); + if (U_FAILURE(status)) { + // Can fail if group number is out of range. + break; + } } } - - - if (numDigits == 0) { - // The $ didn't introduce a group number at all. - // Treat it as just part of the substitution text. - dest.append(DOLLARSIGN); - continue; - } - - // Finally, append the capture group data to the destination. - dest.append(group(groupNum, status)); + if (U_FAILURE(status)) { - // Can fail if group number is out of range. 
break; + } else { + c = UTEXT_NEXT32(replacement); } - } - + return *this; } - //-------------------------------------------------------------------------------- // // appendTail Intended to be used in conjunction with appendReplacement() @@ -311,9 +446,44 @@ RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, // //-------------------------------------------------------------------------------- UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { - int32_t len = fInput->length() - fAppendPosition; - if (len > 0) { - dest.append(*fInput, fAppendPosition, len); + UErrorCode status = U_ZERO_ERROR; + UText resultText = UTEXT_INITIALIZER; + utext_openUnicodeString(&resultText, &dest, &status); + + if (U_SUCCESS(status)) { + appendTail(&resultText); + utext_close(&resultText); + } + + return dest; +} + +// +// appendTail, UText mode +// +UText *RegexMatcher::appendTail(UText *dest) { + if (fInputLength > fAppendPosition) { + UErrorCode status = U_ZERO_ERROR; + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + int64_t destLen = utext_nativeLength(dest); + utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, fInputLength-fAppendPosition, &status); + } else { + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = fInputLength-fAppendPosition; + } else { + len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); + status = U_ZERO_ERROR; // buffer overflow + } + + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); + utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated + + int64_t destLen = utext_nativeLength(dest); + utext_replace(dest, destLen, destLen, inputChars, len16, &status); + + uprv_free(inputChars); + } } return dest; } @@ -343,7 +513,7 @@ int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { err = U_INDEX_OUTOFBOUNDS_ERROR; return -1; } - int32_t e = -1; + int64_t e = -1; if (group == 0) { e = 
fMatchEnd; } else { @@ -352,9 +522,16 @@ int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); U_ASSERT(groupOffset < fPattern->fFrameSize); U_ASSERT(groupOffset >= 0); - e = fFrame->fExtra[groupOffset + 1]; + e = *(int64_t *) &fFrame->fExtra[groupOffset + 2]; + } + + if (e == -1 || UTEXT_USES_U16(fInputText)) { + return e; + } else { + // !!!: Would like a better way to do this! + UErrorCode status = U_ZERO_ERROR; + return utext_extract(fInputText, 0, e, NULL, 0, &status); } - return e; } @@ -366,13 +543,17 @@ int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { //-------------------------------------------------------------------------------- UBool RegexMatcher::find() { // Start at the position of the last match end. (Will be zero if the - // matcher has been reset. + // matcher has been reset.) // if (U_FAILURE(fDeferredStatus)) { return FALSE; } + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + return findUsingChunk(); + } - int32_t startPos = fMatchEnd; + int64_t startPos = fMatchEnd; if (startPos==0) { startPos = fActiveStart; } @@ -389,7 +570,9 @@ UBool RegexMatcher::find() { fHitEnd = TRUE; return FALSE; } - startPos = fInput->moveIndex32(startPos, 1); + UTEXT_SETNATIVEINDEX(fInputText, startPos); + UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); } } else { if (fLastMatchEnd >= 0) { @@ -406,14 +589,20 @@ UBool RegexMatcher::find() { // the minimum length match would extend past the end of the input. // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. // Be aware of possible overflows if making changes here. 
- int32_t testLen = fActiveLimit - fPattern->fMinMatchLen; - if (startPos > testLen) { - fMatch = FALSE; - fHitEnd = TRUE; - return FALSE; + int64_t testStartLimit; + if (UTEXT_USES_U16(fInputText)) { + testStartLimit = fActiveLimit - fPattern->fMinMatchLen; + if (startPos > testStartLimit) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + } else { + // For now, let the matcher discover that it can't match on its own + // We don't know how long the match len is in native characters + testStartLimit = fActiveLimit; } - const UChar *inputBuf = fInput->getBuffer(); UChar32 c; U_ASSERT(startPos >= 0); @@ -429,14 +618,16 @@ UBool RegexMatcher::find() { if (fMatch) { return TRUE; } - if (startPos >= testLen) { + if (startPos >= testStartLimit) { fHitEnd = TRUE; return FALSE; } - U16_FWD_1(inputBuf, startPos, fActiveLimit); + UTEXT_SETNATIVEINDEX(fInputText, startPos); + UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop - // runs with startPos == testLen the last time through. + // runs with startPos == testStartLimit the last time through. } U_ASSERT(FALSE); @@ -458,24 +649,28 @@ UBool RegexMatcher::find() { { // Match may start on any char from a pre-computed set. 
U_ASSERT(fPattern->fMinMatchLen > 0); + int64_t pos; + UTEXT_SETNATIVEINDEX(fInputText, startPos); for (;;) { - int32_t pos = startPos; - U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + c = UTEXT_NEXT32(fInputText); + pos = UTEXT_GETNATIVEINDEX(fInputText); if (c<256 && fPattern->fInitialChars8->contains(c) || c>=256 && fPattern->fInitialChars->contains(c)) { - MatchAt(pos, FALSE, fDeferredStatus); + MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return FALSE; } if (fMatch) { return TRUE; } + UTEXT_SETNATIVEINDEX(fInputText, pos); } - if (pos >= testLen) { + if (startPos >= testStartLimit) { fMatch = FALSE; fHitEnd = TRUE; return FALSE; } + startPos = pos; } } U_ASSERT(FALSE); @@ -486,24 +681,28 @@ UBool RegexMatcher::find() { // Match starts on exactly one char. U_ASSERT(fPattern->fMinMatchLen > 0); UChar32 theChar = fPattern->fInitialChar; + int64_t pos; + UTEXT_SETNATIVEINDEX(fInputText, startPos); for (;;) { - int32_t pos = startPos; - U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + c = UTEXT_NEXT32(fInputText); + pos = UTEXT_GETNATIVEINDEX(fInputText); if (c == theChar) { - MatchAt(pos, FALSE, fDeferredStatus); + MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return FALSE; } if (fMatch) { return TRUE; } + UTEXT_SETNATIVEINDEX(fInputText, pos); } - if (pos >= testLen) { + if (startPos >= testStartLimit) { fMatch = FALSE; fHitEnd = TRUE; return FALSE; } - } + startPos = pos; + } } U_ASSERT(FALSE); @@ -518,12 +717,17 @@ UBool RegexMatcher::find() { if (fMatch) { return TRUE; } - U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + UTEXT_SETNATIVEINDEX(fInputText, startPos); + c = UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); + } else { + UTEXT_SETNATIVEINDEX(fInputText, startPos); + c = UTEXT_PREVIOUS32(fInputText); + UTEXT_SETNATIVEINDEX(fInputText, startPos); } if 
(fPattern->fFlags & UREGEX_UNIX_LINES) { - for (;;) { - c = inputBuf[startPos-1]; + for (;;) { if (c == 0x0a) { MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { @@ -532,24 +736,26 @@ UBool RegexMatcher::find() { if (fMatch) { return TRUE; } + UTEXT_SETNATIVEINDEX(fInputText, startPos); } - if (startPos >= testLen) { + if (startPos >= testStartLimit) { fMatch = FALSE; fHitEnd = TRUE; return FALSE; } - U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + c = UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop - // runs with startPos == testLen the last time through. + // runs with startPos == testStartLimit the last time through. } } else { for (;;) { - c = inputBuf[startPos-1]; if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { - if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { - startPos++; + if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { + UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); } MatchAt(startPos, FALSE, fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { @@ -558,16 +764,18 @@ UBool RegexMatcher::find() { if (fMatch) { return TRUE; } + UTEXT_SETNATIVEINDEX(fInputText, startPos); } - if (startPos >= testLen) { + if (startPos >= testStartLimit) { fMatch = FALSE; fHitEnd = TRUE; return FALSE; } - U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + c = UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop - // runs with startPos == testLen the last time through. 
+ // runs with startPos == testStartLimit the last time through. } } } @@ -592,15 +800,259 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) { } this->reset(); // Note: Reset() is specified by Java Matcher documentation. // This will reset the region to be the full input length. - if (start < fActiveStart || start > fActiveLimit) { + if (start < 0) { status = U_INDEX_OUTOFBOUNDS_ERROR; return FALSE; } - fMatchEnd = start; + + UBool couldFindStart = TRUE; + int64_t nativeStart; + if (UTEXT_USES_U16(fInputText)) { + nativeStart = start; + } else { + UTEXT_SETNATIVEINDEX(fInputText, 0); + int32_t i = 0; + while (i < start) { + UChar32 c = UTEXT_NEXT32(fInputText); + if (c != U_SENTINEL) { + i += U16_LENGTH(c); + } else { + couldFindStart = FALSE; + break; + } + } + nativeStart = UTEXT_GETNATIVEINDEX(fInputText); + } + if (!couldFindStart || nativeStart < fActiveStart || nativeStart > fActiveLimit) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + fMatchEnd = nativeStart; return find(); } +//-------------------------------------------------------------------------------- +// +// findUsingChunk() -- like find(), but with the advance knowledge that the +// entire string is available in the UText's chunk buffer. +// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::findUsingChunk() { + // Start at the position of the last match end. (Will be zero if the + // matcher has been reset. + // + + int32_t startPos = fMatchEnd; + if (startPos==0) { + startPos = fActiveStart; + } + + const UChar *inputBuf = fInputText->chunkContents; + + if (fMatch) { + // Save the position of any previous successful match. + fLastMatchEnd = fMatchEnd; + + if (fMatchStart == fMatchEnd) { + // Previous match had zero length. Move start position up one position + // to avoid sending find() into a loop on zero-length matches. 
+ if (startPos >= fActiveLimit) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + U16_FWD_1(inputBuf, startPos, fInputLength); + } + } else { + if (fLastMatchEnd >= 0) { + // A previous find() failed to match. Don't try again. + // (without this test, a pattern with a zero-length match + // could match again at the end of an input string.) + fHitEnd = TRUE; + return FALSE; + } + } + + + // Compute the position in the input string beyond which a match can not begin, because + // the minimum length match would extend past the end of the input. + // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. + // Be aware of possible overflows if making changes here. + int32_t testLen = fActiveLimit - fPattern->fMinMatchLen; + if (startPos > testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + + UChar32 c; + U_ASSERT(startPos >= 0); + + switch (fPattern->fStartType) { + case START_NO_INFO: + // No optimization was found. + // Try a match at each input position. + for (;;) { + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + if (startPos >= testLen) { + fHitEnd = TRUE; + return FALSE; + } + U16_FWD_1(inputBuf, startPos, fActiveLimit); + // Note that it's perfectly OK for a pattern to have a zero-length + // match at the end of a string, so we must make sure that the loop + // runs with startPos == testLen the last time through. + } + U_ASSERT(FALSE); + + case START_START: + // Matches are only possible at the start of the input string + // (pattern begins with ^ or \A) + if (startPos > fActiveStart) { + fMatch = FALSE; + return FALSE; + } + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + return fMatch; + + + case START_SET: + { + // Match may start on any char from a pre-computed set. 
+ U_ASSERT(fPattern->fMinMatchLen > 0); + for (;;) { + int32_t pos = startPos; + U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + if (c<256 && fPattern->fInitialChars8->contains(c) || + c>=256 && fPattern->fInitialChars->contains(c)) { + MatchChunkAt(pos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (pos >= testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + } + } + U_ASSERT(FALSE); + + case START_STRING: + case START_CHAR: + { + // Match starts on exactly one char. + U_ASSERT(fPattern->fMinMatchLen > 0); + UChar32 theChar = fPattern->fInitialChar; + for (;;) { + int32_t pos = startPos; + U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; + if (c == theChar) { + MatchChunkAt(pos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (pos >= testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + } + } + U_ASSERT(FALSE); + + case START_LINE: + { + UChar32 c; + if (startPos == fAnchorStart) { + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + U16_FWD_1(inputBuf, startPos, fActiveLimit); + } + + if (fPattern->fFlags & UREGEX_UNIX_LINES) { + for (;;) { + c = inputBuf[startPos-1]; + if (c == 0x0a) { + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (startPos >= testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + U16_FWD_1(inputBuf, startPos, fActiveLimit); + // Note that it's perfectly OK for a pattern to have a zero-length + // match at the end of a string, so we must make sure that the loop + // runs with startPos == testLen the last time through. 
+ } + } else { + for (;;) { + c = inputBuf[startPos-1]; + if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { + if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { + startPos++; + } + MatchChunkAt(startPos, FALSE, fDeferredStatus); + if (U_FAILURE(fDeferredStatus)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + } + if (startPos >= testLen) { + fMatch = FALSE; + fHitEnd = TRUE; + return FALSE; + } + U16_FWD_1(inputBuf, startPos, fActiveLimit); + // Note that it's perfectly OK for a pattern to have a zero-length + // match at the end of a string, so we must make sure that the loop + // runs with startPos == testLen the last time through. + } + } + } + + default: + U_ASSERT(FALSE); + } + + U_ASSERT(FALSE); + return FALSE; +} + + //-------------------------------------------------------------------------------- // @@ -611,44 +1063,187 @@ UnicodeString RegexMatcher::group(UErrorCode &status) const { return group(0, status); } - - -UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { - int32_t s = start(groupNum, status); - int32_t e = end(groupNum, status); - - // Note: calling start() and end() above will do all necessary checking that - // the group number is OK and that a match exists. status will be set. 
- if (U_FAILURE(status)) { - return UnicodeString(); - } - if (U_FAILURE(fDeferredStatus)) { - status = fDeferredStatus; - return UnicodeString(); - } - - if (s < 0) { - // A capture group wasn't part of the match - return UnicodeString(); - } - U_ASSERT(s <= e); - return UnicodeString(*fInput, s, e-s); +UText *RegexMatcher::group(UText *dest, UErrorCode &status) const { + return group(0, dest, status); } +UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { + UnicodeString result; + UText resultText = UTEXT_INITIALIZER; + utext_openUnicodeString(&resultText, &result, &status); + group(groupNum, &resultText, status); + utext_close(&resultText); + return result; +} + +UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const { + UBool bailOut = FALSE; + if (U_FAILURE(status)) { + bailOut = TRUE; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + bailOut = TRUE; + } + + if (fMatch == FALSE) { + status = U_REGEX_INVALID_STATE; + bailOut = TRUE; + } + if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + bailOut = TRUE; + } + + if (bailOut) { + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); + return dest; + } else { + return utext_openUChars(NULL, NULL, 0, &status); + } + } + + int64_t s, e; + if (groupNum == 0) { + s = fMatchStart; + e = fMatchEnd; + } else { + int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); + U_ASSERT(groupOffset < fPattern->fFrameSize); + U_ASSERT(groupOffset >= 0); + s = *(int64_t *) &fFrame->fExtra[groupOffset]; + e = *(int64_t *) &fFrame->fExtra[groupOffset+2]; + } + + if (s < 0) { + // A capture group wasn't part of the match + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); + return dest; + } else { + return utext_openUChars(NULL, NULL, 0, &status); + } + } + U_ASSERT(s <= e); + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + 
U_ASSERT(e <= fInputLength); + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, e-s, &status); + } else { + UText groupText = UTEXT_INITIALIZER; + utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status); + dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); + utext_close(&groupText); + } + } else { + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = e-s; + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); + } + UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); + utext_extract(fInputText, s, e, groupChars, len16+1, &status); + + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); + } else { + UText groupText = UTEXT_INITIALIZER; + utext_openUChars(&groupText, groupChars, len16, &status); + dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); + utext_close(&groupText); + } + + uprv_free(groupChars); + } + return dest; +} + +//-------------------------------------------------------------------------------- +// +// appendGroup() -- currently internal only, appends a group to a UText rather +// than replacing its contents +// +//-------------------------------------------------------------------------------- + +int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { + int64_t destLen = utext_nativeLength(dest); + + if (U_FAILURE(status)) { + return utext_replace(dest, destLen, destLen, NULL, 0, &status); + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return utext_replace(dest, destLen, destLen, NULL, 0, &status); + } + + if (fMatch == FALSE) { + status = U_REGEX_INVALID_STATE; + return utext_replace(dest, destLen, destLen, NULL, 0, &status); + } + if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return utext_replace(dest, destLen, destLen, NULL, 0, 
&status); + } + + int64_t s, e; + if (groupNum == 0) { + s = fMatchStart; + e = fMatchEnd; + } else { + int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); + U_ASSERT(groupOffset < fPattern->fFrameSize); + U_ASSERT(groupOffset >= 0); + s = *(int64_t *) &fFrame->fExtra[groupOffset]; + e = *(int64_t *) &fFrame->fExtra[groupOffset+2]; + } + + if (s < 0) { + // A capture group wasn't part of the match + return utext_replace(dest, destLen, destLen, NULL, 0, &status); + } + U_ASSERT(s <= e); + + int64_t deltaLen; + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + U_ASSERT(e <= fInputLength); + deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, e-s, &status); + } else { + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = e-s; + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); + } + UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); + utext_extract(fInputText, s, e, groupChars, len16+1, &status); + + deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); + uprv_free(groupChars); + } + return deltaLen; +} + + + +//-------------------------------------------------------------------------------- +// +// groupCount() +// +//-------------------------------------------------------------------------------- int32_t RegexMatcher::groupCount() const { return fPattern->fGroupMap->size(); } -const UnicodeString &RegexMatcher::input() const { - return *fInput; -} - - //-------------------------------------------------------------------------------- // // hasAnchoringBounds() @@ -679,6 +1274,104 @@ UBool RegexMatcher::hitEnd() const { return fHitEnd; } + +//-------------------------------------------------------------------------------- +// +// input() +// +//-------------------------------------------------------------------------------- +const UnicodeString &RegexMatcher::input() const { + if (!fInput) { + 
UErrorCode status = U_ZERO_ERROR; + int32_t len16; + if (UTEXT_USES_U16(fInputText)) { + len16 = fInputLength; + } else { + len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); + status = U_ZERO_ERROR; // overflow, length status + } + UnicodeString *result = new UnicodeString(len16, 0, 0); + + UChar *inputChars = result->getBuffer(len16); + utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning + result->releaseBuffer(len16); + + (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= + } + + return *fInput; +} + +//-------------------------------------------------------------------------------- +// +// inputText() +// +//-------------------------------------------------------------------------------- +UText *RegexMatcher::inputText() const { + return fInputText; +} + + +//-------------------------------------------------------------------------------- +// +// getInput() -- like inputText(), but makes a clone or copies into another UText +// +//-------------------------------------------------------------------------------- +UText *RegexMatcher::getInput (UText *dest) const { + UErrorCode status = U_ZERO_ERROR; // ignored + if (dest) { + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, fInputLength, &status); + } else { + int32_t input16Len; + if (UTEXT_USES_U16(fInputText)) { + input16Len = fInputLength; + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error + } + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); + + status = U_ZERO_ERROR; + utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning + status = U_ZERO_ERROR; + utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); + + 
uprv_free(inputChars); + } + return dest; + } else { + return utext_clone(NULL, fInputText, FALSE, TRUE, &status); + } +} + + +static UBool compat_SyncMutableUTextContents(UText *ut); +static UBool compat_SyncMutableUTextContents(UText *ut) { + UBool retVal = FALSE; + + // In the following test, we're really only interested in whether the UText should switch + // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents + // will still point to the correct data. + if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { + UnicodeString *us=(UnicodeString *)ut->context; + + // Update to the latest length. + // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). + int32_t newLength = us->length(); + + // Update the chunk description. + // The buffer may have switched between stack- and heap-based. + ut->chunkContents = us->getBuffer(); + ut->chunkLength = newLength; + ut->chunkNativeLimit = newLength; + ut->nativeIndexingLimit = newLength; + retVal = TRUE; + } + + return retVal; +} + //-------------------------------------------------------------------------------- // // lookingAt() @@ -692,8 +1385,21 @@ UBool RegexMatcher::lookingAt(UErrorCode &status) { status = fDeferredStatus; return FALSE; } - resetPreserveRegion(); - MatchAt(fActiveStart, FALSE, status); + + if (fInputUniStrMaybeMutable) { + if (compat_SyncMutableUTextContents(fInputText)) { + fInputLength = utext_nativeLength(fInputText); + reset(); + } + } + else { + resetPreserveRegion(); + } + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + MatchChunkAt(fActiveStart, FALSE, status); + } else { + MatchAt(fActiveStart, FALSE, status); + } return fMatch; } @@ -707,11 +1413,47 @@ UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) { return FALSE; } reset(); - if (start < fActiveStart || start > fActiveLimit) { + + if (start < 0) { status = U_INDEX_OUTOFBOUNDS_ERROR; return FALSE; } - MatchAt(start, FALSE, status); + + if 
(fInputUniStrMaybeMutable) { + if (compat_SyncMutableUTextContents(fInputText)) { + fInputLength = utext_nativeLength(fInputText); + reset(); + } + } + + int64_t nativeStart; + UBool couldFindStart = TRUE; + if (UTEXT_USES_U16(fInputText)) { + nativeStart = start; + } else { + UTEXT_SETNATIVEINDEX(fInputText, 0); + int32_t i = 0; + while (i < start) { + UChar32 c = UTEXT_NEXT32(fInputText); + if (c != U_SENTINEL) { + i += U16_LENGTH(c); + } else { + couldFindStart = FALSE; + break; + } + } + nativeStart = UTEXT_GETNATIVEINDEX(fInputText); + } + if (!couldFindStart || nativeStart < fActiveStart || nativeStart > fActiveLimit) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + MatchChunkAt(nativeStart, FALSE, status); + } else { + MatchAt(nativeStart, FALSE, status); + } return fMatch; } @@ -730,8 +1472,22 @@ UBool RegexMatcher::matches(UErrorCode &status) { status = fDeferredStatus; return FALSE; } - resetPreserveRegion(); - MatchAt(fActiveStart, TRUE, status); + + if (fInputUniStrMaybeMutable) { + if (compat_SyncMutableUTextContents(fInputText)) { + fInputLength = utext_nativeLength(fInputText); + reset(); + } + } + else { + resetPreserveRegion(); + } + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + MatchChunkAt(fActiveStart, TRUE, status); + } else { + MatchAt(fActiveStart, TRUE, status); + } return fMatch; } @@ -745,11 +1501,47 @@ UBool RegexMatcher::matches(int32_t start, UErrorCode &status) { return FALSE; } reset(); - if (start < fActiveStart || start > fActiveLimit) { + + if (start < 0) { status = U_INDEX_OUTOFBOUNDS_ERROR; return FALSE; } - MatchAt(start, TRUE, status); + + if (fInputUniStrMaybeMutable) { + if (compat_SyncMutableUTextContents(fInputText)) { + fInputLength = utext_nativeLength(fInputText); + reset(); + } + } + + int64_t nativeStart; + UBool couldFindStart = TRUE; + if (UTEXT_USES_U16(fInputText)) { + nativeStart = start; + } else { + 
UTEXT_SETNATIVEINDEX(fInputText, 0); + int32_t i = 0; + while (i < start) { + UChar32 c = UTEXT_NEXT32(fInputText); + if (c != U_SENTINEL) { + i += U16_LENGTH(c); + } else { + couldFindStart = FALSE; + break; + } + } + nativeStart = UTEXT_GETNATIVEINDEX(fInputText); + } + if (!couldFindStart || nativeStart < fActiveStart || nativeStart > fActiveLimit) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { + MatchChunkAt(nativeStart, TRUE, status); + } else { + MatchAt(nativeStart, TRUE, status); + } return fMatch; } @@ -775,21 +1567,68 @@ RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &sta if (U_FAILURE(status)) { return *this; } - if (start>limit || start<0 || limit<0 || limit>fInput->length()) { + + if (start>limit || start<0 || limit<0) { status = U_ILLEGAL_ARGUMENT_ERROR; } + + int64_t nativeStart; + int32_t i = 0; + UBool couldFindStart = TRUE; + if (UTEXT_USES_U16(fInputText)) { + nativeStart = start; + couldFindStart = (nativeStart <= fInputLength); + } else { + UTEXT_SETNATIVEINDEX(fInputText, 0); + while (i < start) { + UChar32 c = UTEXT_NEXT32(fInputText); + if (c != U_SENTINEL) { + i += U16_LENGTH(c); + } else { + couldFindStart = FALSE; + break; + } + } + nativeStart = UTEXT_GETNATIVEINDEX(fInputText); + } + int64_t nativeLimit = nativeStart; + + if (!couldFindStart) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } else { + UBool couldFindLimit = TRUE; + if (UTEXT_USES_U16(fInputText)) { + nativeLimit = limit; + couldFindLimit = (nativeLimit <= fInputLength); + } else { + while (i < limit) { + UChar32 c = UTEXT_NEXT32(fInputText); + if (c != U_SENTINEL) { + i += U16_LENGTH(c); + } else { + couldFindLimit = FALSE; + break; + } + } + nativeLimit = UTEXT_GETNATIVEINDEX(fInputText); + } + if (!couldFindLimit) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } + } + this->reset(); - fRegionStart = start; - fRegionLimit = limit; - fActiveStart = start; - fActiveLimit = limit; + 
fRegionStart = nativeStart; + fRegionLimit = nativeLimit; + fActiveStart = nativeStart; + fActiveLimit = nativeLimit; if (!fTransparentBounds) { - fLookStart = start; - fLookLimit = limit; + fLookStart = nativeStart; + fLookLimit = nativeLimit; } if (fAnchoringBounds) { - fAnchorStart = start; - fAnchorLimit = limit; + fAnchorStart = nativeStart; + fAnchorLimit = nativeLimit; } return *this; } @@ -802,7 +1641,13 @@ RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &sta // //-------------------------------------------------------------------------------- int32_t RegexMatcher::regionEnd() const { - return fRegionLimit; + if (UTEXT_USES_U16(fInputText)) { + return fRegionLimit; + } else { + // !!!: Would like a better way to do this! + UErrorCode status = U_ZERO_ERROR; + return utext_extract(fInputText, 0, fRegionLimit, NULL, 0, &status); + } } @@ -812,7 +1657,13 @@ int32_t RegexMatcher::regionEnd() const { // //-------------------------------------------------------------------------------- int32_t RegexMatcher::regionStart() const { - return fRegionStart; + if (UTEXT_USES_U16(fInputText)) { + return fRegionStart; + } else { + // !!!: Would like a better way to do this! 
+ UErrorCode status = U_ZERO_ERROR; + return utext_extract(fInputText, 0, fRegionStart, NULL, 0, &status); + } } @@ -822,26 +1673,56 @@ int32_t RegexMatcher::regionStart() const { // //-------------------------------------------------------------------------------- UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { - if (U_FAILURE(status)) { - return *fInput; - } - if (U_FAILURE(fDeferredStatus)) { - status = fDeferredStatus; - return *fInput; - } - UnicodeString destString; - reset(); - while (find()) { - appendReplacement(destString, replacement, status); - if (U_FAILURE(status)) { - break; - } - } - appendTail(destString); - return destString; + UText replacementText = UTEXT_INITIALIZER; + UText resultText = UTEXT_INITIALIZER; + UnicodeString resultString; + + utext_openConstUnicodeString(&replacementText, &replacement, &status); + utext_openUnicodeString(&resultText, &resultString, &status); + + replaceAll(&replacementText, &resultText, status); + + utext_close(&resultText); + utext_close(&replacementText); + + return resultString; } +// +// replaceAll, UText mode +// +UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { + if (U_FAILURE(status)) { + return getInput(dest); + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return getInput(dest); + } + + if (dest == NULL) { + UnicodeString emptyString; + UText empty = UTEXT_INITIALIZER; + + utext_openUnicodeString(&empty, &emptyString, &status); + dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); + utext_close(&empty); + } + + if (U_SUCCESS(status)) { + reset(); + while (find()) { + appendReplacement(dest, replacement, status); + if (U_FAILURE(status)) { + break; + } + } + appendTail(dest); + } + + return dest; +} //-------------------------------------------------------------------------------- @@ -850,23 +1731,51 @@ UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorC // 
//-------------------------------------------------------------------------------- UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { + UText replacementText = UTEXT_INITIALIZER; + UText resultText = UTEXT_INITIALIZER; + UnicodeString resultString; + + utext_openConstUnicodeString(&replacementText, &replacement, &status); + utext_openUnicodeString(&resultText, &resultString, &status); + + replaceFirst(&replacementText, &resultText, status); + + utext_close(&resultText); + utext_close(&replacementText); + + return resultString; +} + +// +// replaceFirst, UText mode +// +UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { if (U_FAILURE(status)) { - return *fInput; + return getInput(dest); } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; - return *fInput; + return getInput(dest); } reset(); if (!find()) { - return *fInput; + return getInput(dest); } - - UnicodeString destString; - appendReplacement(destString, replacement, status); - appendTail(destString); - return destString; + + if (dest == NULL) { + UnicodeString emptyString; + UText empty = UTEXT_INITIALIZER; + + utext_openUnicodeString(&empty, &emptyString, &status); + dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); + utext_close(&empty); + } + + appendReplacement(dest, replacement, status); + appendTail(dest); + + return dest; } @@ -887,13 +1796,13 @@ UBool RegexMatcher::requireEnd() const { //-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::reset() { fRegionStart = 0; - fRegionLimit = fInput->length(); + fRegionLimit = fInputLength; fActiveStart = 0; - fActiveLimit = fRegionLimit; + fActiveLimit = fInputLength; fAnchorStart = 0; - fAnchorLimit = fRegionLimit; + fAnchorLimit = fInputLength; fLookStart = 0; - fLookLimit = fRegionLimit; + fLookLimit = fInputLength; resetPreserveRegion(); return *this; } @@ -910,21 +1819,55 @@ void 
RegexMatcher::resetPreserveRegion() { fRequireEnd = FALSE; fTime = 0; fTickCounter = TIMER_INITIAL_VALUE; - resetStack(); + //resetStack(); // more expensive than it looks... } RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { - fInput = &input; + fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); + if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); + fInputLength = utext_nativeLength(fInputText); + reset(); + delete fInput; + fInput = NULL; + + // Do the following for any UnicodeString. + // This is for compatibility for those clients who modify the input string "live" during regex operations. + fInputUniStrMaybeMutable = TRUE; + if (fWordBreakItr != NULL) { - #if UCONFIG_NO_BREAK_ITERATION==0 - fWordBreakItr->setText(input); - #endif +#if UCONFIG_NO_BREAK_ITERATION==0 + UErrorCode status = U_ZERO_ERROR; + fWordBreakItr->setText(fInputText, status); +#endif } return *this; } + +RegexMatcher &RegexMatcher::reset(UText *input) { + if (fInputText != input) { + fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); + if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); + fInputLength = utext_nativeLength(fInputText); + + delete fInput; + fInput = NULL; + + if (fWordBreakItr != NULL) { +#if UCONFIG_NO_BREAK_ITERATION==0 + UErrorCode status = U_ZERO_ERROR; + fWordBreakItr->setText(input, status); +#endif + } + } + reset(); + fInputUniStrMaybeMutable = FALSE; + + return *this; +} + /*RegexMatcher &RegexMatcher::reset(const UChar *) { fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; return *this; @@ -936,11 +1879,35 @@ RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) { return *this; } reset(); // Reset also resets the region to be the entire string. 
- if (position < 0 || position >= fActiveLimit) { + + if (position < 0) { status = U_INDEX_OUTOFBOUNDS_ERROR; return *this; } - fMatchEnd = position; + + int64_t nativePos; + UBool couldFindStart = TRUE; + if (UTEXT_USES_U16(fInputText)) { + nativePos = position; + } else { + UTEXT_SETNATIVEINDEX(fInputText, 0); + int32_t i = 0; + while (i < position) { + UChar32 c = UTEXT_NEXT32(fInputText); + if (c != U_SENTINEL) { + i += U16_LENGTH(c); + } else { + couldFindStart = FALSE; + break; + } + } + nativePos = UTEXT_GETNATIVEINDEX(fInputText); + } + if (!couldFindStart || nativePos < fActiveStart || nativePos >= fActiveLimit) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return *this; + } + fMatchEnd = nativePos; return *this; } @@ -967,7 +1934,35 @@ void RegexMatcher::setTrace(UBool state) { int32_t RegexMatcher::split(const UnicodeString &input, UnicodeString dest[], int32_t destCapacity, - UErrorCode &status) + UErrorCode &status) +{ + UText inputText = UTEXT_INITIALIZER; + utext_openConstUnicodeString(&inputText, &input, &status); + + UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); + int32_t i; + for (i = 0; i < destCapacity; i++) { + destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); + } + + int32_t fieldCount = split(&inputText, destText, destCapacity, status); + + for (i = 0; i < destCapacity; i++) { + utext_close(destText[i]); + } + + uprv_free(destText); + utext_close(&inputText); + return fieldCount; +} + +// +// split, UText mode +// +int32_t RegexMatcher::split(UText *input, + UText *dest[], + int32_t destCapacity, + UErrorCode &status) { // // Check arguements for validity @@ -985,7 +1980,7 @@ int32_t RegexMatcher::split(const UnicodeString &input, // Reset for the input text // reset(input); - int32_t nextOutputStringStart = 0; + int64_t nextOutputStringStart = 0; if (fActiveLimit == 0) { return 0; } @@ -999,38 +1994,94 @@ int32_t RegexMatcher::split(const UnicodeString &input, if (i>=destCapacity-1) { // There is one or 
zero output string left. // Fill the last output string with whatever is left from the input, then exit the loop. - // ( i will be == destCapicity if we filled the output array while processing + // ( i will be == destCapacity if we filled the output array while processing // capture groups of the delimiter expression, in which case we will discard the // last capture group saved in favor of the unprocessed remainder of the // input string.) i = destCapacity-1; - int32_t remainingLength = fActiveLimit-nextOutputStringStart; - if (remainingLength > 0) { - dest[i].setTo(input, nextOutputStringStart, remainingLength); + if (fActiveLimit > nextOutputStringStart) { + if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), input->chunkContents+nextOutputStringStart, fActiveLimit-nextOutputStringStart, &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, fActiveLimit-nextOutputStringStart, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); + UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); + + utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + + uprv_free(remainingChars); + } } break; } if (find()) { // We found another delimiter. 
Move everything from where we started looking // up until the start of the delimiter into the next output string. - int32_t fieldLen = fMatchStart - nextOutputStringStart; - dest[i].setTo(input, nextOutputStringStart, fieldLen); + if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), input->chunkContents+nextOutputStringStart, fMatchStart-nextOutputStringStart, &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, fMatchStart-nextOutputStringStart, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); + UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); + + utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + + uprv_free(remainingChars); + } nextOutputStringStart = fMatchEnd; // If the delimiter pattern has capturing parentheses, the captured // text goes out into the next n destination strings. int32_t groupNum; + UBool lastGroupWasNullUText = FALSE; for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { if (i==destCapacity-1) { break; } i++; - dest[i] = group(groupNum, status); + lastGroupWasNullUText = (dest[i] == NULL ? 
TRUE : FALSE); + dest[i] = group(groupNum, dest[i], status); } if (nextOutputStringStart == fActiveLimit) { // The delimiter was at the end of the string. We're done. break; + } else if (i == destCapacity-1) { + // We're out of capture groups, and the rest of the string is more important + if (lastGroupWasNullUText) { + utext_close(dest[i]); + dest[i] = NULL; + } } } @@ -1038,7 +2089,32 @@ int32_t RegexMatcher::split(const UnicodeString &input, { // We ran off the end of the input while looking for the next delimiter. // All the remaining text goes into the current output string. - dest[i].setTo(input, nextOutputStringStart, fActiveLimit-nextOutputStringStart); + if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), input->chunkContents+nextOutputStringStart, fActiveLimit-nextOutputStringStart, &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, fActiveLimit-nextOutputStringStart, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); + UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); + + utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); + if (dest[i]) { + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); + } else { + UText remainingText = UTEXT_INITIALIZER; + utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); + utext_close(&remainingText); + } + + uprv_free(remainingChars); + } break; } } @@ -1046,7 +2122,6 @@ int32_t RegexMatcher::split(const 
UnicodeString &input, } - //-------------------------------------------------------------------------------- // // start @@ -1080,16 +2155,23 @@ int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { status = U_INDEX_OUTOFBOUNDS_ERROR; return -1; } - int32_t s; + int64_t s; if (group == 0) { s = fMatchStart; } else { int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); U_ASSERT(groupOffset < fPattern->fFrameSize); U_ASSERT(groupOffset >= 0); - s = fFrame->fExtra[groupOffset]; + s = *(int64_t *) &fFrame->fExtra[groupOffset]; + } + + if (s == -1 || UTEXT_USES_U16(fInputText)) { + return s; + } else { + // !!!: Would like a better way to do this! + UErrorCode status = U_ZERO_ERROR; + return utext_extract(fInputText, 0, s, NULL, 0, &status); } - return s; } @@ -1101,9 +2183,8 @@ int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { //-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { fAnchoringBounds = b; - UErrorCode status = U_ZERO_ERROR; - region(fRegionStart, fRegionLimit, status); - U_ASSERT(U_SUCCESS(status)); + fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); + fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); return *this; } @@ -1115,9 +2196,8 @@ RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { //-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { fTransparentBounds = b; - UErrorCode status = U_ZERO_ERROR; - region(fRegionStart, fRegionLimit, status); - U_ASSERT(U_SUCCESS(status)); + fLookStart = (fTransparentBounds ? 0 : fRegionStart); + fLookLimit = (fTransparentBounds ? 
fInputLength : fRegionLimit); return *this; } @@ -1210,11 +2290,11 @@ int32_t RegexMatcher::getStackLimit() const { void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, const void *context, UErrorCode &status) { - if (U_FAILURE(status)) { - return; - } - fCallbackFn = callback; - fCallbackContext = context; + if (U_FAILURE(status)) { + return; + } + fCallbackFn = callback; + fCallbackContext = context; } @@ -1251,16 +2331,16 @@ void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, //-------------------------------------------------------------------------------- REStackFrame *RegexMatcher::resetStack() { // Discard any previous contents of the state save stack, and initialize a - // new stack frame to all -1. The -1s are needed for capture group limits, where - // they indicate that a group has not yet matched anything. + // new stack frame with all -1 data. The -1s are needed for capture group limits, + // where they indicate that a group has not yet matched anything. fStack->removeAllElements(); - int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); + REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); int32_t i; - for (i=0; ifFrameSize; i++) { - iFrame[i] = -1; + for (i=0; ifFrameSize-3; i++) { + iFrame->fExtra[i] = -1; } - return (REStackFrame *)iFrame; + return iFrame; } @@ -1281,7 +2361,7 @@ REStackFrame *RegexMatcher::resetStack() { // TODO: double-check edge cases at region boundaries. // //-------------------------------------------------------------------------------- -UBool RegexMatcher::isWordBoundary(int32_t pos) { +UBool RegexMatcher::isWordBoundary(int64_t pos) { UBool isBoundary = FALSE; UBool cIsWord = FALSE; @@ -1290,7 +2370,8 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) { } else { // Determine whether char c at current position is a member of the word set of chars. 
// If we're off the end of the string, behave as though we're not at a word char. - UChar32 c = fInput->char32At(pos); + UTEXT_SETNATIVEINDEX(fInputText, pos); + UChar32 c = UTEXT_CURRENT32(fInputText); if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { // Current char is a combining one. Not a boundary. return FALSE; @@ -1301,13 +2382,50 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) { // Back up until we come to a non-combining char, determine whether // that char is a word char. UBool prevCIsWord = FALSE; - int32_t prevPos = pos; for (;;) { - if (prevPos <= fLookStart) { + if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { break; } - prevPos = fInput->moveIndex32(prevPos, -1); - UChar32 prevChar = fInput->char32At(prevPos); + UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); + if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) + || u_charType(prevChar) == U_FORMAT_CHAR)) { + prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); + break; + } + } + isBoundary = cIsWord ^ prevCIsWord; + return isBoundary; +} + +UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { + UBool isBoundary = FALSE; + UBool cIsWord = FALSE; + + const UChar *inputBuf = fInputText->chunkContents; + + if (pos >= fLookLimit) { + fHitEnd = TRUE; + } else { + // Determine whether char c at current position is a member of the word set of chars. + // If we're off the end of the string, behave as though we're not at a word char. + UChar32 c; + U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); + if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { + // Current char is a combining one. Not a boundary. + return FALSE; + } + cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); + } + + // Back up until we come to a non-combining char, determine whether + // that char is a word char. 
+ UBool prevCIsWord = FALSE; + for (;;) { + if (pos <= fLookStart) { + break; + } + UChar32 prevChar; + U16_PREV(inputBuf, fLookStart, pos, prevChar); if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) || u_charType(prevChar) == U_FORMAT_CHAR)) { prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); @@ -1327,7 +2445,7 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) { // parameters: pos - the current position in the input buffer // //-------------------------------------------------------------------------------- -UBool RegexMatcher::isUWordBoundary(int32_t pos) { +UBool RegexMatcher::isUWordBoundary(int64_t pos) { UBool returnVal = FALSE; #if UCONFIG_NO_BREAK_ITERATION==0 @@ -1338,7 +2456,7 @@ UBool RegexMatcher::isUWordBoundary(int32_t pos) { if (U_FAILURE(fDeferredStatus)) { return FALSE; } - fWordBreakItr->setText(*fInput); + fWordBreakItr->setText(fInputText, fDeferredStatus); } if (pos >= fLookLimit) { @@ -1347,6 +2465,11 @@ UBool RegexMatcher::isUWordBoundary(int32_t pos) { // words are not boundaries. All non-word chars stand by themselves, // with word boundaries on both sides. } else { + if (!UTEXT_USES_U16(fInputText)) { + // !!!: Would like a better way to do this! + UErrorCode status = U_ZERO_ERROR; + pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); + } returnVal = fWordBreakItr->isBoundary(pos); } #endif @@ -1442,30 +2565,39 @@ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatId // toEnd: if true, match must extend to end of the input region // //-------------------------------------------------------------------------------- -void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { +void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { UBool isMatch = FALSE; // True if the we have a match. 
+ + int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards int32_t op; // Operation from the compiled pattern, split into int32_t opType; // the opcode int32_t opValue; // and the operand value. - + #ifdef REGEX_RUN_DEBUG if (fTraceDebug) { - printf("MatchAt(startIdx=%d)\n", startIdx); + printf("MatchAt(startIdx=%ld)\n", startIdx); printf("Original Pattern: "); - int32_t i; - for (i=0; ifPattern.length(); i++) { - printf("%c", fPattern->fPattern.charAt(i)); + UChar32 c = utext_next32From(fPattern->fPattern, 0); + while (c != U_SENTINEL) { + if (c<32 || c>256) { + c = '.'; + } + REGEX_DUMP_DEBUG_PRINTF(("%c", c)); + + c = UTEXT_NEXT32(fPattern->fPattern); } printf("\n"); printf("Input String: "); - for (i=0; ilength(); i++) { - UChar c = fInput->charAt(i); + c = utext_next32From(fInputText, 0); + while (c != U_SENTINEL) { if (c<32 || c>256) { c = '.'; } printf("%c", c); + + c = UTEXT_NEXT32(fInputText); } printf("\n"); printf("\n"); @@ -1483,8 +2615,6 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { const UChar *litText = fPattern->fLiteralText.getBuffer(); UVector *sets = fPattern->fSets; - const UChar *inputBuf = fInput->getBuffer(); - fFrameSize = fPattern->fFrameSize; REStackFrame *fp = resetStack(); @@ -1507,18 +2637,20 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { fprintf(stderr, "Heap Trouble\n"); } #endif + op = pat[fp->fPatIdx]; opType = URX_TYPE(op); opValue = URX_VAL(op); #ifdef REGEX_RUN_DEBUG if (fTraceDebug) { - printf("inputIdx=%d inputChar=%c sp=%3d ", fp->fInputIdx, - fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer()); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, + UTEXT_CURRENT32(fInputText), (int32_t *)fp-fStack->getBuffer(), fActiveLimit); fPattern->dumpOp(fp->fPatIdx); } #endif fp->fPatIdx++; - + switch (opType) { @@ -1536,14 
+2668,45 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { case URX_ONECHAR: if (fp->fInputIdx < fActiveLimit) { - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_NEXT32(fInputText); if (c == opValue) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); break; } } else { fHitEnd = TRUE; } + + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + UBool success = FALSE; + UChar32 c = UTEXT_PREVIOUS32(fInputText); + while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { + if (c == opValue) { + success = TRUE; + break; + } else if (c == U_SENTINEL) { + break; + } + c = UTEXT_PREVIOUS32(fInputText); + } + if (success) { + fHitEnd = FALSE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; @@ -1562,37 +2725,73 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { stringLen = URX_VAL(op); U_ASSERT(opType == URX_STRING_LEN); U_ASSERT(stringLen >= 2); - - if (fp->fInputIdx + stringLen > fActiveLimit) { - // No match. String is longer than the remaining input text. - fHitEnd = TRUE; // TODO: See ticket 6074 - fp = (REStackFrame *)fStack->popFrame(fFrameSize); - break; - } - - const UChar * pInp = inputBuf + fp->fInputIdx; - const UChar * pPat = litText+stringStartIdx; - const UChar * pEnd = pInp + stringLen; - for(;;) { - if (*pInp == *pPat) { - pInp++; - pPat++; - if (pInp == pEnd) { - // Successful Match. 
- fp->fInputIdx += stringLen; - break; + + const UChar *patternChars = litText+stringStartIdx; + const UChar *patternEnd = patternChars+stringLen; + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c; + UBool success = TRUE; + + while (patternChars < patternEnd && success) { + c = UTEXT_NEXT32(fInputText); + + if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) { + if (U_IS_BMP(c)) { + success = (*patternChars == c); + patternChars += 1; + } else if (patternChars+1 < patternEnd) { + success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); + patternChars += 2; } } else { - // Match failed. - fp = (REStackFrame *)fStack->popFrame(fFrameSize); - break; + success = FALSE; + fHitEnd = TRUE; // TODO: See ticket 6074 } } + + if (success) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } else { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size()) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Reset to last start point + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + patternChars = litText+stringStartIdx; + + // Search backwards for a possible start + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else if ((U_IS_BMP(c) && *patternChars == c) || + (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { + success = TRUE; + break; + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); + + // And try again + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } } break; - case URX_STATE_SAVE: fp 
= StateSave(fp, opValue, status); break; @@ -1610,78 +2809,76 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { goto breakFromLoop; // Start and End Capture stack frame variables are layout out like this: - // fp->fExtra[opValue] - The start of a completed capture group - // opValue+1 - The end of a completed capture group - // opValue+2 - the start of a capture group whose end + // fp->fExtra[opValue] - The start of a completed capture group (double-width) + // opValue+2 - The end of a completed capture group (double-width) + // opValue+4 - the start of a capture group whose end // has not yet been reached (and might not ever be). case URX_START_CAPTURE: - U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); - fp->fExtra[opValue+2] = fp->fInputIdx; + U_ASSERT(opValue >= 0 && opValue < fFrameSize-5); + *((int64_t *) &fp->fExtra[opValue+4]) = fp->fInputIdx; break; case URX_END_CAPTURE: - U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); - U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. - fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. - fp->fExtra[opValue+1] = fp->fInputIdx; // End position - U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); + U_ASSERT(opValue >= 0 && opValue < fFrameSize-5); + U_ASSERT(*((int64_t *) &fp->fExtra[opValue+4]) >= 0); // Start pos for this group must be set. + *((int64_t *) &fp->fExtra[opValue]) = *((int64_t *) &fp->fExtra[opValue+4]); // Tentative start becomes real. + *((int64_t *) &fp->fExtra[opValue+2]) = fp->fInputIdx; // End position + U_ASSERT(*((int64_t *) &fp->fExtra[opValue]) <= *((int64_t *) &fp->fExtra[opValue+2])); break; case URX_DOLLAR: // $, test for End of line // or for position before new line at end of input - if (fp->fInputIdx < fAnchorLimit-2) { - // We are no where near the end of input. Fail. - // This is the common case. Keep it first. 
- fp = (REStackFrame *)fStack->popFrame(fFrameSize); - break; - } - if (fp->fInputIdx >= fAnchorLimit) { - // We really are at the end of input. Success. - fHitEnd = TRUE; - fRequireEnd = TRUE; - break; - } - // If we are positioned just before a new-line that is located at the - // end of input, succeed. - if (fp->fInputIdx == fAnchorLimit-1) { - UChar32 c = fInput->char32At(fp->fInputIdx); - if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { - // If not in the middle of a CR/LF sequence - if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { - // At new-line at end of input. Success - fHitEnd = TRUE; - fRequireEnd = TRUE; - break; - } - } - } - - if (fp->fInputIdx == fAnchorLimit-2 && - fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) { + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. fHitEnd = TRUE; fRequireEnd = TRUE; - break; // At CR/LF at end of input. Success + break; + } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // If we are positioned just before a new-line that is located at the + // end of input, succeed. + UChar32 c = UTEXT_NEXT32(fInputText); + if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { + if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { + // If not in the middle of a CR/LF sequence + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { + // At new-line at end of input. Success + fHitEnd = TRUE; + fRequireEnd = TRUE; + + break; + } + } + } else { + UChar32 nextC = UTEXT_NEXT32(fInputText); + if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; // At CR/LF at end of input. 
Success + } + } + + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } - - fp = (REStackFrame *)fStack->popFrame(fFrameSize); - break; case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. - if (fp->fInputIdx >= fAnchorLimit-1) { + if (fp->fInputIdx >= fAnchorLimit) { + // Off the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } else { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_NEXT32(fInputText); // Either at the last character of input, or off the end. - if (fp->fInputIdx == fAnchorLimit-1) { - // At last char of input. Success if it's a new line. - if (fInput->char32At(fp->fInputIdx) == 0x0a) { - fHitEnd = TRUE; - fRequireEnd = TRUE; - break; - } - } else { - // Off the end of input. Success. + if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { fHitEnd = TRUE; fRequireEnd = TRUE; break; @@ -1703,12 +2900,13 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { } // If we are positioned just before a new-line, succeed. // It makes no difference where the new-line is within the input. - UChar32 c = inputBuf[fp->fInputIdx]; + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_CURRENT32(fInputText); if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { // At a line end, except for the odd chance of being in the middle of a CR/LF sequence // In multi-line mode, hitting a new-line just before the end of input does not // set the hitEnd or requireEnd flags - if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { break; } } @@ -1728,7 +2926,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { } // If we are not positioned just before a new-line, the test fails; backtrack out. // It makes no difference where the new-line is within the input. 
- if (inputBuf[fp->fInputIdx] != 0x0a) { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + if (UTEXT_CURRENT32(fInputText) != 0x0a) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } @@ -1750,7 +2949,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { } // Check whether character just before the current pos is a new-line // unless we are at the end of input - UChar c = inputBuf[fp->fInputIdx - 1]; + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar c = UTEXT_PREVIOUS32(fInputText); if ((fp->fInputIdx < fAnchorLimit) && ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { // It's a new-line. ^ is true. Success. @@ -1772,7 +2972,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { } // Check whether character just before the current pos is a new-line U_ASSERT(fp->fInputIdx <= fAnchorLimit); - UChar c = inputBuf[fp->fInputIdx - 1]; + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar c = UTEXT_PREVIOUS32(fInputText); if (c != 0x0a) { // Not at the start of a line. Back-track out. fp = (REStackFrame *)fStack->popFrame(fFrameSize); @@ -1810,12 +3011,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { break; } - UChar32 c = fInput->char32At(fp->fInputIdx); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + UChar32 c = UTEXT_NEXT32(fInputText); int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 
UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); success ^= (opValue != 0); // flip sense for \D if (success) { - fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } @@ -1842,11 +3045,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); // Examine (and consume) the current char. // Dispatch into a little state machine, based on the char. UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); UnicodeSet **sets = fPattern->fStaticSets; if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; @@ -1861,27 +3067,33 @@ void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { GC_L: if (fp->fInputIdx >= fActiveLimit) goto GC_Done; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); if (sets[URX_GC_L]->contains(c)) goto GC_L; if (sets[URX_GC_LV]->contains(c)) goto GC_V; if (sets[URX_GC_LVT]->contains(c)) goto GC_T; if (sets[URX_GC_V]->contains(c)) goto GC_V; - U16_PREV(inputBuf, 0, fp->fInputIdx, c); + UTEXT_PREVIOUS32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); goto GC_Extend; GC_V: if (fp->fInputIdx >= fActiveLimit) goto GC_Done; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); if (sets[URX_GC_V]->contains(c)) goto GC_V; if (sets[URX_GC_T]->contains(c)) goto GC_T; - U16_PREV(inputBuf, 0, fp->fInputIdx, c); + UTEXT_PREVIOUS32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); goto GC_Extend; GC_T: if (fp->fInputIdx >= fActiveLimit) goto 
GC_Done; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); if (sets[URX_GC_T]->contains(c)) goto GC_T; - U16_PREV(inputBuf, 0, fp->fInputIdx, c); + UTEXT_PREVIOUS32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); goto GC_Extend; GC_Extend: @@ -1890,19 +3102,21 @@ GC_Extend: if (fp->fInputIdx >= fActiveLimit) { break; } - U16_GET(inputBuf, 0, fp->fInputIdx, fActiveLimit, c); + c = UTEXT_CURRENT32(fInputText); if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { break; } - U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); + UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } goto GC_Done; GC_Control: // Most control chars stand alone (don't combine with combining chars), // except for that CR/LF sequence is a single grapheme cluster. - if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { - fp->fInputIdx++; + if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } GC_Done: @@ -1942,8 +3156,9 @@ GC_Done: UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); opValue &= ~URX_NEG_SET; U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_NEXT32(fInputText); if (c < 256) { Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; if (s8->contains(c)) { @@ -1955,7 +3170,46 @@ GC_Done: success = !success; } } - if (!success) { + if (success) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } else { + // the character wasn't in the set. 
+ #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + UTEXT_PREVIOUS32(fInputText); // skip the first character we tried + success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c)) { + success = !success; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c)) { + success = !success; + } + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success); + + if (success && c != U_SENTINEL) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } @@ -1973,20 +3227,62 @@ GC_Done: } U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + UChar32 c = UTEXT_NEXT32(fInputText); if (c < 256) { Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; if (s8->contains(c) == FALSE) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); break; } } else { const UnicodeSet *s = fPattern->fStaticSets[opValue]; if (s->contains(c) == FALSE) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); break; } } - + // the character wasn't in the set. 
+ #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + UTEXT_PREVIOUS32(fInputText); // skip the first character we tried + UBool success = FALSE; + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c) == FALSE) { + success = TRUE; + break; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c) == FALSE) { + success = TRUE; + break; + } + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; @@ -1997,25 +3293,68 @@ GC_Done: fHitEnd = TRUE; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; - } - // There is input left. Pick up one char and test it for set membership. - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); - U_ASSERT(opValue > 0 && opValue < sets->size()); - if (c<256) { - Regex8BitSet *s8 = &fPattern->fSets8[opValue]; - if (s8->contains(c)) { - break; - } } else { - UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); - if (s->contains(c)) { - // The character is in the set. A Match. - break; + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + // There is input left. Pick up one char and test it for set membership. 
+ UChar32 c = UTEXT_NEXT32(fInputText); + U_ASSERT(opValue > 0 && opValue < sets->size()); + if (c<256) { + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + if (s8->contains(c)) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } + } else { + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + if (s->contains(c)) { + // The character is in the set. A Match. + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + break; + } } + + // the character wasn't in the set. + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + UTEXT_PREVIOUS32(fInputText); // skip the first character we tried + UBool success = FALSE; + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else if (c < 256) { + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + if (s8->contains(c)) { + success = TRUE; + break; + } + } else { + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + if (s->contains(c)) { + success = TRUE; + break; + } + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } - // the character wasn't in the set. Back track out. - fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; @@ -2028,15 +3367,18 @@ GC_Done: fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + // There is input left. 
Advance over one char, unless we've hit end-of-line - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + UChar32 c = UTEXT_NEXT32(fInputText); if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { // End of line in normal mode. . does not match. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } break; @@ -2050,15 +3392,20 @@ GC_Done: fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + // There is input left. Advance over one char, except if we are // at a cr/lf, advance over both of them. UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + c = UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); if (c==0x0d && fp->fInputIdx < fActiveLimit) { // In the case of a CR/LF, we need to advance over both. - UChar nextc = inputBuf[fp->fInputIdx]; + UChar32 nextc = UTEXT_CURRENT32(fInputText); if (nextc == 0x0a) { - fp->fInputIdx++; + UTEXT_NEXT32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } } } @@ -2075,12 +3422,16 @@ GC_Done: fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + // There is input left. Advance over one char, unless we've hit end-of-line - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + UChar32 c = UTEXT_NEXT32(fInputText); if (c == 0x0a) { // End of line in normal mode. '.' 
does not match the \n fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } else { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } } break; @@ -2111,13 +3462,13 @@ GC_Done: U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); int32_t frameLoc = URX_VAL(stoOp); U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); - int32_t prevInputIdx = fp->fExtra[frameLoc]; + int64_t prevInputIdx = *(int64_t *) &fp->fExtra[frameLoc]; U_ASSERT(prevInputIdx <= fp->fInputIdx); if (prevInputIdx < fp->fInputIdx) { // The match did make progress. Repeat the loop. fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current fp->fPatIdx = opValue; - fp->fExtra[frameLoc] = fp->fInputIdx; + *((int64_t *) &fp->fExtra[frameLoc]) = fp->fInputIdx; } // If the input position did not advance, we do nothing here, // execution will fall out of the loop. @@ -2126,7 +3477,7 @@ GC_Done: case URX_CTR_INIT: { - U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); fp->fExtra[opValue] = 0; // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT has, and @@ -2157,9 +3508,11 @@ GC_Done: int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; int32_t minCount = pat[opValue+2]; int32_t maxCount = pat[opValue+3]; - // Increment the counter. Note: we're not worrying about counter + // Increment the counter. Note: we DIDN'T worry about counter // overflow, since the data comes from UnicodeStrings, which - // stores its length in an int32_t. + // stores its length in an int32_t. Do we have to think about + // this now that we're using UText? Probably not, since the length + // in UChar32s is still an int32_t. 
(*pCounter)++; U_ASSERT(*pCounter > 0); if ((uint32_t)*pCounter >= (uint32_t)maxCount) { @@ -2176,7 +3529,7 @@ GC_Done: case URX_CTR_INIT_NG: { // Initialize a non-greedy loop - U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); fp->fExtra[opValue] = 0; // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT has, and @@ -2208,9 +3561,11 @@ GC_Done: int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; int32_t minCount = pat[opValue+2]; int32_t maxCount = pat[opValue+3]; - // Increment the counter. Note: we're not worrying about counter + // Increment the counter. Note: we DIDN'T worry about counter // overflow, since the data comes from UnicodeStrings, which - // stores its length in an int32_t. + // stores its length in an int32_t. Do we have to think about + // this now that we're using UText? Probably not, since the length + // in UChar32s is still an int32_t. (*pCounter)++; U_ASSERT(*pCounter > 0); @@ -2263,51 +3618,45 @@ GC_Done: case URX_BACKREF_I: { U_ASSERT(opValue < fFrameSize); - int32_t groupStartIdx = fp->fExtra[opValue]; - int32_t groupEndIdx = fp->fExtra[opValue+1]; + int64_t groupStartIdx = *(int64_t *) &fp->fExtra[opValue]; + int64_t groupEndIdx = *(int64_t *) &fp->fExtra[opValue+2]; U_ASSERT(groupStartIdx <= groupEndIdx); - int32_t len = groupEndIdx-groupStartIdx; if (groupStartIdx < 0) { // This capture group has not participated in the match thus far, fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. } - - if (len == 0) { - // The capture group match was of an empty string. - // Verified by testing: Perl matches succeed in this case, so - // we do too. 
- break; - } - - UBool haveMatch = FALSE; - if (fp->fInputIdx + len <= fActiveLimit) { - if (opType == URX_BACKREF) { - if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) { - haveMatch = TRUE; - } - } else { - if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, - len, U_FOLD_CASE_DEFAULT) == 0) { - haveMatch = TRUE; - } - } - } else { - // TODO: probably need to do a partial string comparison, and only - // set HitEnd if the available input matched. Ticket #6074 - fHitEnd = TRUE; + + if (groupEndIdx == groupStartIdx) { + // The capture group match was of an empty string. + // Verified by testing: Perl matches succeed in this case, so + // we do too. + break; } - if (haveMatch) { - fp->fInputIdx += len; // Match. Advance current input position. - } else { + + UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + UBool haveMatch = (opType == URX_BACKREF ? + (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) : + (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status))); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + + if (fp->fInputIdx > fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. + } else if (!haveMatch) { + if (fp->fInputIdx == fActiveLimit) { + fHitEnd = TRUE; + } fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 
} } break; - + case URX_STO_INP_LOC: { U_ASSERT(opValue >= 0 && opValue < fFrameSize); - fp->fExtra[opValue] = fp->fInputIdx; + *((int64_t *) &fp->fExtra[opValue]) = fp->fInputIdx; } break; @@ -2317,7 +3666,7 @@ GC_Done: fp->fPatIdx += 1; int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); - int32_t savedInputIdx = fp->fExtra[dataLoc]; + int64_t savedInputIdx = *(int64_t *) &fp->fExtra[dataLoc]; U_ASSERT(savedInputIdx <= fp->fInputIdx); if (savedInputIdx < fp->fInputIdx) { fp->fPatIdx = opValue; // JMP @@ -2333,7 +3682,7 @@ GC_Done: // Save Stack Ptr, Input Pos. U_ASSERT(opValue>=0 && opValue+1fDataSize); fData[opValue] = fStack->size(); - fData[opValue+1] = fp->fInputIdx; + *((int64_t *) &fData[opValue+1]) = fp->fInputIdx; fActiveStart = fLookStart; // Set the match region change for fActiveLimit = fLookLimit; // transparent bounds. } @@ -2359,7 +3708,7 @@ GC_Done: fp = (REStackFrame *)newFP; fStack->setSize(newStackSize); } - fp->fInputIdx = fData[opValue+1]; + fp->fInputIdx = *(int64_t *) &fData[opValue+1]; // Restore the active region bounds in the input string; they may have // been changed because of transparent bounds on a Region. 
@@ -2370,14 +3719,46 @@ GC_Done: case URX_ONECHAR_I: if (fp->fInputIdx < fActiveLimit) { - UChar32 c; - U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + + UChar32 c = UTEXT_NEXT32(fInputText); if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); break; } } else { fHitEnd = TRUE; } + + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + UBool success = FALSE; + UChar32 c = UTEXT_PREVIOUS32(fInputText); + while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { + if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { + success = TRUE; + break; + } else if (c == U_SENTINEL) { + break; + } + c = UTEXT_PREVIOUS32(fInputText); + } + if (success) { + fHitEnd = FALSE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; @@ -2386,30 +3767,114 @@ GC_Done: // Test input against a literal string. // Strings require two slots in the compiled pattern, one for the // offset to the string text, and one for the length. 
- int32_t stringStartIdx, stringLen; - stringStartIdx = opValue; + const UCaseProps *csp = ucase_getSingleton(&status); + if (U_SUCCESS(status)) { + int32_t stringStartIdx, stringLen; + stringStartIdx = opValue; - op = pat[fp->fPatIdx]; - fp->fPatIdx++; - opType = URX_TYPE(op); - opValue = URX_VAL(op); - U_ASSERT(opType == URX_STRING_LEN); - stringLen = opValue; + op = pat[fp->fPatIdx]; + fp->fPatIdx++; + opType = URX_TYPE(op); + opValue = URX_VAL(op); + U_ASSERT(opType == URX_STRING_LEN); + stringLen = opValue; - int32_t stringEndIndex = fp->fInputIdx + stringLen; - if (stringEndIndex <= fActiveLimit) { - if (u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx, - stringLen, U_FOLD_CASE_DEFAULT) == 0) { - // Success. Advance the current input position. - fp->fInputIdx = stringEndIndex; - break; + const UChar *patternChars = litText+stringStartIdx; + const UChar *patternEnd = patternChars+stringLen; + + const UChar *foldChars; + int32_t foldOffset, foldLength; + UChar32 c; + + foldOffset = foldLength = 0; + UBool success = TRUE; + + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + while (patternChars < patternEnd && success) { + if(foldOffset < foldLength) { + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = UTEXT_NEXT32(fInputText); + if (c != U_SENTINEL) { + foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); + if(foldLength >= 0) { + if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings + foldOffset = 0; + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = foldLength; + foldLength = foldOffset; // to avoid reading chars from the folding buffer + } + } + } + + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } + + success = FALSE; + if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) { + if (U_IS_BMP(c)) { + success = (*patternChars == c); + patternChars += 1; + } else if (patternChars+1 < patternEnd) { + success = (*patternChars == U16_LEAD(c) 
&& *(patternChars+1) == U16_TRAIL(c)); + patternChars += 2; + } + } else { + fHitEnd = TRUE; // TODO: See ticket 6074 + } + } + + if (!success) { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size()) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Reset to last start point + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + patternChars = litText+stringStartIdx; + + // Search backwards for a possible start + do { + c = UTEXT_PREVIOUS32(fInputText); + if (c == U_SENTINEL) { + break; + } else { + foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); + if(foldLength >= 0) { + if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings + foldOffset = 0; + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = foldLength; + foldLength = foldOffset; // to avoid reading chars from the folding buffer + } + } + + if ((U_IS_BMP(c) && *patternChars == c) || + (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { + success = TRUE; + break; + } + } + } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); + + // And try again + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); } - } else { - // Insufficent input left for a match. - fHitEnd = TRUE; // See ticket 6074 } - // No match. Back up matching to a saved state - fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; @@ -2420,12 +3885,12 @@ GC_Done: // TODO: implement transparent bounds. 
Ticket #6067 U_ASSERT(opValue>=0 && opValue+1fDataSize); fData[opValue] = fStack->size(); - fData[opValue+1] = fp->fInputIdx; + *((int64_t *) &fData[opValue+1]) = fp->fInputIdx; // Init the variable containing the start index for attempted matches. - fData[opValue+2] = -1; + *((int64_t *) &fData[opValue+3]) = -1; // Save input string length, then reset to pin any matches to end at // the current position. - fData[opValue+3] = fActiveLimit; + *((int64_t *) &fData[opValue+5]) = fActiveLimit; fActiveLimit = fp->fInputIdx; } break; @@ -2445,7 +3910,7 @@ GC_Done: // Fetch (from data) the last input index where a match was attempted. U_ASSERT(opValue>=0 && opValue+1fDataSize); - int32_t *lbStartIdx = &fData[opValue+2]; + int64_t *lbStartIdx = (int64_t *) &fData[opValue+3]; if (*lbStartIdx < 0) { // First time through loop. *lbStartIdx = fp->fInputIdx - minML; @@ -2453,9 +3918,11 @@ GC_Done: // 2nd through nth time through the loop. // Back up start position for match by one. if (*lbStartIdx == 0) { - (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. + (*lbStartIdx)--; } else { - U16_BACK_1(inputBuf, 0, *lbStartIdx); + UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); + UTEXT_PREVIOUS32(fInputText); + *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); } } @@ -2464,9 +3931,9 @@ GC_Done: // getting a match. Backtrack out, and out of the // Look Behind altogether. fp = (REStackFrame *)fStack->popFrame(fFrameSize); - int32_t restoreInputLen = fData[opValue+3]; + int64_t restoreInputLen = *(int64_t *) &fData[opValue+5]; U_ASSERT(restoreInputLen >= fActiveLimit); - U_ASSERT(restoreInputLen <= fInput->length()); + U_ASSERT(restoreInputLen <= fInputLength); fActiveLimit = restoreInputLen; break; } @@ -2474,7 +3941,7 @@ GC_Done: // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. // (successful match will fall off the end of the loop.) 
fp = StateSave(fp, fp->fPatIdx-3, status); - fp->fInputIdx = *lbStartIdx; + fp->fInputIdx = *lbStartIdx; } break; @@ -2495,9 +3962,9 @@ GC_Done: // Look-behind match is good. Restore the orignal input string length, // which had been truncated to pin the end of the lookbehind match to the // position being looked-behind. - int32_t originalInputLen = fData[opValue+3]; + int64_t originalInputLen = *(int64_t *) &fData[opValue+5]; U_ASSERT(originalInputLen >= fActiveLimit); - U_ASSERT(originalInputLen <= fInput->length()); + U_ASSERT(originalInputLen <= fInputLength); fActiveLimit = originalInputLen; } break; @@ -2519,7 +3986,7 @@ GC_Done: // Fetch (from data) the last input index where a match was attempted. U_ASSERT(opValue>=0 && opValue+1fDataSize); - int32_t *lbStartIdx = &fData[opValue+2]; + int64_t *lbStartIdx = (int64_t *) &fData[opValue+3]; if (*lbStartIdx < 0) { // First time through loop. *lbStartIdx = fp->fInputIdx - minML; @@ -2527,9 +3994,11 @@ GC_Done: // 2nd through nth time through the loop. // Back up start position for match by one. if (*lbStartIdx == 0) { - (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. + (*lbStartIdx)--; } else { - U16_BACK_1(inputBuf, 0, *lbStartIdx); + UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); + UTEXT_PREVIOUS32(fInputText); + *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); } } @@ -2537,9 +4006,9 @@ GC_Done: // We have tried all potential match starting points without // getting a match, which means that the negative lookbehind as // a whole has succeeded. Jump forward to the continue location - int32_t restoreInputLen = fData[opValue+3]; + int64_t restoreInputLen = *(int64_t *) &fData[opValue+5]; U_ASSERT(restoreInputLen >= fActiveLimit); - U_ASSERT(restoreInputLen <= fInput->length()); + U_ASSERT(restoreInputLen <= fInputLength); fActiveLimit = restoreInputLen; fp->fPatIdx = continueLoc; break; @@ -2548,7 +4017,7 @@ GC_Done: // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 
// (successful match will cause a FAIL out of the loop altogether.) fp = StateSave(fp, fp->fPatIdx-4, status); - fp->fInputIdx = *lbStartIdx; + fp->fInputIdx = *lbStartIdx; } break; @@ -2572,9 +4041,9 @@ GC_Done: // Restore the orignal input string length, which had been truncated // inorder to pin the end of the lookbehind match // to the position being looked-behind. - int32_t originalInputLen = fData[opValue+3]; + int64_t originalInputLen = *(int64_t *) &fData[opValue+5]; U_ASSERT(originalInputLen >= fActiveLimit); - U_ASSERT(originalInputLen <= fInput->length()); + U_ASSERT(originalInputLen <= fInputLength); fActiveLimit = originalInputLen; // Restore original stack position, discarding any state saved @@ -2603,25 +4072,24 @@ GC_Done: // Loop through input, until either the input is exhausted or // we reach a character that is not a member of the set. - int32_t ix = fp->fInputIdx; + int64_t ix = fp->fInputIdx; + UTEXT_SETNATIVEINDEX(fInputText, ix); for (;;) { if (ix >= fActiveLimit) { fHitEnd = TRUE; break; } - UChar32 c; - U16_NEXT(inputBuf, ix, fActiveLimit, c); + UChar32 c = UTEXT_NEXT32(fInputText); if (c<256) { if (s8->contains(c) == FALSE) { - U16_BACK_1(inputBuf, 0, ix); break; } } else { if (s->contains(c) == FALSE) { - U16_BACK_1(inputBuf, 0, ix); break; } } + ix = UTEXT_GETNATIVEINDEX(fInputText); } // If there were no matching characters, skip over the loop altogether. @@ -2638,7 +4106,10 @@ GC_Done: U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); int32_t stackLoc = URX_VAL(loopcOp); U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); - fp->fExtra[stackLoc] = fp->fInputIdx; + *((int64_t *) &fp->fExtra[stackLoc]) = fp->fInputIdx; + #ifdef REGEX_SMART_BACKTRACKING + backSearchIndex = fp->fInputIdx; + #endif fp->fInputIdx = ix; // Save State to the URX_LOOP_C op that follows this one, @@ -2657,7 +4128,7 @@ GC_Done: { // Loop through input until the input is exhausted (we reach an end-of-line) // In DOTALL mode, we can just go straight to the end of the input. 
- int32_t ix; + int64_t ix; if ((opValue & 1) == 1) { // Dot-matches-All mode. Jump straight to the end of the string. ix = fActiveLimit; @@ -2666,24 +4137,22 @@ GC_Done: // NOT DOT ALL mode. Line endings do not match '.' // Scan forward until a line ending or end of input. ix = fp->fInputIdx; + UTEXT_SETNATIVEINDEX(fInputText, ix); for (;;) { if (ix >= fActiveLimit) { fHitEnd = TRUE; - ix = fActiveLimit; break; } - UChar32 c; - U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] + UChar32 c = UTEXT_NEXT32(fInputText); if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s if ((c == 0x0a) || // 0x0a is newline in both modes. ((opValue & 2) == 0) && // IF not UNIX_LINES mode (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) { - // char is a line ending. Put the input pos back to the - // line ending char, and exit the scanning loop. - U16_BACK_1(inputBuf, 0, ix); + // char is a line ending. Exit the scanning loop. break; } } + ix = UTEXT_GETNATIVEINDEX(fInputText); } } @@ -2701,7 +4170,10 @@ GC_Done: U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); int32_t stackLoc = URX_VAL(loopcOp); U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); - fp->fExtra[stackLoc] = fp->fInputIdx; + *((int64_t *) &fp->fExtra[stackLoc]) = fp->fInputIdx; + #ifdef REGEX_SMART_BACKTRACKING + backSearchIndex = fp->fInputIdx; + #endif fp->fInputIdx = ix; // Save State to the URX_LOOP_C op that follows this one, @@ -2716,9 +4188,9 @@ GC_Done: case URX_LOOP_C: { U_ASSERT(opValue>=0 && opValuefExtra[opValue]; - U_ASSERT(terminalIdx <= fp->fInputIdx); - if (terminalIdx == fp->fInputIdx) { + backSearchIndex = *(int64_t *) &fp->fExtra[opValue]; + U_ASSERT(backSearchIndex <= fp->fInputIdx); + if (backSearchIndex == fp->fInputIdx) { // We've backed up the input idx to the point that the loop started. // The loop is done. Leave here without saving state. // Subsequent failures won't come back here. 
@@ -2730,14 +4202,18 @@ GC_Done: // (We're going backwards because this loop emulates stack unwinding, not // the initial scan forward.) U_ASSERT(fp->fInputIdx > 0); - U16_BACK_1(inputBuf, 0, fp->fInputIdx); - if (inputBuf[fp->fInputIdx] == 0x0a && - fp->fInputIdx > terminalIdx && - inputBuf[fp->fInputIdx-1] == 0x0d) { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 prevC = UTEXT_PREVIOUS32(fInputText); + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + + UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); + if (prevC == 0x0a && + fp->fInputIdx > backSearchIndex && + twoPrevC == 0x0d) { int32_t prevOp = pat[fp->fPatIdx-2]; if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { // .*, stepping back over CRLF pair. - fp->fInputIdx--; + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } } @@ -2780,11 +4256,1677 @@ breakFromLoop: fFrame = fp; // The active stack frame when the engine stopped. // Contains the capture group results that we need to // access later. - return; } +//-------------------------------------------------------------------------------- +// +// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the +// assumption that the entire string is available in the UText's +// chunk buffer. For now, that means we can use int32_t indexes, +// except for anything that needs to be saved (like group starts +// and ends). +// +// startIdx: begin matching a this index. +// toEnd: if true, match must extend to end of the input region +// +//-------------------------------------------------------------------------------- +void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { + UBool isMatch = FALSE; // True if the we have a match. + + int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards + + int32_t op; // Operation from the compiled pattern, split into + int32_t opType; // the opcode + int32_t opValue; // and the operand value. 
+ +#ifdef REGEX_RUN_DEBUG + if (fTraceDebug) + { + printf("MatchAt(startIdx=%ld)\n", startIdx); + printf("Original Pattern: "); + UChar32 c = utext_next32From(fPattern->fPattern, 0); + while (c != U_SENTINEL) { + if (c<32 || c>256) { + c = '.'; + } + REGEX_DUMP_DEBUG_PRINTF(("%c", c)); + + c = UTEXT_NEXT32(fPattern->fPattern); + } + printf("\n"); + printf("Input String: "); + c = utext_next32From(fInputText, 0); + while (c != U_SENTINEL) { + if (c<32 || c>256) { + c = '.'; + } + printf("%c", c); + + c = UTEXT_NEXT32(fInputText); + } + printf("\n"); + printf("\n"); + } +#endif + + if (U_FAILURE(status)) { + return; + } + + // Cache frequently referenced items from the compiled pattern + // + int32_t *pat = fPattern->fCompiledPat->getBuffer(); + + const UChar *litText = fPattern->fLiteralText.getBuffer(); + UVector *sets = fPattern->fSets; + + const UChar *inputBuf = fInputText->chunkContents; + + fFrameSize = fPattern->fFrameSize; + REStackFrame *fp = resetStack(); + + fp->fPatIdx = 0; + fp->fInputIdx = startIdx; + + // Zero out the pattern's static data + int32_t i; + for (i = 0; ifDataSize; i++) { + fData[i] = 0; + } + + // + // Main loop for interpreting the compiled pattern. + // One iteration of the loop per pattern operation performed. + // + for (;;) { +#if 0 + if (_heapchk() != _HEAPOK) { + fprintf(stderr, "Heap Trouble\n"); + } +#endif + + op = pat[fp->fPatIdx]; + opType = URX_TYPE(op); + opValue = URX_VAL(op); +#ifdef REGEX_RUN_DEBUG + if (fTraceDebug) { + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, + UTEXT_CURRENT32(fInputText), (int32_t *)fp-fStack->getBuffer(), fActiveLimit); + fPattern->dumpOp(fp->fPatIdx); + } +#endif + fp->fPatIdx++; + + switch (opType) { + + + case URX_NOP: + break; + + + case URX_BACKTRACK: + // Force a backtrack. 
In some circumstances, the pattern compiler + // will notice that the pattern can't possibly match anything, and will + // emit one of these at that point. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + + case URX_ONECHAR: + if (fp->fInputIdx < fActiveLimit) { + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (c == opValue) { + break; + } + } else { + fHitEnd = TRUE; + } + + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + int32_t reverseIndex = fp->fInputIdx; + UChar32 c; + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (c == opValue) { + break; + } + } while (reverseIndex > backSearchIndex); + if (c == opValue) { + fHitEnd = FALSE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + + case URX_STRING: + { + // Test input against a literal string. + // Strings require two slots in the compiled pattern, one for the + // offset to the string text, and one for the length. + int32_t stringStartIdx = opValue; + int32_t stringLen; + + op = pat[fp->fPatIdx]; // Fetch the second operand + fp->fPatIdx++; + opType = URX_TYPE(op); + stringLen = URX_VAL(op); + U_ASSERT(opType == URX_STRING_LEN); + U_ASSERT(stringLen >= 2); + + if (fp->fInputIdx + stringLen > fActiveLimit) { + // No match. String is longer than the remaining input text. 
+ fHitEnd = TRUE; // TODO: See ticket 6074 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + const UChar * pInp = inputBuf + fp->fInputIdx; + const UChar * pPat = litText+stringStartIdx; + const UChar * pEnd = pInp + stringLen; + UBool success = FALSE; + for(;;) { + if (*pInp == *pPat) { + pInp++; + pPat++; + if (pInp == pEnd) { + // Successful Match. + success = TRUE; + break; + } + } else { + // Match failed. + break; + } + } + + if (success) { + fp->fInputIdx += stringLen; + } else { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size()) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Reset to last start point + int32_t reverseIndex = fp->fInputIdx; + UChar32 c; + pPat = litText+stringStartIdx; + + // Search backwards for a possible start + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if ((U_IS_BMP(c) && *pPat == c) || + (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) { + success = TRUE; + break; + } + } while (reverseIndex > backSearchIndex); + + // And try again + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_STATE_SAVE: + fp = StateSave(fp, opValue, status); + break; + + + case URX_END: + // The match loop will exit via this path on a successful match, + // when we reach the end of the pattern. + if (toEnd && fp->fInputIdx != fActiveLimit) { + // The pattern matched, but not to the end of input. Try some more. 
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + isMatch = TRUE; + goto breakFromLoop; + + // Start and End Capture stack frame variables are layout out like this: + // fp->fExtra[opValue] - The start of a completed capture group (double-width) + // opValue+2 - The end of a completed capture group (double-width) + // opValue+4 - the start of a capture group whose end + // has not yet been reached (and might not ever be). + case URX_START_CAPTURE: + U_ASSERT(opValue >= 0 && opValue < fFrameSize-5); + fp->fExtra[opValue+4] = fp->fInputIdx; + break; + + + case URX_END_CAPTURE: + U_ASSERT(opValue >= 0 && opValue < fFrameSize-5); + U_ASSERT(fp->fExtra[opValue+4] >= 0); // Start pos for this group must be set. + *((int64_t *) &fp->fExtra[opValue]) = fp->fExtra[opValue+4]; // Tentative start becomes real. + *((int64_t *) &fp->fExtra[opValue+2]) = fp->fInputIdx; // End position + U_ASSERT(*((int64_t *) &fp->fExtra[opValue]) <= *((int64_t *) &fp->fExtra[opValue+2])); + break; + + + case URX_DOLLAR: // $, test for End of line + // or for position before new line at end of input + if (fp->fInputIdx < fAnchorLimit-2) { + // We are no where near the end of input. Fail. + // This is the common case. Keep it first. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + + // If we are positioned just before a new-line that is located at the + // end of input, succeed. + if (fp->fInputIdx == fAnchorLimit-1) { + UChar32 c; + U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); + + if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { + // At new-line at end of input. 
Success + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + } + } else if (fp->fInputIdx == fAnchorLimit-2 && + inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; // At CR/LF at end of input. Success + } + + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + + break; + + + case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. + if (fp->fInputIdx >= fAnchorLimit-1) { + // Either at the last character of input, or off the end. + if (fp->fInputIdx == fAnchorLimit-1) { + // At last char of input. Success if it's a new line. + if (inputBuf[fp->fInputIdx] == 0x0a) { + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + } else { + // Off the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + } + + // Not at end of input. Back-track out. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + + case URX_DOLLAR_M: // $, test for End of line in multi-line mode + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. + fHitEnd = TRUE; + fRequireEnd = TRUE; + break; + } + // If we are positioned just before a new-line, succeed. + // It makes no difference where the new-line is within the input. + UChar32 c = inputBuf[fp->fInputIdx]; + if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { + // At a line end, except for the odd chance of being in the middle of a CR/LF sequence + // In multi-line mode, hitting a new-line just before the end of input does not + // set the hitEnd or requireEnd flags + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { + break; + } + } + // not at a new line. Fail. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode + { + if (fp->fInputIdx >= fAnchorLimit) { + // We really are at the end of input. Success. 
+ fHitEnd = TRUE; + fRequireEnd = TRUE; // Java set requireEnd in this case, even though + break; // adding a new-line would not lose the match. + } + // If we are not positioned just before a new-line, the test fails; backtrack out. + // It makes no difference where the new-line is within the input. + if (inputBuf[fp->fInputIdx] != 0x0a) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_CARET: // ^, test for start of line + if (fp->fInputIdx != fAnchorStart) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_CARET_M: // ^, test for start of line in mulit-line mode + { + if (fp->fInputIdx == fAnchorStart) { + // We are at the start input. Success. + break; + } + // Check whether character just before the current pos is a new-line + // unless we are at the end of input + UChar c = inputBuf[fp->fInputIdx - 1]; + if ((fp->fInputIdx < fAnchorLimit) && + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { + // It's a new-line. ^ is true. Success. + // TODO: what should be done with positions between a CR and LF? + break; + } + // Not at the start of a line. Fail. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode + { + U_ASSERT(fp->fInputIdx >= fAnchorStart); + if (fp->fInputIdx <= fAnchorStart) { + // We are at the start input. Success. + break; + } + // Check whether character just before the current pos is a new-line + U_ASSERT(fp->fInputIdx <= fAnchorLimit); + UChar c = inputBuf[fp->fInputIdx - 1]; + if (c != 0x0a) { + // Not at the start of a line. Back-track out. 
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + case URX_BACKSLASH_B: // Test for word boundaries + { + UBool success = isChunkWordBoundary(fp->fInputIdx); + success ^= (opValue != 0); // flip sense for \B + if (!success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style + { + UBool success = isUWordBoundary(fp->fInputIdx); + success ^= (opValue != 0); // flip sense for \B + if (!success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_D: // Test for decimal digit + { + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. + UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); + success ^= (opValue != 0); // flip sense for \D + if (!success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_G: // Test for position at end of previous match + if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_BACKSLASH_X: + // Match a Grapheme, as defined by Unicode TR 29. + // Differs slightly from Perl, which consumes combining marks independently + // of context. + { + + // Fail if at end of input + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + // Examine (and consume) the current char. + // Dispatch into a little state machine, based on the char. 
+ UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + UnicodeSet **sets = fPattern->fStaticSets; + if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; + if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; + if (sets[URX_GC_L]->contains(c)) goto GC_L; + if (sets[URX_GC_LV]->contains(c)) goto GC_V; + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; + if (sets[URX_GC_V]->contains(c)) goto GC_V; + if (sets[URX_GC_T]->contains(c)) goto GC_T; + goto GC_Extend; + + + +GC_L: + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (sets[URX_GC_L]->contains(c)) goto GC_L; + if (sets[URX_GC_LV]->contains(c)) goto GC_V; + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; + if (sets[URX_GC_V]->contains(c)) goto GC_V; + U16_PREV(inputBuf, 0, fp->fInputIdx, c); + goto GC_Extend; + +GC_V: + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (sets[URX_GC_V]->contains(c)) goto GC_V; + if (sets[URX_GC_T]->contains(c)) goto GC_T; + U16_PREV(inputBuf, 0, fp->fInputIdx, c); + goto GC_Extend; + +GC_T: + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (sets[URX_GC_T]->contains(c)) goto GC_T; + U16_PREV(inputBuf, 0, fp->fInputIdx, c); + goto GC_Extend; + +GC_Extend: + // Combining characters are consumed here + for (;;) { + if (fp->fInputIdx >= fActiveLimit) { + break; + } + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { + U16_BACK_1(inputBuf, 0, fp->fInputIdx); + break; + } + } + goto GC_Done; + +GC_Control: + // Most control chars stand alone (don't combine with combining chars), + // except for that CR/LF sequence is a single grapheme cluster. 
+ if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { + fp->fInputIdx++; + } + +GC_Done: + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + } + break; + } + + + + + case URX_BACKSLASH_Z: // Test for end of Input + if (fp->fInputIdx < fAnchorLimit) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } else { + fHitEnd = TRUE; + fRequireEnd = TRUE; + } + break; + + + + case URX_STATIC_SETREF: + { + // Test input character against one of the predefined sets + // (Word Characters, for example) + // The high bit of the op value is a flag for the match polarity. + // 0: success if input char is in set. + // 1: success if input char is not in set. + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); + opValue &= ~URX_NEG_SET; + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); + + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c)) { + success = !success; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c)) { + success = !success; + } + } + if (!success) { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + int32_t reverseIndex = fp->fInputIdx; + U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried + success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c)) { + success = !success; + } + } else { + const 
UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c)) { + success = !success; + } + } + } while (reverseIndex > backSearchIndex && !success); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_STAT_SETREF_N: + { + // Test input character for NOT being a member of one of + // the predefined sets (Word Characters, for example) + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); + + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c) == FALSE) { + break; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c) == FALSE) { + break; + } + } + + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + int32_t reverseIndex = fp->fInputIdx; + U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried + UBool success = FALSE; + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; + if (s8->contains(c) == FALSE) { + success = TRUE; + break; + } + } else { + const UnicodeSet *s = fPattern->fStaticSets[opValue]; + if (s->contains(c) == FALSE) { + success = TRUE; + break; + } + } + } while (reverseIndex > backSearchIndex); + + if (success) 
{ + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_SETREF: + { + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + U_ASSERT(opValue > 0 && opValue < sets->size()); + + // There is input left. Pick up one char and test it for set membership. + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (c<256) { + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + if (s8->contains(c)) { + // The character is in the set. A Match. + break; + } + } else { + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + if (s->contains(c)) { + // The character is in the set. A Match. + break; + } + } + + // the character wasn't in the set. 
+ #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Try to find it, backwards + int32_t reverseIndex = fp->fInputIdx; + U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried + UBool success = FALSE; + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (c < 256) { + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + if (s8->contains(c)) { + success = TRUE; + break; + } + } else { + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + if (s->contains(c)) { + success = TRUE; + break; + } + } + } while (reverseIndex > backSearchIndex); + + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > reverseIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_DOTANY: + { + // . matches anything, but stops at end-of-line. + if (fp->fInputIdx >= fActiveLimit) { + // At end of input. Match failed. Backtrack out. + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + // There is input left. Advance over one char, unless we've hit end-of-line + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { + // End of line in normal mode. . does not match. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + } + break; + + + case URX_DOTANY_ALL: + { + // . in dot-matches-all (including new lines) mode + if (fp->fInputIdx >= fActiveLimit) { + // At end of input. Match failed. 
Backtrack out. + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + // There is input left. Advance over one char, except if we are + // at a cr/lf, advance over both of them. + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (c==0x0d && fp->fInputIdx < fActiveLimit) { + // In the case of a CR/LF, we need to advance over both. + if (inputBuf[fp->fInputIdx] == 0x0a) { + U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); + } + } + } + break; + + + case URX_DOTANY_UNIX: + { + // '.' operator, matches all, but stops at end-of-line. + // UNIX_LINES mode, so 0x0a is the only recognized line ending. + if (fp->fInputIdx >= fActiveLimit) { + // At end of input. Match failed. Backtrack out. + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + // There is input left. Advance over one char, unless we've hit end-of-line + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (c == 0x0a) { + // End of line in normal mode. '.' does not match the \n + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_JMP: + fp->fPatIdx = opValue; + break; + + case URX_FAIL: + isMatch = FALSE; + goto breakFromLoop; + + case URX_JMP_SAV: + U_ASSERT(opValue < fPattern->fCompiledPat->size()); + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current + fp->fPatIdx = opValue; // Then JMP. + break; + + case URX_JMP_SAV_X: + // This opcode is used with (x)+, when x can match a zero length string. + // Same as JMP_SAV, except conditional on the match having made forward progress. + // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the + // data address of the input position at the start of the loop. 
+ { + U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); + int32_t stoOp = pat[opValue-1]; + U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); + int32_t frameLoc = URX_VAL(stoOp); + U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); + int32_t prevInputIdx = fp->fExtra[frameLoc]; + U_ASSERT(prevInputIdx <= fp->fInputIdx); + if (prevInputIdx < fp->fInputIdx) { + // The match did make progress. Repeat the loop. + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current + fp->fPatIdx = opValue; + fp->fExtra[frameLoc] = fp->fInputIdx; + } + // If the input position did not advance, we do nothing here, + // execution will fall out of the loop. + } + break; + + case URX_CTR_INIT: + { + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero + + // Pick up the three extra operands that CTR_INIT has, and + // skip the pattern location counter past + int32_t instrOperandLoc = fp->fPatIdx; + fp->fPatIdx += 3; + int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); + int32_t minCount = pat[instrOperandLoc+1]; + int32_t maxCount = pat[instrOperandLoc+2]; + U_ASSERT(minCount>=0); + U_ASSERT(maxCount>=minCount || maxCount==-1); + U_ASSERT(loopLoc>fp->fPatIdx); + + if (minCount == 0) { + fp = StateSave(fp, loopLoc+1, status); + } + if (maxCount == 0) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + case URX_CTR_LOOP: + { + U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); + int32_t initOp = pat[opValue]; + U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); + int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; + int32_t minCount = pat[opValue+2]; + int32_t maxCount = pat[opValue+3]; + // Increment the counter. Note: we DIDN'T worry about counter + // overflow, since the data comes from UnicodeStrings, which + // stores its length in an int32_t. Do we have to think about + // this now that we're using UText? Probably not, since the length + // in UChar32s is still an int32_t. 
+ (*pCounter)++; + U_ASSERT(*pCounter > 0); + if ((uint32_t)*pCounter >= (uint32_t)maxCount) { + U_ASSERT(*pCounter == maxCount || maxCount == -1); + break; + } + if (*pCounter >= minCount) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx = opValue + 4; // Loop back. + } + break; + + case URX_CTR_INIT_NG: + { + // Initialize a non-greedy loop + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero + + // Pick up the three extra operands that CTR_INIT has, and + // skip the pattern location counter past + int32_t instrOperandLoc = fp->fPatIdx; + fp->fPatIdx += 3; + int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); + int32_t minCount = pat[instrOperandLoc+1]; + int32_t maxCount = pat[instrOperandLoc+2]; + U_ASSERT(minCount>=0); + U_ASSERT(maxCount>=minCount || maxCount==-1); + U_ASSERT(loopLoc>fp->fPatIdx); + + if (minCount == 0) { + if (maxCount != 0) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block + } + } + break; + + case URX_CTR_LOOP_NG: + { + // Non-greedy {min, max} loops + U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); + int32_t initOp = pat[opValue]; + U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); + int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; + int32_t minCount = pat[opValue+2]; + int32_t maxCount = pat[opValue+3]; + // Increment the counter. Note: we DIDN'T worry about counter + // overflow, since the data comes from UnicodeStrings, which + // stores its length in an int32_t. Do we have to think about + // this now that we're using UText? Probably not, since the length + // in UChar32s is still an int32_t. + (*pCounter)++; + U_ASSERT(*pCounter > 0); + + if ((uint32_t)*pCounter >= (uint32_t)maxCount) { + // The loop has matched the maximum permitted number of times. + // Break out of here with no action. Matching will + // continue with the following pattern. 
+ U_ASSERT(*pCounter == maxCount || maxCount == -1); + break; + } + + if (*pCounter < minCount) { + // We haven't met the minimum number of matches yet. + // Loop back for another one. + fp->fPatIdx = opValue + 4; // Loop back. + } else { + // We do have the minimum number of matches. + // Fall into the following pattern, but first do + // a state save to the top of the loop, so that a failure + // in the following pattern will try another iteration of the loop. + fp = StateSave(fp, opValue + 4, status); + } + } + break; + + case URX_STO_SP: + U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); + fData[opValue] = fStack->size(); + break; + + case URX_LD_SP: + { + U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); + int32_t newStackSize = fData[opValue]; + U_ASSERT(newStackSize <= fStack->size()); + int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; + if (newFP == (int32_t *)fp) { + break; + } + int32_t i; + for (i=0; isetSize(newStackSize); + } + break; + + case URX_BACKREF: + case URX_BACKREF_I: + { + U_ASSERT(opValue < fFrameSize); + int64_t groupStartIdx = *(int64_t *) &fp->fExtra[opValue]; + int64_t groupEndIdx = *(int64_t *) &fp->fExtra[opValue+2]; + U_ASSERT(groupStartIdx <= groupEndIdx); + int64_t len = groupEndIdx-groupStartIdx; + if (groupStartIdx < 0) { + // This capture group has not participated in the match thus far, + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. + } + + if (len == 0) { + // The capture group match was of an empty string. + // Verified by testing: Perl matches succeed in this case, so + // we do too. 
+ break; + } + + UBool haveMatch = FALSE; + if (fp->fInputIdx + len <= fActiveLimit) { + if (opType == URX_BACKREF) { + if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) { + haveMatch = TRUE; + } + } else { + if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, + len, U_FOLD_CASE_DEFAULT) == 0) { + haveMatch = TRUE; + } + } + } else { + // TODO: probably need to do a partial string comparison, and only + // set HitEnd if the available input matched. Ticket #6074 + fHitEnd = TRUE; + } + if (haveMatch) { + fp->fInputIdx += len; // Match. Advance current input position. + } else { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. + } + } + break; + + case URX_STO_INP_LOC: + { + U_ASSERT(opValue >= 0 && opValue < fFrameSize); + fp->fExtra[opValue] = fp->fInputIdx; + } + break; + + case URX_JMPX: + { + int32_t instrOperandLoc = fp->fPatIdx; + fp->fPatIdx += 1; + int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); + U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); + int32_t savedInputIdx = fp->fExtra[dataLoc]; + U_ASSERT(savedInputIdx <= fp->fInputIdx); + if (savedInputIdx < fp->fInputIdx) { + fp->fPatIdx = opValue; // JMP + } else { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. + } + } + break; + + case URX_LA_START: + { + // Entering a lookahead block. + // Save Stack Ptr, Input Pos. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + fData[opValue] = fStack->size(); + fData[opValue+1] = fp->fInputIdx; + fActiveStart = fLookStart; // Set the match region change for + fActiveLimit = fLookLimit; // transparent bounds. + } + break; + + case URX_LA_END: + { + // Leaving a look-ahead block. + // restore Stack Ptr, Input Pos to positions they had on entry to block. 
+ U_ASSERT(opValue>=0 && opValue+1fDataSize); + int32_t stackSize = fStack->size(); + int32_t newStackSize = fData[opValue]; + U_ASSERT(stackSize >= newStackSize); + if (stackSize > newStackSize) { + // Copy the current top frame back to the new (cut back) top frame. + // This makes the capture groups from within the look-ahead + // expression available. + int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; + int32_t i; + for (i=0; isetSize(newStackSize); + } + fp->fInputIdx = fData[opValue+1]; + + // Restore the active region bounds in the input string; they may have + // been changed because of transparent bounds on a Region. + fActiveStart = fRegionStart; + fActiveLimit = fRegionLimit; + } + break; + + case URX_ONECHAR_I: + if (fp->fInputIdx < fActiveLimit) { + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { + break; + } + } else { + fHitEnd = TRUE; + } + + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + UBool success = FALSE; + int32_t reverseIndex = fp->fInputIdx; + UChar32 c; + while (reverseIndex > backSearchIndex) { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { + success = TRUE; + break; + } else if (c == U_SENTINEL) { + break; + } + } + if (success) { + fHitEnd = FALSE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + + case URX_STRING_I: + { + // Test input against a literal string. 
+ // Strings require two slots in the compiled pattern, one for the + // offset to the string text, and one for the length. + const UCaseProps *csp = ucase_getSingleton(&status); + if (U_SUCCESS(status)) { + int32_t stringStartIdx, stringLen; + stringStartIdx = opValue; + + op = pat[fp->fPatIdx]; + fp->fPatIdx++; + opType = URX_TYPE(op); + opValue = URX_VAL(op); + U_ASSERT(opType == URX_STRING_LEN); + stringLen = opValue; + + const UChar *patternChars = litText+stringStartIdx; + const UChar *patternEnd = patternChars+stringLen; + + const UChar *foldChars; + int32_t foldOffset, foldLength; + UChar32 c; + + #ifdef REGEX_SMART_BACKTRACKING + int32_t originalInputIdx = fp->fInputIdx; + #endif + UBool success = TRUE; + + foldOffset = foldLength = 0; + + while (patternChars < patternEnd && success) { + if(foldOffset < foldLength) { + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); + if(foldLength >= 0) { + if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings + foldOffset = 0; + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = foldLength; + foldLength = foldOffset; // to avoid reading chars from the folding buffer + } + } + } + + if (fp->fInputIdx <= fActiveLimit) { + if (U_IS_BMP(c)) { + success = (*patternChars == c); + patternChars += 1; + } else if (patternChars+1 < patternEnd) { + success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); + patternChars += 2; + } + } else { + success = FALSE; + fHitEnd = TRUE; // TODO: See ticket 6074 + } + } + + if (!success) { + #ifdef REGEX_SMART_BACKTRACKING + if (fp->fInputIdx > backSearchIndex && fStack->size()) { + REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); + if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { + // Reset to last 
start point + int32_t reverseIndex = originalInputIdx; + patternChars = litText+stringStartIdx; + + // Search backwards for a possible start + do { + U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); + foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); + if(foldLength >= 0) { + if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings + foldOffset = 0; + U16_NEXT_UNSAFE(foldChars, foldOffset, c); + } else { + c = foldLength; + foldLength = foldOffset; // to avoid reading chars from the folding buffer + } + } + + if ((U_IS_BMP(c) && *patternChars == c) || + (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { + success = TRUE; + break; + } + } while (reverseIndex > backSearchIndex); + + // And try again + if (success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + fp->fInputIdx = reverseIndex; + if (fp->fInputIdx > backSearchIndex) { + fp = StateSave(fp, fp->fPatIdx, status); + } + fp->fPatIdx++; // Skip the LOOP_C, we just did that + break; + } + } + } + #endif + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + } + break; + + case URX_LB_START: + { + // Entering a look-behind block. + // Save Stack Ptr, Input Pos. + // TODO: implement transparent bounds. Ticket #6067 + U_ASSERT(opValue>=0 && opValue+1fDataSize); + fData[opValue] = fStack->size(); + fData[opValue+1] = fp->fInputIdx; + // Init the variable containing the start index for attempted matches. + fData[opValue+3] = -1; + // Save input string length, then reset to pin any matches to end at + // the current position. + fData[opValue+5] = fActiveLimit; + fActiveLimit = fp->fInputIdx; + } + break; + + + case URX_LB_CONT: + { + // Positive Look-Behind, at top of loop checking for matches of LB expression + // at all possible input starting positions. + + // Fetch the min and max possible match lengths. They are the operands + // of this op in the pattern. 
+ int32_t minML = pat[fp->fPatIdx++]; + int32_t maxML = pat[fp->fPatIdx++]; + U_ASSERT(minML <= maxML); + U_ASSERT(minML >= 0); + + // Fetch (from data) the last input index where a match was attempted. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + int32_t *lbStartIdx = &fData[opValue+3]; + if (*lbStartIdx < 0) { + // First time through loop. + *lbStartIdx = fp->fInputIdx - minML; + } else { + // 2nd through nth time through the loop. + // Back up start position for match by one. + if (*lbStartIdx == 0) { + (*lbStartIdx)--; + } else { + U16_BACK_1(inputBuf, 0, *lbStartIdx); + } + } + + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { + // We have tried all potential match starting points without + // getting a match. Backtrack out, and out of the + // Look Behind altogether. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + int32_t restoreInputLen = fData[opValue+5]; + U_ASSERT(restoreInputLen >= fActiveLimit); + U_ASSERT(restoreInputLen <= fInputLength); + fActiveLimit = restoreInputLen; + break; + } + + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. + // (successful match will fall off the end of the loop.) + fp = StateSave(fp, fp->fPatIdx-3, status); + fp->fInputIdx = *lbStartIdx; + } + break; + + case URX_LB_END: + // End of a look-behind block, after a successful match. + { + U_ASSERT(opValue>=0 && opValue+1fDataSize); + if (fp->fInputIdx != fActiveLimit) { + // The look-behind expression matched, but the match did not + // extend all the way to the point that we are looking behind from. + // FAIL out of here, which will take us back to the LB_CONT, which + // will retry the match starting at another position or fail + // the look-behind altogether, whichever is appropriate. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + // Look-behind match is good. 
Restore the orignal input string length, + // which had been truncated to pin the end of the lookbehind match to the + // position being looked-behind. + int32_t originalInputLen = fData[opValue+5]; + U_ASSERT(originalInputLen >= fActiveLimit); + U_ASSERT(originalInputLen <= fInputLength); + fActiveLimit = originalInputLen; + } + break; + + + case URX_LBN_CONT: + { + // Negative Look-Behind, at top of loop checking for matches of LB expression + // at all possible input starting positions. + + // Fetch the extra parameters of this op. + int32_t minML = pat[fp->fPatIdx++]; + int32_t maxML = pat[fp->fPatIdx++]; + int32_t continueLoc = pat[fp->fPatIdx++]; + continueLoc = URX_VAL(continueLoc); + U_ASSERT(minML <= maxML); + U_ASSERT(minML >= 0); + U_ASSERT(continueLoc > fp->fPatIdx); + + // Fetch (from data) the last input index where a match was attempted. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + int32_t *lbStartIdx = &fData[opValue+3]; + if (*lbStartIdx < 0) { + // First time through loop. + *lbStartIdx = fp->fInputIdx - minML; + } else { + // 2nd through nth time through the loop. + // Back up start position for match by one. + if (*lbStartIdx == 0) { + (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. + } else { + U16_BACK_1(inputBuf, 0, *lbStartIdx); + } + } + + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { + // We have tried all potential match starting points without + // getting a match, which means that the negative lookbehind as + // a whole has succeeded. Jump forward to the continue location + int32_t restoreInputLen = fData[opValue+5]; + U_ASSERT(restoreInputLen >= fActiveLimit); + U_ASSERT(restoreInputLen <= fInputLength); + fActiveLimit = restoreInputLen; + fp->fPatIdx = continueLoc; + break; + } + + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. + // (successful match will cause a FAIL out of the loop altogether.) 
+ fp = StateSave(fp, fp->fPatIdx-4, status); + fp->fInputIdx = *lbStartIdx; + } + break; + + case URX_LBN_END: + // End of a negative look-behind block, after a successful match. + { + U_ASSERT(opValue>=0 && opValue+1fDataSize); + if (fp->fInputIdx != fActiveLimit) { + // The look-behind expression matched, but the match did not + // extend all the way to the point that we are looking behind from. + // FAIL out of here, which will take us back to the LB_CONT, which + // will retry the match starting at another position or succeed + // the look-behind altogether, whichever is appropriate. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + + // Look-behind expression matched, which means look-behind test as + // a whole Fails + + // Restore the orignal input string length, which had been truncated + // inorder to pin the end of the lookbehind match + // to the position being looked-behind. + int32_t originalInputLen = fData[opValue+5]; + U_ASSERT(originalInputLen >= fActiveLimit); + U_ASSERT(originalInputLen <= fInputLength); + fActiveLimit = originalInputLen; + + // Restore original stack position, discarding any state saved + // by the successful pattern match. + U_ASSERT(opValue>=0 && opValue+1fDataSize); + int32_t newStackSize = fData[opValue]; + U_ASSERT(fStack->size() > newStackSize); + fStack->setSize(newStackSize); + + // FAIL, which will take control back to someplace + // prior to entering the look-behind test. + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + break; + + + case URX_LOOP_SR_I: + // Loop Initialization for the optimized implementation of + // [some character set]* + // This op scans through all matching input. + // The following LOOP_C op emulates stack unwinding if the following pattern fails. 
+ { + U_ASSERT(opValue > 0 && opValue < sets->size()); + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + + // Loop through input, until either the input is exhausted or + // we reach a character that is not a member of the set. + int32_t ix = fp->fInputIdx; + for (;;) { + if (ix >= fActiveLimit) { + fHitEnd = TRUE; + break; + } + UChar32 c; + U16_NEXT(inputBuf, ix, fActiveLimit, c); + if (c<256) { + if (s8->contains(c) == FALSE) { + U16_BACK_1(inputBuf, 0, ix); + break; + } + } else { + if (s->contains(c) == FALSE) { + U16_BACK_1(inputBuf, 0, ix); + break; + } + } + } + + // If there were no matching characters, skip over the loop altogether. + // The loop doesn't run at all, a * op always succeeds. + if (ix == fp->fInputIdx) { + fp->fPatIdx++; // skip the URX_LOOP_C op. + break; + } + + // Peek ahead in the compiled pattern, to the URX_LOOP_C that + // must follow. It's operand is the stack location + // that holds the starting input index for the match of this [set]* + int32_t loopcOp = pat[fp->fPatIdx]; + U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); + int32_t stackLoc = URX_VAL(loopcOp); + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); + fp->fExtra[stackLoc] = fp->fInputIdx; + #ifdef REGEX_SMART_BACKTRACKING + backSearchIndex = fp->fInputIdx; + #endif + fp->fInputIdx = ix; + + // Save State to the URX_LOOP_C op that follows this one, + // so that match failures in the following code will return to there. + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. + fp = StateSave(fp, fp->fPatIdx, status); + fp->fPatIdx++; + } + break; + + + case URX_LOOP_DOT_I: + // Loop Initialization for the optimized implementation of .* + // This op scans through all remaining input. + // The following LOOP_C op emulates stack unwinding if the following pattern fails. 
+ { + // Loop through input until the input is exhausted (we reach an end-of-line) + // In DOTALL mode, we can just go straight to the end of the input. + int32_t ix; + if ((opValue & 1) == 1) { + // Dot-matches-All mode. Jump straight to the end of the string. + ix = fActiveLimit; + fHitEnd = TRUE; + } else { + // NOT DOT ALL mode. Line endings do not match '.' + // Scan forward until a line ending or end of input. + ix = fp->fInputIdx; + for (;;) { + if (ix >= fActiveLimit) { + fHitEnd = TRUE; + break; + } + UChar32 c; + U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] + if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s + if ((c == 0x0a) || // 0x0a is newline in both modes. + ((opValue & 2) == 0) && // IF not UNIX_LINES mode + (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) { + // char is a line ending. Put the input pos back to the + // line ending char, and exit the scanning loop. + U16_BACK_1(inputBuf, 0, ix); + break; + } + } + } + } + + // If there were no matching characters, skip over the loop altogether. + // The loop doesn't run at all, a * op always succeeds. + if (ix == fp->fInputIdx) { + fp->fPatIdx++; // skip the URX_LOOP_C op. + break; + } + + // Peek ahead in the compiled pattern, to the URX_LOOP_C that + // must follow. It's operand is the stack location + // that holds the starting input index for the match of this .* + int32_t loopcOp = pat[fp->fPatIdx]; + U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); + int32_t stackLoc = URX_VAL(loopcOp); + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); + fp->fExtra[stackLoc] = fp->fInputIdx; + #ifdef REGEX_SMART_BACKTRACKING + backSearchIndex = fp->fInputIdx; + #endif + fp->fInputIdx = ix; + + // Save State to the URX_LOOP_C op that follows this one, + // so that match failures in the following code will return to there. + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 
+ fp = StateSave(fp, fp->fPatIdx, status); + fp->fPatIdx++; + } + break; + + + case URX_LOOP_C: + { + U_ASSERT(opValue>=0 && opValuefExtra[opValue]; + U_ASSERT(backSearchIndex <= fp->fInputIdx); + if (backSearchIndex == fp->fInputIdx) { + // We've backed up the input idx to the point that the loop started. + // The loop is done. Leave here without saving state. + // Subsequent failures won't come back here. + break; + } + // Set up for the next iteration of the loop, with input index + // backed up by one from the last time through, + // and a state save to this instruction in case the following code fails again. + // (We're going backwards because this loop emulates stack unwinding, not + // the initial scan forward.) + U_ASSERT(fp->fInputIdx > 0); + UChar32 prevC; + U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? + + if (prevC == 0x0a && + fp->fInputIdx > backSearchIndex && + inputBuf[fp->fInputIdx-1] == 0x0d) { + int32_t prevOp = pat[fp->fPatIdx-2]; + if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { + // .*, stepping back over CRLF pair. + U16_BACK_1(inputBuf, 0, fp->fInputIdx); + } + } + + + fp = StateSave(fp, fp->fPatIdx-1, status); + } + break; + + + + default: + // Trouble. The compiled pattern contains an entry with an + // unrecognized type tag. + U_ASSERT(FALSE); + } + + if (U_FAILURE(status)) { + isMatch = FALSE; + break; + } + } + +breakFromLoop: + fMatch = isMatch; + if (isMatch) { + fLastMatchEnd = fMatchEnd; + fMatchStart = startIdx; + fMatchEnd = fp->fInputIdx; + if (fTraceDebug) { + REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); + } + } + else + { + if (fTraceDebug) { + REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); + } + } + + fFrame = fp; // The active stack frame when the engine stopped. + // Contains the capture group results that we need to + // access later. 
+ return; +} + UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index bfe2520ad61..8060b87767e 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -3,7 +3,7 @@ // /* *************************************************************************** -* Copyright (C) 2002-2009 International Business Machines Corporation * +* Copyright (C) 2002-2010 International Business Machines Corporation * * and others. All rights reserved. * *************************************************************************** */ @@ -29,11 +29,11 @@ U_NAMESPACE_BEGIN // //-------------------------------------------------------------------------- RegexPattern::RegexPattern() { - // Init all of this instance's data. - init(); + UErrorCode status = U_ZERO_ERROR; + u_init(&status); - // Lazy init of all shared global sets. - RegexStaticSets::initGlobals(&fDeferredStatus); + // Init all of this instances data. + init(); } @@ -52,7 +52,7 @@ RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) { //-------------------------------------------------------------------------- // -// Assignmenet Operator +// Assignment Operator // //-------------------------------------------------------------------------- RegexPattern &RegexPattern::operator = (const RegexPattern &other) { @@ -68,7 +68,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { init(); // Copy simple fields - fPattern = other.fPattern; + fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus); fFlags = other.fFlags; fLiteralText = other.fLiteralText; fDeferredStatus = other.fDeferredStatus; @@ -85,6 +85,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { *fInitialChars = *other.fInitialChars; fInitialChar = other.fInitialChar; *fInitialChars8 = *other.fInitialChars8; + fNeedsAltInput = other.fNeedsAltInput; // Copy the pattern. 
It's just values, nothing deep to copy. fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus); @@ -126,7 +127,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { // //-------------------------------------------------------------------------- void RegexPattern::init() { - fPattern.remove(); fFlags = 0; fCompiledPat = 0; fLiteralText.remove(); @@ -146,7 +146,9 @@ void RegexPattern::init() { fInitialChars = NULL; fInitialChar = 0; fInitialChars8 = NULL; + fNeedsAltInput = FALSE; + fPattern = NULL; // will be set later fCompiledPat = new UVector32(fDeferredStatus); fGroupMap = new UVector32(fDeferredStatus); fSets = new UVector(fDeferredStatus); @@ -192,6 +194,9 @@ void RegexPattern::zap() { fInitialChars = NULL; delete fInitialChars8; fInitialChars8 = NULL; + if (fPattern != NULL) { + utext_close(fPattern); + } } @@ -220,13 +225,27 @@ RegexPattern *RegexPattern::clone() const { // // operator == (comparison) Consider to patterns to be == if the // pattern strings and the flags are the same. +// Note that pattern strings with the same +// characters can still be considered different. 
// //-------------------------------------------------------------------------- UBool RegexPattern::operator ==(const RegexPattern &other) const { - UBool r = this->fFlags == other.fFlags && - this->fPattern == other.fPattern && - this->fDeferredStatus == other.fDeferredStatus; - return r; + if (this->fPattern == NULL) { + if (other.fPattern == NULL) { + return this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus; + } else { + return FALSE; + } + } else { + if (other.fPattern == NULL) { + return FALSE; + } else { + UTEXT_SETNATIVEINDEX(this->fPattern, 0); + UTEXT_SETNATIVEINDEX(other.fPattern, 0); + return this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus && + utext_equals(this->fPattern, other.fPattern); + } + } } //--------------------------------------------------------------------- @@ -240,7 +259,57 @@ RegexPattern::compile(const UnicodeString ®ex, UParseError &pe, UErrorCode &status) { + if (U_FAILURE(status)) { + return NULL; + } + + const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | + UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | + UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES; + + if ((flags & ~allFlags) != 0) { + status = U_REGEX_INVALID_FLAG; + return NULL; + } + + if ((flags & UREGEX_CANON_EQ) != 0) { + status = U_REGEX_UNIMPLEMENTED; + return NULL; + } + + RegexPattern *This = new RegexPattern; + if (This == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + if (U_FAILURE(This->fDeferredStatus)) { + status = This->fDeferredStatus; + delete This; + return NULL; + } + This->fFlags = flags; + + RegexCompile compiler(This, status); + compiler.compile(regex, pe, status); + + if (U_FAILURE(status)) { + delete This; + This = NULL; + } + + return This; +} + +// +// compile, UText mode +// +RegexPattern * U_EXPORT2 +RegexPattern::compile(UText *regex, + uint32_t flags, + UParseError &pe, + UErrorCode &status) +{ if (U_FAILURE(status)) { return 
NULL; } @@ -294,20 +363,43 @@ RegexPattern::compile(const UnicodeString ®ex, } +// +// compile with default flags, UText mode +// +RegexPattern * U_EXPORT2 +RegexPattern::compile(UText *regex, + UParseError &pe, + UErrorCode &err) +{ + return compile(regex, 0, pe, err); +} + // // compile with no UParseErr parameter. // RegexPattern * U_EXPORT2 -RegexPattern::compile( const UnicodeString ®ex, - uint32_t flags, - UErrorCode &err) +RegexPattern::compile(const UnicodeString ®ex, + uint32_t flags, + UErrorCode &err) { UParseError pe; return compile(regex, flags, pe, err); } +// +// compile with no UParseErr parameter, UText mode +// +RegexPattern * U_EXPORT2 +RegexPattern::compile(UText *regex, + uint32_t flags, + UErrorCode &err) +{ + UParseError pe; + return compile(regex, flags, pe, err); +} + //--------------------------------------------------------------------- // @@ -327,8 +419,21 @@ uint32_t RegexPattern::flags() const { RegexMatcher *RegexPattern::matcher(const UnicodeString &input, UErrorCode &status) const { RegexMatcher *retMatcher = matcher(status); - retMatcher->fDeferredStatus = status; if (retMatcher != NULL) { + retMatcher->fDeferredStatus = status; + retMatcher->reset(input); + } + return retMatcher; +} + +// +// matcher, UText mode +// +RegexMatcher *RegexPattern::matcher(UText *input, + UErrorCode &status) const { + RegexMatcher *retMatcher = matcher(status); + if (retMatcher != NULL) { + retMatcher->fDeferredStatus = status; retMatcher->reset(input); } return retMatcher; @@ -399,6 +504,31 @@ UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex, } +// +// matches, UText mode +// +UBool U_EXPORT2 RegexPattern::matches(UText *regex, + UText *input, + UParseError &pe, + UErrorCode &status) { + + if (U_FAILURE(status)) {return FALSE;} + + UBool retVal; + RegexPattern *pat = NULL; + RegexMatcher *matcher = NULL; + + pat = RegexPattern::compile(regex, 0, pe, status); + matcher = pat->matcher(input, status); + retVal = matcher->matches(status); + 
+ delete matcher; + delete pat; + return retVal; +} + + + //--------------------------------------------------------------------- @@ -407,12 +537,43 @@ UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex, // //--------------------------------------------------------------------- UnicodeString RegexPattern::pattern() const { - return fPattern; + if (fPattern == NULL) { + return UnicodeString(); + } else { + UErrorCode status = U_ZERO_ERROR; + int64_t nativeLen = utext_nativeLength(fPattern); + int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error + UnicodeString result; + + status = U_ZERO_ERROR; + UChar *resultChars = result.getBuffer(len16); + utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning + result.releaseBuffer(len16); + + return result; + } } +//--------------------------------------------------------------------- +// +// patternText +// +//--------------------------------------------------------------------- +UText *RegexPattern::patternText() const { + if (fPattern != NULL) { + return fPattern; + } else { + UErrorCode status = U_ZERO_ERROR; + RegexStaticSets::initGlobals(&status); + return RegexStaticSets::gStaticSets->fEmptyText; + } +} + + + //--------------------------------------------------------------------- // // split @@ -421,7 +582,28 @@ UnicodeString RegexPattern::pattern() const { int32_t RegexPattern::split(const UnicodeString &input, UnicodeString dest[], int32_t destCapacity, - UErrorCode &status) const + UErrorCode &status) const +{ + if (U_FAILURE(status)) { + return 0; + }; + + RegexMatcher m(this); + int32_t r = 0; + // Check m's status to make sure all is ok. 
+ if (U_SUCCESS(m.fDeferredStatus)) { + r = m.split(input, dest, destCapacity, status); + } + return r; +} + +// +// split, UText mode +// +int32_t RegexPattern::split(UText *input, + UText *dest[], + int32_t destCapacity, + UErrorCode &status) const { if (U_FAILURE(status)) { return 0; @@ -572,17 +754,24 @@ RegexPatternDump(const RegexPattern *This) { int i; REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: ")); - for (i=0; ifPattern.length(); i++) { - REGEX_DUMP_DEBUG_PRINTF(("%c", This->fPattern.charAt(i))); + UChar32 c = utext_next32From(This->fPattern, 0); + while (c != U_SENTINEL) { + if (c<32 || c>256) { + c = '.'; + } + REGEX_DUMP_DEBUG_PRINTF(("%c", c)); + + c = UTEXT_NEXT32(This->fPattern); } REGEX_DUMP_DEBUG_PRINTF(("\n")); REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen)); REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType))); if (This->fStartType == START_STRING) { - REGEX_DUMP_DEBUG_PRINTF((" Initial match sting: \"")); + REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \"")); for (i=This->fInitialStringIdx; ifInitialStringIdx+This->fInitialStringLen; i++) { REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i])); // TODO: non-printables, surrogates. } + REGEX_DUMP_DEBUG_PRINTF(("\"\n")); } else if (This->fStartType == START_SET) { int32_t numSetChars = This->fInitialChars->size(); diff --git a/icu4c/source/i18n/unicode/regex.h b/icu4c/source/i18n/unicode/regex.h index 184c57b26be..2f2136c4475 100644 --- a/icu4c/source/i18n/unicode/regex.h +++ b/icu4c/source/i18n/unicode/regex.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2002-2009, International Business Machines +* Copyright (C) 2002-2010, International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** * file name: regex.h @@ -48,6 +48,7 @@ #include "unicode/uobject.h" #include "unicode/unistr.h" +#include "unicode/utext.h" #include "unicode/parseerr.h" #include "unicode/uregex.h" @@ -187,6 +188,35 @@ public: UParseError &pe, UErrorCode &status); + + /** + * Compiles the regular expression in string form into a RegexPattern + * object. These compile methods, rather than the constructors, are the usual + * way that RegexPattern objects are created. + * + *

Note that RegexPattern objects must not be deleted while RegexMatcher + * objects created from the pattern are active. RegexMatchers keep a pointer + * back to their pattern, so premature deletion of the pattern is a + * catastrophic error.

+ * + *

All pattern match mode flags are set to their default values.

+ * + *

Note that it is often more convenient to construct a RegexMatcher directly + * from a pattern string rather than separately compiling the pattern and + * then creating a RegexMatcher object from the pattern.

+ * + * @param regex The regular expression to be compiled. + * @param pe Receives the position (line and column numbers) of any error + * within the regular expression.) + * @param status A reference to a UErrorCode to receive any errors. + * @return A regexPattern object for the compiled pattern. + * + * @internal ICU 4.4 technology preview + */ + static RegexPattern * U_EXPORT2 compile( UText *regex, + UParseError &pe, + UErrorCode &status); + /** * Compiles the regular expression in string form into a RegexPattern * object using the specified match mode flags. These compile methods, @@ -204,7 +234,7 @@ public: * * @param regex The regular expression to be compiled. * @param flags The match mode flags to be used. - * @param pe Receives the position (line and column nubers) of any error + * @param pe Receives the position (line and column numbers) of any error * within the regular expression.) * @param status A reference to a UErrorCode to receive any errors. * @return A regexPattern object for the compiled pattern. @@ -215,7 +245,37 @@ public: uint32_t flags, UParseError &pe, UErrorCode &status); - + + + /** + * Compiles the regular expression in string form into a RegexPattern + * object using the specified match mode flags. These compile methods, + * rather than the constructors, are the usual way that RegexPattern objects + * are created. + * + *

Note that RegexPattern objects must not be deleted while RegexMatcher + * objects created from the pattern are active. RegexMatchers keep a pointer + * back to their pattern, so premature deletion of the pattern is a + * catastrophic error.

+ * + *

Note that it is often more convenient to construct a RegexMatcher directly + * from a pattern string rather than separately compiling the pattern and + * then creating a RegexMatcher object from the pattern.

+ * + * @param regex The regular expression to be compiled. + * @param flags The match mode flags to be used. + * @param pe Receives the position (line and column numbers) of any error + * within the regular expression.) + * @param status A reference to a UErrorCode to receive any errors. + * @return A regexPattern object for the compiled pattern. + * + * @internal ICU 4.4 technology preview + */ + static RegexPattern * U_EXPORT2 compile( UText *regex, + uint32_t flags, + UParseError &pe, + UErrorCode &status); + /** * Compiles the regular expression in string form into a RegexPattern @@ -244,6 +304,33 @@ public: UErrorCode &status); + /** + * Compiles the regular expression in string form into a RegexPattern + * object using the specified match mode flags. These compile methods, + * rather than the constructors, are the usual way that RegexPattern objects + * are created. + * + *

Note that RegexPattern objects must not be deleted while RegexMatcher + * objects created from the pattern are active. RegexMatchers keep a pointer + * back to their pattern, so premature deletion of the pattern is a + * catastrophic error.

+ * + *

Note that it is often more convenient to construct a RegexMatcher directly + * from a pattern string rather than separately compiling the pattern and + * then creating a RegexMatcher object from the pattern.

+ * + * @param regex The regular expression to be compiled. + * @param flags The match mode flags to be used. + * @param status A reference to a UErrorCode to receive any errors. + * @return A regexPattern object for the compiled pattern. + * + * @internal ICU 4.4 technology preview + */ + static RegexPattern * U_EXPORT2 compile( UText *regex, + uint32_t flags, + UErrorCode &status); + + /** * Get the match mode flags that were used when compiling this pattern. * @return the match mode flags @@ -270,6 +357,27 @@ public: */ virtual RegexMatcher *matcher(const UnicodeString &input, UErrorCode &status) const; + + + /** + * Creates a RegexMatcher that will match the given input against this pattern. The + * RegexMatcher can then be used to perform match, find or replace operations + * on the input. Note that a RegexPattern object must not be deleted while + * RegexMatchers created from it still exist and might possibly be used again. + *

+ * The matcher will make a shallow clone of the supplied input text, and all regexp + * pattern matching operations happen on this clone. While read-only operations on + * the supplied text are permitted, it is critical that the underlying string not be + * altered or deleted before use by the regular expression operations is complete. + * + * @param input The input text to which the regular expression will be applied. + * @param status A reference to a UErrorCode to receive any errors. + * @return A RegexMatcher object for this pattern and input. + * + * @internal ICU 4.4 technology preview + */ + virtual RegexMatcher *matcher(UText *input, + UErrorCode &status) const; private: /** @@ -280,6 +388,8 @@ private: * To efficiently work with UChar *strings, wrap the data in a UnicodeString * using one of the aliasing constructors, such as * UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength); + * or in a UText, using + * utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status); * * @internal */ @@ -318,15 +428,52 @@ public: */ static UBool U_EXPORT2 matches(const UnicodeString &regex, const UnicodeString &input, + UParseError &pe, + UErrorCode &status); + + + /** + * Test whether a string matches a regular expression. This convenience function + * both compiles the regular expression and applies it in a single operation. + * Note that if the same pattern needs to be applied repeatedly, this method will be + * less efficient than creating and reusing a RegexMatcher object. + * + * @param regex The regular expression + * @param input The string data to be matched + * @param pe Receives the position of any syntax errors within the regular expression + * @param status A reference to a UErrorCode to receive any errors. + * @return True if the regular expression exactly matches the full input string.
+ * + * @internal ICU 4.4 technology preview + */ + static UBool U_EXPORT2 matches(UText *regex, + UText *input, UParseError &pe, UErrorCode &status); /** - * Returns the regular expression from which this pattern was compiled. - * @stable ICU 2.4 + * Returns the regular expression from which this pattern was compiled. This method will work + * even if the pattern was compiled from a UText. + * + * Note: If the pattern was originally compiled from a UText, and that UText was modified, + * the returned string may no longer reflect the RegexPattern object. + * @stable ICU 2.4 */ virtual UnicodeString pattern() const; + + + /** + * Returns the regular expression from which this pattern was compiled. This method will work + * even if the pattern was compiled from a UnicodeString. + * + * Note: This is the original input, not a clone. If the pattern was originally compiled from a + * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern + * object. + * + * @internal ICU 4.4 technology preview + */ + virtual UText *patternText() const; /** @@ -360,6 +507,37 @@ public: UErrorCode &status) const; + /** + * Split a string into fields. Somewhat like split() from Perl. + * The pattern matches identify delimiters that separate the input + * into fields. The input data between the matches becomes the + * fields themselves. + *

+ * For the best performance on split() operations, + * RegexMatcher::split is preferable to this function + * + * @param input The string to be split into fields. The field delimiters + * match the pattern (in the "this" object) + * @param dest An array of mutable UText structs to receive the results of the split. + * If a field is NULL, a new UText is allocated to contain the results for + * that field. This new UText is not guaranteed to be mutable. + * @param destCapacity The number of elements in the destination array. + * If the number of fields found is less than destCapacity, the + * extra strings in the destination array are not altered. + * If the number of destination strings is less than the number + * of fields, the trailing part of the input string, including any + * field delimiters, is placed in the last destination string. + * @param status A reference to a UErrorCode to receive any errors. + * @return The number of fields into which the input string was split. + * + * @internal ICU 4.4 technology preview + */ + virtual int32_t split(UText *input, + UText *dest[], + int32_t destCapacity, + UErrorCode &status) const; + + /** * ICU "poor man's RTTI", returns a UClassID for the actual class. * @@ -378,7 +556,7 @@ private: // // Implementation Data // - UnicodeString fPattern; // The original pattern string. + UText *fPattern; // The original pattern string. uint32_t fFlags; // The flags used when compiling the pattern. // UVector32 *fCompiledPat; // The compiled pattern p-code. @@ -396,7 +574,7 @@ private: // >= this value. For some patterns, this calculated // value may be less than the true shortest // possible match. - + int32_t fFrameSize; // Size of a state stack frame in the // execution engine.
@@ -421,6 +599,7 @@ private: UnicodeSet *fInitialChars; UChar32 fInitialChar; Regex8BitSet *fInitialChars8; + UBool fNeedsAltInput; friend class RegexCompile; friend class RegexMatcher; @@ -468,6 +647,23 @@ public: */ RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status); + /** + * Construct a RegexMatcher for a regular expression. + * This is a convenience method that avoids the need to explicitly create + * a RegexPattern object. Note that if several RegexMatchers need to be + * created for the same expression, it will be more efficient to + * separately create and cache a RegexPattern object, and use + * its matcher() method to create the RegexMatcher objects. + * + * @param regexp The regular expression to be compiled. + * @param flags Regular expression options, such as case insensitive matching. + * @see UREGEX_CASE_INSENSITIVE + * @param status Any errors are reported by setting this UErrorCode variable. + * + * @internal ICU 4.4 technology preview + */ + RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status); + /** * Construct a RegexMatcher for a regular expression. * This is a convenience method that avoids the need to explicitly create @@ -492,6 +688,30 @@ public: RegexMatcher(const UnicodeString &regexp, const UnicodeString &input, uint32_t flags, UErrorCode &status); + /** + * Construct a RegexMatcher for a regular expression. + * This is a convenience method that avoids the need to explicitly create + * a RegexPattern object. Note that if several RegexMatchers need to be + * created for the same expression, it will be more efficient to + * separately create and cache a RegexPattern object, and use + * its matcher() method to create the RegexMatcher objects. + *

+ * The matcher will make a shallow clone of the supplied input text, and all regexp + * pattern matching operations happen on this clone. While read-only operations on + * the supplied text are permitted, it is critical that the underlying string not be + * altered or deleted before use by the regular expression operations is complete. + * + * @param regexp The Regular Expression to be compiled. + * @param input The string to match. The matcher retains a shallow clone of the text. + * @param flags Regular expression options, such as case insensitive matching. + * @see UREGEX_CASE_INSENSITIVE + * @param status Any errors are reported by setting this UErrorCode variable. + * + * @internal ICU 4.4 technology preview + */ + RegexMatcher(UText *regexp, UText *input, + uint32_t flags, UErrorCode &status); + private: /** * Cause a compilation error if an application accidently attempts to @@ -501,6 +721,8 @@ private: * To efficiently work with UChar *strings, wrap the data in a UnicodeString * using one of the aliasing constructors, such as * UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength); + * or in a UText, using + * utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status); * * @internal */ @@ -525,6 +747,7 @@ public: */ virtual UBool matches(UErrorCode &status); + /** * Resets the matcher, then attempts to match the input beginning * at the specified startIndex, and extending to the end of the input. @@ -538,8 +761,6 @@ public: virtual UBool matches(int32_t startIndex, UErrorCode &status); - - /** * Attempts to match the input string, starting from the beginning of the region, * against the pattern. Like the matches() method, this function @@ -571,6 +792,7 @@ public: */ virtual UBool lookingAt(int32_t startIndex, UErrorCode &status); + /** * Find the next pattern match in the input string. 
* The find begins searching the input at the location following the end of @@ -610,6 +832,22 @@ public: virtual UnicodeString group(UErrorCode &status) const; + /** + * Returns a string containing the text matched by the previous match. + * If the pattern can match an empty string, an empty string may be returned. + * @param dest A mutable UText in which the matching text is placed. + * If NULL, a new UText will be created (which may not be mutable). + * @param status A reference to a UErrorCode to receive any errors. + * Possible errors are U_REGEX_INVALID_STATE if no match + * has been attempted or the last match failed. + * @return A string containing the matched input text. If a pre-allocated UText + * was provided, it will always be used and returned. + * + * @internal ICU 4.4 technology preview + */ + virtual UText *group(UText *dest, UErrorCode &status) const; + + /** * Returns a string containing the text captured by the given group * during the previous match operation. Group(0) is the entire match. @@ -625,6 +863,24 @@ public: virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; + /** + * Returns a string containing the text captured by the given group + * during the previous match operation. Group(0) is the entire match. + * + * @param groupNum the capture group number + * @param dest A mutable UText in which the matching text is placed. + * If NULL, a new UText will be created (which may not be mutable). + * @param status A reference to a UErrorCode to receive any errors. + * Possible errors are U_REGEX_INVALID_STATE if no match + * has been attempted or the last match failed. + * @return A string containing the matched input text. If a pre-allocated UText + * was provided, it will always be used and returned. + * + * @internal ICU 4.4 technology preview + */ + virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const; + + /** * Returns the number of capturing groups in this matcher's pattern. 
* @return the number of capture groups @@ -726,11 +982,31 @@ public: * Because no copy of the string is made, it is essential that the * caller not delete the string until after regexp operations on it * are done. + * Note that while a reset on the matcher with an input string that is then + * modified across/during matcher operations may be supported currently for UnicodeString, + * this was not originally intended behavior, and support for this is not guaranteed + * in upcoming versions of ICU. * @return this RegexMatcher. * @stable ICU 2.4 */ virtual RegexMatcher &reset(const UnicodeString &input); + + /** + * Resets this matcher with a new input string. This allows instances of RegexMatcher + * to be reused, which is more efficient than creating a new RegexMatcher for + * each input string to be processed. + * @param input The new string on which subsequent pattern matches will operate. + * The matcher makes a shallow clone of the given text; ownership of the + * original string remains with the caller. Because no deep copy of the + * text is made, it is essential that the caller not modify the string + * until after regexp operations on it are done. + * @return this RegexMatcher. + * + * @internal ICU 4.4 technology preview + */ + virtual RegexMatcher &reset(UText *input); + private: /** * Cause a compilation error if an application accidently attempts to @@ -740,6 +1016,8 @@ private: * To efficiently work with UChar *strings, wrap the data in a UnicodeString * using one of the aliasing constructors, such as * UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength); + * or in a UText, using + * utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status); * * @internal */ @@ -747,13 +1025,34 @@ private: public: /** - * Returns the input string being matched. The returned string is not a copy, - * but the live input string. It should not be altered or deleted. + * Returns the input string being matched. 
Ownership of the string belongs to + * the matcher; it should not be altered or deleted. This method will work even if the input + * was originally supplied as a UText. * @return the input string * @stable ICU 2.4 */ virtual const UnicodeString &input() const; + /** + * Returns the input string being matched. This is the live input text; it should not be + * altered or deleted. This method will work even if the input was originally supplied as + * a UnicodeString. + * @return the input text + * + * @internal ICU 4.4 technology preview + */ + virtual UText *inputText() const; + + /** + * Returns the input string being matched, either by copying it into the provided + * UText parameter or by returning a shallow clone of the live input. Note that copying + * the entire input may cause significant performance and memory issues. + * @param dest The UText into which the input should be copied, or NULL to create a new UText + * @return dest if non-NULL, a shallow copy of the input text otherwise + * + * @internal ICU 4.4 technology preview + */ + virtual UText *getInput(UText *dest) const; /** Sets the limits of this matcher's region. @@ -838,6 +1137,7 @@ public: */ virtual UBool hasAnchoringBounds() const; + /** * Set whether this matcher is using Anchoring Bounds for its region. * With anchoring bounds, pattern anchors such as ^ and $ will match at the start @@ -852,6 +1152,7 @@ public: */ virtual RegexMatcher &useAnchoringBounds(UBool b); + /** * Return TRUE if the most recent matching operation touched the * end of the text being processed. In this case, additional input text could @@ -878,9 +1179,6 @@ public: virtual UBool requireEnd() const; - - - /** * Returns the pattern that is interpreted by this matcher. 
* @return the RegexPattern for this RegexMatcher @@ -908,6 +1206,29 @@ public: virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); + /** + * Replaces every substring of the input that matches the pattern + * with the given replacement string. This is a convenience function that + * provides a complete find-and-replace-all operation. + * + * This method first resets this matcher. It then scans the input string + * looking for matches of the pattern. Input that is not part of any + * match is left unchanged; each match is replaced in the result by the + * replacement string. The replacement string may contain references to + * capture groups. + * + * @param replacement a string containing the replacement text. + * @param dest a mutable UText in which the results are placed. + * If NULL, a new UText will be created (which may not be mutable). + * @param status a reference to a UErrorCode to receive any errors. + * @return a string containing the results of the find and replace. + * If a pre-allocated UText was provided, it will always be used and returned. + * + * @internal ICU 4.4 technology preview + */ + virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status); + + /** * Replaces the first substring of the input that matches * the pattern with the replacement string. This is a convenience @@ -929,7 +1250,35 @@ public: * @stable ICU 2.4 */ virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); + + /** + * Replaces the first substring of the input that matches + * the pattern with the replacement string. This is a convenience + * function that provides a complete find-and-replace operation. + * + *

This function first resets this RegexMatcher. It then scans the input string + * looking for a match of the pattern. Input that is not part + * of the match is appended directly to the result string; the match is replaced + * in the result by the replacement string. The replacement string may contain + * references to captured groups.

+ * + *

The state of the matcher (the position at which a subsequent find() + * would begin) after completing a replaceFirst() is not specified. The + * RegexMatcher should be reset before doing additional find() operations.

+ * + * @param replacement a string containing the replacement text. + * @param dest a mutable UText in which the results are placed. + * If NULL, a new UText will be created (which may not be mutable). + * @param status a reference to a UErrorCode to receive any errors. + * @return a string containing the results of the find and replace. + * If a pre-allocated UText was provided, it will always be used and returned. + * + * @internal ICU 4.4 technology preview + */ + virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status); + + /** * Implements a replace operation intended to be used as part of an * incremental find-and-replace. @@ -959,6 +1308,37 @@ public: */ virtual RegexMatcher &appendReplacement(UnicodeString &dest, const UnicodeString &replacement, UErrorCode &status); + + + /** + * Implements a replace operation intended to be used as part of an + * incremental find-and-replace. + * + *

The input string, starting from the end of the previous replacement and ending at + * the start of the current match, is appended to the destination string. Then the + * replacement string is appended to the output string, + * including handling any substitutions of captured text.

+ * + *

For simple, prepackaged, non-incremental find-and-replace + * operations, see replaceFirst() or replaceAll().

+ * + * @param dest A mutable UText to which the results of the find-and-replace are appended. + * Must not be NULL. + * @param replacement A UText that provides the text to be substituted for + * the input text that matched the regexp pattern. The replacement + * text may contain references to captured text from the input. + * @param status A reference to a UErrorCode to receive any errors. Possible + * errors are U_REGEX_INVALID_STATE if no match has been + * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR + * if the replacement text specifies a capture group that + * does not exist in the pattern. + * + * @return this RegexMatcher + * + * @internal ICU 4.4 technology preview + */ + virtual RegexMatcher &appendReplacement(UText *dest, + UText *replacement, UErrorCode &status); /** @@ -974,13 +1354,26 @@ public: virtual UnicodeString &appendTail(UnicodeString &dest); + /** + * As the final step in a find-and-replace operation, append the remainder + * of the input string, starting at the position following the last appendReplacement(), + * to the destination string. appendTail() is intended to be invoked after one + * or more invocations of the RegexMatcher::appendReplacement(). + * + * @param dest A mutable UText to which the results of the find-and-replace are appended. + * Must not be NULL. + * @return the destination string. + * + * @internal ICU 4.4 technology preview + */ + virtual UText *appendTail(UText *dest); + /** * Split a string into fields. Somewhat like split() from Perl. * The pattern matches identify delimiters that separate the input * into fields. The input data between the matches becomes the * fields themselves. - *

* * @param input The string to be split into fields. The field delimiters * match the pattern (in the "this" object). This matcher @@ -1004,6 +1397,35 @@ public: int32_t destCapacity, UErrorCode &status); + + /** + * Split a string into fields. Somewhat like split() from Perl. + * The pattern matches identify delimiters that separate the input + * into fields. The input data between the matches becomes the + * fields themselves. + * + * @param input The string to be split into fields. The field delimiters + * match the pattern (in the "this" object). This matcher + * will be reset to this input string. + * @param dest An array of mutable UText structs to receive the results of the split. + * If a field is NULL, a new UText is allocated to contain the results for + * that field. This new UText is not guaranteed to be mutable. + * @param destCapacity The number of elements in the destination array. + * If the number of fields found is less than destCapacity, the + * extra strings in the destination array are not altered. + * If the number of destination strings is less than the number + * of fields, the trailing part of the input string, including any + * field delimiters, is placed in the last destination string. + * @param status A reference to a UErrorCode to receive any errors. + * @return The number of fields into which the input string was split. + * + * @internal ICU 4.4 technology preview + */ + virtual int32_t split(UText *input, + UText *dest[], + int32_t destCapacity, + UErrorCode &status); + /** * Set a processing time limit for match operations with this Matcher. * @@ -1086,7 +1508,6 @@ public: UErrorCode &status); - /** * Get the callback function for this URegularExpression. * @@ -1132,7 +1553,7 @@ private: RegexMatcher(const RegexMatcher &other); RegexMatcher &operator =(const RegexMatcher &rhs); void init(UErrorCode &status); // Common initialization - void init2(const UnicodeString &s, UErrorCode &e); // Common initialization, part 2. 
+ void init2(UText *t, UErrorCode &e); // Common initialization, part 2. friend class RegexPattern; friend class RegexCImpl; @@ -1145,34 +1566,43 @@ private: // MatchAt This is the internal interface to the match engine itself. // Match status comes back in matcher member variables. // - void MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status); - inline void backTrack(int32_t &inputIdx, int32_t &patIdx); - UBool isWordBoundary(int32_t pos); // perform Perl-like \b test - UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test + void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status); + inline void backTrack(int64_t &inputIdx, int32_t &patIdx); + UBool isWordBoundary(int64_t pos); // perform Perl-like \b test + UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test REStackFrame *resetStack(); inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status); void IncrementTime(UErrorCode &status); - + + int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const; + + UBool findUsingChunk(); + void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status); + UBool isChunkWordBoundary(int32_t pos); const RegexPattern *fPattern; RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and // should delete it when through. - const UnicodeString *fInput; // The text being matched. Is never NULL. + const UnicodeString *fInput; // The string being matched. Only used for input() + UText *fInputText; // The text being matched. Is never NULL. + UText *fAltInputText; // A shallow copy of the text being matched. + // Only created if the pattern contains backreferences. + int64_t fInputLength; // Full length of the input text. int32_t fFrameSize; // The size of a frame in the backtrack stack. - int32_t fRegionStart; // Start of the input region, default = 0. - int32_t fRegionLimit; // End of input region, default to input.length. 
+ int64_t fRegionStart; // Start of the input region, default = 0. + int64_t fRegionLimit; // End of input region, default to input.length. - int32_t fAnchorStart; // Region bounds for anchoring operations (^ or $). - int32_t fAnchorLimit; // See useAnchoringBounds + int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $). + int64_t fAnchorLimit; // See useAnchoringBounds - int32_t fLookStart; // Region bounds for look-ahead/behind and - int32_t fLookLimit; // and other boundary tests. See + int64_t fLookStart; // Region bounds for look-ahead/behind and + int64_t fLookLimit; // and other boundary tests. See // useTransparentBounds - int32_t fActiveStart; // Currently active bounds for matching. - int32_t fActiveLimit; // Usually is the same as region, but + int64_t fActiveStart; // Currently active bounds for matching. + int64_t fActiveLimit; // Usually is the same as region, but // is changed to fLookStart/Limit when // entering look around regions. @@ -1180,13 +1610,13 @@ private: UBool fAnchoringBounds; // True if using anchoring bounds. UBool fMatch; // True if the last attempted match was successful. - int32_t fMatchStart; // Position of the start of the most recent match - int32_t fMatchEnd; // First position after the end of the most recent match + int64_t fMatchStart; // Position of the start of the most recent match + int64_t fMatchEnd; // First position after the end of the most recent match // Zero if no previous match, even when a region // is active. - int32_t fLastMatchEnd; // First position after the end of the previous match, + int64_t fLastMatchEnd; // First position after the end of the previous match, // or -1 if there was no previous match. - int32_t fAppendPosition; // First position after the end of the previous + int64_t fAppendPosition; // First position after the end of the previous // appendReplacement(). 
As described by the // JavaDoc for Java Matcher, where it is called // "append position" @@ -1218,6 +1648,8 @@ private: // NULL if there is no callback. const void *fCallbackContext; // User Context ptr for callback function. + UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility. + UBool fTraceDebug; // Set true for debug tracing of match engine. UErrorCode fDeferredStatus; // Save error state that cannot be immediately diff --git a/icu4c/source/i18n/unicode/uregex.h b/icu4c/source/i18n/unicode/uregex.h index 73f2c80b780..3567e61e5ed 100644 --- a/icu4c/source/i18n/unicode/uregex.h +++ b/icu4c/source/i18n/unicode/uregex.h @@ -3,7 +3,7 @@ * Copyright (C) 2004-2010, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** -* file name: regex.h +* file name: uregex.h * encoding: US-ASCII * indentation:4 * @@ -23,6 +23,7 @@ #ifndef UREGEX_H #define UREGEX_H +#include "unicode/utext.h" #include "unicode/utypes.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS @@ -112,6 +113,7 @@ typedef enum URegexpFlag{ * string form into an internal representation using the specified match mode flags. * The resulting regular expression handle can then be used to perform various * matching operations. + * * * @param pattern The Regular Expression pattern to be compiled. * @param patternLength The length of the pattern, or -1 if the pattern is @@ -134,7 +136,36 @@ uregex_open( const UChar *pattern, uint32_t flags, UParseError *pe, UErrorCode *status); - + +/** + * Open (compile) an ICU regular expression. Compiles the regular expression in + * string form into an internal representation using the specified match mode flags. + * The resulting regular expression handle can then be used to perform various + * matching operations. + *

+ * The contents of the pattern UText will be extracted and saved. Ownership of the + * UText struct itself remains with the caller. This is to match the behavior of + * uregex_open(). + * + * @param pattern The Regular Expression pattern to be compiled. + * @param flags Flags that alter the default matching behavior for + * the regular expression, UREGEX_CASE_INSENSITIVE, for + * example. For default behavior, set this parameter to zero. + * See enum URegexpFlag. All desired flags + * are bitwise-ORed together. + * @param pe Receives the position (line and column numbers) of any syntax + * error within the source regular expression string. If this + * information is not wanted, pass NULL for this parameter. + * @param status Receives error detected by this function. + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL URegularExpression * U_EXPORT2 +uregex_openUText(UText *pattern, + uint32_t flags, + UParseError *pe, + UErrorCode *status); + /** * Open (compile) an ICU regular expression. The resulting regular expression * handle can then be used to perform various matching operations. @@ -219,7 +250,8 @@ U_STABLE URegularExpression * U_EXPORT2 uregex_clone(const URegularExpression *regexp, UErrorCode *status); /** - * Return a pointer to the source form of the pattern for this regular expression. + * Returns a pointer to the source form of the pattern for this regular expression. + * This function will work even if the pattern was originally specified as a UText. * * @param regexp The compiled regular expression.
* @param patLength This output parameter will be set to the length of the @@ -235,9 +267,24 @@ uregex_clone(const URegularExpression *regexp, UErrorCode *status); * @stable ICU 3.0 */ U_STABLE const UChar * U_EXPORT2 -uregex_pattern(const URegularExpression *regexp, - int32_t *patLength, - UErrorCode *status); +uregex_pattern(const URegularExpression *regexp, + int32_t *patLength, + UErrorCode *status); + +/** + * Returns the source text of the pattern for this regular expression. + * This function will work even if the pattern was originally specified as a UChar string. + * + * @param regexp The compiled regular expression. + * @param status Receives errors detected by this function. + * @return the pattern text. The storage for the text is owned by the regular expression + * object, and must not be altered or deleted. + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL UText * U_EXPORT2 +uregex_patternUText(const URegularExpression *regexp, + UErrorCode *status); /** @@ -279,10 +326,36 @@ uregex_setText(URegularExpression *regexp, int32_t textLength, UErrorCode *status); + +/** + * Set the subject text string upon which the regular expression will look for matches. + * This function may be called any number of times, allowing the regular + * expression pattern to be applied to different strings. + *

+ * Regular expression matching operations work directly on the application's + * string data; only a shallow clone is made. The subject string data must not be + * altered after calling this function until after all regular expression + * operations involving this string data are completed. + * + * @param regexp The compiled regular expression. + * @param text The subject text string. + * @param status Receives errors detected by this function. + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL void U_EXPORT2 +uregex_setUText(URegularExpression *regexp, + UText *text, + UErrorCode *status); + /** * Get the subject text that is currently associated with this - * regular expression object. This simply returns whatever string - * pointer was previously supplied via uregex_setText(). + * regular expression object. If the input was supplied using uregex_setText(), + * that pointer will be returned. Otherwise, the characters in the input will + * be extracted to a buffer and returned. In either case, ownership remains + * with the regular expression object. + * + * This function will work even if the input was originally specified as a UText. * * @param regexp The compiled regular expression. * @param textLength The length of the string is returned in this output parameter. @@ -291,7 +364,7 @@ uregex_setText(URegularExpression *regexp, * the text is known in advance to be a NUL terminated * string. * @param status Receives errors detected by this function. - * @return Poiner to the subject text string currently associated with + * @return Pointer to the subject text string currently associated with * this regular expression. * @stable ICU 3.0 */ @@ -299,6 +372,28 @@ U_STABLE const UChar * U_EXPORT2 uregex_getText(URegularExpression *regexp, int32_t *textLength, UErrorCode *status); + + +/** + * Get the subject text that is currently associated with this + * regular expression object. 
+ * + * This function will work even if the input was originally specified as a UChar string. + * + * @param regexp The compiled regular expression. + * @param dest A mutable UText in which to store the current input. + * If NULL, a new UText will be created as an immutable shallow clone + * of the actual input string. + * @param status Receives errors detected by this function. + * @return The subject text currently associated with this regular expression. + * If a pre-allocated UText was provided, it will always be used and returned. + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL UText * U_EXPORT2 +uregex_getUText(URegularExpression *regexp, + UText *dest, + UErrorCode *status); /** * Attempts to match the input string against the pattern. @@ -428,6 +523,29 @@ uregex_group(URegularExpression *regexp, int32_t destCapacity, UErrorCode *status); +/** Extract the string for the specified matching expression or subexpression. + * Group #0 is the complete string of matched text. + * Group #1 is the text matched by the first set of capturing parentheses. + * + * @param regexp The compiled regular expression. + * @param groupNum The capture group to extract. Group 0 is the complete + * match. The value of this parameter must be + * less than or equal to the number of capture groups in + * the pattern. + * @param dest Mutable UText to receive the matching string data. + * If NULL, a new UText will be created (which may not be mutable). + * @param status A reference to a UErrorCode to receive any errors. + * @return The matching string data. If a pre-allocated UText was provided, + * it will always be used and returned. 
+ * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL UText * U_EXPORT2 +uregex_groupUText(URegularExpression *regexp, + int32_t groupNum, + UText *dest, + UErrorCode *status); + /** * Returns the index in the input string of the start of the text matched by the @@ -676,6 +794,32 @@ uregex_replaceAll(URegularExpression *regexp, int32_t destCapacity, UErrorCode *status); +/** + * Replaces every substring of the input that matches the pattern + * with the given replacement string. This is a convenience function that + * provides a complete find-and-replace-all operation. + * + * This method scans the input string looking for matches of the pattern. + * Input that is not part of any match is copied unchanged to the + * destination buffer. Matched regions are replaced in the output + * buffer by the replacement string. The replacement string may contain + * references to capture groups; these take the form of $1, $2, etc. + * + * @param regexp The compiled regular expression. + * @param replacement A string containing the replacement text. + * @param dest A mutable UText that will receive the result. + * If NULL, a new UText will be created (which may not be mutable). + * @param status A reference to a UErrorCode to receive any errors. + * @return A UText containing the results of the find and replace. + * If a pre-allocated UText was provided, it will always be used and returned. + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL UText * U_EXPORT2 +uregex_replaceAllUText(URegularExpression *regexp, + UText *replacement, + UText *dest, + UErrorCode *status); /** * Replaces the first substring of the input that matches the pattern @@ -709,6 +853,33 @@ uregex_replaceFirst(URegularExpression *regexp, int32_t destCapacity, UErrorCode *status); +/** + * Replaces the first substring of the input that matches the pattern + * with the given replacement string. This is a convenience function that + * provides a complete find-and-replace operation. 
+ * + * This method scans the input string looking for a match of the pattern. + * All input that is not part of the match is copied unchanged to the + * destination buffer. The matched region is replaced in the output + * buffer by the replacement string. The replacement string may contain + * references to capture groups; these take the form of $1, $2, etc. + * + * @param regexp The compiled regular expression. + * @param replacement A string containing the replacement text. + * @param dest A mutable UText that will receive the result. + * If NULL, a new UText will be created (which may not be mutable). + * @param status A reference to a UErrorCode to receive any errors. + * @return A UText containing the results of the find and replace. + * If a pre-allocated UText was provided, it will always be used and returned. + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL UText * U_EXPORT2 +uregex_replaceFirstUText(URegularExpression *regexp, + UText *replacement, + UText *dest, + UErrorCode *status); + /** * Implements a replace operation intended to be used as part of an @@ -758,11 +929,40 @@ uregex_replaceFirst(URegularExpression *regexp, */ U_STABLE int32_t U_EXPORT2 uregex_appendReplacement(URegularExpression *regexp, - const UChar *replacementText, - int32_t replacementLength, - UChar **destBuf, - int32_t *destCapacity, - UErrorCode *status); + const UChar *replacementText, + int32_t replacementLength, + UChar **destBuf, + int32_t *destCapacity, + UErrorCode *status); + + +/** + * Implements a replace operation intended to be used as part of an + * incremental find-and-replace. + * + *

The input string, starting from the end of the previous match and ending at + * the start of the current match, is appended to the destination string. Then the + * replacement string is appended to the output string, + * including handling any substitutions of captured text.

+ * + *

For simple, prepackaged, non-incremental find-and-replace + * operations, see uregex_replaceFirstUText() or uregex_replaceAllUText().

+ * + * @param regexp The regular expression object. + * @param replacementText The string that will replace the matched portion of the + * input string as it is copied to the destination buffer. + * The replacement text may contain references ($1, for + * example) to capture groups from the match. + * @param dest A mutable UText that will receive the result. Must not be NULL. + * @param status A reference to a UErrorCode to receive any errors. + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL void U_EXPORT2 +uregex_appendReplacementUText(URegularExpression *regexp, + UText *replacementText, + UText *dest, + UErrorCode *status); /** @@ -794,7 +994,27 @@ uregex_appendTail(URegularExpression *regexp, UChar **destBuf, int32_t *destCapacity, UErrorCode *status); + +/** + * As the final step in a find-and-replace operation, append the remainder + * of the input string, starting at the position following the last match, + * to the destination string. uregex_appendTailUText() is intended + * to be invoked after one or more invocations of the + * uregex_appendReplacementUText() function. + * + * @param regexp The regular expression object. This is needed to + * obtain the input string and with the position + * of the last match within it. + * @param dest A mutable UText that will receive the result. Must not be NULL. + * @param status A reference to a UErrorCode to receive any errors. + * @return The destination UText. + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL UText * U_EXPORT2 +uregex_appendTailUText(URegularExpression *regexp, + UText *dest); @@ -808,6 +1028,22 @@ uregex_appendTail(URegularExpression *regexp, * buffer, and NUL terminated. The position of each field within * the destination buffer is returned in the destFields array. * + * Note: another choice for the design of this function would be to not + * copy the resulting fields at all, but to return indexes and + * lengths within the source text. 
+ * Advantages would be + * o Faster. No Copying. + * o Nothing extra needed when field data may contain embedded NUL chars. + * o Less memory needed if working on large data. + * Disadvantages + * o Less consistent with C++ split, which copies into an + * array of UnicodeStrings. + * o No NUL termination, extracted fields would be less convenient + * to use in most cases. + * o Possible problems in the future, when support for Unicode Normalization + * could cause the fields to not correspond exactly to + * a range of the source text. + * * @param regexp The compiled regular expression. * @param destBuf A (UChar *) buffer to receive the fields that * are extracted from the input string. These @@ -846,6 +1082,39 @@ uregex_split( URegularExpression *regexp, UErrorCode *status); + /** + * Split a string into fields. Somewhat like split() from Perl. + * The pattern matches identify delimiters that separate the input + * into fields. The input data between the matches becomes the + * fields themselves. + *

+ * The behavior of this function is not very closely aligned with uregex_split(); + * instead, it is based on (and implemented directly on top of) the C++ split method. + * + * @param regexp The compiled regular expression. + * @param destFields An array of mutable UText structs to receive the results of the split. + * If a field is NULL, a new UText is allocated to contain the results for + * that field. This new UText is not guaranteed to be mutable. + * @param destFieldsCapacity The number of elements in the destination array. + * If the number of fields found is less than destFieldsCapacity, the + * extra strings in the destination array are not altered. + * If the number of destination strings is less than the number + * of fields, the trailing part of the input string, including any + * field delimiters, is placed in the last destination string. + * This behavior mimics that of Perl. It is not an error condition, and no + * error status is returned when all destField positions are used. + * @param status A reference to a UErrorCode to receive any errors. + * @return The number of fields into which the input string was split. + * + * @internal ICU 4.4 technology preview + */ +U_INTERNAL int32_t U_EXPORT2 +uregex_splitUText(URegularExpression *regexp, + UText *destFields[], + int32_t destFieldsCapacity, + UErrorCode *status); + + /** diff --git a/icu4c/source/i18n/uregex.cpp b/icu4c/source/i18n/uregex.cpp index 048dba29ac1..09966c8e951 100644 --- a/icu4c/source/i18n/uregex.cpp +++ b/icu4c/source/i18n/uregex.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 2004-2009, International Business Machines +* Copyright (C) 2004-2010, International Business Machines * Corporation and others. All Rights Reserved. 
******************************************************************************* * file name: regex.cpp @@ -20,8 +20,14 @@ #include "uassert.h" #include "cmemory.h" +#include "regextxt.h" + +#include + U_NAMESPACE_BEGIN +#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) + struct RegularExpression: public UMemory { public: RegularExpression(); @@ -35,9 +41,7 @@ public: const UChar *fText; // Text from setText() int32_t fTextLength; // Length provided by user with setText(), which // may be -1. - - UnicodeString fTextString; // The setText(text) is wrapped into a UnicodeString. - // TODO: regexp engine should not depend on UnicodeString. + UBool fOwnsText; }; static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII @@ -51,6 +55,7 @@ RegularExpression::RegularExpression() { fMatcher = NULL; fText = NULL; fTextLength = 0; + fOwnsText = FALSE; } RegularExpression::~RegularExpression() { @@ -61,6 +66,9 @@ RegularExpression::~RegularExpression() { uprv_free(fPatString); uprv_free(fPatRefCount); } + if (fOwnsText && fText!=NULL) { + uprv_free((void *)fText); + } fMagic = 0; } @@ -81,7 +89,8 @@ static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool r *status = U_ILLEGAL_ARGUMENT_ERROR; return FALSE; } - if (requiresText && re->fText == NULL) { + // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway + if (requiresText && re->fText == NULL && !re->fOwnsText) { *status = U_REGEX_INVALID_STATE; return FALSE; } @@ -127,23 +136,27 @@ uregex_open( const UChar *pattern, // // Make a copy of the pattern string, so we can return it later if asked. - // For compiling the pattern, we will use a read-only-aliased UnicodeString - // of this local copy, to avoid making even more copies. + // For compiling the pattern, we will use a UText wrapper around + // this local copy, to avoid making even more copies. 
// re->fPatString = patBuf; re->fPatStringLen = patternLength; u_memcpy(patBuf, pattern, actualPatLen); patBuf[actualPatLen] = 0; - UnicodeString patString(patternLength==-1, patBuf, patternLength); + + UText patText = UTEXT_INITIALIZER; + utext_openUChars(&patText, patBuf, patternLength, status); // // Compile the pattern // if (pe != NULL) { - re->fPat = RegexPattern::compile(patString, flags, *pe, *status); + re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); } else { - re->fPat = RegexPattern::compile(patString, flags, *status); + re->fPat = RegexPattern::compile(&patText, flags, *status); } + utext_close(&patText); + if (U_FAILURE(*status)) { goto ErrorExit; } @@ -162,6 +175,89 @@ ErrorExit: } +//---------------------------------------------------------------------------------------- +// +// uregex_openUText +// +//---------------------------------------------------------------------------------------- +U_CAPI URegularExpression * U_EXPORT2 +uregex_openUText(UText *pattern, + uint32_t flags, + UParseError *pe, + UErrorCode *status) { + + if (U_FAILURE(*status)) { + return NULL; + } + if (pattern == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + + int64_t patternNativeLength = utext_nativeLength(pattern); + + if (patternNativeLength == 0) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + + RegularExpression *re = new RegularExpression; + + UErrorCode lengthStatus = U_ZERO_ERROR; + int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus); + + int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); + UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1)); + if (re == NULL || refC == NULL || patBuf == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + delete re; + uprv_free(refC); + uprv_free(patBuf); + return NULL; + } + re->fPatRefCount = refC; + *re->fPatRefCount = 1; + + // + // Make a copy of the pattern string, so we can return it later if asked. 
+ // For compiling the pattern, we will use a read-only UText wrapper + // around this local copy, to avoid making even more copies. + // + re->fPatString = patBuf; + re->fPatStringLen = pattern16Length; + utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); + + UText patText = UTEXT_INITIALIZER; + utext_openUChars(&patText, patBuf, pattern16Length, status); + + // + // Compile the pattern + // + if (pe != NULL) { + re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); + } else { + re->fPat = RegexPattern::compile(&patText, flags, *status); + } + utext_close(&patText); + + if (U_FAILURE(*status)) { + goto ErrorExit; + } + + // + // Create the matcher object + // + re->fMatcher = re->fPat->matcher(*status); + if (U_SUCCESS(*status)) { + return (URegularExpression*)re; + } + +ErrorExit: + delete re; + return NULL; + +} + //---------------------------------------------------------------------------------------- // // uregex_close @@ -222,8 +318,8 @@ uregex_clone(const URegularExpression *source2, UErrorCode *status) { //------------------------------------------------------------------------------ U_CAPI const UChar * U_EXPORT2 uregex_pattern(const URegularExpression *regexp2, - int32_t *patLength, - UErrorCode *status) { + int32_t *patLength, + UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status, FALSE) == FALSE) { @@ -236,6 +332,20 @@ uregex_pattern(const URegularExpression *regexp2, } +//------------------------------------------------------------------------------ +// +// uregex_patternUText +// +//------------------------------------------------------------------------------ +U_CAPI UText * U_EXPORT2 +uregex_patternUText(const URegularExpression *regexp2, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + (void)status; + return regexp->fPat->patternText(); +} + + 
//------------------------------------------------------------------------------ // // uregex_flags @@ -270,12 +380,48 @@ uregex_setText(URegularExpression *regexp2, *status = U_ILLEGAL_ARGUMENT_ERROR; return; } + + if (regexp->fOwnsText && regexp->fText != NULL) { + uprv_free((void *)regexp->fText); + } + regexp->fText = text; regexp->fTextLength = textLength; - UBool isTerminated = (textLength == -1); + regexp->fOwnsText = FALSE; + + UText input = UTEXT_INITIALIZER; + utext_openUChars(&input, text, textLength, status); + regexp->fMatcher->reset(&input); + utext_close(&input); // reset() made a shallow clone, so we don't need this copy +} - regexp->fTextString.setTo(isTerminated, text, textLength); - regexp->fMatcher->reset(regexp->fTextString); + +//------------------------------------------------------------------------------ +// +// uregex_setUText +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 +uregex_setUText(URegularExpression *regexp2, + UText *text, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, status, FALSE) == FALSE) { + return; + } + if (text == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if (regexp->fOwnsText && regexp->fText != NULL) { + uprv_free((void *)regexp->fText); + } + + regexp->fText = NULL; // only fill it in on request + regexp->fTextLength = -1; + regexp->fOwnsText = TRUE; + regexp->fMatcher->reset(text); } @@ -293,6 +439,26 @@ uregex_getText(URegularExpression *regexp2, if (validateRE(regexp, status, FALSE) == FALSE) { return NULL; } + + if (regexp->fText == NULL) { + // need to fill in the text + UText *inputText = regexp->fMatcher->inputText(); + int64_t inputNativeLength = utext_nativeLength(inputText); + if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { + regexp->fText = inputText->chunkContents; + regexp->fTextLength = inputNativeLength; + regexp->fOwnsText = FALSE; // because the 
UText owns it + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1)); + + utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); + regexp->fText = inputChars; + regexp->fOwnsText = TRUE; // should already be set but just in case + } + } + if (textLength != NULL) { *textLength = regexp->fTextLength; } @@ -300,6 +466,23 @@ uregex_getText(URegularExpression *regexp2, } +//------------------------------------------------------------------------------ +// +// uregex_getUText +// +//------------------------------------------------------------------------------ +U_CAPI UText * U_EXPORT2 +uregex_getUText(URegularExpression *regexp2, + UText *dest, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, status, FALSE) == FALSE) { + return dest; + } + return regexp->fMatcher->getInput(dest); +} + + //------------------------------------------------------------------------------ // // uregex_matches @@ -423,37 +606,91 @@ uregex_group(URegularExpression *regexp2, *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } + + if (destCapacity == 0 || regexp->fText != NULL) { + // If preflighting or if we already have the text as UChars, + // this is a little cheaper than going through uregex_groupUText() + + // + // Pick up the range of characters from the matcher + // + int32_t startIx = regexp->fMatcher->start(groupNum, *status); + int32_t endIx = regexp->fMatcher->end (groupNum, *status); + if (U_FAILURE(*status)) { + return 0; + } - // - // Pick up the range of characters from the matcher - // - int32_t startIx = regexp->fMatcher->start(groupNum, *status); - int32_t endIx = regexp->fMatcher->end (groupNum, *status); - if (U_FAILURE(*status)) { - return 0; - } - - // - // Trim length based on buffer 
capacity - // - int32_t fullLength = endIx - startIx; - int32_t copyLength = fullLength; - if (copyLength < destCapacity) { - dest[copyLength] = 0; - } else if (copyLength == destCapacity) { - *status = U_STRING_NOT_TERMINATED_WARNING; + // + // Trim length based on buffer capacity + // + int32_t fullLength = endIx - startIx; + int32_t copyLength = fullLength; + if (copyLength < destCapacity) { + dest[copyLength] = 0; + } else if (copyLength == destCapacity) { + *status = U_STRING_NOT_TERMINATED_WARNING; + } else { + copyLength = destCapacity; + *status = U_BUFFER_OVERFLOW_ERROR; + } + + // + // Copy capture group to user's buffer + // + if (copyLength > 0) { + u_memcpy(dest, &regexp->fText[startIx], copyLength); + } + return fullLength; } else { - copyLength = destCapacity; - *status = U_BUFFER_OVERFLOW_ERROR; + UText *groupText = uregex_groupUText(regexp2, groupNum, NULL, status); + int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status); + utext_close(groupText); + return result; + } +} + + +//------------------------------------------------------------------------------ +// +// uregex_groupUText +// +//------------------------------------------------------------------------------ +U_CAPI UText * U_EXPORT2 +uregex_groupUText(URegularExpression *regexp2, + int32_t groupNum, + UText *dest, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, status) == FALSE) { + UErrorCode emptyTextStatus = U_ZERO_ERROR; + return (dest ? 
dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); } - // - // Copy capture group to user's buffer - // - if (copyLength > 0) { - u_memcpy(dest, &regexp->fText[startIx], copyLength); + if (regexp->fText != NULL) { + // + // Pick up the range of characters from the matcher + // and use our already-extracted characters + // + int32_t startIx = regexp->fMatcher->start(groupNum, *status); + int32_t endIx = regexp->fMatcher->end (groupNum, *status); + if (U_FAILURE(*status)) { + UErrorCode emptyTextStatus = U_ZERO_ERROR; + return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); + } + + if (dest) { + utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status); + } else { + UText groupText = UTEXT_INITIALIZER; + utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status); + dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); + utext_close(&groupText); + } + + return dest; + } else { + return regexp->fMatcher->group(groupNum, dest, *status); } - return fullLength; } @@ -582,8 +819,8 @@ uregex_hasTransparentBounds(const URegularExpression *regexp2, //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_useTransparentBounds(URegularExpression *regexp2, - UBool b, - UErrorCode *status) { + UBool b, + UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return; @@ -599,7 +836,7 @@ uregex_useTransparentBounds(URegularExpression *regexp2, //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_hasAnchoringBounds(const URegularExpression *regexp2, - UErrorCode *status) { + UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return FALSE; @@ -615,8 +852,8 @@ uregex_hasAnchoringBounds(const URegularExpression *regexp2, 
//------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_useAnchoringBounds(URegularExpression *regexp2, - UBool b, - UErrorCode *status) { + UBool b, + UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return; @@ -699,8 +936,8 @@ uregex_getTimeLimit(const URegularExpression *regexp2, //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setStackLimit(URegularExpression *regexp2, - int32_t limit, - UErrorCode *status) { + int32_t limit, + UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status)) { regexp->fMatcher->setStackLimit(limit, *status); @@ -716,7 +953,7 @@ uregex_setStackLimit(URegularExpression *regexp2, //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_getStackLimit(const URegularExpression *regexp2, - UErrorCode *status) { + UErrorCode *status) { int32_t retVal = 0; RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status)) { @@ -738,7 +975,7 @@ uregex_setMatchCallback(URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status)) { - regexp->fMatcher->setMatchCallback(callback, context, *status); + regexp->fMatcher->setMatchCallback(callback, context, *status); } } @@ -810,6 +1047,30 @@ uregex_replaceAll(URegularExpression *regexp2, } +//------------------------------------------------------------------------------ +// +// uregex_replaceAllUText +// +//------------------------------------------------------------------------------ +U_CAPI UText * U_EXPORT2 +uregex_replaceAllUText(URegularExpression *regexp2, + UText *replacementText, + UText *dest, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, 
status) == FALSE) { + return 0; + } + if (replacementText == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); + return dest; +} + + //------------------------------------------------------------------------------ // // uregex_replaceFirst @@ -847,6 +1108,30 @@ uregex_replaceFirst(URegularExpression *regexp2, } +//------------------------------------------------------------------------------ +// +// uregex_replaceFirstUText +// +//------------------------------------------------------------------------------ +U_CAPI UText * U_EXPORT2 +uregex_replaceFirstUText(URegularExpression *regexp2, + UText *replacementText, + UText *dest, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, status) == FALSE) { + return 0; + } + if (replacementText == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); + return dest; +} + + //------------------------------------------------------------------------------ // // uregex_appendReplacement @@ -868,28 +1153,23 @@ class RegexCImpl { UErrorCode *status); inline static int32_t appendTail(RegularExpression *regexp, - UChar **destBuf, - int32_t *destCapacity, - UErrorCode *status); + UChar **destBuf, + int32_t *destCapacity, + UErrorCode *status); + + inline static int32_t split(RegularExpression *regexp, + UChar *destBuf, + int32_t destCapacity, + int32_t *requiredCapacity, + UChar *destFields[], + int32_t destFieldsCapacity, + UErrorCode *status); }; U_NAMESPACE_END -// -// Call-back function for u_unescapeAt(), used when we encounter -// \uxxxx or \Uxxxxxxxxx escapes in the replacement text. 
-// -U_CDECL_BEGIN -static UChar U_CALLCONV -unescape_charAt(int32_t offset, void *context) { - UChar c16 = ((UChar *)context)[offset]; - return c16; -} -U_CDECL_END - - static const UChar BACKSLASH = 0x5c; static const UChar DOLLARSIGN = 0x24; @@ -910,11 +1190,11 @@ static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCap // appendReplacement, the actual implementation. // int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, - const UChar *replacementText, - int32_t replacementLength, - UChar **destBuf, - int32_t *destCapacity, - UErrorCode *status) { + const UChar *replacementText, + int32_t replacementLength, + UChar **destBuf, + int32_t *destCapacity, + UErrorCode *status) { // If we come in with a buffer overflow error, don't suppress the operation. // A series of appendReplacements, appendTail need to correctly preflight @@ -958,10 +1238,27 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, } // Copy input string from the end of previous match to start of current match - for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) { - appendToBuf(regexp->fText[i], &destIdx, dest, capacity); + if (regexp->fText != NULL) { + int32_t matchStart; + int32_t lastMatchEnd; + if (UTEXT_USES_U16(m->fInputText)) { + lastMatchEnd = m->fLastMatchEnd; + matchStart = m->fMatchStart; + } else { + // !!!: Would like a better way to do this! 
+ UErrorCode status = U_ZERO_ERROR; + lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status); + status = U_ZERO_ERROR; + matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status); + } + for (i=lastMatchEnd; i<matchStart; i++) { + appendToBuf(regexp->fText[i], &destIdx, dest, capacity); + } + } else { + UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore + destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, + &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError); } - // scan the replacement text, looking for substitutions ($n) and \escapes. @@ -990,7 +1287,7 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, if (c==0x55/*U*/ || c==0x75/*u*/) { // We have a \udddd or \Udddddddd escape sequence. UChar32 escapedChar = - u_unescapeAt(unescape_charAt, + u_unescapeAt(uregex_ucstr_unescape_charAt, &replIdx, // Index is updated by unescapeAt replacementLength, // Length of replacement text (void *)replacementText); @@ -1050,11 +1347,7 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, } // Finally, append the capture group data to the destination. - int32_t capacityRemaining = capacity - destIdx; - if (capacityRemaining < 0) { - capacityRemaining = 0; - } - destIdx += uregex_group((URegularExpression*)regexp, groupNum, dest+destIdx, capacityRemaining, status); + destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); if (*status == U_BUFFER_OVERFLOW_ERROR) { // Ignore buffer overflow when extracting the group. We need to // continue on to get full size of the untruncated result. 
We will @@ -1105,20 +1398,33 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, } // -// appendReplacement the acutal API function, +// appendReplacement the actual API function, // U_CAPI int32_t U_EXPORT2 uregex_appendReplacement(URegularExpression *regexp2, - const UChar *replacementText, - int32_t replacementLength, - UChar **destBuf, - int32_t *destCapacity, - UErrorCode *status) { + const UChar *replacementText, + int32_t replacementLength, + UChar **destBuf, + int32_t *destCapacity, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; return RegexCImpl::appendReplacement( regexp, replacementText, replacementLength,destBuf, destCapacity, status); } +// +// uregex_appendReplacementUText...can just use the normal C++ method +// +U_CAPI void U_EXPORT2 +uregex_appendReplacementUText(URegularExpression *regexp2, + UText *replText, + UText *dest, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + regexp->fMatcher->appendReplacement(dest, replText, *status); +} + //------------------------------------------------------------------------------ // @@ -1126,9 +1432,9 @@ uregex_appendReplacement(URegularExpression *regexp2, // //------------------------------------------------------------------------------ int32_t RegexCImpl::appendTail(RegularExpression *regexp, - UChar **destBuf, - int32_t *destCapacity, - UErrorCode *status) + UChar **destBuf, + int32_t *destCapacity, + UErrorCode *status) { // If we come in with a buffer overflow error, don't suppress the operation. @@ -1154,46 +1460,62 @@ int32_t RegexCImpl::appendTail(RegularExpression *regexp, RegexMatcher *m = regexp->fMatcher; - int32_t srcIdx; - if (m->fMatch) { - // The most recent call to find() succeeded. - srcIdx = m->fMatchEnd; - } else { - // The last call to find() on this matcher failed(). - // Look back to the end of the last find() that succeeded for src index. 
- srcIdx = m->fLastMatchEnd; - if (srcIdx == -1) { - // There has been no successful match with this matcher. - // We want to copy the whole string. - srcIdx = 0; - } - } - int32_t destIdx = 0; int32_t destCap = *destCapacity; UChar *dest = *destBuf; - - for (;;) { - if (srcIdx == regexp->fTextLength) { - break; - } - UChar c = regexp->fText[srcIdx]; - if (c == 0 && regexp->fTextLength == -1) { - break; - } - if (destIdx < destCap) { - dest[destIdx] = c; + + if (regexp->fText != NULL) { + int32_t srcIdx; + int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); + if (nativeIdx == -1) { + srcIdx = 0; + } else if (UTEXT_USES_U16(m->fInputText)) { + srcIdx = nativeIdx; } else { - // We've overflowed the dest buffer. - // If the total input string length is known, we can - // compute the total buffer size needed without scanning through the string. - if (regexp->fTextLength > 0) { - destIdx += (regexp->fTextLength - srcIdx); + UErrorCode status = U_ZERO_ERROR; + srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status); + } + + for (;;) { + if (srcIdx == regexp->fTextLength) { break; } + UChar c = regexp->fText[srcIdx]; + if (c == 0 && regexp->fTextLength == -1) { + regexp->fTextLength = srcIdx; + break; + } + if (destIdx < destCap) { + dest[destIdx] = c; + } else { + // We've overflowed the dest buffer. + // If the total input string length is known, we can + // compute the total buffer size needed without scanning through the string. + if (regexp->fTextLength > 0) { + destIdx += (regexp->fTextLength - srcIdx); + break; + } + } + srcIdx++; + destIdx++; + } + } else { + int64_t srcIdx; + if (m->fMatch) { + // The most recent call to find() succeeded. + srcIdx = m->fMatchEnd; + } else { + // The last call to find() on this matcher failed(). + // Look back to the end of the last find() that succeeded for src index. + srcIdx = m->fLastMatchEnd; + if (srcIdx == -1) { + // There has been no successful match with this matcher. 
+ // We want to copy the whole string. + srcIdx = 0; + } } - srcIdx++; - destIdx++; + + destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status); } // @@ -1228,6 +1550,9 @@ int32_t RegexCImpl::appendTail(RegularExpression *regexp, } +// +// appendTail the actual API function +// U_CAPI int32_t U_EXPORT2 uregex_appendTail(URegularExpression *regexp2, UChar **destBuf, @@ -1238,6 +1563,17 @@ uregex_appendTail(URegularExpression *regexp2, } +// +// uregex_appendTailUText...can just use the normal C++ method +// +U_CAPI UText * U_EXPORT2 +uregex_appendTailUText(URegularExpression *regexp2, + UText *dest) { + RegularExpression *regexp = (RegularExpression*)regexp2; + return regexp->fMatcher->appendTail(dest); +} + + //------------------------------------------------------------------------------ // // copyString Internal utility to copy a string to an output buffer, @@ -1280,75 +1616,67 @@ static void copyString(UChar *destBuffer, // Destination buffer. // uregex_split // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 -uregex_split( URegularExpression *regexp2, - UChar *destBuf, - int32_t destCapacity, - int32_t *requiredCapacity, - UChar *destFields[], - int32_t destFieldsCapacity, - UErrorCode *status) { - RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { - return 0; - } - if (destBuf == NULL && destCapacity > 0 || - destCapacity < 0 || - destFields == NULL || - destFieldsCapacity < 1 ) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - +int32_t RegexCImpl::split(RegularExpression *regexp, + UChar *destBuf, + int32_t destCapacity, + int32_t *requiredCapacity, + UChar *destFields[], + int32_t destFieldsCapacity, + UErrorCode *status) { // // Reset for the input text // regexp->fMatcher->reset(); - int32_t inputLen = regexp->fTextString.length(); - int32_t nextOutputStringStart = 0; + UText *inputText = 
regexp->fMatcher->fInputText; + int64_t nextOutputStringStart = 0; + int64_t inputLen = regexp->fMatcher->fInputLength; if (inputLen == 0) { return 0; } - // // Loop through the input text, searching for the delimiter pattern // int32_t i; // Index of the field being processed. int32_t destIdx = 0; // Next available position in destBuf; int32_t numCaptureGroups = regexp->fMatcher->groupCount(); + UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted for (i=0; ; i++) { if (i>=destFieldsCapacity-1) { - // There are one or zero output string left. + // There are one or zero output strings left. // Fill the last output string with whatever is left from the input, then exit the loop. // ( i will be == destFieldsCapacity if we filled the output array while processing // capture groups of the delimiter expression, in which case we will discard the // last capture group saved in favor of the unprocessed remainder of the // input string.) - int32_t remainingLength = inputLen-nextOutputStringStart; - if (remainingLength > 0) { + if (inputLen > nextOutputStringStart) { + if (i != destFieldsCapacity-1) { + // No fields are left. Recycle the last one for holding the trailing part of + // the input string. + i = destFieldsCapacity-1; + destIdx = (int32_t)(destFields[i] - destFields[0]); + } + + destFields[i] = &destBuf[destIdx]; + destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, + &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); } - if (i >= destFieldsCapacity) { - // No fields are left. Recycle the last one for holding the trailing part of - // the input string. - i = destFieldsCapacity-1; - destIdx = (int32_t)(destFields[i] - destFields[0]); - } - - destFields[i] = &destBuf[destIdx]; - copyString(destBuf, destCapacity, &destIdx, - &regexp->fText[nextOutputStringStart], remainingLength); break; } if (regexp->fMatcher->find()) { // We found another delimiter. 
Move everything from where we started looking // up until the start of the delimiter into the next output string. - int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart; destFields[i] = &destBuf[destIdx]; - copyString(destBuf, destCapacity, &destIdx, - &regexp->fText[nextOutputStringStart], fieldLen); - nextOutputStringStart = regexp->fMatcher->end(*status); + + destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, + &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); + if (tStatus == U_BUFFER_OVERFLOW_ERROR) { + tStatus = U_ZERO_ERROR; + } else { + *status = tStatus; + } + nextOutputStringStart = regexp->fMatcher->fMatchEnd; // If the delimiter pattern has capturing parentheses, the captured // text goes out into the next n destination strings. @@ -1361,16 +1689,16 @@ uregex_split( URegularExpression *regexp2, i++; // Set up to extract the capture group contents into the dest buffer. - UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow - // error while extracting this group. - int32_t remainingCapacity = destCapacity - destIdx; - if (remainingCapacity < 0) { - remainingCapacity = 0; - } destFields[i] = &destBuf[destIdx]; - int32_t t = uregex_group(regexp2, groupNum, destFields[i], remainingCapacity, &tStatus); + tStatus = U_ZERO_ERROR; + int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); destIdx += t + 1; // Record the space used in the output string buffer. // +1 for the NUL that terminates the string. + if (tStatus == U_BUFFER_OVERFLOW_ERROR) { + tStatus = U_ZERO_ERROR; + } else { + *status = tStatus; + } } if (nextOutputStringStart == inputLen) { @@ -1384,8 +1712,8 @@ uregex_split( URegularExpression *regexp2, // We ran off the end of the input while looking for the next delimiter. // All the remaining text goes into the current output string. 
destFields[i] = &destBuf[destIdx]; - copyString(destBuf, destCapacity, &destIdx, - &regexp->fText[nextOutputStringStart], inputLen-nextOutputStringStart); + destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, + &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); break; } } @@ -1405,6 +1733,45 @@ uregex_split( URegularExpression *regexp2, return i+1; } +// +// uregex_split The actual API function +// +U_CAPI int32_t U_EXPORT2 +uregex_split(URegularExpression *regexp2, + UChar *destBuf, + int32_t destCapacity, + int32_t *requiredCapacity, + UChar *destFields[], + int32_t destFieldsCapacity, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, status) == FALSE) { + return 0; + } + if (destBuf == NULL && destCapacity > 0 || + destCapacity < 0 || + destFields == NULL || + destFieldsCapacity < 1 ) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); +} + + +// +// uregex_splitUText...can just use the normal C++ method +// +U_CAPI int32_t U_EXPORT2 +uregex_splitUText(URegularExpression *regexp2, + UText *destFields[], + int32_t destFieldsCapacity, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); +} + #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS diff --git a/icu4c/source/test/cintltst/reapits.c b/icu4c/source/test/cintltst/reapits.c index 89bc44c0987..2bba4ff2004 100644 --- a/icu4c/source/test/cintltst/reapits.c +++ b/icu4c/source/test/cintltst/reapits.c @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2004-2009, International Business Machines Corporation and + * Copyright (c) 2004-2010, International Business Machines Corporation and * others. 
All Rights Reserved. ********************************************************************/ /******************************************************************************** @@ -26,6 +26,7 @@ #include "unicode/uloc.h" #include "unicode/uregex.h" #include "unicode/ustring.h" +#include "unicode/utext.h" #include "cintltst.h" #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \ @@ -86,11 +87,34 @@ static void test_assert_string(const char *expected, const UChar *actual, UBool #define TEST_ASSERT_STRING(expected, actual, nulTerm) test_assert_string(expected, actual, nulTerm, __FILE__, __LINE__) +static void test_assert_utext(const char *expected, UText *actual, const char *file, int line) { + UErrorCode status = U_ZERO_ERROR; + UText expectedText = UTEXT_INITIALIZER; + utext_openUTF8(&expectedText, expected, -1, &status); + utext_setNativeIndex(actual, 0); + if (utext_compare(&expectedText, -1, actual, -1) != 0) { + UChar32 c; + log_err("Failure at file %s, line %d, expected \"%s\", got \"", file, line, expected); + c = utext_next32From(actual, 0); + while (c != U_SENTINEL) { + if (0x20", -1, &status); + + re = uregex_openC("x(.*?)x", 0, NULL, &status); + TEST_ASSERT_SUCCESS(status); + + /* Normal case, with match */ + uregex_setText(re, text1, -1, &status); + result = uregex_replaceFirstUText(re, &replText, NULL, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_UTEXT("Replace x1x x...x.", result); + utext_close(result); + + /* No match. Text should copy to output with no changes. 
*/ + uregex_setText(re, text2, -1, &status); + result = uregex_replaceFirstUText(re, &replText, NULL, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_UTEXT("No match here.", result); + utext_close(result); + + /* Unicode escapes */ + uregex_setText(re, text1, -1, &status); + utext_openUTF8(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status); + result = uregex_replaceFirstUText(re, &replText, NULL, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_UTEXT("Replace \\AaaB$a x1x x...x.", result); + utext_close(result); + + uregex_close(re); + utext_close(&replText); + } + + + /* + * replaceAll() + */ + { + UChar text1[80]; + UChar text2[80]; + UText replText = UTEXT_INITIALIZER; + UText *result; + + status = U_ZERO_ERROR; + u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); + u_uastrncpy(text2, "No match here.", sizeof(text2)/2); + utext_openUTF8(&replText, "<$1>", -1, &status); + + re = uregex_openC("x(.*?)x", 0, NULL, &status); + TEST_ASSERT_SUCCESS(status); + + /* Normal case, with match */ + uregex_setText(re, text1, -1, &status); + result = uregex_replaceAllUText(re, &replText, NULL, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_UTEXT("Replace <1> <...>.", result); + utext_close(result); + + /* No match. Text should copy to output with no changes. 
*/ + uregex_setText(re, text2, -1, &status); + result = uregex_replaceAllUText(re, &replText, NULL, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_UTEXT("No match here.", result); + utext_close(result); + + uregex_close(re); + utext_close(&replText); + } + + + /* + * appendReplacement() + */ + { + UChar text[100]; + UChar repl[100]; + UChar buf[100]; + UChar *bufPtr; + int32_t bufCap; + + + status = U_ZERO_ERROR; + re = uregex_openC(".*", 0, 0, &status); + TEST_ASSERT_SUCCESS(status); + + u_uastrncpy(text, "whatever", sizeof(text)/2); + u_uastrncpy(repl, "some other", sizeof(repl)/2); + uregex_setText(re, text, -1, &status); + + /* match covers whole target string */ + uregex_find(re, 0, &status); + TEST_ASSERT_SUCCESS(status); + bufPtr = buf; + bufCap = sizeof(buf) / 2; + uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_STRING("some other", buf, TRUE); + + /* Match has \u \U escapes */ + uregex_find(re, 0, &status); + TEST_ASSERT_SUCCESS(status); + bufPtr = buf; + bufCap = sizeof(buf) / 2; + u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", sizeof(repl)/2); + uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE); + + uregex_close(re); + } + + + /* + * appendReplacement(), appendTail() checked in replaceFirst(), replaceAll(). + */ + + /* + * splitUText() + */ + { + UChar textToSplit[80]; + UChar text2[80]; + UText *fields[10]; + int32_t numFields; + + u_uastrncpy(textToSplit, "first : second: third", sizeof(textToSplit)/2); + u_uastrncpy(text2, "No match here.", sizeof(text2)/2); + + status = U_ZERO_ERROR; + re = uregex_openC(":", 0, NULL, &status); + + + /* Simple split */ + + uregex_setText(re, textToSplit, -1, &status); + TEST_ASSERT_SUCCESS(status); + + /* The TEST_ASSERT_SUCCESS call above should change too... 
*/ + if (U_SUCCESS(status)) { + memset(fields, 0, sizeof(fields)); + numFields = uregex_splitUText(re, fields, 10, &status); + TEST_ASSERT_SUCCESS(status); + + /* The TEST_ASSERT_SUCCESS call above should change too... */ + if(U_SUCCESS(status)) { + TEST_ASSERT(numFields == 3); + TEST_ASSERT_UTEXT("first ", fields[0]); + TEST_ASSERT_UTEXT(" second", fields[1]); + TEST_ASSERT_UTEXT(" third", fields[2]); + TEST_ASSERT(fields[3] == NULL); + } + } + + uregex_close(re); + + + /* Split with too few output strings available */ + status = U_ZERO_ERROR; + re = uregex_openC(":", 0, NULL, &status); + uregex_setText(re, textToSplit, -1, &status); + TEST_ASSERT_SUCCESS(status); + + /* The TEST_ASSERT_SUCCESS call above should change too... */ + if(U_SUCCESS(status)) { + fields[0] = NULL; + fields[1] = NULL; + fields[2] = &patternText; + numFields = uregex_splitUText(re, fields, 2, &status); + TEST_ASSERT_SUCCESS(status); + + /* The TEST_ASSERT_SUCCESS call above should change too... */ + if(U_SUCCESS(status)) { + TEST_ASSERT(numFields == 2); + TEST_ASSERT_UTEXT("first ", fields[0]); + TEST_ASSERT_UTEXT(" second: third", fields[1]); + TEST_ASSERT(fields[2] == &patternText); + } + } + + uregex_close(re); + } + + /* splitUText(), part 2. Patterns with capture groups. The capture group text + * comes out as additional fields. */ + { + UChar textToSplit[80]; + UText *fields[10]; + int32_t numFields; + + u_uastrncpy(textToSplit, "first second third", sizeof(textToSplit)/2); + + status = U_ZERO_ERROR; + re = uregex_openC("<(.*?)>", 0, NULL, &status); + + uregex_setText(re, textToSplit, -1, &status); + TEST_ASSERT_SUCCESS(status); + + /* The TEST_ASSERT_SUCCESS call above should change too... */ + if(U_SUCCESS(status)) { + memset(fields, 0, sizeof(fields)); + numFields = uregex_splitUText(re, fields, 10, &status); + TEST_ASSERT_SUCCESS(status); + + /* The TEST_ASSERT_SUCCESS call above should change too... 
*/ + if(U_SUCCESS(status)) { + TEST_ASSERT(numFields == 5); + TEST_ASSERT_UTEXT("first ", fields[0]); + TEST_ASSERT_UTEXT("tag-a", fields[1]); + TEST_ASSERT_UTEXT(" second", fields[2]); + TEST_ASSERT_UTEXT("tag-b", fields[3]); + TEST_ASSERT_UTEXT(" third", fields[4]); + TEST_ASSERT(fields[5] == NULL); + } + } + + /* Split with too few output strings available (2) */ + status = U_ZERO_ERROR; + fields[0] = NULL; + fields[1] = NULL; + fields[2] = &patternText; + numFields = uregex_splitUText(re, fields, 2, &status); + TEST_ASSERT_SUCCESS(status); + + /* The TEST_ASSERT_SUCCESS call above should change too... */ + if(U_SUCCESS(status)) { + TEST_ASSERT(numFields == 2); + TEST_ASSERT_UTEXT("first ", fields[0]); + TEST_ASSERT_UTEXT(" second third", fields[1]); + TEST_ASSERT(fields[2] == &patternText); + } + + /* Split with too few output strings available (3) */ + status = U_ZERO_ERROR; + fields[0] = NULL; + fields[1] = NULL; + fields[2] = NULL; + fields[3] = &patternText; + numFields = uregex_splitUText(re, fields, 3, &status); + TEST_ASSERT_SUCCESS(status); + + /* The TEST_ASSERT_SUCCESS call above should change too... */ + if(U_SUCCESS(status)) { + TEST_ASSERT(numFields == 3); + TEST_ASSERT_UTEXT("first ", fields[0]); + TEST_ASSERT_UTEXT("tag-a", fields[1]); + TEST_ASSERT_UTEXT(" second third", fields[2]); + TEST_ASSERT(fields[3] == &patternText); + } + + /* Split with just enough output strings available (5) */ + status = U_ZERO_ERROR; + fields[0] = NULL; + fields[1] = NULL; + fields[2] = NULL; + fields[3] = NULL; + fields[4] = NULL; + fields[5] = &patternText; + numFields = uregex_splitUText(re, fields, 5, &status); + TEST_ASSERT_SUCCESS(status); + + /* The TEST_ASSERT_SUCCESS call above should change too... 
*/ + if(U_SUCCESS(status)) { + TEST_ASSERT(numFields == 5); + TEST_ASSERT_UTEXT("first ", fields[0]); + TEST_ASSERT_UTEXT("tag-a", fields[1]); + TEST_ASSERT_UTEXT(" second", fields[2]); + TEST_ASSERT_UTEXT("tag-b", fields[3]); + TEST_ASSERT_UTEXT(" third", fields[4]); + TEST_ASSERT(fields[5] == &patternText); + } + + /* Split, end of text is a field delimiter. */ + status = U_ZERO_ERROR; + uregex_setText(re, textToSplit, strlen("first second"), &status); + TEST_ASSERT_SUCCESS(status); + + /* The TEST_ASSERT_SUCCESS call above should change too... */ + if(U_SUCCESS(status)) { + memset(fields, 0, sizeof(fields)); + fields[9] = &patternText; + numFields = uregex_splitUText(re, fields, 9, &status); + TEST_ASSERT_SUCCESS(status); + + /* The TEST_ASSERT_SUCCESS call above should change too... */ + if(U_SUCCESS(status)) { + TEST_ASSERT(numFields == 4); + TEST_ASSERT_UTEXT("first ", fields[0]); + TEST_ASSERT_UTEXT("tag-a", fields[1]); + TEST_ASSERT_UTEXT(" second", fields[2]); + TEST_ASSERT_UTEXT("tag-b", fields[3]); + TEST_ASSERT(fields[4] == NULL); + TEST_ASSERT(fields[8] == NULL); + TEST_ASSERT(fields[9] == &patternText); + } + } + + uregex_close(re); + } +} + #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index ee169aef7f6..5c3a1f57cfd 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2002-2009, International Business Machines Corporation and + * Copyright (c) 2002-2010, International Business Machines Corporation and * others. All Rights Reserved. 
********************************************************************/ @@ -16,6 +16,7 @@ #include "unicode/regex.h" #include "unicode/uchar.h" #include "unicode/ucnv.h" +#include "unicode/ustring.h" #include "regextst.h" #include "uvector.h" #include "util.h" @@ -24,6 +25,11 @@ #include +#include + +#define SUPPORT_MUTATING_INPUT_STRING 0 + + //--------------------------------------------------------------------------- // // Test class boilerplate @@ -77,6 +83,24 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch case 8: name = "Bug 6149"; if (exec) Bug6149(); break; + case 9: name = "UTextBasic"; + if (exec) UTextBasic(); + break; + case 10: name = "API_Match_UTF8"; + if (exec) API_Match_UTF8(); + break; + case 11: name = "API_Replace_UTF8"; + if (exec) API_Replace_UTF8(); + break; + case 12: name = "API_Pattern_UTF8"; + if (exec) API_Pattern_UTF8(); + break; + case 13: name = "PerlTestsUTF8"; + if (exec) PerlTestsUTF8(); + break; + case 14: name = "PreAllocatedUTextCAPI"; + if (exec) PreAllocatedUTextCAPI(); + break; default: name = ""; break; //needed to end loop @@ -104,6 +128,32 @@ if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status= #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} +void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) { + UErrorCode status = U_ZERO_ERROR; + UText expectedText = UTEXT_INITIALIZER; + utext_openUTF8(&expectedText, expected, -1, &status); + utext_setNativeIndex(actual, 0); + if (utext_compare(&expectedText, -1, actual, -1) != 0) { + char buf[201 /*21*/]; + char *bufPtr = buf; + UChar32 c = utext_next32From(actual, 0); + while (c != U_SENTINEL && bufPtr < buf+200/*20*/) { + if (0x20matcher(&inputText, status); + if (U_FAILURE(status)) { + errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). 
Status = %s\n", + line, u_errorName(status)); + return FALSE; + } + + UBool actualmatch; + actualmatch = REMatcher->lookingAt(status); + if (U_FAILURE(status)) { + errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n", + line, u_errorName(status)); + retVal = FALSE; + } + if (actualmatch != looking) { + errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line); + retVal = FALSE; + } + + status = U_ZERO_ERROR; + actualmatch = REMatcher->matches(status); + if (U_FAILURE(status)) { + errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n", + line, u_errorName(status)); + retVal = FALSE; + } + if (actualmatch != match) { + errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line); + retVal = FALSE; + } + + if (retVal == FALSE) { + RegexPatternDump(REPattern); + } + + delete REPattern; + delete REMatcher; + ucnv_close(UTF8Converter); + utext_close(&inputText); + utext_close(&pattern); + delete textChars; + return retVal; +} @@ -222,6 +351,26 @@ void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol, } delete callerPattern; + + // + // Compile again, using a UTF-8-based UText + // + UText patternText = UTEXT_INITIALIZER; + utext_openUTF8(&patternText, pat, -1, &status); + callerPattern = RegexPattern::compile(&patternText, 0, pe, status); + if (status != expectedStatus) { + errln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); + } else { + if (status != U_ZERO_ERROR) { + if (pe.line != errLine || pe.offset != errCol) { + errln("Line %d: incorrect line/offset from UParseError. 
Expected %d/%d; got %d/%d.\n", + line, errLine, errCol, pe.line, pe.offset); + } + } + } + + delete callerPattern; + utext_close(&patternText); } @@ -373,8 +522,35 @@ void RegexTest::Basic() { // Escape of special chars in patterns REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE); +} +//--------------------------------------------------------------------------- +// +// UTextBasic Check for quirks that are specific to the UText +// implementation. +// +//--------------------------------------------------------------------------- +void RegexTest::UTextBasic() { + UErrorCode status = U_ZERO_ERROR; + UText pattern = UTEXT_INITIALIZER; + utext_openUTF8(&pattern, "abc", -1, &status); + RegexMatcher matcher(&pattern, 0, status); + REGEX_CHECK_STATUS; + + UText input = UTEXT_INITIALIZER; + utext_openUTF8(&input, "abc", -1, &status); + REGEX_CHECK_STATUS; + matcher.reset(&input); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("abc", matcher.inputText()); + + matcher.reset(matcher.inputText()); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("abc", matcher.inputText()); + + utext_close(&pattern); + utext_close(&input); } @@ -882,7 +1058,7 @@ void RegexTest::API_Match() { // { UErrorCode status = U_ZERO_ERROR; - UnicodeString testString(600000, 0x41, 600000); // Length 600,000, filled with 'A' + UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A' // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations // of the '+', and makes the stack frames larger. @@ -1404,6 +1580,1189 @@ void RegexTest::API_Pattern() { } +//--------------------------------------------------------------------------- +// +// API_Match_UTF8 Test that the alternate engine for class RegexMatcher +// is present and working, but excluding functions +// implementing replace operations. 
+// +//--------------------------------------------------------------------------- +void RegexTest::API_Match_UTF8() { + UParseError pe; + UErrorCode status=U_ZERO_ERROR; + int32_t flags = 0; + + // + // Debug - slide failing test cases early + // +#if 0 + { + } + return; +#endif + + // + // Simple pattern compilation + // + { + UText re = UTEXT_INITIALIZER; + utext_openUTF8(&re, "abc", -1, &status); + RegexPattern *pat2; + pat2 = RegexPattern::compile(&re, flags, pe, status); + REGEX_CHECK_STATUS; + + UText input1 = UTEXT_INITIALIZER; + UText input2 = UTEXT_INITIALIZER; + UText empty = UTEXT_INITIALIZER; + utext_openUTF8(&input1, "abcdef this is a test", -1, &status); + utext_openUTF8(&input2, "not abc", -1, &status); + utext_openUChars(&empty, NULL, 0, &status); + + int32_t input1Len = strlen("abcdef this is a test"); + int32_t input2Len = strlen("not abc"); + + + // + // Matcher creation and reset. + // + RegexMatcher *m1 = pat2->matcher(&input1, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(m1->lookingAt(status) == TRUE); + REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText()); + m1->reset(&input2); + REGEX_ASSERT(m1->lookingAt(status) == FALSE); + REGEX_ASSERT_UTEXT("not abc", m1->inputText()); + m1->reset(&input1); + REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText()); + REGEX_ASSERT(m1->lookingAt(status) == TRUE); + m1->reset(&empty); + REGEX_ASSERT(m1->lookingAt(status) == FALSE); + REGEX_ASSERT(utext_nativeLength(&empty) == 0); + + // + // reset(pos, status) + // + m1->reset(&input1); + m1->reset(4, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText()); + REGEX_ASSERT(m1->lookingAt(status) == TRUE); + + m1->reset(-1, status); + REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); + status = U_ZERO_ERROR; + + m1->reset(0, status); + REGEX_CHECK_STATUS; + status = U_ZERO_ERROR; + + m1->reset(input1Len-1, status); + REGEX_CHECK_STATUS; + status = U_ZERO_ERROR; + + m1->reset(input1Len, status); + 
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); + status = U_ZERO_ERROR; + + // + // match(pos, status) + // + m1->reset(&input2); + REGEX_ASSERT(m1->matches(4, status) == TRUE); + m1->reset(); + REGEX_ASSERT(m1->matches(3, status) == FALSE); + m1->reset(); + REGEX_ASSERT(m1->matches(5, status) == FALSE); + REGEX_ASSERT(m1->matches(4, status) == TRUE); + REGEX_ASSERT(m1->matches(-1, status) == FALSE); + REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); + + // Match() at end of string should fail, but should not + // be an error. + status = U_ZERO_ERROR; + REGEX_ASSERT(m1->matches(input2Len, status) == FALSE); + REGEX_CHECK_STATUS; + + // Match beyond end of string should fail with an error. + status = U_ZERO_ERROR; + REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE); + REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); + + // Successful match at end of string. + { + status = U_ZERO_ERROR; + RegexMatcher m("A?", 0, status); // will match zero length string. + REGEX_CHECK_STATUS; + m.reset(&input1); + REGEX_ASSERT(m.matches(input1Len, status) == TRUE); + REGEX_CHECK_STATUS; + m.reset(&empty); + REGEX_ASSERT(m.matches(0, status) == TRUE); + REGEX_CHECK_STATUS; + } + + + // + // lookingAt(pos, status) + // + status = U_ZERO_ERROR; + m1->reset(&input2); // "not abc" + REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); + REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); + REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); + REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); + REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); + REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); + status = U_ZERO_ERROR; + REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE); + REGEX_CHECK_STATUS; + REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE); + REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); + + delete m1; + delete pat2; + + utext_close(&re); + utext_close(&input1); + utext_close(&input2); + utext_close(&empty); + } + + + // + // Capture Group. 
+ // RegexMatcher::start(); + // RegexMatcher::end(); + // RegexMatcher::groupCount(); + // + { + int32_t flags=0; + UParseError pe; + UErrorCode status=U_ZERO_ERROR; + UText re=UTEXT_INITIALIZER; + utext_openUTF8(&re, "01(23(45)67)(.*)", -1, &status); + + RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); + REGEX_CHECK_STATUS; + + UText input = UTEXT_INITIALIZER; + utext_openUTF8(&input, "0123456789", -1, &status); + + RegexMatcher *matcher = pat->matcher(&input, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + static const int32_t matchStarts[] = {0, 2, 4, 8}; + static const int32_t matchEnds[] = {10, 8, 6, 10}; + int32_t i; + for (i=0; i<4; i++) { + int32_t actualStart = matcher->start(i, status); + REGEX_CHECK_STATUS; + if (actualStart != matchStarts[i]) { + errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", + __LINE__, i, matchStarts[i], actualStart); + } + int32_t actualEnd = matcher->end(i, status); + REGEX_CHECK_STATUS; + if (actualEnd != matchEnds[i]) { + errln("RegexTest failure at line %d index %d. 
Expected %d, got %d\n", + __LINE__, i, matchEnds[i], actualEnd); + } + } + + REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); + REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); + + REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); + REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); + matcher->reset(); + REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); + + matcher->lookingAt(status); + + UnicodeString dest; + UText destText = UTEXT_INITIALIZER; + utext_openUnicodeString(&destText, &dest, &status); + UText *result; + + result = matcher->group((UText *)NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("0123456789", result); + utext_close(result); + result = matcher->group(&destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("0123456789", result); + + result = matcher->group(0, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("0123456789", result); + utext_close(result); + result = matcher->group(0, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("0123456789", result); + + result = matcher->group(1, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("234567", result); + utext_close(result); + result = matcher->group(1, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("234567", result); + + result = matcher->group(2, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("45", result); + utext_close(result); + result = matcher->group(2, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("45", result); + + result = matcher->group(3, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("89", result); + utext_close(result); + result = matcher->group(3, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + 
REGEX_ASSERT_UTEXT("89", result); + + REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); + REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); + matcher->reset(); + REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); + + delete matcher; + delete pat; + + utext_close(&destText); + utext_close(&input); + utext_close(&re); + } + + // + // find + // + { + int32_t flags=0; + UParseError pe; + UErrorCode status=U_ZERO_ERROR; + UText re=UTEXT_INITIALIZER; + utext_openUTF8(&re, "abc", -1, &status); + + RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); + REGEX_CHECK_STATUS; + UText input = UTEXT_INITIALIZER; + utext_openUTF8(&input, ".abc..abc...abc..", -1, &status); + // 012345678901234567 + + RegexMatcher *matcher = pat->matcher(&input, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->find()); + REGEX_ASSERT(matcher->start(status) == 1); + REGEX_ASSERT(matcher->find()); + REGEX_ASSERT(matcher->start(status) == 6); + REGEX_ASSERT(matcher->find()); + REGEX_ASSERT(matcher->start(status) == 12); + REGEX_ASSERT(matcher->find() == FALSE); + REGEX_ASSERT(matcher->find() == FALSE); + + matcher->reset(); + REGEX_ASSERT(matcher->find()); + REGEX_ASSERT(matcher->start(status) == 1); + + REGEX_ASSERT(matcher->find(0, status)); + REGEX_ASSERT(matcher->start(status) == 1); + REGEX_ASSERT(matcher->find(1, status)); + REGEX_ASSERT(matcher->start(status) == 1); + REGEX_ASSERT(matcher->find(2, status)); + REGEX_ASSERT(matcher->start(status) == 6); + REGEX_ASSERT(matcher->find(12, status)); + REGEX_ASSERT(matcher->start(status) == 12); + REGEX_ASSERT(matcher->find(13, status) == FALSE); + REGEX_ASSERT(matcher->find(16, status) == FALSE); + REGEX_ASSERT(matcher->find(17, status) == FALSE); + REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); + + status = U_ZERO_ERROR; + REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); + status = U_ZERO_ERROR; + 
REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); + + REGEX_ASSERT(matcher->groupCount() == 0); + + delete matcher; + delete pat; + + utext_close(&input); + utext_close(&re); + } + + + // + // find, with \G in pattern (true if at the end of a previous match). + // + { + int32_t flags=0; + UParseError pe; + UErrorCode status=U_ZERO_ERROR; + UText re=UTEXT_INITIALIZER; + utext_openUTF8(&re, ".*?(?:(\\Gabc)|(abc))", -1, &status); + + RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); + + REGEX_CHECK_STATUS; + UText input = UTEXT_INITIALIZER; + utext_openUTF8(&input, ".abcabc.abc..", -1, &status); + // 012345678901234567 + + RegexMatcher *matcher = pat->matcher(&input, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->find()); + REGEX_ASSERT(matcher->start(status) == 0); + REGEX_ASSERT(matcher->start(1, status) == -1); + REGEX_ASSERT(matcher->start(2, status) == 1); + + REGEX_ASSERT(matcher->find()); + REGEX_ASSERT(matcher->start(status) == 4); + REGEX_ASSERT(matcher->start(1, status) == 4); + REGEX_ASSERT(matcher->start(2, status) == -1); + REGEX_CHECK_STATUS; + + delete matcher; + delete pat; + + utext_close(&input); + utext_close(&re); + } + + // + // find with zero length matches, match position should bump ahead + // to prevent loops. + // + { + int32_t i; + UErrorCode status=U_ZERO_ERROR; + RegexMatcher m("(?= ?)", 0, status); // This pattern will produce zero-length matches anywhere, + // using an always-true look-ahead. 
+ REGEX_CHECK_STATUS; + UText s = UTEXT_INITIALIZER; + utext_openUTF8(&s, " ", -1, &status); + m.reset(&s); + for (i=0; ; i++) { + if (m.find() == FALSE) { + break; + } + REGEX_ASSERT(m.start(status) == i); + REGEX_ASSERT(m.end(status) == i); + } + REGEX_ASSERT(i==5); + + // Check that the bump goes over characters outside the BMP OK + // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8 + unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00}; + utext_openUTF8(&s, (char *)aboveBMP, -1, &status); + m.reset(&s); + for (i=0; ; i+=2) { + if (m.find() == FALSE) { + break; + } + REGEX_ASSERT(m.start(status) == i); + REGEX_ASSERT(m.end(status) == i); + } + REGEX_ASSERT(i==10); + + utext_close(&s); + } + { + // find() loop breaking test. + // with pattern of /.?/, should see a series of one char matches, then a single + // match of zero length at the end of the input string. + int32_t i; + UErrorCode status=U_ZERO_ERROR; + RegexMatcher m(".?", 0, status); + REGEX_CHECK_STATUS; + UText s = UTEXT_INITIALIZER; + utext_openUTF8(&s, " ", -1, &status); + m.reset(&s); + for (i=0; ; i++) { + if (m.find() == FALSE) { + break; + } + REGEX_ASSERT(m.start(status) == i); + REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); + } + REGEX_ASSERT(i==5); + + utext_close(&s); + } + + + // + // Matchers with no input string behave as if they had an empty input string. 
+ // + + { + UErrorCode status = U_ZERO_ERROR; + RegexMatcher m(".?", 0, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(m.find()); + REGEX_ASSERT(m.start(status) == 0); + REGEX_ASSERT(m.input() == ""); + } + { + UErrorCode status = U_ZERO_ERROR; + RegexPattern *p = RegexPattern::compile(".", 0, status); + RegexMatcher *m = p->matcher(status); + REGEX_CHECK_STATUS; + + REGEX_ASSERT(m->find() == FALSE); + REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0); + delete m; + delete p; + } + + // + // Regions + // + { + UErrorCode status = U_ZERO_ERROR; + UText testPattern = UTEXT_INITIALIZER; + UText testText = UTEXT_INITIALIZER; + utext_openUTF8(&testPattern, ".*", -1, &status); + utext_openUTF8(&testText, "This is test data", -1, &status); + + RegexMatcher m(&testPattern, &testText, 0, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(m.regionStart() == 0); + REGEX_ASSERT(m.regionEnd() == strlen("This is test data")); + REGEX_ASSERT(m.hasTransparentBounds() == FALSE); + REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); + + m.region(2,4, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(m.matches(status)); + REGEX_ASSERT(m.start(status)==2); + REGEX_ASSERT(m.end(status)==4); + REGEX_CHECK_STATUS; + + m.reset(); + REGEX_ASSERT(m.regionStart() == 0); + REGEX_ASSERT(m.regionEnd() == strlen("This is test data")); + + utext_openUTF8(&testText, "short", -1, &status); + m.reset(&testText); + REGEX_ASSERT(m.regionStart() == 0); + REGEX_ASSERT(m.regionEnd() == strlen("short")); + + REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); + REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); + REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); + REGEX_ASSERT(&m == &m.reset()); + REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); + + REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); + REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); + REGEX_ASSERT(&m == &m.reset()); + REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); + + REGEX_ASSERT(m.hasTransparentBounds() == FALSE); + REGEX_ASSERT(&m == 
&m.useTransparentBounds(TRUE)); + REGEX_ASSERT(m.hasTransparentBounds() == TRUE); + REGEX_ASSERT(&m == &m.reset()); + REGEX_ASSERT(m.hasTransparentBounds() == TRUE); + + REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); + REGEX_ASSERT(m.hasTransparentBounds() == FALSE); + REGEX_ASSERT(&m == &m.reset()); + REGEX_ASSERT(m.hasTransparentBounds() == FALSE); + + utext_close(&testText); + utext_close(&testPattern); + } + + // + // hitEnd() and requireEnd() + // + { + UErrorCode status = U_ZERO_ERROR; + UText testPattern = UTEXT_INITIALIZER; + UText testText = UTEXT_INITIALIZER; + utext_openUTF8(&testPattern, ".*", -1, &status); + utext_openUTF8(&testText, "aabb", -1, &status); + + RegexMatcher m1(&testPattern, &testText, 0, status); + REGEX_ASSERT(m1.lookingAt(status) == TRUE); + REGEX_ASSERT(m1.hitEnd() == TRUE); + REGEX_ASSERT(m1.requireEnd() == FALSE); + REGEX_CHECK_STATUS; + + status = U_ZERO_ERROR; + utext_openUTF8(&testPattern, "a*", -1, &status); + RegexMatcher m2(&testPattern, &testText, 0, status); + REGEX_ASSERT(m2.lookingAt(status) == TRUE); + REGEX_ASSERT(m2.hitEnd() == FALSE); + REGEX_ASSERT(m2.requireEnd() == FALSE); + REGEX_CHECK_STATUS; + + status = U_ZERO_ERROR; + utext_openUTF8(&testPattern, ".*$", -1, &status); + RegexMatcher m3(&testPattern, &testText, 0, status); + REGEX_ASSERT(m3.lookingAt(status) == TRUE); + REGEX_ASSERT(m3.hitEnd() == TRUE); + REGEX_ASSERT(m3.requireEnd() == TRUE); + REGEX_CHECK_STATUS; + + utext_close(&testText); + utext_close(&testPattern); + } +} + + +//--------------------------------------------------------------------------- +// +// API_Replace_UTF8 API test for class RegexMatcher, testing the +// Replace family of functions. 
+// +//--------------------------------------------------------------------------- +void RegexTest::API_Replace_UTF8() { + // + // Replace + // + int32_t flags=0; + UParseError pe; + UErrorCode status=U_ZERO_ERROR; + + UText re=UTEXT_INITIALIZER; + utext_openUTF8(&re, "abc", -1, &status); + RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); + REGEX_CHECK_STATUS; + + char data[] = ".abc..abc...abc.."; + // 012345678901234567 + UText dataText = UTEXT_INITIALIZER; + utext_openUTF8(&dataText, data, -1, &status); + RegexMatcher *matcher = pat->matcher(&dataText, status); + + // + // Plain vanilla matches. + // + UnicodeString dest; + UText destText = UTEXT_INITIALIZER; + utext_openUnicodeString(&destText, &dest, &status); + UText *result; + + UText replText = UTEXT_INITIALIZER; + + utext_openUTF8(&replText, "yz", -1, &status); + result = matcher->replaceFirst(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT(".yz..abc...abc..", result); + utext_close(result); + result = matcher->replaceFirst(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT(".yz..abc...abc..", result); + + result = matcher->replaceAll(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT(".yz..yz...yz..", result); + utext_close(result); + + utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); + result = matcher->replaceAll(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT(".yz..yz...yz..", result); + + // + // Plain vanilla non-matches. 
+ // + utext_openUTF8(&dataText, ".abx..abx...abx..", -1, &status); + matcher->reset(&dataText); + + result = matcher->replaceFirst(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT(".abx..abx...abx..", result); + utext_close(result); + result = matcher->replaceFirst(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT(".abx..abx...abx..", result); + + result = matcher->replaceAll(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT(".abx..abx...abx..", result); + utext_close(result); + utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); + result = matcher->replaceAll(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT(".abx..abx...abx..", result); + + // + // Empty source string + // + utext_openUTF8(&dataText, NULL, 0, &status); + matcher->reset(&dataText); + + result = matcher->replaceFirst(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("", result); + utext_close(result); + result = matcher->replaceFirst(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("", result); + + result = matcher->replaceAll(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("", result); + utext_close(result); + result = matcher->replaceAll(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("", result); + + // + // Empty substitution string + // + utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.." 
+ matcher->reset(&dataText); + + utext_openUTF8(&replText, NULL, 0, &status); + result = matcher->replaceFirst(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("...abc...abc..", result); + utext_close(result); + result = matcher->replaceFirst(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("...abc...abc..", result); + + result = matcher->replaceAll(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("........", result); + utext_close(result); + utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); + result = matcher->replaceAll(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("........", result); + + // + // match whole string + // + utext_openUTF8(&dataText, "abc", -1, &status); + matcher->reset(&dataText); + + utext_openUTF8(&replText, "xyz", -1, &status); + result = matcher->replaceFirst(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("xyz", result); + utext_close(result); + utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); + result = matcher->replaceFirst(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("xyz", result); + + result = matcher->replaceAll(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("xyz", result); + utext_close(result); + utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); + result = matcher->replaceAll(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("xyz", result); + + // + // Capture Group, simple case + // + utext_openUTF8(&re, "a(..)", -1, &status); + RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status); + REGEX_CHECK_STATUS; + + utext_openUTF8(&dataText, "abcdefg", -1, &status); + RegexMatcher *matcher2 = 
pat2->matcher(&dataText, status); + REGEX_CHECK_STATUS; + + utext_openUTF8(&replText, "$1$1", -1, &status); + result = matcher2->replaceFirst(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("bcbcdefg", result); + utext_close(result); + utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); + result = matcher2->replaceFirst(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("bcbcdefg", result); + + utext_openUTF8(&replText, "The value of \\$1 is $1.", -1, &status); + result = matcher2->replaceFirst(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("The value of $1 is bc.defg", result); + utext_close(result); + utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); + result = matcher2->replaceFirst(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("The value of $1 is bc.defg", result); + + utext_openUTF8(&replText, "$ by itself, no group number $$$", -1, &status); + result = matcher2->replaceFirst(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("$ by itself, no group number $$$defg", result); + utext_close(result); + utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); + result = matcher2->replaceFirst(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("$ by itself, no group number $$$defg", result); + + unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE + // 012345678901234567890123456 + supplDigitChars[22] = 0xF0; + supplDigitChars[23] = 0x9D; + supplDigitChars[24] = 0x9F; + supplDigitChars[25] = 0x8F; + utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status); + + result = matcher2->replaceFirst(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("Supplemental Digit 1 bc.defg", 
result); + utext_close(result); + utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); + result = matcher2->replaceFirst(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("Supplemental Digit 1 bc.defg", result); + + utext_openUTF8(&replText, "bad capture group number $5...", -1, &status); + REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR); +// REGEX_ASSERT_UTEXT("abcdefg", result); + utext_close(result); + utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); + REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR); + REGEX_ASSERT(result == &destText); +// REGEX_ASSERT_UTEXT("abcdefg", result); + + // + // Replacement String with \u hex escapes + // + { + utext_openUTF8(&dataText, "abc 1 abc 2 abc 3", -1, &status); + utext_openUTF8(&replText, "--\\u0043--", -1, &status); + matcher->reset(&dataText); + + result = matcher->replaceAll(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("--C-- 1 --C-- 2 --C-- 3", result); + utext_close(result); + utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); + result = matcher->replaceAll(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT("--C-- 1 --C-- 2 --C-- 3", result); + } + { + utext_openUTF8(&dataText, "abc !", -1, &status); + utext_openUTF8(&replText, "--\\U00010000--", -1, &status); + matcher->reset(&dataText); + + unsigned char expected[] = "--xxxx-- !"; // \U00010000, "LINEAR B SYLLABLE B008 A" + // 0123456789 + expected[2] = 0xF0; + expected[3] = 0x90; + expected[4] = 0x80; + expected[5] = 0x80; + + result = matcher->replaceAll(&replText, NULL, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT((char *)expected, result); + utext_close(result); + utext_replace(&destText, 0, 
utext_nativeLength(&destText), NULL, 0, &status); + result = matcher->replaceAll(&replText, &destText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(result == &destText); + REGEX_ASSERT_UTEXT((char *)expected, result); + } + // TODO: need more thorough testing of capture substitutions. + + // Bug 4057 + // + { + status = U_ZERO_ERROR; + utext_openUTF8(&re, "ss(.*?)ee", -1, &status); + utext_openUTF8(&dataText, "The matches start with ss and end with ee ss stuff ee fin", -1, &status); + utext_openUTF8(&replText, "ooh", -1, &status); + + RegexMatcher m(&re, 0, status); + REGEX_CHECK_STATUS; + + UnicodeString result; + UText resultText = UTEXT_INITIALIZER; + utext_openUnicodeString(&resultText, &result, &status); + + // Multiple finds do NOT bump up the previous appendReplacement position. + m.reset(&dataText); + m.find(); + m.find(); + m.appendReplacement(&resultText, &replText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText); + + // After a reset into the interior of a string, appendReplacement still starts at beginning. + status = U_ZERO_ERROR; + result.truncate(0); + utext_openUnicodeString(&resultText, &result, &status); + m.reset(10, status); + m.find(); + m.find(); + m.appendReplacement(&resultText, &replText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText); + + // find() at interior of string, appendReplacement still starts at beginning. 
+ status = U_ZERO_ERROR; + result.truncate(0); + utext_openUnicodeString(&resultText, &result, &status); + m.reset(); + m.find(10, status); + m.find(); + m.appendReplacement(&resultText, &replText, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText); + + m.appendTail(&resultText); + REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh fin", &resultText); + + utext_close(&resultText); + } + + delete matcher2; + delete pat2; + delete matcher; + delete pat; + + utext_close(&dataText); + utext_close(&replText); + utext_close(&destText); + utext_close(&re); +} + +
+//---------------------------------------------------------------------------
+//
+// API_Pattern_UTF8 Test that the API for class RegexPattern is
+// present and nominally working.
+//
+//---------------------------------------------------------------------------
+void RegexTest::API_Pattern_UTF8() {
+    // Walks through RegexPattern using patterns supplied as UTF-8 UTexts:
+    // equality, assignment, copy construction, flags(), clone(), matches(),
+    // split(), and pattern()/patternText().
+    RegexPattern pata; // Test default constructor to not crash.
+    RegexPattern patb;
+
+    REGEX_ASSERT(pata == patb);
+    REGEX_ASSERT(pata == pata);
+
+    UText re1 = UTEXT_INITIALIZER;
+    UText re2 = UTEXT_INITIALIZER;
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError pe;
+
+    utext_openUTF8(&re1, "abc[a-l][m-z]", -1, &status);
+    utext_openUTF8(&re2, "def", -1, &status);
+
+    RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
+    RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(*pat1 == *pat1);
+    REGEX_ASSERT(*pat1 != pata);
+
+    // Assign
+    patb = *pat1;
+    REGEX_ASSERT(patb == *pat1);
+
+    // Copy Construct
+    RegexPattern patc(*pat1);
+    REGEX_ASSERT(patc == *pat1);
+    REGEX_ASSERT(patb == patc);
+    // Note: the next assertion compares the two pointers, not the pattern objects.
+    REGEX_ASSERT(pat1 != pat2);
+    patb = *pat2;
+    REGEX_ASSERT(patb != patc);
+    REGEX_ASSERT(patb == *pat2);
+
+    // Compile with no flags.
+    RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
+    REGEX_ASSERT(*pat1a == *pat1);
+
+    REGEX_ASSERT(pat1a->flags() == 0);
+
+    // Compile with different flags should be not equal
+    RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
+    REGEX_CHECK_STATUS;
+
+    REGEX_ASSERT(*pat1b != *pat1a);
+    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
+    REGEX_ASSERT(pat1a->flags() == 0);
+    delete pat1b;
+
+    // clone
+    RegexPattern *pat1c = pat1->clone();
+    REGEX_ASSERT(*pat1c == *pat1);
+    REGEX_ASSERT(*pat1c != *pat2);
+
+    delete pat1c;
+    delete pat1a;
+    delete pat1;
+    delete pat2;
+
+    utext_close(&re1);
+    utext_close(&re2);
+
+
+    //
+    // Verify that a matcher created from a cloned pattern works.
+    // (Jitterbug 3423)
+    //
+    {
+        UErrorCode status = U_ZERO_ERROR;
+        UText pattern = UTEXT_INITIALIZER;
+        utext_openUTF8(&pattern, "\\p{L}+", -1, &status);
+
+        RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
+        RegexPattern *pClone = pSource->clone();
+        // The source pattern is deleted before the clone is used, so the
+        // matcher below depends on the clone alone.
+        delete pSource;
+        RegexMatcher *mFromClone = pClone->matcher(status);
+        REGEX_CHECK_STATUS;
+
+        UText input = UTEXT_INITIALIZER;
+        utext_openUTF8(&input, "Hello World", -1, &status);
+        mFromClone->reset(&input);
+        REGEX_ASSERT(mFromClone->find() == TRUE);
+        REGEX_ASSERT(mFromClone->group(status) == "Hello");
+        REGEX_ASSERT(mFromClone->find() == TRUE);
+        REGEX_ASSERT(mFromClone->group(status) == "World");
+        REGEX_ASSERT(mFromClone->find() == FALSE);
+        delete mFromClone;
+        delete pClone;
+
+        utext_close(&input);
+        utext_close(&pattern);
+    }
+
+    //
+    // matches convenience API
+    //
+    {
+        UErrorCode status = U_ZERO_ERROR;
+        UText pattern = UTEXT_INITIALIZER;
+        UText input = UTEXT_INITIALIZER;
+
+        utext_openUTF8(&input, "random input", -1, &status);
+
+        utext_openUTF8(&pattern, ".*", -1, &status);
+        REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
+        REGEX_CHECK_STATUS;
+
+        // NOTE(review): from here on the matches() calls are given string
+        // literals rather than the pattern/input UTexts opened just before
+        // them -- confirm that this is the overload the test means to cover.
+        utext_openUTF8(&pattern, "abc", -1, &status);
+        REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
+        REGEX_CHECK_STATUS;
+
+        utext_openUTF8(&pattern, ".*nput", -1, &status);
+        REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
+        REGEX_CHECK_STATUS;
+
+        utext_openUTF8(&pattern, "random input", -1, &status);
+        REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
+        REGEX_CHECK_STATUS;
+
+        utext_openUTF8(&pattern, ".*u", -1, &status);
+        REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
+        REGEX_CHECK_STATUS;
+
+        utext_openUTF8(&input, "abc", -1, &status);
+        utext_openUTF8(&pattern, "abc", -1, &status);
+        // matches() is entered with status already holding a failure code; the
+        // test expects FALSE back and the status value left as it was.
+        status = U_INDEX_OUTOFBOUNDS_ERROR;
+        REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
+        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
+
+        utext_close(&input);
+        utext_close(&pattern);
+    }
+
+
+    //
+    // Split()
+    //
+    // re1 was closed above and is re-opened here on a new pattern string;
+    // pat1 was deleted above and is reassigned.
+    status = U_ZERO_ERROR;
+    utext_openUTF8(&re1, " +", -1, &status);
+    pat1 = RegexPattern::compile(&re1, pe, status);
+    REGEX_CHECK_STATUS;
+    UnicodeString fields[10];
+
+    int32_t n;
+    n = pat1->split("Now is the time", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==4);
+    REGEX_ASSERT(fields[0]=="Now");
+    REGEX_ASSERT(fields[1]=="is");
+    REGEX_ASSERT(fields[2]=="the");
+    REGEX_ASSERT(fields[3]=="time");
+    REGEX_ASSERT(fields[4]=="");
+
+    n = pat1->split("Now is the time", fields, 2, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==2);
+    REGEX_ASSERT(fields[0]=="Now");
+    REGEX_ASSERT(fields[1]=="is the time");
+    REGEX_ASSERT(fields[2]=="the"); // left over from previous test
+
+    fields[1] = "*";
+    status = U_ZERO_ERROR;
+    n = pat1->split("Now is the time", fields, 1, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==1);
+    REGEX_ASSERT(fields[0]=="Now is the time");
+    REGEX_ASSERT(fields[1]=="*");
+    status = U_ZERO_ERROR;
+
+    n = pat1->split(" Now is the time ", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==5);
+    REGEX_ASSERT(fields[0]=="");
+    REGEX_ASSERT(fields[1]=="Now");
+    REGEX_ASSERT(fields[2]=="is");
+    REGEX_ASSERT(fields[3]=="the");
+    REGEX_ASSERT(fields[4]=="time");
+    REGEX_ASSERT(fields[5]=="");
+
+    n = pat1->split(" ", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==1);
+    REGEX_ASSERT(fields[0]=="");
+
+    fields[0] = "foo";
+    n = pat1->split("", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==0);
+    REGEX_ASSERT(fields[0]=="foo");
+
+    delete pat1;
+
+    // split, with a pattern with (capture)
+    // NOTE(review): the expected fields "a", "b" and "c" below would be captures
+    // of <(\w*)>, yet no <...> tags appear in the input literals as they stand
+    // in this text -- the literals look truncated; verify against the upstream
+    // test before relying on them.
+    utext_openUTF8(&re1, "<(\\w*)>", -1, &status);
+    pat1 = RegexPattern::compile(&re1, pe, status);
+    REGEX_CHECK_STATUS;
+
+    status = U_ZERO_ERROR;
+    n = pat1->split("Now is the time", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==6);
+    REGEX_ASSERT(fields[0]=="");
+    REGEX_ASSERT(fields[1]=="a");
+    REGEX_ASSERT(fields[2]=="Now is ");
+    REGEX_ASSERT(fields[3]=="b");
+    REGEX_ASSERT(fields[4]=="the time");
+    REGEX_ASSERT(fields[5]=="c");
+    REGEX_ASSERT(fields[6]=="");
+    REGEX_ASSERT(status==U_ZERO_ERROR);
+
+    n = pat1->split(" Now is the time", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==6);
+    REGEX_ASSERT(fields[0]==" ");
+    REGEX_ASSERT(fields[1]=="a");
+    REGEX_ASSERT(fields[2]=="Now is ");
+    REGEX_ASSERT(fields[3]=="b");
+    REGEX_ASSERT(fields[4]=="the time");
+    REGEX_ASSERT(fields[5]=="c");
+    REGEX_ASSERT(fields[6]=="");
+
+    // The sentinel "foo" is written one slot past the requested capacity and
+    // asserted afterwards to still be there.
+    status = U_ZERO_ERROR;
+    fields[6] = "foo";
+    n = pat1->split(" Now is the time", fields, 6, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==6);
+    REGEX_ASSERT(fields[0]==" ");
+    REGEX_ASSERT(fields[1]=="a");
+    REGEX_ASSERT(fields[2]=="Now is ");
+    REGEX_ASSERT(fields[3]=="b");
+    REGEX_ASSERT(fields[4]=="the time");
+    REGEX_ASSERT(fields[5]=="c");
+    REGEX_ASSERT(fields[6]=="foo");
+
+    status = U_ZERO_ERROR;
+    fields[5] = "foo";
+    n = pat1->split(" Now is the time", fields, 5, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==5);
+    REGEX_ASSERT(fields[0]==" ");
+    REGEX_ASSERT(fields[1]=="a");
+    REGEX_ASSERT(fields[2]=="Now is ");
+    REGEX_ASSERT(fields[3]=="b");
+    REGEX_ASSERT(fields[4]=="the time");
+    REGEX_ASSERT(fields[5]=="foo");
+
+    // This stanza repeats the previous one with the same capacity of 5.
+    status = U_ZERO_ERROR;
+    fields[5] = "foo";
+    n = pat1->split(" Now is the time", fields, 5, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==5);
+    REGEX_ASSERT(fields[0]==" ");
+    REGEX_ASSERT(fields[1]=="a");
+    REGEX_ASSERT(fields[2]=="Now is ");
+    REGEX_ASSERT(fields[3]=="b");
+    REGEX_ASSERT(fields[4]=="the time");
+    REGEX_ASSERT(fields[5]=="foo");
+
+    status = U_ZERO_ERROR;
+    n = pat1->split(" Now is the time", fields, 4, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==4);
+    REGEX_ASSERT(fields[0]==" ");
+    REGEX_ASSERT(fields[1]=="a");
+    REGEX_ASSERT(fields[2]=="Now is ");
+    REGEX_ASSERT(fields[3]=="the time");
+    status = U_ZERO_ERROR;
+    delete pat1;
+
+    utext_openUTF8(&re1, "([-,])", -1, &status);
+    pat1 = RegexPattern::compile(&re1, pe, status);
+    REGEX_CHECK_STATUS;
+    n = pat1->split("1-10,20", fields, 10, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(n==5);
+    REGEX_ASSERT(fields[0]=="1");
+    REGEX_ASSERT(fields[1]=="-");
+    REGEX_ASSERT(fields[2]=="10");
+    REGEX_ASSERT(fields[3]==",");
+    REGEX_ASSERT(fields[4]=="20");
+    delete pat1;
+
+
+    //
+    // RegexPattern::pattern() and patternText()
+    //
+    // First on a default-constructed pattern, then on one compiled from a UText.
+    pat1 = new RegexPattern();
+    REGEX_ASSERT(pat1->pattern() == "");
+    REGEX_ASSERT_UTEXT("", pat1->patternText());
+    delete pat1;
+
+    utext_openUTF8(&re1, "(Hello, world)*", -1, &status);
+    pat1 = RegexPattern::compile(&re1, pe, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
+    REGEX_ASSERT_UTEXT("(Hello, world)*", pat1->patternText());
+    delete pat1;
+
+    utext_close(&re1);
+}
+ + //--------------------------------------------------------------------------- // // Extended A more thorough check for features of regex patterns @@ -1586,16 +2945,22 @@ void RegexTest::regex_find(const UnicodeString &pattern, int32_t line) { UnicodeString unEscapedInput; UnicodeString deTaggedInput; + + int32_t
patternUTF8Length, inputUTF8Length; + char *patternChars = NULL, *inputChars = NULL; + UText patternText = UTEXT_INITIALIZER; + UText inputText = UTEXT_INITIALIZER; + UConverter *UTF8Converter = NULL; UErrorCode status = U_ZERO_ERROR; UParseError pe; RegexPattern *parsePat = NULL; RegexMatcher *parseMatcher = NULL; - RegexPattern *callerPattern = NULL; - RegexMatcher *matcher = NULL; + RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL; + RegexMatcher *matcher = NULL, *UTF8Matcher = NULL; UVector groupStarts(status); UVector groupEnds(status); - UBool isMatch = FALSE; + UBool isMatch = FALSE, isUTF8Match = FALSE; UBool failed = FALSE; int32_t numFinds; int32_t i; @@ -1651,6 +3016,46 @@ void RegexTest::regex_find(const UnicodeString &pattern, } } + UTF8Converter = ucnv_open("UTF8", &status); + ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); + + patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status); + status = U_ZERO_ERROR; // buffer overflow + patternChars = new char[patternUTF8Length+1]; + pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status); + utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status); + + if (status == U_ZERO_ERROR) { + UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status); + + if (status != U_ZERO_ERROR) { +#if UCONFIG_NO_BREAK_ITERATION==1 + // 'v' test flag means that the test pattern should not compile if ICU was configured + // to not include break iteration. RBBI is needed for Unicode word boundaries. + if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { + goto cleanupAndReturn; + } +#endif + if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' + // Expected pattern compilation error. + if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' + logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status)); + } + goto cleanupAndReturn; + } else { + // Unexpected pattern compilation error. 
+ errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status)); + goto cleanupAndReturn; + } + } + } + + if (UTF8Pattern == NULL) { + // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine + logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for line %d", line); + status = U_ZERO_ERROR; + } + if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag RegexPatternDump(callerPattern); } @@ -1730,15 +3135,45 @@ void RegexTest::regex_find(const UnicodeString &pattern, if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag matcher->setTrace(TRUE); } + + if (UTF8Pattern != NULL) { + inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status); + status = U_ZERO_ERROR; // buffer overflow + inputChars = new char[inputUTF8Length+1]; + deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status); + utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status); + + if (status == U_ZERO_ERROR) { + UTF8Matcher = UTF8Pattern->matcher(&inputText, status); + REGEX_CHECK_STATUS_L(line); + } + + if (UTF8Matcher == NULL) { + // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine + logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for line %d", line); + status = U_ZERO_ERROR; + } + } + if (regionStart>=0) { matcher->region(regionStart, regionEnd, status); REGEX_CHECK_STATUS_L(line); + if (UTF8Matcher != NULL) { + UTF8Matcher->region(regionStart, regionEnd, status); + REGEX_CHECK_STATUS_L(line); + } } if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag matcher->useAnchoringBounds(FALSE); + if (UTF8Matcher != NULL) { + UTF8Matcher->useAnchoringBounds(FALSE); + } } if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag matcher->useTransparentBounds(TRUE); + if (UTF8Matcher != NULL) { + UTF8Matcher->useTransparentBounds(TRUE); + } } @@ -1751,10 +3186,19 @@ void 
RegexTest::regex_find(const UnicodeString &pattern, for (i=0; imatches(status); + if (UTF8Matcher != NULL) { + isUTF8Match = UTF8Matcher->matches(status); + } } else if (useLookingAtFunc) { isMatch = matcher->lookingAt(status); + if (UTF8Matcher != NULL) { + isUTF8Match = UTF8Matcher->lookingAt(status); + } } else { isMatch = matcher->find(); + if (UTF8Matcher != NULL) { + isUTF8Match = UTF8Matcher->find(); + } } } matcher->setTrace(FALSE); @@ -1768,7 +3212,11 @@ void RegexTest::regex_find(const UnicodeString &pattern, // G option in test means that capture group data is not available in the // expected results, so the check needs to be suppressed. if (isMatch == FALSE && groupStarts.size() != 0) { - errln("Error at line %d: Match expected, but none found.\n", line); + errln("Error at line %d: Match expected, but none found.", line); + failed = TRUE; + goto cleanupAndReturn; + } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) { + errln("Error at line %d: Match expected, but none found. (UTF8)", line); failed = TRUE; goto cleanupAndReturn; } @@ -1776,12 +3224,16 @@ void RegexTest::regex_find(const UnicodeString &pattern, if (flags.indexOf((UChar)0x47 /*G*/) >= 0) { // Only check for match / no match. Don't check capture groups. if (isMatch && groupStarts.size() == 0) { - errln("Error at line %d: No match expected, but one found.\n", line); + errln("Error at line %d: No match expected, but one found.", line); + failed = TRUE; + } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) { + errln("Error at line %d: No match expected, but one found. (UTF8)", line); failed = TRUE; } goto cleanupAndReturn; } + REGEX_CHECK_STATUS_L(line); for (i=0; i<=matcher->groupCount(); i++) { int32_t expectedStart = (i >= groupStarts.size()? 
-1 : groupStarts.elementAti(i)); if (matcher->start(i, status) != expectedStart) { @@ -1789,7 +3241,13 @@ void RegexTest::regex_find(const UnicodeString &pattern, line, i, expectedStart, matcher->start(i, status)); failed = TRUE; goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. + } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStart) { + errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)", + line, i, expectedStart, UTF8Matcher->start(i, status)); + failed = TRUE; + goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. } + int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i)); if (matcher->end(i, status) != expectedEnd) { errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d", @@ -1797,6 +3255,12 @@ void RegexTest::regex_find(const UnicodeString &pattern, failed = TRUE; // Error on end position; keep going; real error is probably yet to come as group // end positions work from end of the input data towards the front. + } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEnd) { + errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)", + line, i, expectedEnd, UTF8Matcher->end(i, status)); + failed = TRUE; + // Error on end position; keep going; real error is probably yet to come as group + // end positions work from end of the input data towards the front. } } if ( matcher->groupCount()+1 < groupStarts.size()) { @@ -1804,39 +3268,71 @@ void RegexTest::regex_find(const UnicodeString &pattern, line, groupStarts.size()-1, matcher->groupCount()); failed = TRUE; } + else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) { + errln("Error at line %d: Expected %d capture groups, found %d. 
(UTF8)", + line, groupStarts.size()-1, UTF8Matcher->groupCount()); + failed = TRUE; + } if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false matcher->requireEnd() == TRUE) { errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line); failed = TRUE; + } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false + UTF8Matcher->requireEnd() == TRUE) { + errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line); + failed = TRUE; } + if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true matcher->requireEnd() == FALSE) { errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line); failed = TRUE; + } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false + UTF8Matcher->requireEnd() == FALSE) { + errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line); + failed = TRUE; } + if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false matcher->hitEnd() == TRUE) { errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line); failed = TRUE; + } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false + UTF8Matcher->hitEnd() == TRUE) { + errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line); + failed = TRUE; } + if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true matcher->hitEnd() == FALSE) { errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line); failed = TRUE; + } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true + UTF8Matcher->hitEnd() == FALSE) { + errln("Error at line %d: hitEnd() returned FALSE. 
Expected TRUE (UTF8)", line); + failed = TRUE; } cleanupAndReturn: if (failed) { - errln((UnicodeString)"\""+pattern+(UnicodeString)"\" " + infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" " +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\""); // callerPattern->dump(); } delete parseMatcher; delete parsePat; + delete UTF8Matcher; + delete UTF8Pattern; delete matcher; delete callerPattern; + + utext_close(&inputText); + delete[] inputChars; + utext_close(&patternText); + delete[] patternChars; + ucnv_close(UTF8Converter); } @@ -2268,10 +3764,18 @@ void RegexTest::PerlTests() { // UnicodeString resultString; UnicodeString perlExpr = fields[3]; +#if SUPPORT_MUTATING_INPUT_STRING groupsMat->reset(perlExpr); cgMat->reset(perlExpr); +#endif while (perlExpr.length() > 0) { +#if !SUPPORT_MUTATING_INPUT_STRING + // Perferred usage. Reset after any modification to input string. + groupsMat->reset(perlExpr); + cgMat->reset(perlExpr); +#endif + if (perlExpr.startsWith("$&")) { resultString.append(testMat->group(status)); perlExpr.remove(0, 2); @@ -2368,7 +3872,7 @@ void RegexTest::PerlTests() { if (expectedS.compare(resultString) != 0) { err("Line %d: Incorrect perl expression results.", lineNum); - errln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); + infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); } delete testMat; @@ -2399,6 +3903,400 @@ void RegexTest::PerlTests() { } +//------------------------------------------------------------------------------- +// +// PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts +// (instead of using UnicodeStrings) to test the alternate engine. +// The input file for this test is re_tests, the standard regular +// expression test data distributed with the Perl source code. +// See PerlTests() for more information. 
+//
+//-------------------------------------------------------------------------------
+void RegexTest::PerlTestsUTF8() {
+    // Flow: read re_tests.txt, then for every line compile the pattern and open
+    // the match string as UTF-8 UTexts, run find(), and evaluate the line's Perl
+    // result expression against the match.
+    //
+    // Resources owned here: the UTF-8 converter, the two char buffers behind the
+    // UTexts (grown on demand and reused across lines), the helper patterns and
+    // matchers, and the per-line testPat/testMat pair. Each path that leaves the
+    // loop body or the function releases what it owns at that point.
+    char tdd[2048];
+    const char *srcPath;
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError pe;
+    UConverter *UTF8Converter = ucnv_open("UTF-8", &status);
+    UText patternText = UTEXT_INITIALIZER;
+    char *patternChars = NULL;
+    int32_t patternLength;
+    int32_t patternCapacity = 0;
+    UText inputText = UTEXT_INITIALIZER;
+    char *inputChars = NULL;
+    int32_t inputLength;
+    int32_t inputCapacity = 0;
+
+    ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
+
+    //
+    // Open and read the test data file.
+    //
+    srcPath=getPath(tdd, "re_tests.txt");
+    if(srcPath==NULL) {
+        ucnv_close(UTF8Converter);
+        return; /* something went wrong, error already output */
+    }
+
+    int32_t len;
+    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
+    if (U_FAILURE(status)) {
+        ucnv_close(UTF8Converter);
+        return; /* something went wrong, error already output */
+    }
+
+    //
+    // Put the test data into a UnicodeString
+    //
+    UnicodeString testDataString(FALSE, testData, len);
+
+    //
+    // Regex to break the input file into lines, and strip the new lines.
+    // One line per match, capture group one is the desired data.
+    //
+    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
+    if (U_FAILURE(status)) {
+        dataerrln("RegexPattern::compile() error");
+        delete [] testData;
+        ucnv_close(UTF8Converter);
+        return;
+    }
+    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
+
+    //
+    // Regex to split a test file line into fields.
+    // There are six fields, separated by tabs.
+    //
+    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
+
+    //
+    // Regex to identify test patterns with flag settings, and to separate them.
+    // Test patterns with flags look like 'pattern'i
+    // Test patterns without flags are not quoted: pattern
+    // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
+    //
+    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
+    RegexMatcher* flagMat = flagPat->matcher(status);
+
+    //
+    // The Perl tests reference several perl-isms, which are evaluated/substituted
+    // in the test data. Not being perl, this must be done explicitly. Here
+    // are string constants and REs for these constructs.
+    //
+    UnicodeString nulnulSrc("${nulnul}");
+    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
+    nulnul = nulnul.unescape();
+
+    UnicodeString ffffSrc("${ffff}");
+    UnicodeString ffff("\\uffff", -1, US_INV);
+    ffff = ffff.unescape();
+
+    // regexp for $-[0], $+[2], etc.
+    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
+    RegexMatcher *groupsMat = groupsPat->matcher(status);
+
+    // regexp for $0, $1, $2, etc.
+    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
+    RegexMatcher *cgMat = cgPat->matcher(status);
+
+
+    //
+    // Main Loop for the Perl Tests, runs once per line from the
+    // test data file.
+    //
+    int32_t lineNum = 0;
+    int32_t skippedUnimplementedCount = 0;
+    while (lineMat->find()) {
+        lineNum++;
+
+        //
+        // Get a line, break it into its fields, do the Perl
+        // variable substitutions.
+        //
+        UnicodeString line = lineMat->group(1, status);
+        UnicodeString fields[7];
+        fieldPat->split(line, fields, 7, status);
+
+        flagMat->reset(fields[0]);
+        flagMat->matches(status);
+        UnicodeString pattern = flagMat->group(2, status);
+        pattern.findAndReplace("${bang}", "!");
+        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
+        pattern.findAndReplace(ffffSrc, ffff);
+
+        //
+        // Identify patterns that include match flag settings,
+        // split off the flags, remove the extra quotes.
+        //
+        UnicodeString flagStr = flagMat->group(3, status);
+        if (U_FAILURE(status)) {
+            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
+            // Stop processing, but leave through the common cleanup after the
+            // loop rather than returning with everything still allocated.
+            break;
+        }
+        int32_t flags = 0;
+        const UChar UChar_c = 0x63; // Char constants for the flag letters.
+        const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
+        const UChar UChar_m = 0x6d;
+        const UChar UChar_x = 0x78;
+        const UChar UChar_y = 0x79;
+        if (flagStr.indexOf(UChar_i) != -1) {
+            flags |= UREGEX_CASE_INSENSITIVE;
+        }
+        if (flagStr.indexOf(UChar_m) != -1) {
+            flags |= UREGEX_MULTILINE;
+        }
+        if (flagStr.indexOf(UChar_x) != -1) {
+            flags |= UREGEX_COMMENTS;
+        }
+
+        //
+        // Put the pattern in a UTF-8 UText
+        //
+        // The first extract() runs against the current buffer; on overflow the
+        // buffer is regrown to the reported length plus a terminator and the
+        // extraction is repeated.
+        status = U_ZERO_ERROR;
+        patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter, status);
+        if (status == U_BUFFER_OVERFLOW_ERROR) {
+            status = U_ZERO_ERROR;
+            delete[] patternChars;
+            patternCapacity = patternLength + 1;
+            patternChars = new char[patternCapacity];
+            pattern.extract(patternChars, patternCapacity, UTF8Converter, status);
+        }
+        utext_openUTF8(&patternText, patternChars, patternLength, &status);
+
+        //
+        // Compile the test pattern.
+        //
+        RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
+        if (status == U_REGEX_UNIMPLEMENTED) {
+            //
+            // Test of a feature that is planned for ICU, but not yet implemented.
+            // skip the test.
+            skippedUnimplementedCount++;
+            delete testPat;
+            status = U_ZERO_ERROR;
+            continue;
+        }
+
+        if (U_FAILURE(status)) {
+            // Some tests are supposed to generate errors.
+            // Only report an error for tests that are supposed to succeed.
+            if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
+                fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
+            {
+                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
+            }
+            status = U_ZERO_ERROR;
+            delete testPat;
+            continue;
+        }
+
+        if (fields[2].indexOf(UChar_i) >= 0) {
+            // ICU should skip this test.
+            delete testPat;
+            continue;
+        }
+
+        if (fields[2].indexOf(UChar_c) >= 0) {
+            // This pattern should have caused a compilation error, but didn't.
+            errln("line %d: Expected a pattern compile error, got success.", lineNum);
+            delete testPat;
+            continue;
+        }
+
+
+        //
+        // replace the Perl variables that appear in some of the
+        // match data strings.
+        //
+        UnicodeString matchString = fields[1];
+        matchString.findAndReplace(nulnulSrc, nulnul);
+        matchString.findAndReplace(ffffSrc, ffff);
+
+        // Replace any \n in the match string with an actual new-line char.
+        // Don't do full unescape, as this unescapes more than Perl does, which
+        // causes other spurious failures in the tests.
+        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
+
+        //
+        // Put the input in a UTF-8 UText
+        //
+        status = U_ZERO_ERROR;
+        inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter, status);
+        if (status == U_BUFFER_OVERFLOW_ERROR) {
+            status = U_ZERO_ERROR;
+            delete[] inputChars;
+            inputCapacity = inputLength + 1;
+            inputChars = new char[inputCapacity];
+            matchString.extract(inputChars, inputCapacity, UTF8Converter, status);
+        }
+        utext_openUTF8(&inputText, inputChars, inputLength, &status);
+
+        //
+        // Run the test, check for expected match/don't match result.
+        //
+        RegexMatcher *testMat = testPat->matcher(&inputText, status);
+        if (U_FAILURE(status) || testMat == NULL) {
+            // No usable matcher (for instance the input could not be converted
+            // or opened). Report it and move on instead of calling find() on it.
+            errln("line %d: ICU Error \"%s\" creating matcher", lineNum, u_errorName(status));
+            delete testMat;
+            delete testPat;
+            status = U_ZERO_ERROR;
+            continue;
+        }
+        UBool found = testMat->find();
+        UBool expected = FALSE;
+        if (fields[2].indexOf(UChar_y) >=0) {
+            expected = TRUE;
+        }
+        if (expected != found) {
+            errln("line %d: Expected %smatch, got %smatch",
+                lineNum, expected?"":"no ", found?"":"no " );
+            // Release this line's matcher and pattern before moving on.
+            delete testMat;
+            delete testPat;
+            continue;
+        }
+
+        // Don't try to check expected results if there is no match.
+        // (Some have stuff in the expected fields)
+        if (!found) {
+            delete testMat;
+            delete testPat;
+            continue;
+        }
+
+        //
+        // Interpret the Perl expression from the fourth field of the data file,
+        // building up an ICU string from the results of the ICU match.
+        // The Perl expression will contain references to the results of
+        // a regex match, including the matched string, capture group strings,
+        // group starting and ending indices, etc.
+        //
+        UnicodeString resultString;
+        UnicodeString perlExpr = fields[3];
+
+        while (perlExpr.length() > 0) {
+            // perlExpr shrinks from the front on every pass, so both helper
+            // matchers are reset on the modified string each time round.
+            groupsMat->reset(perlExpr);
+            cgMat->reset(perlExpr);
+
+            if (perlExpr.startsWith("$&")) {
+                resultString.append(testMat->group(status));
+                perlExpr.remove(0, 2);
+            }
+
+            else if (groupsMat->lookingAt(status)) {
+                // $-[0] $+[2] etc.
+                UnicodeString digitString = groupsMat->group(2, status);
+                int32_t t = 0;
+                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
+                UnicodeString plusOrMinus = groupsMat->group(1, status);
+                int32_t matchPosition;
+                if (plusOrMinus.compare("+") == 0) {
+                    matchPosition = testMat->end(groupNum, status);
+                } else {
+                    matchPosition = testMat->start(groupNum, status);
+                }
+                if (matchPosition != -1) {
+                    ICU_Utility::appendNumber(resultString, matchPosition);
+                }
+                perlExpr.remove(0, groupsMat->end(status));
+            }
+
+            else if (cgMat->lookingAt(status)) {
+                // $1, $2, $3, etc.
+                UnicodeString digitString = cgMat->group(1, status);
+                int32_t t = 0;
+                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
+                if (U_SUCCESS(status)) {
+                    resultString.append(testMat->group(groupNum, status));
+                    status = U_ZERO_ERROR;
+                }
+                perlExpr.remove(0, cgMat->end(status));
+            }
+
+            else if (perlExpr.startsWith("@-")) {
+                int32_t i;
+                for (i=0; i<=testMat->groupCount(); i++) {
+                    if (i>0) {
+                        resultString.append(" ");
+                    }
+                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
+                }
+                perlExpr.remove(0, 2);
+            }
+
+            else if (perlExpr.startsWith("@+")) {
+                int32_t i;
+                for (i=0; i<=testMat->groupCount(); i++) {
+                    if (i>0) {
+                        resultString.append(" ");
+                    }
+                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
+                }
+                perlExpr.remove(0, 2);
+            }
+
+            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
+                // or as an escaped sequence (e.g. \n)
+                if (perlExpr.length() > 1) {
+                    perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
+                }
+                UChar c = perlExpr.charAt(0);
+                switch (c) {
+                case 'n': c = '\n'; break;
+                // add any other escape sequences that show up in the test expected results.
+                }
+                resultString.append(c);
+                perlExpr.remove(0, 1);
+            }
+
+            else {
+                // Any characters from the perl expression that we don't explicitly
+                // recognize before here are assumed to be literals and copied
+                // as-is to the expected results.
+                resultString.append(perlExpr.charAt(0));
+                perlExpr.remove(0, 1);
+            }
+
+            if (U_FAILURE(status)) {
+                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
+                break;
+            }
+        }
+
+        //
+        // Expected Results Compare
+        //
+        UnicodeString expectedS(fields[4]);
+        expectedS.findAndReplace(nulnulSrc, nulnul);
+        expectedS.findAndReplace(ffffSrc, ffff);
+        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
+
+
+        if (expectedS.compare(resultString) != 0) {
+            err("Line %d: Incorrect perl expression results.", lineNum);
+            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
+        }
+
+        delete testMat;
+        delete testPat;
+    }
+
+    //
+    // All done. Clean up allocated stuff.
+    //
+    delete cgMat;
+    delete cgPat;
+
+    delete groupsMat;
+    delete groupsPat;
+
+    delete flagMat;
+    delete flagPat;
+
+    delete lineMat;
+    delete linePat;
+
+    delete fieldPat;
+    delete [] testData;
+
+    utext_close(&patternText);
+    utext_close(&inputText);
+
+    delete [] patternChars;
+    delete [] inputChars;
+
+    // The converter opened at the top of the function is released here.
+    ucnv_close(UTF8Converter);
+
+
+    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
+
+}
+ + //-------------------------------------------------------------- // // Bug6149 Verify limits to heap expansion for backtrack stack. @@ -2514,5 +4412,205 @@ void RegexTest::Callbacks() { } + +//--------------------------------------------------------------------------- +// +// PreAllocatedUTextCAPI Check the C API with pre-allocated mutable +// UTexts. The pure-C implementation of UText +// has no mutable backing stores, but we can +// use UnicodeString here to test the functionality.
+//
+//---------------------------------------------------------------------------
+void RegexTest::PreAllocatedUTextCAPI () {
+    // Exercises the uregex_*UText() C entry points that accept a caller-supplied
+    // destination UText.  Every call below passes &bufferText as the destination and
+    // asserts that the very same pointer comes back, then checks its contents.
+    UErrorCode           status = U_ZERO_ERROR;
+    URegularExpression  *re;
+    UText                patternText = UTEXT_INITIALIZER;
+    UnicodeString        buffer;
+    UText                bufferText = UTEXT_INITIALIZER;
+
+    // bufferText is a writable view onto 'buffer'; it is reused by every section.
+    utext_openUnicodeString(&bufferText, &buffer, &status);
+
+    /*
+     *  getText() and getUText()
+     */
+    {
+        UText  text1 = UTEXT_INITIALIZER;
+        UText  text2 = UTEXT_INITIALIZER;
+        UChar  text2Chars[20];
+        UText  *resultText;
+
+        status = U_ZERO_ERROR;
+        utext_openUTF8(&text1, "abcccd", -1, &status);
+        /* Size the copy by the destination UChar array.  The original used sizeof(text2),
+         * i.e. the size of the UText struct, which has nothing to do with the buffer.
+         * text2 is opened once, over text2Chars; an earlier UTF-8 open of text2 was
+         * immediately overwritten by this one and has been dropped. */
+        u_uastrncpy(text2Chars, "abcccxd", sizeof(text2Chars)/2);
+        utext_openUChars(&text2, text2Chars, -1, &status);
+
+        utext_openUTF8(&patternText, "abc*d", -1, &status);
+        re = uregex_openUText(&patternText, 0, NULL, &status);
+
+        /* First set a UText */
+        uregex_setUText(re, &text1, &status);
+        resultText = uregex_getUText(re, &bufferText, &status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(resultText == &bufferText);
+        /* utext_compare() starts at each text's current position, so rewind both. */
+        utext_setNativeIndex(resultText, 0);
+        utext_setNativeIndex(&text1, 0);
+        REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
+
+        /* Ask a second time; the answer must be the same. */
+        resultText = uregex_getUText(re, &bufferText, &status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(resultText == &bufferText);
+        utext_setNativeIndex(resultText, 0);
+        utext_setNativeIndex(&text1, 0);
+        REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
+
+        /* Then set a UChar * */
+        uregex_setText(re, text2Chars, 7, &status);
+        resultText = uregex_getUText(re, &bufferText, &status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(resultText == &bufferText);
+        utext_setNativeIndex(resultText, 0);
+        utext_setNativeIndex(&text2, 0);
+        REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
+
+        uregex_close(re);
+        utext_close(&text1);
+        utext_close(&text2);
+    }
+
+    /*
+     *  group()
+     */
+    {
+        UChar    text1[80];
+        UText   *actual;
+        UBool    result;
+        u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2);
+
+        status = U_ZERO_ERROR;
+        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
+        REGEX_CHECK_STATUS;
+
+        uregex_setText(re, text1, -1, &status);
+        result = uregex_find(re, 0, &status);
+        REGEX_ASSERT(result==TRUE);
+
+        /*  Capture Group 0, the full match.  Should succeed.  */
+        status = U_ZERO_ERROR;
+        actual = uregex_groupUText(re, 0, &bufferText, &status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(actual == &bufferText);
+        REGEX_ASSERT_UTEXT("abc interior def", actual);
+
+        /*  Capture group #1.  Should succeed. */
+        status = U_ZERO_ERROR;
+        actual = uregex_groupUText(re, 1, &bufferText, &status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(actual == &bufferText);
+        REGEX_ASSERT_UTEXT(" interior ", actual);
+
+        /*  Capture group out of range.  Error.
+         *  The destination UText is still expected to be handed back. */
+        status = U_ZERO_ERROR;
+        actual = uregex_groupUText(re, 2, &bufferText, &status);
+        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
+        REGEX_ASSERT(actual == &bufferText);
+
+        uregex_close(re);
+
+    }
+
+    /*
+     *  replaceFirst()
+     */
+    {
+        UChar    text1[80];
+        UChar    text2[80];
+        UText    replText = UTEXT_INITIALIZER;
+        UText   *result;
+
+        status = U_ZERO_ERROR;
+        u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
+        u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
+        utext_openUTF8(&replText, "<$1>", -1, &status);
+
+        re = uregex_openC("x(.*?)x", 0, NULL, &status);
+        REGEX_CHECK_STATUS;
+
+        /*  Normal case, with match.
+         *  bufferText is emptied first so that it holds only this call's output.
+         *  Only the first match, "xaax", is rewritten, to "<aa>". */
+        uregex_setText(re, text1, -1, &status);
+        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
+        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(result == &bufferText);
+        REGEX_ASSERT_UTEXT("Replace <aa> x1x x...x.", result);
+
+        /* No match.  Text should copy to output with no changes.  */
+        uregex_setText(re, text2, -1, &status);
+        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
+        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(result == &bufferText);
+        REGEX_ASSERT_UTEXT("No match here.", result);
+
+        /* Unicode escapes.  replText is re-opened over a new replacement string. */
+        uregex_setText(re, text1, -1, &status);
+        utext_openUTF8(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
+        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
+        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(result == &bufferText);
+        REGEX_ASSERT_UTEXT("Replace \\AaaB$a x1x x...x.", result);
+
+        uregex_close(re);
+        utext_close(&replText);
+    }
+
+
+    /*
+     *  replaceAll()
+     */
+    {
+        UChar    text1[80];
+        UChar    text2[80];
+        UText    replText = UTEXT_INITIALIZER;
+        UText   *result;
+
+        status = U_ZERO_ERROR;
+        u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
+        u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
+        utext_openUTF8(&replText, "<$1>", -1, &status);
+
+        re = uregex_openC("x(.*?)x", 0, NULL, &status);
+        REGEX_CHECK_STATUS;
+
+        /*  Normal case, with match.
+         *  All three matches ("xaax", "x1x", "x...x") are rewritten. */
+        uregex_setText(re, text1, -1, &status);
+        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
+        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(result == &bufferText);
+        REGEX_ASSERT_UTEXT("Replace <aa> <1> <...>.", result);
+
+        /* No match.  Text should copy to output with no changes.  */
+        uregex_setText(re, text2, -1, &status);
+        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
+        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(result == &bufferText);
+        REGEX_ASSERT_UTEXT("No match here.", result);
+
+        uregex_close(re);
+        utext_close(&replText);
+    }
+
+
+    /*
+     *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
+     *  so we don't need to test it here.
+     */
+
+    utext_close(&bufferText);
+    /* patternText was opened in the first section and was never closed. */
+    utext_close(&patternText);
+}
+
 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
diff --git a/icu4c/source/test/intltest/utxttest.cpp b/icu4c/source/test/intltest/utxttest.cpp
index c84b3652bf1..fb93becb9d2 100644
--- a/icu4c/source/test/intltest/utxttest.cpp
+++ b/icu4c/source/test/intltest/utxttest.cpp
@@ -1,6 +1,6 @@
 /********************************************************************
  * COPYRIGHT:
- * Copyright (c) 2005-2009, International Business Machines Corporation and
+ * Copyright (c) 2005-2010, International Business Machines Corporation and
  * others. All Rights Reserved.
  ********************************************************************/
 /************************************************************************
@@ -58,6 +58,8 @@ UTextTest::runIndexedTest(int32_t index, UBool exec,
             if (exec) Ticket5560(); break;
         case 4: name = "Ticket6847";
             if (exec) Ticket6847(); break;
+        case 5: name = "ComparisonTest";
+            if (exec) ComparisonTest(); break;
         default: name = ""; break;
     }
 }
@@ -836,6 +838,476 @@ void UTextTest::TestAccessNoClone(const UnicodeString &us, UText *ut, int cpCoun
 }
 
 
+//
+//  ComparisonTest()    Check the string comparison functions.
+//                      Based on UnicodeStringTest::TestCompare()
+//
+//  Each comparison starts at the current iteration position of both UTexts and
+//  moves those positions, so every check below first sets the native index of
+//  the two texts it is about to compare.
+//
+void UTextTest::ComparisonTest()
+{
+    UErrorCode      status = U_ZERO_ERROR;
+    UnicodeString   test1Str("this is a test");
+    UnicodeString   test2Str("this is a test");
+    UnicodeString   test3Str("this is a test of the emergency broadcast system");
+    UnicodeString   test4Str("never say, \"this is a test\"!!");
+
+    UText test1 = UTEXT_INITIALIZER;
+    UText test2 = UTEXT_INITIALIZER;
+    UText test3 = UTEXT_INITIALIZER;
+    UText test4 = UTEXT_INITIALIZER;
+
+    // "this is a test" again, as UTF-16 code units and as bytes.
+    UChar uniChars[] = { 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73,
+                         0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0 };
+    char chars[] = { 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73,
+                     0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0 };
+
+    UText uniCharText = UTEXT_INITIALIZER;
+    UText charText = UTEXT_INITIALIZER;
+
+    utext_openUnicodeString(&test1, &test1Str, &status);
+    utext_openUnicodeString(&test2, &test2Str, &status);
+    utext_openUnicodeString(&test3, &test3Str, &status);
+    utext_openUnicodeString(&test4, &test4Str, &status);
+
+    utext_openUChars(&uniCharText, uniChars, -1, &status);
+    utext_openUTF8(&charText, chars, -1, &status);
+
+    TEST_SUCCESS(status);
+
+    // test utext_compare(), simple
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test1, -1, &test2, -1) != 0) errln("utext_compare() failed, simple setup");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compare(&test1, -1, &test3, -1) >= 0) errln("utext_compare() failed, simple setup");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 0);
+    if (utext_compare(&test1, -1, &test4, -1) <= 0) errln("utext_compare() failed, simple setup");
+
+    // test utext_compareNativeLimit(), simple
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test1, -1, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, simple setup");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compareNativeLimit(&test1, -1, &test3, -1) >= 0) errln("utext_compareNativeLimit() failed, simple setup");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 0);
+    if (utext_compareNativeLimit(&test1, -1, &test4, -1) <= 0) errln("utext_compareNativeLimit() failed, simple setup");
+
+    // test utext_compare(), one explicit length
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test1, 14, &test2, -1) != 0) errln("utext_compare() failed, one explicit length");
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compare(&test3, 14, &test2, -1) != 0) errln("utext_compare() failed, one explicit length");
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 12);
+    if (utext_compare(&test4, 14, &test2, -1) != 0) errln("utext_compare() failed, one explicit length and offset");
+    // Rewind test2 (the text actually compared); the original rewound test1 here,
+    // leaving test2 wherever the previous comparison stopped.
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compare(&test3, 18, &test2, -1) <= 0) errln("utext_compare() failed, one explicit length");
+
+    // test utext_compareNativeLimit(), one explicit length
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test1, 14, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, one explicit length");
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compareNativeLimit(&test3, 14, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, one explicit length");
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 12);
+    if (utext_compareNativeLimit(&test4, 26, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, one explicit length and limit");
+    // As above: rewind test2, not test1.
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compareNativeLimit(&test3, 18, &test2, -1) <= 0) errln("utext_compareNativeLimit() failed, one explicit length");
+
+    // test utext_compare(), UChar-based UText
+    UTEXT_SETNATIVEINDEX(&uniCharText, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test2, -1, &uniCharText, -1) != 0) errln("utext_compare() failed, UChar-based UText");
+    UTEXT_SETNATIVEINDEX(&uniCharText, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compare(&test3, -1, &uniCharText, -1) <= 0) errln("utext_compare() failed, UChar-based UText");
+    UTEXT_SETNATIVEINDEX(&uniCharText, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 0);
+    if (utext_compare(&test4, -1, &uniCharText, -1) >= 0) errln("utext_compare() failed, UChar-based UText");
+
+    // test utext_compareNativeLimit(), UChar-based UText
+    UTEXT_SETNATIVEINDEX(&uniCharText, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test2, -1, &uniCharText, -1) != 0) errln("utext_compareNativeLimit() failed, UChar-based UText");
+    UTEXT_SETNATIVEINDEX(&uniCharText, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compareNativeLimit(&test3, -1, &uniCharText, -1) <= 0) errln("utext_compareNativeLimit() failed, UChar-based UText");
+    UTEXT_SETNATIVEINDEX(&uniCharText, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 0);
+    if (utext_compareNativeLimit(&test4, -1, &uniCharText, -1) >= 0) errln("utext_compareNativeLimit() failed, UChar-based UText");
+
+    // test utext_compare(), UTF8-based UText
+    UTEXT_SETNATIVEINDEX(&charText, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test2, -1, &charText, -1) != 0) errln("utext_compare() failed, UTF8-based UText");
+    UTEXT_SETNATIVEINDEX(&charText, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compare(&test3, -1, &charText, -1) <= 0) errln("utext_compare() failed, UTF8-based UText");
+    UTEXT_SETNATIVEINDEX(&charText, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 0);
+    if (utext_compare(&test4, -1, &charText, -1) >= 0) errln("utext_compare() failed, UTF8-based UText");
+
+    // test utext_compareNativeLimit(), UTF8-based UText
+    UTEXT_SETNATIVEINDEX(&charText, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test2, -1, &charText, -1) != 0) errln("utext_compareNativeLimit() failed, UTF8-based UText");
+    UTEXT_SETNATIVEINDEX(&charText, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compareNativeLimit(&test3, -1, &charText, -1) <= 0) errln("utext_compareNativeLimit() failed, UTF8-based UText");
+    UTEXT_SETNATIVEINDEX(&charText, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 0);
+    if (utext_compareNativeLimit(&test4, -1, &charText, -1) >= 0) errln("utext_compareNativeLimit() failed, UTF8-based UText");
+
+    // test utext_compare(), length
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test1, -1, &test2, 4) != 0) errln("utext_compare() failed, one length");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test1, 5, &test2, 4) <= 0) errln("utext_compare() failed, both lengths");
+
+    // test utext_compareNativeLimit(), limit
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test1, -1, &test2, 4) != 0) errln("utext_compareNativeLimit() failed, one limit");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test1, 5, &test2, 4) <= 0) errln("utext_compareNativeLimit() failed, both limits");
+
+    // test utext_compare(), both explicit offsets and lengths
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test1, 14, &test2, 14) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compare(&test1, 14, &test3, 14) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 12);
+    if (utext_compare(&test1, 14, &test4, 14) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
+    UTEXT_SETNATIVEINDEX(&test1, 10);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test1, 4, &test2, 4) >= 0) errln("utext_compare() failed, both explicit offsets and lengths");
+    UTEXT_SETNATIVEINDEX(&test1, 10);
+    UTEXT_SETNATIVEINDEX(&test3, 22);
+    if (utext_compare(&test1, 4, &test3, 9) <= 0) errln("utext_compare() failed, both explicit offsets and lengths");
+    UTEXT_SETNATIVEINDEX(&test1, 10);
+    UTEXT_SETNATIVEINDEX(&test4, 22);
+    if (utext_compare(&test1, 4, &test4, 4) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
+
+    // test utext_compareNativeLimit(), both explicit offsets and limits
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test1, 14, &test2, 14) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compareNativeLimit(&test1, 14, &test3, 14) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 12);
+    if (utext_compareNativeLimit(&test1, 14, &test4, 26) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
+    UTEXT_SETNATIVEINDEX(&test1, 10);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test1, 14, &test2, 4) >= 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
+    UTEXT_SETNATIVEINDEX(&test1, 10);
+    UTEXT_SETNATIVEINDEX(&test3, 22);
+    if (utext_compareNativeLimit(&test1, 14, &test3, 31) <= 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
+    UTEXT_SETNATIVEINDEX(&test1, 10);
+    UTEXT_SETNATIVEINDEX(&test4, 22);
+    if (utext_compareNativeLimit(&test1, 14, &test4, 26) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
+
+    /* test caseCompare() */
+    {
+        static const UChar
+        _mixed[]=               { 0x61, 0x42, 0x131, 0x3a3, 0xdf, 0x130, 0x49, 0xfb03, 0xd93f, 0xdfff, 0 },
+        _otherDefault[]=        { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x69, 0x307, 0x69, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
+        _otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x69, 0x131, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
+        _different[]=           { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x130, 0x49, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
+
+        UText
+        mixed = UTEXT_INITIALIZER,
+        otherDefault = UTEXT_INITIALIZER,
+        otherExcludeSpecialI = UTEXT_INITIALIZER,
+        different = UTEXT_INITIALIZER;
+
+        utext_openUChars(&mixed, _mixed, -1, &status);
+        utext_openUChars(&otherDefault, _otherDefault, -1, &status);
+        utext_openUChars(&otherExcludeSpecialI, _otherExcludeSpecialI, -1, &status);
+        utext_openUChars(&different, _different, -1, &status);
+
+        TEST_SUCCESS(status);
+
+        // 'result' is an int32_t but the messages format it with %ld, so it is
+        // cast to long at every errln() call in this function.
+        int32_t result;
+
+        /* test default options */
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&otherDefault, 0);
+        result = utext_caseCompare(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_DEFAULT, &status);
+        if (0 != result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare (other, default) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&otherDefault, 0);
+        result = utext_caseCompareNativeLimit(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_DEFAULT, &status);
+        if (0 != result || U_FAILURE(status)) {
+            errln("error: utext_caseCompareNativeLimit (other, default) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        /* test excluding special I */
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&otherExcludeSpecialI, 0);
+        result = utext_caseCompare(&mixed, -1, &otherExcludeSpecialI, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
+        if (0 != result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare (otherExcludeSpecialI, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&otherExcludeSpecialI, 0);
+        result = utext_caseCompareNativeLimit(&mixed, -1, &otherExcludeSpecialI, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
+        if (0 != result || U_FAILURE(status)) {
+            errln("error: utext_caseCompareNativeLimit (otherExcludeSpecialI, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&otherDefault, 0);
+        result = utext_caseCompare(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
+        if (0 == result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare (other, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be nonzero) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&otherDefault, 0);
+        result = utext_caseCompareNativeLimit(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
+        if (0 == result || U_FAILURE(status)) {
+            errln("error: utext_caseCompareNativeLimit (other, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be nonzero) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        /* test against different string */
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&different, 0);
+        result = utext_caseCompare(&mixed, -1, &different, -1, U_FOLD_CASE_DEFAULT, &status);
+        if (0 >= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare (different, default) gives %ld (should be positive) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&different, 0);
+        result = utext_caseCompareNativeLimit(&mixed, -1, &different, -1, U_FOLD_CASE_DEFAULT, &status);
+        if (0 >= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompareNativeLimit (different, default) gives %ld (should be positive) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        /* test caseCompare() - include the folded sharp s (U+00df) with different lengths */
+        UTEXT_SETNATIVEINDEX(&mixed, 1);
+        UTEXT_SETNATIVEINDEX(&different, 1);
+        result = utext_caseCompare(&mixed, 4, &different, 5, U_FOLD_CASE_DEFAULT, &status);
+        if (0 != result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare (mixed[1-5), different[1-6), default) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 1);
+        UTEXT_SETNATIVEINDEX(&different, 1);
+        result = utext_caseCompareNativeLimit(&mixed, 5, &different, 6, U_FOLD_CASE_DEFAULT, &status);
+        if (0 != result || U_FAILURE(status)) {
+            errln("error: utext_caseCompareNativeLimit (mixed[1-5), different[1-6), default) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        /* test caseCompare() - stop in the middle of the sharp s (U+00df) */
+        UTEXT_SETNATIVEINDEX(&mixed, 1);
+        UTEXT_SETNATIVEINDEX(&different, 1);
+        result = utext_caseCompare(&mixed, 4, &different, 4, U_FOLD_CASE_DEFAULT, &status);
+        if (0 >= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare (mixed[1-5), different[1-5), default) gives %ld (should be positive) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 1);
+        UTEXT_SETNATIVEINDEX(&different, 1);
+        result = utext_caseCompareNativeLimit(&mixed, 5, &different, 5, U_FOLD_CASE_DEFAULT, &status);
+        if (0 >= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompareNativeLimit (mixed[1-5), different[1-5), default) gives %ld (should be positive) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        /* Close what this sub-block opened, as the later sub-blocks do. */
+        utext_close(&mixed);
+        utext_close(&otherDefault);
+        utext_close(&otherExcludeSpecialI);
+        utext_close(&different);
+    }
+
+    /* test surrogates in comparison */
+    {
+        static const UChar
+        _before[] = { 0x65, 0xd800, 0xd800, 0xdc01, 0x65, 0x00 },
+        _after[]  = { 0x65, 0xd800, 0xdc00, 0x65, 0x00 };
+
+        UText
+        before = UTEXT_INITIALIZER,
+        after  = UTEXT_INITIALIZER;
+
+        utext_openUChars(&before, _before, -1, &status);
+        utext_openUChars(&after, _after, -1, &status);
+
+        TEST_SUCCESS(status);
+        int32_t result;
+
+        UTEXT_SETNATIVEINDEX(&before, 1);
+        UTEXT_SETNATIVEINDEX(&after, 1);
+        result = utext_compare(&before, -1, &after, -1);
+        if (0 <= result || U_FAILURE(status)) {
+            errln("error: utext_compare ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        UTEXT_SETNATIVEINDEX(&before, 1);
+        UTEXT_SETNATIVEINDEX(&after, 1);
+        result = utext_compare(&before, 3, &after, 3);
+        if (0 <= result || U_FAILURE(status)) {
+            errln("error: utext_compare with lengths ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        UTEXT_SETNATIVEINDEX(&before, 1);
+        UTEXT_SETNATIVEINDEX(&after, 1);
+        result = utext_caseCompare(&before, -1, &after, -1, U_FOLD_CASE_DEFAULT, &status);
+        if (0 <= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        UTEXT_SETNATIVEINDEX(&before, 1);
+        UTEXT_SETNATIVEINDEX(&after, 1);
+        result = utext_caseCompare(&before, 3, &after, 3, U_FOLD_CASE_DEFAULT, &status);
+        if (0 <= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare with lengths ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        utext_close(&before);
+        utext_close(&after);
+    }
+
+    /* test surrogates at end of string */
+    {
+        static const UChar
+        _before[] = { 0x65, 0xd800, 0xd800, 0xdc01, 0x00 },
+        _after[]  = { 0x65, 0xd800, 0xdc00, 0x00 };
+
+        UText
+        before = UTEXT_INITIALIZER,
+        after  = UTEXT_INITIALIZER;
+
+        utext_openUChars(&before, _before, -1, &status);
+        utext_openUChars(&after, _after, -1, &status);
+
+        TEST_SUCCESS(status);
+        int32_t result;
+
+        UTEXT_SETNATIVEINDEX(&before, 1);
+        UTEXT_SETNATIVEINDEX(&after, 1);
+        result = utext_compare(&before, -1, &after, -1);
+        if (0 <= result || U_FAILURE(status)) {
+            errln("error: utext_compare ({ 65, d800, 10001 }, { 65, 10000 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        UTEXT_SETNATIVEINDEX(&before, 1);
+        UTEXT_SETNATIVEINDEX(&after, 1);
+        result = utext_caseCompare(&before, -1, &after, -1, U_FOLD_CASE_DEFAULT, &status);
+        if (0 <= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare ({ 65, d800, 10001 }, { 65, 10000 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        utext_close(&before);
+        utext_close(&after);
+    }
+
+    /* test empty strings */
+    {
+        UChar zero16 = 0;
+        char zero8 = 0;
+        UText emptyUChar = UTEXT_INITIALIZER;
+        UText emptyUTF8 = UTEXT_INITIALIZER;
+        UText nullUChar = UTEXT_INITIALIZER;
+        UText nullUTF8 = UTEXT_INITIALIZER;
+
+        /* "empty" texts wrap a lone terminator; "null" texts are opened on NULL with length 0. */
+        utext_openUChars(&emptyUChar, &zero16, -1, &status);
+        utext_openUTF8(&emptyUTF8, &zero8, -1, &status);
+        utext_openUChars(&nullUChar, NULL, 0, &status);
+        utext_openUTF8(&nullUTF8, NULL, 0, &status);
+
+        if (utext_compare(&emptyUChar, -1, &emptyUTF8, -1) != 0) {
+            errln("error: utext_compare(&emptyUChar, -1, &emptyUTF8, -1) != 0");
+        }
+        if (utext_compare(&emptyUChar, -1, &nullUChar, -1) != 0) {
+            errln("error: utext_compare(&emptyUChar, -1, &nullUChar, -1) != 0");
+        }
+        if (utext_compare(&emptyUChar, -1, &nullUTF8, -1) != 0) {
+            errln("error: utext_compare(&emptyUChar, -1, &nullUTF8, -1) != 0");
+        }
+        if (utext_compare(&emptyUTF8, -1, &nullUChar, -1) != 0) {
+            errln("error: utext_compare(&emptyUTF8, -1, &nullUChar, -1) != 0");
+        }
+        if (utext_compare(&emptyUTF8, -1, &nullUTF8, -1) != 0) {
+            errln("error: utext_compare(&emptyUTF8, -1, &nullUTF8, -1) != 0");
+        }
+        if (utext_compare(&nullUChar, -1, &nullUTF8, -1) != 0) {
+            errln("error: utext_compare(&nullUChar, -1, &nullUTF8, -1) != 0");
+        }
+
+        if (utext_compareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1) != 0) {
+            errln("error: utext_compareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1) != 0");
+        }
+        if (utext_compareNativeLimit(&emptyUChar, -1, &nullUChar, -1) != 0) {
+            errln("error: utext_compareNativeLimit(&emptyUChar, -1, &nullUChar, -1) != 0");
+        }
+        if (utext_compareNativeLimit(&emptyUChar, -1, &nullUTF8, -1) != 0) {
+            errln("error: utext_compareNativeLimit(&emptyUChar, -1, &nullUTF8, -1) != 0");
+        }
+        if (utext_compareNativeLimit(&emptyUTF8, -1, &nullUChar, -1) != 0) {
+            errln("error: utext_compareNativeLimit(&emptyUTF8, -1, &nullUChar, -1) != 0");
+        }
+        if (utext_compareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1) != 0) {
+            errln("error: utext_compareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1) != 0");
+        }
+        if (utext_compareNativeLimit(&nullUChar, -1, &nullUTF8, -1) != 0) {
+            errln("error: utext_compareNativeLimit(&nullUChar, -1, &nullUTF8, -1) != 0");
+        }
+
+        if (utext_caseCompare(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompare(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompare(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompare(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompare(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompare(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompare(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompare(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompare(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompare(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompare(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompare(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0");
+        }
+
+        if (utext_caseCompareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompareNativeLimit(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompareNativeLimit(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0");
+        }
+
+        utext_close(&emptyUChar);
+        utext_close(&emptyUTF8);
+        utext_close(&nullUChar);
+        utext_close(&nullUTF8);
+    }
+
+    // Close the function-level UTexts opened at the top; the original left them open.
+    utext_close(&test1);
+    utext_close(&test2);
+    utext_close(&test3);
+    utext_close(&test4);
+    utext_close(&uniCharText);
+    utext_close(&charText);
+}
+
+
 
 //
 //  ErrorTest()    Check various error and edge cases.
diff --git a/icu4c/source/test/intltest/utxttest.h b/icu4c/source/test/intltest/utxttest.h
index 5e52f54d308..e34f564c344 100644
--- a/icu4c/source/test/intltest/utxttest.h
+++ b/icu4c/source/test/intltest/utxttest.h
@@ -1,6 +1,6 @@
 /********************************************************************
  * COPYRIGHT:
- * Copyright (c) 2005-2009, International Business Machines Corporation and
+ * Copyright (c) 2005-2010, International Business Machines Corporation and
  * others. All Rights Reserved.
  ********************************************************************/
 /************************************************************************
@@ -33,6 +33,7 @@ public:
     void FreezeTest();
     void Ticket5560();
     void Ticket6847();
+    void ComparisonTest();
 
 private:
     struct m {                              // Map between native indices & code points.