ICU-2145 consistent behavior of binary string searches

X-SVN-Rev: 10055
This commit is contained in:
Markus Scherer 2002-10-24 01:49:58 +00:00
parent 8d1a83e3d5
commit b1246ef900
5 changed files with 680 additions and 444 deletions

View file

@ -890,7 +890,7 @@ public:
int32_t length) const;
/**
* Locate in this the first occurrence of the code unit <TT>c</TT>,
* Locate in this the first occurrence of the BMP code point <code>c</code>,
* using bitwise comparison.
* @param c The code unit to search for.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
@ -902,19 +902,6 @@ public:
* Locate in this the first occurrence of the code point <TT>c</TT>,
* using bitwise comparison.
*
* This function finds code points, which differs for BMP code points
* from indexOf(UChar c, ...) only for surrogates:
* While indexOf(UChar c, ...) finds any surrogate code units in a string,
* indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
* will find code units U+d800 at 0 and U+dc00 at 1,
* but indexOf(UChar32 c, ...) will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that char32At(indexOf(UChar32 c))==c.
*
* @param c The code point to search for.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
@ -922,7 +909,7 @@ public:
inline int32_t indexOf(UChar32 c) const;
/**
* Locate in this the first occurrence of the code unit <TT>c</TT>
* Locate in this the first occurrence of the BMP code point <code>c</code>,
* starting at offset <TT>start</TT>, using bitwise comparison.
* @param c The code unit to search for.
* @param start The offset at which searching will start.
@ -936,19 +923,6 @@ public:
* Locate in this the first occurrence of the code point <TT>c</TT>
* starting at offset <TT>start</TT>, using bitwise comparison.
*
* This function finds code points, which differs for BMP code points
* from indexOf(UChar c, ...) only for surrogates:
* While indexOf(UChar c, ...) finds any surrogate code units in a string,
* indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
* will find code units U+d800 at 0 and U+dc00 at 1,
* but indexOf(UChar32 c, ...) will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that char32At(indexOf(UChar32 c))==c.
*
* @param c The code point to search for.
* @param start The offset at which searching will start.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
@ -958,7 +932,7 @@ public:
int32_t start) const;
/**
* Locate in this the first occurrence of the code unit <TT>c</TT>
* Locate in this the first occurrence of the BMP code point <code>c</code>
* in the range [<TT>start</TT>, <TT>start + length</TT>),
* using bitwise comparison.
* @param c The code unit to search for.
@ -976,19 +950,6 @@ public:
* in the range [<TT>start</TT>, <TT>start + length</TT>),
* using bitwise comparison.
*
* This function finds code points, which differs for BMP code points
* from indexOf(UChar c, ...) only for surrogates:
* While indexOf(UChar c, ...) finds any surrogate code units in a string,
* indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
* will find code units U+d800 at 0 and U+dc00 at 1,
* but indexOf(UChar32 c, ...) will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that char32At(indexOf(UChar32 c))==c.
*
* @param c The code point to search for.
* @param start the offset into this at which to start matching
* @param length the number of characters in this to search
@ -1112,7 +1073,7 @@ public:
int32_t length) const;
/**
* Locate in this the last occurrence of the code unit <TT>c</TT>,
* Locate in this the last occurrence of the BMP code point <code>c</code>,
* using bitwise comparison.
* @param c The code unit to search for.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
@ -1124,19 +1085,6 @@ public:
* Locate in this the last occurrence of the code point <TT>c</TT>,
* using bitwise comparison.
*
* This function finds code points, which differs for BMP code points
* from lastIndexOf(UChar c, ...) only for surrogates:
* While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
* lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
* will find code units U+d800 at 0 and U+dc00 at 1,
* but lastIndexOf(UChar32 c, ...) will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
*
* @param c The code point to search for.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
@ -1144,7 +1092,7 @@ public:
inline int32_t lastIndexOf(UChar32 c) const;
/**
* Locate in this the last occurrence of the code unit <TT>c</TT>
* Locate in this the last occurrence of the BMP code point <code>c</code>
* starting at offset <TT>start</TT>, using bitwise comparison.
* @param c The code unit to search for.
* @param start The offset at which searching will start.
@ -1158,19 +1106,6 @@ public:
* Locate in this the last occurrence of the code point <TT>c</TT>
* starting at offset <TT>start</TT>, using bitwise comparison.
*
* This function finds code points, which differs for BMP code points
* from lastIndexOf(UChar c, ...) only for surrogates:
* While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
* lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
* will find code units U+d800 at 0 and U+dc00 at 1,
* but lastIndexOf(UChar32 c, ...) will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
*
* @param c The code point to search for.
* @param start The offset at which searching will start.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
@ -1180,7 +1115,7 @@ public:
int32_t start) const;
/**
* Locate in this the last occurrence of the code unit <TT>c</TT>
* Locate in this the last occurrence of the BMP code point <code>c</code>
* in the range [<TT>start</TT>, <TT>start + length</TT>),
* using bitwise comparison.
* @param c The code unit to search for.
@ -1198,19 +1133,6 @@ public:
* in the range [<TT>start</TT>, <TT>start + length</TT>),
* using bitwise comparison.
*
* This function finds code points, which differs for BMP code points
* from lastIndexOf(UChar c, ...) only for surrogates:
* While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
* lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
* will find code units U+d800 at 0 and U+dc00 at 1,
* but lastIndexOf(UChar32 c, ...) will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
*
* @param c The code point to search for.
* @param start the offset into this at which to start matching
* @param length the number of characters in this to search
@ -2981,7 +2903,6 @@ private:
int32_t start,
int32_t length) const;
// only for c>=0xd800
int32_t doIndexOf(UChar32 c,
int32_t start,
int32_t length) const;
@ -2990,7 +2911,6 @@ private:
int32_t start,
int32_t length) const;
// only for c>=0xd800
int32_t doLastIndexOf(UChar32 c,
int32_t start,
int32_t length) const;
@ -3490,11 +3410,7 @@ inline int32_t
UnicodeString::indexOf(UChar32 c,
int32_t start,
int32_t length) const {
if((uint32_t)c<0xd800) {
return doIndexOf((UChar)c, start, length);
} else {
return doIndexOf(c, start, length);
}
return doIndexOf(c, start, length);
}
inline int32_t
@ -3571,11 +3487,7 @@ inline int32_t
UnicodeString::lastIndexOf(UChar32 c,
int32_t start,
int32_t length) const {
if((uint32_t)c<0xd800) {
return doLastIndexOf((UChar)c, start, length);
} else {
return doLastIndexOf(c, start, length);
}
return doLastIndexOf(c, start, length);
}
inline UBool

View file

@ -27,7 +27,7 @@
* \file
* \brief C API: Unicode string handling functions
*
* These C API functions provide Unicode string handling.
* These C API functions provide general Unicode string handling.
*
* Some functions are equivalent in name, signature, and behavior to the ANSI C <string.h>
* functions. (For example, they do not check for bad arguments like NULL string pointers.)
@ -39,25 +39,32 @@
*
* ICU uses 16-bit Unicode (UTF-16) in the form of arrays of UChar code units.
* UTF-16 encodes each Unicode code point with either one or two UChar code units.
* Some APIs accept a 32-bit UChar32 value for a single code point.
* (This is the default form of Unicode, and a forward-compatible extension of the original,
* fixed-width form that was known as UCS-2. UTF-16 superseded UCS-2 with Unicode 2.0
* in 1996.)
*
* Some APIs accept a 32-bit UChar32 value for a single code point.
*
* ICU also handles 16-bit Unicode text with unpaired surrogates.
* Such text is not well-formed UTF-16.
* Code-point-related functions treat unpaired surrogates as surrogate code points,
* i.e., as separate units.
*
* Although UTF-16 is a variable-width encoding form (like some legacy multi-byte encodings),
* it is much more efficient even for random access because the code unit values
* for single-unit characters vs. lead units vs. trail units are completely disjoint.
* This means that it is easy to determine character (code point) boundaries from
* random offsets in the string.
* (It also means, e.g., that u_strstr() does not need to verify that a match was
* found on actual character boundaries; with some legacy encodings, strstr() may need to
* scan back to the start of the text to verify this.)
*
* Unicode (UTF-16) string processing is optimized for the single-unit case.
* Although it is important to support supplementary characters
* (which use pairs of lead/trail code units called "surrogates"),
* their occurrence is rare. Almost all characters in modern use require only
* a single UChar code unit (i.e., their code point values are <=0xffff).
*
* For more details see the User Guide Strings chapter (http://oss.software.ibm.com/icu/userguide/strings.html).
* For a discussion of the handling of unpaired surrogates see also
* Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18.
*/
/**
@ -137,59 +144,180 @@ u_strncat(UChar *dst,
const UChar *src,
int32_t n);
/**
* Find the first occurrence of a specified character in a ustring.
*
* @param s The string to search.
* @param c The character to find.
* @return A pointer to the first occurrence of <TT>c</TT> in <TT>s</TT>,
* or a null pointer if <TT>s</TT> does not contain <TT>c</TT>.
* @stable
*/
U_CAPI UChar* U_EXPORT2
u_strchr(const UChar *s,
UChar c);
/**
* Find the first occurrence of a substring in a string.
* The substring is found at code point boundaries.
* That means that if the substring begins with
* a trail surrogate or ends with a lead surrogate,
* then it is found only if these surrogates stand alone in the text.
* Otherwise, the substring edge units would be matched against
* halves of surrogate pairs.
*
* @param s The string to search.
* @param substring The substring to find
* @return A pointer to the first occurrence of <TT>substring</TT> in
* <TT>s</TT>, or a null pointer if <TT>substring</TT>
* is not in <TT>s</TT>.
* @param s The string to search (NUL-terminated).
* @param substring The substring to find (NUL-terminated).
* @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
* or <code>s</code> itself if the <code>substring</code> is empty,
* or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
* @stable
*
* @see u_strrstr
* @see u_strFindFirst
* @see u_strFindLast
*/
U_CAPI UChar * U_EXPORT2
u_strstr(const UChar *s, const UChar *substring);
/**
* Find the first occurrence of a specified code point in a string.
*
* This function finds code points, which differs for BMP code points
* from u_strchr() only for surrogates:
* While u_strchr() finds any surrogate code units in a string,
* u_strchr32() finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" u_strchr()
* will find code units U+d800 at 0 and U+dc00 at 1,
* but u_strchr32() will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that U16_GET(u_strchr32(c))==c.
* Find the first occurrence of a substring in a string.
* The substring is found at code point boundaries.
* That means that if the substring begins with
* a trail surrogate or ends with a lead surrogate,
* then it is found only if these surrogates stand alone in the text.
* Otherwise, the substring edge units would be matched against
* halves of surrogate pairs.
*
* @param s The string to search.
* @param c The code point (0..0x10ffff) to find.
* @return A pointer to the first occurrence of <TT>c</TT> in <TT>s</TT>,
* or a null pointer if there is no such character.
* If <TT>c</TT> is represented with several UChars, then the returned
* pointer will point to the first of them.
* @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
* @param substring The substring to find (must be NUL-terminated only if subLength is -1).
* @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
* @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
* or <code>s</code> itself if the <code>substring</code> is empty,
* or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
* @stable
*
* @see u_strstr
* @see u_strFindLast
*/
U_CAPI UChar * U_EXPORT2
u_strFindFirst(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
/**
* Find the first occurrence of a BMP code point in a string.
* A surrogate code point is found only if its match in the text is not
* part of a surrogate pair.
* A NUL character is found at the string terminator.
*
* @param s The string to search (NUL-terminated).
* @param c The BMP code point to find.
* @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
* @stable
*
* @see u_strchr32
* @see u_memchr
* @see u_strstr
* @see u_strFindFirst
*/
U_CAPI UChar * U_EXPORT2
u_strchr(const UChar *s, UChar c);
/**
* Find the first occurrence of a code point in a string.
* A surrogate code point is found only if its match in the text is not
* part of a surrogate pair.
* A NUL character is found at the string terminator.
*
* @param s The string to search (NUL-terminated).
* @param c The code point to find.
* @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
* @stable
*
* @see u_strchr
* @see u_memchr32
* @see u_strstr
* @see u_strFindFirst
*/
U_CAPI UChar * U_EXPORT2
u_strchr32(const UChar *s, UChar32 c);
/**
* Find the last occurrence of a substring in a string.
* The substring is found at code point boundaries.
* That means that if the substring begins with
* a trail surrogate or ends with a lead surrogate,
* then it is found only if these surrogates stand alone in the text.
* Otherwise, the substring edge units would be matched against
* halves of surrogate pairs.
*
* @param s The string to search (NUL-terminated).
* @param substring The substring to find (NUL-terminated).
* @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
* or <code>s</code> itself if the <code>substring</code> is empty,
* or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
* @stable
*
* @see u_strstr
* @see u_strFindFirst
* @see u_strFindLast
*/
U_CAPI UChar * U_EXPORT2
u_strrstr(const UChar *s, const UChar *substring);
/**
* Find the last occurrence of a substring in a string.
* The substring is found at code point boundaries.
* That means that if the substring begins with
* a trail surrogate or ends with a lead surrogate,
* then it is found only if these surrogates stand alone in the text.
* Otherwise, the substring edge units would be matched against
* halves of surrogate pairs.
*
* @param s The string to search.
* @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
* @param substring The substring to find (must be NUL-terminated only if subLength is -1).
* @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
* @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
* or <code>s</code> itself if the <code>substring</code> is empty,
* or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
* @stable
*
* @see u_strrstr
* @see u_strstr
* @see u_strFindFirst
*/
U_CAPI UChar * U_EXPORT2
u_strFindLast(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
/**
* Find the last occurrence of a BMP code point in a string.
* A surrogate code point is found only if its match in the text is not
* part of a surrogate pair.
* A NUL character is found at the string terminator.
*
* @param s The string to search (NUL-terminated).
* @param c The BMP code point to find.
* @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
* @stable
*
* @see u_strrchr32
* @see u_memrchr
* @see u_strrstr
* @see u_strFindLast
*/
U_CAPI UChar * U_EXPORT2
u_strrchr(const UChar *s, UChar c);
/**
* Find the last occurrence of a code point in a string.
* A surrogate code point is found only if its match in the text is not
* part of a surrogate pair.
* A NUL character is found at the string terminator.
*
* @param s The string to search (NUL-terminated).
* @param c The code point to find.
* @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
* @stable
*
* @see u_strrchr
* @see u_memrchr32
* @see u_strrstr
* @see u_strFindLast
*/
U_CAPI UChar * U_EXPORT2
u_strrchr32(const UChar *s, UChar32 c);
/**
* Locates the first occurrence in the string str of any of the characters
* in the string accept.
@ -621,46 +749,84 @@ U_CAPI int32_t U_EXPORT2
u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count);
/**
* Search for a UChar within a Unicode string until <TT>count</TT>
* is reached.
* Find the first occurrence of a BMP code point in a string.
* A surrogate code point is found only if its match in the text is not
* part of a surrogate pair.
* A NUL character is found at the string terminator.
*
* @param src string to search in
* @param ch character to find
* @param count maximum number of UChars in <TT>src</TT>to search for
* <TT>ch</TT>.
* @return A pointer within src, pointing to <TT>ch</TT>, or NULL if it
* was not found.
* @param s The string to search (contains <code>count</code> UChars).
* @param c The BMP code point to find.
* @param count The length of the string.
* @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
* @stable
*
* @see u_strchr
* @see u_memchr32
* @see u_strFindFirst
*/
U_CAPI UChar* U_EXPORT2
u_memchr(const UChar *src, UChar ch, int32_t count);
u_memchr(const UChar *s, UChar c, int32_t count);
/**
* Find the first occurrence of a specified code point in a string.
* Find the first occurrence of a code point in a string.
* A surrogate code point is found only if its match in the text is not
* part of a surrogate pair.
* A NUL character is found at the string terminator.
*
* This function finds code points, which differs for BMP code points
* from u_memchr() only for surrogates:
* While u_memchr() finds any surrogate code units in a string,
* u_memchr32() finds only unmatched surrogate code points,
* i.e., only those that do not combine with an adjacent surrogate
* to form a supplementary code point.
* For example, in a string "\ud800\udc00" u_memchr()
* will find code units U+d800 at 0 and U+dc00 at 1,
* but u_memchr32() will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that U16_GET(u_memchr32(c))==c.
*
* @param src string to search in
* @param ch character to find
* @param count maximum number of UChars in <TT>src</TT>to search for
* <TT>ch</TT>.
* @return A pointer within src, pointing to <TT>ch</TT>, or NULL if it
* was not found.
* @param s The string to search (contains <code>count</code> UChars).
* @param c The code point to find.
* @param count The length of the string.
* @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
* @stable
*
* @see u_strchr32
* @see u_memchr
* @see u_strFindFirst
*/
U_CAPI UChar* U_EXPORT2
u_memchr32(const UChar *src, UChar32 ch, int32_t count);
u_memchr32(const UChar *s, UChar32 c, int32_t count);
/**
* Find the last occurrence of a BMP code point in a string.
* A surrogate code point is found only if its match in the text is not
* part of a surrogate pair.
* A NUL character is found at the string terminator.
*
* @param s The string to search (contains <code>count</code> UChars).
* @param c The BMP code point to find.
* @param count The length of the string.
* @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
* @stable
*
* @see u_strrchr
* @see u_memrchr32
* @see u_strFindLast
*/
U_CAPI UChar* U_EXPORT2
u_memrchr(const UChar *s, UChar c, int32_t count);
/**
* Find the last occurrence of a code point in a string.
* A surrogate code point is found only if its match in the text is not
* part of a surrogate pair.
* A NUL character is found at the string terminator.
*
* @param s The string to search (contains <code>count</code> UChars).
* @param c The code point to find.
* @param count The length of the string.
* @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
* @stable
*
* @see u_strrchr32
* @see u_memrchr
* @see u_strFindLast
*/
U_CAPI UChar* U_EXPORT2
u_memrchr32(const UChar *s, UChar32 c, int32_t count);
/**
* Unicode String literals in C.

View file

@ -814,42 +814,21 @@ UnicodeString::indexOf(const UChar *srcChars,
return -1;
}
// get the srcLength if necessary
if(srcLength < 0) {
srcLength = u_strlen(srcChars + srcStart);
if(srcLength == 0) {
return -1;
}
// UnicodeString does not find empty substrings
if(srcLength < 0 && srcChars[srcStart] == 0) {
return -1;
}
// now we will only work with srcLength-1
--srcLength;
// get the indices within bounds
pinIndices(start, length);
// set length for the last possible match start position
// note the --srcLength above
length -= srcLength;
if(length <= 0) {
// find the first occurrence of the substring
const UChar *match = u_strFindFirst(fArray + start, length, srcChars + srcStart, srcLength);
if(match == NULL) {
return -1;
} else {
return match - fArray;
}
const UChar *array = getArrayStart();
int32_t limit = start + length;
// search for the first char, then compare the rest of the string
// increment srcStart here for that, matching the --srcLength above
UChar ch = srcChars[srcStart++];
do {
if(array[start] == ch && (srcLength == 0 || compare(start + 1, srcLength, srcChars, srcStart, srcLength) == 0)) {
return start;
}
} while(++start < limit);
return -1;
}
int32_t
@ -859,21 +838,14 @@ UnicodeString::doIndexOf(UChar c,
{
// pin indices
pinIndices(start, length);
if(length == 0) {
return -1;
}
// find the first occurrence of c
const UChar *begin = getArrayStart() + start;
const UChar *limit = begin + length;
do {
if(*begin == c) {
return (int32_t)(begin - getArrayStart());
}
} while(++begin < limit);
return -1;
const UChar *match = u_memchr(fArray + start, c, length);
if(match == NULL) {
return -1;
} else {
return match - fArray;
}
}
int32_t
@ -882,26 +854,13 @@ UnicodeString::doIndexOf(UChar32 c,
int32_t length) const {
// pin indices
pinIndices(start, length);
if(length == 0) {
return -1;
}
// c<0xd800 handled by inline function indexOf(UChar32 c, start, length)
if(c<=0xdfff) {
// surrogate code point
const UChar *t = uprv_strFindSurrogate(fArray + start, length, (UChar)c);
if(t != 0) {
return (int32_t)(t - fArray);
} else {
return -1;
}
} else if(c<=0xffff) {
// non-surrogate BMP code point
return doIndexOf((UChar)c, start, length);
// find the first occurrence of c
const UChar *match = u_memchr32(fArray + start, c, length);
if(match == NULL) {
return -1;
} else {
// supplementary code point, search for string
UChar buffer[2] = { UTF16_LEAD(c), UTF16_TRAIL(c) };
return indexOf(buffer, 2, start, length);
return match - fArray;
}
}
@ -916,43 +875,21 @@ UnicodeString::lastIndexOf(const UChar *srcChars,
return -1;
}
// get the srcLength if necessary
if(srcLength < 0) {
srcLength = u_strlen(srcChars + srcStart);
if(srcLength == 0) {
return -1;
}
// UnicodeString does not find empty substrings
if(srcLength < 0 && srcChars[srcStart] == 0) {
return -1;
}
// now we will only work with srcLength-1
--srcLength;
// get the indices within bounds
pinIndices(start, length);
// set length for the last possible match start position
// note the --srcLength above
length -= srcLength;
if(length <= 0) {
// find the last occurrence of the substring
const UChar *match = u_strFindLast(fArray + start, length, srcChars + srcStart, srcLength);
if(match == NULL) {
return -1;
} else {
return match - fArray;
}
const UChar *array = getArrayStart();
int32_t pos;
// search for the first char, then compare the rest of the string
// increment srcStart here for that, matching the --srcLength above
UChar ch = srcChars[srcStart++];
pos = start + length;
do {
if(array[--pos] == ch && (srcLength == 0 || compare(pos + 1, srcLength, srcChars, srcStart, srcLength) == 0)) {
return pos;
}
} while(pos > start);
return -1;
}
int32_t
@ -966,20 +903,14 @@ UnicodeString::doLastIndexOf(UChar c,
// pin indices
pinIndices(start, length);
if(length == 0) {
// find the last occurrence of c
const UChar *match = u_memrchr(fArray + start, c, length);
if(match == NULL) {
return -1;
} else {
return match - fArray;
}
const UChar *begin = getArrayStart() + start;
const UChar *limit = begin + length;
do {
if(*--limit == c) {
return (int32_t)(limit - getArrayStart());
}
} while(limit > begin);
return -1;
}
int32_t
@ -988,26 +919,13 @@ UnicodeString::doLastIndexOf(UChar32 c,
int32_t length) const {
// pin indices
pinIndices(start, length);
if(length == 0) {
return -1;
}
// c<0xd800 handled by inline function lastIndexOf(UChar32 c, start, length)
if(c<=0xdfff) {
// surrogate code point
const UChar *t = uprv_strFindLastSurrogate(fArray + start, length, (UChar)c);
if(t != 0) {
return (int32_t)(t - fArray);
} else {
return -1;
}
} else if(c<=0xffff) {
// non-surrogate BMP code point
return doLastIndexOf((UChar)c, start, length);
// find the last occurrence of c
const UChar *match = u_memrchr32(fArray + start, c, length);
if(match == NULL) {
return -1;
} else {
// supplementary code point, search for string
UChar buffer[2] = { UTF16_LEAD(c), UTF16_TRAIL(c) };
return lastIndexOf(buffer, 2, start, length);
return match - fArray;
}
}

View file

@ -241,20 +241,4 @@ u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCod
#define u_getMaxCaseExpansion() 10
/**
* Find a single (unmatched) surrogate code point in the string s[0..length[ .
* Find the first such surrogate.
* @internal
*/
U_CFUNC const UChar *
uprv_strFindSurrogate(const UChar *s, int32_t length, UChar surrogate);
/**
* Find a single (unmatched) surrogate code point in the string s[0..length[ .
* Find the last such surrogate.
* @internal
*/
U_CFUNC const UChar *
uprv_strFindLastSurrogate(const UChar *s, int32_t length, UChar surrogate);
#endif

View file

@ -34,129 +34,427 @@ static UConverter *gDefaultConverter = NULL;
#define MAX_STRLEN 0x0FFFFFFF
/* ---- String searching functions ---- */
/* Forward binary string search functions ----------------------------------- */
U_CAPI UChar* U_EXPORT2
u_strchr(const UChar *s, UChar c)
{
while (*s && *s != c) {
++s;
}
if (*s == c)
return (UChar *)s;
return NULL;
}
/* A Boyer-Moore algorithm would be better, but that would require a hashtable
because UChar is so big. This algorithm doesn't use a lot of extra memory.
*/
U_CAPI UChar * U_EXPORT2
u_strstr(const UChar *s, const UChar *substring) {
UChar *strItr, *subItr;
if (*substring == 0) {
return (UChar *)s;
}
do {
strItr = (UChar *)s;
subItr = (UChar *)substring;
/* Only one string iterator needs checking for null terminator */
while ((*strItr != 0) && (*strItr == *subItr)) {
strItr++;
subItr++;
}
if (*subItr == 0) { /* Was the end of the substring reached? */
return (UChar *)s;
}
s++;
} while (*strItr != 0); /* Was the end of the string reached? */
return NULL; /* No match */
}
/**
* Check if there is an unmatched surrogate c in a string [start..limit[ at s.
* start<=s<limit or limit==NULL
* @return TRUE if *s is unmatched
/*
* Test if a substring match inside a string is at code point boundaries.
* All pointers refer to the same buffer.
* The limit pointer may be NULL, all others must be real pointers.
*/
static U_INLINE UBool
uprv_isSingleSurrogate(const UChar *start, const UChar *s, UChar c, const UChar *limit) {
if(UTF_IS_SURROGATE_FIRST(c)) {
++s;
return (UBool)(s==limit || !UTF_IS_TRAIL(*s));
} else {
return (UBool)(s==start || !UTF_IS_LEAD(*(s-1)));
isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) {
    /*
     * Decide whether the match [match, matchLimit[ found inside the text
     * beginning at start lies on code point boundaries, i.e., whether
     * neither edge of the match cuts a surrogate pair in half.
     *
     * start      first UChar of the searched text
     * match      first UChar of the match (start<=match)
     * matchLimit one past the last UChar of the match (match<matchLimit)
     * limit      one past the last UChar of the text, or NULL; the callers
     *            pass NULL when the text is NUL-terminated, in which case
     *            *matchLimit is at worst the terminating NUL
     *
     * Returns TRUE if the match is well-formed, FALSE if an edge splits a pair.
     */
    if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
        /* the leading edge of the match is in the middle of a surrogate pair */
        return FALSE;
    }
    /*
     * Compare matchLimit (not match) against limit: *matchLimit may only be
     * read when the match does not end exactly at the end of the text.
     * The && order guarantees that the dereference is skipped in that case.
     */
    if(U16_IS_LEAD(*(matchLimit-1)) && matchLimit!=limit && U16_IS_TRAIL(*matchLimit)) {
        /* the trailing edge of the match is in the middle of a surrogate pair */
        return FALSE;
    }
    return TRUE;
}
U_CFUNC const UChar *
uprv_strFindSurrogate(const UChar *s, int32_t length, UChar surrogate) {
const UChar *limit, *t;
UChar c;
U_CAPI UChar * U_EXPORT2
u_strFindFirst(const UChar *s, int32_t length,
const UChar *sub, int32_t subLength) {
const UChar *start, *p, *q, *subLimit;
UChar c, cs, cq;
if(length>=0) {
limit=s+length;
} else {
limit=NULL;
if(sub==NULL || subLength<-1) {
return (UChar *)s;
}
if(s==NULL || length<-1) {
return NULL;
}
for(t=s; t!=limit && ((c=*t)!=0 || limit!=NULL); ++t) {
if(c==surrogate && uprv_isSingleSurrogate(s, t, c, limit)) {
return t;
start=s;
if(length<0 && subLength<0) {
/* both strings are NUL-terminated */
if((cs=*sub++)==0) {
return (UChar *)s;
}
if(*sub==0 && !U16_IS_SURROGATE(cs)) {
/* the substring consists of a single, non-surrogate BMP code point */
return u_strchr(s, cs);
}
while((c=*s++)!=0) {
if(c==cs) {
/* found first substring UChar, compare rest */
p=s;
q=sub;
for(;;) {
if((cq=*q)==0) {
if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
return (UChar *)(s-1); /* well-formed match */
} else {
break; /* no match because surrogate pair is split */
}
}
if((c=*p)==0) {
return NULL; /* no match, and none possible after s */
}
if(c!=cq) {
break; /* no match */
}
++p;
++q;
}
}
}
/* not found */
return NULL;
}
if(subLength<0) {
subLength=u_strlen(sub);
}
if(subLength==0) {
return (UChar *)s;
}
/* get sub[0] to search for it fast */
cs=*sub++;
--subLength;
subLimit=sub+subLength;
if(subLength==0 && !U16_IS_SURROGATE(cs)) {
/* the substring consists of a single, non-surrogate BMP code point */
return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
}
if(length<0) {
/* s is NUL-terminated */
while((c=*s++)!=0) {
if(c==cs) {
/* found first substring UChar, compare rest */
p=s;
q=sub;
for(;;) {
if(q==subLimit) {
if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
return (UChar *)(s-1); /* well-formed match */
} else {
break; /* no match because surrogate pair is split */
}
}
if((c=*p)==0) {
return NULL; /* no match, and none possible after s */
}
if(c!=*q) {
break; /* no match */
}
++p;
++q;
}
}
}
} else {
const UChar *limit, *preLimit;
/* subLength was decremented above */
if(length<=subLength) {
return NULL; /* s is shorter than sub */
}
limit=s+length;
/* the substring must start before preLimit */
preLimit=limit-subLength;
while(s!=preLimit) {
c=*s++;
if(c==cs) {
/* found first substring UChar, compare rest */
p=s;
q=sub;
for(;;) {
if(q==subLimit) {
if(isMatchAtCPBoundary(start, s-1, p, limit)) {
return (UChar *)(s-1); /* well-formed match */
} else {
break; /* no match because surrogate pair is split */
}
}
if(*p!=*q) {
break; /* no match */
}
++p;
++q;
}
}
}
}
/* not found */
return NULL;
}
U_CFUNC const UChar *
uprv_strFindLastSurrogate(const UChar *s, int32_t length, UChar surrogate) {
const UChar *limit, *t;
UChar c;
/*
 * Find the first occurrence of the NUL-terminated substring in the
 * NUL-terminated string s by delegating to u_strFindFirst().
 */
U_CAPI UChar * U_EXPORT2
u_strstr(const UChar *s, const UChar *substring) {
/* a length of -1 tells u_strFindFirst() that the corresponding string is NUL-terminated */
return u_strFindFirst(s, -1, substring, -1);
}
if(length>=0) {
limit=s+length;
U_CAPI UChar * U_EXPORT2
u_strchr(const UChar *s, UChar c) {
if(U16_IS_SURROGATE(c)) {
/* make sure to not find half of a surrogate pair */
return u_strFindFirst(s, -1, &c, 1);
} else {
limit=s+u_strlen(s);
}
UChar cs;
for(t=limit; t!=s;) {
c=*--t;
if(c==surrogate && uprv_isSingleSurrogate(s, t, c, limit)) {
return t;
/* trivial search for a BMP code point */
for(;;) {
if((cs=*s)==c) {
return (UChar *)s;
}
if(cs==0) {
return NULL;
}
++s;
}
}
return NULL;
}
U_CAPI UChar * U_EXPORT2
u_strchr32(const UChar *s, UChar32 c) {
if(c < 0xd800) {
/* non-surrogate BMP code point */
return u_strchr(s, (UChar)c);
} else if(c <= 0xdfff) {
/* surrogate code point */
return (UChar *)uprv_strFindSurrogate(s, -1, (UChar)c);
} else if(c <= 0xffff) {
/* non-surrogate BMP code point */
return u_strchr(s, (UChar)c);
} else {
/* supplementary code point, search for string */
UChar buffer[3];
if((uint32_t)c<=0xffff) {
/* find BMP code point */
return u_strchr(s, (UChar)c);
} else if((uint32_t)c<=0x10ffff) {
/* find supplementary code point as surrogate pair */
UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
buffer[0] = UTF16_LEAD(c);
buffer[1] = UTF16_TRAIL(c);
buffer[2] = 0;
return u_strstr(s, buffer);
}
while((cs=*s++)!=0) {
if(cs==lead && *s==trail) {
return (UChar *)(s-1);
}
}
return NULL;
} else {
/* not a Unicode code point, not findable */
return NULL;
}
}
/*
 * Find the first occurrence of the BMP code point c in s[0..count[.
 * Returns a pointer to it, or NULL if it is not found or count<=0.
 */
U_CAPI UChar * U_EXPORT2
u_memchr(const UChar *s, UChar c, int32_t count) {
    const UChar *end;

    if(count<=0) {
        /* no string */
        return NULL;
    }
    if(U16_IS_SURROGATE(c)) {
        /* go through the substring search so that half of a surrogate pair is not found */
        return u_strFindFirst(s, count, &c, 1);
    }

    /* plain linear scan for a non-surrogate code unit */
    for(end=s+count; s!=end; ++s) {
        if(*s==c) {
            return (UChar *)s;
        }
    }
    return NULL;
}
/*
 * Find the first occurrence of the code point c in s[0..count[.
 * BMP code points are handled by u_memchr(); supplementary code points
 * are searched for as a lead/trail surrogate pair.
 * Returns NULL if c is not found or is not a Unicode code point.
 */
U_CAPI UChar * U_EXPORT2
u_memchr32(const UChar *s, UChar32 c, int32_t count) {
    const UChar *leadLimit;
    UChar lead, trail;

    if((uint32_t)c<=0xffff) {
        /* find BMP code point */
        return u_memchr(s, (UChar)c, count);
    }
    if(count<2 || (uint32_t)c>0x10ffff) {
        /* too short for a surrogate pair, or not a Unicode code point */
        return NULL;
    }

    lead=U16_LEAD(c);
    trail=U16_TRAIL(c);

    /* a lead unit must be followed by a trail unit, so the last unit cannot start a pair */
    for(leadLimit=s+count-1; s!=leadLimit; ++s) {
        if(s[0]==lead && s[1]==trail) {
            return (UChar *)s;
        }
    }
    return NULL;
}
/* Backward binary string search functions ---------------------------------- */
/*
 * Find the last occurrence of sub[0..subLength[ in s[0..length[ using
 * bitwise comparison. A length of -1 means the string is NUL-terminated.
 * A candidate match is rejected if isMatchAtCPBoundary() reports that one
 * of its edges splits a surrogate pair.
 *
 * Returns s if sub is NULL, subLength<-1, or the substring is empty;
 * returns NULL if s is NULL, length<-1, or there is no match;
 * otherwise returns a pointer to the start of the last match.
 */
U_CAPI UChar * U_EXPORT2
u_strFindLast(const UChar *s, int32_t length,
              const UChar *sub, int32_t subLength) {
    const UChar *start, *limit, *textLimit, *p, *q, *subLimit;
    UChar c, cs;

    if(sub==NULL || subLength<-1) {
        return (UChar *)s;
    }
    if(s==NULL || length<-1) {
        return NULL;
    }

    /*
     * Unlike the forward search there is no special code for NUL-terminated
     * strings: both lengths are determined up front and the text is then
     * scanned backward from its end.
     */
    if(subLength<0) {
        subLength=u_strlen(sub);
    }
    if(subLength==0) {
        return (UChar *)s;
    }

    /* get sub[subLength-1] to search for it fast; subLimit now points at that last unit */
    subLimit=sub+subLength;
    cs=*(--subLimit);
    --subLength;

    if(subLength==0 && !U16_IS_SURROGATE(cs)) {
        /* the substring consists of a single, non-surrogate BMP code point */
        return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
    }

    if(length<0) {
        length=u_strlen(s);
    }

    /* subLength was decremented above */
    if(length<=subLength) {
        return NULL; /* s is shorter than sub */
    }

    start=s;
    limit=s+length;

    /*
     * Remember the real end of the text before s is moved: the boundary check
     * needs start+length. Using s+length after the adjustment below would
     * point subLength units past the end of the text.
     */
    textLimit=limit;

    /* the last unit of a match cannot be before s+subLength */
    s+=subLength;

    while(s!=limit) {
        c=*(--limit);
        if(c==cs) {
            /* found last substring UChar, compare rest backward */
            p=limit;
            q=subLimit;
            for(;;) {
                if(q==sub) {
                    /* all of sub matched: the match is [p..limit+1[ */
                    if(isMatchAtCPBoundary(start, p, limit+1, textLimit)) {
                        return (UChar *)p; /* well-formed match */
                    } else {
                        break; /* no match because surrogate pair is split */
                    }
                }
                if(*(--p)!=*(--q)) {
                    break; /* no match */
                }
            }
        }
    }

    /* not found */
    return NULL;
}
/*
 * Find the last occurrence of the BMP code point c in the NUL-terminated
 * string s. The terminator itself takes part in the comparison, so c==0
 * yields a pointer to the terminating NUL.
 * Returns NULL if c does not occur.
 */
U_CAPI UChar * U_EXPORT2
u_strrchr(const UChar *s, UChar c) {
    const UChar *last=NULL;

    if(U16_IS_SURROGATE(c)) {
        /* go through the substring search so that half of a surrogate pair is not found */
        return u_strFindLast(s, -1, &c, 1);
    }

    /* single forward pass that remembers the most recent hit */
    do {
        if(*s==c) {
            last=s;
        }
    } while(*s++!=0);

    return (UChar *)last;
}
/*
 * Find the last occurrence of the code point c in the NUL-terminated string s.
 * BMP code points are handled by u_strrchr(); supplementary code points
 * are searched for as a lead/trail surrogate pair.
 * Returns NULL if c is not found or is not a Unicode code point.
 */
U_CAPI UChar * U_EXPORT2
u_strrchr32(const UChar *s, UChar32 c) {
    const UChar *last=NULL;
    UChar lead, trail;

    if((uint32_t)c<=0xffff) {
        /* find BMP code point */
        return u_strrchr(s, (UChar)c);
    }
    if((uint32_t)c>0x10ffff) {
        /* not a Unicode code point, not findable */
        return NULL;
    }

    lead=U16_LEAD(c);
    trail=U16_TRAIL(c);

    /* forward pass that remembers the most recent pair; s[1] is only read after a lead unit */
    for(; *s!=0; ++s) {
        if(s[0]==lead && s[1]==trail) {
            last=s;
        }
    }
    return (UChar *)last;
}
/*
 * Find the last occurrence of the BMP code point c in s[0..count[.
 * Returns a pointer to it, or NULL if it is not found or count<=0.
 */
U_CAPI UChar * U_EXPORT2
u_memrchr(const UChar *s, UChar c, int32_t count) {
    const UChar *p;

    if(count<=0) {
        /* no string */
        return NULL;
    }
    if(U16_IS_SURROGATE(c)) {
        /* go through the substring search so that half of a surrogate pair is not found */
        return u_strFindLast(s, count, &c, 1);
    }

    /* plain backward scan for a non-surrogate code unit */
    for(p=s+count; p!=s;) {
        --p;
        if(*p==c) {
            return (UChar *)p;
        }
    }
    return NULL;
}
/*
 * Find the last occurrence of the code point c in s[0..count[.
 * BMP code points are handled by u_memrchr(); supplementary code points
 * are searched for as a lead/trail surrogate pair.
 * Returns NULL if c is not found or is not a Unicode code point.
 */
U_CAPI UChar * U_EXPORT2
u_memrchr32(const UChar *s, UChar32 c, int32_t count) {
    const UChar *trailPos;
    UChar lead, trail;

    if((uint32_t)c<=0xffff) {
        /* find BMP code point */
        return u_memrchr(s, (UChar)c, count);
    }
    if(count<2 || (uint32_t)c>0x10ffff) {
        /* too short for a surrogate pair, or not a Unicode code point */
        return NULL;
    }

    lead=U16_LEAD(c);
    trail=U16_TRAIL(c);

    /* walk the candidate trail position backward; s[0] can never be the trail of a pair */
    for(trailPos=s+count-1; trailPos!=s; --trailPos) {
        if(trailPos[0]==trail && trailPos[-1]==lead) {
            return (UChar *)(trailPos-1);
        }
    }
    return NULL;
}
/* Tokenization functions --------------------------------------------------- */
/*
* Match each code point in a string against each code point in the matchSet.
* Return the index of the first string code point that
@ -321,6 +619,8 @@ u_strtok_r(UChar *src,
return NULL;
}
/* Miscellaneous functions -------------------------------------------------- */
U_CAPI UChar* U_EXPORT2
u_strcat(UChar *dst,
const UChar *src)
@ -754,50 +1054,6 @@ u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
return uprv_strCompare(s1, count, s2, count, FALSE, TRUE);
}
/*
 * Find the first code unit equal to ch in src[0..count[.
 * Returns a pointer to it, or NULL if there is none or count<=0.
 */
U_CAPI UChar * U_EXPORT2
u_memchr(const UChar *src, UChar ch, int32_t count) {
    int32_t i;

    /* the loop body is never entered for count<=0 */
    for(i = 0; i < count; ++i) {
        if(src[i] == ch) {
            return (UChar *)(src + i);
        }
    }
    return NULL;
}
/*
 * Find the first occurrence of the code point ch in src[0..count[.
 * Surrogate code points go to uprv_strFindSurrogate(), other BMP code
 * points to u_memchr(), and supplementary code points are searched for
 * as a lead/trail surrogate pair.
 * Returns NULL if ch is not found, count<=0, or ch is above 0x10ffff.
 */
U_CAPI UChar * U_EXPORT2
u_memchr32(const UChar *src, UChar32 ch, int32_t count) {
    const UChar *pairLimit;
    UChar lead, trail;

    if(count<=0 || (uint32_t)ch>0x10ffff) {
        return NULL; /* no string, or illegal arguments */
    }

    /* from here on ch is in 0..0x10ffff */
    if(0xd800<=ch && ch<=0xdfff) {
        /* surrogate code point */
        return (UChar *)uprv_strFindSurrogate(src, count, (UChar)ch);
    }
    if(ch<=0xffff) {
        /* non-surrogate BMP code point, single UChar */
        return u_memchr(src, (UChar)ch, count);
    }
    if(count<2) {
        return NULL; /* too short for a surrogate pair */
    }

    lead=UTF16_LEAD(ch);
    trail=UTF16_TRAIL(ch);

    /* stop one unit early so that src[1] never needs a separate bounds check */
    for(pairLimit=src+count-1; src<pairLimit; ++src) {
        if(src[0]==lead && src[1]==trail) {
            return (UChar *)src;
        }
    }
    return NULL;
}
/* conversions between char* and UChar* ------------------------------------- */
/*