mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-09 15:27:38 +00:00
ICU-2145 consistent behavior of binary string searches
X-SVN-Rev: 10055
This commit is contained in:
parent
8d1a83e3d5
commit
b1246ef900
5 changed files with 680 additions and 444 deletions
|
@ -890,7 +890,7 @@ public:
|
|||
int32_t length) const;
|
||||
|
||||
/**
|
||||
* Locate in this the first occurrence of the code unit <TT>c</TT>,
|
||||
* Locate in this the first occurrence of the BMP code point <code>c</code>,
|
||||
* using bitwise comparison.
|
||||
* @param c The code unit to search for.
|
||||
* @return The offset into this of <TT>c</TT>, or -1 if not found.
|
||||
|
@ -902,19 +902,6 @@ public:
|
|||
* Locate in this the first occurrence of the code point <TT>c</TT>,
|
||||
* using bitwise comparison.
|
||||
*
|
||||
* This function finds code points, which differs for BMP code points
|
||||
* from indexOf(UChar c, ...) only for surrogates:
|
||||
* While indexOf(UChar c, ...) finds any surrogate code units in a string,
|
||||
* indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but indexOf(UChar32 c, ...) will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that char32At(indexOf(UChar32 c))==c.
|
||||
*
|
||||
* @param c The code point to search for.
|
||||
* @return The offset into this of <TT>c</TT>, or -1 if not found.
|
||||
* @stable
|
||||
|
@ -922,7 +909,7 @@ public:
|
|||
inline int32_t indexOf(UChar32 c) const;
|
||||
|
||||
/**
|
||||
* Locate in this the first occurrence of the code unit <TT>c</TT>
|
||||
* Locate in this the first occurrence of the BMP code point <code>c</code>,
|
||||
* starting at offset <TT>start</TT>, using bitwise comparison.
|
||||
* @param c The code unit to search for.
|
||||
* @param start The offset at which searching will start.
|
||||
|
@ -936,19 +923,6 @@ public:
|
|||
* Locate in this the first occurrence of the code point <TT>c</TT>
|
||||
* starting at offset <TT>start</TT>, using bitwise comparison.
|
||||
*
|
||||
* This function finds code points, which differs for BMP code points
|
||||
* from indexOf(UChar c, ...) only for surrogates:
|
||||
* While indexOf(UChar c, ...) finds any surrogate code units in a string,
|
||||
* indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but indexOf(UChar32 c, ...) will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that char32At(indexOf(UChar32 c))==c.
|
||||
*
|
||||
* @param c The code point to search for.
|
||||
* @param start The offset at which searching will start.
|
||||
* @return The offset into this of <TT>c</TT>, or -1 if not found.
|
||||
|
@ -958,7 +932,7 @@ public:
|
|||
int32_t start) const;
|
||||
|
||||
/**
|
||||
* Locate in this the first occurrence of the code unit <TT>c</TT>
|
||||
* Locate in this the first occurrence of the BMP code point <code>c</code>
|
||||
* in the range [<TT>start</TT>, <TT>start + length</TT>),
|
||||
* using bitwise comparison.
|
||||
* @param c The code unit to search for.
|
||||
|
@ -976,19 +950,6 @@ public:
|
|||
* in the range [<TT>start</TT>, <TT>start + length</TT>),
|
||||
* using bitwise comparison.
|
||||
*
|
||||
* This function finds code points, which differs for BMP code points
|
||||
* from indexOf(UChar c, ...) only for surrogates:
|
||||
* While indexOf(UChar c, ...) finds any surrogate code units in a string,
|
||||
* indexOf(UChar32 c, ...) finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" indexOf(UChar c, ...)
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but indexOf(UChar32 c, ...) will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that char32At(indexOf(UChar32 c))==c.
|
||||
*
|
||||
* @param c The code point to search for.
|
||||
* @param start the offset into this at which to start matching
|
||||
* @param length the number of characters in this to search
|
||||
|
@ -1112,7 +1073,7 @@ public:
|
|||
int32_t length) const;
|
||||
|
||||
/**
|
||||
* Locate in this the last occurrence of the code unit <TT>c</TT>,
|
||||
* Locate in this the last occurrence of the BMP code point <code>c</code>,
|
||||
* using bitwise comparison.
|
||||
* @param c The code unit to search for.
|
||||
* @return The offset into this of <TT>c</TT>, or -1 if not found.
|
||||
|
@ -1124,19 +1085,6 @@ public:
|
|||
* Locate in this the last occurrence of the code point <TT>c</TT>,
|
||||
* using bitwise comparison.
|
||||
*
|
||||
* This function finds code points, which differs for BMP code points
|
||||
* from lastIndexOf(UChar c, ...) only for surrogates:
|
||||
* While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
|
||||
* lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but lastIndexOf(UChar32 c, ...) will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
|
||||
*
|
||||
* @param c The code point to search for.
|
||||
* @return The offset into this of <TT>c</TT>, or -1 if not found.
|
||||
* @stable
|
||||
|
@ -1144,7 +1092,7 @@ public:
|
|||
inline int32_t lastIndexOf(UChar32 c) const;
|
||||
|
||||
/**
|
||||
* Locate in this the last occurrence of the code unit <TT>c</TT>
|
||||
* Locate in this the last occurrence of the BMP code point <code>c</code>
|
||||
* starting at offset <TT>start</TT>, using bitwise comparison.
|
||||
* @param c The code unit to search for.
|
||||
* @param start The offset at which searching will start.
|
||||
|
@ -1158,19 +1106,6 @@ public:
|
|||
* Locate in this the last occurrence of the code point <TT>c</TT>
|
||||
* starting at offset <TT>start</TT>, using bitwise comparison.
|
||||
*
|
||||
* This function finds code points, which differs for BMP code points
|
||||
* from lastIndexOf(UChar c, ...) only for surrogates:
|
||||
* While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
|
||||
* lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but lastIndexOf(UChar32 c, ...) will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
|
||||
*
|
||||
* @param c The code point to search for.
|
||||
* @param start The offset at which searching will start.
|
||||
* @return The offset into this of <TT>c</TT>, or -1 if not found.
|
||||
|
@ -1180,7 +1115,7 @@ public:
|
|||
int32_t start) const;
|
||||
|
||||
/**
|
||||
* Locate in this the last occurrence of the code unit <TT>c</TT>
|
||||
* Locate in this the last occurrence of the BMP code point <code>c</code>
|
||||
* in the range [<TT>start</TT>, <TT>start + length</TT>),
|
||||
* using bitwise comparison.
|
||||
* @param c The code unit to search for.
|
||||
|
@ -1198,19 +1133,6 @@ public:
|
|||
* in the range [<TT>start</TT>, <TT>start + length</TT>),
|
||||
* using bitwise comparison.
|
||||
*
|
||||
* This function finds code points, which differs for BMP code points
|
||||
* from lastIndexOf(UChar c, ...) only for surrogates:
|
||||
* While lastIndexOf(UChar c, ...) finds any surrogate code units in a string,
|
||||
* lastIndexOf(UChar32 c, ...) finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" lastIndexOf(UChar c, ...)
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but lastIndexOf(UChar32 c, ...) will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that char32At(lastIndexOf(UChar32 c))==c.
|
||||
*
|
||||
* @param c The code point to search for.
|
||||
* @param start the offset into this at which to start matching
|
||||
* @param length the number of characters in this to search
|
||||
|
@ -2981,7 +2903,6 @@ private:
|
|||
int32_t start,
|
||||
int32_t length) const;
|
||||
|
||||
// only for c>=0xd800
|
||||
int32_t doIndexOf(UChar32 c,
|
||||
int32_t start,
|
||||
int32_t length) const;
|
||||
|
@ -2990,7 +2911,6 @@ private:
|
|||
int32_t start,
|
||||
int32_t length) const;
|
||||
|
||||
// only for c>=0xd800
|
||||
int32_t doLastIndexOf(UChar32 c,
|
||||
int32_t start,
|
||||
int32_t length) const;
|
||||
|
@ -3490,11 +3410,7 @@ inline int32_t
|
|||
UnicodeString::indexOf(UChar32 c,
|
||||
int32_t start,
|
||||
int32_t length) const {
|
||||
if((uint32_t)c<0xd800) {
|
||||
return doIndexOf((UChar)c, start, length);
|
||||
} else {
|
||||
return doIndexOf(c, start, length);
|
||||
}
|
||||
return doIndexOf(c, start, length);
|
||||
}
|
||||
|
||||
inline int32_t
|
||||
|
@ -3571,11 +3487,7 @@ inline int32_t
|
|||
UnicodeString::lastIndexOf(UChar32 c,
|
||||
int32_t start,
|
||||
int32_t length) const {
|
||||
if((uint32_t)c<0xd800) {
|
||||
return doLastIndexOf((UChar)c, start, length);
|
||||
} else {
|
||||
return doLastIndexOf(c, start, length);
|
||||
}
|
||||
return doLastIndexOf(c, start, length);
|
||||
}
|
||||
|
||||
inline UBool
|
||||
|
|
|
@ -27,7 +27,7 @@
|
|||
* \file
|
||||
* \brief C API: Unicode string handling functions
|
||||
*
|
||||
* These C API functions provide Unicode string handling.
|
||||
* These C API functions provide general Unicode string handling.
|
||||
*
|
||||
* Some functions are equivalent in name, signature, and behavior to the ANSI C <string.h>
|
||||
* functions. (For example, they do not check for bad arguments like NULL string pointers.)
|
||||
|
@ -39,25 +39,32 @@
|
|||
*
|
||||
* ICU uses 16-bit Unicode (UTF-16) in the form of arrays of UChar code units.
|
||||
* UTF-16 encodes each Unicode code point with either one or two UChar code units.
|
||||
* Some APIs accept a 32-bit UChar32 value for a single code point.
|
||||
* (This is the default form of Unicode, and a forward-compatible extension of the original,
|
||||
* fixed-width form that was known as UCS-2. UTF-16 superseded UCS-2 with Unicode 2.0
|
||||
* in 1996.)
|
||||
*
|
||||
* Some APIs accept a 32-bit UChar32 value for a single code point.
|
||||
*
|
||||
* ICU also handles 16-bit Unicode text with unpaired surrogates.
|
||||
* Such text is not well-formed UTF-16.
|
||||
* Code-point-related functions treat unpaired surrogates as surrogate code points,
|
||||
* i.e., as separate units.
|
||||
*
|
||||
* Although UTF-16 is a variable-width encoding form (like some legacy multi-byte encodings),
|
||||
* it is much more efficient even for random access because the code unit values
|
||||
* for single-unit characters vs. lead units vs. trail units are completely disjoint.
|
||||
* This means that it is easy to determine character (code point) boundaries from
|
||||
* random offsets in the string.
|
||||
* (It also means, e.g., that u_strstr() does not need to verify that a match was
|
||||
* found on actual character boundaries; with some legacy encodings, strstr() may need to
|
||||
* scan back to the start of the text to verify this.)
|
||||
*
|
||||
* Unicode (UTF-16) string processing is optimized for the single-unit case.
|
||||
* Although it is important to support supplementary characters
|
||||
* (which use pairs of lead/trail code units called "surrogates"),
|
||||
* their occurrence is rare. Almost all characters in modern use require only
|
||||
* a single UChar code unit (i.e., their code point values are <=0xffff).
|
||||
*
|
||||
* For more details see the User Guide Strings chapter (http://oss.software.ibm.com/icu/userguide/strings.html).
|
||||
* For a discussion of the handling of unpaired surrogates see also
|
||||
* Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18.
|
||||
*/
|
||||
|
||||
/**
|
||||
|
@ -137,59 +144,180 @@ u_strncat(UChar *dst,
|
|||
const UChar *src,
|
||||
int32_t n);
|
||||
|
||||
/**
|
||||
* Find the first occurrence of a specified character in a ustring.
|
||||
*
|
||||
* @param s The string to search.
|
||||
* @param c The character to find.
|
||||
* @return A pointer to the first occurrence of <TT>c</TT> in <TT>s</TT>,
|
||||
* or a null pointer if <TT>s</TT> does not contain <TT>c</TT>.
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI UChar* U_EXPORT2
|
||||
u_strchr(const UChar *s,
|
||||
UChar c);
|
||||
|
||||
/**
|
||||
* Find the first occurrence of a substring in a string.
|
||||
* The substring is found at code point boundaries.
|
||||
* That means that if the substring begins with
|
||||
* a trail surrogate or ends with a lead surrogate,
|
||||
* then it is found only if these surrogates stand alone in the text.
|
||||
* Otherwise, the substring edge units would be matched against
|
||||
* halves of surrogate pairs.
|
||||
*
|
||||
* @param s The string to search.
|
||||
* @param substring The substring to find
|
||||
* @return A pointer to the first occurrence of <TT>substring</TT> in
|
||||
* <TT>s</TT>, or a null pointer if <TT>substring</TT>
|
||||
* is not in <TT>s</TT>.
|
||||
* @param s The string to search (NUL-terminated).
|
||||
* @param substring The substring to find (NUL-terminated).
|
||||
* @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
|
||||
* or <code>s</code> itself if the <code>substring</code> is empty,
|
||||
* or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
|
||||
* @stable
|
||||
*
|
||||
* @see u_strrstr
|
||||
* @see u_strFindFirst
|
||||
* @see u_strFindLast
|
||||
*/
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strstr(const UChar *s, const UChar *substring);
|
||||
|
||||
/**
|
||||
* Find the first occurrence of a specified code point in a string.
|
||||
*
|
||||
* This function finds code points, which differs for BMP code points
|
||||
* from u_strchr() only for surrogates:
|
||||
* While u_strchr() finds any surrogate code units in a string,
|
||||
* u_strchr32() finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" u_strchr()
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but u_strchr32() will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that U16_GET(u_strchr32(c))==c.
|
||||
* Find the first occurrence of a substring in a string.
|
||||
* The substring is found at code point boundaries.
|
||||
* That means that if the substring begins with
|
||||
* a trail surrogate or ends with a lead surrogate,
|
||||
* then it is found only if these surrogates stand alone in the text.
|
||||
* Otherwise, the substring edge units would be matched against
|
||||
* halves of surrogate pairs.
|
||||
*
|
||||
* @param s The string to search.
|
||||
* @param c The code point (0..0x10ffff) to find.
|
||||
* @return A pointer to the first occurrence of <TT>c</TT> in <TT>s</TT>,
|
||||
* or a null pointer if there is no such character.
|
||||
* If <TT>c</TT> is represented with several UChars, then the returned
|
||||
* pointer will point to the first of them.
|
||||
* @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
|
||||
* @param substring The substring to find (NUL-terminated only if subLength is -1).
|
||||
* @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
|
||||
* @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
|
||||
* or <code>s</code> itself if the <code>substring</code> is empty,
|
||||
* or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
|
||||
* @stable
|
||||
*
|
||||
* @see u_strstr
|
||||
* @see u_strFindLast
|
||||
*/
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strFindFirst(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
|
||||
|
||||
/**
|
||||
* Find the first occurrence of a BMP code point in a string.
|
||||
* A surrogate code point is found only if its match in the text is not
|
||||
* part of a surrogate pair.
|
||||
* A NUL character is found at the string terminator.
|
||||
*
|
||||
* @param s The string to search (NUL-terminated).
|
||||
* @param c The BMP code point to find.
|
||||
* @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
|
||||
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
|
||||
* @stable
|
||||
*
|
||||
* @see u_strchr32
|
||||
* @see u_memchr
|
||||
* @see u_strstr
|
||||
* @see u_strFindFirst
|
||||
*/
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strchr(const UChar *s, UChar c);
|
||||
|
||||
/**
|
||||
* Find the first occurrence of a code point in a string.
|
||||
* A surrogate code point is found only if its match in the text is not
|
||||
* part of a surrogate pair.
|
||||
* A NUL character is found at the string terminator.
|
||||
*
|
||||
* @param s The string to search (NUL-terminated).
|
||||
* @param c The code point to find.
|
||||
* @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
|
||||
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
|
||||
* @stable
|
||||
*
|
||||
* @see u_strchr
|
||||
* @see u_memchr32
|
||||
* @see u_strstr
|
||||
* @see u_strFindFirst
|
||||
*/
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strchr32(const UChar *s, UChar32 c);
|
||||
|
||||
/**
|
||||
* Find the last occurrence of a substring in a string.
|
||||
* The substring is found at code point boundaries.
|
||||
* That means that if the substring begins with
|
||||
* a trail surrogate or ends with a lead surrogate,
|
||||
* then it is found only if these surrogates stand alone in the text.
|
||||
* Otherwise, the substring edge units would be matched against
|
||||
* halves of surrogate pairs.
|
||||
*
|
||||
* @param s The string to search (NUL-terminated).
|
||||
* @param substring The substring to find (NUL-terminated).
|
||||
* @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
|
||||
* or <code>s</code> itself if the <code>substring</code> is empty,
|
||||
* or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
|
||||
* @stable
|
||||
*
|
||||
* @see u_strstr
|
||||
* @see u_strFindFirst
|
||||
* @see u_strFindLast
|
||||
*/
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strrstr(const UChar *s, const UChar *substring);
|
||||
|
||||
/**
|
||||
* Find the last occurrence of a substring in a string.
|
||||
* The substring is found at code point boundaries.
|
||||
* That means that if the substring begins with
|
||||
* a trail surrogate or ends with a lead surrogate,
|
||||
* then it is found only if these surrogates stand alone in the text.
|
||||
* Otherwise, the substring edge units would be matched against
|
||||
* halves of surrogate pairs.
|
||||
*
|
||||
* @param s The string to search.
|
||||
* @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
|
||||
* @param substring The substring to find (NUL-terminated only if subLength is -1).
|
||||
* @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
|
||||
* @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
|
||||
* or <code>s</code> itself if the <code>substring</code> is empty,
|
||||
* or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
|
||||
* @stable
|
||||
*
|
||||
* @see u_strstr
|
||||
* @see u_strFindFirst
|
||||
*/
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strFindLast(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
|
||||
|
||||
/**
|
||||
* Find the last occurrence of a BMP code point in a string.
|
||||
* A surrogate code point is found only if its match in the text is not
|
||||
* part of a surrogate pair.
|
||||
* A NUL character is found at the string terminator.
|
||||
*
|
||||
* @param s The string to search (NUL-terminated).
|
||||
* @param c The BMP code point to find.
|
||||
* @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
|
||||
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
|
||||
* @stable
|
||||
*
|
||||
* @see u_strrchr32
|
||||
* @see u_memrchr
|
||||
* @see u_strrstr
|
||||
* @see u_strFindLast
|
||||
*/
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strrchr(const UChar *s, UChar c);
|
||||
|
||||
/**
|
||||
* Find the last occurrence of a code point in a string.
|
||||
* A surrogate code point is found only if its match in the text is not
|
||||
* part of a surrogate pair.
|
||||
* A NUL character is found at the string terminator.
|
||||
*
|
||||
* @param s The string to search (NUL-terminated).
|
||||
* @param c The code point to find.
|
||||
* @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
|
||||
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
|
||||
* @stable
|
||||
*
|
||||
* @see u_strrchr
|
||||
* @see u_memrchr32
|
||||
* @see u_strrstr
|
||||
* @see u_strFindLast
|
||||
*/
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strrchr32(const UChar *s, UChar32 c);
|
||||
|
||||
/**
|
||||
* Locates the first occurrence in the string str of any of the characters
|
||||
* in the string accept.
|
||||
|
@ -621,46 +749,84 @@ U_CAPI int32_t U_EXPORT2
|
|||
u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count);
|
||||
|
||||
/**
|
||||
* Search for a UChar within a Unicode string until <TT>count</TT>
|
||||
* is reached.
|
||||
* Find the first occurrence of a BMP code point in a string.
|
||||
* A surrogate code point is found only if its match in the text is not
|
||||
* part of a surrogate pair.
|
||||
* A NUL character is found at the string terminator.
|
||||
*
|
||||
* @param src string to search in
|
||||
* @param ch character to find
|
||||
* @param count maximum number of UChars in <TT>src</TT> to search for
|
||||
* <TT>ch</TT>.
|
||||
* @return A pointer within src, pointing to <TT>ch</TT>, or NULL if it
|
||||
* was not found.
|
||||
* @param s The string to search (contains <code>count</code> UChars).
|
||||
* @param c The BMP code point to find.
|
||||
* @param count The length of the string.
|
||||
* @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
|
||||
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
|
||||
* @stable
|
||||
*
|
||||
* @see u_strchr
|
||||
* @see u_memchr32
|
||||
* @see u_strFindFirst
|
||||
*/
|
||||
U_CAPI UChar* U_EXPORT2
|
||||
u_memchr(const UChar *src, UChar ch, int32_t count);
|
||||
u_memchr(const UChar *s, UChar c, int32_t count);
|
||||
|
||||
/**
|
||||
* Find the first occurrence of a specified code point in a string.
|
||||
* Find the first occurrence of a code point in a string.
|
||||
* A surrogate code point is found only if its match in the text is not
|
||||
* part of a surrogate pair.
|
||||
* A NUL character is found at the string terminator.
|
||||
*
|
||||
* This function finds code points, which differs for BMP code points
|
||||
* from u_memchr() only for surrogates:
|
||||
* While u_memchr() finds any surrogate code units in a string,
|
||||
* u_memchr32() finds only unmatched surrogate code points,
|
||||
* i.e., only those that do not combine with an adjacent surrogate
|
||||
* to form a supplementary code point.
|
||||
* For example, in a string "\ud800\udc00" u_memchr()
|
||||
* will find code units U+d800 at 0 and U+dc00 at 1,
|
||||
* but u_memchr32() will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that U16_GET(u_memchr32(c))==c.
|
||||
*
|
||||
* @param src string to search in
|
||||
* @param ch character to find
|
||||
* @param count maximum number of UChars in <TT>src</TT> to search for
|
||||
* <TT>ch</TT>.
|
||||
* @return A pointer within src, pointing to <TT>ch</TT>, or NULL if it
|
||||
* was not found.
|
||||
* @param s The string to search (contains <code>count</code> UChars).
|
||||
* @param c The code point to find.
|
||||
* @param count The length of the string.
|
||||
* @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
|
||||
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
|
||||
* @stable
|
||||
*
|
||||
* @see u_strchr32
|
||||
* @see u_memchr
|
||||
* @see u_strFindFirst
|
||||
*/
|
||||
U_CAPI UChar* U_EXPORT2
|
||||
u_memchr32(const UChar *src, UChar32 ch, int32_t count);
|
||||
u_memchr32(const UChar *s, UChar32 c, int32_t count);
|
||||
|
||||
/**
|
||||
* Find the last occurrence of a BMP code point in a string.
|
||||
* A surrogate code point is found only if its match in the text is not
|
||||
* part of a surrogate pair.
|
||||
* A NUL character is found at the string terminator.
|
||||
*
|
||||
* @param s The string to search (contains <code>count</code> UChars).
|
||||
* @param c The BMP code point to find.
|
||||
* @param count The length of the string.
|
||||
* @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
|
||||
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
|
||||
* @stable
|
||||
*
|
||||
* @see u_strrchr
|
||||
* @see u_memrchr32
|
||||
* @see u_strFindLast
|
||||
*/
|
||||
U_CAPI UChar* U_EXPORT2
|
||||
u_memrchr(const UChar *s, UChar c, int32_t count);
|
||||
|
||||
/**
|
||||
* Find the last occurrence of a code point in a string.
|
||||
* A surrogate code point is found only if its match in the text is not
|
||||
* part of a surrogate pair.
|
||||
* A NUL character is found at the string terminator.
|
||||
*
|
||||
* @param s The string to search (contains <code>count</code> UChars).
|
||||
* @param c The code point to find.
|
||||
* @param count The length of the string.
|
||||
* @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
|
||||
* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
|
||||
* @stable
|
||||
*
|
||||
* @see u_strrchr32
|
||||
* @see u_memrchr
|
||||
* @see u_strFindLast
|
||||
*/
|
||||
U_CAPI UChar* U_EXPORT2
|
||||
u_memrchr32(const UChar *s, UChar32 c, int32_t count);
|
||||
|
||||
/**
|
||||
* Unicode String literals in C.
|
||||
|
|
|
@ -814,42 +814,21 @@ UnicodeString::indexOf(const UChar *srcChars,
|
|||
return -1;
|
||||
}
|
||||
|
||||
// get the srcLength if necessary
|
||||
if(srcLength < 0) {
|
||||
srcLength = u_strlen(srcChars + srcStart);
|
||||
if(srcLength == 0) {
|
||||
return -1;
|
||||
}
|
||||
// UnicodeString does not find empty substrings
|
||||
if(srcLength < 0 && srcChars[srcStart] == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// now we will only work with srcLength-1
|
||||
--srcLength;
|
||||
|
||||
// get the indices within bounds
|
||||
pinIndices(start, length);
|
||||
|
||||
// set length for the last possible match start position
|
||||
// note the --srcLength above
|
||||
length -= srcLength;
|
||||
|
||||
if(length <= 0) {
|
||||
// find the first occurrence of the substring
|
||||
const UChar *match = u_strFindFirst(fArray + start, length, srcChars + srcStart, srcLength);
|
||||
if(match == NULL) {
|
||||
return -1;
|
||||
} else {
|
||||
return match - fArray;
|
||||
}
|
||||
|
||||
const UChar *array = getArrayStart();
|
||||
int32_t limit = start + length;
|
||||
|
||||
// search for the first char, then compare the rest of the string
|
||||
// increment srcStart here for that, matching the --srcLength above
|
||||
UChar ch = srcChars[srcStart++];
|
||||
|
||||
do {
|
||||
if(array[start] == ch && (srcLength == 0 || compare(start + 1, srcLength, srcChars, srcStart, srcLength) == 0)) {
|
||||
return start;
|
||||
}
|
||||
} while(++start < limit);
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
int32_t
|
||||
|
@ -859,21 +838,14 @@ UnicodeString::doIndexOf(UChar c,
|
|||
{
|
||||
// pin indices
|
||||
pinIndices(start, length);
|
||||
if(length == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// find the first occurrence of c
|
||||
const UChar *begin = getArrayStart() + start;
|
||||
const UChar *limit = begin + length;
|
||||
|
||||
do {
|
||||
if(*begin == c) {
|
||||
return (int32_t)(begin - getArrayStart());
|
||||
}
|
||||
} while(++begin < limit);
|
||||
|
||||
return -1;
|
||||
const UChar *match = u_memchr(fArray + start, c, length);
|
||||
if(match == NULL) {
|
||||
return -1;
|
||||
} else {
|
||||
return match - fArray;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t
|
||||
|
@ -882,26 +854,13 @@ UnicodeString::doIndexOf(UChar32 c,
|
|||
int32_t length) const {
|
||||
// pin indices
|
||||
pinIndices(start, length);
|
||||
if(length == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// c<0xd800 handled by inline function indexOf(UChar32 c, start, length)
|
||||
if(c<=0xdfff) {
|
||||
// surrogate code point
|
||||
const UChar *t = uprv_strFindSurrogate(fArray + start, length, (UChar)c);
|
||||
if(t != 0) {
|
||||
return (int32_t)(t - fArray);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
} else if(c<=0xffff) {
|
||||
// non-surrogate BMP code point
|
||||
return doIndexOf((UChar)c, start, length);
|
||||
// find the first occurrence of c
|
||||
const UChar *match = u_memchr32(fArray + start, c, length);
|
||||
if(match == NULL) {
|
||||
return -1;
|
||||
} else {
|
||||
// supplementary code point, search for string
|
||||
UChar buffer[2] = { UTF16_LEAD(c), UTF16_TRAIL(c) };
|
||||
return indexOf(buffer, 2, start, length);
|
||||
return match - fArray;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -916,43 +875,21 @@ UnicodeString::lastIndexOf(const UChar *srcChars,
|
|||
return -1;
|
||||
}
|
||||
|
||||
// get the srcLength if necessary
|
||||
if(srcLength < 0) {
|
||||
srcLength = u_strlen(srcChars + srcStart);
|
||||
if(srcLength == 0) {
|
||||
return -1;
|
||||
}
|
||||
// UnicodeString does not find empty substrings
|
||||
if(srcLength < 0 && srcChars[srcStart] == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// now we will only work with srcLength-1
|
||||
--srcLength;
|
||||
|
||||
// get the indices within bounds
|
||||
pinIndices(start, length);
|
||||
|
||||
// set length for the last possible match start position
|
||||
// note the --srcLength above
|
||||
length -= srcLength;
|
||||
|
||||
if(length <= 0) {
|
||||
// find the last occurrence of the substring
|
||||
const UChar *match = u_strFindLast(fArray + start, length, srcChars + srcStart, srcLength);
|
||||
if(match == NULL) {
|
||||
return -1;
|
||||
} else {
|
||||
return match - fArray;
|
||||
}
|
||||
|
||||
const UChar *array = getArrayStart();
|
||||
int32_t pos;
|
||||
|
||||
// search for the first char, then compare the rest of the string
|
||||
// increment srcStart here for that, matching the --srcLength above
|
||||
UChar ch = srcChars[srcStart++];
|
||||
|
||||
pos = start + length;
|
||||
do {
|
||||
if(array[--pos] == ch && (srcLength == 0 || compare(pos + 1, srcLength, srcChars, srcStart, srcLength) == 0)) {
|
||||
return pos;
|
||||
}
|
||||
} while(pos > start);
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
int32_t
|
||||
|
@ -966,20 +903,14 @@ UnicodeString::doLastIndexOf(UChar c,
|
|||
|
||||
// pin indices
|
||||
pinIndices(start, length);
|
||||
if(length == 0) {
|
||||
|
||||
// find the last occurrence of c
|
||||
const UChar *match = u_memrchr(fArray + start, c, length);
|
||||
if(match == NULL) {
|
||||
return -1;
|
||||
} else {
|
||||
return match - fArray;
|
||||
}
|
||||
|
||||
const UChar *begin = getArrayStart() + start;
|
||||
const UChar *limit = begin + length;
|
||||
|
||||
do {
|
||||
if(*--limit == c) {
|
||||
return (int32_t)(limit - getArrayStart());
|
||||
}
|
||||
} while(limit > begin);
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
int32_t
|
||||
|
@ -988,26 +919,13 @@ UnicodeString::doLastIndexOf(UChar32 c,
|
|||
int32_t length) const {
|
||||
// pin indices
|
||||
pinIndices(start, length);
|
||||
if(length == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// c<0xd800 handled by inline function lastIndexOf(UChar32 c, start, length)
|
||||
if(c<=0xdfff) {
|
||||
// surrogate code point
|
||||
const UChar *t = uprv_strFindLastSurrogate(fArray + start, length, (UChar)c);
|
||||
if(t != 0) {
|
||||
return (int32_t)(t - fArray);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
} else if(c<=0xffff) {
|
||||
// non-surrogate BMP code point
|
||||
return doLastIndexOf((UChar)c, start, length);
|
||||
// find the last occurrence of c
|
||||
const UChar *match = u_memrchr32(fArray + start, c, length);
|
||||
if(match == NULL) {
|
||||
return -1;
|
||||
} else {
|
||||
// supplementary code point, search for string
|
||||
UChar buffer[2] = { UTF16_LEAD(c), UTF16_TRAIL(c) };
|
||||
return lastIndexOf(buffer, 2, start, length);
|
||||
return match - fArray;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -241,20 +241,4 @@ u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCod
|
|||
|
||||
#define u_getMaxCaseExpansion() 10
|
||||
|
||||
/**
|
||||
* Find a single (unmatched) surrogate code point in the string s[0..length[ .
|
||||
* Find the first such surrogate.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC const UChar *
|
||||
uprv_strFindSurrogate(const UChar *s, int32_t length, UChar surrogate);
|
||||
|
||||
/**
|
||||
* Find a single (unmatched) surrogate code point in the string s[0..length[ .
|
||||
* Find the last such surrogate.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC const UChar *
|
||||
uprv_strFindLastSurrogate(const UChar *s, int32_t length, UChar surrogate);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -34,129 +34,427 @@ static UConverter *gDefaultConverter = NULL;
|
|||
|
||||
#define MAX_STRLEN 0x0FFFFFFF
|
||||
|
||||
/* ---- String searching functions ---- */
|
||||
/* Forward binary string search functions ----------------------------------- */
|
||||
|
||||
U_CAPI UChar* U_EXPORT2
|
||||
u_strchr(const UChar *s, UChar c)
|
||||
{
|
||||
while (*s && *s != c) {
|
||||
++s;
|
||||
}
|
||||
if (*s == c)
|
||||
return (UChar *)s;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* A Boyer-Moore algorithm would be better, but that would require a hashtable
|
||||
because UChar is so big. This algorithm doesn't use a lot of extra memory.
|
||||
*/
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strstr(const UChar *s, const UChar *substring) {
|
||||
|
||||
UChar *strItr, *subItr;
|
||||
|
||||
if (*substring == 0) {
|
||||
return (UChar *)s;
|
||||
}
|
||||
|
||||
do {
|
||||
strItr = (UChar *)s;
|
||||
subItr = (UChar *)substring;
|
||||
|
||||
/* Only one string iterator needs checking for null terminator */
|
||||
while ((*strItr != 0) && (*strItr == *subItr)) {
|
||||
strItr++;
|
||||
subItr++;
|
||||
}
|
||||
|
||||
if (*subItr == 0) { /* Was the end of the substring reached? */
|
||||
return (UChar *)s;
|
||||
}
|
||||
|
||||
s++;
|
||||
} while (*strItr != 0); /* Was the end of the string reached? */
|
||||
|
||||
return NULL; /* No match */
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if there is an unmatched surrogate c in a string [start..limit[ at s.
|
||||
* start<=s<limit or limit==NULL
|
||||
* @return TRUE if *s is unmatched
|
||||
/*
|
||||
* Test if a substring match inside a string is at code point boundaries.
|
||||
* All pointers refer to the same buffer.
|
||||
* The limit pointer may be NULL, all others must be real pointers.
|
||||
*/
|
||||
static U_INLINE UBool
|
||||
uprv_isSingleSurrogate(const UChar *start, const UChar *s, UChar c, const UChar *limit) {
|
||||
if(UTF_IS_SURROGATE_FIRST(c)) {
|
||||
++s;
|
||||
return (UBool)(s==limit || !UTF_IS_TRAIL(*s));
|
||||
} else {
|
||||
return (UBool)(s==start || !UTF_IS_LEAD(*(s-1)));
|
||||
isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) {
|
||||
if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
|
||||
/* the leading edge of the match is in the middle of a surrogate pair */
|
||||
return FALSE;
|
||||
}
|
||||
if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) {
|
||||
/* the trailing edge of the match is in the middle of a surrogate pair */
|
||||
return FALSE;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
U_CFUNC const UChar *
|
||||
uprv_strFindSurrogate(const UChar *s, int32_t length, UChar surrogate) {
|
||||
const UChar *limit, *t;
|
||||
UChar c;
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strFindFirst(const UChar *s, int32_t length,
|
||||
const UChar *sub, int32_t subLength) {
|
||||
const UChar *start, *p, *q, *subLimit;
|
||||
UChar c, cs, cq;
|
||||
|
||||
if(length>=0) {
|
||||
limit=s+length;
|
||||
} else {
|
||||
limit=NULL;
|
||||
if(sub==NULL || subLength<-1) {
|
||||
return (UChar *)s;
|
||||
}
|
||||
if(s==NULL || length<-1) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for(t=s; t!=limit && ((c=*t)!=0 || limit!=NULL); ++t) {
|
||||
if(c==surrogate && uprv_isSingleSurrogate(s, t, c, limit)) {
|
||||
return t;
|
||||
start=s;
|
||||
|
||||
if(length<0 && subLength<0) {
|
||||
/* both strings are NUL-terminated */
|
||||
if((cs=*sub++)==0) {
|
||||
return (UChar *)s;
|
||||
}
|
||||
if(*sub==0 && !U16_IS_SURROGATE(cs)) {
|
||||
/* the substring consists of a single, non-surrogate BMP code point */
|
||||
return u_strchr(s, cs);
|
||||
}
|
||||
|
||||
while((c=*s++)!=0) {
|
||||
if(c==cs) {
|
||||
/* found first substring UChar, compare rest */
|
||||
p=s;
|
||||
q=sub;
|
||||
for(;;) {
|
||||
if((cq=*q)==0) {
|
||||
if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
|
||||
return (UChar *)(s-1); /* well-formed match */
|
||||
} else {
|
||||
break; /* no match because surrogate pair is split */
|
||||
}
|
||||
}
|
||||
if((c=*p)==0) {
|
||||
return NULL; /* no match, and none possible after s */
|
||||
}
|
||||
if(c!=cq) {
|
||||
break; /* no match */
|
||||
}
|
||||
++p;
|
||||
++q;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* not found */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if(subLength<0) {
|
||||
subLength=u_strlen(sub);
|
||||
}
|
||||
if(subLength==0) {
|
||||
return (UChar *)s;
|
||||
}
|
||||
|
||||
/* get sub[0] to search for it fast */
|
||||
cs=*sub++;
|
||||
--subLength;
|
||||
subLimit=sub+subLength;
|
||||
|
||||
if(subLength==0 && !U16_IS_SURROGATE(cs)) {
|
||||
/* the substring consists of a single, non-surrogate BMP code point */
|
||||
return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
|
||||
}
|
||||
|
||||
if(length<0) {
|
||||
/* s is NUL-terminated */
|
||||
while((c=*s++)!=0) {
|
||||
if(c==cs) {
|
||||
/* found first substring UChar, compare rest */
|
||||
p=s;
|
||||
q=sub;
|
||||
for(;;) {
|
||||
if(q==subLimit) {
|
||||
if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
|
||||
return (UChar *)(s-1); /* well-formed match */
|
||||
} else {
|
||||
break; /* no match because surrogate pair is split */
|
||||
}
|
||||
}
|
||||
if((c=*p)==0) {
|
||||
return NULL; /* no match, and none possible after s */
|
||||
}
|
||||
if(c!=*q) {
|
||||
break; /* no match */
|
||||
}
|
||||
++p;
|
||||
++q;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const UChar *limit, *preLimit;
|
||||
|
||||
/* subLength was decremented above */
|
||||
if(length<=subLength) {
|
||||
return NULL; /* s is shorter than sub */
|
||||
}
|
||||
|
||||
limit=s+length;
|
||||
|
||||
/* the substring must start before preLimit */
|
||||
preLimit=limit-subLength;
|
||||
|
||||
while(s!=preLimit) {
|
||||
c=*s++;
|
||||
if(c==cs) {
|
||||
/* found first substring UChar, compare rest */
|
||||
p=s;
|
||||
q=sub;
|
||||
for(;;) {
|
||||
if(q==subLimit) {
|
||||
if(isMatchAtCPBoundary(start, s-1, p, limit)) {
|
||||
return (UChar *)(s-1); /* well-formed match */
|
||||
} else {
|
||||
break; /* no match because surrogate pair is split */
|
||||
}
|
||||
}
|
||||
if(*p!=*q) {
|
||||
break; /* no match */
|
||||
}
|
||||
++p;
|
||||
++q;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* not found */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
U_CFUNC const UChar *
|
||||
uprv_strFindLastSurrogate(const UChar *s, int32_t length, UChar surrogate) {
|
||||
const UChar *limit, *t;
|
||||
UChar c;
|
||||
/* Find the first occurrence of a NUL-terminated substring in a NUL-terminated string. */
U_CAPI UChar * U_EXPORT2
u_strstr(const UChar *s, const UChar *substring) {
    /* a length of -1 marks each argument as NUL-terminated */
    UChar *found=u_strFindFirst(s, -1, substring, -1);

    return found;
}
|
||||
|
||||
if(length>=0) {
|
||||
limit=s+length;
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strchr(const UChar *s, UChar c) {
|
||||
if(U16_IS_SURROGATE(c)) {
|
||||
/* make sure to not find half of a surrogate pair */
|
||||
return u_strFindFirst(s, -1, &c, 1);
|
||||
} else {
|
||||
limit=s+u_strlen(s);
|
||||
}
|
||||
UChar cs;
|
||||
|
||||
for(t=limit; t!=s;) {
|
||||
c=*--t;
|
||||
if(c==surrogate && uprv_isSingleSurrogate(s, t, c, limit)) {
|
||||
return t;
|
||||
/* trivial search for a BMP code point */
|
||||
for(;;) {
|
||||
if((cs=*s)==c) {
|
||||
return (UChar *)s;
|
||||
}
|
||||
if(cs==0) {
|
||||
return NULL;
|
||||
}
|
||||
++s;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strchr32(const UChar *s, UChar32 c) {
|
||||
if(c < 0xd800) {
|
||||
/* non-surrogate BMP code point */
|
||||
return u_strchr(s, (UChar)c);
|
||||
} else if(c <= 0xdfff) {
|
||||
/* surrogate code point */
|
||||
return (UChar *)uprv_strFindSurrogate(s, -1, (UChar)c);
|
||||
} else if(c <= 0xffff) {
|
||||
/* non-surrogate BMP code point */
|
||||
return u_strchr(s, (UChar)c);
|
||||
} else {
|
||||
/* supplementary code point, search for string */
|
||||
UChar buffer[3];
|
||||
if((uint32_t)c<=0xffff) {
|
||||
/* find BMP code point */
|
||||
return u_strchr(s, (UChar)c);
|
||||
} else if((uint32_t)c<=0x10ffff) {
|
||||
/* find supplementary code point as surrogate pair */
|
||||
UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
|
||||
|
||||
buffer[0] = UTF16_LEAD(c);
|
||||
buffer[1] = UTF16_TRAIL(c);
|
||||
buffer[2] = 0;
|
||||
return u_strstr(s, buffer);
|
||||
}
|
||||
while((cs=*s++)!=0) {
|
||||
if(cs==lead && *s==trail) {
|
||||
return (UChar *)(s-1);
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
} else {
|
||||
/* not a Unicode code point, not findable */
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Find the first occurrence of the code unit c in s[0..count[ .
 * A surrogate c is delegated to u_strFindFirst() so that half of a
 * surrogate pair is not reported as a match.
 * Returns a pointer to the match, or NULL if there is none or count<=0.
 */
U_CAPI UChar * U_EXPORT2
u_memchr(const UChar *s, UChar c, int32_t count) {
    const UChar *p, *end;

    if(count<=0) {
        /* no string */
        return NULL;
    }
    if(U16_IS_SURROGATE(c)) {
        /* make sure to not find half of a surrogate pair */
        return u_strFindFirst(s, count, &c, 1);
    }

    /* trivial search for a BMP code point */
    end=s+count;
    for(p=s; p!=end; ++p) {
        if(*p==c) {
            return (UChar *)p;
        }
    }
    return NULL;
}
|
||||
|
||||
/*
 * Find the first occurrence of the code point c in s[0..count[ .
 * BMP values are handed to u_memchr(); supplementary code points are
 * searched as a lead/trail surrogate pair; anything above U+10FFFF
 * (including negative values) cannot be found.
 */
U_CAPI UChar * U_EXPORT2
u_memchr32(const UChar *s, UChar32 c, int32_t count) {
    const UChar *p, *lastLead;
    UChar lead, trail;

    if((uint32_t)c<=0xffff) {
        /* find BMP code point */
        return u_memchr(s, (UChar)c, count);
    }
    if(count<2 || (uint32_t)c>0x10ffff) {
        /* too short for a surrogate pair, or not a Unicode code point */
        return NULL;
    }

    /* find supplementary code point as surrogate pair */
    lead=U16_LEAD(c);
    trail=U16_TRAIL(c);

    /* stop one unit early so that p[1] is always inside the buffer */
    lastLead=s+count-1;
    for(p=s; p!=lastLead; ++p) {
        if(p[0]==lead && p[1]==trail) {
            return (UChar *)p;
        }
    }
    return NULL;
}
|
||||
|
||||
/* Backward binary string search functions ---------------------------------- */
|
||||
|
||||
/*
 * Find the last occurrence of the substring sub in the string s.
 *
 * s, length      - string to search; length==-1 means NUL-terminated
 * sub, subLength - substring to find; subLength==-1 means NUL-terminated
 *
 * Returns s itself if sub is NULL, subLength<-1, or the substring is empty;
 * returns NULL if s is NULL, length<-1, or no match is found.
 * A candidate match is rejected when one of its edges would split a
 * surrogate pair in s.
 */
U_CAPI UChar * U_EXPORT2
u_strFindLast(const UChar *s, int32_t length,
              const UChar *sub, int32_t subLength) {
    const UChar *start, *limit, *p, *q, *subLimit;
    UChar c, cs;

    if(sub==NULL || subLength<-1) {
        return (UChar *)s;
    }
    if(s==NULL || length<-1) {
        return NULL;
    }

    /*
     * This implementation is more lazy than the one for u_strFindFirst():
     * There is no special search code for NUL-terminated strings.
     * It does not seem to be worth it for searching substrings to
     * search forward and find all matches like in u_strrchr() and similar.
     * Therefore, we simply get both string lengths and search backward.
     *
     * markus 2002oct23
     */

    if(subLength<0) {
        subLength=u_strlen(sub);
    }
    if(subLength==0) {
        return (UChar *)s;
    }

    /* get sub[subLength-1] to search for it fast */
    subLimit=sub+subLength;
    cs=*(--subLimit);
    --subLength;

    if(subLength==0 && !U16_IS_SURROGATE(cs)) {
        /* the substring consists of a single, non-surrogate BMP code point */
        return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
    }

    if(length<0) {
        length=u_strlen(s);
    }

    /* subLength was decremented above */
    if(length<=subLength) {
        return NULL; /* s is shorter than sub */
    }

    start=s;
    limit=s+length;

    /* the last unit of the substring cannot occur before s+subLength */
    s+=subLength;

    while(s!=limit) {
        c=*(--limit);
        if(c==cs) {
            /* found last substring UChar, compare rest */
            p=limit;
            q=subLimit;
            for(;;) {
                if(q==sub) {
                    /*
                     * The end of the searched buffer is start+length;
                     * s has been advanced by subLength above and must not
                     * be used to compute the buffer limit.
                     */
                    if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
                        return (UChar *)p; /* well-formed match */
                    } else {
                        break; /* no match because surrogate pair is split */
                    }
                }
                if(*(--p)!=*(--q)) {
                    break; /* no match */
                }
            }
        }
    }

    /* not found */
    return NULL;
}
|
||||
|
||||
/*
 * Find the last occurrence of the code unit c in the NUL-terminated string s.
 * A surrogate c is delegated to u_strFindLast() so that half of a
 * surrogate pair is not reported as a match.
 * The terminating NUL is part of the scan, so c==0 finds the terminator.
 */
U_CAPI UChar * U_EXPORT2
u_strrchr(const UChar *s, UChar c) {
    const UChar *last=NULL;
    UChar unit;

    if(U16_IS_SURROGATE(c)) {
        /* make sure to not find half of a surrogate pair */
        return u_strFindLast(s, -1, &c, 1);
    }

    /* trivial search for a BMP code point: remember the most recent hit */
    do {
        unit=*s;
        if(unit==c) {
            last=s;
        }
        ++s;
    } while(unit!=0);

    return (UChar *)last;
}
|
||||
|
||||
/*
 * Find the last occurrence of the code point c in the NUL-terminated string s.
 * BMP values are handed to u_strrchr(); supplementary code points are
 * searched as a lead/trail surrogate pair; anything above U+10FFFF
 * (including negative values) cannot be found.
 */
U_CAPI UChar * U_EXPORT2
u_strrchr32(const UChar *s, UChar32 c) {
    const UChar *last=NULL;
    UChar lead, trail;

    if((uint32_t)c<=0xffff) {
        /* find BMP code point */
        return u_strrchr(s, (UChar)c);
    }
    if((uint32_t)c>0x10ffff) {
        /* not a Unicode code point, not findable */
        return NULL;
    }

    /* find supplementary code point as surrogate pair */
    lead=U16_LEAD(c);
    trail=U16_TRAIL(c);

    /* s[1] is only read after s[0] matched the (non-NUL) lead unit */
    for(; *s!=0; ++s) {
        if(s[0]==lead && s[1]==trail) {
            last=s;
        }
    }
    return (UChar *)last;
}
|
||||
|
||||
/*
 * Find the last occurrence of the code unit c in s[0..count[ .
 * A surrogate c is delegated to u_strFindLast() so that half of a
 * surrogate pair is not reported as a match.
 * Returns a pointer to the match, or NULL if there is none or count<=0.
 */
U_CAPI UChar * U_EXPORT2
u_memrchr(const UChar *s, UChar c, int32_t count) {
    const UChar *p;

    if(count<=0) {
        /* no string */
        return NULL;
    }
    if(U16_IS_SURROGATE(c)) {
        /* make sure to not find half of a surrogate pair */
        return u_strFindLast(s, count, &c, 1);
    }

    /* trivial backward search for a BMP code point */
    p=s+count;
    while(p!=s) {
        --p;
        if(*p==c) {
            return (UChar *)p;
        }
    }
    return NULL;
}
|
||||
|
||||
/*
 * Find the last occurrence of the code point c in s[0..count[ .
 * BMP values are handed to u_memrchr(); supplementary code points are
 * searched as a lead/trail surrogate pair; anything above U+10FFFF
 * (including negative values) cannot be found.
 */
U_CAPI UChar * U_EXPORT2
u_memrchr32(const UChar *s, UChar32 c, int32_t count) {
    const UChar *p;
    UChar lead, trail;

    if((uint32_t)c<=0xffff) {
        /* find BMP code point */
        return u_memrchr(s, (UChar)c, count);
    }
    if(count<2 || (uint32_t)c>0x10ffff) {
        /* too short for a surrogate pair, or not a Unicode code point */
        return NULL;
    }

    /* find supplementary code point as surrogate pair */
    lead=U16_LEAD(c);
    trail=U16_TRAIL(c);

    /* p walks over possible trail positions, so *(p-1) stays inside the buffer */
    for(p=s+count-1; p!=s; --p) {
        if(*p==trail && *(p-1)==lead) {
            return (UChar *)(p-1);
        }
    }
    return NULL;
}
|
||||
|
||||
/* Tokenization functions --------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Match each code point in a string against each code point in the matchSet.
|
||||
* Return the index of the first string code point that
|
||||
|
@ -321,6 +619,8 @@ u_strtok_r(UChar *src,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
/* Miscellaneous functions -------------------------------------------------- */
|
||||
|
||||
U_CAPI UChar* U_EXPORT2
|
||||
u_strcat(UChar *dst,
|
||||
const UChar *src)
|
||||
|
@ -754,50 +1054,6 @@ u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
|
|||
return uprv_strCompare(s1, count, s2, count, FALSE, TRUE);
|
||||
}
|
||||
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_memchr(const UChar *src, UChar ch, int32_t count) {
|
||||
if(count > 0) {
|
||||
const UChar *ptr = src;
|
||||
const UChar *limit = src + count;
|
||||
|
||||
do {
|
||||
if (*ptr == ch) {
|
||||
return (UChar *)ptr;
|
||||
}
|
||||
} while (++ptr < limit);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_memchr32(const UChar *src, UChar32 ch, int32_t count) {
|
||||
if(count<=0 || (uint32_t)ch>0x10ffff) {
|
||||
return NULL; /* no string, or illegal arguments */
|
||||
}
|
||||
|
||||
if(ch<0xd800) {
|
||||
/* non-surrogate BMP code point */
|
||||
return u_memchr(src, (UChar)ch, count); /* BMP, single UChar */
|
||||
} else if(ch<=0xdfff) {
|
||||
/* surrogate code point */
|
||||
return (UChar *)uprv_strFindSurrogate(src, count, (UChar)ch);
|
||||
} else if(ch<=0xffff) {
|
||||
return u_memchr(src, (UChar)ch, count); /* BMP, single UChar */
|
||||
} else if(count<2) {
|
||||
return NULL; /* too short for a surrogate pair */
|
||||
} else {
|
||||
const UChar *limit=src+count-1; /* -1 so that we do not need a separate check for the trail unit */
|
||||
UChar lead=UTF16_LEAD(ch), trail=UTF16_TRAIL(ch);
|
||||
|
||||
do {
|
||||
if(*src==lead && *(src+1)==trail) {
|
||||
return (UChar *)src;
|
||||
}
|
||||
} while(++src<limit);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* conversions between char* and UChar* ------------------------------------- */
|
||||
|
||||
/*
|
||||
|
|
Loading…
Add table
Reference in a new issue