diff --git a/icu4c/source/common/bmpset.cpp b/icu4c/source/common/bmpset.cpp index b874436eced..7cd32eb99c9 100644 --- a/icu4c/source/common/bmpset.cpp +++ b/icu4c/source/common/bmpset.cpp @@ -690,16 +690,9 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon int32_t prev=length; UChar32 c; - if(b<0xc0) { - // trail byte: collect a multi-byte character - c=utf8_prevCharSafeBody(s, 0, &length, b, -1); - if(c<0) { - c=0xfffd; - } - } else { - // lead byte in last-trail position - c=0xfffd; - } + // trail byte: collect a multi-byte character + // (or lead byte in last-trail position) + c=utf8_prevCharSafeBody(s, 0, &length, b, -3); // c is a valid code point, not ASCII, not a surrogate if(c<=0x7ff) { if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) { diff --git a/icu4c/source/common/uiter.cpp b/icu4c/source/common/uiter.cpp index 8c89d4bbe46..2cc76a965fa 100644 --- a/icu4c/source/common/uiter.cpp +++ b/icu4c/source/common/uiter.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2002-2011, International Business Machines +* Copyright (C) 2002-2012, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -600,12 +600,8 @@ utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { i=index=0; limit=iter->start; /* count up to the UTF-8 index */ while(istart=i; /* just in case setState() did not get us to a code point boundary */ @@ -636,12 +632,8 @@ utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { /* count from the beginning to the current index */ while(istart, set the UTF-16 index */ @@ -658,12 +650,8 @@ utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { /* count from the current index to the end */ limit=iter->limit; while(ilength=length; } @@ -787,8 +775,8 @@ utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) --delta; } while(delta>0 && i=2) { @@ -817,8 +805,8 @@ utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) ++delta; } while(delta<0 && i>0) { - U8_PREV(s, 0, i, c); - if(c<0xffff) { + U8_PREV_OR_FFFD(s, 0, i, c); + if(c<=0xffff) { --pos; ++delta; } else if(delta<=-2) { @@ -867,10 +855,8 @@ utf8IteratorCurrent(UCharIterator *iter) { UChar32 c; int32_t i=iter->start; - U8_NEXT(s, i, iter->limit, c); - if(c<0) { - return 0xfffd; - } else if(c<=0xffff) { + U8_NEXT_OR_FFFD(s, i, iter->limit, c); + if(c<=0xffff) { return c; } else { return U16_LEAD(c); @@ -895,7 +881,7 @@ utf8IteratorNext(UCharIterator *iter) { const uint8_t *s=(const uint8_t *)iter->context; UChar32 c; - U8_NEXT(s, iter->start, iter->limit, c); + U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c); if((index=iter->index)>=0) { iter->index=++index; if(iter->length<0 && iter->start==iter->limit) { @@ -904,9 +890,7 @@ utf8IteratorNext(UCharIterator *iter) { } else if(iter->start==iter->limit && iter->length>=0) { iter->index= c<=0xffff ? 
iter->length : iter->length-1; } - if(c<0) { - return 0xfffd; - } else if(c<=0xffff) { + if(c<=0xffff) { return c; } else { iter->reservedField=c; @@ -933,15 +917,13 @@ utf8IteratorPrevious(UCharIterator *iter) { const uint8_t *s=(const uint8_t *)iter->context; UChar32 c; - U8_PREV(s, 0, iter->start, c); + U8_PREV_OR_FFFD(s, 0, iter->start, c); if((index=iter->index)>0) { iter->index=index-1; } else if(iter->start<=1) { iter->index= c<=0xffff ? iter->start : iter->start+1; } - if(c<0) { - return 0xfffd; - } else if(c<=0xffff) { + if(c<=0xffff) { return c; } else { iter->start+=4; /* back to behind this supplementary code point for consistent state */ @@ -991,7 +973,7 @@ utf8IteratorSetState(UCharIterator *iter, } else { /* verified index>=4 above */ UChar32 c; - U8_PREV((const uint8_t *)iter->context, 0, index, c); + U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c); if(c<=0xffff) { *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; } else { diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h index 2c9a1d6ed1e..6db33225c3e 100644 --- a/icu4c/source/common/unicode/utf8.h +++ b/icu4c/source/common/unicode/utf8.h @@ -253,6 +253,37 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); U8_NEXT(s, _u8_get_index, length, c); \ } +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * The offset may point to either the lead byte or one of the trail bytes + * for a code point, in which case the macro will read all of the bytes + * for the code point. + * + * The length can be negative for a NUL-terminated string. + * + * If the offset points to an illegal UTF-8 byte sequence, then + * c is set to U+FFFD. + * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD. + * + * This macro does not distinguish between a real U+FFFD in the text + * and U+FFFD returned for an ill-formed sequence. + * Use U8_GET() if that distinction is important. 
+ * + * @param s const uint8_t * string + * @param start int32_t starting string offset + * @param i int32_t string offset, must be start<=i=0x80) { \ + uint8_t __t1, __t2; \ + if( /* handle U+1000..U+CFFF inline */ \ + (0xe0<(c) && (c)<=0xec) && \ + (((i)+1)<(length) || (length)<0) && \ + (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ + (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ + ) { \ + /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ + (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ + (i)+=2; \ + } else if( /* handle U+0080..U+07FF inline */ \ + ((c)<0xe0 && (c)>=0xc2) && \ + ((i)!=(length)) && \ + (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ + ) { \ + (c)=(UChar)((((c)&0x1f)<<6)|__t1); \ + ++(i); \ } else { \ - (c)=U_SENTINEL; \ + /* function call for "complicated" and error cases */ \ + (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3); \ } \ } \ } @@ -588,11 +668,38 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); #define U8_PREV(s, start, i, c) { \ (c)=(uint8_t)(s)[--(i)]; \ if((c)>=0x80) { \ - if((c)<=0xbf) { \ - (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ - } else { \ - (c)=U_SENTINEL; \ - } \ + (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ + } \ +} + +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The input offset may be the same as the string length. + * If the offset is behind a multi-byte sequence, then the macro will read + * the whole sequence. + * If the offset is behind a lead byte, then that itself + * will be returned as the code point. + * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD. 
+ * + * This macro does not distinguish between a real U+FFFD in the text + * and U+FFFD returned for an ill-formed sequence. + * Use U8_PREV() if that distinction is important. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start=0x80) { \ + (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ } \ } diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp index f1b3e31a20c..676c855d1f0 100644 --- a/icu4c/source/common/uniset.cpp +++ b/icu4c/source/common/uniset.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1999-2011, International Business Machines +* Copyright (C) 1999-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description @@ -2234,10 +2234,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp UChar32 c; int32_t start=0, prev=0; do { - U8_NEXT(s, start, length, c); - if(c<0) { - c=0xfffd; - } + U8_NEXT_OR_FFFD(s, start, length, c); if(spanCondition!=contains(c)) { break; } @@ -2275,10 +2272,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio UChar32 c; int32_t prev=length; do { - U8_PREV(s, 0, length, c); - if(c<0) { - c=0xfffd; - } + U8_PREV_OR_FFFD(s, 0, length, c); if(spanCondition!=contains(c)) { break; } diff --git a/icu4c/source/common/unisetspan.cpp b/icu4c/source/common/unisetspan.cpp index 90a9fc47497..e107abe5452 100644 --- a/icu4c/source/common/unisetspan.cpp +++ b/icu4c/source/common/unisetspan.cpp @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 2007-2011, International Business Machines +* Copyright (C) 2007-2012, International Business Machines * Corporation and others. All Rights Reserved. 
* ****************************************************************************** @@ -503,9 +503,9 @@ spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { if((int8_t)c>=0) { return set.contains(c) ? 1 : -1; } - // Take advantage of non-ASCII fastpaths in U8_NEXT(). + // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). int32_t i=0; - U8_NEXT(s, i, length, c); + U8_NEXT_OR_FFFD(s, i, length, c); return set.contains(c) ? i : -i; } @@ -516,7 +516,7 @@ spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { return set.contains(c) ? 1 : -1; } int32_t i=length-1; - c=utf8_prevCharSafeBody(s, 0, &i, c, -1); + c=utf8_prevCharSafeBody(s, 0, &i, c, -3); length-=i; return set.contains(c) ? length : -length; } diff --git a/icu4c/source/common/utext.cpp b/icu4c/source/common/utext.cpp index fc5977554dd..396ee0777df 100644 --- a/icu4c/source/common/utext.cpp +++ b/icu4c/source/common/utext.cpp @@ -1217,15 +1217,11 @@ fillForward: int32_t cIx = srcIx; int32_t dIx = destIx; int32_t dIxSaved = destIx; - U8_NEXT(s8, srcIx, strLen, c); + U8_NEXT_OR_FFFD(s8, srcIx, strLen, c); if (c==0 && nulTerminated) { srcIx--; break; } - if (c<0) { - // Illegal UTF-8. Replace with sub character. - c = 0x0fffd; - } U16_APPEND_UNSAFE(buf, destIx, c); do { @@ -1334,15 +1330,11 @@ fillReverse: int32_t sIx = srcIx; // ix of last byte of multi-byte u8 char // Get the full character from the UTF8 string. - // use code derived from tbe macros in utf.8 + // use code derived from the macros in utf8.h // Leaves srcIx pointing at the first byte of the UTF-8 char. // - if (c<=0xbf) { - c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -1); - // leaves srcIx at first byte of the multi-byte char. - } else { - c=0x0fffd; - } + c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3); + // leaves srcIx at first byte of the multi-byte char. // Store the character in UTF-16 buffer. 
if (c<0x10000) { @@ -1415,10 +1407,7 @@ utext_strFromUTF8(UChar *dest, if(ch <=0x7f){ *pDest++=(UChar)ch; }else{ - ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1); - if(ch<0){ - ch = 0xfffd; - } + ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3); if(U_IS_BMP(ch)){ *(pDest++)=(UChar)ch; }else{ @@ -1438,10 +1427,7 @@ utext_strFromUTF8(UChar *dest, if(ch <= 0x7f){ reqLength++; }else{ - ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1); - if(ch<0){ - ch = 0xfffd; - } + ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3); reqLength+=U16_LENGTH(ch); } } diff --git a/icu4c/source/common/utf_impl.c b/icu4c/source/common/utf_impl.c index 99fca6e86af..ce65e14d4a3 100644 --- a/icu4c/source/common/utf_impl.c +++ b/icu4c/source/common/utf_impl.c @@ -86,15 +86,31 @@ utf8_errorValue[6]={ 0x3ffffff, 0x7fffffff }; +static UChar32 +errorValue(int32_t count, int8_t strict) { + if(strict>=0) { + return utf8_errorValue[count]; + } else if(strict==-3) { + return 0xfffd; + } else { + return U_SENTINEL; + } +} + /* - * Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling - * UTF8_NEXT_CHAR_SAFE(). + * Handle the non-inline part of the U8_NEXT() and U8_NEXT_OR_FFFD() macros + * and their obsolete sibling UTF8_NEXT_CHAR_SAFE(). * * U8_NEXT() supports NUL-terminated strings indicated via length<0. * * The "strict" parameter controls the error behavior: - * <0 "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative - * code point result. + * <0 "Safe" behavior of U8_NEXT(): + * -1: All illegal byte sequences yield U_SENTINEL=-1. + * -2: Same as -1, except for lenient treatment of surrogate code points as legal. + * Some implementations use this for roundtripping of + * Unicode 16-bit strings that are not well-formed UTF-16, that is, they + * contain unpaired surrogates. + * -3: All illegal byte sequences yield U+FFFD. 
* 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE): * All illegal byte sequences yield a positive code point such that this * result code point would be encoded with the same number of bytes as @@ -103,11 +119,6 @@ utf8_errorValue[6]={ * Same as the obsolete "safe" behavior, but non-characters are also treated * like illegal sequences. * - * The special negative (<0) value -2 is used for lenient treatment of surrogate - * code points as legal. Some implementations use this for roundtripping of - * Unicode 16-bit strings that are not well-formed UTF-16, that is, they - * contain unpaired surrogates. - * * Note that a UBool is the same as an int8_t. */ U_CAPI UChar32 U_EXPORT2 @@ -165,11 +176,7 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, ++i; --count; } - if(strict>=0) { - c=utf8_errorValue[i-*pi]; - } else { - c=U_SENTINEL; - } + c=errorValue(i-*pi, strict); *pi=i; return c; } @@ -224,18 +231,15 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U int32_t i=*pi; uint8_t b, count=1, shift=6; + if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); } + /* extract value bits from the last trail byte */ c&=0x3f; for(;;) { if(i<=start) { /* no lead byte at all */ - if(strict>=0) { - return UTF8_ERROR_VALUE_1; - } else { - return U_SENTINEL; - } - /*break;*/ + return errorValue(0, strict); } /* read another previous byte */ @@ -255,11 +259,7 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U if(count>=4) { count=3; } - if(strict>=0) { - c=utf8_errorValue[count]; - } else { - c=U_SENTINEL; - } + c=errorValue(count, strict); } else { /* exit with correct c */ } @@ -269,17 +269,9 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U include the trail byte that we started with */ if(count=0) { - c=utf8_errorValue[count]; - } else { - c=U_SENTINEL; - } + c=errorValue(count, strict); } else { - if(strict>=0) { - c=UTF8_ERROR_VALUE_1; - } 
else { - c=U_SENTINEL; - } + c=errorValue(0, strict); } } break; @@ -290,20 +282,12 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U shift+=6; } else { /* more than 5 trail bytes is illegal */ - if(strict>=0) { - c=UTF8_ERROR_VALUE_1; - } else { - c=U_SENTINEL; - } + c=errorValue(0, strict); break; } } else { /* single-byte character precedes trailing bytes */ - if(strict>=0) { - c=UTF8_ERROR_VALUE_1; - } else { - c=U_SENTINEL; - } + c=errorValue(0, strict); break; } } diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp index 54e5ae1c541..0912e4be361 100644 --- a/icu4c/source/i18n/ucol.cpp +++ b/icu4c/source/i18n/ucol.cpp @@ -8038,163 +8038,9 @@ endOfSecLoop: } /* - Slightly modified version of U8_NEXT macro defined in utf8.h. U8_NEXT requires - the length of UTF-8 string. This version assumes that the UTF-8 string is null - terminated and does not require the length as input. - Note: ucol_strcollUTF8 supports null terminated input. Calculating length of null terminated input string takes extra amount of CPU cycles. 
*/ -static const UChar32 -utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; - -#define UTF8_ERROR_VALUE_1 0x15 -#define UTF8_ERROR_VALUE_2 0x9f -#define UTF_ERROR_VALUE 0xffff - -static const UChar32 -utf8_errorValue[6]={ - UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff, - 0x3ffffff, 0x7fffffff -}; - -static -UChar32 utf8_nextCharSafeBodyNullTerm(const uint8_t *s, int32_t *pi, UChar32 c, UBool strict) { - int32_t i=*pi; - uint8_t count=U8_COUNT_TRAIL_BYTES(c); - U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */ - - if (c) { - uint8_t trail, illegal=0; - - U8_MASK_LEAD_BYTE((c), count); - /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ - switch(count) { - /* each branch falls through to the next one */ - case 5: - case 4: - /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ - illegal=1; - break; - case 3: - trail=s[(i)]; - if (trail==0) { - illegal=1; - break; - } - (c)=((c)<<6)|(trail&0x3f); - if(c<0x110) { - illegal|=(trail&0xc0)^0x80; - } else { - /* code point>0x10ffff, outside Unicode */ - illegal=1; - break; - } - ++(i); - case 2: - trail=s[(i)]; - if (trail==0) { - illegal=1; - break; - } - (c)=((c)<<6)|(trail&0x3f); - illegal|=(trail&0xc0)^0x80; - ++(i); - case 1: - trail=s[(i)]; - if (trail==0) { - illegal=1; - break; - } - (c)=((c)<<6)|(trail&0x3f); - illegal|=(trail&0xc0)^0x80; - ++(i); - break; - case 0: - if(strict>=0) { - return UTF8_ERROR_VALUE_1; - } else { - return U_SENTINEL; - } - /* no default branch to optimize switch() - all values are covered */ - } - - /* - * All the error handling should return a value - * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right. - * - * Starting with Unicode 3.0.1, non-shortest forms are illegal. - * Starting with Unicode 3.2, surrogate code points must not be - * encoded in UTF-8, and there are no irregular sequences any more. 
- * - * U8_ macros (new in ICU 2.4) return negative values for error conditions. - */ - - /* correct sequence - all trail bytes have (b7..b6)==(10)? */ - /* illegal is also set if count>=4 */ - if(illegal || (c)0 && U8_IS_TRAIL(s[i])) { - ++(i); - --count; - } - if(strict>=0) { - c=utf8_errorValue[errorCount-count]; - } else { - c=U_SENTINEL; - } - } else if((strict)>0 && U_IS_UNICODE_NONCHAR(c)) { - /* strict: forbid non-characters like U+fffe */ - c=utf8_errorValue[count]; - } - } - *pi=i; - return c; -} - -#define U8_NEXT_NULLTERM(s, i, c) { \ - (c)=(uint8_t)(s)[(i)]; \ - if((c)>=0x80) { \ - uint8_t __t1, __t2; \ - if( /* handle U+1000..U+CFFF inline */ \ - (0xe0<(c) && (c)<=0xec) && \ - (__t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && __t1 != 0 && \ - (__t2=(uint8_t)((s)[(i)+2]-0x80))<= 0x3f && __t2 != 0 \ - ) { \ - /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ - (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ - (i)+=3; \ - } else if( /* handle U+0080..U+07FF inline */ \ - ((c)<0xe0 && (c)>=0xc2) && \ - (__t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && __t1 != 0 \ - ) { \ - (c)=(UChar)((((c)&0x1f)<<6)|__t1); \ - (i)+=2; \ - } else if(U8_IS_LEAD(c)) { \ - /* function call for "complicated" and error cases */ \ - ++(i); \ - (c)=utf8_nextCharSafeBodyNullTerm((const uint8_t *)s, &(i), c, -1); \ - } else { \ - (c)=U_SENTINEL; \ - ++(i); \ - } \ - } else { \ - if ((c)) { \ - ++(i); \ - } \ - } \ -} - -#define U8_GET_NULLTERM(s, start, i, c) { \ - int32_t _u8_get_index=(int32_t)(i); \ - U8_SET_CP_START(s, start, _u8_get_index); \ - U8_NEXT_NULLTERM(s, _u8_get_index, c); \ -} - - static UCollationResult ucol_strcollRegularUTF8( const UCollator *coll, @@ -8253,19 +8099,12 @@ ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength, UChar32 schar = 0, tchar = 0; for(;;) { - if (len == -1) { - U8_GET_NULLTERM((const uint8_t*)s, 0, *index, schar); - if (schar == 0) { - 
return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); - } - } else { - if (*index == len) { - return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); - } - U8_GET((const uint8_t*)s, 0, *index, len, schar); + if (*index == len) { + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); } - if (schar == -1) { - schar = 0xfffd; + U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar); + if (len < 0 && schar == 0) { + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); } while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ @@ -8320,22 +8159,15 @@ ucol_strcollUseLatin1UTF8( for(;;) { while(sOrder==0) { // this loop skips primary ignorables // sOrder=getNextlatinOneCE(source); - if (sLen==-1) { - U8_NEXT_NULLTERM(source, sIndex, sChar); - if (sChar == 0) { - endOfSource = TRUE; - sLen = sIndex; - break; - } - } else { - if (sIndex == sLen) { - endOfSource = TRUE; - break; - } - U8_NEXT(source, sIndex, sLen ,sChar); + if (sIndex == sLen) { + endOfSource = TRUE; + break; } - if (sChar == -1) { - sChar = 0xfffd; // fallback for the bad code + U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar); + if (sLen < 0 && sChar == 0) { + endOfSource = TRUE; + sLen = sIndex; + break; } if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) //fprintf(stderr, "R"); @@ -8360,28 +8192,21 @@ ucol_strcollUseLatin1UTF8( while(tOrder==0) { // this loop skips primary ignorables // tOrder=getNextlatinOneCE(target); - if (tLen == -1) { - U8_NEXT_NULLTERM(target, tIndex, tChar); - if (tChar == 0) { - if(endOfSource) { - tLen = tIndex; - goto endOfPrimLoopU8; - } else { - return UCOL_GREATER; - } + if (tIndex == tLen) { + if(endOfSource) { + goto endOfPrimLoopU8; + } else { + return UCOL_GREATER; } - } else { - if (tIndex == tLen) { - if(endOfSource) { - goto endOfPrimLoopU8; - } else 
{ - return UCOL_GREATER; - } - } - U8_NEXT(target, tIndex, tLen, tChar); } - if (tChar == -1) { - tChar = 0xfffd; + U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); + if (tLen < 0 && tChar == 0) { + if(endOfSource) { + tLen = tIndex; + goto endOfPrimLoopU8; + } else { + return UCOL_GREATER; + } } if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) //fprintf(stderr, "R"); @@ -8448,7 +8273,7 @@ endOfPrimLoopU8: break; } U_ASSERT(sLen >= 0); - U8_NEXT(source, sIndex, sLen, sChar); + U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar); U_ASSERT(sChar >= 0 && sChar <= 0xFF); sOrder = elements[sChar]; if(sOrder > UCOL_NOT_FOUND) { @@ -8465,7 +8290,7 @@ endOfPrimLoopU8: } } U_ASSERT(tLen >= 0); - U8_NEXT(target, tIndex, tLen, tChar); + U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); U_ASSERT(tChar >= 0 && tChar <= 0xFF); tOrder = elements[tChar]; if(tOrder > UCOL_NOT_FOUND) { @@ -8505,7 +8330,7 @@ endOfPrimLoopU8: endOfSource = TRUE; break; } - U8_PREV(source, 0, sIndex, sChar); + U8_PREV_OR_FFFD(source, 0, sIndex, sChar); U_ASSERT(sChar >= 0 && sChar <= 0xFF); sOrder = elements[sChar]; // don't even look for contractions @@ -8519,7 +8344,7 @@ endOfPrimLoopU8: return UCOL_GREATER; } } - U8_PREV(target, 0, tIndex, tChar); + U8_PREV_OR_FFFD(target, 0, tIndex, tChar); U_ASSERT(tChar >= 0 && tChar <= 0xFF); tOrder = elements[tChar]; // don't even look for contractions @@ -8560,7 +8385,7 @@ endOfSecLoopU8: break; } U_ASSERT(sLen >= 0); - U8_NEXT(source, sIndex, sLen, sChar); + U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar); U_ASSERT(sChar >= 0 && sChar <= 0xFF); sOrder = elements[sChar]; if(sOrder > UCOL_NOT_FOUND) { @@ -8576,7 +8401,7 @@ endOfSecLoopU8: } } U_ASSERT(tLen >= 0); - U8_NEXT(target, tIndex, tLen, tChar); + U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); U_ASSERT(tChar >= 0 && tChar <= 0xFF); tOrder = elements[tChar]; if(tOrder > UCOL_NOT_FOUND) { @@ -8963,36 +8788,18 @@ ucol_strcollUTF8( UChar32 uc32 = -1; if 
(!bSrcLimit) { - if (sourceLength >= 0) { - U8_GET((uint8_t*)source, 0, equalLength, sourceLength, uc32); - } else { - U8_GET_NULLTERM((uint8_t*)source, 0, equalLength, uc32); - } - if (uc32 == -1) { - uc32 = 0xfffd; - bSawNonLatin1 |= TRUE; - } else { - if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { - bUnsafeCP = TRUE; - } - bSawNonLatin1 |= (uc32 > 0xff); + U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32); + if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { + bUnsafeCP = TRUE; } + bSawNonLatin1 |= (uc32 > 0xff); } if (!bTargLimit) { - if (targetLength >= 0) { - U8_GET((uint8_t*)target, 0, equalLength, targetLength, uc32); - } else { - U8_GET_NULLTERM((uint8_t*)target, 0, equalLength, uc32); - } - if (uc32 == -1) { - uc32 = 0xfffd; - bSawNonLatin1 |= TRUE; - } else { - if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { - bUnsafeCP = TRUE; - } - bSawNonLatin1 |= (uc32 > 0xff); + U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32); + if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { + bUnsafeCP = TRUE; } + bSawNonLatin1 |= (uc32 > 0xff); } if (bUnsafeCP) { @@ -9000,7 +8807,7 @@ ucol_strcollUTF8( // We are stopped in the middle of a contraction. // Scan backwards through the == part of the string looking for the start of the contraction. // It doesn't matter which string we scan, since they are the same in this region. 
- U8_PREV((uint8_t*)source, 0, equalLength, uc32); + U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32); bSawNonLatin1 |= (uc32 > 0xff); if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) { break; diff --git a/icu4c/source/test/cintltst/utf8tst.c b/icu4c/source/test/cintltst/utf8tst.c index 5179a732911..88e1a091312 100644 --- a/icu4c/source/test/cintltst/utf8tst.c +++ b/icu4c/source/test/cintltst/utf8tst.c @@ -195,7 +195,7 @@ static void TestGetChar() 0x240, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1 }; uint16_t i=0; - UChar32 c; + UChar32 c, expected; uint32_t offset=0; for(offset=0; offset= 0 : c != result[i+1]){ - log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); + UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE); + expected=result[i+1]; + if(c != expected){ + log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); } - UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE); - if(c != result[i+1]){ - log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); + U8_GET(input, 0, offset, sizeof(input), c); + if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } + if(c != expected){ + log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); + } + + U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c); + if(expected<0) { expected=0xfffd; } + if(c != expected){ + log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c); } UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE); @@ -228,7 +236,7 @@ static void TestGetChar() log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. 
Expected:%lx Got:%lx\n", offset, result[i+2], c); } - i=(uint16_t)(i+3); + i=(uint16_t)(i+3); } } @@ -274,7 +282,7 @@ static void TestNextPrevChar() { }; /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */ - UChar32 c=0x0000; + UChar32 c, expected; uint32_t i=0; uint32_t offset=0; int32_t setOffset=0; @@ -285,9 +293,10 @@ static void TestNextPrevChar() { log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+1], setOffset); } - if(c != result[i+1]){ - log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); - } + expected=result[i+1]; + if(c != expected){ + log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c); + } setOffset=offset; U8_NEXT(input, setOffset, sizeof(input), c); @@ -295,9 +304,21 @@ static void TestNextPrevChar() { log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+1], setOffset); } - if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){ - log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); - } + if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } + if(c != expected){ + log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c); + } + + setOffset=offset; + U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c); + if(setOffset != movedOffset[i+1]){ + log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", + offset, movedOffset[i+1], setOffset); + } + if(expected<0) { expected=0xfffd; } + if(c != expected){ + log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. 
Expected:%lx Got:%lx\n", offset, expected, c); + } setOffset=offset; UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE); @@ -320,9 +341,10 @@ static void TestNextPrevChar() { log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+4], setOffset); } - if(c != result[i+4]){ - log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c); - } + expected=result[i+4]; + if(c != expected){ + log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c); + } setOffset=offset; U8_PREV(input, 0, setOffset, c); @@ -330,9 +352,21 @@ static void TestNextPrevChar() { log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+4], setOffset); } - if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){ - log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c); - } + if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } + if(c != expected){ + log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c); + } + + setOffset=offset; + U8_PREV_OR_FFFD(input, 0, setOffset, c); + if(setOffset != movedOffset[i+4]){ + log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", + offset, movedOffset[i+4], setOffset); + } + if(expected<0) { expected=0xfffd; } + if(c != expected){ + log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. 
Expected:%lx Got:%lx\n", offset, expected, c); + } setOffset=offset; UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE); @@ -378,14 +412,24 @@ static void TestNulTerminated() { 0 }; - UChar32 c, c2; + UChar32 c, c2, expected; int32_t i0, i=0, j, k, expectedIndex; int32_t cpIndex=0; do { i0=i; U8_NEXT(input, i, -1, c); - if(c!=result[cpIndex]) { - log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, result[cpIndex]); + expected=result[cpIndex]; + if(c!=expected) { + log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected); + } + j=i0; + U8_NEXT_OR_FFFD(input, j, -1, c); + if(expected<0) { expected=0xfffd; } + if(c!=expected) { + log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected); + } + if(j!=i) { + log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i); } j=i0; U8_FWD_1(input, j, -1); @@ -414,6 +458,11 @@ static void TestNulTerminated() { if(c2!=c) { log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j); } + U8_GET_OR_FFFD(input, 0, j, -1, c2); + expected= (c>=0) ? c : 0xfffd; + if(c2!=expected) { + log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j); + } /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */ k=j+1; U8_SET_CP_LIMIT(input, 0, k, -1); diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 021314a9cbe..abc40039d6e 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -2626,10 +2626,7 @@ static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, UChar32 c; int32_t start=0, prev; while((prev=start)