From 2b2af0bbc53164adf6c31737f548800635be4710 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Wed, 12 Apr 2000 19:36:07 +0000 Subject: [PATCH] ICU-176 utf macros get, prev, back take start parameter X-SVN-Rev: 1116 --- icu4c/source/common/unicode/utf.h | 48 +++++++------- icu4c/source/common/unicode/utf16.h | 26 ++++---- icu4c/source/common/unicode/utf32.h | 21 +++---- icu4c/source/common/unicode/utf8.h | 74 +++++++++++----------- icu4c/source/common/unistr.cpp | 2 +- icu4c/source/common/utf_impl.c | 97 +++++++++++++++++------------ 6 files changed, 140 insertions(+), 128 deletions(-) diff --git a/icu4c/source/common/unicode/utf.h b/icu4c/source/common/unicode/utf.h index 77abb82fe35..afc103e582c 100644 --- a/icu4c/source/common/unicode/utf.h +++ b/icu4c/source/common/unicode/utf.h @@ -149,7 +149,7 @@ typedef int32_t UTextOffset; # define UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size) # define UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c) -# define UTF_GET_CHAR_SAFE(s, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, i, length, c, strict) +# define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) # define UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c) # define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) @@ -164,19 +164,19 @@ typedef int32_t UTextOffset; # define UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n) # define UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i) -# define UTF_SET_CHAR_START_SAFE(s, i) UTF16_SET_CHAR_START_SAFE(s, i) +# define UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i) # define UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c) -# define UTF_PREV_CHAR_SAFE(s, i, c, strict) UTF16_PREV_CHAR_SAFE(s, i, c, strict) +# define UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) # define UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i) -# define UTF_BACK_1_SAFE(s, i) UTF16_BACK_1_SAFE(s, i) +# define UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i) # define UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n) -# define UTF_BACK_N_SAFE(s, i, n) UTF16_BACK_N_SAFE(s, i, n) +# define UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n) # define UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) -# define UTF_SET_CHAR_LIMIT_SAFE(s, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, i, length) +# define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) #elif UTF_SIZE==32 @@ -192,48 +192,48 @@ typedef int32_t UTextOffset; #ifdef UTF_SAFE -# define UTF_GET_CHAR(s, i, length, c) UTF_GET_CHAR_SAFE(s, i, length, c, FALSE) +# define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, FALSE) # define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_SAFE(s, i, length, c, FALSE) # define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_SAFE(s, i, length, c) # define UTF_FWD_1(s, i, length) UTF_FWD_1_SAFE(s, i, length) # define UTF_FWD_N(s, i, length, n) UTF_FWD_N_SAFE(s, i, length, n) -# define UTF_SET_CHAR_START(s, i) UTF_SET_CHAR_START_SAFE(s, i) +# define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_SAFE(s, start, i) -# define UTF_PREV_CHAR(s, i, c) UTF_PREV_CHAR_SAFE(s, i, c, FALSE) -# define UTF_BACK_1(s, i) UTF_BACK_1_SAFE(s, i) -# define UTF_BACK_N(s, i, n) UTF_BACK_N_SAFE(s, i, n) -# define UTF_SET_CHAR_LIMIT(s, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, i, length) +# define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_SAFE(s, start, i, c, FALSE) +# define UTF_BACK_1(s, start, i) UTF_BACK_1_SAFE(s, start, i) +# define UTF_BACK_N(s, start, i, n) UTF_BACK_N_SAFE(s, start, i, n) +# define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) #elif defined(UTF_STRICT) -# define UTF_GET_CHAR(s, i, length, c) UTF_GET_CHAR_SAFE(s, i, length, c, TRUE) +# define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, TRUE) # define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_SAFE(s, i, length, c, TRUE) # define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_SAFE(s, i, length, c) # define UTF_FWD_1(s, i, length) UTF_FWD_1_SAFE(s, i, length) # define UTF_FWD_N(s, i, length, n) UTF_FWD_N_SAFE(s, i, length, n) -# define UTF_SET_CHAR_START(s, i) UTF_SET_CHAR_START_SAFE(s, i) +# define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_SAFE(s, start, i) -# define UTF_PREV_CHAR(s, i, c) UTF_PREV_CHAR_SAFE(s, i, c, TRUE) -# define UTF_BACK_1(s, i) UTF_BACK_1_SAFE(s, i) -# define UTF_BACK_N(s, i, n) UTF_BACK_N_SAFE(s, i, n) -# define UTF_SET_CHAR_LIMIT(s, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, i, length) +# define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_SAFE(s, start, i, c, TRUE) +# define UTF_BACK_1(s, start, i) UTF_BACK_1_SAFE(s, start, i) +# define UTF_BACK_N(s, start, i, n) UTF_BACK_N_SAFE(s, start, i, n) +# define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) #else /* UTF_UNSAFE */ -# define UTF_GET_CHAR(s, i, length, c) UTF_GET_CHAR_UNSAFE(s, i, c) +# define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_UNSAFE(s, i, c) # define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_UNSAFE(s, i, c) # define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_UNSAFE(s, i, c) # define UTF_FWD_1(s, i, length) UTF_FWD_1_UNSAFE(s, i) # define UTF_FWD_N(s, i, length, n) UTF_FWD_N_UNSAFE(s, i, n) -# define UTF_SET_CHAR_START(s, i) UTF_SET_CHAR_START_UNSAFE(s, i) +# define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_UNSAFE(s, i) -# define UTF_PREV_CHAR(s, i, c) UTF_PREV_CHAR_UNSAFE(s, i, c) -# define UTF_BACK_1(s, i) UTF_BACK_1_UNSAFE(s, i) -# define UTF_BACK_N(s, i, n) UTF_BACK_N_UNSAFE(s, i, n) -# define UTF_SET_CHAR_LIMIT(s, i, length) UTF_SET_CHAR_LIMIT_UNSAFE(s, i) +# define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_UNSAFE(s, i, c) +# define UTF_BACK_1(s, start, i) UTF_BACK_1_UNSAFE(s, i) +# define UTF_BACK_N(s, start, i, n) UTF_BACK_N_UNSAFE(s, i, n) +# define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_UNSAFE(s, i) #endif diff --git a/icu4c/source/common/unicode/utf16.h b/icu4c/source/common/unicode/utf16.h index da445a60df1..793b9c633b2 100644 --- a/icu4c/source/common/unicode/utf16.h +++ b/icu4c/source/common/unicode/utf16.h @@ -72,7 +72,7 @@ } \ } -#define UTF16_GET_CHAR_SAFE(s, i, length, c, strict) { \ +#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \ (c)=(s)[i]; \ if(UTF_IS_SURROGATE(c)) { \ uint16_t __c2; \ @@ -85,7 +85,7 @@ (c)=UTF_ERROR_VALUE; \ } \ } else { \ - if((i)>0 && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ + if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \ /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \ } else if(strict) {\ @@ -207,8 +207,8 @@ } \ } -#define UTF16_SET_CHAR_START_SAFE(s, i) { \ - if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>0 && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \ +#define UTF16_SET_CHAR_START_SAFE(s, start, i) { \ + if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \ --(i); \ } \ } @@ -267,11 +267,11 @@ /* safe versions with error-checking and optional regularity-checking */ -#define UTF16_PREV_CHAR_SAFE(s, i, c, strict) { \ +#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \ (c)=(s)[--(i)]; \ if(UTF_IS_SECOND_SURROGATE(c)) { \ uint16_t __c2; \ - if((i)>0 && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ + if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ --(i); \ (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \ /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \ @@ -286,22 +286,22 @@ } \ } -#define UTF16_BACK_1_SAFE(s, i) { \ - if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>0 && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \ +#define UTF16_BACK_1_SAFE(s, start, i) { \ + if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \ --(i); \ } \ } -#define UTF16_BACK_N_SAFE(s, i, n) { \ +#define UTF16_BACK_N_SAFE(s, start, i, n) { \ UTextOffset __N=(n); \ - while(__N>0 && (i)>0) { \ - UTF16_BACK_1_SAFE(s, i); \ + while(__N>0 && (i)>(start)) { \ + UTF16_BACK_1_SAFE(s, start, i); \ --__N; \ } \ } -#define UTF16_SET_CHAR_LIMIT_SAFE(s, i, length) { \ - if((i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \ +#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \ + if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \ ++(i); \ } \ } diff --git a/icu4c/source/common/unicode/utf32.h b/icu4c/source/common/unicode/utf32.h index a6de5d75da1..1e1e339b0e1 100644 --- a/icu4c/source/common/unicode/utf32.h +++ b/icu4c/source/common/unicode/utf32.h @@ -54,7 +54,7 @@ (c)=(s)[i]; \ } -#define UTF32_GET_CHAR_SAFE(s, i, length, c, strict) { \ +#define UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) { \ (c)=(s)[i]; \ if(!UTF32_IS_SAFE(c, strict)) { \ (c)=UTF_ERROR_VALUE; \ @@ -107,7 +107,7 @@ } \ } -#define UTF32_SET_CHAR_START_SAFE(s, i) { \ +#define UTF32_SET_CHAR_START_SAFE(s, start, i) { \ } /* definitions with backward iteration -------------------------------------- */ @@ -127,24 +127,21 @@ #define UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) { \ } -#define UTF32_PREV_CHAR_SAFE(s, i, c, strict) { \ +#define UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) { \ (c)=(s)[--(i)]; \ if(!UTF32_IS_SAFE(c, strict)) { \ (c)=UTF_ERROR_VALUE; \ } \ } -#define UTF32_BACK_1_SAFE(s, i) { \ - if((i)>0) { \ - --(i); \ - } \ +#define UTF32_BACK_1_SAFE(s, start, i) { \ + --(i); \ } -#define UTF32_BACK_N_SAFE(s, i, n) { \ - if((i)>=(n)) { \ - (i)-=(n); \ - } else { \ - (i)=0; \ +#define UTF32_BACK_N_SAFE(s, start, i, n) { \ + (i)-=(n); \ + if((i)<(start)) { \ + (i)=(start); \ } \ } diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h index 158c96798e8..58672a19bb4 100644 --- a/icu4c/source/common/unicode/utf8.h +++ b/icu4c/source/common/unicode/utf8.h @@ -45,10 +45,10 @@ U_CAPI UTextOffset U_EXPORT2 utf8_appendCharSafeBody(uint8_t *s, UTextOffset i, UTextOffset length, UChar32 c); U_CAPI UChar32 U_EXPORT2 -utf8_prevCharSafeBody(const uint8_t *s, UTextOffset *pi, UChar32 c, bool_t strict); +utf8_prevCharSafeBody(const uint8_t *s, UTextOffset start, UTextOffset *pi, UChar32 c, bool_t strict); U_CAPI UTextOffset U_EXPORT2 -utf8_back1SafeBody(const uint8_t *s, UTextOffset i); +utf8_back1SafeBody(const uint8_t *s, UTextOffset start, UTextOffset i); /* * For the semantics of all of these macros, see utf16.h. @@ -110,9 +110,9 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i); UTF8_NEXT_CHAR_UNSAFE(s, __I, c); \ } -#define UTF8_GET_CHAR_SAFE(s, i, length, c, strict) { \ +#define UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) { \ UTextOffset __I=(UTextOffset)(i); \ - UTF8_SET_CHAR_START_SAFE(s, __I); \ + UTF8_SET_CHAR_START_SAFE(s, start, __I); \ UTF8_NEXT_CHAR_SAFE(s, __I, length, c, strict); \ } @@ -131,7 +131,7 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i); */ #define UTF8_NEXT_CHAR_UNSAFE(s, i, c) { \ (c)=(s)[(i)++]; \ - if((c)&0x80) { \ + if((uint8_t)((c)-0xc0)<0x35) { \ uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \ UTF8_MASK_LEAD_BYTE(c, __count); \ switch(__count) { \ @@ -185,7 +185,7 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i); #define UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) { \ (c)=(s)[(i)++]; \ - if((c)&0x80) { \ + if(UTF8_IS_LEAD(c)) { \ (c)=utf8_nextCharSafeBody(s, &(i), (UTextOffset)(length), c, strict); \ } \ } @@ -199,17 +199,15 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i); } #define UTF8_FWD_1_SAFE(s, i, length) { \ - if((i)<(length)) { \ - uint8_t __b=(s)[(i)++]; \ - if(__b&0x80) { \ - uint8_t __count=UTF8_COUNT_TRAIL_BYTES(__b); \ - if((i)+__count>(length)) { \ - __count=(length)-(i); \ - } \ - while(__count>0 && UTF8_IS_TRAIL((s)[i])) { \ - ++(i); \ - --__count; \ - } \ + uint8_t __b=(s)[(i)++]; \ + if(UTF8_IS_LEAD(__b)) { \ + uint8_t __count=UTF8_COUNT_TRAIL_BYTES(__b); \ + if((i)+__count>(length)) { \ + __count=(length)-(i); \ + } \ + while(__count>0 && UTF8_IS_TRAIL((s)[i])) { \ + ++(i); \ + --__count; \ } \ } \ } @@ -222,9 +220,9 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i); } \ } -#define UTF8_SET_CHAR_START_SAFE(s, i) { \ - if((s)[(i)]&0x80) { \ - (i)=utf8_back1SafeBody(s, (UTextOffset)(i)); \ +#define UTF8_SET_CHAR_START_SAFE(s, start, i) { \ + if(UTF8_IS_TRAIL((s)[(i)])) { \ + (i)=utf8_back1SafeBody(s, start, (UTextOffset)(i)); \ } \ } @@ -232,10 +230,10 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i); #define UTF8_PREV_CHAR_UNSAFE(s, i, c) { \ (c)=(s)[--(i)]; \ - if((c)&0x80) { \ + if(UTF8_IS_TRAIL(c)) { \ uint8_t __b, __count=1, __shift=6; \ \ - /* c must be a trail byte */ \ + /* c is a trail byte */ \ (c)&=0x3f; \ for(;;) { \ __b=(s)[--(i)]; \ @@ -269,33 +267,35 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i); UTF8_FWD_1_UNSAFE(s, i); \ } -#define UTF8_PREV_CHAR_SAFE(s, i, c, strict) { \ +#define UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) { \ (c)=(s)[--(i)]; \ - if((c)&0x80) { \ - (c)=utf8_prevCharSafeBody(s, &(i), c, strict); \ + if(UTF8_IS_TRAIL((c))) { \ + (c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \ } \ } -#define UTF8_BACK_1_SAFE(s, i) { \ - if((i)>0) { \ - --(i); \ - if((s)[(i)]&0x80) { \ - (i)=utf8_back1SafeBody(s, (UTextOffset)(i)); \ - } \ +#define UTF8_BACK_1_SAFE(s, start, i) { \ + if(UTF8_IS_TRAIL((s)[--(i)])) { \ + (i)=utf8_back1SafeBody(s, start, (UTextOffset)(i)); \ } \ } -#define UTF8_BACK_N_SAFE(s, i, n) { \ +#define UTF8_BACK_N_SAFE(s, start, i, n) { \ UTextOffset __N=(n); \ - while(__N>0 && (i)>0) { \ - UTF8_BACK_1_SAFE(s, i); \ + while(__N>0 && (i)>(start)) { \ + UTF8_BACK_1_SAFE(s, start, i); \ --__N; \ } \ } -#define UTF8_SET_CHAR_LIMIT_SAFE(s, i, length) { \ - UTF8_BACK_1_SAFE(s, i); \ - UTF8_FWD_1_SAFE(s, i, length); \ +#define UTF8_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \ + if((start)<(i) && (i)<(length)) { \ + UTF8_BACK_1_SAFE(s, start, i); \ + (i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \ + if((i)>(length)) { \ + (i)=(length); \ + } \ + } \ } #endif diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp index cb83a82c39b..b18f1f8cc04 100644 --- a/icu4c/source/common/unistr.cpp +++ b/icu4c/source/common/unistr.cpp @@ -1085,7 +1085,7 @@ UnicodeString::trim() if(i <= 0) { break; } - UTF_PREV_CHAR(fArray, i, c); + UTF_PREV_CHAR(fArray, 0, i, c); if(!(c == 0x20 || Unicode::isWhitespace(c))) { break; } diff --git a/icu4c/source/common/utf_impl.c b/icu4c/source/common/utf_impl.c index 8d392063a3b..798635296d9 100644 --- a/icu4c/source/common/utf_impl.c +++ b/icu4c/source/common/utf_impl.c @@ -193,69 +193,84 @@ utf8_appendCharSafeBody(uint8_t *s, UTextOffset i, UTextOffset length, UChar32 c } U_CAPI UChar32 U_EXPORT2 -utf8_prevCharSafeBody(const uint8_t *s, UTextOffset *pi, UChar32 c, bool_t strict) { +utf8_prevCharSafeBody(const uint8_t *s, UTextOffset start, UTextOffset *pi, UChar32 c, bool_t strict) { UTextOffset i=*pi; - if(UTF8_IS_TRAIL(c)) { - uint8_t b, count=1, shift=6; + uint8_t b, count=1, shift=6; - c&=0x3f; - while(i>0 && count<6) { - b=s[--i]; - if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */ - if(b&0x40) { - /* lead byte */ - uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b); + /* extract value bits from the last trail byte */ + c&=0x3f; - if(count==shouldCount) { - *pi=i; - UTF8_MASK_LEAD_BYTE(b, count); - c|=(UChar32)b<0x10ffff || - (strict) && - (UTF_IS_SURROGATE(c) || - count>=4 || (c)0x10ffff || + (strict) && + (UTF_IS_SURROGATE(c) || + count>=4 || (c)5) { + if(I-5>start) { Z=I-5; } else { - Z=0; + Z=start; } /* return I if the sequence starting there is long enough to include i */