ICU-176 utf macros get, prev, back take start parameter

X-SVN-Rev: 1116
2025-04-08 06:53:45 +00:00 · 2000-04-12 19:36:07 +00:00 · 2000-04-12 19:36:07 +00:00 · 2b2af0bbc5
commit 2b2af0bbc5
parent c117cd37bb
6 changed files with 140 additions and 128 deletions
--- a/icu4c/source/common/unicode/utf.h
+++ b/icu4c/source/common/unicode/utf.h
@@ -149,7 +149,7 @@ typedef int32_t UTextOffset;
 #   define UTF_ARRAY_SIZE(size)                         UTF16_ARRAY_SIZE(size)

 #   define UTF_GET_CHAR_UNSAFE(s, i, c)                 UTF16_GET_CHAR_UNSAFE(s, i, c)
-#   define UTF_GET_CHAR_SAFE(s, i, length, c, strict)   UTF16_GET_CHAR_SAFE(s, i, length, c, strict)
+#   define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict)

 #   define UTF_NEXT_CHAR_UNSAFE(s, i, c)                UTF16_NEXT_CHAR_UNSAFE(s, i, c)
 #   define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict)  UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict)
@@ -164,19 +164,19 @@ typedef int32_t UTextOffset;
 #   define UTF_FWD_N_SAFE(s, i, length, n)              UTF16_FWD_N_SAFE(s, i, length, n)

 #   define UTF_SET_CHAR_START_UNSAFE(s, i)              UTF16_SET_CHAR_START_UNSAFE(s, i)
-#   define UTF_SET_CHAR_START_SAFE(s, i)                UTF16_SET_CHAR_START_SAFE(s, i)
+#   define UTF_SET_CHAR_START_SAFE(s, start, i)         UTF16_SET_CHAR_START_SAFE(s, start, i)

 #   define UTF_PREV_CHAR_UNSAFE(s, i, c)                UTF16_PREV_CHAR_UNSAFE(s, i, c)
-#   define UTF_PREV_CHAR_SAFE(s, i, c, strict)          UTF16_PREV_CHAR_SAFE(s, i, c, strict)
+#   define UTF_PREV_CHAR_SAFE(s, start, i, c, strict)   UTF16_PREV_CHAR_SAFE(s, start, i, c, strict)

 #   define UTF_BACK_1_UNSAFE(s, i)                      UTF16_BACK_1_UNSAFE(s, i)
-#   define UTF_BACK_1_SAFE(s, i)                        UTF16_BACK_1_SAFE(s, i)
+#   define UTF_BACK_1_SAFE(s, start, i)                 UTF16_BACK_1_SAFE(s, start, i)

 #   define UTF_BACK_N_UNSAFE(s, i, n)                   UTF16_BACK_N_UNSAFE(s, i, n)
-#   define UTF_BACK_N_SAFE(s, i, n)                     UTF16_BACK_N_SAFE(s, i, n)
+#   define UTF_BACK_N_SAFE(s, start, i, n)              UTF16_BACK_N_SAFE(s, start, i, n)

 #   define UTF_SET_CHAR_LIMIT_UNSAFE(s, i)              UTF16_SET_CHAR_LIMIT_UNSAFE(s, i)
-#   define UTF_SET_CHAR_LIMIT_SAFE(s, i, length)        UTF16_SET_CHAR_LIMIT_SAFE(s, i, length)
+#   define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length)

 #elif UTF_SIZE==32

@@ -192,48 +192,48 @@ typedef int32_t UTextOffset;

 #ifdef UTF_SAFE

-#   define UTF_GET_CHAR(s, i, length, c)        UTF_GET_CHAR_SAFE(s, i, length, c, FALSE)
+#   define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, FALSE)

 #   define UTF_NEXT_CHAR(s, i, length, c)       UTF_NEXT_CHAR_SAFE(s, i, length, c, FALSE)
 #   define UTF_APPEND_CHAR(s, i, length, c)     UTF_APPEND_CHAR_SAFE(s, i, length, c)
 #   define UTF_FWD_1(s, i, length)              UTF_FWD_1_SAFE(s, i, length)
 #   define UTF_FWD_N(s, i, length, n)           UTF_FWD_N_SAFE(s, i, length, n)
-#   define UTF_SET_CHAR_START(s, i)             UTF_SET_CHAR_START_SAFE(s, i)
+#   define UTF_SET_CHAR_START(s, start, i)      UTF_SET_CHAR_START_SAFE(s, start, i)

-#   define UTF_PREV_CHAR(s, i, c)               UTF_PREV_CHAR_SAFE(s, i, c, FALSE)
-#   define UTF_BACK_1(s, i)                     UTF_BACK_1_SAFE(s, i)
-#   define UTF_BACK_N(s, i, n)                  UTF_BACK_N_SAFE(s, i, n)
-#   define UTF_SET_CHAR_LIMIT(s, i, length)     UTF_SET_CHAR_LIMIT_SAFE(s, i, length)
+#   define UTF_PREV_CHAR(s, start, i, c)        UTF_PREV_CHAR_SAFE(s, start, i, c, FALSE)
+#   define UTF_BACK_1(s, start, i)              UTF_BACK_1_SAFE(s, start, i)
+#   define UTF_BACK_N(s, start, i, n)           UTF_BACK_N_SAFE(s, start, i, n)
+#   define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length)

 #elif defined(UTF_STRICT)

-#   define UTF_GET_CHAR(s, i, length, c)        UTF_GET_CHAR_SAFE(s, i, length, c, TRUE)
+#   define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, TRUE)

 #   define UTF_NEXT_CHAR(s, i, length, c)       UTF_NEXT_CHAR_SAFE(s, i, length, c, TRUE)
 #   define UTF_APPEND_CHAR(s, i, length, c)     UTF_APPEND_CHAR_SAFE(s, i, length, c)
 #   define UTF_FWD_1(s, i, length)              UTF_FWD_1_SAFE(s, i, length)
 #   define UTF_FWD_N(s, i, length, n)           UTF_FWD_N_SAFE(s, i, length, n)
-#   define UTF_SET_CHAR_START(s, i)             UTF_SET_CHAR_START_SAFE(s, i)
+#   define UTF_SET_CHAR_START(s, start, i)      UTF_SET_CHAR_START_SAFE(s, start, i)

-#   define UTF_PREV_CHAR(s, i, c)               UTF_PREV_CHAR_SAFE(s, i, c, TRUE)
-#   define UTF_BACK_1(s, i)                     UTF_BACK_1_SAFE(s, i)
-#   define UTF_BACK_N(s, i, n)                  UTF_BACK_N_SAFE(s, i, n)
-#   define UTF_SET_CHAR_LIMIT(s, i, length)     UTF_SET_CHAR_LIMIT_SAFE(s, i, length)
+#   define UTF_PREV_CHAR(s, start, i, c)        UTF_PREV_CHAR_SAFE(s, start, i, c, TRUE)
+#   define UTF_BACK_1(s, start, i)              UTF_BACK_1_SAFE(s, start, i)
+#   define UTF_BACK_N(s, start, i, n)           UTF_BACK_N_SAFE(s, start, i, n)
+#   define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length)

 #else /* UTF_UNSAFE */

-#   define UTF_GET_CHAR(s, i, length, c)        UTF_GET_CHAR_UNSAFE(s, i, c)
+#   define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_UNSAFE(s, i, c)

 #   define UTF_NEXT_CHAR(s, i, length, c)       UTF_NEXT_CHAR_UNSAFE(s, i, c)
 #   define UTF_APPEND_CHAR(s, i, length, c)     UTF_APPEND_CHAR_UNSAFE(s, i, c)
 #   define UTF_FWD_1(s, i, length)              UTF_FWD_1_UNSAFE(s, i)
 #   define UTF_FWD_N(s, i, length, n)           UTF_FWD_N_UNSAFE(s, i, n)
-#   define UTF_SET_CHAR_START(s, i)             UTF_SET_CHAR_START_UNSAFE(s, i)
+#   define UTF_SET_CHAR_START(s, start, i)      UTF_SET_CHAR_START_UNSAFE(s, i)

-#   define UTF_PREV_CHAR(s, i, c)               UTF_PREV_CHAR_UNSAFE(s, i, c)
-#   define UTF_BACK_1(s, i)                     UTF_BACK_1_UNSAFE(s, i)
-#   define UTF_BACK_N(s, i, n)                  UTF_BACK_N_UNSAFE(s, i, n)
-#   define UTF_SET_CHAR_LIMIT(s, i, length)     UTF_SET_CHAR_LIMIT_UNSAFE(s, i)
+#   define UTF_PREV_CHAR(s, start, i, c)        UTF_PREV_CHAR_UNSAFE(s, i, c)
+#   define UTF_BACK_1(s, start, i)              UTF_BACK_1_UNSAFE(s, i)
+#   define UTF_BACK_N(s, start, i, n)           UTF_BACK_N_UNSAFE(s, i, n)
+#   define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_UNSAFE(s, i)

 #endif

--- a/icu4c/source/common/unicode/utf16.h
+++ b/icu4c/source/common/unicode/utf16.h
@@ -72,7 +72,7 @@
    } \
 }

-#define UTF16_GET_CHAR_SAFE(s, i, length, c, strict) { \
+#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
    (c)=(s)[i]; \
    if(UTF_IS_SURROGATE(c)) { \
        uint16_t __c2; \
@@ -85,7 +85,7 @@
                (c)=UTF_ERROR_VALUE; \
            } \
        } else { \
-            if((i)>0 && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
+            if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
                (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
                /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
            } else if(strict) {\
@@ -207,8 +207,8 @@
    } \
 }

-#define UTF16_SET_CHAR_START_SAFE(s, i) { \
-    if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>0 && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
+#define UTF16_SET_CHAR_START_SAFE(s, start, i) { \
+    if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        --(i); \
    } \
 }
@@ -267,11 +267,11 @@

 /* safe versions with error-checking and optional regularity-checking */

-#define UTF16_PREV_CHAR_SAFE(s, i, c, strict) { \
+#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \
    (c)=(s)[--(i)]; \
    if(UTF_IS_SECOND_SURROGATE(c)) { \
        uint16_t __c2; \
-        if((i)>0 && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
+        if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
            /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
@@ -286,22 +286,22 @@
    } \
 }

-#define UTF16_BACK_1_SAFE(s, i) { \
-    if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>0 && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
+#define UTF16_BACK_1_SAFE(s, start, i) { \
+    if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        --(i); \
    } \
 }

-#define UTF16_BACK_N_SAFE(s, i, n) { \
+#define UTF16_BACK_N_SAFE(s, start, i, n) { \
    UTextOffset __N=(n); \
-    while(__N>0 && (i)>0) { \
-        UTF16_BACK_1_SAFE(s, i); \
+    while(__N>0 && (i)>(start)) { \
+        UTF16_BACK_1_SAFE(s, start, i); \
        --__N; \
    } \
 }

-#define UTF16_SET_CHAR_LIMIT_SAFE(s, i, length) { \
-    if((i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
+#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
+    if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
        ++(i); \
    } \
 }
--- a/icu4c/source/common/unicode/utf32.h
+++ b/icu4c/source/common/unicode/utf32.h
@@ -54,7 +54,7 @@
    (c)=(s)[i]; \
 }

-#define UTF32_GET_CHAR_SAFE(s, i, length, c, strict) { \
+#define UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
    (c)=(s)[i]; \
    if(!UTF32_IS_SAFE(c, strict)) { \
        (c)=UTF_ERROR_VALUE; \
@@ -107,7 +107,7 @@
    } \
 }

-#define UTF32_SET_CHAR_START_SAFE(s, i) { \
+#define UTF32_SET_CHAR_START_SAFE(s, start, i) { \
 }

 /* definitions with backward iteration -------------------------------------- */
@@ -127,24 +127,21 @@
 #define UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) { \
 }

-#define UTF32_PREV_CHAR_SAFE(s, i, c, strict) { \
+#define UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) { \
    (c)=(s)[--(i)]; \
    if(!UTF32_IS_SAFE(c, strict)) { \
        (c)=UTF_ERROR_VALUE; \
    } \
 }

-#define UTF32_BACK_1_SAFE(s, i) { \
-    if((i)>0) { \
-        --(i); \
-    } \
+#define UTF32_BACK_1_SAFE(s, start, i) { \
+    --(i); \
 }

-#define UTF32_BACK_N_SAFE(s, i, n) { \
-    if((i)>=(n)) { \
-        (i)-=(n); \
-    } else { \
-        (i)=0; \
+#define UTF32_BACK_N_SAFE(s, start, i, n) { \
+    (i)-=(n); \
+    if((i)<(start)) { \
+        (i)=(start); \
    } \
 }

--- a/icu4c/source/common/unicode/utf8.h
+++ b/icu4c/source/common/unicode/utf8.h
@@ -45,10 +45,10 @@ U_CAPI UTextOffset U_EXPORT2
 utf8_appendCharSafeBody(uint8_t *s, UTextOffset i, UTextOffset length, UChar32 c);

 U_CAPI UChar32 U_EXPORT2
-utf8_prevCharSafeBody(const uint8_t *s, UTextOffset *pi, UChar32 c, bool_t strict);
+utf8_prevCharSafeBody(const uint8_t *s, UTextOffset start, UTextOffset *pi, UChar32 c, bool_t strict);

 U_CAPI UTextOffset U_EXPORT2
-utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
+utf8_back1SafeBody(const uint8_t *s, UTextOffset start, UTextOffset i);

 /*
 * For the semantics of all of these macros, see utf16.h.
@@ -110,9 +110,9 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
    UTF8_NEXT_CHAR_UNSAFE(s, __I, c); \
 }

-#define UTF8_GET_CHAR_SAFE(s, i, length, c, strict) { \
+#define UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
    UTextOffset __I=(UTextOffset)(i); \
-    UTF8_SET_CHAR_START_SAFE(s, __I); \
+    UTF8_SET_CHAR_START_SAFE(s, start, __I); \
    UTF8_NEXT_CHAR_SAFE(s, __I, length, c, strict); \
 }

@@ -131,7 +131,7 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
 */
 #define UTF8_NEXT_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
-    if((c)&0x80) { \
+    if((uint8_t)((c)-0xc0)<0x35) { \
        uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \
        UTF8_MASK_LEAD_BYTE(c, __count); \
        switch(__count) { \
@@ -185,7 +185,7 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);

 #define UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
    (c)=(s)[(i)++]; \
-    if((c)&0x80) { \
+    if(UTF8_IS_LEAD(c)) { \
        (c)=utf8_nextCharSafeBody(s, &(i), (UTextOffset)(length), c, strict); \
    } \
 }
@@ -199,17 +199,15 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
 }

 #define UTF8_FWD_1_SAFE(s, i, length) { \
-    if((i)<(length)) { \
-        uint8_t __b=(s)[(i)++]; \
-        if(__b&0x80) { \
-            uint8_t __count=UTF8_COUNT_TRAIL_BYTES(__b); \
-            if((i)+__count>(length)) { \
-                __count=(length)-(i); \
-            } \
-            while(__count>0 && UTF8_IS_TRAIL((s)[i])) { \
-                ++(i); \
-                --__count; \
-            } \
+    uint8_t __b=(s)[(i)++]; \
+    if(UTF8_IS_LEAD(__b)) { \
+        uint8_t __count=UTF8_COUNT_TRAIL_BYTES(__b); \
+        if((i)+__count>(length)) { \
+            __count=(length)-(i); \
+        } \
+        while(__count>0 && UTF8_IS_TRAIL((s)[i])) { \
+            ++(i); \
+            --__count; \
        } \
    } \
 }
@@ -222,9 +220,9 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
    } \
 }

-#define UTF8_SET_CHAR_START_SAFE(s, i) { \
-    if((s)[(i)]&0x80) { \
-        (i)=utf8_back1SafeBody(s, (UTextOffset)(i)); \
+#define UTF8_SET_CHAR_START_SAFE(s, start, i) { \
+    if(UTF8_IS_TRAIL((s)[(i)])) { \
+        (i)=utf8_back1SafeBody(s, start, (UTextOffset)(i)); \
    } \
 }

@@ -232,10 +230,10 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);

 #define UTF8_PREV_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[--(i)]; \
-    if((c)&0x80) { \
+    if(UTF8_IS_TRAIL(c)) { \
        uint8_t __b, __count=1, __shift=6; \
 \
-        /* c must be a trail byte */ \
+        /* c is a trail byte */ \
        (c)&=0x3f; \
        for(;;) { \
            __b=(s)[--(i)]; \
@@ -269,33 +267,35 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
    UTF8_FWD_1_UNSAFE(s, i); \
 }

-#define UTF8_PREV_CHAR_SAFE(s, i, c, strict) { \
+#define UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) { \
    (c)=(s)[--(i)]; \
-    if((c)&0x80) { \
-        (c)=utf8_prevCharSafeBody(s, &(i), c, strict); \
+    if(UTF8_IS_TRAIL((c))) { \
+        (c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \
    } \
 }

-#define UTF8_BACK_1_SAFE(s, i) { \
-    if((i)>0) { \
-        --(i); \
-        if((s)[(i)]&0x80) { \
-            (i)=utf8_back1SafeBody(s, (UTextOffset)(i)); \
-        } \
+#define UTF8_BACK_1_SAFE(s, start, i) { \
+    if(UTF8_IS_TRAIL((s)[--(i)])) { \
+        (i)=utf8_back1SafeBody(s, start, (UTextOffset)(i)); \
    } \
 }

-#define UTF8_BACK_N_SAFE(s, i, n) { \
+#define UTF8_BACK_N_SAFE(s, start, i, n) { \
    UTextOffset __N=(n); \
-    while(__N>0 && (i)>0) { \
-        UTF8_BACK_1_SAFE(s, i); \
+    while(__N>0 && (i)>(start)) { \
+        UTF8_BACK_1_SAFE(s, start, i); \
        --__N; \
    } \
 }

-#define UTF8_SET_CHAR_LIMIT_SAFE(s, i, length) { \
-    UTF8_BACK_1_SAFE(s, i); \
-    UTF8_FWD_1_SAFE(s, i, length); \
+#define UTF8_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
+    if((start)<(i) && (i)<(length)) { \
+        UTF8_BACK_1_SAFE(s, start, i); \
+        (i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \
+        if((i)>(length)) { \
+            (i)=(length); \
+        } \
+    } \
 }

 #endif
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@@ -1085,7 +1085,7 @@ UnicodeString::trim()
    if(i <= 0) {
      break;
    }
-    UTF_PREV_CHAR(fArray, i, c);
+    UTF_PREV_CHAR(fArray, 0, i, c);
    if(!(c == 0x20 || Unicode::isWhitespace(c))) {
      break;
    }
--- a/icu4c/source/common/utf_impl.c
+++ b/icu4c/source/common/utf_impl.c
@@ -193,69 +193,84 @@ utf8_appendCharSafeBody(uint8_t *s, UTextOffset i, UTextOffset length, UChar32 c
 }

 U_CAPI UChar32 U_EXPORT2
-utf8_prevCharSafeBody(const uint8_t *s, UTextOffset *pi, UChar32 c, bool_t strict) {
+utf8_prevCharSafeBody(const uint8_t *s, UTextOffset start, UTextOffset *pi, UChar32 c, bool_t strict) {
    UTextOffset i=*pi;
-    if(UTF8_IS_TRAIL(c)) {
-        uint8_t b, count=1, shift=6;
+    uint8_t b, count=1, shift=6;

-        c&=0x3f;
-        while(i>0 && count<6) {
-            b=s[--i];
-            if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
-                if(b&0x40) {
-                    /* lead byte */
-                    uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b);
+    /* extract value bits from the last trail byte */
+    c&=0x3f;

-                    if(count==shouldCount) {
-                        *pi=i;
-                        UTF8_MASK_LEAD_BYTE(b, count);
-                        c|=(UChar32)b<<shift;
-                        if( c>0x10ffff ||
-                            (strict) &&
-                                (UTF_IS_SURROGATE(c) ||
-                                 count>=4 || (c)<utf8_minRegular[count] || ((c)&0xfffe)==0xfffe)
-                        ) {
-                            /* irregular sequence */
-                        } else {
-                            return c;
-                        }
+    for(;;) {
+        if(i<=start) {
+            /* no lead byte at all */
+            c=UTF8_ERROR_VALUE_1;
+            break;
+        }
+
+        /* read another previous byte */
+        b=s[--i];
+        if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
+            if(b&0x40) {
+                /* lead byte, this will always end the loop */
+                uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b);
+
+                if(count==shouldCount) {
+                    /* set the new position */
+                    *pi=i;
+                    UTF8_MASK_LEAD_BYTE(b, count);
+                    c|=(UChar32)b<<shift;
+                    if( c>0x10ffff ||
+                        (strict) &&
+                            (UTF_IS_SURROGATE(c) ||
+                             count>=4 || (c)<utf8_minRegular[count] || ((c)&0xfffe)==0xfffe)
+                    ) {
+                        /* irregular sequence */
+                        c=utf8_errorValue[count];
                    } else {
-                        /* the lead byte does not match the number of trail bytes */
-                        /* only set the position to the lead byte if it would
-                           include the trail byte that we started with */
-                        if(count<shouldCount) {
-                            *pi=i;
-                        }
+                        /* exit with correct c */
                    }
-                    break;
                } else {
-                    /* trail byte */
-                    c|=(UChar32)(b&0x3f)<<shift;
-                    ++count;
-                    shift+=6;
+                    /* the lead byte does not match the number of trail bytes */
+                    /* only set the position to the lead byte if it would
+                       include the trail byte that we started with */
+                    if(count<shouldCount) {
+                        *pi=i;
+                        c=utf8_errorValue[count];
+                    } else {
+                        c=UTF8_ERROR_VALUE_1;
+                    }
                }
+                break;
+            } else if(count<5) {
+                /* trail byte */
+                c|=(UChar32)(b&0x3f)<<shift;
+                ++count;
+                shift+=6;
            } else {
-                /* single-byte character precedes trailing bytes */
+                /* more than 5 trail bytes is illegal */
+                c=UTF8_ERROR_VALUE_1;
                break;
            }
+        } else {
+            /* single-byte character precedes trailing bytes */
+            c=UTF8_ERROR_VALUE_1;
+            break;
        }
-        /* i==0 or count==6 - no lead byte in legal distance */
-    /* } else { called with single lead byte */
    }
-    return UTF_ERROR_VALUE;
+    return c;
 }

 U_CAPI UTextOffset U_EXPORT2
-utf8_back1SafeBody(const uint8_t *s, UTextOffset i) {
+utf8_back1SafeBody(const uint8_t *s, UTextOffset start, UTextOffset i) {
    /* i had been decremented once before the function call */
    UTextOffset I=i, Z;
    uint8_t b;

    /* read at most the 6 bytes s[Z] to s[i], inclusively */
-    if(I>5) {
+    if(I-5>start) {
        Z=I-5;
    } else {
-        Z=0;
+        Z=start;
    }

    /* return I if the sequence starting there is long enough to include i */