From 2b2af0bbc53164adf6c31737f548800635be4710 Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Wed, 12 Apr 2000 19:36:07 +0000
Subject: [PATCH] ICU-176 utf macros get, prev, back take start parameter

X-SVN-Rev: 1116
---
 icu4c/source/common/unicode/utf.h   | 48 +++++++-------
 icu4c/source/common/unicode/utf16.h | 26 ++++----
 icu4c/source/common/unicode/utf32.h | 21 +++----
 icu4c/source/common/unicode/utf8.h  | 74 +++++++++++-----------
 icu4c/source/common/unistr.cpp      |  2 +-
 icu4c/source/common/utf_impl.c      | 97 +++++++++++++++++------------
 6 files changed, 140 insertions(+), 128 deletions(-)

diff --git a/icu4c/source/common/unicode/utf.h b/icu4c/source/common/unicode/utf.h
index 77abb82fe35..afc103e582c 100644
--- a/icu4c/source/common/unicode/utf.h
+++ b/icu4c/source/common/unicode/utf.h
@@ -149,7 +149,7 @@ typedef int32_t UTextOffset;
 #   define UTF_ARRAY_SIZE(size)                         UTF16_ARRAY_SIZE(size)
 
 #   define UTF_GET_CHAR_UNSAFE(s, i, c)                 UTF16_GET_CHAR_UNSAFE(s, i, c)
-#   define UTF_GET_CHAR_SAFE(s, i, length, c, strict)   UTF16_GET_CHAR_SAFE(s, i, length, c, strict)
+#   define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict)
 
 #   define UTF_NEXT_CHAR_UNSAFE(s, i, c)                UTF16_NEXT_CHAR_UNSAFE(s, i, c)
 #   define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict)  UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict)
@@ -164,19 +164,19 @@ typedef int32_t UTextOffset;
 #   define UTF_FWD_N_SAFE(s, i, length, n)              UTF16_FWD_N_SAFE(s, i, length, n)
 
 #   define UTF_SET_CHAR_START_UNSAFE(s, i)              UTF16_SET_CHAR_START_UNSAFE(s, i)
-#   define UTF_SET_CHAR_START_SAFE(s, i)                UTF16_SET_CHAR_START_SAFE(s, i)
+#   define UTF_SET_CHAR_START_SAFE(s, start, i)         UTF16_SET_CHAR_START_SAFE(s, start, i)
 
 #   define UTF_PREV_CHAR_UNSAFE(s, i, c)                UTF16_PREV_CHAR_UNSAFE(s, i, c)
-#   define UTF_PREV_CHAR_SAFE(s, i, c, strict)          UTF16_PREV_CHAR_SAFE(s, i, c, strict)
+#   define UTF_PREV_CHAR_SAFE(s, start, i, c, strict)   UTF16_PREV_CHAR_SAFE(s, start, i, c, strict)
 
 #   define UTF_BACK_1_UNSAFE(s, i)                      UTF16_BACK_1_UNSAFE(s, i)
-#   define UTF_BACK_1_SAFE(s, i)                        UTF16_BACK_1_SAFE(s, i)
+#   define UTF_BACK_1_SAFE(s, start, i)                 UTF16_BACK_1_SAFE(s, start, i)
 
 #   define UTF_BACK_N_UNSAFE(s, i, n)                   UTF16_BACK_N_UNSAFE(s, i, n)
-#   define UTF_BACK_N_SAFE(s, i, n)                     UTF16_BACK_N_SAFE(s, i, n)
+#   define UTF_BACK_N_SAFE(s, start, i, n)              UTF16_BACK_N_SAFE(s, start, i, n)
 
 #   define UTF_SET_CHAR_LIMIT_UNSAFE(s, i)              UTF16_SET_CHAR_LIMIT_UNSAFE(s, i)
-#   define UTF_SET_CHAR_LIMIT_SAFE(s, i, length)        UTF16_SET_CHAR_LIMIT_SAFE(s, i, length)
+#   define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length)
 
 #elif UTF_SIZE==32
 
@@ -192,48 +192,48 @@ typedef int32_t UTextOffset;
 
 #ifdef UTF_SAFE
 
-#   define UTF_GET_CHAR(s, i, length, c)        UTF_GET_CHAR_SAFE(s, i, length, c, FALSE)
+#   define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, FALSE)
 
 #   define UTF_NEXT_CHAR(s, i, length, c)       UTF_NEXT_CHAR_SAFE(s, i, length, c, FALSE)
 #   define UTF_APPEND_CHAR(s, i, length, c)     UTF_APPEND_CHAR_SAFE(s, i, length, c)
 #   define UTF_FWD_1(s, i, length)              UTF_FWD_1_SAFE(s, i, length)
 #   define UTF_FWD_N(s, i, length, n)           UTF_FWD_N_SAFE(s, i, length, n)
-#   define UTF_SET_CHAR_START(s, i)             UTF_SET_CHAR_START_SAFE(s, i)
+#   define UTF_SET_CHAR_START(s, start, i)      UTF_SET_CHAR_START_SAFE(s, start, i)
 
-#   define UTF_PREV_CHAR(s, i, c)               UTF_PREV_CHAR_SAFE(s, i, c, FALSE)
-#   define UTF_BACK_1(s, i)                     UTF_BACK_1_SAFE(s, i)
-#   define UTF_BACK_N(s, i, n)                  UTF_BACK_N_SAFE(s, i, n)
-#   define UTF_SET_CHAR_LIMIT(s, i, length)     UTF_SET_CHAR_LIMIT_SAFE(s, i, length)
+#   define UTF_PREV_CHAR(s, start, i, c)        UTF_PREV_CHAR_SAFE(s, start, i, c, FALSE)
+#   define UTF_BACK_1(s, start, i)              UTF_BACK_1_SAFE(s, start, i)
+#   define UTF_BACK_N(s, start, i, n)           UTF_BACK_N_SAFE(s, start, i, n)
+#   define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length)
 
 #elif defined(UTF_STRICT)
 
-#   define UTF_GET_CHAR(s, i, length, c)        UTF_GET_CHAR_SAFE(s, i, length, c, TRUE)
+#   define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, TRUE)
 
 #   define UTF_NEXT_CHAR(s, i, length, c)       UTF_NEXT_CHAR_SAFE(s, i, length, c, TRUE)
 #   define UTF_APPEND_CHAR(s, i, length, c)     UTF_APPEND_CHAR_SAFE(s, i, length, c)
 #   define UTF_FWD_1(s, i, length)              UTF_FWD_1_SAFE(s, i, length)
 #   define UTF_FWD_N(s, i, length, n)           UTF_FWD_N_SAFE(s, i, length, n)
-#   define UTF_SET_CHAR_START(s, i)             UTF_SET_CHAR_START_SAFE(s, i)
+#   define UTF_SET_CHAR_START(s, start, i)      UTF_SET_CHAR_START_SAFE(s, start, i)
 
-#   define UTF_PREV_CHAR(s, i, c)               UTF_PREV_CHAR_SAFE(s, i, c, TRUE)
-#   define UTF_BACK_1(s, i)                     UTF_BACK_1_SAFE(s, i)
-#   define UTF_BACK_N(s, i, n)                  UTF_BACK_N_SAFE(s, i, n)
-#   define UTF_SET_CHAR_LIMIT(s, i, length)     UTF_SET_CHAR_LIMIT_SAFE(s, i, length)
+#   define UTF_PREV_CHAR(s, start, i, c)        UTF_PREV_CHAR_SAFE(s, start, i, c, TRUE)
+#   define UTF_BACK_1(s, start, i)              UTF_BACK_1_SAFE(s, start, i)
+#   define UTF_BACK_N(s, start, i, n)           UTF_BACK_N_SAFE(s, start, i, n)
+#   define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length)
 
 #else /* UTF_UNSAFE */
 
-#   define UTF_GET_CHAR(s, i, length, c)        UTF_GET_CHAR_UNSAFE(s, i, c)
+#   define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_UNSAFE(s, i, c)
 
 #   define UTF_NEXT_CHAR(s, i, length, c)       UTF_NEXT_CHAR_UNSAFE(s, i, c)
 #   define UTF_APPEND_CHAR(s, i, length, c)     UTF_APPEND_CHAR_UNSAFE(s, i, c)
 #   define UTF_FWD_1(s, i, length)              UTF_FWD_1_UNSAFE(s, i)
 #   define UTF_FWD_N(s, i, length, n)           UTF_FWD_N_UNSAFE(s, i, n)
-#   define UTF_SET_CHAR_START(s, i)             UTF_SET_CHAR_START_UNSAFE(s, i)
+#   define UTF_SET_CHAR_START(s, start, i)      UTF_SET_CHAR_START_UNSAFE(s, i)
 
-#   define UTF_PREV_CHAR(s, i, c)               UTF_PREV_CHAR_UNSAFE(s, i, c)
-#   define UTF_BACK_1(s, i)                     UTF_BACK_1_UNSAFE(s, i)
-#   define UTF_BACK_N(s, i, n)                  UTF_BACK_N_UNSAFE(s, i, n)
-#   define UTF_SET_CHAR_LIMIT(s, i, length)     UTF_SET_CHAR_LIMIT_UNSAFE(s, i)
+#   define UTF_PREV_CHAR(s, start, i, c)        UTF_PREV_CHAR_UNSAFE(s, i, c)
+#   define UTF_BACK_1(s, start, i)              UTF_BACK_1_UNSAFE(s, i)
+#   define UTF_BACK_N(s, start, i, n)           UTF_BACK_N_UNSAFE(s, i, n)
+#   define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_UNSAFE(s, i)
 
 #endif
 
diff --git a/icu4c/source/common/unicode/utf16.h b/icu4c/source/common/unicode/utf16.h
index da445a60df1..793b9c633b2 100644
--- a/icu4c/source/common/unicode/utf16.h
+++ b/icu4c/source/common/unicode/utf16.h
@@ -72,7 +72,7 @@
     } \
 }
 
-#define UTF16_GET_CHAR_SAFE(s, i, length, c, strict) { \
+#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
     (c)=(s)[i]; \
     if(UTF_IS_SURROGATE(c)) { \
         uint16_t __c2; \
@@ -85,7 +85,7 @@
                 (c)=UTF_ERROR_VALUE; \
             } \
         } else { \
-            if((i)>0 && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
+            if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
                 (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
                 /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
             } else if(strict) {\
@@ -207,8 +207,8 @@
     } \
 }
 
-#define UTF16_SET_CHAR_START_SAFE(s, i) { \
-    if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>0 && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
+#define UTF16_SET_CHAR_START_SAFE(s, start, i) { \
+    if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
         --(i); \
     } \
 }
@@ -267,11 +267,11 @@
 
 /* safe versions with error-checking and optional regularity-checking */
 
-#define UTF16_PREV_CHAR_SAFE(s, i, c, strict) { \
+#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \
     (c)=(s)[--(i)]; \
     if(UTF_IS_SECOND_SURROGATE(c)) { \
         uint16_t __c2; \
-        if((i)>0 && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
+        if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
             --(i); \
             (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
             /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
@@ -286,22 +286,22 @@
     } \
 }
 
-#define UTF16_BACK_1_SAFE(s, i) { \
-    if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>0 && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
+#define UTF16_BACK_1_SAFE(s, start, i) { \
+    if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
         --(i); \
     } \
 }
 
-#define UTF16_BACK_N_SAFE(s, i, n) { \
+#define UTF16_BACK_N_SAFE(s, start, i, n) { \
     UTextOffset __N=(n); \
-    while(__N>0 && (i)>0) { \
-        UTF16_BACK_1_SAFE(s, i); \
+    while(__N>0 && (i)>(start)) { \
+        UTF16_BACK_1_SAFE(s, start, i); \
         --__N; \
     } \
 }
 
-#define UTF16_SET_CHAR_LIMIT_SAFE(s, i, length) { \
-    if((i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
+#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
+    if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
         ++(i); \
     } \
 }
diff --git a/icu4c/source/common/unicode/utf32.h b/icu4c/source/common/unicode/utf32.h
index a6de5d75da1..1e1e339b0e1 100644
--- a/icu4c/source/common/unicode/utf32.h
+++ b/icu4c/source/common/unicode/utf32.h
@@ -54,7 +54,7 @@
     (c)=(s)[i]; \
 }
 
-#define UTF32_GET_CHAR_SAFE(s, i, length, c, strict) { \
+#define UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
     (c)=(s)[i]; \
     if(!UTF32_IS_SAFE(c, strict)) { \
         (c)=UTF_ERROR_VALUE; \
@@ -107,7 +107,7 @@
     } \
 }
 
-#define UTF32_SET_CHAR_START_SAFE(s, i) { \
+#define UTF32_SET_CHAR_START_SAFE(s, start, i) { \
 }
 
 /* definitions with backward iteration -------------------------------------- */
@@ -127,24 +127,21 @@
 #define UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) { \
 }
 
-#define UTF32_PREV_CHAR_SAFE(s, i, c, strict) { \
+#define UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) { \
     (c)=(s)[--(i)]; \
     if(!UTF32_IS_SAFE(c, strict)) { \
         (c)=UTF_ERROR_VALUE; \
     } \
 }
 
-#define UTF32_BACK_1_SAFE(s, i) { \
-    if((i)>0) { \
-        --(i); \
-    } \
+#define UTF32_BACK_1_SAFE(s, start, i) { \
+    --(i); \
 }
 
-#define UTF32_BACK_N_SAFE(s, i, n) { \
-    if((i)>=(n)) { \
-        (i)-=(n); \
-    } else { \
-        (i)=0; \
+#define UTF32_BACK_N_SAFE(s, start, i, n) { \
+    (i)-=(n); \
+    if((i)<(start)) { \
+        (i)=(start); \
     } \
 }
 
diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h
index 158c96798e8..58672a19bb4 100644
--- a/icu4c/source/common/unicode/utf8.h
+++ b/icu4c/source/common/unicode/utf8.h
@@ -45,10 +45,10 @@ U_CAPI UTextOffset U_EXPORT2
 utf8_appendCharSafeBody(uint8_t *s, UTextOffset i, UTextOffset length, UChar32 c);
 
 U_CAPI UChar32 U_EXPORT2
-utf8_prevCharSafeBody(const uint8_t *s, UTextOffset *pi, UChar32 c, bool_t strict);
+utf8_prevCharSafeBody(const uint8_t *s, UTextOffset start, UTextOffset *pi, UChar32 c, bool_t strict);
 
 U_CAPI UTextOffset U_EXPORT2
-utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
+utf8_back1SafeBody(const uint8_t *s, UTextOffset start, UTextOffset i);
 
 /*
  * For the semantics of all of these macros, see utf16.h.
@@ -110,9 +110,9 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
     UTF8_NEXT_CHAR_UNSAFE(s, __I, c); \
 }
 
-#define UTF8_GET_CHAR_SAFE(s, i, length, c, strict) { \
+#define UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
     UTextOffset __I=(UTextOffset)(i); \
-    UTF8_SET_CHAR_START_SAFE(s, __I); \
+    UTF8_SET_CHAR_START_SAFE(s, start, __I); \
     UTF8_NEXT_CHAR_SAFE(s, __I, length, c, strict); \
 }
 
@@ -131,7 +131,7 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
  */
 #define UTF8_NEXT_CHAR_UNSAFE(s, i, c) { \
     (c)=(s)[(i)++]; \
-    if((c)&0x80) { \
+    if((uint8_t)((c)-0xc0)<0x35) { \
         uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \
         UTF8_MASK_LEAD_BYTE(c, __count); \
         switch(__count) { \
@@ -185,7 +185,7 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
 
 #define UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
     (c)=(s)[(i)++]; \
-    if((c)&0x80) { \
+    if(UTF8_IS_LEAD(c)) { \
         (c)=utf8_nextCharSafeBody(s, &(i), (UTextOffset)(length), c, strict); \
     } \
 }
@@ -199,17 +199,15 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
 }
 
 #define UTF8_FWD_1_SAFE(s, i, length) { \
-    if((i)<(length)) { \
-        uint8_t __b=(s)[(i)++]; \
-        if(__b&0x80) { \
-            uint8_t __count=UTF8_COUNT_TRAIL_BYTES(__b); \
-            if((i)+__count>(length)) { \
-                __count=(length)-(i); \
-            } \
-            while(__count>0 && UTF8_IS_TRAIL((s)[i])) { \
-                ++(i); \
-                --__count; \
-            } \
+    uint8_t __b=(s)[(i)++]; \
+    if(UTF8_IS_LEAD(__b)) { \
+        uint8_t __count=UTF8_COUNT_TRAIL_BYTES(__b); \
+        if((i)+__count>(length)) { \
+            __count=(length)-(i); \
+        } \
+        while(__count>0 && UTF8_IS_TRAIL((s)[i])) { \
+            ++(i); \
+            --__count; \
         } \
     } \
 }
@@ -222,9 +220,9 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
     } \
 }
 
-#define UTF8_SET_CHAR_START_SAFE(s, i) { \
-    if((s)[(i)]&0x80) { \
-        (i)=utf8_back1SafeBody(s, (UTextOffset)(i)); \
+#define UTF8_SET_CHAR_START_SAFE(s, start, i) { \
+    if(UTF8_IS_TRAIL((s)[(i)])) { \
+        (i)=utf8_back1SafeBody(s, start, (UTextOffset)(i)); \
     } \
 }
 
@@ -232,10 +230,10 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
 
 #define UTF8_PREV_CHAR_UNSAFE(s, i, c) { \
     (c)=(s)[--(i)]; \
-    if((c)&0x80) { \
+    if(UTF8_IS_TRAIL(c)) { \
         uint8_t __b, __count=1, __shift=6; \
 \
-        /* c must be a trail byte */ \
+        /* c is a trail byte */ \
         (c)&=0x3f; \
         for(;;) { \
             __b=(s)[--(i)]; \
@@ -269,33 +267,35 @@ utf8_back1SafeBody(const uint8_t *s, UTextOffset i);
     UTF8_FWD_1_UNSAFE(s, i); \
 }
 
-#define UTF8_PREV_CHAR_SAFE(s, i, c, strict) { \
+#define UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) { \
     (c)=(s)[--(i)]; \
-    if((c)&0x80) { \
-        (c)=utf8_prevCharSafeBody(s, &(i), c, strict); \
+    if(UTF8_IS_TRAIL((c))) { \
+        (c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \
     } \
 }
 
-#define UTF8_BACK_1_SAFE(s, i) { \
-    if((i)>0) { \
-        --(i); \
-        if((s)[(i)]&0x80) { \
-            (i)=utf8_back1SafeBody(s, (UTextOffset)(i)); \
-        } \
+#define UTF8_BACK_1_SAFE(s, start, i) { \
+    if(UTF8_IS_TRAIL((s)[--(i)])) { \
+        (i)=utf8_back1SafeBody(s, start, (UTextOffset)(i)); \
     } \
 }
 
-#define UTF8_BACK_N_SAFE(s, i, n) { \
+#define UTF8_BACK_N_SAFE(s, start, i, n) { \
     UTextOffset __N=(n); \
-    while(__N>0 && (i)>0) { \
-        UTF8_BACK_1_SAFE(s, i); \
+    while(__N>0 && (i)>(start)) { \
+        UTF8_BACK_1_SAFE(s, start, i); \
         --__N; \
     } \
 }
 
-#define UTF8_SET_CHAR_LIMIT_SAFE(s, i, length) { \
-    UTF8_BACK_1_SAFE(s, i); \
-    UTF8_FWD_1_SAFE(s, i, length); \
+#define UTF8_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
+    if((start)<(i) && (i)<(length)) { \
+        UTF8_BACK_1_SAFE(s, start, i); \
+        (i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \
+        if((i)>(length)) { \
+            (i)=(length); \
+        } \
+    } \
 }
 
 #endif
diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp
index cb83a82c39b..b18f1f8cc04 100644
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@@ -1085,7 +1085,7 @@ UnicodeString::trim()
     if(i <= 0) {
       break;
     }
-    UTF_PREV_CHAR(fArray, i, c);
+    UTF_PREV_CHAR(fArray, 0, i, c);
     if(!(c == 0x20 || Unicode::isWhitespace(c))) {
       break;
     }
diff --git a/icu4c/source/common/utf_impl.c b/icu4c/source/common/utf_impl.c
index 8d392063a3b..798635296d9 100644
--- a/icu4c/source/common/utf_impl.c
+++ b/icu4c/source/common/utf_impl.c
@@ -193,69 +193,84 @@ utf8_appendCharSafeBody(uint8_t *s, UTextOffset i, UTextOffset length, UChar32 c
 }
 
 U_CAPI UChar32 U_EXPORT2
-utf8_prevCharSafeBody(const uint8_t *s, UTextOffset *pi, UChar32 c, bool_t strict) {
+utf8_prevCharSafeBody(const uint8_t *s, UTextOffset start, UTextOffset *pi, UChar32 c, bool_t strict) {
     UTextOffset i=*pi;
-    if(UTF8_IS_TRAIL(c)) {
-        uint8_t b, count=1, shift=6;
+    uint8_t b, count=1, shift=6;
 
-        c&=0x3f;
-        while(i>0 && count<6) {
-            b=s[--i];
-            if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
-                if(b&0x40) {
-                    /* lead byte */
-                    uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b);
+    /* extract value bits from the last trail byte */
+    c&=0x3f;
 
-                    if(count==shouldCount) {
-                        *pi=i;
-                        UTF8_MASK_LEAD_BYTE(b, count);
-                        c|=(UChar32)b<<shift;
-                        if( c>0x10ffff ||
-                            (strict) &&
-                                (UTF_IS_SURROGATE(c) ||
-                                 count>=4 || (c)<utf8_minRegular[count] || ((c)&0xfffe)==0xfffe)
-                        ) {
-                            /* irregular sequence */
-                        } else {
-                            return c;
-                        }
+    for(;;) {
+        if(i<=start) {
+            /* no lead byte at all */
+            c=UTF8_ERROR_VALUE_1;
+            break;
+        }
+
+        /* read another previous byte */
+        b=s[--i];
+        if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
+            if(b&0x40) {
+                /* lead byte, this will always end the loop */
+                uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b);
+
+                if(count==shouldCount) {
+                    /* set the new position */
+                    *pi=i;
+                    UTF8_MASK_LEAD_BYTE(b, count);
+                    c|=(UChar32)b<<shift;
+                    if( c>0x10ffff ||
+                        (strict) &&
+                            (UTF_IS_SURROGATE(c) ||
+                             count>=4 || (c)<utf8_minRegular[count] || ((c)&0xfffe)==0xfffe)
+                    ) {
+                        /* irregular sequence */
+                        c=utf8_errorValue[count];
                     } else {
-                        /* the lead byte does not match the number of trail bytes */
-                        /* only set the position to the lead byte if it would
-                           include the trail byte that we started with */
-                        if(count<shouldCount) {
-                            *pi=i;
-                        }
+                        /* exit with correct c */
                     }
-                    break;
                 } else {
-                    /* trail byte */
-                    c|=(UChar32)(b&0x3f)<<shift;
-                    ++count;
-                    shift+=6;
+                    /* the lead byte does not match the number of trail bytes */
+                    /* only set the position to the lead byte if it would
+                       include the trail byte that we started with */
+                    if(count<shouldCount) {
+                        *pi=i;
+                        c=utf8_errorValue[count];
+                    } else {
+                        c=UTF8_ERROR_VALUE_1;
+                    }
                 }
+                break;
+            } else if(count<5) {
+                /* trail byte */
+                c|=(UChar32)(b&0x3f)<<shift;
+                ++count;
+                shift+=6;
             } else {
-                /* single-byte character precedes trailing bytes */
+                /* more than 5 trail bytes is illegal */
+                c=UTF8_ERROR_VALUE_1;
                 break;
             }
+        } else {
+            /* single-byte character or illegal byte (0xfe/0xff) precedes trailing bytes */
+            c=UTF8_ERROR_VALUE_1;
+            break;
         }
-        /* i==0 or count==6 - no lead byte in legal distance */
-    /* } else { called with single lead byte */
     }
-    return UTF_ERROR_VALUE;
+    return c;
 }
 
 U_CAPI UTextOffset U_EXPORT2
-utf8_back1SafeBody(const uint8_t *s, UTextOffset i) {
+utf8_back1SafeBody(const uint8_t *s, UTextOffset start, UTextOffset i) {
     /* i had been decremented once before the function call */
     UTextOffset I=i, Z;
     uint8_t b;
 
     /* read at most the 6 bytes s[Z] to s[i], inclusively */
-    if(I>5) {
+    if(I-5>start) {
         Z=I-5;
     } else {
-        Z=0;
+        Z=start;
     }
 
     /* return I if the sequence starting there is long enough to include i */