From 5d32123d29ed5ef80a19959138ca919f835c1242 Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Sat, 5 Aug 2006 21:27:11 +0000
Subject: [PATCH] ICU-4558 fix and clarify utfxx.h API docs for input string
 indexes for U16_ and U8_ macros

X-SVN-Rev: 19988
---
 icu4c/source/common/unicode/utf.h   | 10 ++++++++++
 icu4c/source/common/unicode/utf16.h | 20 +++++++++----------
 icu4c/source/common/unicode/utf8.h  | 30 ++++++++++++++---------------
 3 files changed, 35 insertions(+), 25 deletions(-)
diff --git a/icu4c/source/common/unicode/utf.h b/icu4c/source/common/unicode/utf.h
index 32c2d74a2e6..2dfef63d66f 100644
--- a/icu4c/source/common/unicode/utf.h
+++ b/icu4c/source/common/unicode/utf.h
@@ -63,6 +63,14 @@
  * malformed sequences can be expressed unambiguously with a distinct subrange
  * of Unicode code points.)
  *
+ * The regular "safe" macros require that the initial, passed-in string index
+ * is within bounds. They only check the index when they read more than one
+ * code unit. Callers usually keep the index in bounds with a loop like this:
+ * <pre>while(i<length) {
+ *   U16_NEXT(s, i, length, c);
+ *   // use c
+ * }</pre>
+ *
  * When it is safe to assume that text is well-formed UTF-16
  * (does not contain single, unpaired surrogates), then one can use
  * U16_..._UNSAFE macros.
@@ -80,6 +88,8 @@
  * The unsafe UTF-8 macros are entirely implemented inside the macro definitions
  * and are fast, while the safe UTF-8 macros call functions for all but the
  * trivial (ASCII) cases.
+ * (ICU 3.6 optimizes U8_NEXT() and U8_APPEND() to handle most other common
+ * characters inline as well.)
  *
  * Unlike with UTF-16, malformed sequences cannot be expressed with distinct
  * code point values (0..U+10ffff). They are indicated with negative values instead.
diff --git a/icu4c/source/common/unicode/utf16.h b/icu4c/source/common/unicode/utf16.h
index 217c27429b7..cd8c5c1ed10 100644
--- a/icu4c/source/common/unicode/utf16.h
+++ b/icu4c/source/common/unicode/utf16.h
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 1999-2005, International Business Machines
+*   Copyright (C) 1999-2006, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@@ -179,7 +179,7 @@
  *
  * @param s const UChar * string
  * @param start starting string offset (usually 0)
- * @param i string offset, start<=i<length
+ * @param i string offset, must be start<=i<length
  * @param length string length
  * @param c output UChar32 variable
  * @see U16_GET_UNSAFE
@@ -243,7 +243,7 @@
  * will be returned as the code point.
  *
  * @param s const UChar * string
- * @param i string offset, i<length
+ * @param i string offset, must be i<length
  * @param length string length
  * @param c output UChar32 variable
  * @see U16_NEXT_UNSAFE
@@ -292,7 +292,7 @@
  * then isError is set to TRUE.
  *
  * @param s const UChar * string buffer
- * @param i string offset, i<length
+ * @param i string offset, must be i<capacity
  * @param capacity size of the string buffer
  * @param c code point to append
  * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
@@ -332,7 +332,7 @@
  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
  *
  * @param s const UChar * string
- * @param i string offset, i<length
+ * @param i string offset, must be i<length
  * @param length string length
  * @see U16_FWD_1_UNSAFE
  * @stable ICU 2.4
@@ -370,7 +370,7 @@
  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
  *
  * @param s const UChar * string
- * @param i string offset, i<length
+ * @param i string offset, must be i<length
  * @param length string length
  * @param n number of code points to skip
  * @see U16_FWD_N_UNSAFE
@@ -413,7 +413,7 @@
  *
  * @param s const UChar * string
  * @param start starting string offset (usually 0)
- * @param i string offset, start<=i
+ * @param i string offset, must be start<=i
  * @see U16_SET_CP_START_UNSAFE
  * @stable ICU 2.4
  */
@@ -468,7 +468,7 @@
  *
  * @param s const UChar * string
  * @param start starting string offset (usually 0)
- * @param i string offset, start<=i
+ * @param i string offset, must be start<i
  * @param c output UChar32 variable
  * @see U16_PREV_UNSAFE
  * @stable ICU 2.4
@@ -509,7 +509,7 @@
  *
  * @param s const UChar * string
  * @param start starting string offset (usually 0)
- * @param i string offset, start<=i
+ * @param i string offset, must be start<i
  * @see U16_BACK_1_UNSAFE
  * @stable ICU 2.4
  */
@@ -549,7 +549,7 @@
  *
  * @param s const UChar * string
  * @param start start of string
- * @param i string offset, i<length
+ * @param i string offset, must be start<i
  * @param n number of code points to skip
  * @see U16_BACK_N_UNSAFE
  * @stable ICU 2.4
diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h
index 4abd50cd82d..ff788403048 100644
--- a/icu4c/source/common/unicode/utf8.h
+++ b/icu4c/source/common/unicode/utf8.h
@@ -181,7 +181,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  *
  * @param s const uint8_t * string
  * @param start starting string offset
- * @param i string offset, start<=i<length
+ * @param i string offset, must be start<=i<length
  * @param length string length
  * @param c output UChar32 variable, set to <0 in case of an error
  * @see U8_GET_UNSAFE
@@ -243,7 +243,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  * c is set to a negative value.
  *
  * @param s const uint8_t * string
- * @param i string offset, i<length
+ * @param i string offset, must be i<length
  * @param length string length
  * @param c output UChar32 variable, set to <0 in case of an error
  * @see U8_NEXT_UNSAFE
@@ -320,25 +320,25 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  * then isError is set to TRUE.
  *
  * @param s const uint8_t * string buffer
- * @param i string offset, i<length
- * @param length size of the string buffer
+ * @param i string offset, must be i<capacity
+ * @param capacity size of the string buffer
  * @param c code point to append
  * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
  * @see U8_APPEND_UNSAFE
  * @stable ICU 2.4
  */
-#define U8_APPEND(s, i, length, c, isError) { \
+#define U8_APPEND(s, i, capacity, c, isError) { \
     if((uint32_t)(c)<=0x7f) { \
         (s)[(i)++]=(uint8_t)(c); \
-    } else if((uint32_t)(c)<=0x7ff && (i)+1<(length)) { \
+    } else if((uint32_t)(c)<=0x7ff && (i)+1<(capacity)) { \
         (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
         (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
-    } else if((uint32_t)(c)<=0xd7ff && (i)+2<(length)) { \
+    } else if((uint32_t)(c)<=0xd7ff && (i)+2<(capacity)) { \
         (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
         (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
         (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
     } else { \
-        (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(length), c, &(isError)); \
+        (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(capacity), c, &(isError)); \
     } \
 }
 
@@ -362,7 +362,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  * "Safe" macro, checks for illegal sequences and for string boundaries.
  *
  * @param s const uint8_t * string
- * @param i string offset, i<length
+ * @param i string offset, must be i<length
  * @param length string length
  * @see U8_FWD_1_UNSAFE
  * @stable ICU 2.4
@@ -408,7 +408,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  * "Safe" macro, checks for illegal sequences and for string boundaries.
  *
  * @param s const uint8_t * string
- * @param i string offset, i<length
+ * @param i string offset, must be i<length
  * @param length string length
  * @param n number of code points to skip
  * @see U8_FWD_N_UNSAFE
@@ -449,7 +449,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  *
  * @param s const uint8_t * string
  * @param start starting string offset (usually 0)
- * @param i string offset, start<=i
+ * @param i string offset, must be start<=i
  * @see U8_SET_CP_START_UNSAFE
  * @stable ICU 2.4
  */
@@ -517,7 +517,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  *
  * @param s const uint8_t * string
  * @param start starting string offset (usually 0)
- * @param i string offset, start<=i
+ * @param i string offset, must be start<i
  * @param c output UChar32 variable, set to <0 in case of an error
  * @see U8_PREV_UNSAFE
  * @stable ICU 2.4
@@ -556,7 +556,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  *
  * @param s const uint8_t * string
  * @param start starting string offset (usually 0)
- * @param i string offset, start<=i
+ * @param i string offset, must be start<i
  * @see U8_BACK_1_UNSAFE
  * @stable ICU 2.4
  */
@@ -596,7 +596,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  *
  * @param s const uint8_t * string
  * @param start index of the start of the string
- * @param i string offset, i<length
+ * @param i string offset, must be start<i
  * @param n number of code points to skip
  * @see U8_BACK_N_UNSAFE
  * @stable ICU 2.4
@@ -637,7 +637,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  *
  * @param s const uint8_t * string
  * @param start starting string offset (usually 0)
- * @param i string offset, start<=i<=length
+ * @param i string offset, must be start<=i<=length
  * @param length string length
  * @see U8_SET_CP_LIMIT_UNSAFE
  * @stable ICU 2.4