From bbf2815b33cf31ed8e46bf9e1cb258c81fd50470 Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Wed, 1 Dec 1999 17:51:53 +0000
Subject: [PATCH] ICU-68 construct UnicodeString from invariant char*, alias
 from UChar*

X-SVN-Rev: 268
---
 icu4c/source/common/unistr.cpp | 135 +++++++++++++++++++++++----------
 icu4c/source/common/unistr.h   |  65 +++++++++++++++-
 icu4c/source/common/ustring.h  |   6 +-
 3 files changed, 160 insertions(+), 46 deletions(-)

diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp
index d3b91fbb9fd..3998d9f3a53 100644
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@@ -17,13 +17,15 @@
 *******************************************************************************
 */
 
-#include "unistr.h"
 
+#include "utypes.h"
+#include "putil.h"
 #include "locid.h"
 #include "cstring.h"
 #include "cmemory.h"
 #include "ustring.h"
 #include "mutex.h"
+#include "unistr.h"
 
 #if 0
 //DEBUGGING
@@ -152,6 +154,21 @@ UnicodeString::UnicodeString( const UChar *text,
   doReplace(0, 0, text, 0, textLength);
 }
 
+UnicodeString::UnicodeString(bool_t isTerminated,  // aliasing constructor: text is used in place
+                             UChar *text,
+                             int32_t textLength)
+  : fArray(text),  // no copy is made here; the string is not ref-counted (fRefCounted is FALSE)
+    fLength((textLength != -1 || !isTerminated || text == 0) ? textLength : u_strlen(text)),
+    fCapacity(isTerminated ? fLength + 1 : fLength),  // uses fLength, which is declared earlier
+    fHashCode(kInvalidHashCode),  // initializers follow the member declaration order
+    fRefCounted(FALSE),
+    fBogus(FALSE)
+{
+  if(fLength < 0) {  // -1 that could not be measured (unterminated or null text), or a bad length
+    setToBogus();
+  }
+}
+
 UnicodeString::UnicodeString(const char *codepageData,
                  const char *codepage)
   : fArray(fStackBuffer),
@@ -621,10 +638,10 @@ UnicodeString::doReplace(UTextOffset start,
       // don't delete it until the end of the method.  this can happen
       // in code like UnicodeString s = "foo"; s += s;
       if(srcChars != getArrayStart())
-    delete [] fArray;
+        delete [] fArray;
       else {
-    deleteWhenDone = TRUE;
-    bufferToDelete = fArray;
+        deleteWhenDone = TRUE;
+        bufferToDelete = fArray;
       }
     }
 
@@ -771,15 +788,19 @@ UnicodeString::extract(UTextOffset start,
   int32_t arraySize        = 0x0FFFFFFF;
 
   // create the converter
-  UConverter *converter = 0;
+  UConverter *converter;
 
   // if the codepage is the default, use our cache
-  if(codepage == 0)
+  if(codepage == 0) {
     converter = getDefaultConverter(status);
-  else
+  } else if(*codepage == 0) {
+    converter = 0;
+  } else {
     converter = ucnv_open(codepage, &status);
+  }
 
   // if we failed, set the appropriate flags and return
+  // (an empty codepage string leaves converter == 0 and status untouched, so it passes this check)
   if(U_FAILURE(status)) {
     // close the converter
     if(codepage == 0)
@@ -789,14 +810,22 @@ UnicodeString::extract(UTextOffset start,
     return 0;
   }
 
+  // perform the conversion
+  if(converter == 0) {
+    // use the "invariant characters" conversion
+    if(length > fLength - start) {
+      length = fLength - start;
+    }
+    u_UCharsToChars(mySource, myTarget, length);
+    return length;
+  }
+
+  // there is no loop here since we assume the buffer is large enough
   myTargetLimit = myTarget + arraySize;
 
   if(myTargetLimit < myTarget)  /* ptr wrapped around: pin to U_MAX_PTR */
     myTargetLimit = (char*)U_MAX_PTR; 
 
-  // perform the conversion
-  // there is no loop here since we assume the buffer is large enough
-
   ucnv_fromUnicode(converter, &myTarget,  myTargetLimit,
            &mySource, mySourceEnd, NULL, TRUE, &status);
 
@@ -822,7 +851,7 @@ UnicodeString::doCodepageCreate(const char *codepageData,
   int32_t sourceLen        = dataLength;
   const char *mySource     = codepageData;
   const char *mySourceEnd  = mySource + sourceLen;
-  UChar *myTarget          = getArrayStart();
+  UChar *myTarget;
   UErrorCode status        = U_ZERO_ERROR;
   int32_t arraySize        = getCapacity();
 
@@ -830,9 +859,12 @@ UnicodeString::doCodepageCreate(const char *codepageData,
   UConverter *converter = 0;
 
   // if the codepage is the default, use our cache
-  converter = (codepage == 0
-           ? getDefaultConverter(status)
-           : ucnv_open(codepage, &status));
+  // if it is an empty string, then use the "invariant character" conversion
+  converter = (codepage == 0 ?
+                 getDefaultConverter(status) :
+                 *codepage == 0 ?
+                   0 :
+                   ucnv_open(codepage, &status));
 
   // if we failed, set the appropriate flags and return
   if(U_FAILURE(status)) {
@@ -845,8 +877,37 @@ UnicodeString::doCodepageCreate(const char *codepageData,
     return;
   }
 
+  fHashCode = kInvalidHashCode;
+
   // perform the conversion
-  do {
+  if(converter == 0) {
+    // use the "invariant characters" conversion
+    if(arraySize < dataLength) {
+      int32_t tempCapacity;
+      // allocate enough space for the dataLength, the refCount, and a NUL
+      UChar *temp = allocate(dataLength + 2, tempCapacity);
+
+      if(temp == 0) {
+        // set flags and return
+        setToBogus();
+        return;
+      }
+
+      fArray      = temp;
+      fCapacity   = tempCapacity;
+
+      setRefCount(1);
+
+      u_charsToUChars(codepageData, fArray + 1, dataLength);
+      fArray[dataLength + 1] = 0;
+    } else {
+      u_charsToUChars(codepageData, getArrayStart(), dataLength);
+    }
+    return;
+  }
+
+  myTarget = getArrayStart();
+  for(;;) {
     // reset the error code
     status = U_ZERO_ERROR;
 
@@ -859,30 +920,24 @@ UnicodeString::doCodepageCreate(const char *codepageData,
     arraySize    = getCapacity() - fLength;
 
     // allocate more space and copy data, if needed
-    if(fLength < dataLength) {
+    if(status == U_INDEX_OUTOFBOUNDS_ERROR) {
       int32_t tempCapacity;
       UChar *temp = allocate(fCapacity, tempCapacity);
 
       if(! temp) {
-    // close the converter
-    if(codepage == 0)
-      releaseDefaultConverter(converter);
-    else
-      ucnv_close(converter);
-    // set flags and return
-    setToBogus();
-    return;
+        // set flags and return
+        setToBogus();
+        break;
       }
 
-      // if we're not currently ref counted, shift the array right by one
-      if(fRefCounted == FALSE)
-    us_arrayCopy(fArray, 0, temp, 1, fLength);
-      // otherwise, copy the old array into temp, including the ref count
-      else
-          us_arrayCopy(fArray, 0, temp, 0, fLength + 1);
-
-      if(fRefCounted && removeRef() == 0)
-    delete [] fArray;
+      if(fRefCounted) {
+        // copy the old array into temp, including the ref count
+        us_arrayCopy(fArray, 0, temp, 0, fLength + 1);
+        delete [] fArray;
+      } else {
+        // if we're not currently ref counted, shift the array right by one
+        us_arrayCopy(fArray, 0, temp, 1, fLength);
+      }
 
       fArray      = temp;
       fCapacity   = tempCapacity;
@@ -891,11 +946,10 @@ UnicodeString::doCodepageCreate(const char *codepageData,
 
       myTarget    = getArrayStart() + fLength;
       arraySize   = getCapacity() - fLength;
+    } else {
+      break;
     }
   }
-  while(status == U_INDEX_OUTOFBOUNDS_ERROR);
-
-  fHashCode = kInvalidHashCode;
 
   // close the converter
   if(codepage == 0)
@@ -925,9 +979,6 @@ UnicodeString::getUChars() const
   if(fBogus)
     return 0;
 
-  // clone our array, if necessary
-  ((UnicodeString*)this)->cloneArrayIfNeeded();
-
   // no room for null, resize
   if(getCapacity() <= fLength) {
     // allocate at minimum the current capacity + needed space
@@ -955,8 +1006,10 @@ UnicodeString::getUChars() const
     ((UnicodeString*)this)->setRefCount(1);
   }
 
-  // tack on a trailing null
-  fArray[(fRefCounted ? 1 : 0) + fLength] = 0;
+  if(getArrayStart()[fLength] != 0) {
+    // tack on a trailing null
+    ((UChar *)getArrayStart())[fLength] = 0;
+  }
 
   return getArrayStart();
 }
diff --git a/icu4c/source/common/unistr.h b/icu4c/source/common/unistr.h
index 567a8bc8f5d..b55b6f5d9e9 100644
--- a/icu4c/source/common/unistr.h
+++ b/icu4c/source/common/unistr.h
@@ -40,6 +40,30 @@
 class Locale;
 class UCharReference;
 
+/**
+ * Unicode String literals in C++.
+ * Dependent on the platform properties, different UnicodeString
+ * constructors should be used to create a UnicodeString object from
+ * a string literal; UNICODE_STRING(cs, length) selects the suitable one.
+ * The macro is defined for maximum performance.
+ * It works only for strings that contain "invariant characters", i.e.,
+ * only Latin letters, digits, and some punctuation.
+ * See utypes.h for details.
+ *
+ * The string parameter must be a C string literal.
+ * The length of the string, not including the terminating
+ * <code>NUL</code>, must be specified as a constant.
+ * In the first two cases below the object aliases the literal (do not modify it
+ * in place); otherwise it is converted. U_STRING_DECL is not needed here.
+ */
+#if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY
+#   define UNICODE_STRING(cs, length) UnicodeString(TRUE, (UChar *)L ## cs, length)
+#elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
+#   define UNICODE_STRING(cs, length) UnicodeString(TRUE, (UChar *)cs, length)
+#else
+#   define UNICODE_STRING(cs, length) UnicodeString(cs, length, "")
+#endif
+
 /**
  * UnicodeString is a concrete implementation of the abstract class 
  * UnicodeText.  UnicodeString performs codeset conversion from char*
@@ -678,6 +702,9 @@ public:
    * @param target the target buffer for extraction
    * @param codepage the desired codepage for the characters.  0 has 
    * the special meaning of the default codepage
+   * If <code>codepage</code> is an empty string (<code>""</code>),
+   * then a simple conversion is performed on the codepage-invariant
+   * subset ("invariant characters") of the platform encoding. See utypes.h.
    * @return the number of characters written to <TT>dst</TT>
    */
   int32_t extract(UTextOffset start,
@@ -1302,12 +1329,34 @@ public:
   UnicodeString(const UChar *text,
         int32_t textLength);
 
+  /**
+   * Aliasing UChar* constructor.
+   * The text will be used for the new UnicodeString object, but
+   * it will not be released when the UnicodeString is destroyed.
+   * Be careful not to attempt to modify the contents of the UnicodeString
+   * if the text is read-only. Operations that allocate an entirely
+   * new buffer are harmless.
+   *
+   * @param isTerminated specifies if <code>text</code> is <code>NUL</code>-terminated.
+   *                     This must be true if <code>textLength==-1</code>.
+   * @param text The characters to alias for the UnicodeString.
+   * @param textLength The number of Unicode characters in <code>text</code> to alias.
+   *                   If -1, then the length is determined by calling
+   *                   <code>u_strlen()</code>; other negative values give a bogus string.
+   */
+  UnicodeString(bool_t isTerminated,
+                UChar *text,
+                int32_t textLength);
+
   /**
    * char* constructor.
    * @param codepageData an array of bytes, null-terminated
    * @param codepage the encoding of <TT>codepageData</TT>.  The special
    * value 0 for <TT>codepage</TT> indicates that the text is in the 
    * platform's default codepage.
+   * If <code>codepage</code> is an empty string (<code>""</code>),
+   * then a simple conversion is performed on the codepage-invariant
+   * subset ("invariant characters") of the platform encoding. See utypes.h.
    */
   UnicodeString(const char *codepageData,
         const char *codepage = 0);
@@ -1319,6 +1368,9 @@ public:
    * @param codepage the encoding of <TT>codepageData</TT>.  The special
    * value 0 for <TT>codepage</TT> indicates that the text is in the 
    * platform's default codepage.
+   * If <code>codepage</code> is an empty string (<code>""</code>),
+   * then a simple conversion is performed on the codepage-invariant
+   * subset ("invariant characters") of the platform encoding. See utypes.h.
    */
   UnicodeString(const char *codepageData,
         int32_t dataLength,
@@ -1454,7 +1506,16 @@ private:
   void pinIndices(UTextOffset& start,
           int32_t& length) const;
 
-  // Real ctor for converting from codepage data
+  /*
+   * Real constructor for converting from codepage data.
+   * It assumes that it is called with !fRefCounted.
+   *
+   * If <code>codepage==0</code>, then the default converter
+   * is used for the platform encoding.
+   * If <code>codepage</code> is an empty string (<code>""</code>),
+   * then a simple conversion is performed on the codepage-invariant
+   * subset ("invariant characters") of the platform encoding. See utypes.h.
+   */
   void doCodepageCreate(const char *codepageData,
             int32_t dataLength,
             const char *codepage);
@@ -1472,8 +1533,8 @@ private:
   UChar     *fArray;        // the Unicode data
   int32_t   fLength;        // number characters in fArray
   int32_t   fCapacity;      // sizeof fArray
-  bool_t    fRefCounted;    // indicates if we own storage
   int32_t   fHashCode;      // the hash code
+  bool_t    fRefCounted;    // indicates if we own storage
   bool_t    fBogus;         // indicates if an operation failed
 
   // constants
diff --git a/icu4c/source/common/ustring.h b/icu4c/source/common/ustring.h
index 48938586120..76ffb045053 100644
--- a/icu4c/source/common/ustring.h
+++ b/icu4c/source/common/ustring.h
@@ -174,7 +174,7 @@ U_CAPI char* U_EXPORT2 u_austrcpy(char *s1,
  * parameters.
  * The string parameter must be a C string literal.
  * The length of the string, not including the terminating
- * <code>NUL</code> must be specified as a constant.
+ * <code>NUL</code>, must be specified as a constant.
  * The U_STRING_DECL macro should be invoked exactly once for one
  * such string variable before it is used.
  *
@@ -195,10 +195,10 @@ U_CAPI char* U_EXPORT2 u_austrcpy(char *s1,
  * </pre>
  */
 #if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY
-#   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (UChar *)L ## cs }
+#   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (const UChar *)L ## cs }
 #   define U_STRING_INIT(var, cs, length)
 #elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
-#   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (UChar *)cs }
+#   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (const UChar *)cs }
 #   define U_STRING_INIT(var, cs, length)
 #else
 #   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]