ICU-68 construct UnicodeString from invariant char*, alias from UChar*

X-SVN-Rev: 268
2025-04-07 14:31:31 +00:00 · 1999-12-01 17:51:53 +00:00 · 1999-12-01 17:51:53 +00:00 · bbf2815b33
commit bbf2815b33
parent 6f38a88ec1
3 changed files with 160 additions and 46 deletions
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@ -17,13 +17,15 @@
 *******************************************************************************
 */

-#include "unistr.h"

+#include "utypes.h"
+#include "putil.h"
 #include "locid.h"
 #include "cstring.h"
 #include "cmemory.h"
 #include "ustring.h"
 #include "mutex.h"
+#include "unistr.h"

 #if 0
 //DEBUGGING
@ -152,6 +154,21 @@ UnicodeString::UnicodeString( const UChar *text,
  doReplace(0, 0, text, 0, textLength);
 }

+/*
+ * Aliasing constructor: the new string points directly at the caller's
+ * buffer (fArray = text) and is marked as not ref-counted.
+ *
+ * isTerminated  TRUE if text is NUL-terminated; one extra unit of capacity
+ *               is then counted for the terminator.
+ * text          the buffer to alias; a null pointer results in a call to
+ *               setToBogus().
+ * textLength    number of UChars in text, or -1 to measure a terminated
+ *               buffer with u_strlen().
+ *
+ * Any negative resulting length (including -1 with !isTerminated) also
+ * results in a call to setToBogus().
+ */
+UnicodeString::UnicodeString(bool_t isTerminated,
+                             UChar *text,
+                             int32_t textLength)
+  : fArray(text),
+    // never hand a null pointer to u_strlen(); the check in the body handles it
+    fLength(textLength != -1 || !isTerminated || text == 0 ? textLength : u_strlen(text)),
+    // reads fLength: valid because fLength is declared before fCapacity
+    fCapacity(isTerminated ? fLength + 1 : fLength),
+    // initializers follow the member declaration order (fHashCode, then fRefCounted)
+    fHashCode(kInvalidHashCode),
+    fRefCounted(FALSE),
+    fBogus(FALSE)
+{
+  if(text == 0 || fLength < 0) {
+    setToBogus();
+  }
+}
+
 UnicodeString::UnicodeString(const char *codepageData,
                 const char *codepage)
  : fArray(fStackBuffer),
@ -621,10 +638,10 @@ UnicodeString::doReplace(UTextOffset start,
      // don't delete it until the end of the method.  this can happen
      // in code like UnicodeString s = "foo"; s += s;
      if(srcChars != getArrayStart())
-    delete [] fArray;
+        delete [] fArray;
      else {
-    deleteWhenDone = TRUE;
-    bufferToDelete = fArray;
+        deleteWhenDone = TRUE;
+        bufferToDelete = fArray;
      }
    }

@ -771,15 +788,19 @@ UnicodeString::extract(UTextOffset start,
  int32_t arraySize        = 0x0FFFFFFF;

  // create the converter
-  UConverter *converter = 0;
+  UConverter *converter;

  // if the codepage is the default, use our cache
-  if(codepage == 0)
+  if(codepage == 0) {
    converter = getDefaultConverter(status);
-  else
+  } else if(*codepage == 0) {
+    converter = 0;
+  } else {
    converter = ucnv_open(codepage, &status);
+  }

  // if we failed, set the appropriate flags and return
+  // (an empty-string codepage is not a failure: converter is 0 and the invariant path below is taken)
  if(U_FAILURE(status)) {
    // close the converter
    if(codepage == 0)
@ -789,14 +810,22 @@ UnicodeString::extract(UTextOffset start,
    return 0;
  }

+  // perform the conversion
+  if(converter == 0) {
+    // use the "invariant characters" conversion
+    if(length > fLength - start) {
+      length = fLength - start;
+    }
+    u_UCharsToChars(mySource, myTarget, length);
+    return length;
+  }
+
+  // there is no loop here since we assume the buffer is large enough
  myTargetLimit = myTarget + arraySize;

  if(myTargetLimit < myTarget)  /* ptr wrapped around: pin to U_MAX_PTR */
    myTargetLimit = (char*)U_MAX_PTR; 

-  // perform the conversion
-  // there is no loop here since we assume the buffer is large enough
-
  ucnv_fromUnicode(converter, &myTarget,  myTargetLimit,
           &mySource, mySourceEnd, NULL, TRUE, &status);

@ -822,7 +851,7 @@ UnicodeString::doCodepageCreate(const char *codepageData,
  int32_t sourceLen        = dataLength;
  const char *mySource     = codepageData;
  const char *mySourceEnd  = mySource + sourceLen;
-  UChar *myTarget          = getArrayStart();
+  UChar *myTarget;
  UErrorCode status        = U_ZERO_ERROR;
  int32_t arraySize        = getCapacity();

@ -830,9 +859,12 @@ UnicodeString::doCodepageCreate(const char *codepageData,
  UConverter *converter = 0;

  // if the codepage is the default, use our cache
-  converter = (codepage == 0
-           ? getDefaultConverter(status)
-           : ucnv_open(codepage, &status));
+  // if it is an empty string, then use the "invariant character" conversion
+  converter = (codepage == 0 ?
+                 getDefaultConverter(status) :
+                 *codepage == 0 ?
+                   0 :
+                   ucnv_open(codepage, &status));

  // if we failed, set the appropriate flags and return
  if(U_FAILURE(status)) {
@ -845,8 +877,37 @@ UnicodeString::doCodepageCreate(const char *codepageData,
    return;
  }

+  fHashCode = kInvalidHashCode;
+
  // perform the conversion
-  do {
+  if(converter == 0) {
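+    // use the "invariant characters" conversion
+    // NOTE(review): the lines shown for this path never assign fLength (or fRefCounted
+    // after the reallocation) before returning - confirm they are set elsewhere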
+    if(arraySize < dataLength) {
+      int32_t tempCapacity;
+      // allocate enough space for the dataLength, the refCount, and a NUL
+      UChar *temp = allocate(dataLength + 2, tempCapacity);
+
+      if(temp == 0) {
+        // set flags and return
+        setToBogus();
+        return;
+      }
+
+      fArray      = temp;
+      fCapacity   = tempCapacity;
+
+      setRefCount(1);
+
+      u_charsToUChars(codepageData, fArray + 1, dataLength);
+      fArray[dataLength + 1] = 0;
+    } else {
+      u_charsToUChars(codepageData, getArrayStart(), dataLength);
+    }
+    return;
+  }
+
+  myTarget = getArrayStart();
+  for(;;) {
    // reset the error code
    status = U_ZERO_ERROR;

@ -859,30 +920,24 @@ UnicodeString::doCodepageCreate(const char *codepageData,
    arraySize    = getCapacity() - fLength;

    // allocate more space and copy data, if needed
-    if(fLength < dataLength) {
+    if(status == U_INDEX_OUTOFBOUNDS_ERROR) {
      int32_t tempCapacity;
      UChar *temp = allocate(fCapacity, tempCapacity);

      if(! temp) {
-    // close the converter
-    if(codepage == 0)
-      releaseDefaultConverter(converter);
-    else
-      ucnv_close(converter);
-    // set flags and return
-    setToBogus();
-    return;
+        // set flags and return
+        setToBogus();
+        break;
      }

-      // if we're not currently ref counted, shift the array right by one
-      if(fRefCounted == FALSE)
-    us_arrayCopy(fArray, 0, temp, 1, fLength);
-      // otherwise, copy the old array into temp, including the ref count
-      else
-          us_arrayCopy(fArray, 0, temp, 0, fLength + 1);
-
-      if(fRefCounted && removeRef() == 0)
-    delete [] fArray;
+      if(fRefCounted) {
+        // copy the old array into temp, including the ref count
+        us_arrayCopy(fArray, 0, temp, 0, fLength + 1);
+        delete [] fArray;
+      } else {
+        // if we're not currently ref counted, shift the array right by one
+        us_arrayCopy(fArray, 0, temp, 1, fLength);
+      }

      fArray      = temp;
      fCapacity   = tempCapacity;
@ -891,11 +946,10 @@ UnicodeString::doCodepageCreate(const char *codepageData,

      myTarget    = getArrayStart() + fLength;
      arraySize   = getCapacity() - fLength;
+    } else {
+      break;
    }
  }
-  while(status == U_INDEX_OUTOFBOUNDS_ERROR);
-
-  fHashCode = kInvalidHashCode;

  // close the converter
  if(codepage == 0)
@ -925,9 +979,6 @@ UnicodeString::getUChars() const
  if(fBogus)
    return 0;

-  // clone our array, if necessary
-  ((UnicodeString*)this)->cloneArrayIfNeeded();
-
  // no room for null, resize
  if(getCapacity() <= fLength) {
    // allocate at minimum the current capacity + needed space
@ -955,8 +1006,10 @@ UnicodeString::getUChars() const
    ((UnicodeString*)this)->setRefCount(1);
  }

-  // tack on a trailing null
-  fArray[(fRefCounted ? 1 : 0) + fLength] = 0;
+  if(getArrayStart()[fLength] != 0) {
+    // tack on a trailing null
+    ((UChar *)getArrayStart())[fLength] = 0;
+  }

  return getArrayStart();
 }
--- a/icu4c/source/common/unistr.h
+++ b/icu4c/source/common/unistr.h
@ -40,6 +40,30 @@
 class Locale;
 class UCharReference;

+/**
+ * Unicode String literals in C++.
+ * Dependent on the platform properties, different UnicodeString
+ * constructors should be used to create a UnicodeString object from
+ * a string literal.
+ * The macros are defined for maximum performance.
+ * They work only for strings that contain "invariant characters", i.e.,
+ * only latin letters, digits, and some punctuation.
+ * See utypes.h for details.
+ *
+ * The string parameter must be a C string literal.
+ * The length of the string, not including the terminating
+ * <code>NUL</code>, must be specified as a constant.
+ * Example: <code>UnicodeString s = UNICODE_STRING("abc", 3);</code>
+ *
+ * In the first two configurations below the macro passes the literal
+ * itself to the aliasing constructor, so the resulting string should not
+ * be modified in place. In the remaining configuration the literal is
+ * converted with the "invariant characters" conversion (empty-string
+ * codepage).
+ */
+#if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY
+#   define UNICODE_STRING(cs, length) UnicodeString(TRUE, (UChar *)L ## cs, length)
+#elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
+#   define UNICODE_STRING(cs, length) UnicodeString(TRUE, (UChar *)cs, length)
+#else
+#   define UNICODE_STRING(cs, length) UnicodeString(cs, length, "")
+#endif
+
 /**
 * UnicodeString is a concrete implementation of the abstract class 
 * UnicodeText.  UnicodeString performs codeset conversion from char*
@ -678,6 +702,9 @@ public:
   * @param target the target buffer for extraction
   * @param codepage the desired codepage for the characters.  0 has 
   * the special meaning of the default codepage
+   * If <code>codepage</code> is an empty string (<code>""</code>),
+   * then a simple conversion is performed on the codepage-invariant
+   * subset ("invariant characters") of the platform encoding. See utypes.h.
   * @return the number of characters written to <TT>dst</TT>
   */
  int32_t extract(UTextOffset start,
@ -1302,12 +1329,34 @@ public:
  UnicodeString(const UChar *text,
        int32_t textLength);

+  /**
+   * Aliasing UChar* constructor.
+   * The text will be used for the new UnicodeString object, but
+   * it will not be released when the UnicodeString is destroyed.
+   * The buffer is used in place rather than copied, so it has to stay
+   * valid for as long as this object still refers to it.
+   * Be careful not to attempt to modify the contents of the UnicodeString
+   * if the text is read-only. Operations that allocate an entirely
+   * new buffer are harmless.
+   *
+   * @param isTerminated specifies if <code>text</code> is <code>NUL</code>-terminated.
+   *                     This must be true if <code>textLength==-1</code>;
+   *                     otherwise the new string is set to bogus.
+   * @param text The characters to alias for the UnicodeString.
+   * @param textLength The number of Unicode characters in <code>text</code> to alias.
+   *                   If -1, then this constructor will determine the length
+   *                   by calling <code>u_strlen()</code>.
+   *                   Any other negative value sets the new string to bogus.
+   */
+  UnicodeString(bool_t isTerminated,
+                UChar *text,
+                int32_t textLength);
+
  /**
   * char* constructor.
   * @param codepageData an array of bytes, null-terminated
   * @param codepage the encoding of <TT>codepageData</TT>.  The special
   * value 0 for <TT>codepage</TT> indicates that the text is in the 
   * platform's default codepage.
+   * If <code>codepage</code> is an empty string (<code>""</code>),
+   * then a simple conversion is performed on the codepage-invariant
+   * subset ("invariant characters") of the platform encoding. See utypes.h.
   */
  UnicodeString(const char *codepageData,
        const char *codepage = 0);
@ -1319,6 +1368,9 @@ public:
   * @param codepage the encoding of <TT>codepageData</TT>.  The special
   * value 0 for <TT>codepage</TT> indicates that the text is in the 
   * platform's default codepage.
+   * If <code>codepage</code> is an empty string (<code>""</code>),
+   * then a simple conversion is performed on the codepage-invariant
+   * subset ("invariant characters") of the platform encoding. See utypes.h.
   */
  UnicodeString(const char *codepageData,
        int32_t dataLength,
@ -1454,7 +1506,16 @@ private:
  void pinIndices(UTextOffset& start,
          int32_t& length) const;

-  // Real ctor for converting from codepage data
+  /*
+   * Real constructor for converting from codepage data.
+   * It assumes that it is called with !fRefCounted.
+   *
+   * If <code>codepage==0</code>, then the default converter
+   * is used for the platform encoding.
+   * If <code>codepage</code> is an empty string (<code>""</code>),
+   * then a simple conversion is performed on the codepage-invariant
+   * subset ("invariant characters") of the platform encoding. See utypes.h.
+   */
  void doCodepageCreate(const char *codepageData,
            int32_t dataLength,
            const char *codepage);
@ -1472,8 +1533,8 @@ private:
  UChar     *fArray;        // the Unicode data
  int32_t   fLength;        // number characters in fArray
  int32_t   fCapacity;      // sizeof fArray
-  bool_t    fRefCounted;    // indicates if we own storage
  int32_t   fHashCode;      // the hash code
+  bool_t    fRefCounted;    // indicates if we own storage
  bool_t    fBogus;         // indicates if an operation failed

  // constants
--- a/icu4c/source/common/ustring.h
+++ b/icu4c/source/common/ustring.h
@ -174,7 +174,7 @@ U_CAPI char* U_EXPORT2 u_austrcpy(char *s1,
 * parameters.
 * The string parameter must be a C string literal.
 * The length of the string, not including the terminating
- * <code>NUL</code> must be specified as a constant.
+ * <code>NUL</code>, must be specified as a constant.
 * The U_STRING_DECL macro should be invoked exactly once for one
 * such string variable before it is used.
 *
@ -195,10 +195,10 @@ U_CAPI char* U_EXPORT2 u_austrcpy(char *s1,
 * </pre>
 */
 #if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY
-#   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (UChar *)L ## cs }
+#   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (const UChar *)L ## cs }
 #   define U_STRING_INIT(var, cs, length)
 #elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
-#   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (UChar *)cs }
+#   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (const UChar *)cs }
 #   define U_STRING_INIT(var, cs, length)
 #else
 #   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]