diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp
index d3b91fbb9fd..3998d9f3a53 100644
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@@ -17,13 +17,15 @@
*******************************************************************************
*/
-#include "unistr.h"
+#include "utypes.h"
+#include "putil.h"
#include "locid.h"
#include "cstring.h"
#include "cmemory.h"
#include "ustring.h"
#include "mutex.h"
+#include "unistr.h"
#if 0
//DEBUGGING
@@ -152,6 +154,21 @@ UnicodeString::UnicodeString( const UChar *text,
doReplace(0, 0, text, 0, textLength);
}
+/*
+ * Aliasing UChar* constructor.
+ *
+ * The new object points directly at the caller's buffer (fArray = text)
+ * and is marked as not reference-counted; nothing is copied.
+ *
+ * isTerminated  TRUE if text is NUL-terminated; this is required when
+ *               textLength is -1.
+ * text          the characters to alias.
+ * textLength    number of UChars in text, or -1 to have the length
+ *               determined with u_strlen().
+ *
+ * The object is set to bogus if text is NULL or if the resulting length
+ * is negative (textLength < -1, or textLength == -1 without termination).
+ */
+UnicodeString::UnicodeString(bool_t isTerminated,
+                             UChar *text,
+                             int32_t textLength)
+  : fArray(text),
+    // A NULL text is mapped to a negative length so that it is flagged as
+    // bogus below; u_strlen() is only ever called on a non-NULL text.
+    fLength(text == 0 ? -1 :
+            (textLength != -1 || !isTerminated) ? textLength : u_strlen(text)),
+    // fLength is declared before fCapacity, so it is already initialized here.
+    // A terminated buffer has one extra UChar of capacity for the NUL.
+    fCapacity(isTerminated ? fLength + 1 : fLength),
+    // Initializers are listed in member declaration order
+    // (fHashCode precedes fRefCounted in the class).
+    fHashCode(kInvalidHashCode),
+    fRefCounted(FALSE),
+    fBogus(FALSE)
+{
+  if(fLength < 0) {
+    setToBogus();
+  }
+}
+
UnicodeString::UnicodeString(const char *codepageData,
const char *codepage)
: fArray(fStackBuffer),
@@ -621,10 +638,10 @@ UnicodeString::doReplace(UTextOffset start,
// don't delete it until the end of the method. this can happen
// in code like UnicodeString s = "foo"; s += s;
if(srcChars != getArrayStart())
- delete [] fArray;
+ delete [] fArray;
else {
- deleteWhenDone = TRUE;
- bufferToDelete = fArray;
+ deleteWhenDone = TRUE;
+ bufferToDelete = fArray;
}
}
@@ -771,15 +788,19 @@ UnicodeString::extract(UTextOffset start,
int32_t arraySize = 0x0FFFFFFF;
// create the converter
- UConverter *converter = 0;
+ UConverter *converter;
// if the codepage is the default, use our cache
- if(codepage == 0)
+ if(codepage == 0) {
converter = getDefaultConverter(status);
- else
+ } else if(*codepage == 0) {
+ converter = 0;
+ } else {
converter = ucnv_open(codepage, &status);
+ }
// if we failed, set the appropriate flags and return
+ // if it is an empty string, then use the "invariant character" conversion
if(U_FAILURE(status)) {
// close the converter
if(codepage == 0)
@@ -789,14 +810,22 @@ UnicodeString::extract(UTextOffset start,
return 0;
}
+ // perform the conversion
+ if(converter == 0) {
+ // use the "invariant characters" conversion
+ if(length > fLength - start) {
+ length = fLength - start;
+ }
+ u_UCharsToChars(mySource, myTarget, length);
+ return length;
+ }
+
+ // there is no loop here since we assume the buffer is large enough
myTargetLimit = myTarget + arraySize;
if(myTargetLimit < myTarget) /* ptr wrapped around: pin to U_MAX_PTR */
myTargetLimit = (char*)U_MAX_PTR;
- // perform the conversion
- // there is no loop here since we assume the buffer is large enough
-
ucnv_fromUnicode(converter, &myTarget, myTargetLimit,
&mySource, mySourceEnd, NULL, TRUE, &status);
@@ -822,7 +851,7 @@ UnicodeString::doCodepageCreate(const char *codepageData,
int32_t sourceLen = dataLength;
const char *mySource = codepageData;
const char *mySourceEnd = mySource + sourceLen;
- UChar *myTarget = getArrayStart();
+ UChar *myTarget;
UErrorCode status = U_ZERO_ERROR;
int32_t arraySize = getCapacity();
@@ -830,9 +859,12 @@ UnicodeString::doCodepageCreate(const char *codepageData,
UConverter *converter = 0;
// if the codepage is the default, use our cache
- converter = (codepage == 0
- ? getDefaultConverter(status)
- : ucnv_open(codepage, &status));
+ // if it is an empty string, then use the "invariant character" conversion
+ converter = (codepage == 0 ?
+ getDefaultConverter(status) :
+ *codepage == 0 ?
+ 0 :
+ ucnv_open(codepage, &status));
// if we failed, set the appropriate flags and return
if(U_FAILURE(status)) {
@@ -845,8 +877,37 @@ UnicodeString::doCodepageCreate(const char *codepageData,
return;
}
+ fHashCode = kInvalidHashCode;
+
// perform the conversion
- do {
+ if(converter == 0) {
+ // use the "invariant characters" conversion
+ if(arraySize < dataLength) {
+ int32_t tempCapacity;
+ // allocate enough space for the dataLength, the refCount, and a NUL
+ UChar *temp = allocate(dataLength + 2, tempCapacity);
+
+ if(temp == 0) {
+ // set flags and return
+ setToBogus();
+ return;
+ }
+
+ fArray = temp;
+ fCapacity = tempCapacity;
+
+ setRefCount(1);
+
+ u_charsToUChars(codepageData, fArray + 1, dataLength);
+ fArray[dataLength + 1] = 0;
+ } else {
+ u_charsToUChars(codepageData, getArrayStart(), dataLength);
+ }
+ return;
+ }
+
+ myTarget = getArrayStart();
+ for(;;) {
// reset the error code
status = U_ZERO_ERROR;
@@ -859,30 +920,24 @@ UnicodeString::doCodepageCreate(const char *codepageData,
arraySize = getCapacity() - fLength;
// allocate more space and copy data, if needed
- if(fLength < dataLength) {
+ if(status == U_INDEX_OUTOFBOUNDS_ERROR) {
int32_t tempCapacity;
UChar *temp = allocate(fCapacity, tempCapacity);
if(! temp) {
- // close the converter
- if(codepage == 0)
- releaseDefaultConverter(converter);
- else
- ucnv_close(converter);
- // set flags and return
- setToBogus();
- return;
+ // set flags and return
+ setToBogus();
+ break;
}
- // if we're not currently ref counted, shift the array right by one
- if(fRefCounted == FALSE)
- us_arrayCopy(fArray, 0, temp, 1, fLength);
- // otherwise, copy the old array into temp, including the ref count
- else
- us_arrayCopy(fArray, 0, temp, 0, fLength + 1);
-
- if(fRefCounted && removeRef() == 0)
- delete [] fArray;
+ if(fRefCounted) {
+ // copy the old array into temp, including the ref count
+ us_arrayCopy(fArray, 0, temp, 0, fLength + 1);
+ delete [] fArray;
+ } else {
+ // if we're not currently ref counted, shift the array right by one
+ us_arrayCopy(fArray, 0, temp, 1, fLength);
+ }
fArray = temp;
fCapacity = tempCapacity;
@@ -891,11 +946,10 @@ UnicodeString::doCodepageCreate(const char *codepageData,
myTarget = getArrayStart() + fLength;
arraySize = getCapacity() - fLength;
+ } else {
+ break;
}
}
- while(status == U_INDEX_OUTOFBOUNDS_ERROR);
-
- fHashCode = kInvalidHashCode;
// close the converter
if(codepage == 0)
@@ -925,9 +979,6 @@ UnicodeString::getUChars() const
if(fBogus)
return 0;
- // clone our array, if necessary
- ((UnicodeString*)this)->cloneArrayIfNeeded();
-
// no room for null, resize
if(getCapacity() <= fLength) {
// allocate at minimum the current capacity + needed space
@@ -955,8 +1006,10 @@ UnicodeString::getUChars() const
((UnicodeString*)this)->setRefCount(1);
}
- // tack on a trailing null
- fArray[(fRefCounted ? 1 : 0) + fLength] = 0;
+ if(getArrayStart()[fLength] != 0) {
+ // tack on a trailing null
+ ((UChar *)getArrayStart())[fLength] = 0;
+ }
return getArrayStart();
}
diff --git a/icu4c/source/common/unistr.h b/icu4c/source/common/unistr.h
index 567a8bc8f5d..b55b6f5d9e9 100644
--- a/icu4c/source/common/unistr.h
+++ b/icu4c/source/common/unistr.h
@@ -40,6 +40,30 @@
class Locale;
class UCharReference;
+/**
+ * Unicode String literals in C++.
+ * Dependent on the platform properties, different UnicodeString
+ * constructors should be used to create a UnicodeString object from
+ * a string literal.
+ * The macros are defined for maximum performance.
+ * They work only for strings that contain "invariant characters", i.e.,
+ * only latin letters, digits, and some punctuation.
+ * See utypes.h for details.
+ *
+ * The string parameter must be a C string literal.
+ * The length of the string, not including the terminating
+ * <code>NUL</code>, must be specified as a constant.
+ * Unlike U_STRING_DECL/U_STRING_INIT in ustring.h, no separate declaration
+ * is needed: UNICODE_STRING expands to a UnicodeString constructor expression.
+ */
+/*
+ * Case 1: wchar_t has the same size as UChar on an ASCII-family platform.
+ * The wide literal L"..." is aliased through the
+ * UnicodeString(bool_t, UChar *, int32_t) constructor. The cast drops the
+ * literal's constness, so the resulting string must not be modified in place.
+ */
+#if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY
+# define UNICODE_STRING(cs, length) UnicodeString(TRUE, (UChar *)L ## cs, length)
+/*
+ * Case 2: UChar is one byte wide on an ASCII-family platform.
+ * The narrow literal itself is aliased in the same way.
+ */
+#elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
+# define UNICODE_STRING(cs, length) UnicodeString(TRUE, (UChar *)cs, length)
+/*
+ * Otherwise: the char literal is passed to the
+ * (codepageData, dataLength, codepage) constructor with the empty codepage
+ * name "", which selects the "invariant characters" conversion.
+ */
+#else
+# define UNICODE_STRING(cs, length) UnicodeString(cs, length, "")
+#endif
+
/**
* UnicodeString is a concrete implementation of the abstract class
* UnicodeText. UnicodeString performs codeset conversion from char*
@@ -678,6 +702,9 @@ public:
* @param target the target buffer for extraction
* @param codepage the desired codepage for the characters. 0 has
* the special meaning of the default codepage
+ * If <code>codepage</code> is an empty string (<code>""</code>),
+ * then a simple conversion is performed on the codepage-invariant
+ * subset ("invariant characters") of the platform encoding. See utypes.h.
* @return the number of characters written to dst
*/
int32_t extract(UTextOffset start,
@@ -1302,12 +1329,34 @@ public:
UnicodeString(const UChar *text,
int32_t textLength);
+ /**
+ * Aliasing UChar* constructor.
+ * The text will be used for the new UnicodeString object, but
+ * it will not be released when the UnicodeString is destroyed.
+ * Be careful not to attempt to modify the contents of the UnicodeString
+ * if the text is read-only. Operations that allocate an entirely
+ * new buffer are harmless.
+ *
+ * @param isTerminated specifies if <code>text</code> is <code>NUL</code>-terminated.
+ * This must be true if <code>textLength==-1</code>.
+ * @param text The characters to alias for the UnicodeString.
+ * @param textLength The number of Unicode characters in <code>text</code> to alias.
+ * If -1, then this constructor will determine the length
+ * by calling <code>u_strlen()</code>.
+ * If the resulting length is negative, the string is set to bogus.
+ */
+ UnicodeString(bool_t isTerminated,
+ UChar *text,
+ int32_t textLength);
+
/**
* char* constructor.
* @param codepageData an array of bytes, null-terminated
* @param codepage the encoding of codepageData. The special
* value 0 for codepage indicates that the text is in the
* platform's default codepage.
+ * If <code>codepage</code> is an empty string (<code>""</code>),
+ * then a simple conversion is performed on the codepage-invariant
+ * subset ("invariant characters") of the platform encoding. See utypes.h.
*/
UnicodeString(const char *codepageData,
const char *codepage = 0);
@@ -1319,6 +1368,9 @@ public:
* @param codepage the encoding of codepageData. The special
* value 0 for codepage indicates that the text is in the
* platform's default codepage.
+ * If <code>codepage</code> is an empty string (<code>""</code>),
+ * then a simple conversion is performed on the codepage-invariant
+ * subset ("invariant characters") of the platform encoding. See utypes.h.
*/
UnicodeString(const char *codepageData,
int32_t dataLength,
@@ -1454,7 +1506,16 @@ private:
void pinIndices(UTextOffset& start,
int32_t& length) const;
- // Real ctor for converting from codepage data
+ /*
+ * Real constructor for converting from codepage data.
+ * It assumes that it is called with !fRefCounted.
+ *
+ * If <code>codepage==0</code>, then the default converter
+ * is used for the platform encoding.
+ * If <code>codepage</code> is an empty string (<code>""</code>),
+ * then a simple conversion is performed on the codepage-invariant
+ * subset ("invariant characters") of the platform encoding. See utypes.h.
+ */
void doCodepageCreate(const char *codepageData,
int32_t dataLength,
const char *codepage);
@@ -1472,8 +1533,8 @@ private:
UChar *fArray; // the Unicode data
int32_t fLength; // number characters in fArray
int32_t fCapacity; // sizeof fArray
- bool_t fRefCounted; // indicates if we own storage
int32_t fHashCode; // the hash code
+ bool_t fRefCounted; // indicates if we own storage
bool_t fBogus; // indicates if an operation failed
// constants
diff --git a/icu4c/source/common/ustring.h b/icu4c/source/common/ustring.h
index 48938586120..76ffb045053 100644
--- a/icu4c/source/common/ustring.h
+++ b/icu4c/source/common/ustring.h
@@ -174,7 +174,7 @@ U_CAPI char* U_EXPORT2 u_austrcpy(char *s1,
* parameters.
* The string parameter must be a C string literal.
* The length of the string, not including the terminating
- * <code>NUL</code> must be specified as a constant.
+ * <code>NUL</code>, must be specified as a constant.
* The U_STRING_DECL macro should be invoked exactly once for one
* such string variable before it is used.
*
@@ -195,10 +195,10 @@ U_CAPI char* U_EXPORT2 u_austrcpy(char *s1,
*
*/
#if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY
-# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (UChar *)L ## cs }
+# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (const UChar *)L ## cs }
# define U_STRING_INIT(var, cs, length)
#elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
-# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (UChar *)cs }
+# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (const UChar *)cs }
# define U_STRING_INIT(var, cs, length)
#else
# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]