From bbf2815b33cf31ed8e46bf9e1cb258c81fd50470 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Wed, 1 Dec 1999 17:51:53 +0000 Subject: [PATCH] ICU-68 construct UnicodeString from invariant char*, alias from UChar* X-SVN-Rev: 268 --- icu4c/source/common/unistr.cpp | 135 +++++++++++++++++++++++---------- icu4c/source/common/unistr.h | 65 +++++++++++++++- icu4c/source/common/ustring.h | 6 +- 3 files changed, 160 insertions(+), 46 deletions(-) diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp index d3b91fbb9fd..3998d9f3a53 100644 --- a/icu4c/source/common/unistr.cpp +++ b/icu4c/source/common/unistr.cpp @@ -17,13 +17,15 @@ ******************************************************************************* */ -#include "unistr.h" +#include "utypes.h" +#include "putil.h" #include "locid.h" #include "cstring.h" #include "cmemory.h" #include "ustring.h" #include "mutex.h" +#include "unistr.h" #if 0 //DEBUGGING @@ -152,6 +154,21 @@ UnicodeString::UnicodeString( const UChar *text, doReplace(0, 0, text, 0, textLength); } +UnicodeString::UnicodeString(bool_t isTerminated, + UChar *text, + int32_t textLength) + : fArray(text), + fLength(textLength != -1 || !isTerminated ? textLength : u_strlen(text)), + fCapacity(isTerminated ? fLength + 1 : fLength), + fRefCounted(FALSE), + fHashCode(kInvalidHashCode), + fBogus(FALSE) +{ + if(fLength < 0) { + setToBogus(); + } +} + UnicodeString::UnicodeString(const char *codepageData, const char *codepage) : fArray(fStackBuffer), @@ -621,10 +638,10 @@ UnicodeString::doReplace(UTextOffset start, // don't delete it until the end of the method. this can happen // in code like UnicodeString s = "foo"; s += s; if(srcChars != getArrayStart()) - delete [] fArray; + delete [] fArray; else { - deleteWhenDone = TRUE; - bufferToDelete = fArray; + deleteWhenDone = TRUE; + bufferToDelete = fArray; } } @@ -771,15 +788,19 @@ UnicodeString::extract(UTextOffset start, int32_t arraySize = 0x0FFFFFFF; // create the converter - UConverter *converter = 0; + UConverter *converter; // if the codepage is the default, use our cache - if(codepage == 0) + if(codepage == 0) { converter = getDefaultConverter(status); - else + } else if(*codepage == 0) { + converter = 0; + } else { converter = ucnv_open(codepage, &status); + } // if we failed, set the appropriate flags and return + // if it is an empty string, then use the "invariant character" conversion if(U_FAILURE(status)) { // close the converter if(codepage == 0) @@ -789,14 +810,22 @@ UnicodeString::extract(UTextOffset start, return 0; } + // perform the conversion + if(converter == 0) { + // use the "invariant characters" conversion + if(length > fLength - start) { + length = fLength - start; + } + u_UCharsToChars(mySource, myTarget, length); + return length; + } + + // there is no loop here since we assume the buffer is large enough myTargetLimit = myTarget + arraySize; if(myTargetLimit < myTarget) /* ptr wrapped around: pin to U_MAX_PTR */ myTargetLimit = (char*)U_MAX_PTR; - // perform the conversion - // there is no loop here since we assume the buffer is large enough - ucnv_fromUnicode(converter, &myTarget, myTargetLimit, &mySource, mySourceEnd, NULL, TRUE, &status); @@ -822,7 +851,7 @@ UnicodeString::doCodepageCreate(const char *codepageData, int32_t sourceLen = dataLength; const char *mySource = codepageData; const char *mySourceEnd = mySource + sourceLen; - UChar *myTarget = getArrayStart(); + UChar *myTarget; UErrorCode status = U_ZERO_ERROR; int32_t arraySize = getCapacity(); @@ -830,9 +859,12 @@ UnicodeString::doCodepageCreate(const char *codepageData, UConverter *converter = 0; // if the codepage is the default, use our cache - converter = (codepage == 0 - ? getDefaultConverter(status) - : ucnv_open(codepage, &status)); + // if it is an empty string, then use the "invariant character" conversion + converter = (codepage == 0 ? + getDefaultConverter(status) : + *codepage == 0 ? + 0 : + ucnv_open(codepage, &status)); // if we failed, set the appropriate flags and return if(U_FAILURE(status)) { @@ -845,8 +877,37 @@ UnicodeString::doCodepageCreate(const char *codepageData, return; } + fHashCode = kInvalidHashCode; + // perform the conversion - do { + if(converter == 0) { + // use the "invariant characters" conversion + if(arraySize < dataLength) { + int32_t tempCapacity; + // allocate enough space for the dataLength, the refCount, and a NUL + UChar *temp = allocate(dataLength + 2, tempCapacity); + + if(temp == 0) { + // set flags and return + setToBogus(); + return; + } + + fArray = temp; + fCapacity = tempCapacity; + + setRefCount(1); + + u_charsToUChars(codepageData, fArray + 1, dataLength); + fArray[dataLength + 1] = 0; + } else { + u_charsToUChars(codepageData, getArrayStart(), dataLength); + } + return; + } + + myTarget = getArrayStart(); + for(;;) { // reset the error code status = U_ZERO_ERROR; @@ -859,30 +920,24 @@ UnicodeString::doCodepageCreate(const char *codepageData, arraySize = getCapacity() - fLength; // allocate more space and copy data, if needed - if(fLength < dataLength) { + if(status == U_INDEX_OUTOFBOUNDS_ERROR) { int32_t tempCapacity; UChar *temp = allocate(fCapacity, tempCapacity); if(! temp) { - // close the converter - if(codepage == 0) - releaseDefaultConverter(converter); - else - ucnv_close(converter); - // set flags and return - setToBogus(); - return; + // set flags and return + setToBogus(); + break; } - // if we're not currently ref counted, shift the array right by one - if(fRefCounted == FALSE) - us_arrayCopy(fArray, 0, temp, 1, fLength); - // otherwise, copy the old array into temp, including the ref count - else - us_arrayCopy(fArray, 0, temp, 0, fLength + 1); - - if(fRefCounted && removeRef() == 0) - delete [] fArray; + if(fRefCounted) { + // copy the old array into temp, including the ref count + us_arrayCopy(fArray, 0, temp, 0, fLength + 1); + delete [] fArray; + } else { + // if we're not currently ref counted, shift the array right by one + us_arrayCopy(fArray, 0, temp, 1, fLength); + } fArray = temp; fCapacity = tempCapacity; @@ -891,11 +946,10 @@ UnicodeString::doCodepageCreate(const char *codepageData, myTarget = getArrayStart() + fLength; arraySize = getCapacity() - fLength; + } else { + break; } } - while(status == U_INDEX_OUTOFBOUNDS_ERROR); - - fHashCode = kInvalidHashCode; // close the converter if(codepage == 0) @@ -925,9 +979,6 @@ UnicodeString::getUChars() const if(fBogus) return 0; - // clone our array, if necessary - ((UnicodeString*)this)->cloneArrayIfNeeded(); - // no room for null, resize if(getCapacity() <= fLength) { // allocate at minimum the current capacity + needed space @@ -955,8 +1006,10 @@ UnicodeString::getUChars() const ((UnicodeString*)this)->setRefCount(1); } - // tack on a trailing null - fArray[(fRefCounted ? 1 : 0) + fLength] = 0; + if(getArrayStart()[fLength] != 0) { + // tack on a trailing null + ((UChar *)getArrayStart())[fLength] = 0; + } return getArrayStart(); } diff --git a/icu4c/source/common/unistr.h b/icu4c/source/common/unistr.h index 567a8bc8f5d..b55b6f5d9e9 100644 --- a/icu4c/source/common/unistr.h +++ b/icu4c/source/common/unistr.h @@ -40,6 +40,30 @@ class Locale; class UCharReference; +/** + * Unicode String literals in C++. + * Dependent on the platform properties, different UnicodeString + * constructors should be used to create a UnicodeString object from + * a string literal. + * The macros are defined for maximum performance. + * They work only for strings that contain "invariant characters", i.e., + * only latin letters, digits, and some punctuation. + * See utypes.h for details. + * + * The string parameter must be a C string literal. + * The length of the string, not including the terminating + * NUL, must be specified as a constant. + * The U_STRING_DECL macro should be invoked exactly once for one + * such string variable before it is used. + */ +#if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY +# define UNICODE_STRING(cs, length) UnicodeString(TRUE, (UChar *)L ## cs, length) +#elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY +# define UNICODE_STRING(cs, length) UnicodeString(TRUE, (UChar *)cs, length) +#else +# define UNICODE_STRING(cs, length) UnicodeString(cs, length, "") +#endif + /** * UnicodeString is a concrete implementation of the abstract class * UnicodeText. UnicodeString performs codeset conversion from char* @@ -678,6 +702,9 @@ public: * @param target the target buffer for extraction * @param codepage the desired codepage for the characters. 0 has * the special meaning of the default codepage + * If codepage is an empty string (""), + * then a simple conversion is performed on the codepage-invariant + * subset ("invariant characters") of the platform encoding. See utypes.h. * @return the number of characters written to dst */ int32_t extract(UTextOffset start, @@ -1302,12 +1329,34 @@ public: UnicodeString(const UChar *text, int32_t textLength); + /** + * Aliasing UChar* constructor. + * The text will be used for the new UnicodeString object, but + * it will not be released when the UnicodeString is destroyed. + * Be careful not to attempt to modify the contents of the UnicodeString + * if the text is read-only. Operations that allocate an entirely + * new buffer are harmless. + * + * @param isTerminated specifies if text is NUL-terminated. + * This must be true if textLength==-1. + * @param text The characters to alias for the UnicodeString. + * @param textLength The number of Unicode characters in text to alias. + * If -1, then this constructor will determine the length + * by calling u_strlen(). + */ + UnicodeString(bool_t isTerminated, + UChar *text, + int32_t textLength); + /** * char* constructor. * @param codepageData an array of bytes, null-terminated * @param codepage the encoding of codepageData. The special * value 0 for codepage indicates that the text is in the * platform's default codepage. + * If codepage is an empty string (""), + * then a simple conversion is performed on the codepage-invariant + * subset ("invariant characters") of the platform encoding. See utypes.h. */ UnicodeString(const char *codepageData, const char *codepage = 0); @@ -1319,6 +1368,9 @@ public: * @param codepage the encoding of codepageData. The special * value 0 for codepage indicates that the text is in the * platform's default codepage. + * If codepage is an empty string (""), + * then a simple conversion is performed on the codepage-invariant + * subset ("invariant characters") of the platform encoding. See utypes.h. */ UnicodeString(const char *codepageData, int32_t dataLength, @@ -1454,7 +1506,16 @@ private: void pinIndices(UTextOffset& start, int32_t& length) const; - // Real ctor for converting from codepage data + /* + * Real constructor for converting from codepage data. + * It assumes that it is called with !fRefCounted. + * + * If codepage==0, then the default converter + * is used for the platform encoding. + * If codepage is an empty string (""), + * then a simple conversion is performed on the codepage-invariant + * subset ("invariant characters") of the platform encoding. See utypes.h. + */ void doCodepageCreate(const char *codepageData, int32_t dataLength, const char *codepage); @@ -1472,8 +1533,8 @@ private: UChar *fArray; // the Unicode data int32_t fLength; // number characters in fArray int32_t fCapacity; // sizeof fArray - bool_t fRefCounted; // indicates if we own storage int32_t fHashCode; // the hash code + bool_t fRefCounted; // indicates if we own storage bool_t fBogus; // indicates if an operation failed // constants diff --git a/icu4c/source/common/ustring.h b/icu4c/source/common/ustring.h index 48938586120..76ffb045053 100644 --- a/icu4c/source/common/ustring.h +++ b/icu4c/source/common/ustring.h @@ -174,7 +174,7 @@ U_CAPI char* U_EXPORT2 u_austrcpy(char *s1, * parameters. * The string parameter must be a C string literal. * The length of the string, not including the terminating - * NUL must be specified as a constant. + * NUL, must be specified as a constant. * The U_STRING_DECL macro should be invoked exactly once for one * such string variable before it is used. * @@ -195,10 +195,10 @@ U_CAPI char* U_EXPORT2 u_austrcpy(char *s1, * */ #if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY -# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (UChar *)L ## cs } +# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (const UChar *)L ## cs } # define U_STRING_INIT(var, cs, length) #elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY -# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (UChar *)cs } +# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (const UChar *)cs } # define U_STRING_INIT(var, cs, length) #else # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]