diff --git a/icu4c/source/common/unicode/unistr.h b/icu4c/source/common/unicode/unistr.h index 48fe2b3b1d6..f841594abbe 100644 --- a/icu4c/source/common/unicode/unistr.h +++ b/icu4c/source/common/unicode/unistr.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1998-2014, International Business Machines +* Copyright (C) 1998-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * @@ -173,15 +173,52 @@ class UnicodeStringAppendable; // unicode/appendable.h # endif #endif +/** + * \def UNISTR_OBJECT_SIZE + * Desired sizeof(UnicodeString) in bytes. + * It should be a multiple of sizeof(pointer) to avoid unusable space for padding. + * The object size may want to be a multiple of 16 bytes, + * which is a common granularity for heap allocation. + * + * Any space inside the object beyond sizeof(vtable pointer) + 2 + * is available for storing short strings inside the object. + * The bigger the object, the longer a string that can be stored inside the object, + * without additional heap allocation. + * + * Depending on a platform's pointer size, pointer alignment requirements, + * and struct padding, the compiler will usually round up sizeof(UnicodeString) + * to 4 * sizeof(pointer) (or 3 * sizeof(pointer) for P128 data models), + * to hold the fields for heap-allocated strings. + * Such a minimum size also ensures that the object is easily large enough + * to hold at least 2 UChars, for one supplementary code point (U16_MAX_LENGTH). + * + * sizeof(UnicodeString) >= 48 should work for all known platforms. + * + * For example, on a 64-bit machine where sizeof(vtable pointer) is 8, + * sizeof(UnicodeString) = 64 would leave space for + * (64 - sizeof(vtable pointer) - 2) / U_SIZEOF_UCHAR = (64 - 8 - 2) / 2 = 27 + * UChars stored inside the object. + * + * The minimum object size on a 64-bit machine would be + * 4 * sizeof(pointer) = 4 * 8 = 32 bytes, + * and the internal buffer would hold up to 11 UChars in that case. + * + * @see U16_MAX_LENGTH + * @draft ICU 56 + */ +#ifndef UNISTR_OBJECT_SIZE +# define UNISTR_OBJECT_SIZE 64 +#endif + /** * UnicodeString is a string class that stores Unicode characters directly and provides - * similar functionality as the Java String and StringBuffer classes. + * similar functionality as the Java String and StringBuffer/StringBuilder classes. * It is a concrete implementation of the abstract class Replaceable (for transliteration). * * The UnicodeString class is not suitable for subclassing. * *
For an overview of Unicode strings in C and C++ see the - * User Guide Strings chapter.
+ * User Guide Strings chapter. * *In ICU, a Unicode string consists of 16-bit Unicode code units.
* A Unicode character may be stored with either one code unit
@@ -3474,9 +3511,12 @@ private:
// constants
enum {
- // Set the stack buffer size so that sizeof(UnicodeString) is,
- // naturally (without padding), a multiple of sizeof(pointer).
- US_STACKBUF_SIZE= sizeof(void *)==4 ? 13 : 15, // Size of stack buffer for short strings
+ /**
+ * Size of stack buffer for short strings.
+ * Must be at least U16_MAX_LENGTH for the single-code point constructor to work.
+ * @see UNISTR_OBJECT_SIZE
+ */
+ US_STACKBUF_SIZE=(int32_t)(UNISTR_OBJECT_SIZE-sizeof(void *)-2)/U_SIZEOF_UCHAR,
kInvalidUChar=0xffff, // U+FFFF returned by charAt(invalid index)
kGrowSize=128, // grow size for this buffer
kInvalidHashCode=0, // invalid hash code
@@ -3544,9 +3584,10 @@ private:
* (Padding at the end of fFields is ok:
* As long as it is no larger than fStackFields, it is not wasted space.)
*
- * For some of the history of the UnicodeString class fields layout,
- * see ICU ticket #11336 "UnicodeString: recombine stack buffer arrays"
- * and ticket #8322 "why is sizeof(UnicodeString)==48?".
+ * For some of the history of the UnicodeString class fields layout, see
+ * - ICU ticket #11551 "longer UnicodeString contents in stack buffer"
+ * - ICU ticket #11336 "UnicodeString: recombine stack buffer arrays"
+ * - ICU ticket #8322 "why is sizeof(UnicodeString)==48?"
*/
// (implicit) *vtable;
union StackBufferOrFields {
@@ -3558,9 +3599,11 @@ private:
} fStackFields;
struct {
int16_t fLengthAndFlags; // bit fields: see constants above
- UChar *fArray; // the Unicode data
- int32_t fCapacity; // capacity of fArray (in UChars)
int32_t fLength; // number of characters in fArray if >127; else undefined
+ int32_t fCapacity; // capacity of fArray (in UChars)
+ // array pointer last to minimize padding for machines with P128 data model
+ // or pointer sizes that are not a power of 2
+ UChar *fArray; // the Unicode data
} fFields;
} fUnion;
};
diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp
index 9997aa7b57b..a995d76dacc 100644
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@@ -1,6 +1,6 @@
/*
******************************************************************************
-* Copyright (C) 1999-2014, International Business Machines Corporation and
+* Copyright (C) 1999-2015, International Business Machines Corporation and
* others. All Rights Reserved.
******************************************************************************
*
@@ -367,8 +367,40 @@ UnicodeString::allocate(int32_t capacity) {
//========================================
// Destructor
//========================================
+
+#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
+static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1
+static u_atomic_int32_t beyondCount(0);
+
+U_CAPI void unistr_printLengths() {
+ int32_t i;
+ for(i = 0; i <= 59; ++i) {
+ printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]);
+ }
+ int32_t beyond = beyondCount;
+ for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
+ beyond += finalLengthCounts[i];
+ }
+ printf(">59, %9d\n", beyond);
+}
+#endif
+
UnicodeString::~UnicodeString()
{
+#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
+ // Count lengths of strings at the end of their lifetime.
+ // Useful for discussion of a desirable stack buffer size.
+ // Count the contents length, not the optional NUL terminator nor further capacity.
+ // Ignore open-buffer strings and strings which alias external storage.
+ if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
+ if(hasShortLength()) {
+ umtx_atomic_inc(finalLengthCounts + getShortLength());
+ } else {
+ umtx_atomic_inc(&beyondCount);
+ }
+ }
+#endif
+
releaseArray();
}
diff --git a/icu4c/source/test/cintltst/cintltst.c b/icu4c/source/test/cintltst/cintltst.c
index 1448e0633ef..5b883e2a09b 100644
--- a/icu4c/source/test/cintltst/cintltst.c
+++ b/icu4c/source/test/cintltst/cintltst.c
@@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
- * Copyright (c) 1997-2013, International Business Machines Corporation and
+ * Copyright (c) 1997-2015, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@@ -73,6 +73,10 @@ void ctest_setICU_DATA(void);
static int gOrigArgc;
static const char* const * gOrigArgv;
+#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
+U_CAPI void unistr_printLengths();
+#endif
+
int main(int argc, const char* const argv[])
{
int nerrors = 0;
@@ -233,6 +237,10 @@ int main(int argc, const char* const argv[])
} /* End of loop that repeats the entire test, if requested. (Normally doesn't loop) */
+#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
+ unistr_printLengths();
+#endif
+
endTime = uprv_getRawUTCtime();
diffTime = (int32_t)(endTime - startTime);
printf("Elapsed Time: %02d:%02d:%02d.%03d\n",
diff --git a/icu4c/source/test/intltest/intltest.cpp b/icu4c/source/test/intltest/intltest.cpp
index cfd944a1849..63ad6dc912b 100644
--- a/icu4c/source/test/intltest/intltest.cpp
+++ b/icu4c/source/test/intltest/intltest.cpp
@@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
- * Copyright (c) 1997-2014, International Business Machines Corporation and
+ * Copyright (c) 1997-2015, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@@ -1166,6 +1166,10 @@ IntlTest::run_phase2( char* name, char* par ) // supports reporting memory leaks
# define TRY_CNV_2 "sjis"
#endif
+#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
+U_CAPI void unistr_printLengths();
+#endif
+
int
main(int argc, char* argv[])
{
@@ -1524,6 +1528,10 @@ main(int argc, char* argv[])
u_cleanup();
}
+#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
+ unistr_printLengths();
+#endif
+
fprintf(stdout, "--------------------------------------\n");
if (execCount <= 0) {
diff --git a/icu4c/source/test/intltest/ustrtest.cpp b/icu4c/source/test/intltest/ustrtest.cpp
index 97151d956a2..a730be86f50 100644
--- a/icu4c/source/test/intltest/ustrtest.cpp
+++ b/icu4c/source/test/intltest/ustrtest.cpp
@@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
- * Copyright (c) 1997-2014, International Business Machines Corporation and
+ * Copyright (c) 1997-2015, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@@ -2098,20 +2098,35 @@ UnicodeStringTest::TestUnicodeStringImplementsAppendable() {
void
UnicodeStringTest::TestSizeofUnicodeString() {
// See the comments in unistr.h near the declaration of UnicodeString's fields.
+ // See the API comments for UNISTR_OBJECT_SIZE.
size_t sizeofUniStr=sizeof(UnicodeString);
- size_t expected;
- switch(sizeof(void *)) {
- case 4:
- expected=32;
- break;
- case 8:
- expected=40;
- break;
- default:
- logln("This platform has neither 32-bit nor 64-bit pointers.");
- return;
- }
+ size_t expected=UNISTR_OBJECT_SIZE;
if(expected!=sizeofUniStr) {
- errln("sizeof(UnicodeString)=%d, expected %d", (int)sizeofUniStr, (int)expected);
+ // Possible cause: UNISTR_OBJECT_SIZE may not be a multiple of sizeof(pointer),
+ // of the compiler might add more internal padding than expected.
+ errln("sizeof(UnicodeString)=%d, expected UNISTR_OBJECT_SIZE=%d",
+ (int)sizeofUniStr, (int)expected);
+ }
+ if(sizeofUniStr<32) {
+ errln("sizeof(UnicodeString)=%d < 32, probably too small", (int)sizeofUniStr);
+ }
+ // We assume that the entire UnicodeString object,
+ // minus the vtable pointer and 2 bytes for flags and short length,
+ // is available for internal storage of UChars.
+ int32_t expectedStackBufferLength=((int32_t)UNISTR_OBJECT_SIZE-sizeof(void *)-2)/U_SIZEOF_UCHAR;
+ UnicodeString s;
+ const UChar *emptyBuffer=s.getBuffer();
+ for(int32_t i=0; i