From 1d4b6a6ec7323c37aafabee0db192035ccf42d76 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Wed, 15 Apr 2015 18:49:55 +0000 Subject: [PATCH] ICU-11551 new UNISTR_OBJECT_SIZE=64 for 27 UChars stored internally on 64-bit machine X-SVN-Rev: 37339 --- icu4c/source/common/unicode/unistr.h | 65 ++++++++++++++++++++----- icu4c/source/common/unistr.cpp | 34 ++++++++++++- icu4c/source/test/cintltst/cintltst.c | 10 +++- icu4c/source/test/intltest/intltest.cpp | 10 +++- icu4c/source/test/intltest/ustrtest.cpp | 43 ++++++++++------ 5 files changed, 134 insertions(+), 28 deletions(-) diff --git a/icu4c/source/common/unicode/unistr.h b/icu4c/source/common/unicode/unistr.h index 48fe2b3b1d6..f841594abbe 100644 --- a/icu4c/source/common/unicode/unistr.h +++ b/icu4c/source/common/unicode/unistr.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1998-2014, International Business Machines +* Copyright (C) 1998-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * @@ -173,15 +173,52 @@ class UnicodeStringAppendable; // unicode/appendable.h # endif #endif +/** + * \def UNISTR_OBJECT_SIZE + * Desired sizeof(UnicodeString) in bytes. + * It should be a multiple of sizeof(pointer) to avoid unusable space for padding. + * The object size may want to be a multiple of 16 bytes, + * which is a common granularity for heap allocation. + * + * Any space inside the object beyond sizeof(vtable pointer) + 2 + * is available for storing short strings inside the object. + * The bigger the object, the longer a string that can be stored inside the object, + * without additional heap allocation. + * + * Depending on a platform's pointer size, pointer alignment requirements, + * and struct padding, the compiler will usually round up sizeof(UnicodeString) + * to 4 * sizeof(pointer) (or 3 * sizeof(pointer) for P128 data models), + * to hold the fields for heap-allocated strings. + * Such a minimum size also ensures that the object is easily large enough + * to hold at least 2 UChars, for one supplementary code point (U16_MAX_LENGTH). + * + * sizeof(UnicodeString) >= 48 should work for all known platforms. + * + * For example, on a 64-bit machine where sizeof(vtable pointer) is 8, + * sizeof(UnicodeString) = 64 would leave space for + * (64 - sizeof(vtable pointer) - 2) / U_SIZEOF_UCHAR = (64 - 8 - 2) / 2 = 27 + * UChars stored inside the object. + * + * The minimum object size on a 64-bit machine would be + * 4 * sizeof(pointer) = 4 * 8 = 32 bytes, + * and the internal buffer would hold up to 11 UChars in that case. + * + * @see U16_MAX_LENGTH + * @draft ICU 56 + */ +#ifndef UNISTR_OBJECT_SIZE +# define UNISTR_OBJECT_SIZE 64 +#endif + /** * UnicodeString is a string class that stores Unicode characters directly and provides - * similar functionality as the Java String and StringBuffer classes. + * similar functionality as the Java String and StringBuffer/StringBuilder classes. * It is a concrete implementation of the abstract class Replaceable (for transliteration). * * The UnicodeString class is not suitable for subclassing. * *

For an overview of Unicode strings in C and C++ see the - * User Guide Strings chapter.

+ * User Guide Strings chapter.

* *

In ICU, a Unicode string consists of 16-bit Unicode code units. * A Unicode character may be stored with either one code unit @@ -3474,9 +3511,12 @@ private: // constants enum { - // Set the stack buffer size so that sizeof(UnicodeString) is, - // naturally (without padding), a multiple of sizeof(pointer). - US_STACKBUF_SIZE= sizeof(void *)==4 ? 13 : 15, // Size of stack buffer for short strings + /** + * Size of stack buffer for short strings. + * Must be at least U16_MAX_LENGTH for the single-code point constructor to work. + * @see UNISTR_OBJECT_SIZE + */ + US_STACKBUF_SIZE=(int32_t)(UNISTR_OBJECT_SIZE-sizeof(void *)-2)/U_SIZEOF_UCHAR, kInvalidUChar=0xffff, // U+FFFF returned by charAt(invalid index) kGrowSize=128, // grow size for this buffer kInvalidHashCode=0, // invalid hash code @@ -3544,9 +3584,10 @@ private: * (Padding at the end of fFields is ok: * As long as it is no larger than fStackFields, it is not wasted space.) * - * For some of the history of the UnicodeString class fields layout, - * see ICU ticket #11336 "UnicodeString: recombine stack buffer arrays" - * and ticket #8322 "why is sizeof(UnicodeString)==48?". + * For some of the history of the UnicodeString class fields layout, see + * - ICU ticket #11551 "longer UnicodeString contents in stack buffer" + * - ICU ticket #11336 "UnicodeString: recombine stack buffer arrays" + * - ICU ticket #8322 "why is sizeof(UnicodeString)==48?" */ // (implicit) *vtable; union StackBufferOrFields { @@ -3558,9 +3599,11 @@ private: } fStackFields; struct { int16_t fLengthAndFlags; // bit fields: see constants above - UChar *fArray; // the Unicode data - int32_t fCapacity; // capacity of fArray (in UChars) int32_t fLength; // number of characters in fArray if >127; else undefined + int32_t fCapacity; // capacity of fArray (in UChars) + // array pointer last to minimize padding for machines with P128 data model + // or pointer sizes that are not a power of 2 + UChar *fArray; // the Unicode data } fFields; } fUnion; }; diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp index 9997aa7b57b..a995d76dacc 100644 --- a/icu4c/source/common/unistr.cpp +++ b/icu4c/source/common/unistr.cpp @@ -1,6 +1,6 @@ /* ****************************************************************************** -* Copyright (C) 1999-2014, International Business Machines Corporation and +* Copyright (C) 1999-2015, International Business Machines Corporation and * others. All Rights Reserved. ****************************************************************************** * @@ -367,8 +367,40 @@ UnicodeString::allocate(int32_t capacity) { //======================================== // Destructor //======================================== + +#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS +static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1 +static u_atomic_int32_t beyondCount(0); + +U_CAPI void unistr_printLengths() { + int32_t i; + for(i = 0; i <= 59; ++i) { + printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]); + } + int32_t beyond = beyondCount; + for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) { + beyond += finalLengthCounts[i]; + } + printf(">59, %9d\n", beyond); +} +#endif + UnicodeString::~UnicodeString() { +#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS + // Count lengths of strings at the end of their lifetime. + // Useful for discussion of a desirable stack buffer size. + // Count the contents length, not the optional NUL terminator nor further capacity. + // Ignore open-buffer strings and strings which alias external storage. + if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) { + if(hasShortLength()) { + umtx_atomic_inc(finalLengthCounts + getShortLength()); + } else { + umtx_atomic_inc(&beyondCount); + } + } +#endif + releaseArray(); } diff --git a/icu4c/source/test/cintltst/cintltst.c b/icu4c/source/test/cintltst/cintltst.c index 1448e0633ef..5b883e2a09b 100644 --- a/icu4c/source/test/cintltst/cintltst.c +++ b/icu4c/source/test/cintltst/cintltst.c @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2013, International Business Machines Corporation and + * Copyright (c) 1997-2015, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ /******************************************************************************** @@ -73,6 +73,10 @@ void ctest_setICU_DATA(void); static int gOrigArgc; static const char* const * gOrigArgv; +#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS +U_CAPI void unistr_printLengths(); +#endif + int main(int argc, const char* const argv[]) { int nerrors = 0; @@ -233,6 +237,10 @@ int main(int argc, const char* const argv[]) } /* End of loop that repeats the entire test, if requested. (Normally doesn't loop) */ +#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS + unistr_printLengths(); +#endif + endTime = uprv_getRawUTCtime(); diffTime = (int32_t)(endTime - startTime); printf("Elapsed Time: %02d:%02d:%02d.%03d\n", diff --git a/icu4c/source/test/intltest/intltest.cpp b/icu4c/source/test/intltest/intltest.cpp index cfd944a1849..63ad6dc912b 100644 --- a/icu4c/source/test/intltest/intltest.cpp +++ b/icu4c/source/test/intltest/intltest.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2014, International Business Machines Corporation and + * Copyright (c) 1997-2015, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -1166,6 +1166,10 @@ IntlTest::run_phase2( char* name, char* par ) // supports reporting memory leaks # define TRY_CNV_2 "sjis" #endif +#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS +U_CAPI void unistr_printLengths(); +#endif + int main(int argc, char* argv[]) { @@ -1524,6 +1528,10 @@ main(int argc, char* argv[]) u_cleanup(); } +#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS + unistr_printLengths(); +#endif + fprintf(stdout, "--------------------------------------\n"); if (execCount <= 0) { diff --git a/icu4c/source/test/intltest/ustrtest.cpp b/icu4c/source/test/intltest/ustrtest.cpp index 97151d956a2..a730be86f50 100644 --- a/icu4c/source/test/intltest/ustrtest.cpp +++ b/icu4c/source/test/intltest/ustrtest.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2014, International Business Machines Corporation and + * Copyright (c) 1997-2015, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -2098,20 +2098,35 @@ UnicodeStringTest::TestUnicodeStringImplementsAppendable() { void UnicodeStringTest::TestSizeofUnicodeString() { // See the comments in unistr.h near the declaration of UnicodeString's fields. + // See the API comments for UNISTR_OBJECT_SIZE. size_t sizeofUniStr=sizeof(UnicodeString); - size_t expected; - switch(sizeof(void *)) { - case 4: - expected=32; - break; - case 8: - expected=40; - break; - default: - logln("This platform has neither 32-bit nor 64-bit pointers."); - return; - } + size_t expected=UNISTR_OBJECT_SIZE; if(expected!=sizeofUniStr) { - errln("sizeof(UnicodeString)=%d, expected %d", (int)sizeofUniStr, (int)expected); + // Possible cause: UNISTR_OBJECT_SIZE may not be a multiple of sizeof(pointer), + // of the compiler might add more internal padding than expected. + errln("sizeof(UnicodeString)=%d, expected UNISTR_OBJECT_SIZE=%d", + (int)sizeofUniStr, (int)expected); + } + if(sizeofUniStr<32) { + errln("sizeof(UnicodeString)=%d < 32, probably too small", (int)sizeofUniStr); + } + // We assume that the entire UnicodeString object, + // minus the vtable pointer and 2 bytes for flags and short length, + // is available for internal storage of UChars. + int32_t expectedStackBufferLength=((int32_t)UNISTR_OBJECT_SIZE-sizeof(void *)-2)/U_SIZEOF_UCHAR; + UnicodeString s; + const UChar *emptyBuffer=s.getBuffer(); + for(int32_t i=0; i