From e9151df7a2ec4c9d250e58155d989c7948a9354a Mon Sep 17 00:00:00 2001 From: Vladimir Weinstein Date: Thu, 7 Dec 2000 07:22:55 +0000 Subject: [PATCH] ICU-96 Better memory management in sort key, safe clone implementation X-SVN-Rev: 3162 --- icu4c/source/i18n/tblcoll.cpp | 22 ++- icu4c/source/i18n/ucol.cpp | 204 +++++++++++++++++++++------- icu4c/source/i18n/unicode/tblcoll.h | 5 + 3 files changed, 180 insertions(+), 51 deletions(-) diff --git a/icu4c/source/i18n/tblcoll.cpp b/icu4c/source/i18n/tblcoll.cpp index ae6756ac55e..b028f56f46a 100644 --- a/icu4c/source/i18n/tblcoll.cpp +++ b/icu4c/source/i18n/tblcoll.cpp @@ -66,6 +66,7 @@ #include "unicode/resbund.h" #include "filestrm.h" #include "umemstrm.h" +#include "umutex.h" #ifdef _DEBUG #include "unistrm.h" @@ -142,6 +143,9 @@ const int16_t RuleBasedCollator::FILEID = 0x5443; // unique f const char* RuleBasedCollator::kFilenameSuffix = ".col"; // binary collation file extension char RuleBasedCollator::fgClassID = 0; // Value is irrelevant // class id +UMTX RuleBasedCollator::collMutex = NULL; +UBool RuleBasedCollator::isMutexInited = RuleBasedCollator::initMutex(); + //////////////////////////////////////////////////////////////////////// // NormalizerIterator // @@ -494,6 +498,17 @@ RuleBasedCollator::getStrengthOrder(NormalizerIterator* cursor, //=============================================================================== +UBool RuleBasedCollator::initMutex() { + if(isMutexInited == FALSE) { + umtx_lock(NULL); + if(isMutexInited == FALSE) { + umtx_init(&collMutex); + isMutexInited = TRUE; + } + umtx_unlock(NULL); + } + return isMutexInited; +} RuleBasedCollator::RuleBasedCollator() : Collator(), @@ -854,7 +869,7 @@ RuleBasedCollator::RuleBasedCollator( const Locale& desiredLocale, if (U_FAILURE(status)) { return; } - + // Try to load, in order: // 1. The desired locale's collation. // 2. A fallback of the desired locale. @@ -2980,7 +2995,10 @@ UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCod } Collator* RuleBasedCollator::safeClone(void) { - return 0; + umtx_lock(&collMutex); + Collator *result = new RuleBasedCollator(*this); + umtx_unlock(&collMutex); + return result; } diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp index 01ffd8023dd..f93f768c8f3 100644 --- a/icu4c/source/i18n/ucol.cpp +++ b/icu4c/source/i18n/ucol.cpp @@ -14,6 +14,7 @@ #include "unicode/ustring.h" #include "unicode/normlzr.h" #include "cpputils.h" +#include "cstring.h" static uint8_t utf16fixup[32] = { @@ -29,8 +30,15 @@ static uint8_t utf16fixup[32] = { #include "ucmp32.h" #include "tcoldata.h" #include "tables.h" -#define UCOL_MAX_BUFFER 1000 + +#define UCOL_DEBUG + +#define UCOL_MAX_BUFFER 512 +#define UCOL_MEM_FACTOR_PRIM 12 +#define UCOL_MEM_FACTOR_SEC_PLUS 8 #define UCOL_WRITABLE_BUFFER_SIZE 256 +#define UCOL_BYTES_PER_PRIM 2 +#define UCOL_NORMALIZATION_GROWTH 2 struct collIterate { UChar *string; // Original string @@ -340,6 +348,47 @@ int32_t ucol_getNextCE(const UCollator *coll, collIterate *source, UErrorCode *s return getComplicatedCE(coll, source, status); } */ +UCollationResult ucol_compareUsingSortKeys(const UCollator *coll, + const UChar *source, + int32_t sourceLength, + const UChar *target, + int32_t targetLength) +{ + uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; + uint8_t *sourceKeyP = sourceKey; + uint8_t *targetKeyP = targetKey; + int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; + + sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); + if(sourceKeyLen > UCOL_MAX_BUFFER) { + sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); + sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); + } + + targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); + if(targetKeyLen > UCOL_MAX_BUFFER) { + targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); + targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); + } + + int32_t result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); + + if(sourceKeyP != sourceKey) { + uprv_free(sourceKeyP); + } + + if(targetKeyP != targetKey) { + uprv_free(targetKeyP); + } + + if(result<0) { + return UCOL_LESS; + } else if(result>0) { + return UCOL_GREATER; + } else { + return UCOL_EQUAL; + } +} #define UCOL_GETNEXTCE(order, coll, collationSource, status) { \ if (U_FAILURE((status)) || ((collationSource).pos>=(collationSource).len \ @@ -841,6 +890,9 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll, { sOrder = ucol_getIncrementalCE(coll, &sColl, &status); *(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder); + if(sFSBEnd == sourceFrenchSec) { /* overflowing the buffer, bail out */ + return alternateIncrementalProcessing(coll, &sColl, &tColl); + } } gets = TRUE; @@ -849,6 +901,9 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll, { tOrder = ucol_getIncrementalCE(coll, &tColl, &status); *(--tFSBEnd) = UCOL_SECONDARYORDER(tOrder); + if(tFSBEnd == targetFrenchSec) { /* overflowing the buffer, bail out */ + return alternateIncrementalProcessing(coll, &sColl, &tColl); + } } gett = TRUE; @@ -985,8 +1040,11 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll, bufferFrenchSec = TRUE; } } - sOrder = ucol_getIncrementalCE(coll, &sColl, &status); - *(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder); + sOrder = ucol_getIncrementalCE(coll, &sColl, &status); + *(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder); + if(sFSBEnd == sourceFrenchSec) { /* overflowing the buffer, bail out */ + return alternateIncrementalProcessing(coll, &sColl, &tColl); + } } //while ((sOrder = ucol_getIncrementalCE(coll, &sColl, &status)) != CollationElementIterator::NULLORDER); while (sOrder != UCOL_NULLORDER); @@ -1013,6 +1071,9 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll, } tOrder = ucol_getIncrementalCE(coll, &tColl, &status); *(--tFSBEnd) = UCOL_SECONDARYORDER(tOrder); + if(tFSBEnd == targetFrenchSec) { /* overflowing the buffer, bail out */ + return alternateIncrementalProcessing(coll, &sColl, &tColl); + } } while ( tOrder != UCOL_NULLORDER); } @@ -1090,6 +1151,8 @@ ucol_strcoll( const UCollator *coll, UErrorCode status = U_ZERO_ERROR; UChar normSource[UCOL_MAX_BUFFER], normTarget[UCOL_MAX_BUFFER]; + UChar *normSourceP = normSource; + UChar *normTargetP = normTarget; uint32_t normSourceLength = UCOL_MAX_BUFFER, normTargetLength = UCOL_MAX_BUFFER; collIterate sColl, tColl; @@ -1097,10 +1160,42 @@ ucol_strcoll( const UCollator *coll, if(cppColl->getDecomposition() == Normalizer::NO_OP) { init_collIterate(source, sourceLength, &sColl, FALSE); init_collIterate(target, targetLength, &tColl, FALSE); - } else { + } else { /* TODO: This is bad behaved if we're working with small buffers */ + /* We really need the normalization quick check here*/ UNormalizationMode normMode = ucol_getNormalization(coll); normSourceLength = u_normalize(source, sourceLength, normMode, 0, normSource, normSourceLength, &status); + if(U_FAILURE(status)) { /* This would be buffer overflow */ + normSourceP = (UChar *)uprv_malloc((normSourceLength+1)*sizeof(UChar)); + status = U_ZERO_ERROR; + normSourceLength = u_normalize(source, sourceLength, normMode, 0, normSourceP, normSourceLength+1, &status); + normTargetLength = u_normalize(target, targetLength, normMode, 0, normTargetP, normTargetLength, &status); + if(U_FAILURE(status)) { /* This would be buffer overflow */ + normTargetP = (UChar *)uprv_malloc((normTargetLength+1)*sizeof(UChar)); + status = U_ZERO_ERROR; + normTargetLength = u_normalize(target, targetLength, normMode, 0, normTargetP, normTargetLength+1, &status); + } + Normalizer::EMode mode = cppColl->getDecomposition(); + cppColl->setDecomposition(Normalizer::NO_OP); + UCollationResult result = ucol_strcoll(coll, normSourceP, normSourceLength, normTargetP, normTargetLength); + cppColl->setDecomposition(mode); + uprv_free(normSourceP); + if(normTargetP != normTarget) { + uprv_free(normTargetP); + } + return result; + } normTargetLength = u_normalize(target, targetLength, normMode, 0, normTarget, normTargetLength, &status); + if(U_FAILURE(status)) { /* This would be buffer overflow */ + normTargetP = (UChar *)uprv_malloc((normTargetLength+1)*sizeof(UChar)); + status = U_ZERO_ERROR; + normTargetLength = u_normalize(target, targetLength, normMode, 0, normTargetP, normTargetLength+1, &status); + Normalizer::EMode mode = cppColl->getDecomposition(); + cppColl->setDecomposition(Normalizer::NO_OP); + UCollationResult result = ucol_strcoll(coll, normSourceP, normSourceLength, normTargetP, normTargetLength); + cppColl->setDecomposition(mode); + uprv_free(normTargetP); + return result; + } init_collIterate(normSource, normSourceLength, &sColl, TRUE); init_collIterate(normTarget, normTargetLength, &tColl, TRUE); } @@ -1321,6 +1416,9 @@ ucol_strcoll( const UCollator *coll, { UCOL_GETNEXTCE(sOrder, coll, sColl, status); *(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder); + if(sFSBEnd == sourceFrenchSec) { /* overflowing the buffer, bail out */ + return ucol_compareUsingSortKeys(coll, source, sourceLength, target, targetLength); + } } gets = TRUE; @@ -1329,6 +1427,9 @@ ucol_strcoll( const UCollator *coll, { UCOL_GETNEXTCE(tOrder, coll, tColl, status); *(--tFSBEnd) = UCOL_SECONDARYORDER(tOrder); + if(tFSBEnd == targetFrenchSec) { /* overflowing the buffer, bail out */ + return ucol_compareUsingSortKeys(coll, source, sourceLength, target, targetLength); + } } gett = TRUE; @@ -1465,8 +1566,11 @@ ucol_strcoll( const UCollator *coll, bufferFrenchSec = TRUE; } } - UCOL_GETNEXTCE(sOrder, coll, sColl, status); - *(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder); + UCOL_GETNEXTCE(sOrder, coll, sColl, status); + *(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder); + if(sFSBEnd == sourceFrenchSec) { /* overflowing the buffer, bail out */ + return ucol_compareUsingSortKeys(coll, source, sourceLength, target, targetLength); + } } //while ((sOrder = ucol_getNextCE(coll, &sColl, &status)) != CollationElementIterator::NULLORDER); while (sOrder != UCOL_NULLORDER); @@ -1493,6 +1597,9 @@ ucol_strcoll( const UCollator *coll, } UCOL_GETNEXTCE(tOrder, coll, tColl, status); *(--tFSBEnd) = UCOL_SECONDARYORDER(tOrder); + if(tFSBEnd == targetFrenchSec) { /* overflowing the buffer, bail out */ + return ucol_compareUsingSortKeys(coll, source, sourceLength, target, targetLength); + } } while ( tOrder != UCOL_NULLORDER); } @@ -1590,36 +1697,54 @@ ucol_getSortKey(const UCollator *coll, UErrorCode status = U_ZERO_ERROR; - uint8_t prim[2*UCOL_MAX_BUFFER], second[UCOL_MAX_BUFFER], tert[UCOL_MAX_BUFFER]; + uint8_t prim[UCOL_BYTES_PER_PRIM*UCOL_MAX_BUFFER], second[UCOL_MAX_BUFFER], tert[UCOL_MAX_BUFFER]; uint8_t *primaries = prim, *secondaries = second, *tertiaries = tert; - UChar normBuffer[2*UCOL_MAX_BUFFER]; + UChar normBuffer[UCOL_NORMALIZATION_GROWTH*UCOL_MAX_BUFFER]; UChar *normSource = normBuffer; - int32_t normSourceLen = 2048; - - for(i = 0; igetStrength() >= Collator::SECONDARY); - UBool compareTer = (((RuleBasedCollator *)coll)->getStrength() >= Collator::TERTIARY); - UBool compareIdent = (((RuleBasedCollator *)coll)->getStrength() == Collator::IDENTICAL); + UColAttributeValue strength = ucol_getAttribute(coll, UCOL_STRENGTH, &status); - if(len > UCOL_MAX_BUFFER) { - primaries = (uint8_t *)uprv_malloc(6*len*sizeof(uint8_t)); + UBool compareSec = (strength >= UCOL_SECONDARY); + UBool compareTer = (strength >= UCOL_TERTIARY); + UBool compareQuad = (strength >= UCOL_QUATERNARY); + UBool compareIdent = (strength == UCOL_IDENTICAL); + + collIterate s; + init_collIterate((UChar *)source, len, &s, FALSE); + + // If we need to normalize, we'll do it all at once at the beggining! + UNormalizationMode normMode = ucol_getNormalization(coll); + if(normMode != UNORM_NONE) { + normSourceLen = u_normalize(source, sourceLength, normMode, 0, normSource, normSourceLen, &status); + if(U_FAILURE(status)) { + status=U_ZERO_ERROR; + normSource = (UChar *) uprv_malloc((normSourceLen+1)*sizeof(UChar)); + normSourceLen = u_normalize(source, sourceLength, normMode, 0, normSource, (normSourceLen+1), &status); + } + normSource[normSourceLen] = 0; + s.string = normSource; + s.pos = normSource; + s.len = normSource+normSourceLen; + } + + len = s.len-s.pos; + + /* TODO: logic for deciding whether we need to allocate memory should be better */ + /* Generally, (and see below for the reason) we will never get more primaries than */ + /* secondaries or tertiaries. Also, a primary strenght collation with a small stack buffer */ + /* will suffer from additional allocations */ + if(len*(UCOL_BYTES_PER_PRIM*UCOL_MEM_FACTOR_PRIM + 4*UCOL_MEM_FACTOR_SEC_PLUS)>= UCOL_BYTES_PER_PRIM*UCOL_MAX_BUFFER) { + primaries = (uint8_t *)uprv_malloc((UCOL_BYTES_PER_PRIM*UCOL_MEM_FACTOR_PRIM+4*UCOL_MEM_FACTOR_SEC_PLUS)*(len+1)*sizeof(uint8_t)); if(compareSec) { - secondaries = (uint8_t *)uprv_malloc(2*len*sizeof(uint8_t)); + secondaries = (uint8_t *)uprv_malloc(UCOL_MEM_FACTOR_SEC_PLUS*(len+1)*sizeof(uint8_t)); } if(compareTer) { - tertiaries = (uint8_t *)uprv_malloc(2*len*sizeof(uint8_t)); + tertiaries = (uint8_t *)uprv_malloc(UCOL_MEM_FACTOR_SEC_PLUS*(len+1)*sizeof(uint8_t)); } } @@ -1627,26 +1752,6 @@ ucol_getSortKey(const UCollator *coll, uint8_t *secstart = secondaries; uint8_t *terstart = tertiaries; - collIterate s; - init_collIterate((UChar *)source, len, &s, FALSE); - - // If we need to normalize, we'll do it all at once at the beggining! - if(((RuleBasedCollator *)coll)->getDecomposition() != Normalizer::NO_OP) { - UnicodeString normalized; - Normalizer::normalize(UnicodeString(source, sourceLength), ((RuleBasedCollator *)coll)->getDecomposition(), - 0, normalized, status); - normSourceLen = normalized.length(); - - if(normSourceLen > UCOL_MAX_BUFFER) { - normSource = (UChar *) uprv_malloc(normSourceLen*sizeof(UChar)); - } - normalized.extract(0, normSourceLen, normSource); - normSource[normSourceLen] = 0; - s.string = normSource; - s.pos = normSource; - s.len = normSource+normSourceLen; - } - uint32_t order = 0; uint16_t primary = 0; @@ -1679,8 +1784,10 @@ ucol_getSortKey(const UCollator *coll, UCOL_GETNEXTCE(order, coll, s, status); } - - + /* TODO: we shouldn't overuse primary buffer - it makes it hard to do proper allocation */ + /* and guessing how much memory we need. Instead, it would be good to start stuffing data */ + /* in the resulting buffer as soon as we get all the values. After an overflow, we return */ + /* the memory needed */ if(compareSec) { *(primaries++) = UCOL_LEVELTERMINATOR; uint32_t secsize = secondaries-secstart; @@ -1849,8 +1956,7 @@ U_CAPI UColAttributeValue ucol_getAttribute(const UCollator *coll, UColAttribute } U_CAPI UCollator *ucol_safeClone(const UCollator *coll, void *stackBuffer, uint32_t bufferSize, UErrorCode *status) { - *status = U_UNSUPPORTED_ERROR; - return NULL; + return (UCollator *)(((RuleBasedCollator *)coll)->safeClone()); } U_CAPI int32_t ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) { diff --git a/icu4c/source/i18n/unicode/tblcoll.h b/icu4c/source/i18n/unicode/tblcoll.h index 8bac31ddb98..0b7cc36556b 100644 --- a/icu4c/source/i18n/unicode/tblcoll.h +++ b/icu4c/source/i18n/unicode/tblcoll.h @@ -898,6 +898,8 @@ private: const UnicodeString& name, const UnicodeString& suffix); + static UBool initMutex(void); + int32_t getStrengthOrder(NormalizerIterator* cursor, UErrorCode status) const; VectorOfInt* makeReorderedBuffer(NormalizerIterator* cursor, @@ -945,6 +947,9 @@ private: static const char* kFilenameSuffix; + static UBool isMutexInited; + static UMTX collMutex; + //-------------------------------------------------------------------------- // Data Members