mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-96 Better memory management in sort key, safe clone implementation
X-SVN-Rev: 3162
This commit is contained in:
parent
c346349b12
commit
e9151df7a2
3 changed files with 180 additions and 51 deletions
|
@ -66,6 +66,7 @@
|
|||
#include "unicode/resbund.h"
|
||||
#include "filestrm.h"
|
||||
#include "umemstrm.h"
|
||||
#include "umutex.h"
|
||||
|
||||
#ifdef _DEBUG
|
||||
#include "unistrm.h"
|
||||
|
@ -142,6 +143,9 @@ const int16_t RuleBasedCollator::FILEID = 0x5443; // unique f
|
|||
const char* RuleBasedCollator::kFilenameSuffix = ".col"; // binary collation file extension
|
||||
char RuleBasedCollator::fgClassID = 0; // Value is irrelevant // class id
|
||||
|
||||
// Mutex that safeClone() holds while copy-constructing a collator.
// Starts out NULL; initMutex() passes its address to umtx_init().
UMTX RuleBasedCollator::collMutex = NULL;
|
||||
// Set from the return value of initMutex(), so the mutex setup runs when
// this static member is initialized.
UBool RuleBasedCollator::isMutexInited = RuleBasedCollator::initMutex();
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// NormalizerIterator
|
||||
//
|
||||
|
@ -494,6 +498,17 @@ RuleBasedCollator::getStrengthOrder(NormalizerIterator* cursor,
|
|||
|
||||
|
||||
//===============================================================================
|
||||
/**
 * Makes sure the class-wide collMutex has been set up.
 *
 * The isMutexInited flag is tested once without any lock and, when still
 * FALSE, tested again while holding umtx_lock(NULL) (presumably the
 * library's global mutex - confirm against umutex.h), so umtx_init() is
 * reached only by the caller that still sees the flag as FALSE under that lock.
 *
 * @return the value of isMutexInited after the call.
 */
UBool RuleBasedCollator::initMutex() {
    /* Nothing to do when an earlier call already finished the setup. */
    if (isMutexInited != FALSE) {
        return isMutexInited;
    }

    umtx_lock(NULL);
    /* Re-test under the lock: the flag may have changed since the first test. */
    if (isMutexInited == FALSE) {
        umtx_init(&collMutex);
        isMutexInited = TRUE;
    }
    umtx_unlock(NULL);

    return isMutexInited;
}
|
||||
|
||||
RuleBasedCollator::RuleBasedCollator()
|
||||
: Collator(),
|
||||
|
@ -854,7 +869,7 @@ RuleBasedCollator::RuleBasedCollator( const Locale& desiredLocale,
|
|||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Try to load, in order:
|
||||
// 1. The desired locale's collation.
|
||||
// 2. A fallback of the desired locale.
|
||||
|
@ -2980,7 +2995,10 @@ UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCod
|
|||
}
|
||||
|
||||
/**
 * Creates a copy of this collator while holding collMutex.
 *
 * The copy is made with the RuleBasedCollator copy constructor; collMutex is
 * locked for the duration of the construction and released before returning.
 * The former stub statement "return 0;" has been dropped: it preceded the
 * real implementation and made the locking and cloning code unreachable.
 *
 * @return a newly allocated copy of this collator (presumably owned by the
 *         caller - confirm against the class documentation).
 */
Collator* RuleBasedCollator::safeClone(void) {
    umtx_lock(&collMutex);
    Collator *result = new RuleBasedCollator(*this);
    umtx_unlock(&collMutex);
    return result;
}
|
||||
|
||||
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#include "unicode/ustring.h"
|
||||
#include "unicode/normlzr.h"
|
||||
#include "cpputils.h"
|
||||
#include "cstring.h"
|
||||
|
||||
|
||||
static uint8_t utf16fixup[32] = {
|
||||
|
@ -29,8 +30,15 @@ static uint8_t utf16fixup[32] = {
|
|||
#include "ucmp32.h"
|
||||
#include "tcoldata.h"
|
||||
#include "tables.h"
|
||||
#define UCOL_MAX_BUFFER 1000
|
||||
|
||||
#define UCOL_DEBUG
|
||||
|
||||
#define UCOL_MAX_BUFFER 512
|
||||
#define UCOL_MEM_FACTOR_PRIM 12
|
||||
#define UCOL_MEM_FACTOR_SEC_PLUS 8
|
||||
#define UCOL_WRITABLE_BUFFER_SIZE 256
|
||||
#define UCOL_BYTES_PER_PRIM 2
|
||||
#define UCOL_NORMALIZATION_GROWTH 2
|
||||
|
||||
struct collIterate {
|
||||
UChar *string; // Original string
|
||||
|
@ -340,6 +348,47 @@ int32_t ucol_getNextCE(const UCollator *coll, collIterate *source, UErrorCode *s
|
|||
return getComplicatedCE(coll, source, status);
|
||||
}
|
||||
*/
|
||||
UCollationResult ucol_compareUsingSortKeys(const UCollator *coll,
|
||||
const UChar *source,
|
||||
int32_t sourceLength,
|
||||
const UChar *target,
|
||||
int32_t targetLength)
|
||||
{
|
||||
uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
|
||||
uint8_t *sourceKeyP = sourceKey;
|
||||
uint8_t *targetKeyP = targetKey;
|
||||
int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
|
||||
|
||||
sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
|
||||
if(sourceKeyLen > UCOL_MAX_BUFFER) {
|
||||
sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
|
||||
sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
|
||||
}
|
||||
|
||||
targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
|
||||
if(targetKeyLen > UCOL_MAX_BUFFER) {
|
||||
targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
|
||||
targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
|
||||
}
|
||||
|
||||
int32_t result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
|
||||
|
||||
if(sourceKeyP != sourceKey) {
|
||||
uprv_free(sourceKeyP);
|
||||
}
|
||||
|
||||
if(targetKeyP != targetKey) {
|
||||
uprv_free(targetKeyP);
|
||||
}
|
||||
|
||||
if(result<0) {
|
||||
return UCOL_LESS;
|
||||
} else if(result>0) {
|
||||
return UCOL_GREATER;
|
||||
} else {
|
||||
return UCOL_EQUAL;
|
||||
}
|
||||
}
|
||||
|
||||
#define UCOL_GETNEXTCE(order, coll, collationSource, status) { \
|
||||
if (U_FAILURE((status)) || ((collationSource).pos>=(collationSource).len \
|
||||
|
@ -841,6 +890,9 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
|
|||
{
|
||||
sOrder = ucol_getIncrementalCE(coll, &sColl, &status);
|
||||
*(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder);
|
||||
if(sFSBEnd == sourceFrenchSec) { /* overflowing the buffer, bail out */
|
||||
return alternateIncrementalProcessing(coll, &sColl, &tColl);
|
||||
}
|
||||
}
|
||||
|
||||
gets = TRUE;
|
||||
|
@ -849,6 +901,9 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
|
|||
{
|
||||
tOrder = ucol_getIncrementalCE(coll, &tColl, &status);
|
||||
*(--tFSBEnd) = UCOL_SECONDARYORDER(tOrder);
|
||||
if(tFSBEnd == targetFrenchSec) { /* overflowing the buffer, bail out */
|
||||
return alternateIncrementalProcessing(coll, &sColl, &tColl);
|
||||
}
|
||||
}
|
||||
|
||||
gett = TRUE;
|
||||
|
@ -985,8 +1040,11 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
|
|||
bufferFrenchSec = TRUE;
|
||||
}
|
||||
}
|
||||
sOrder = ucol_getIncrementalCE(coll, &sColl, &status);
|
||||
*(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder);
|
||||
sOrder = ucol_getIncrementalCE(coll, &sColl, &status);
|
||||
*(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder);
|
||||
if(sFSBEnd == sourceFrenchSec) { /* overflowing the buffer, bail out */
|
||||
return alternateIncrementalProcessing(coll, &sColl, &tColl);
|
||||
}
|
||||
}
|
||||
//while ((sOrder = ucol_getIncrementalCE(coll, &sColl, &status)) != CollationElementIterator::NULLORDER);
|
||||
while (sOrder != UCOL_NULLORDER);
|
||||
|
@ -1013,6 +1071,9 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
|
|||
}
|
||||
tOrder = ucol_getIncrementalCE(coll, &tColl, &status);
|
||||
*(--tFSBEnd) = UCOL_SECONDARYORDER(tOrder);
|
||||
if(tFSBEnd == targetFrenchSec) { /* overflowing the buffer, bail out */
|
||||
return alternateIncrementalProcessing(coll, &sColl, &tColl);
|
||||
}
|
||||
}
|
||||
while ( tOrder != UCOL_NULLORDER);
|
||||
}
|
||||
|
@ -1090,6 +1151,8 @@ ucol_strcoll( const UCollator *coll,
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
UChar normSource[UCOL_MAX_BUFFER], normTarget[UCOL_MAX_BUFFER];
|
||||
UChar *normSourceP = normSource;
|
||||
UChar *normTargetP = normTarget;
|
||||
uint32_t normSourceLength = UCOL_MAX_BUFFER, normTargetLength = UCOL_MAX_BUFFER;
|
||||
|
||||
collIterate sColl, tColl;
|
||||
|
@ -1097,10 +1160,42 @@ ucol_strcoll( const UCollator *coll,
|
|||
if(cppColl->getDecomposition() == Normalizer::NO_OP) {
|
||||
init_collIterate(source, sourceLength, &sColl, FALSE);
|
||||
init_collIterate(target, targetLength, &tColl, FALSE);
|
||||
} else {
|
||||
} else { /* TODO: This is bad behaved if we're working with small buffers */
|
||||
/* We really need the normalization quick check here*/
|
||||
UNormalizationMode normMode = ucol_getNormalization(coll);
|
||||
normSourceLength = u_normalize(source, sourceLength, normMode, 0, normSource, normSourceLength, &status);
|
||||
if(U_FAILURE(status)) { /* This would be buffer overflow */
|
||||
normSourceP = (UChar *)uprv_malloc((normSourceLength+1)*sizeof(UChar));
|
||||
status = U_ZERO_ERROR;
|
||||
normSourceLength = u_normalize(source, sourceLength, normMode, 0, normSourceP, normSourceLength+1, &status);
|
||||
normTargetLength = u_normalize(target, targetLength, normMode, 0, normTargetP, normTargetLength, &status);
|
||||
if(U_FAILURE(status)) { /* This would be buffer overflow */
|
||||
normTargetP = (UChar *)uprv_malloc((normTargetLength+1)*sizeof(UChar));
|
||||
status = U_ZERO_ERROR;
|
||||
normTargetLength = u_normalize(target, targetLength, normMode, 0, normTargetP, normTargetLength+1, &status);
|
||||
}
|
||||
Normalizer::EMode mode = cppColl->getDecomposition();
|
||||
cppColl->setDecomposition(Normalizer::NO_OP);
|
||||
UCollationResult result = ucol_strcoll(coll, normSourceP, normSourceLength, normTargetP, normTargetLength);
|
||||
cppColl->setDecomposition(mode);
|
||||
uprv_free(normSourceP);
|
||||
if(normTargetP != normTarget) {
|
||||
uprv_free(normTargetP);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
normTargetLength = u_normalize(target, targetLength, normMode, 0, normTarget, normTargetLength, &status);
|
||||
if(U_FAILURE(status)) { /* This would be buffer overflow */
|
||||
normTargetP = (UChar *)uprv_malloc((normTargetLength+1)*sizeof(UChar));
|
||||
status = U_ZERO_ERROR;
|
||||
normTargetLength = u_normalize(target, targetLength, normMode, 0, normTargetP, normTargetLength+1, &status);
|
||||
Normalizer::EMode mode = cppColl->getDecomposition();
|
||||
cppColl->setDecomposition(Normalizer::NO_OP);
|
||||
UCollationResult result = ucol_strcoll(coll, normSourceP, normSourceLength, normTargetP, normTargetLength);
|
||||
cppColl->setDecomposition(mode);
|
||||
uprv_free(normTargetP);
|
||||
return result;
|
||||
}
|
||||
init_collIterate(normSource, normSourceLength, &sColl, TRUE);
|
||||
init_collIterate(normTarget, normTargetLength, &tColl, TRUE);
|
||||
}
|
||||
|
@ -1321,6 +1416,9 @@ ucol_strcoll( const UCollator *coll,
|
|||
{
|
||||
UCOL_GETNEXTCE(sOrder, coll, sColl, status);
|
||||
*(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder);
|
||||
if(sFSBEnd == sourceFrenchSec) { /* overflowing the buffer, bail out */
|
||||
return ucol_compareUsingSortKeys(coll, source, sourceLength, target, targetLength);
|
||||
}
|
||||
}
|
||||
|
||||
gets = TRUE;
|
||||
|
@ -1329,6 +1427,9 @@ ucol_strcoll( const UCollator *coll,
|
|||
{
|
||||
UCOL_GETNEXTCE(tOrder, coll, tColl, status);
|
||||
*(--tFSBEnd) = UCOL_SECONDARYORDER(tOrder);
|
||||
if(tFSBEnd == targetFrenchSec) { /* overflowing the buffer, bail out */
|
||||
return ucol_compareUsingSortKeys(coll, source, sourceLength, target, targetLength);
|
||||
}
|
||||
}
|
||||
|
||||
gett = TRUE;
|
||||
|
@ -1465,8 +1566,11 @@ ucol_strcoll( const UCollator *coll,
|
|||
bufferFrenchSec = TRUE;
|
||||
}
|
||||
}
|
||||
UCOL_GETNEXTCE(sOrder, coll, sColl, status);
|
||||
*(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder);
|
||||
UCOL_GETNEXTCE(sOrder, coll, sColl, status);
|
||||
*(--sFSBEnd) = UCOL_SECONDARYORDER(sOrder);
|
||||
if(sFSBEnd == sourceFrenchSec) { /* overflowing the buffer, bail out */
|
||||
return ucol_compareUsingSortKeys(coll, source, sourceLength, target, targetLength);
|
||||
}
|
||||
}
|
||||
//while ((sOrder = ucol_getNextCE(coll, &sColl, &status)) != CollationElementIterator::NULLORDER);
|
||||
while (sOrder != UCOL_NULLORDER);
|
||||
|
@ -1493,6 +1597,9 @@ ucol_strcoll( const UCollator *coll,
|
|||
}
|
||||
UCOL_GETNEXTCE(tOrder, coll, tColl, status);
|
||||
*(--tFSBEnd) = UCOL_SECONDARYORDER(tOrder);
|
||||
if(tFSBEnd == targetFrenchSec) { /* overflowing the buffer, bail out */
|
||||
return ucol_compareUsingSortKeys(coll, source, sourceLength, target, targetLength);
|
||||
}
|
||||
}
|
||||
while ( tOrder != UCOL_NULLORDER);
|
||||
}
|
||||
|
@ -1590,36 +1697,54 @@ ucol_getSortKey(const UCollator *coll,
|
|||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
uint8_t prim[2*UCOL_MAX_BUFFER], second[UCOL_MAX_BUFFER], tert[UCOL_MAX_BUFFER];
|
||||
uint8_t prim[UCOL_BYTES_PER_PRIM*UCOL_MAX_BUFFER], second[UCOL_MAX_BUFFER], tert[UCOL_MAX_BUFFER];
|
||||
|
||||
uint8_t *primaries = prim, *secondaries = second, *tertiaries = tert;
|
||||
|
||||
UChar normBuffer[2*UCOL_MAX_BUFFER];
|
||||
UChar normBuffer[UCOL_NORMALIZATION_GROWTH*UCOL_MAX_BUFFER];
|
||||
UChar *normSource = normBuffer;
|
||||
int32_t normSourceLen = 2048;
|
||||
|
||||
for(i = 0; i<UCOL_MAX_BUFFER; i++) {
|
||||
prim[i]=second[i]=tert[i]='\0';
|
||||
}
|
||||
|
||||
for(i = UCOL_MAX_BUFFER; i<2*UCOL_MAX_BUFFER; i++) {
|
||||
prim[i]=normBuffer[i]='\0';
|
||||
}
|
||||
|
||||
int32_t normSourceLen = UCOL_NORMALIZATION_GROWTH*UCOL_MAX_BUFFER;
|
||||
|
||||
int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
|
||||
|
||||
UBool compareSec = (((RuleBasedCollator *)coll)->getStrength() >= Collator::SECONDARY);
|
||||
UBool compareTer = (((RuleBasedCollator *)coll)->getStrength() >= Collator::TERTIARY);
|
||||
UBool compareIdent = (((RuleBasedCollator *)coll)->getStrength() == Collator::IDENTICAL);
|
||||
UColAttributeValue strength = ucol_getAttribute(coll, UCOL_STRENGTH, &status);
|
||||
|
||||
if(len > UCOL_MAX_BUFFER) {
|
||||
primaries = (uint8_t *)uprv_malloc(6*len*sizeof(uint8_t));
|
||||
UBool compareSec = (strength >= UCOL_SECONDARY);
|
||||
UBool compareTer = (strength >= UCOL_TERTIARY);
|
||||
UBool compareQuad = (strength >= UCOL_QUATERNARY);
|
||||
UBool compareIdent = (strength == UCOL_IDENTICAL);
|
||||
|
||||
collIterate s;
|
||||
init_collIterate((UChar *)source, len, &s, FALSE);
|
||||
|
||||
// If we need to normalize, we'll do it all at once at the beginning!
|
||||
UNormalizationMode normMode = ucol_getNormalization(coll);
|
||||
if(normMode != UNORM_NONE) {
|
||||
normSourceLen = u_normalize(source, sourceLength, normMode, 0, normSource, normSourceLen, &status);
|
||||
if(U_FAILURE(status)) {
|
||||
status=U_ZERO_ERROR;
|
||||
normSource = (UChar *) uprv_malloc((normSourceLen+1)*sizeof(UChar));
|
||||
normSourceLen = u_normalize(source, sourceLength, normMode, 0, normSource, (normSourceLen+1), &status);
|
||||
}
|
||||
normSource[normSourceLen] = 0;
|
||||
s.string = normSource;
|
||||
s.pos = normSource;
|
||||
s.len = normSource+normSourceLen;
|
||||
}
|
||||
|
||||
len = s.len-s.pos;
|
||||
|
||||
/* TODO: logic for deciding whether we need to allocate memory should be better */
|
||||
/* Generally, (and see below for the reason) we will never get more primaries than */
|
||||
/* secondaries or tertiaries. Also, a primary strength collation with a small stack buffer */
|
||||
/* will suffer from additional allocations */
|
||||
if(len*(UCOL_BYTES_PER_PRIM*UCOL_MEM_FACTOR_PRIM + 4*UCOL_MEM_FACTOR_SEC_PLUS)>= UCOL_BYTES_PER_PRIM*UCOL_MAX_BUFFER) {
|
||||
primaries = (uint8_t *)uprv_malloc((UCOL_BYTES_PER_PRIM*UCOL_MEM_FACTOR_PRIM+4*UCOL_MEM_FACTOR_SEC_PLUS)*(len+1)*sizeof(uint8_t));
|
||||
if(compareSec) {
|
||||
secondaries = (uint8_t *)uprv_malloc(2*len*sizeof(uint8_t));
|
||||
secondaries = (uint8_t *)uprv_malloc(UCOL_MEM_FACTOR_SEC_PLUS*(len+1)*sizeof(uint8_t));
|
||||
}
|
||||
if(compareTer) {
|
||||
tertiaries = (uint8_t *)uprv_malloc(2*len*sizeof(uint8_t));
|
||||
tertiaries = (uint8_t *)uprv_malloc(UCOL_MEM_FACTOR_SEC_PLUS*(len+1)*sizeof(uint8_t));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1627,26 +1752,6 @@ ucol_getSortKey(const UCollator *coll,
|
|||
uint8_t *secstart = secondaries;
|
||||
uint8_t *terstart = tertiaries;
|
||||
|
||||
collIterate s;
|
||||
init_collIterate((UChar *)source, len, &s, FALSE);
|
||||
|
||||
// If we need to normalize, we'll do it all at once at the beginning!
|
||||
if(((RuleBasedCollator *)coll)->getDecomposition() != Normalizer::NO_OP) {
|
||||
UnicodeString normalized;
|
||||
Normalizer::normalize(UnicodeString(source, sourceLength), ((RuleBasedCollator *)coll)->getDecomposition(),
|
||||
0, normalized, status);
|
||||
normSourceLen = normalized.length();
|
||||
|
||||
if(normSourceLen > UCOL_MAX_BUFFER) {
|
||||
normSource = (UChar *) uprv_malloc(normSourceLen*sizeof(UChar));
|
||||
}
|
||||
normalized.extract(0, normSourceLen, normSource);
|
||||
normSource[normSourceLen] = 0;
|
||||
s.string = normSource;
|
||||
s.pos = normSource;
|
||||
s.len = normSource+normSourceLen;
|
||||
}
|
||||
|
||||
uint32_t order = 0;
|
||||
|
||||
uint16_t primary = 0;
|
||||
|
@ -1679,8 +1784,10 @@ ucol_getSortKey(const UCollator *coll,
|
|||
UCOL_GETNEXTCE(order, coll, s, status);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* TODO: we shouldn't overuse primary buffer - it makes it hard to do proper allocation */
|
||||
/* and guessing how much memory we need. Instead, it would be good to start stuffing data */
|
||||
/* in the resulting buffer as soon as we get all the values. After an overflow, we return */
|
||||
/* the memory needed */
|
||||
if(compareSec) {
|
||||
*(primaries++) = UCOL_LEVELTERMINATOR;
|
||||
uint32_t secsize = secondaries-secstart;
|
||||
|
@ -1849,8 +1956,7 @@ U_CAPI UColAttributeValue ucol_getAttribute(const UCollator *coll, UColAttribute
|
|||
}
|
||||
|
||||
/*
 * C API entry point for cloning a collator.
 *
 * Treats coll as a RuleBasedCollator and returns the result of its
 * safeClone() method. The former stub that set *status to
 * U_UNSUPPORTED_ERROR and returned NULL has been dropped: it preceded the
 * delegating return and made it unreachable.
 *
 * coll         collator to clone
 * stackBuffer  not used by this implementation
 * bufferSize   not used by this implementation
 * status       not modified by this implementation
 *
 * Returns the clone produced by RuleBasedCollator::safeClone().
 */
U_CAPI UCollator *ucol_safeClone(const UCollator *coll, void *stackBuffer, uint32_t bufferSize, UErrorCode *status) {
    return (UCollator *)(((RuleBasedCollator *)coll)->safeClone());
}
|
||||
|
||||
U_CAPI int32_t ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) {
|
||||
|
|
|
@ -898,6 +898,8 @@ private:
|
|||
const UnicodeString& name,
|
||||
const UnicodeString& suffix);
|
||||
|
||||
static UBool initMutex(void);
|
||||
|
||||
int32_t getStrengthOrder(NormalizerIterator* cursor,
|
||||
UErrorCode status) const;
|
||||
VectorOfInt* makeReorderedBuffer(NormalizerIterator* cursor,
|
||||
|
@ -945,6 +947,9 @@ private:
|
|||
|
||||
static const char* kFilenameSuffix;
|
||||
|
||||
static UBool isMutexInited;
|
||||
static UMTX collMutex;
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
// Data Members
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue