ICU-5420 merge changes from branches/eric/string-search r.23303 - r.23976

X-SVN-Rev: 23977
This commit is contained in:
Eric Mader 2008-05-23 04:22:28 +00:00
parent 905f90890e
commit 9011fe483f
26 changed files with 4690 additions and 211 deletions

View file

@ -106,9 +106,21 @@ void SearchIterator::setBreakIterator(BreakIterator *breakiter,
UErrorCode &status)
{
if (U_SUCCESS(status)) {
#if 0
m_search_->breakIter = NULL;
// the c++ breakiterator may not make use of ubreakiterator.
// so we'll have to keep track of it ourselves.
#else
// Well, gee... the Constructors that take a BreakIterator
// all cast the BreakIterator to a UBreakIterator and
// pass it to the corresponding usearch_openFromXXX
// routine, so there's no reason not to do this.
//
// Besides, a UBreakIterator is a BreakIterator, so
// any subclass of BreakIterator should work fine here...
m_search_->breakIter = (UBreakIterator *) breakiter;
#endif
m_breakiterator_ = breakiter;
}
}
@ -283,10 +295,16 @@ int32_t SearchIterator::previous(UErrorCode &status)
}
if (matchindex != USEARCH_DONE) {
if (m_search_->isOverlap) {
matchindex += m_search_->matchedLength - 2;
}
return handlePrev(matchindex, status);
}
return handlePrev(offset, status);
}
return USEARCH_DONE;
}

View file

@ -350,11 +350,13 @@ int32_t StringSearch::handleNext(int32_t position, UErrorCode &status)
// looking at usearch.cpp, this part is shifted out to
// StringSearch instead of SearchIterator because m_strsrch_ is
// not accessible in SearchIterator
#if 0
if (position + m_strsrch_->pattern.defaultShiftSize
> m_search_->textLength) {
setMatchNotFound();
return USEARCH_DONE;
}
#endif
if (m_search_->matchedLength <= 0) {
// the flipping direction issue has already been handled
// in next()
@ -366,6 +368,8 @@ int32_t StringSearch::handleNext(int32_t position, UErrorCode &status)
}
ucol_setOffset(m_strsrch_->textIter, position, &status);
#if 0
for (;;) {
if (m_search_->isCanonicalMatch) {
// can't use exact here since extra accents are allowed.
@ -397,6 +401,29 @@ int32_t StringSearch::handleNext(int32_t position, UErrorCode &status)
return m_search_->matchedIndex;
}
}
#else
// if m_strsrch_->breakIter is always the same as m_breakiterator_
// then we don't need to check the match boundaries here because
// usearch_handleNextXXX will already have done it.
if (m_search_->isCanonicalMatch) {
// *could* actually use exact here 'cause no extra accents allowed...
usearch_handleNextCanonical(m_strsrch_, &status);
} else {
usearch_handleNextExact(m_strsrch_, &status);
}
if (U_FAILURE(status)) {
return USEARCH_DONE;
}
if (m_search_->matchedIndex == USEARCH_DONE) {
ucol_setOffset(m_strsrch_->textIter, m_search_->textLength, &status);
} else {
ucol_setOffset(m_strsrch_->textIter, m_search_->matchedIndex, &status);
}
return m_search_->matchedIndex;
#endif
}
}
return USEARCH_DONE;
@ -424,11 +451,13 @@ int32_t StringSearch::handlePrev(int32_t position, UErrorCode &status)
// looking at usearch.cpp, this part is shifted out to
// StringSearch instead of SearchIterator because m_strsrch_ is
// not accessible in SearchIterator
#if 0
if (!m_search_->isOverlap &&
position - m_strsrch_->pattern.defaultShiftSize < 0) {
setMatchNotFound();
return USEARCH_DONE;
}
for (;;) {
if (m_search_->isCanonicalMatch) {
// can't use exact here since extra accents are allowed.
@ -452,6 +481,22 @@ int32_t StringSearch::handlePrev(int32_t position, UErrorCode &status)
return m_search_->matchedIndex;
}
}
#else
ucol_setOffset(m_strsrch_->textIter, position, &status);
if (m_search_->isCanonicalMatch) {
// *could* use exact match here since extra accents *not* allowed!
usearch_handlePreviousCanonical(m_strsrch_, &status);
} else {
usearch_handlePreviousExact(m_strsrch_, &status);
}
if (U_FAILURE(status)) {
return USEARCH_DONE;
}
return m_search_->matchedIndex;
#endif
}
return m_search_->matchedIndex;

View file

@ -101,6 +101,10 @@ inline void IInit_collIterate(const UCollator *collator, const UChar *sourceStri
(s)->extendCEs = NULL;
(s)->extendCEsSize = 0;
(s)->CEpos = (s)->toReturn = (s)->CEs;
(s)->offsetBuffer = NULL;
(s)->offsetBufferSize = 0;
(s)->offsetReturn = (s)->offsetStore = NULL;
(s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
(s)->writableBuffer = (s)->stackWritableBuffer;
(s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
(s)->coll = (collator);
@ -175,6 +179,7 @@ inline void loadState(collIterate *data, const collIterateState *backup,
}
}
data->pos = backup->pos;
if ((data->flags & UCOL_ITER_INNORMBUF) &&
data->writableBuffer != backup->bufferaddress) {
/*
@ -1377,6 +1382,7 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
}
UChar ch = 0;
collationSource->offsetReturn = NULL;
for (;;) /* Loop handles case when incremental normalize switches */
{ /* to or from the side buffer / original string, and we */
@ -1586,6 +1592,83 @@ void collPrevIterNormalize(collIterate *data)
unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
normLen, &status);
if (data->offsetBuffer == NULL) {
int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE;
data->offsetBufferSize = len;
data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len);
data->offsetStore = data->offsetBuffer;
} else if(data->offsetBufferSize < (int32_t) normLen) {
int32_t storeIX = data->offsetStore - data->offsetBuffer;
int32_t *tob = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1));
if (tob != NULL) {
data->offsetBuffer = tob;
data->offsetStore = &data->offsetBuffer[storeIX];
data->offsetBufferSize = normLen + 1;
}
}
/*
* The usual case at this point is that we've got a base
* character followed by marks that were normalized. If
* fcdPosition is NULL, that means that we backed up to
* the beginning of the string and there's no base character.
*
* Forward processing will usually normalize when it sees
* the first mark, so that mark will get its natural offset
* and the rest will get the offset of the character following
* the marks. The base character will also get its natural offset.
*
* We write the offset of the base character, if there is one,
* followed by the offset of the first mark and then the offsets
* of the rest of the marks.
*/
int32_t firstMarkOffset = 0;
int32_t trailOffset = data->pos - data->string + 1;
int32_t trailCount = normLen - 1;
if (data->fcdPosition != NULL) {
int32_t baseOffset = data->fcdPosition - data->string;
UChar baseChar = *data->fcdPosition;
firstMarkOffset = baseOffset + 1;
/*
* If the base character is the start of a contraction, forward processing
* will normalize the marks while checking for the contraction, which means
* that the offset of the first mark will be the same as the other marks.
*
* **** THIS IS PROBABLY NOT A COMPLETE TEST ****
*/
if (baseChar >= 0x100) {
int32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
}
if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
firstMarkOffset = trailOffset;
}
}
*(data->offsetStore++) = baseOffset;
}
*(data->offsetStore++) = firstMarkOffset;
for (int32_t i = 0; i < trailCount; i += 1) {
*(data->offsetStore++) = trailOffset;
}
data->offsetRepeatValue = trailOffset;
data->offsetReturn = data->offsetStore - 1;
if (data->offsetReturn == data->offsetBuffer) {
data->offsetStore = data->offsetBuffer;
}
data->pos = data->writableBuffer + data->writableBufSize;
data->origFlags = data->flags;
data->flags |= UCOL_ITER_INNORMBUF;
@ -1756,10 +1839,24 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
UErrorCode *status)
{
uint32_t result = (uint32_t)UCOL_NULLORDER;
if (data->offsetReturn != NULL) {
if (data->offsetRepeatCount > 0) {
data->offsetRepeatCount -= 1;
} else {
if (data->offsetReturn == data->offsetBuffer) {
data->offsetReturn = NULL;
data->offsetStore = data->offsetBuffer;
} else {
data->offsetReturn -= 1;
}
}
}
if ((data->extendCEs && data->toReturn > data->extendCEs) ||
(!data->extendCEs && data->toReturn > data->CEs))
{
data->toReturn --;
data->toReturn -= 1;
result = *(data->toReturn);
if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
data->CEpos = data->toReturn;
@ -1767,6 +1864,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
}
else {
UChar ch = 0;
/*
Loop handles case when incremental normalize switches to or from the
side buffer / original string, and we need to start again to get the
@ -1813,6 +1911,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
data->pos = data->fcdPosition + 1;
}
data->flags = data->origFlags;
data->offsetRepeatValue = 0;
continue;
}
}
@ -1903,10 +2002,12 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
}
}
}
if(result == UCOL_NOT_FOUND) {
result = getPrevImplicit(ch, data);
}
}
return result;
}
@ -2399,6 +2500,7 @@ inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
}
uint32_t r = uprv_uca_getImplicitPrimary(cp);
*(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
collationSource->offsetRepeatCount += 1;
return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
}
@ -2871,6 +2973,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
{
*(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
source->offsetRepeatCount += 1;
return CE;
}
case EXPANSION_TAG:
@ -2880,18 +2983,24 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
/* I have to decide where continuations are going to be dealt with */
uint32_t size;
uint32_t i; /* general counter */
CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
size = getExpansionCount(CE);
CE = *CEOffset++;
//source->offsetRepeatCount = -1;
if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
for(i = 1; i<size; i++) {
*(source->CEpos++) = *CEOffset++;
source->offsetRepeatCount += 1;
}
} else { /* else, we do */
while(*CEOffset != 0) {
*(source->CEpos++) = *CEOffset++;
source->offsetRepeatCount += 1;
}
}
return CE;
}
case DIGIT_TAG:
@ -3263,6 +3372,29 @@ inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
*(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
collationSource->toReturn = collationSource->CEpos;
if (collationSource->offsetBuffer == NULL) {
collationSource->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
collationSource->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
collationSource->offsetStore = collationSource->offsetBuffer;
}
// **** doesn't work if using iterator ****
if (collationSource->flags & UCOL_ITER_INNORMBUF) {
collationSource->offsetRepeatCount = 1;
} else {
int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
*(collationSource->offsetStore++) = firstOffset;
*(collationSource->offsetStore++) = firstOffset + 1;
collationSource->offsetReturn = collationSource->offsetStore - 1;
*(collationSource->offsetBuffer) = firstOffset;
if (collationSource->offsetReturn == collationSource->offsetBuffer) {
collationSource->offsetStore = collationSource->offsetBuffer;
}
}
return ((r & 0x0000FFFF)<<16) | 0x000000C0;
}
@ -3293,6 +3425,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
{
case NOT_FOUND_TAG: /* this tag always returns */
return CE;
case SPEC_PROC_TAG:
{
// Special processing is getting a CE that is preceded by a certain prefix
@ -3450,15 +3583,54 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
*(UCharOffset) = schar;
noChars++;
int32_t offsetBias;
#if 0
if (source->offsetReturn != NULL) {
source->offsetStore = source->offsetReturn - noChars;
}
// **** doesn't work if using iterator ****
if (source->flags & UCOL_ITER_INNORMBUF) {
if (source->fcdPosition == NULL) {
offsetBias = 0;
} else {
offsetBias = (int32_t)(source->fcdPosition - source->string);
}
} else {
offsetBias = (int32_t)(source->pos - source->string);
}
#else
// **** doesn't work if using iterator ****
if (source->flags & UCOL_ITER_INNORMBUF) {
#if 1
offsetBias = -1;
#else
if (source->fcdPosition == NULL) {
offsetBias = 0;
} else {
offsetBias = (int32_t)(source->fcdPosition - source->string);
}
#endif
} else {
offsetBias = (int32_t)(source->pos - source->string);
}
#endif
/* a new collIterate is used to simplify things, since using the current
collIterate will mean that the forward and backwards iteration will
share and change the same buffers. we don't want to get into that. */
collIterate temp;
int32_t rawOffset;
//IInit_collIterate(coll, UCharOffset, -1, &temp);
IInit_collIterate(coll, UCharOffset, noChars, &temp);
temp.flags &= ~UCOL_ITER_NORM;
rawOffset = temp.pos - temp.string; // should always be zero?
CE = ucol_IGetNextCE(coll, &temp, status);
if (source->extendCEs) {
endCEBuffer = source->extendCEs + source->extendCEsSize;
CECount = (source->CEpos - source->extendCEs)/sizeof(uint32_t);
@ -3466,8 +3638,20 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
CECount = (source->CEpos - source->CEs)/sizeof(uint32_t);
}
if (source->offsetBuffer == NULL) {
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
source->offsetStore = source->offsetBuffer;
}
while (CE != UCOL_NO_MORE_CES) {
*(source->CEpos ++) = CE;
if (offsetBias >= 0) {
*(source->offsetStore ++) = rawOffset + offsetBias;
}
CECount++;
if (source->CEpos == endCEBuffer) {
/* ran out of CE space, reallocate to new buffer.
@ -3494,43 +3678,135 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
source->extendCEs = tempBufCE;
}
}
if (CECount == -1) {
*status = U_MEMORY_ALLOCATION_ERROR;
source->extendCEsSize = 0;
source->CEpos = source->CEs;
freeHeapWritableBuffer(&temp);
if (strbuffer != buffer) {
uprv_free(strbuffer);
}
return (uint32_t)UCOL_NULLORDER;
}
source->CEpos = source->extendCEs + CECount;
endCEBuffer = source->extendCEs + source->extendCEsSize;
}
if (offsetBias >= 0 && source->offsetStore >= &source->offsetBuffer[source->offsetBufferSize]) {
int32_t storeIX = source->offsetStore - source->offsetBuffer;
int32_t *tob = (int32_t *) uprv_realloc(source->offsetBuffer,
sizeof(int32_t) * (source->offsetBufferSize + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
if (tob != NULL) {
source->offsetBuffer = tob;
source->offsetStore = &source->offsetBuffer[storeIX];
source->offsetBufferSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE;
} else {
// memory error...
*status = U_MEMORY_ALLOCATION_ERROR;
source->CEpos = source->CEs;
freeHeapWritableBuffer(&temp);
if (strbuffer != buffer) {
uprv_free(strbuffer);
}
return (uint32_t) UCOL_NULLORDER;
}
}
rawOffset = temp.pos - temp.string;
CE = ucol_IGetNextCE(coll, &temp, status);
}
if (source->offsetRepeatValue != 0) {
if (CECount > noChars) {
source->offsetRepeatCount += temp.offsetRepeatCount;
} else {
// **** does this really skip the right offsets? ****
source->offsetReturn -= (noChars - CECount);
}
}
freeHeapWritableBuffer(&temp);
if (strbuffer != buffer) {
uprv_free(strbuffer);
}
if (offsetBias >= 0) {
source->offsetReturn = source->offsetStore - 1;
if (source->offsetReturn == source->offsetBuffer) {
source->offsetStore = source->offsetBuffer;
}
}
source->toReturn = source->CEpos - 1;
if (source->toReturn == source->CEs) {
source->CEpos = source->CEs;
}
return *(source->toReturn);
case LONG_PRIMARY_TAG:
{
*(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
*(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
source->toReturn = source->CEpos - 1;
if (source->offsetBuffer == NULL) {
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
source->offsetStore = source->offsetBuffer;
}
if (source->flags & UCOL_ITER_INNORMBUF) {
source->offsetRepeatCount = 1;
} else {
int32_t firstOffset = (int32_t)(source->pos - source->string);
*(source->offsetStore++) = firstOffset;
*(source->offsetStore++) = firstOffset + 1;
source->offsetReturn = source->offsetStore - 1;
*(source->offsetBuffer) = firstOffset;
if (source->offsetReturn == source->offsetBuffer) {
source->offsetStore = source->offsetBuffer;
}
}
return *(source->toReturn);
}
case EXPANSION_TAG: /* this tag always returns */
{
/*
This should handle expansion.
NOTE: we can encounter both continuations and expansions in an expansion!
I have to decide where continuations are going to be dealt with
*/
int32_t firstOffset = (int32_t)(source->pos - source->string);
// **** doesn't work if using iterator ****
if (source->offsetReturn != NULL) {
if (source->offsetReturn == source->offsetBuffer) {
source->offsetStore = source->offsetBuffer;
}else {
firstOffset = -1;
}
}
if (source->offsetBuffer == NULL) {
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
source->offsetStore = source->offsetBuffer;
}
/* find the offset to expansion table */
CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
size = getExpansionCount(CE);
@ -3539,23 +3815,45 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
if there are less than 16 elements in expansion, we don't terminate
*/
uint32_t count;
for (count = 0; count < size; count++) {
*(source->CEpos ++) = *CEOffset++;
if (firstOffset >= 0) {
*(source->offsetStore ++) = firstOffset + 1;
}
}
}
else {
} else {
/* else, we do */
while (*CEOffset != 0) {
*(source->CEpos ++) = *CEOffset ++;
if (firstOffset >= 0) {
*(source->offsetStore ++) = firstOffset + 1;
}
}
}
if (firstOffset >= 0) {
source->offsetReturn = source->offsetStore - 1;
*(source->offsetBuffer) = firstOffset;
if (source->offsetReturn == source->offsetBuffer) {
source->offsetStore = source->offsetBuffer;
}
} else {
source->offsetRepeatCount += size - 1;
}
source->toReturn = source->CEpos - 1;
// in case of one element expansion, we
// want to immediately return CEpos
if(source->toReturn == source->CEs) {
source->CEpos = source->CEs;
}
return *(source->toReturn);
}
case DIGIT_TAG:
{
/*
@ -3592,7 +3890,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
handle surrogates...
*/
if (U16_IS_TRAIL (ch)){
if (U16_IS_TRAIL (ch)) {
if (!collIter_bos(source)){
UChar lead = getPrevNormalizedChar(source, status);
if(U16_IS_LEAD(lead)) {
@ -3609,12 +3907,11 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
}
digVal = u_charDigitValue(char32);
for(;;){
for(;;) {
// Make sure we have enough space.
if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
{
if (digIndx >= ((numTempBufSize - 2) * 2) + 1) {
numTempBufSize *= 2;
if (numTempBuf == stackNumTempBuf){
if (numTempBuf == stackNumTempBuf) {
numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
// Null pointer check
if (numTempBuf == NULL) {
@ -3622,7 +3919,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
return 0;
}
uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
}else {
} else {
uint8_t *temp = (uint8_t *)uprv_realloc(numTempBuf, numTempBufSize);
if (temp == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
@ -3637,7 +3934,8 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
// Skip over trailing zeroes, and keep a count of them.
if (digVal != 0)
nonZeroValReached = TRUE;
if (nonZeroValReached){
if (nonZeroValReached) {
/*
We parse the digit string into base 100 numbers (this fits into a byte).
We only add to the buffer in twos, thus if we are parsing an odd character,
@ -3651,7 +3949,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
ones place and the second digit encountered into the tens place.
*/
if ((digIndx + trailingZeroCount) % 2 == 1){
if ((digIndx + trailingZeroCount) % 2 == 1) {
// High-order digit case (tens place)
collateVal += (uint8_t)(digVal * 10);
@ -3665,37 +3963,33 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
collateVal = 0;
}
else{
} else {
// Low-order digit case (ones place)
collateVal = (uint8_t)digVal;
// Check for leading zeroes.
if (collateVal == 0)
{
if (collateVal == 0) {
if (!leadingZeroIndex)
leadingZeroIndex = (digIndx/2) + 2;
}
else
} else
leadingZeroIndex = 0;
// No need to write to buffer; the case of a last odd digit
// is handled below.
}
++digIndx;
}
else
} else
++trailingZeroCount;
if (!collIter_bos(source)){
if (!collIter_bos(source)) {
ch = getPrevNormalizedChar(source, status);
//goBackOne(source);
if (U16_IS_TRAIL(ch)){
if (U16_IS_TRAIL(ch)) {
backupState(source, &state);
if (!collIter_bos(source))
{
if (!collIter_bos(source)) {
goBackOne(source);
UChar lead = getPrevNormalizedChar(source, status);
if(U16_IS_LEAD(lead)) {
char32 = U16_GET_SUPPLEMENTARY(lead,ch);
} else {
@ -3703,11 +3997,10 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
char32 = ch;
}
}
}
else
} else
char32 = ch;
if ((digVal = u_charDigitValue(char32)) == -1){
if ((digVal = u_charDigitValue(char32)) == -1) {
if (char32 > 0xFFFF) {// For surrogates.
loadState(source, &state, FALSE);
}
@ -3717,22 +4010,23 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
//getNextNormalizedChar(source);
break;
}
goBackOne(source);
}else
break;
}
if (nonZeroValReached == FALSE){
if (! nonZeroValReached) {
digIndx = 2;
trailingZeroCount = 0;
numTempBuf[2] = 6;
}
if ((digIndx + trailingZeroCount) % 2 != 0){
if ((digIndx + trailingZeroCount) % 2 != 0) {
numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
digIndx += 1; // The implicit leading zero
}
if (trailingZeroCount % 2 != 0){
if (trailingZeroCount % 2 != 0) {
// We had to consume one trailing zero for the low digit
// of the least significant byte
digIndx += 1; // The trailing zero not in the exponent
@ -3764,8 +4058,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
(UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
UCOL_BYTE_COMMON; // Tertiary weight.
i = endIndex - 1; // Reset the index into the buffer.
while(i >= 2)
{
while(i >= 2) {
primWeight = numTempBuf[i--] << 8;
if ( i >= 2)
primWeight |= numTempBuf[i--];
@ -3776,13 +4069,13 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
source->toReturn = source->CEpos -1;
return *(source->toReturn);
}
else {
} else {
CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
CE = *(CEOffset++);
break;
}
}
case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
{
static const uint32_t
@ -3809,18 +4102,37 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
V += VBase;
T += TBase;
if (source->offsetBuffer == NULL) {
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
source->offsetStore = source->offsetBuffer;
}
int32_t firstOffset = (int32_t)(source->pos - source->string);
*(source->offsetStore++) = firstOffset;
/*
return the first CE, but first put the rest into the expansion buffer
*/
if (!source->coll->image->jamoSpecial)
{
* return the first CE, but first put the rest into the expansion buffer
*/
if (!source->coll->image->jamoSpecial) {
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
if (T != TBase)
*(source->offsetStore++) = firstOffset + 1;
if (T != TBase) {
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
*(source->offsetStore++) = firstOffset + 1;
}
source->toReturn = source->CEpos - 1;
return *(source->toReturn);
source->offsetReturn = source->offsetStore - 1;
if (source->offsetReturn == source->offsetBuffer) {
source->offsetStore = source->offsetBuffer;
}
return *(source->toReturn);
} else {
// Since Hanguls pass the FCD check, it is
// guaranteed that we won't be in
@ -3862,18 +4174,46 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
return(UCOL_IGNORABLE);
}
}
case IMPLICIT_TAG: /* everything that is not defined otherwise */
#if 0
if (source->offsetBuffer == NULL) {
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
source->offsetStore = source->offsetBuffer;
}
// **** doesn't work if using iterator ****
if (source->flags & UCOL_ITER_INNORMBUF) {
source->offsetRepeatCount = 1;
} else {
int32_t firstOffset = (int32_t)(source->pos - source->string);
*(source->offsetStore++) = firstOffset;
*(source->offsetStore++) = firstOffset + 1;
source->offsetReturn = source->offsetStore - 1;
if (source->offsetReturn == source->offsetBuffer) {
source->offsetStore = source->offsetBuffer;
}
}
#endif
return getPrevImplicit(ch, source);
// TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
return getPrevImplicit(ch, source);
case SURROGATE_TAG: /* This is a surrogate pair */
/* essentially an engaged lead surrogate. */
/* if you have encountered it here, it means that a */
/* broken sequence was encountered and this is an error */
return 0;
case LEAD_SURROGATE_TAG: /* D800-DBFF*/
return 0; /* broken surrogate sequence */
case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
{
UChar32 cp = 0;
@ -3897,22 +4237,27 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
} else {
return 0; /* completely ignorable */
}
return getPrevImplicit(cp, source);
}
/* UCA is filled with these. Tailorings are NOT_FOUND */
/* not yet implemented */
case CHARSET_TAG: /* this tag always returns */
/* probably after 1.8 */
return UCOL_NOT_FOUND;
default: /* this tag always returns */
*status = U_INTERNAL_PROGRAM_ERROR;
CE=0;
break;
}
if (CE <= UCOL_NOT_FOUND) {
break;
}
}
return CE;
}

View file

@ -270,6 +270,12 @@ typedef struct collIterate {
uint32_t *toReturn; /* This is the CE from CEs buffer that should be returned */
uint32_t *CEpos; /* This is the position to which we have stored processed CEs */
int32_t *offsetReturn; /* This is the offset to return, if non-NULL */
int32_t *offsetStore; /* This is the pointer for storing offsets */
int32_t offsetRepeatCount; /* Repeat stored offset if non-zero */
int32_t offsetRepeatValue; /* offset value to repeat */
UChar *writableBuffer;
uint32_t writableBufSize;
UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
@ -280,6 +286,10 @@ typedef struct collIterate {
int32_t extendCEsSize; /* Holds the size of the dynamic CEs buffer */
uint32_t CEs[UCOL_EXPAND_CE_BUFFER_SIZE]; /* This is where we store CEs */
UChar stackWritableBuffer[UCOL_WRITABLE_BUFFER_SIZE]; /* A writable buffer. */
int32_t *offsetBuffer; /* A dynamic buffer to hold offsets */
int32_t offsetBufferSize; /* The size of the offset buffer */
UCharIterator *iterator;
/*int32_t iteratorIndex;*/
} collIterate;
@ -293,6 +303,7 @@ data similar to collIterate.
*/
struct collIterateState {
UChar *pos; /* This is position in the string. Can be to original or writable buf */
UChar *returnPos;
UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
UChar *bufferaddress; /* address of the normalization buffer */
uint32_t buffersize;
@ -305,6 +316,8 @@ struct collIterateState {
U_CAPI void U_EXPORT2
uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, int32_t sourceLen, collIterate *s);
struct UCollationPCE;
typedef struct UCollationPCE UCollationPCE;
struct UCollationElements
{
@ -320,9 +333,17 @@ struct UCollationElements
* Indicates if the data should be deleted.
*/
UBool isWritable;
/**
* Data for getNextProcessed, getPreviousProcessed.
*/
UCollationPCE *pce;
};
U_CAPI void U_EXPORT2
uprv_init_pce(const struct UCollationElements *elems);
#define UCOL_LEVELTERMINATOR 1
/* mask off anything but primary order */

View file

@ -20,6 +20,7 @@
#include "unicode/ucoleitr.h"
#include "unicode/ustring.h"
#include "unicode/sortkey.h"
#include "unicode/uobject.h"
#include "ucol_imp.h"
#include "cmemory.h"
@ -27,8 +28,269 @@ U_NAMESPACE_USE
#define BUFFER_LENGTH 100

/* Number of entries held in the storage embedded in RCEBuffer / PCEBuffer. */
#define DEFAULT_BUFFER_SIZE 16

/* Number of entries added each time RCEBuffer / PCEBuffer has to grow. */
#define BUFFER_GROW 8

/*
 * Number of elements in a true array (not a pointer).
 * The argument is fully parenthesized so that any array-valued expression
 * can be passed without operator-precedence surprises.
 */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Copy 'count' elements from src to dst; the element size is taken from src. */
#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])

/* Allocate an uninitialized array of 'count' elements of 'type' with uprv_malloc. */
#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))

/* Resize an array obtained from NEW_ARRAY to 'newSize' elements with uprv_realloc. */
#define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])

/* Release an array obtained from NEW_ARRAY / GROW_ARRAY with uprv_free. */
#define DELETE_ARRAY(array) uprv_free((void *) (array))

typedef struct collIterate collIterator;
/*
 * One entry of an RCEBuffer: a 32-bit collation element value together
 * with the low/high index pair that was handed to RCEBuffer::put().
 * NOTE(review): low/high look like the source-text offsets bracketing
 * the CE - confirm against the callers of put().
 */
struct RCEI
{
uint32_t ce;
int32_t low;
int32_t high;
};
/*
 * A growable last-in/first-out store of RCEI entries.
 * Entries live in the embedded defaultBuffer until put() needs more
 * room, after which they live in a heap array that the destructor frees.
 */
struct RCEBuffer
{
// Storage embedded in the object; the constructor points 'buffer' here.
RCEI defaultBuffer[DEFAULT_BUFFER_SIZE];
// Storage currently in use: defaultBuffer, or a heap array once put() has grown it.
RCEI *buffer;
// Number of entries currently held; also the slot the next put() writes to.
int32_t bufferIndex;
// Capacity of 'buffer', in entries.
int32_t bufferSize;
RCEBuffer();
~RCEBuffer();
// Non-zero when no entries are held.
UBool empty() const;
// Store one entry, growing the storage by BUFFER_GROW entries when it is full.
void put(uint32_t ce, int32_t ixLow, int32_t ixHigh);
// Remove and return the most recently stored entry, or NULL when there is none.
// The result points into the buffer's own storage, so a later put() can overwrite it.
const RCEI *get();
};
// Start out empty, using the storage embedded in the object itself.
RCEBuffer::RCEBuffer()
    : buffer(defaultBuffer),
      bufferIndex(0),
      bufferSize(DEFAULT_BUFFER_SIZE)
{
}
// Release the heap array, if put() ever had to replace the embedded storage.
RCEBuffer::~RCEBuffer()
{
    if (buffer == defaultBuffer) {
        // Still using the embedded storage: nothing was allocated.
        return;
    }

    DELETE_ARRAY(buffer);
}
// Report whether the buffer currently holds no entries.
UBool RCEBuffer::empty() const
{
    return !(bufferIndex > 0);
}
/*
 * Store one entry (ce, ixLow, ixHigh) on top of the buffer.
 *
 * When the current storage is full it is replaced by a new heap array that
 * is BUFFER_GROW entries larger; the existing entries are copied across and
 * the old heap array (if any) is freed.
 *
 * If that allocation fails the entry is dropped and the buffer is left
 * exactly as it was. The signature has no way to report the failure, but
 * this is preferable to copying into a NULL pointer.
 */
void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh)
{
    if (bufferIndex >= bufferSize) {
        RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);

        if (newBuffer == NULL) {
            // Out of memory: keep the old storage intact and give up on this entry.
            return;
        }

        ARRAY_COPY(newBuffer, buffer, bufferSize);

        // The embedded defaultBuffer was never heap-allocated, so it must not be freed.
        if (buffer != defaultBuffer) {
            DELETE_ARRAY(buffer);
        }

        buffer = newBuffer;
        bufferSize += BUFFER_GROW;
    }

    buffer[bufferIndex].ce = ce;
    buffer[bufferIndex].low = ixLow;
    buffer[bufferIndex].high = ixHigh;
    bufferIndex += 1;
}
// Pop the most recently stored entry; returns NULL when the buffer is empty.
// The returned pointer refers to the buffer's own storage.
const RCEI *RCEBuffer::get()
{
    if (bufferIndex <= 0) {
        return NULL;
    }

    bufferIndex -= 1;
    return &buffer[bufferIndex];
}
/*
 * One entry of a PCEBuffer: a 64-bit collation element value together
 * with the low/high index pair that was handed to PCEBuffer::put().
 * NOTE(review): the 64-bit value presumably is what processCE() produces,
 * and low/high the source-text offsets it covers - confirm against callers.
 */
struct PCEI
{
uint64_t ce;
int32_t low;
int32_t high;
};
/*
 * A growable last-in/first-out store of PCEI entries.
 * Entries live in the embedded defaultBuffer until put() needs more
 * room, after which they live in a heap array that the destructor frees.
 */
struct PCEBuffer
{
// Storage embedded in the object; the constructor points 'buffer' here.
PCEI defaultBuffer[DEFAULT_BUFFER_SIZE];
// Storage currently in use: defaultBuffer, or a heap array once put() has grown it.
PCEI *buffer;
// Number of entries currently held; also the slot the next put() writes to.
int32_t bufferIndex;
// Capacity of 'buffer', in entries.
int32_t bufferSize;
PCEBuffer();
~PCEBuffer();
// Discard all held entries; the storage itself is kept.
void reset();
// Non-zero when no entries are held.
UBool empty() const;
// Store one entry, growing the storage by BUFFER_GROW entries when it is full.
void put(uint64_t ce, int32_t ixLow, int32_t ixHigh);
// Remove and return the most recently stored entry, or NULL when there is none.
// The result points into the buffer's own storage, so a later put() can overwrite it.
const PCEI *get();
};
// Start out empty, using the storage embedded in the object itself.
PCEBuffer::PCEBuffer()
    : buffer(defaultBuffer),
      bufferIndex(0),
      bufferSize(DEFAULT_BUFFER_SIZE)
{
}
// Release the heap array, if put() ever had to replace the embedded storage.
PCEBuffer::~PCEBuffer()
{
    if (buffer == defaultBuffer) {
        // Still using the embedded storage: nothing was allocated.
        return;
    }

    DELETE_ARRAY(buffer);
}
// Forget every stored entry. Only the entry count is cleared: the storage
// (embedded or heap) and its capacity are left untouched for reuse.
void PCEBuffer::reset()
{
bufferIndex = 0;
}
// Report whether the buffer currently holds no entries.
UBool PCEBuffer::empty() const
{
    return !(bufferIndex > 0);
}
/*
 * Store one entry (ce, ixLow, ixHigh) on top of the buffer.
 *
 * When the current storage is full it is replaced by a new heap array that
 * is BUFFER_GROW entries larger; the existing entries are copied across and
 * the old heap array (if any) is freed.
 *
 * If that allocation fails the entry is dropped and the buffer is left
 * exactly as it was. The signature has no way to report the failure, but
 * this is preferable to copying into a NULL pointer.
 */
void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh)
{
    if (bufferIndex >= bufferSize) {
        PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);

        if (newBuffer == NULL) {
            // Out of memory: keep the old storage intact and give up on this entry.
            return;
        }

        ARRAY_COPY(newBuffer, buffer, bufferSize);

        // The embedded defaultBuffer was never heap-allocated, so it must not be freed.
        if (buffer != defaultBuffer) {
            DELETE_ARRAY(buffer);
        }

        buffer = newBuffer;
        bufferSize += BUFFER_GROW;
    }

    buffer[bufferIndex].ce = ce;
    buffer[bufferIndex].low = ixLow;
    buffer[bufferIndex].high = ixHigh;
    bufferIndex += 1;
}
// Pop the most recently stored entry; returns NULL when the buffer is empty.
// The returned pointer refers to the buffer's own storage.
const PCEI *PCEBuffer::get()
{
    if (bufferIndex <= 0) {
        return NULL;
    }

    bufferIndex -= 1;
    return &buffer[bufferIndex];
}
/*
 * Per-iterator state used by processCE() to turn raw 32-bit CEs into
 * 64-bit processed CEs.
 *
 * This inherits from UObject so that
 * it can be allocated by new and the
 * constructor for PCEBuffer is called.
 */
struct UCollationPCE : public UObject
{
// Buffer of processed CEs.
// NOTE(review): presumably filled and drained by the getNextProcessed /
// getPreviousProcessed code mentioned in ucol_imp.h - not visible here.
PCEBuffer pceBuffer;
// Strength of the collator, captured by init().
UCollationStrength strength;
// Set by init() when the collator's UCOL_ALTERNATE_HANDLING attribute is UCOL_SHIFTED.
UBool toShift;
// Updated by processCE(): TRUE after it has shifted a CE, FALSE after a CE it did not shift.
UBool isShifted;
// The collator's variableTopValue shifted left by 16 bits, for direct comparison with a CE.
uint32_t variableTop;
// Initializes the state from the collator attached to 'elems'.
UCollationPCE(UCollationElements *elems);
~UCollationPCE();
// (Re)loads strength, toShift and variableTop from 'coll' and clears isShifted.
void init(const UCollator *coll);
virtual UClassID getDynamicClassID() const;
static UClassID getStaticClassID();
};
// Provides the getStaticClassID / getDynamicClassID definitions declared above.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE)
// Build the processing state for 'elems' from the collator its iterator uses.
UCollationPCE::UCollationPCE(UCollationElements *elems)
{
    // All member set-up is shared with later re-initialization, so it lives in init().
    this->init((*elems).iteratordata_.coll);
}
void UCollationPCE::init(const UCollator *coll)
{
UErrorCode status = U_ZERO_ERROR;
strength = ucol_getStrength(coll);
toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
isShifted = FALSE;
variableTop = coll->variableTopValue << 16;
}
/*
 * Destructor. The pceBuffer member cleans up after itself in its own
 * destructor, so there is no explicit work here.
 */
UCollationPCE::~UCollationPCE()
{
// nothing to do
}
/*
 * Converts a raw 32-bit collation element into its 64-bit processed
 * form, laid out as primary << 48 | secondary << 32 | tertiary << 16 |
 * quaternary.
 *
 * Only the weights relevant at the iterator's strength are extracted:
 * primary always, secondary unless the strength is UCOL_PRIMARY, and
 * tertiary unless the strength is UCOL_PRIMARY or UCOL_SECONDARY.
 *
 * A CE is treated as shifted when either
 *   - toShift is set, ce is below variableTop, and its primary is
 *     non-zero, or
 *   - the previous CE was shifted and this one has a zero primary.
 * In the second case UCOL_IGNORABLE is returned and isShifted is left
 * unchanged. In the first case the primary, secondary and tertiary
 * weights are zeroed, the original primary becomes the quaternary
 * weight (at UCOL_QUATERNARY strength or above) and isShifted is set.
 * Otherwise the quaternary weight is 0xFFFF (at UCOL_QUATERNARY or
 * above) and isShifted is cleared.
 *
 * @param elems iterator whose pce member supplies and receives state
 * @param ce    the raw collation element
 * @return the processed collation element, or UCOL_IGNORABLE
 */
inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
{
    UCollationPCE *state = elems->pce;
    const UCollationStrength strength = state->strength;

    uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;

    // This is clean, but somewhat slow...
    // We could apply the mask to ce and then
    // just get all three orders...
    if (strength != UCOL_PRIMARY) {
        if (strength != UCOL_SECONDARY) {
            tertiary = ucol_tertiaryOrder(ce);
        }

        secondary = ucol_secondaryOrder(ce);
    }

    primary = ucol_primaryOrder(ce);

    // Continuation?
    const UBool belowVariableTop =
        state->toShift && state->variableTop > ce && primary != 0;
    const UBool trailsShiftedCE =
        state->isShifted && primary == 0;

    if (belowVariableTop || trailsShiftedCE) {
        if (primary == 0) {
            return UCOL_IGNORABLE;
        }

        if (strength >= UCOL_QUATERNARY) {
            quaternary = primary;
        }

        primary = secondary = tertiary = 0;
        state->isShifted = TRUE;
    } else {
        if (strength >= UCOL_QUATERNARY) {
            quaternary = 0xFFFF;
        }

        state->isShifted = FALSE;
    }

    return (primary << 48) | (secondary << 32) | (tertiary << 16) | quaternary;
}
/*
 * Re-initializes the processed-CE state of an iterator from its
 * collator. Does nothing when the iterator has no such state yet.
 */
U_CAPI void U_EXPORT2
uprv_init_pce(const UCollationElements *elems)
{
    if (elems->pce == NULL) {
        return;
    }

    elems->pce->init(elems->iteratordata_.coll);
}
/* public methods ---------------------------------------------------- */
U_CAPI UCollationElements* U_EXPORT2
@ -50,8 +312,9 @@ ucol_openElements(const UCollator *coll,
return NULL;
}
result->reset_ = TRUE;
result->isWritable = FALSE;
result->reset_ = TRUE;
result->isWritable = FALSE;
result->pce = NULL;
if (text == NULL) {
textLength = 0;
@ -64,22 +327,34 @@ ucol_openElements(const UCollator *coll,
/*
 * Closes a collation element iterator and releases everything it owns:
 * the writable buffer (when it is not the embedded stack buffer), the
 * extended CE buffer, the offset buffer, the text (only when the
 * iterator owns a writable copy), the processed-CE state, and finally
 * the iterator itself. Passing NULL is allowed and does nothing.
 *
 * Every resource is released exactly once; the iterator must not be
 * touched after uprv_free(elems).
 */
U_CAPI void U_EXPORT2
ucol_closeElements(UCollationElements *elems)
{
    if (elems != NULL) {
        collIterate *ci = &elems->iteratordata_;

        if (ci != NULL) {
            if (ci->writableBuffer != ci->stackWritableBuffer) {
                uprv_free(ci->writableBuffer);
            }

            if (ci->extendCEs) {
                uprv_free(ci->extendCEs);
            }

            if (ci->offsetBuffer) {
                uprv_free(ci->offsetBuffer);
            }
        }

        if (elems->isWritable && elems->iteratordata_.string != NULL)
        {
            uprv_free(elems->iteratordata_.string);
        }

        if (elems->pce != NULL) {
            delete elems->pce;
        }

        uprv_free(elems);
    }
}
U_CAPI void U_EXPORT2
@ -103,6 +378,9 @@ ucol_reset(UCollationElements *elems)
ci->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
}
ci->fcdPosition = NULL;
//ci->offsetReturn = ci->offsetStore = NULL;
ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
}
U_CAPI int32_t U_EXPORT2
@ -126,6 +404,52 @@ ucol_next(UCollationElements *elems,
return result;
}
/*
 * Returns the next non-ignorable processed collation element.
 *
 * Raw CEs are fetched with ucol_getNextCE() and converted by
 * processCE() until one yields something other than UCOL_IGNORABLE or
 * the text is exhausted. The processed-CE state is created on first
 * use; on later calls its buffer is reset.
 *
 * @param elems  the iterator
 * @param ixLow  if not NULL, receives the iterator offset taken just
 *               before the returned CE was fetched
 * @param ixHigh if not NULL, receives the iterator offset taken just
 *               after the returned CE was fetched
 * @param status in/out error code; set to U_MEMORY_ALLOCATION_ERROR if
 *               the processed-CE state cannot be allocated
 * @return the processed CE, or UCOL_PROCESSED_NULLORDER on error or at
 *         the end of the text. On the error paths ixLow and ixHigh are
 *         not written.
 */
U_CAPI int64_t U_EXPORT2
ucol_nextProcessed(UCollationElements *elems,
                   int32_t            *ixLow,
                   int32_t            *ixHigh,
                   UErrorCode         *status)
{
    const UCollator *coll = elems->iteratordata_.coll;
    int64_t result = UCOL_IGNORABLE;
    // offsets are produced by ucol_getOffset() and handed back as int32_t
    int32_t low = 0, high = 0;

    if (U_FAILURE(*status)) {
        return UCOL_PROCESSED_NULLORDER;
    }

    if (elems->pce == NULL) {
        elems->pce = new UCollationPCE(elems);

        // new on a UObject can yield NULL; do not dereference it below
        if (elems->pce == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return UCOL_PROCESSED_NULLORDER;
        }
    } else {
        elems->pce->pceBuffer.reset();
    }

    elems->reset_ = FALSE;

    do {
        low = ucol_getOffset(elems);
        uint32_t ce = (uint32_t) ucol_getNextCE(coll, &elems->iteratordata_, status);
        high = ucol_getOffset(elems);

        if (ce == UCOL_NO_MORE_CES) {
            result = UCOL_PROCESSED_NULLORDER;
            break;
        }

        result = processCE(elems, ce);
    } while (result == UCOL_IGNORABLE);

    if (ixLow != NULL) {
        *ixLow = low;
    }

    if (ixHigh != NULL) {
        *ixHigh = high;
    }

    return result;
}
U_CAPI int32_t U_EXPORT2
ucol_previous(UCollationElements *elems,
UErrorCode *status)
@ -161,12 +485,162 @@ ucol_previous(UCollationElements *elems,
}
}
/*
 * Returns the previous non-ignorable processed collation element.
 *
 * processCE() carries state from one CE to the next, so CEs have to be
 * processed in forward order. Raw CEs are therefore fetched backwards
 * with ucol_getPrevCE() into a temporary LIFO buffer up to and
 * including the first one with a non-zero primary, then popped (which
 * restores forward order) and processed. The non-ignorable results are
 * pushed onto the iterator's processed-CE buffer, from which one entry
 * is returned per call.
 *
 * If the iterator was just reset and is positioned at the start of the
 * text, it is first moved to the end of the text.
 *
 * @param elems  the iterator
 * @param ixLow  if not NULL, receives the iterator offset taken just
 *               after the returned CE was fetched, or -1 when there
 *               are no more CEs
 * @param ixHigh if not NULL, receives the iterator offset taken just
 *               before the returned CE was fetched, or -1 when there
 *               are no more CEs
 * @param status in/out error code; set to U_MEMORY_ALLOCATION_ERROR if
 *               the processed-CE state cannot be allocated
 * @return the processed CE, or UCOL_PROCESSED_NULLORDER on error or at
 *         the start of the text. On the error paths ixLow and ixHigh
 *         are not written.
 */
U_CAPI int64_t U_EXPORT2
ucol_previousProcessed(UCollationElements *elems,
                       int32_t            *ixLow,
                       int32_t            *ixHigh,
                       UErrorCode         *status)
{
    const UCollator *coll = elems->iteratordata_.coll;
    int64_t result = UCOL_IGNORABLE;
    // offsets are produced by ucol_getOffset() and handed back as int32_t
    int32_t low = 0, high = 0;

    if (U_FAILURE(*status)) {
        return UCOL_PROCESSED_NULLORDER;
    }

    if (elems->reset_ &&
        (elems->iteratordata_.pos == elems->iteratordata_.string)) {
        if (elems->iteratordata_.endp == NULL) {
            elems->iteratordata_.endp = elems->iteratordata_.string +
                                        u_strlen(elems->iteratordata_.string);
            elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
        }

        elems->iteratordata_.pos = elems->iteratordata_.endp;
        elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
    }

    // Entries left in an existing pceBuffer are deliberately kept: they
    // are results of an earlier call that have not been returned yet.
    if (elems->pce == NULL) {
        elems->pce = new UCollationPCE(elems);

        // new on a UObject can yield NULL; do not dereference it below
        if (elems->pce == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return UCOL_PROCESSED_NULLORDER;
        }
    }

    elems->reset_ = FALSE;

    UBool noMoreCEs = FALSE;

    while (!noMoreCEs && elems->pce->pceBuffer.empty()) {
        // buffer raw CEs up to non-ignorable primary
        RCEBuffer rceb;
        uint32_t ce;

        // rceb is constructed anew on every pass of the outer loop
        do {
            high = ucol_getOffset(elems);
            ce   = ucol_getPrevCE(coll, &elems->iteratordata_, status);
            low  = ucol_getOffset(elems);

            if (ce == UCOL_NO_MORE_CES) {
                // With nothing buffered there is nothing left to return;
                // otherwise still process what was collected so far.
                noMoreCEs = rceb.empty();
                break;
            }

            rceb.put(ce, low, high);
        } while ((ce & UCOL_PRIMARYMASK) == 0);

        // process the raw CEs
        while (! rceb.empty()) {
            const RCEI *rcei = rceb.get();

            result = processCE(elems, rcei->ce);

            if (result != UCOL_IGNORABLE) {
                elems->pce->pceBuffer.put(result, rcei->low, rcei->high);
            }
        }
    }

    if (elems->pce->pceBuffer.empty()) {
        // **** Is -1 the right value for ixLow, ixHigh? ****
        if (ixLow != NULL) {
            *ixLow = -1;
        }

        if (ixHigh != NULL) {
            *ixHigh = -1;
        }

        return UCOL_PROCESSED_NULLORDER;
    }

    const PCEI *pcei = elems->pce->pceBuffer.get();

    if (ixLow != NULL) {
        *ixLow = pcei->low;
    }

    if (ixHigh != NULL) {
        *ixHigh = pcei->high;
    }

    return pcei->ce;
}
/*
 * Looks up the maximum expansion length recorded for a collation order.
 *
 * The order and the table entries are compared after masking with the
 * weights significant at the collator's strength (primary only,
 * primary + secondary, or primary + secondary + tertiary). A binary
 * search over [endExpansionCE, lastEndExpansionCE] narrows the range to
 * two candidates; if either matches, the corresponding entry of
 * expansionCESize is returned. Otherwise the result is 2 when the low
 * 16 bits of the masked order are 0x00C0, and 1 in all other cases.
 *
 * @param elems the iterator whose collator tables are searched
 * @param order the collation order to look up
 * @return the maximum expansion length as described above
 */
U_CAPI int32_t U_EXPORT2
ucol_getMaxExpansion(const UCollationElements *elems,
                           int32_t            order)
{
    uint8_t result;

#if 0
    UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result);
#else
    const UCollator *coll = elems->iteratordata_.coll;

    // Build the mask of weights that matter at this strength.
    uint32_t strengthMask = UCOL_PRIMARYORDERMASK;

    if (coll->strength != UCOL_PRIMARY) {
        strengthMask |= UCOL_SECONDARYORDERMASK;

        if (coll->strength != UCOL_SECONDARY) {
            strengthMask |= UCOL_TERTIARYORDERMASK;
        }
    }

    const uint32_t maskedOrder = ((uint32_t) order) & strengthMask;

    const uint32_t *lower = coll->endExpansionCE;
    const uint32_t *upper = coll->lastEndExpansionCE;

    while (lower < upper - 1) {
        const uint32_t *probe = lower + ((upper - lower) >> 1);

        if (maskedOrder <= (*probe & strengthMask)) {
            upper = probe;
        } else {
            lower = probe;
        }
    }

    // FIXME: with a masked search, there might be more than one hit,
    // so we need to look forward and backward from the match to find all
    // of the hits...
    if ((*lower & strengthMask) == maskedOrder) {
        result = *(coll->expansionCESize + (lower - coll->endExpansionCE));
    } else if ((*upper & strengthMask) == maskedOrder) {
        result = *(coll->expansionCESize + (upper - coll->endExpansionCE));
    } else if ((maskedOrder & 0xFFFF) == 0x00C0) {
        result = 2;
    } else {
        result = 1;
    }
#endif

    return result;
}
@ -199,21 +673,30 @@ ucol_setText( UCollationElements *elems,
/*
 * Returns the iterator's current offset into the text.
 *
 * The checks are made in this order:
 *   1. if offsetRepeatCount is positive and offsetRepeatValue is
 *      non-zero, offsetRepeatValue is returned;
 *   2. if offsetReturn is set, the offset it points to is returned;
 *   3. otherwise the offset is derived from the iterator position, as
 *      described in the comment below.
 */
U_CAPI int32_t U_EXPORT2
ucol_getOffset(const UCollationElements *elems)
{
    const collIterate *ci = &(elems->iteratordata_);

    if (ci->offsetRepeatCount > 0 && ci->offsetRepeatValue != 0) {
        return ci->offsetRepeatValue;
    }

    if (ci->offsetReturn != NULL) {
        return *ci->offsetReturn;
    }

    // while processing characters in normalization buffer getOffset will
    // return the next non-normalized character.
    // should be inline with the old implementation since the old codes uses
    // nextDecomp in normalizer which also decomposes the string till the
    // first base character is found.
    if (ci->flags & UCOL_ITER_INNORMBUF) {
        if (ci->fcdPosition == NULL) {
            return 0;
        }
        return (int32_t)(ci->fcdPosition - ci->string);
    }
    else {
        return (int32_t)(ci->pos - ci->string);
    }
}
U_CAPI void U_EXPORT2
@ -239,6 +722,10 @@ ucol_setOffset(UCollationElements *elems,
}
ci->fcdPosition = NULL;
elems->reset_ = FALSE;
ci->offsetReturn = NULL;
ci->offsetStore = ci->offsetBuffer;
ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
}
U_CAPI int32_t U_EXPORT2

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2001-2004, International Business Machines
* Copyright (C) 2001-2008, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*
@ -27,6 +27,14 @@
*/
#define UCOL_NULLORDER ((int32_t)0xFFFFFFFF)
/**
* This indicates an error has occurred during processing or there are no more CEs
* to be returned.
*
* @internal
*/
#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
#include "unicode/ucol.h"
/**
@ -175,6 +183,45 @@ ucol_next(UCollationElements *elems, UErrorCode *status);
U_STABLE int32_t U_EXPORT2
ucol_previous(UCollationElements *elems, UErrorCode *status);
/**
* Get the processed ordering priority of the next collation element in the text.
* A single character may contain more than one collation element.
*
* @param elems The UCollationElements containing the text.
* @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
* @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
* @param status A pointer to an UErrorCode to receive any errors.
* @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
* if an error has occurred or if the end of string has been reached
*
* @internal
*/
U_INTERNAL int64_t U_EXPORT2
ucol_nextProcessed(UCollationElements *elems, int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
/**
* Get the processed ordering priority of the previous collation element in the text.
* A single character may contain more than one collation element.
* Note that internally a stack is used to store buffered collation elements.
* It is very rare that the stack will overflow, however if such a case is
* encountered, the problem can be solved by increasing the size
* UCOL_EXPAND_CE_BUFFER_SIZE in ucol_imp.h.
*
* @param elems The UCollationElements containing the text.
* @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
* @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE
* @param status A pointer to an UErrorCode to receive any errors. Notably
* a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
* buffer has been exhausted.
* @return The previous collation elements ordering, otherwise returns
* UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
* string has been reached.
*
* @internal
*/
U_INTERNAL int64_t U_EXPORT2
ucol_previousProcessed(UCollationElements *elems, int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
/**
* Get the maximum length of any expansion sequences that end with the
* specified comparison order.

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2001-2007 IBM and others. All rights reserved.
* Copyright (C) 2001-2008 IBM and others. All rights reserved.
**********************************************************************
* Date Name Description
* 06/28/2001 synwee Creation.
@ -641,6 +641,126 @@ U_STABLE int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
*/
U_STABLE void U_EXPORT2 usearch_reset(UStringSearch *strsrch);
/**
* Simple forward search for the pattern, starting at a specified index,
* and using a default set of search options.
*
* This is an experimental function, and is not an official part of the
* ICU API.
*
* The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZATION_MODE, are honored.
*
* The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and
* any Break Iterator are ignored.
*
* Matches obey the following constraints:
*
* Characters at the start or end positions of a match that are ignorable
* for collation are not included as part of the match, unless they
* are part of a combining sequence, as described below.
*
* A match will not include a partial combining sequence. Combining
* character sequences are considered to be inseparable units,
* and either match the pattern completely, or are considered to not match
* at all. Thus, for example, an A followed by a combining accent mark will
* not be found when searching for a plain (unaccented) A (unless
* the collation strength has been set to ignore all accents).
*
* When beginning a search, the initial starting position, startIdx,
* is assumed to be an acceptable match boundary with respect to
* combining characters. A combining sequence that spans across the
* starting point will not suppress a match beginning at startIdx.
*
* Characters that expand to multiple collation elements
* (German sharp-S becoming 'ss', or the composed forms of accented
* characters, for example) also must match completely.
* Searching for a single 's' in a string containing only a sharp-s will
* find no match.
*
*
* @param strsrch the UStringSearch struct, which references both
* the text to be searched and the pattern being sought.
* @param startIdx The index into the text to begin the search.
* @param matchStart An out parameter, the starting index of the matched text.
* This parameter may be NULL.
* A value of -1 will be returned if no match was found.
* @param matchLimit Out parameter, the index of the first position following the matched text.
* The matchLimit will be at a suitable position for beginning a subsequent search
* in the input text.
* This parameter may be NULL.
* A value of -1 will be returned if no match was found.
*
* @param status Report any errors. Note that no match found is not an error.
* @return TRUE if a match was found, FALSE otherwise.
*
* @internal
*/
U_INTERNAL UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
int32_t startIdx,
int32_t *matchStart,
int32_t *matchLimit,
UErrorCode *status);
/**
* Simple backwards search for the pattern, starting at a specified index,
* and using a default set of search options.
*
* This is an experimental function, and is not an official part of the
* ICU API.
*
* The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZATION_MODE, are honored.
*
* The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and
* any Break Iterator are ignored.
*
* Matches obey the following constraints:
*
* Characters at the start or end positions of a match that are ignorable
* for collation are not included as part of the match, unless they
* are part of a combining sequence, as described below.
*
* A match will not include a partial combining sequence. Combining
* character sequences are considered to be inseparable units,
* and either match the pattern completely, or are considered to not match
* at all. Thus, for example, an A followed by a combining accent mark will
* not be found when searching for a plain (unaccented) A (unless
* the collation strength has been set to ignore all accents).
*
* When beginning a search, the initial starting position, startIdx,
* is assumed to be an acceptable match boundary with respect to
* combining characters. A combining sequence that spans across the
* starting point will not suppress a match beginning at startIdx.
*
* Characters that expand to multiple collation elements
* (German sharp-S becoming 'ss', or the composed forms of accented
* characters, for example) also must match completely.
* Searching for a single 's' in a string containing only a sharp-s will
* find no match.
*
*
* @param strsrch the UStringSearch struct, which references both
* the text to be searched and the pattern being sought.
* @param startIdx The index into the text to begin the search.
* @param matchStart An out parameter, the starting index of the matched text.
* This parameter may be NULL.
* A value of -1 will be returned if no match was found.
* @param matchLimit Out parameter, the index of the first position following the matched text.
* The matchLimit will be at a suitable position for beginning a subsequent search
* in the input text.
* This parameter may be NULL.
* A value of -1 will be returned if no match was found.
*
* @param status Report any errors. Note that no match found is not an error.
* @return TRUE if a match was found, FALSE otherwise.
*
* @internal
*/
U_INTERNAL UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
int32_t startIdx,
int32_t *matchStart,
int32_t *matchLimit,
UErrorCode *status);
#endif /* #if !UCONFIG_NO_COLLATION */
#endif

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2001-2007 IBM and others. All rights reserved.
* Copyright (C) 2001-2008 IBM and others. All rights reserved.
**********************************************************************
* Date Name Description
* 08/13/2001 synwee Creation.
@ -31,8 +31,8 @@ struct USearch {
// value USEARCH_DONE is the default value
// if we are not at the start of the text or the end of the text,
// depending on the iteration direction and matchedIndex is USEARCH_DONE
// it means that we can find any more matches in that particular direction
int32_t matchedIndex;
// it means that we can't find any more matches in that particular direction
int32_t matchedIndex;
int32_t matchedLength;
UBool isForwardSearching;
UBool reset;
@ -45,6 +45,9 @@ struct UPattern {
int32_t CELength;
int32_t *CE;
int32_t CEBuffer[INITIAL_ARRAY_SIZE_];
int32_t PCELength;
int64_t *PCE;
int64_t PCEBuffer[INITIAL_ARRAY_SIZE_];
UBool hasPrefixAccents;
UBool hasSuffixAccents;
int16_t defaultShiftSize;

View file

@ -54,6 +54,9 @@
#include "cmemory.h"
#include "ucol_imp.h"
/* set to 1 to test offsets in backAndForth() */
#define TEST_OFFSETS 0
/* perform test with strength PRIMARY */
static void TestPrimary(void);
@ -436,14 +439,15 @@ void doTest(UCollator* myCollation, const UChar source[], const UChar target[],
* Return an integer array containing all of the collation orders
* returned by calls to next on the specified iterator
*/
int32_t* getOrders(UCollationElements *iter, int32_t *orderLength)
OrderAndOffset* getOrders(UCollationElements *iter, int32_t *orderLength)
{
UErrorCode status;
int32_t order;
int32_t maxSize = 100;
int32_t size = 0;
int32_t *temp;
int32_t *orders =(int32_t*)malloc(sizeof(int32_t) * maxSize);
int32_t offset = ucol_getOffset(iter);
OrderAndOffset *temp;
OrderAndOffset *orders =(OrderAndOffset *)malloc(sizeof(OrderAndOffset) * maxSize);
status= U_ZERO_ERROR;
@ -452,22 +456,26 @@ int32_t* getOrders(UCollationElements *iter, int32_t *orderLength)
if (size == maxSize)
{
maxSize *= 2;
temp = (int32_t*)malloc(sizeof(int32_t) * maxSize);
temp = (OrderAndOffset *)malloc(sizeof(OrderAndOffset) * maxSize);
memcpy(temp, orders, size * sizeof(int32_t));
memcpy(temp, orders, size * sizeof(OrderAndOffset));
free(orders);
orders = temp;
}
orders[size++] = order;
orders[size].order = order;
orders[size].offset = offset;
offset = ucol_getOffset(iter);
size += 1;
}
if (maxSize > size && size > 0)
{
temp = (int32_t*)malloc(sizeof(int32_t) * size);
temp = (OrderAndOffset *)malloc(sizeof(OrderAndOffset) * size);
memcpy(temp, orders, size * sizeof(int32_t));
memcpy(temp, orders, size * sizeof(OrderAndOffset));
free(orders);
orders = temp;
@ -486,8 +494,7 @@ backAndForth(UCollationElements *iter)
int32_t index, o;
UErrorCode status = U_ZERO_ERROR;
int32_t orderLength = 0;
int32_t *orders;
orders= getOrders(iter, &orderLength);
OrderAndOffset *orders = getOrders(iter, &orderLength);
/* Now go through it backwards and make sure we get the same values */
@ -495,49 +502,60 @@ backAndForth(UCollationElements *iter)
ucol_reset(iter);
/* synwee : changed */
while ((o = ucol_previous(iter, &status)) != UCOL_NULLORDER)
{
if (o != orders[-- index])
{
while ((o = ucol_previous(iter, &status)) != UCOL_NULLORDER) {
int32_t offset = ucol_getOffset(iter);
index -= 1;
if (o != orders[index].order) {
if (o == 0)
index ++;
else
{
while (index > 0 && orders[-- index] == 0)
{
else {
while (index > 0 && orders[-- index].order == 0) {
/* nothing... */
}
if (o != orders[index])
{
log_err("Mismatch at index : 0x%x\n", index);
return;
}
if (o != orders[index].order) {
log_err("Mismatched order at index %d: 0x%0:8X vs. 0x%0:8X\n", index,
orders[index].order, o);
goto bail;
}
}
}
#if TEST_OFFSETS
if (offset != orders[index].offset) {
log_err("Mismatched offset at index %d: %d vs. %d\n", index,
orders[index].offset, offset);
goto bail;
}
#endif
}
while (index != 0 && orders[index - 1] == 0) {
index --;
while (index != 0 && orders[index - 1].order == 0) {
index -= 1;
}
if (index != 0)
{
if (index != 0) {
log_err("Didn't get back to beginning - index is %d\n", index);
ucol_reset(iter);
log_err("\nnext: ");
if ((o = ucol_next(iter, &status)) != UCOL_NULLORDER)
{
if ((o = ucol_next(iter, &status)) != UCOL_NULLORDER) {
log_err("Error at %x\n", o);
}
log_err("\nprev: ");
if ((o = ucol_previous(iter, &status)) != UCOL_NULLORDER)
{
if ((o = ucol_previous(iter, &status)) != UCOL_NULLORDER) {
log_err("Error at %x\n", o);
}
log_verbose("\n");
}
bail:
free(orders);
}

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2006, International Business Machines Corporation and
* Copyright (c) 1997-2008, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@ -29,13 +29,20 @@
#define RULE_BUFFER_LEN 8192
/*
 * A collation order paired with an iterator offset, as collected by
 * getOrders() for each element it reads from a UCollationElements
 * iterator.
 */
struct OrderAndOffset
{
/* the collation order obtained from the iterator */
int32_t order;
/* the ucol_getOffset() value recorded alongside that order */
int32_t offset;
};
typedef struct OrderAndOffset OrderAndOffset;
/* tests comparison of custom collation with different strengths */
void doTest(UCollator*, const UChar* source, const UChar* target, UCollationResult result);
/* verify that iterating forward and backwards over the string yields same CEs */
void backAndForth(UCollationElements *iter);
/* gets an array of CEs for a string in UCollationElements iterator. */
int32_t* getOrders(UCollationElements *iter, int32_t *orderLength);
OrderAndOffset* getOrders(UCollationElements *iter, int32_t *orderLength);
void genericOrderingTestWithResult(UCollator *coll, const char * const s[], uint32_t size, UCollationResult result);
void genericOrderingTest(UCollator *coll, const char * const s[], uint32_t size);

View file

@ -562,7 +562,7 @@ static void TestOffset()
UCollator *en_us=NULL;
UCollationElements *iter, *pristine;
int32_t offset;
int32_t *orders;
OrderAndOffset *orders;
int32_t orderLength=0;
int count = 0;
UChar test1[50];
@ -649,7 +649,7 @@ static void TestOffset()
switch (count) {
case 0:
if (ucol_getOffset(iter) != 1) {
log_err("ERROR: Offset of iteration should be 0\n");
log_err("ERROR: Offset of iteration should be 1\n");
}
break;
case 3:
@ -671,10 +671,16 @@ static void TestOffset()
U_SUCCESS(status)) {
switch (count) {
case 0:
case 1:
if (ucol_getOffset(iter) != 3) {
log_err("ERROR: Offset of iteration should be 3\n");
}
break;
case 2:
if (ucol_getOffset(iter) != 1) {
log_err("ERROR: Offset of iteration should be 1\n");
}
break;
default:
if (ucol_getOffset(iter) != 0) {
log_err("ERROR: Offset of iteration should be 0\n");
@ -937,7 +943,7 @@ static void TestSmallBuffer()
UCollationElements *testiter,
*iter;
int32_t count = 0;
int32_t *testorders,
OrderAndOffset *testorders,
*orders;
UChar teststr[500];
@ -977,8 +983,8 @@ static void TestSmallBuffer()
while (count != 0) {
/* UCA collation element for 0x0F76 */
if ((count > 250 && testorders[-- count] != orders[1]) ||
(count <= 250 && testorders[-- count] != orders[0])) {
if ((count > 250 && testorders[-- count].order != orders[1].order) ||
(count <= 250 && testorders[-- count].order != orders[0].order)) {
log_err("Error decomposition does not give the right collation element at %d count\n", count);
break;
}

View file

@ -1,5 +1,5 @@
/********************************************************************
* Copyright (c) 2001-2007 International Business Machines
* Copyright (c) 2001-2008 International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************
* File USRCHDAT.H
@ -19,6 +19,9 @@ Note: This file is included by other C and C++ files. This file should not be di
#if !UCONFIG_NO_COLLATION
/* Set to 1 if matches must be on grapheme boundaries */
#define GRAPHEME_BOUNDARIES 1
U_CDECL_BEGIN
struct SearchData {
const char *text;
@ -51,9 +54,15 @@ static const SearchData BASIC[] = {
{"Scott Ganyo", " ", NULL, UCOL_TERTIARY, NULL, {5, -1}, {1}},
{"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1},
{2}},
#if GRAPHEME_BOUNDARIES
{"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
{"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
#endif
{"\\u00c9", "e", NULL, UCOL_PRIMARY, NULL, {0, -1}, {1}},
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
@ -74,6 +83,10 @@ static const SearchData BREAKITERATOREXACT[] = {
{"testing that string ab\\u00e9cd does not match e", "e", NULL,
UCOL_TERTIARY, "characterbreaker", {1, 28, 41, -1}, {1, 1, 1}},
{"\\u00c9", "e", "fr", UCOL_PRIMARY, "characterbreaker", {0, -1}, {1}},
#if 0
/* Problem reported by Dave Bertoni, same as ticket 4279? */
{"\\u0043\\u004F\\u0302\\u0054\\u00C9", "\\u004F", NULL, UCOL_TERTIARY, "characterbreaker", {1, -1}, {2}},
#endif
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
@ -92,6 +105,12 @@ static const SearchData STRENGTH[] = {
{7, 7, 7, 7}},
{"\\u00c0 should match but not A", "A\\u0300", "en", UCOL_IDENTICAL,
NULL, {0, -1}, {1, 0}},
#if 0
/* Ticket 5382 */
{"12\\u0171", "\\u0170", NULL, UCOL_SECONDARY, NULL, {2, -1}, {2}},
#endif
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
@ -120,14 +139,19 @@ static const SearchData VARIABLE[] = {
};
static const SearchData NORMEXACT[] = {
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1},
{2}},
{"a\\u0300\\u0325", "a\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}},
#if GRAPHEME_BOUNDARIES
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
#endif
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
static const SearchData NONNORMEXACT[] = {
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1},
{0}},
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
@ -167,6 +191,15 @@ static const SearchData TEXT[] = {
};
static const SearchData COMPOSITEBOUNDARIES[] = {
#if GRAPHEME_BOUNDARIES
{"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
{"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
{"B\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u00C0B", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
#else
{"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
{"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}},
{"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}},
@ -175,16 +208,25 @@ static const SearchData COMPOSITEBOUNDARIES[] = {
{"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
{"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, 1, -1},
{1, 1}},
#endif
{"\\u00C0\\u0300", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
/* A + 030A + 0301 */
{"\\u01FA", "\\u01FA", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
{"\\u01FA", "A\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
{"\\u01FA", "\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u01FA", "\\u030AA", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u01FA", "\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u01FA", "A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u01FA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#if GRAPHEME_BOUNDARIES
{"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
#endif
{"A\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u01FAA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u0F73", "\\u0F73", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
@ -193,6 +235,15 @@ static const SearchData COMPOSITEBOUNDARIES[] = {
{"\\u0F73", "\\u0F71\\u0F72", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
{"A\\u0F73", "A\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u0F73A", "\\u0F72A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
/* Ticket 5024 */
{"a\\u00e1", "a\\u00e1", NULL, UCOL_SECONDARY, NULL, {0, -1}, {2}},
/* Ticket 5420 */
{"fu\\u00dfball", "fu\\u00df", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}},
{"fu\\u00dfball", "fuss", NULL, UCOL_PRIMARY, NULL, {0, -1}, {3}},
{"fu\\u00dfball", "uss", NULL, UCOL_PRIMARY, NULL, {1, -1}, {2}},
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
@ -229,12 +280,24 @@ static const char *CONTRACTIONRULE =
static const SearchData CONTRACTION[] = {
/* common discontiguous */
{"A\\u0300\\u0315", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {1, -1},
{2}},
#if GRAPHEME_BOUNDARIES
{"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
#endif
/* contraction prefix */
{"AB\\u0315C", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#if GRAPHEME_BOUNDARIES
{"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
{"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {2, -1}, {1}},
#endif
/* discontiguous problem here for backwards iteration.
accents not found because discontiguous stores all information */
{"X\\u0300\\u0319\\u0315", "\\u0319", NULL, UCOL_TERTIARY, NULL, {-1},
@ -249,15 +312,37 @@ static const SearchData CONTRACTION[] = {
/* blocked discontiguous */
{"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL,
{-1}, {0}},
#if GRAPHEME_BOUNDARIES
/*
* "ab" generates a contraction that's an expansion. The "z" matches the
* first CE of the expansion but the match fails because it ends in the
* middle of an expansion...
*/
{"ab", "z", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"ab", "z", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
#endif
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
static const char *IGNORABLERULE = "&a = \\u0300";
/*
 * Search cases for the ignorable-accent test data. Each entry's fields
 * appear to be: text, pattern, collator, strength, break iterator, expected
 * match offsets (terminated by -1) and expected match lengths -- confirm
 * against the SearchData declaration. The expected results depend on whether
 * GRAPHEME_BOUNDARIES is enabled. The final all-NULL entry ends the table.
 */
static const SearchData IGNORABLE[] = {
#if GRAPHEME_BOUNDARIES
/*
* This isn't much of a test when matches have to be on
* grapheme boundaries. The match at 0 only works because
* it's at the start of the text.
*/
{"\\u0300\\u0315 \\u0300\\u0315 ", "\\u0300", NULL, UCOL_PRIMARY, NULL,
{0, -1}, {2}},
#else
{"\\u0300\\u0315 \\u0300\\u0315 ", "\\u0300", NULL, UCOL_PRIMARY, NULL,
{0, 3, -1}, {2, 2}},
#endif
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
@ -273,6 +358,20 @@ static const SearchData BASICCANONICAL[] = {
{6, 6}},
{"Scott Ganyo", "c", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
{"Scott Ganyo", " ", NULL, UCOL_TERTIARY, NULL, {5, -1}, {1}},
#if GRAPHEME_BOUNDARIES
{"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"a\\u0300\\u0325b", "\\u0300b", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u0325\\u0300A\\u0325\\u0300", "\\u0300A\\u0300", NULL, UCOL_TERTIARY,
NULL, {-1}, {0}},
{"\\u0325\\u0300A\\u0325\\u0300", "\\u0325A\\u0325", NULL, UCOL_TERTIARY,
NULL, {-1}, {0}},
{"a\\u0300\\u0325b\\u0300\\u0325c \\u0325b\\u0300 \\u0300b\\u0325",
"\\u0300b\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
{"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
{"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1},
@ -285,12 +384,28 @@ static const SearchData BASICCANONICAL[] = {
NULL, {0, -1}, {5}},
{"a\\u0300\\u0325b\\u0300\\u0325c \\u0325b\\u0300 \\u0300b\\u0325",
"\\u0300b\\u0325", NULL, UCOL_TERTIARY, NULL, {1, 12, -1}, {5, 3}},
#endif
{"\\u00c4\\u0323", "A\\u0323\\u0308", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
{"\\u0308\\u0323", "\\u0323\\u0308", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
static const SearchData NORMCANONICAL[] = {
#if GRAPHEME_BOUNDARIES
/*
* These tests don't really mean anything. With matches restricted to grapheme
* boundaries, isCanonicalMatch doesn't mean anything unless normalization is
* also turned on...
*/
{"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"a\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
{"\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1},
@ -299,6 +414,8 @@ static const SearchData NORMCANONICAL[] = {
{2}},
{"a\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
{"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
#endif
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
@ -397,6 +514,20 @@ static const SearchData TEXTCANONICAL[] = {
};
static const SearchData COMPOSITEBOUNDARIESCANONICAL[] = {
#if GRAPHEME_BOUNDARIES
{"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
{"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
{"B\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u00C0B", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
/* first one matches only because it's at the start of the text */
{"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
/* \\u0300 blocked by \\u0300 */
{"\\u00C0\\u0300", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
{"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}},
{"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}},
@ -407,26 +538,66 @@ static const SearchData COMPOSITEBOUNDARIESCANONICAL[] = {
{1, 1}},
/* \\u0300 blocked by \\u0300 */
{"\\u00C0\\u0300", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
#endif
/* A + 030A + 0301 */
{"\\u01FA", "\\u01FA", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
{"\\u01FA", "A\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
#if GRAPHEME_BOUNDARIES
{"\\u01FA", "\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"\\u01FA", "\\u030A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
{"\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
#endif
{"\\u01FA", "\\u030AA", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#if GRAPHEME_BOUNDARIES
{"\\u01FA", "\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"\\u01FA", "\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
#endif
/* blocked accent */
{"\\u01FA", "A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u01FA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#if GRAPHEME_BOUNDARIES
{"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"A\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u01FAA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
{"A\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
{"\\u01FAA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
#endif
{"\\u0F73", "\\u0F73", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
#if GRAPHEME_BOUNDARIES
{"\\u0F73", "\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u0F73", "\\u0F72", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"\\u0F73", "\\u0F71", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
{"\\u0F73", "\\u0F72", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
#endif
{"\\u0F73", "\\u0F71\\u0F72", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
#if GRAPHEME_BOUNDARIES
{"A\\u0F73", "A\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u0F73A", "\\u0F72A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"\\u01FA A\\u0301\\u030A A\\u030A\\u0301 A\\u030A \\u01FA", "A\\u030A",
NULL, UCOL_TERTIARY, NULL, {10, -1}, {2}},
#else
{"A\\u0F73", "A\\u0F71", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
{"\\u0F73A", "\\u0F72A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
{"\\u01FA A\\u0301\\u030A A\\u030A\\u0301 A\\u030A \\u01FA", "A\\u030A",
NULL, UCOL_TERTIARY, NULL, {0, 6, 10, 13, -1}, {1, 3, 2, 1}},
#endif
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
@ -459,33 +630,59 @@ static const SearchData SUPPLEMENTARYCANONICAL[] = {
/*
 * Canonical-match search cases for contraction handling. Each entry's fields
 * appear to be: text, pattern, collator, strength, break iterator, expected
 * match offsets (terminated by -1) and expected match lengths -- confirm
 * against the SearchData declaration. Entries that expect no match use
 * offsets {-1} with lengths {0}. Expected results depend on whether
 * GRAPHEME_BOUNDARIES is enabled. The final all-NULL entry ends the table.
 */
static const SearchData CONTRACTIONCANONICAL[] = {
/* common discontiguous */
#if GRAPHEME_BOUNDARIES
{"A\\u0300\\u0315", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"A\\u0300\\u0315", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
{"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {1, -1},
{2}},
{"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
#endif
/* contraction prefix */
{"AB\\u0315C", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#if GRAPHEME_BOUNDARIES
{"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
{"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {2, -1}, {1}},
#endif
/* discontiguous problem here for backwards iteration.
forwards gives 0, 4 but backwards give 1, 3 */
/* {"X\\u0300\\u0319\\u0315", "\\u0319", NULL, UCOL_TERTIARY, NULL, {0, -1},
{4}}, */
/* ends not with a contraction character */
{"X\\u0315\\u0300D", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1},
{0}},
{"X\\u0315\\u0300D", "X\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL,
{0, -1}, {3}},
{"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL, UCOL_TERTIARY, NULL,
{0, -1}, {4}},
{"X\\u0315\\u0300D", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
{"X\\u0315\\u0300D", "X\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}},
#if GRAPHEME_BOUNDARIES
{"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
/* blocked discontiguous */
{"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL,
{1, -1}, {4}},
{"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
/*
* "ab" generates a contraction that's an expansion. The "z" matches the
* first CE of the expansion but the match fails because it ends in the
* middle of an expansion...
*/
/* No match is expected here, so the length list is {0}, as in the same case in CONTRACTION. */
{"ab", "z", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
#else
{"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {4}},
/* blocked discontiguous */
{"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL, {1, -1}, {4}},
{"ab", "z", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
#endif
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
};
static const SearchData DIACTRICMATCH[] = {
static const SearchData DIACRITICMATCH[] = {
{"\\u03BA\\u03B1\\u03B9\\u0300\\u0020\\u03BA\\u03B1\\u1F76", "\\u03BA\\u03B1\\u03B9", NULL, UCOL_PRIMARY, NULL, {0, 5,-1}, {4, 3}},
{"\\u0061\\u0061\\u00E1", "\\u0061\\u00E1", NULL, UCOL_SECONDARY, NULL, {1, -1}, {2}},
{"\\u0020\\u00C2\\u0303\\u0020\\u0041\\u0061\\u1EAA\\u0041\\u0302\\u0303\\u00C2\\u0303\\u1EAB\\u0061\\u0302\\u0303\\u00E2\\u0303\\uD806\\uDC01\\u0300\\u0020",

View file

@ -453,28 +453,35 @@ static UBool assertCanonicalEqual(const SearchData search)
UCollator *collator = getCollator(search.collator);
UBreakIterator *breaker = getBreakIterator(search.breaker);
UStringSearch *strsrch;
UBool result = TRUE;
CHECK_BREAK_BOOL(search.breaker);
u_unescape(search.text, text, 128);
u_unescape(search.pattern, pattern, 32);
ucol_setStrength(collator, search.strength);
ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
strsrch = usearch_openFromCollator(pattern, -1, text, -1, collator,
breaker, &status);
usearch_setAttribute(strsrch, USEARCH_CANONICAL_MATCH, USEARCH_ON,
&status);
if (U_FAILURE(status)) {
log_err("Error opening string search %s\n", u_errorName(status));
return FALSE;
result = FALSE;
goto bail;
}
if (!assertEqualWithUStringSearch(strsrch, search)) {
ucol_setStrength(collator, UCOL_TERTIARY);
usearch_close(strsrch);
return FALSE;
result = FALSE;
goto bail;
}
bail:
ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &status);
ucol_setStrength(collator, UCOL_TERTIARY);
usearch_close(strsrch);
return TRUE;
return result;
}
static UBool assertEqualWithAttribute(const SearchData search,
@ -1537,7 +1544,7 @@ static void TestIgnorable(void)
ucol_close(collator);
}
static void TestDiactricMatch(void)
static void TestDiacriticMatch(void)
{
UChar pattern[128];
UChar text[128];
@ -1556,7 +1563,7 @@ static void TestDiactricMatch(void)
return;
}
search = DIACTRICMATCH[count];
search = DIACRITICMATCH[count];
while (search.text != NULL) {
if (search.collator != NULL) {
coll = ucol_openFromShortString(search.collator, FALSE, NULL, &status);
@ -1584,7 +1591,7 @@ static void TestDiactricMatch(void)
}
ucol_close(coll);
search = DIACTRICMATCH[++count];
search = DIACRITICMATCH[++count];
}
usearch_close(strsrch);
}
@ -2024,6 +2031,7 @@ static void TestGetSetOffsetCanonical(void)
UChar text[128];
UErrorCode status = U_ZERO_ERROR;
UStringSearch *strsrch;
UCollator *collator;
memset(pattern, 0, 32*sizeof(UChar));
memset(text, 0, 128*sizeof(UChar));
@ -2031,8 +2039,13 @@ static void TestGetSetOffsetCanonical(void)
open();
strsrch = usearch_openFromCollator(pattern, 16, text, 32, EN_US_, NULL,
&status);
collator = usearch_getCollator(strsrch);
ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
usearch_setAttribute(strsrch, USEARCH_CANONICAL_MATCH, USEARCH_ON,
&status);
/* testing out of bounds error */
usearch_setOffset(strsrch, -1, &status);
if (U_SUCCESS(status)) {
@ -2071,7 +2084,7 @@ static void TestGetSetOffsetCanonical(void)
log_err("Error match found at %d %d\n",
usearch_getMatchedStart(strsrch),
usearch_getMatchedLength(strsrch));
return;
goto bail;
}
matchindex = search.offset[count + 1] == -1 ? -1 :
search.offset[count + 2];
@ -2080,7 +2093,7 @@ static void TestGetSetOffsetCanonical(void)
&status);
if (usearch_getOffset(strsrch) != search.offset[count + 1] + 1) {
log_err("Error setting offset\n");
return;
goto bail;
}
}
@ -2095,9 +2108,12 @@ static void TestGetSetOffsetCanonical(void)
log_err("Error match found at %d %d\n",
usearch_getMatchedStart(strsrch),
usearch_getMatchedLength(strsrch));
return;
goto bail;
}
}
bail:
ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &status);
usearch_close(strsrch);
close();
}
@ -2242,7 +2258,7 @@ void addSearchTest(TestNode** root)
"tscoll/usrchtst/TestContractionCanonical");
addTest(root, &TestEnd, "tscoll/usrchtst/TestEnd");
addTest(root, &TestNumeric, "tscoll/usrchtst/TestNumeric");
addTest(root, &TestDiactricMatch, "tscoll/usrchtst/TestDiactricMatch");
addTest(root, &TestDiacriticMatch, "tscoll/usrchtst/TestDiacriticMatch");
}
#endif /* #if !UCONFIG_NO_COLLATION */

View file

@ -56,8 +56,7 @@ jamotest.o srchtest.o reptest.o regextst.o \
itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \
uobjtest.o idnaref.o idnaconf.o nptrans.o punyref.o testidn.o testidna.o incaltst.o \
calcasts.o v32test.o uvectest.o textfile.o tokiter.o utxttest.o \
windttst.o winnmtst.o winutil.o csdetest.o tzrulets.o tzoffloc.o tzfmttst.o
windttst.o winnmtst.o winutil.o csdetest.o tzrulets.o tzoffloc.o tzfmttst.o ssearch.o
DEPS = $(OBJECTS:.o=.d)

View file

@ -365,6 +365,14 @@
RelativePath=".\srchtest.h"
>
</File>
<File
RelativePath=".\ssearch.cpp"
>
</File>
<File
RelativePath=".\ssearch.h"
>
</File>
<File
RelativePath="svccoll.cpp"
>

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2003, International Business Machines Corporation and
* Copyright (c) 1997-2008, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -188,7 +188,7 @@ void CollationIteratorTest::TestOffset(/* char* par */)
// Run all the way through the iterator, then get the offset
int32_t orderLength = 0;
int32_t *orders = getOrders(*iter, orderLength);
Order *orders = getOrders(*iter, orderLength);
int32_t offset = iter->getOffset();

View file

@ -1,6 +1,6 @@
/*
*****************************************************************************
* Copyright (C) 2001-2006, International Business Machines orporation
* Copyright (C) 2001-2008, International Business Machines Corporation
* and others. All Rights Reserved.
****************************************************************************/
@ -154,7 +154,7 @@ void StringSearchTest::runIndexedTest(int32_t index, UBool exec,
CASE(33, TestUClassID)
CASE(34, TestSubclass)
CASE(35, TestCoverage)
CASE(36, TestDiactricMatch)
CASE(36, TestDiacriticMatch)
default: name = ""; break;
}
}
@ -256,8 +256,8 @@ UBool StringSearchTest::assertEqualWithStringSearch(StringSearch *strsrch,
char *str = toCharString(strsrch->getText());
errln("Text: %s", str);
str = toCharString(strsrch->getPattern());
errln("Pattern: %s", str);
errln("Error following match found at %d %d",
infoln("Pattern: %s", str);
infoln("Error following match found at %d %d",
strsrch->getMatchedStart(), strsrch->getMatchedLength());
return FALSE;
}
@ -375,7 +375,7 @@ UBool StringSearchTest::assertEqual(const SearchData *search)
if( strsrch2 == strsrch || *strsrch2 != *strsrch ||
!assertEqualWithStringSearch(strsrch2, search)
) {
errln("failure with StringSearch.clone()");
infoln("failure with StringSearch.clone()");
collator->setStrength(getECollationStrength(UCOL_TERTIARY));
delete strsrch;
delete strsrch2;
@ -395,6 +395,7 @@ UBool StringSearchTest::assertCanonicalEqual(const SearchData *search)
BreakIterator *breaker = getBreakIterator(search->breaker);
StringSearch *strsrch;
UChar temp[128];
UBool result = TRUE;
#if UCONFIG_NO_BREAK_ITERATION
if(search->breaker) {
@ -415,22 +416,27 @@ UBool StringSearchTest::assertCanonicalEqual(const SearchData *search)
}
#endif
collator->setStrength(getECollationStrength(search->strength));
collator->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
strsrch = new StringSearch(pattern, text, (RuleBasedCollator *)collator,
breaker, status);
strsrch->setAttribute(USEARCH_CANONICAL_MATCH, USEARCH_ON, status);
if (U_FAILURE(status)) {
errln("Error opening string search %s", u_errorName(status));
return FALSE;
result = FALSE;
goto bail;
}
if (!assertEqualWithStringSearch(strsrch, search)) {
collator->setStrength(getECollationStrength(UCOL_TERTIARY));
delete strsrch;
return FALSE;
result = FALSE;
goto bail;
}
bail:
collator->setStrength(getECollationStrength(UCOL_TERTIARY));
collator->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, status);
delete strsrch;
return TRUE;
return result;
}
UBool StringSearchTest::assertEqualWithAttribute(const SearchData *search,
@ -681,7 +687,7 @@ void StringSearchTest::TestBasic()
while (BASIC[count].text != NULL) {
//printf("count %d", count);
if (!assertEqual(&BASIC[count])) {
errln("Error at test number %d", count);
infoln("Error at test number %d", count);
}
count ++;
}
@ -698,14 +704,14 @@ void StringSearchTest::TestNormExact()
}
while (BASIC[count].text != NULL) {
if (!assertEqual(&BASIC[count])) {
errln("Error at test number %d", count);
infoln("Error at test number %d", count);
}
count ++;
}
count = 0;
while (NORMEXACT[count].text != NULL) {
if (!assertEqual(&NORMEXACT[count])) {
errln("Error at test number %d", count);
infoln("Error at test number %d", count);
}
count ++;
}
@ -713,7 +719,7 @@ void StringSearchTest::TestNormExact()
count = 0;
while (NONNORMEXACT[count].text != NULL) {
if (!assertEqual(&NONNORMEXACT[count])) {
errln("Error at test number %d", count);
infoln("Error at test number %d", count);
}
count ++;
}
@ -724,7 +730,7 @@ void StringSearchTest::TestStrength()
int count = 0;
while (STRENGTH[count].text != NULL) {
if (!assertEqual(&STRENGTH[count])) {
errln("Error at test number %d", count);
infoln("Error at test number %d", count);
}
count ++;
}
@ -810,7 +816,7 @@ void StringSearchTest::TestBreakIterator()
}
strsrch->reset();
if (!assertEqualWithStringSearch(strsrch, search)) {
errln("Error at test number %d", count);
infoln("Error at test number %d", count);
}
delete strsrch;
count += 2;
@ -818,7 +824,7 @@ void StringSearchTest::TestBreakIterator()
count = 0;
while (BREAKITERATOREXACT[count].text != NULL) {
if (!assertEqual(&BREAKITERATOREXACT[count])) {
errln("Error at test number %d", count);
infoln("Error at test number %d", count);
}
count ++;
}
@ -838,7 +844,7 @@ void StringSearchTest::TestVariable()
while (VARIABLE[count].text != NULL) {
logln("variable %d", count);
if (!assertEqual(&VARIABLE[count])) {
errln("Error at test number %d", count);
infoln("Error at test number %d", count);
}
count ++;
}
@ -1546,7 +1552,7 @@ void StringSearchTest::TestIgnorable()
delete collator;
}
void StringSearchTest::TestDiactricMatch()
void StringSearchTest::TestDiacriticMatch()
{
UChar temp[128];
UErrorCode status = U_ZERO_ERROR;
@ -1559,7 +1565,7 @@ void StringSearchTest::TestDiactricMatch()
const SearchData *search;
search = &(DIACTRICMATCH[count]);
search = &(DIACRITICMATCH[count]);
while (search->text != NULL) {
coll = getCollator(search->collator);
coll->setStrength(getECollationStrength(search->strength));
@ -1577,7 +1583,7 @@ void StringSearchTest::TestDiactricMatch()
if (!assertEqualWithStringSearch(strsrch, search)) {
errln("Error at test number %d", count);
}
search = &(DIACTRICMATCH[++count]);
search = &(DIACRITICMATCH[++count]);
delete strsrch;
}
@ -1818,6 +1824,8 @@ void StringSearchTest::TestCollatorCanonical()
if (tailored != NULL) {
delete tailored;
}
return;
}
strsrch->setCollator(m_en_us_, status);
@ -1980,6 +1988,10 @@ void StringSearchTest::TestGetSetOffsetCanonical()
UnicodeString pattern("pattern");
StringSearch *strsrch = new StringSearch(pattern, text, m_en_us_, NULL,
status);
Collator *collator = strsrch->getCollator();
collator->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
strsrch->setAttribute(USEARCH_CANONICAL_MATCH, USEARCH_ON, status);
/* testing out of bounds error */
strsrch->setOffset(-1, status);
@ -2023,7 +2035,7 @@ void StringSearchTest::TestGetSetOffsetCanonical()
errln("Error match found at %d %d",
strsrch->getMatchedStart(),
strsrch->getMatchedLength());
return;
goto bail;
}
matchindex = search.offset[count + 1] == -1 ? -1 :
search.offset[count + 2];
@ -2031,7 +2043,7 @@ void StringSearchTest::TestGetSetOffsetCanonical()
strsrch->setOffset(search.offset[count + 1] + 1, status);
if (strsrch->getOffset() != search.offset[count + 1] + 1) {
errln("Error setting offset");
return;
goto bail;
}
}
@ -2045,9 +2057,12 @@ void StringSearchTest::TestGetSetOffsetCanonical()
errln("Pattern: %s", str);
errln("Error match found at %d %d", strsrch->getMatchedStart(),
strsrch->getMatchedLength());
return;
goto bail;
}
}
bail:
collator->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, status);
delete strsrch;
}

View file

@ -1,6 +1,6 @@
/****************************************************************************
* COPYRIGHT:
* Copyright (c) 2001-2005, International Business Machines Corporation and others
* Copyright (c) 2001-2008, International Business Machines Corporation and others
* All Rights Reserved.
***************************************************************************/
@ -88,7 +88,7 @@ private:
void TestUClassID();
void TestSubclass();
void TestCoverage();
void TestDiactricMatch();
void TestDiacriticMatch();
};
#endif /* #if !UCONFIG_NO_COLLATION */

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,40 @@
/*
**********************************************************************
* Copyright (C) 2005-2008, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#ifndef __SSEARCH_H
#define __SSEARCH_H
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/ucol.h"
#include "intltest.h"
//
// Test of the function usearch_search()
//
// See srchtest.h for the tests for the rest of the string search functions.
//
// IntlTest subclass that groups the string-search tests declared below.
// Only declarations are visible here; the implementations live elsewhere.
class SSearchTest: public IntlTest {
public:
SSearchTest();
virtual ~SSearchTest();
// Indexed test dispatch hook. NOTE(review): presumably maps `index` to one of
// the test methods below and reports its name through `name` -- confirm
// against the implementation.
virtual void runIndexedTest(int32_t index, UBool exec, const char* &name, char* params = NULL );
// Individual test entry points.
virtual void searchTest();
virtual void offsetTest();
virtual void monkeyTest(char *params);
private:
// Note: an array parameter is adjusted to `char *`, so the 2048 bound is
// documentation for callers and is not enforced by the compiler.
virtual const char *getPath(char buffer[2048], const char *filename);
// Helper for monkeyTest(). NOTE(review): the meaning of the return value is
// not visible from this header -- confirm against the implementation.
virtual int32_t monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
const char *name, const char *strength, uint32_t seed);
};
#endif

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2007, International Business Machines Corporation and
* Copyright (c) 1997-2008, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -42,6 +42,7 @@
#include "normconf.h"
#include "thcoll.h"
#include "srchtest.h"
#include "ssearch.h"
#include "cntabcol.h"
#include "lcukocol.h"
#include "ucaconf.h"
@ -49,6 +50,9 @@
#include "cmemory.h"
//#include "rndmcoll.h"
// Set to 1 to test offsets in backAndForth()
#define TEST_OFFSETS 0
#define TESTCLASS(n,classname) \
case n: \
name = #classname; \
@ -89,6 +93,7 @@ void IntlTestCollator::runIndexedTest( int32_t index, UBool exec, const char* &n
TESTCLASS(19, CollationServiceTest);
TESTCLASS(20, CollationFinnishTest); // removed by weiv - we have changed Finnish collation
//TESTCLASS(21, RandomCollatorTest); // See ticket 5747 about reenabling this test.
TESTCLASS(21, SSearchTest);
default: name = ""; break;
}
@ -392,7 +397,7 @@ void IntlTestCollator::backAndForth(CollationElementIterator &iter)
{
// Run through the iterator forwards and stick it into an array
int32_t orderLength = 0;
int32_t *orders = getOrders(iter, orderLength);
Order *orders = getOrders(iter, orderLength);
UErrorCode status = U_ZERO_ERROR;
// Now go through it backwards and make sure we get the same values
@ -404,6 +409,8 @@ void IntlTestCollator::backAndForth(CollationElementIterator &iter)
while ((o = iter.previous(status)) != CollationElementIterator::NULLORDER)
{
int32_t offset = iter.getOffset();
if (index == 0) {
if(o == 0) {
continue;
@ -411,28 +418,39 @@ void IntlTestCollator::backAndForth(CollationElementIterator &iter)
// going backwards
errln("Backward iteration returned a non ignorable after orders are exhausted");
break;
}
}
}
if (o != orders[--index])
{
index -= 1;
if (o != orders[index].order) {
if (o == 0)
index ++;
else
{
while (index > 0 && orders[--index] == 0)
{
index += 1;
else {
while (index > 0 && orders[--index].order == 0) {
// nothing...
}
if (o != orders[index])
{
errln("Mismatch at index %d: 0x%X vs 0x%X", index,
orders[index], o);
break;
if (o != orders[index].order) {
errln("Mismatched order at index %d: 0x%0:8X vs. 0x%0:8X", index,
orders[index].order, o);
//break;
goto bail;
}
}
}
#if TEST_OFFSETS
if (offset != orders[index].offset) {
errln("Mismatched offset at index %d: %d vs. %d", index,
orders[index].offset, offset);
//break;
goto bail;
}
#endif
}
while (index != 0 && orders[index - 1] == 0)
while (index != 0 && orders[index - 1].order == 0)
{
index --;
}
@ -466,6 +484,7 @@ void IntlTestCollator::backAndForth(CollationElementIterator &iter)
errln("");
}
bail:
delete[] orders;
}
@ -474,12 +493,13 @@ void IntlTestCollator::backAndForth(CollationElementIterator &iter)
* Return an array of Order records, one for each collation order returned
* by calls to next on the specified iterator; each record pairs the order
* with the iterator offset read just before that call to next
*/
int32_t *IntlTestCollator::getOrders(CollationElementIterator &iter, int32_t &orderLength)
IntlTestCollator::Order *IntlTestCollator::getOrders(CollationElementIterator &iter, int32_t &orderLength)
{
int32_t maxSize = 100;
int32_t size = 0;
int32_t *orders = new int32_t[maxSize];
Order *orders = new Order[maxSize];
UErrorCode status = U_ZERO_ERROR;
int32_t offset = iter.getOffset();
int32_t order;
while ((order = iter.next(status)) != CollationElementIterator::NULLORDER)
@ -487,21 +507,25 @@ int32_t *IntlTestCollator::getOrders(CollationElementIterator &iter, int32_t &or
if (size == maxSize)
{
maxSize *= 2;
int32_t *temp = new int32_t[maxSize];
Order *temp = new Order[maxSize];
uprv_memcpy(temp, orders, size * sizeof(int32_t));
uprv_memcpy(temp, orders, size * sizeof(Order));
delete[] orders;
orders = temp;
}
orders[size++] = order;
orders[size].order = order;
orders[size].offset = offset;
offset = iter.getOffset();
size += 1;
}
if (maxSize > size)
{
int32_t *temp = new int32_t[size];
Order *temp = new Order[size];
uprv_memcpy(temp, orders, size * sizeof(int32_t));
uprv_memcpy(temp, orders, size * sizeof(Order));
delete[] orders;
orders = temp;
}

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2003, International Business Machines Corporation and
* Copyright (c) 1997-2008, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -23,6 +23,12 @@
class IntlTestCollator: public IntlTest {
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL );
protected:
// One collation element as recorded by getOrders(): the order value together
// with the iterator offset at which it was read.
struct Order
{
// Collation order returned by CollationElementIterator::next().
int32_t order;
// Iterator getOffset() value sampled just before that call to next().
int32_t offset;
};
// These two should probably go down in IntlTest
void doTest(Collator* col, const UChar *source, const UChar *target, Collator::EComparisonResult result);
@ -42,7 +48,7 @@ protected:
* Return an array of Order records, one for each collation order returned
* by calls to next on the specified iterator; each record pairs the order
* with the iterator offset read just before that call to next
*/
int32_t *getOrders(CollationElementIterator &iter, int32_t &orderLength);
Order *getOrders(CollationElementIterator &iter, int32_t &orderLength);
UCollationResult compareUsingPartials(UCollator *coll, const UChar source[], int32_t sLen, const UChar target[], int32_t tLen, int32_t pieceSize, UErrorCode &status);
};

View file

@ -1,4 +1,4 @@
// Copyright (c) 2001-2007 International Business Machines
// Copyright (c) 2001-2008 International Business Machines
// Corporation and others. All Rights Reserved.
DataDrivenCollationTest:table(nofallback) {
Info {
@ -513,6 +513,25 @@ DataDrivenCollationTest:table(nofallback) {
"xj<xSx<xș=xş<xȘ=xŞ<Xș=Xş<XȘ=XŞ<xșx=xşx<xȘx=xŞx<xT<xTx<xț=xţ<xȚ=xŢ<Xț=Xţ<XȚ"
"=XŢ<xțx=xţx<xȚx=xŢx<xU"
}
}
}
testOffsets {
Info {
Description { "This tests cases where forwards and backwards iteration get different offsets" }
}
Settings {
{
TestLocale { "en" }
Arguments { "[strength 3]" }
}
}
Cases {
"a\uD800\uDC00\uDC00<b\uD800\uDC00\uDC00",
"\u0301A\u0301\u0301<\u0301B\u0301\u0301",
"abcd\r\u0301<abce\r\u0301"
}
}
}
}

413
icu4c/source/test/testdata/ssearch.xml vendored Normal file
View file

@ -0,0 +1,413 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Copyright (c) 2007-2008 IBM Corporation and others. All rights reserved -->
<!-- Test data file for string search -->
<!DOCTYPE stringsearch-tests [
<!ELEMENT stringsearch-tests (test-case+)>
<!ATTLIST stringsearch-tests debug IDREF #IMPLIED >
<!ELEMENT test-case (pattern, pre?, m?, post?)>
<!ATTLIST test-case
id ID #REQUIRED
locale CDATA "en"
strength (PRIMARY | SECONDARY | TERTIARY | QUATERNARY | IDENTICAL) "TERTIARY"
norm (ON | OFF) "OFF"
>
<!ELEMENT pattern (#PCDATA)>
<!ELEMENT pre (#PCDATA)>
<!ELEMENT m (#PCDATA)>
<!ELEMENT post (#PCDATA)>
]>
<stringsearch-tests debug="test32">
<!-- debug="test11" (for copying into the above element) -->
<!-- Very simple match -->
<test-case id="test01" >
<pattern>abc</pattern>
<pre>xxx</pre><m>abc</m><post>yyy</post>
</test-case>
<!-- Very simple no-match -->
<test-case id="test02" >
<pattern>abc</pattern>
<pre>xxx</pre><post>yyy</post>
</test-case>
<!-- Match after several near-misses. -->
<test-case id="test03" >
<pattern>string</pattern>
<pre>silly spring stling strxng strilg strinx stri</pre><m>string</m><post> fling</post>
</test-case>
<test-case id="test04" strength="PRIMARY" >
<pattern>FUSS</pattern>
<pre>abc</pre><m>fuss</m><post>sss</post>
</test-case>
<test-case id="test05" strength="PRIMARY" >
<pattern>FUSS</pattern>
<pre>abc</pre><m>fuß</m><post>sss</post>
</test-case>
<test-case id="test05.5" strength="PRIMARY" >
<pattern>fuss</pattern>
<pre>a </pre>
<m>fuß</m>
<post>ball table</post>
</test-case>
<test-case id="test06" strength="PRIMARY" >
<pattern>fuß</pattern>
<pre>abc</pre><m>fuss</m><post>xyz</post>
</test-case>
<test-case id="test07" strength="SECONDARY" >
<pattern>fuß</pattern>
<pre>abcfussxyz</pre>
</test-case>
<test-case id="test08" strength="PRIMARY" >
<pattern>fus</pattern>
<pre>abcfuß</pre><post>xyz</post>
</test-case>
<!-- A good match following an initial match that failed because
of not ending on a character boundary -->
<test-case id="test09" strength="PRIMARY">
<pattern>fus</pattern>
<pre>fuß </pre><m>fus</m><post>sss</post>
</test-case>
<!-- Test cases from usrchdat.c BREAKITERATOREXACT -->
<test-case id="test10" strength="TERTIARY">
<pattern>fox</pattern>
<m>fox</m><post>y fox</post>
</test-case>
<test-case id="test11" strength="PRIMARY" locale="de_DE@collation=phonebook">
<pattern>toe</pattern>
<pre>This is a </pre><m></m><post>ne</post>
</test-case>
<test-case id="test11a" strength="SECONDARY" locale="de_DE@collation=phonebook">
<pattern>toe</pattern>
<pre>This is a </pre><post>Töne</post>
</test-case>
<test-case id="test12" strength="TERTIARY">
<pattern>e</pattern>
<pre>tésting that é doés not match </pre><m>e</m><post></post>
</test-case>
<test-case id="test13" strength="PRIMARY" locale="fr">
<pattern>e</pattern>
<pre></pre><m>É</m><post>É</post>
</test-case>
<test-case id="test14" strength="PRIMARY" locale="fr">
<pattern>O</pattern>
<pre>C</pre><m>O\u0302</m><post></post>
</test-case>
<!-- Test cases from usrchdat.c STRENGTH -->
<test-case id="test15" strength="PRIMARY" locale="en">
<pattern>fox</pattern>
<pre>The quick brown </pre><m>fox</m><post> jumps over the lazy foxes</post>
</test-case>
<test-case id="test16" strength="PRIMARY" locale="fr">
<pattern>peche</pattern>
<pre>blackbirds pat </pre><m>p\u00E9ch\u00E9</m><post> </post>
</test-case>
<test-case id="test17" strength="PRIMARY" locale="fr">
<pattern>peche</pattern>
<pre>blackbirds pat </pre><m>p\u00EAche</m><post> </post>
</test-case>
<test-case id="test18" strength="PRIMARY" locale="fr">
<pattern>peche</pattern>
<pre>blackbirds pat </pre><m>p\u00E9che</m><post>r </post>
</test-case>
<test-case id="test19" strength="PRIMARY" locale="fr">
<pattern>peche</pattern>
<pre>blackbirds pat </pre><m>p\u00EAche</m><post>r </post>
</test-case>
<test-case id="test20" strength="PRIMARY" locale="es">
<pattern>channel</pattern>
<pre>A </pre><m>channel</m><post>, </post>
</test-case>
<test-case id="test21" strength="PRIMARY" locale="es">
<pattern>channel</pattern>
<pre>A </pre><m>CHANNEL</m><post>, </post>
</test-case>
<test-case id="test22" strength="PRIMARY" locale="es">
<pattern>channel</pattern>
<pre>A </pre><m>Channel</m><post>s, </post>
</test-case>
<test-case id="test23" strength="PRIMARY" locale="es">
<pattern>channel</pattern>
<pre>A </pre><m>channel</m><post>... </post>
</test-case>
<test-case id="test24" strength="TERTIARY" locale="en">
<pattern>A\u0300</pattern>
<pre>A miss, and then </pre><m>\u00c0</m><post> should match but not A"</post>
</test-case>
<!-- TODO: In the original test data, this test matched at IDENTICAL strength.
Doesn't seem right. The characters are different.
-->
<test-case id="test24a" strength="IDENTICAL" locale="en">
<pattern>A\u0300</pattern>
<pre>At IDENTICAL, shoud this match? </pre><m>\u00c0</m><post></post>
</test-case>
<test-case id="test25" strength="SECONDARY" locale="en">
<pattern>Ű</pattern>
<pre>12</pre><m>ű</m><post> Ű</post>
</test-case>
<test-case id="test26" strength="SECONDARY" locale="en">
<pattern>A</pattern>
<pre>12</pre><m>a</m><post>...</post>
</test-case>
<!-- Test Cases from usrchdat.c, VARIABLE -->
<test-case id="test27" strength="TERTIARY" locale="en">
<pattern>blackbird</pattern>
<pre>black-bird </pre><m>blackbird</m><post>...</post>
</test-case>
<test-case id="test28" strength="TERTIARY" locale="en">
<pattern>go</pattern>
<pre> on</pre>
</test-case>
<!-- TODO: this gives a U_ILLEGAL_ARGUMENT error when opening
the UStringSearch. How did the original test run? -->
<!--
<test-case id="test29" strength="PRIMARY" locale="en">
<pattern> </pattern>
<pre></pre><m></m><post>abc</post>
</test-case>
-->
<test-case id="test30" strength="SECONDARY" locale="en">
<pattern>abc</pattern>
<pre> a bc ab c a bc ab c"</pre>
</test-case>
<test-case id="test31" strength="SECONDARY" locale="en">
<pattern>abc</pattern>
<pre> ---------------</pre>
</test-case>
<!-- Normalization test cases from usrchdat.c -->
<test-case id="test32" strength="TERTIARY" norm="ON">
<pattern>a\u0325\u0300</pattern>
<pre></pre><m>a\u0300\u0325</m>
</test-case>
<test-case id="test32a" strength="TERTIARY" norm="OFF">
<pattern>a\u0325\u0300</pattern>
<pre>a\u0300\u0325</pre>
</test-case>
<!-- COMPOSITEBOUNDARIES from usrchdat.c
Boundaries are not identical to original test data because
of matching only full combining sequences
-->
<test-case id="test40" strength="TERTIARY">
<pattern>A</pattern>
<pre>À</pre> <!-- \u00C0 -->
</test-case>
<test-case id="test41" strength="TERTIARY">
<pattern>A</pattern>
<pre>À</pre><m>A</m><post>C</post>
</test-case>
<test-case id="test42" strength="TERTIARY">
<pattern>A\u030A</pattern>
<pre>À\u01FA</pre>
</test-case>
<!-- SUPPLEMENTARYCANONICAL from usrchdat.c -->
<test-case id="test50" strength="TERTIARY">
<pattern>\uD800\uDC00</pattern>
<pre>abc \uD802\uDC00 \uD800\uDC01 \uD801\uDC00 </pre><m>\uD800\uDC00</m>
<post>abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00</post>
</test-case>
<test-case id="test51" strength="TERTIARY">
<pattern>\\uD834\\uDDB9</pattern>
<pre>and</pre><m>\\uD834\\uDDB9</m><post>this sentence</post>
</test-case>
<test-case id="test52" strength="TERTIARY">
<pattern> \\uD834\\uDDB9 </pattern>
<pre>and</pre><m> \\uD834\\uDDB9 </m><post>this sentence</post>
</test-case>
<test-case id="test53" strength="TERTIARY">
<pattern>-\\uD834\\uDDB9-</pattern>
<pre>and</pre><m>-\\uD834\\uDDB9-</m><post>this sentence</post>
</test-case>
<test-case id="test54" strength="TERTIARY">
<pattern>,\\uD834\\uDDB9,</pattern>
<pre>and</pre><m>,\\uD834\\uDDB9,</m><post>this sentence</post>
</test-case>
<test-case id="test55" strength="TERTIARY">
<pattern>?\\uD834\\uDDB9?</pattern>
<pre>and</pre><m>?\\uD834\\uDDB9?</m><post>this sentence</post>
</test-case>
<!-- Long combining sequences -->
<test-case id="test60" strength="PRIMARY">
<pattern>A\u0301\u0301\u0301\u0301</pattern>
<m>A\u0301\u0301\u0301\u0301\u0301</m>
</test-case>
<test-case id="test61" strength="TERTIARY">
<pattern>A\u0301\u0301\u0301\u0301</pattern>
<pre>A\u0301\u0301\u0301\u0301\u0301</pre>
</test-case>
<test-case id="test62" strength="TERTIARY">
<pattern>A\u0301\u0301\u0301\u0301</pattern>
<m>A\u0301\u0301\u0301\u0301</m>
</test-case>
<!-- stand-alone combining marks don't match attached marks -->
<test-case id="test63" strength="TERTIARY">
<pattern>\u0301</pattern>
<pre>A\u0301\u0301\u0301\u0301</pre>
</test-case>
<test-case id="test64" strength="TERTIARY">
<pattern>\u0301</pattern>
<post>\u0301\u0301\u0301\u0301</post>
</test-case>
<!-- stand-alone combining mark does match an un-attached combining mark -->
<test-case id="test65" strength="TERTIARY">
<pattern>\u0301</pattern>
<m>\u0301</m><post>A\u0301\u0301</post>
</test-case>
<test-case id="test66" strength="TERTIARY">
<pattern>\u0301</pattern>
<m>\u0301</m>
</test-case>
<!-- stand-alone combining marks at end of the target text -->
<test-case id="test67" strength="TERTIARY">
<pattern>\u0301</pattern>
<pre>abcd\r</pre><m>\u0301</m>
</test-case>
<!-- attached combining marks at end of the target text, no match -->
<test-case id="test68" strength="TERTIARY">
<pattern>\u0301</pattern>
<pre>abcd\u0301</pre>
</test-case>
<!-- no match within expansions at the start -->
<test-case id="test70" strength="PRIMARY">
<pattern>Eligature</pattern>
<pre>Æligature</pre>
</test-case>
<test-case id="test71" strength="PRIMARY">
<pattern>AEligature</pattern>
<m>Æligature</m>
</test-case>
<test-case id="test72" strength="PRIMARY">
<pattern>AEligature</pattern>
<m>Æligature</m>
</test-case>
<!-- unattached combining Tilde will not match a Tilde that is
part of a composed Ñ (\u00D1) -->
<test-case id="test73" strength="SECONDARY">
<pattern>\u0303</pattern> <!-- combining tilde -->
<pre>Ñ&#x0d;</pre><m>\u0303</m>
</test-case>
<test-case id="test74" strength="SECONDARY">
<pattern>\u0303</pattern> <!-- combining tilde -->
<pre>Ñ &#x0d;</pre><m>\u0303</m><post>a</post>
</test-case>
<test-case id="test75" strength="TERTIARY" locale="fr">
<pattern>\u00EA</pattern>
<pre>p</pre><m>\u00EA</m><post>che</post>
</test-case>
<test-case id="test76" strength="TERTIARY" locale="fr">
<pattern>\u00EA</pattern>
<pre>p</pre><m>e\u0302</m><post>che</post>
</test-case>
<test-case id="test77" strength="TERTIARY" locale="fr">
<pattern>e\u0302</pattern>
<pre>p</pre><m>\u00EA</m><post>che</post>
</test-case>
<!-- Test cases from ticket:5382 -->
<test-case id="test78" strength="SECONDARY" locale="hu_HU">
<pattern>\u0170</pattern>
<m>\u0171</m>
<post>12</post>
</test-case>
<test-case id="test79" strength="SECONDARY" locale="hu_HU">
<pattern>\u0170</pattern>
<pre>1</pre>
<m>\u0171</m>
<post>2</post>
</test-case>
<test-case id="test80" strength="SECONDARY" locale="hu_HU">
<pattern>\u0170</pattern>
<pre>12</pre>
<m>\u0171</m>
</test-case>
<!-- Test cases from ticket:5959 -->
<test-case id="test81" strength="SECONDARY">
<pattern>\u2166</pattern>
<m>VII</m>
</test-case>
<test-case id="test82" strength="SECONDARY">
<pattern>VII</pattern>
<m>\u2166</m>
</test-case>
</stringsearch-tests>

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2004-2006, International Business Machines
* Copyright (C) 2004-2008, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -74,10 +74,15 @@ UXMLParser::UXMLParser(UErrorCode &status) :
// XML Doctype decl production #28
// example "<!DOCTYPE foo SYSTEM "somewhere" >
// or "<!DOCTYPE foo [internal dtd]>
// TODO: we don't actually parse the DOCTYPE or internal subsets.
// Some internal dtd subsets could confuse this simple-minded
// attempt at skipping over them.
mXMLDoctype(UnicodeString("(?s)<!DOCTYPE.+?>"), 0, status),
// attempt at skipping over them, specifically, occurrences
// of closing square brackets. These could appear in comments,
// or in parameter entity declarations, for example.
mXMLDoctype(UnicodeString(
"(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)"
), 0, status),
// XML PI production #16
// example "<?target stuff?>