mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 14:31:31 +00:00
ICU-5420 merge changes from branches/eric/string-search r.23303 - r.23976
X-SVN-Rev: 23977
This commit is contained in:
parent
905f90890e
commit
9011fe483f
26 changed files with 4690 additions and 211 deletions
|
@ -106,9 +106,21 @@ void SearchIterator::setBreakIterator(BreakIterator *breakiter,
|
|||
UErrorCode &status)
|
||||
{
|
||||
if (U_SUCCESS(status)) {
|
||||
#if 0
|
||||
m_search_->breakIter = NULL;
|
||||
// the c++ breakiterator may not make use of ubreakiterator.
|
||||
// so we'll have to keep track of it ourselves.
|
||||
#else
|
||||
// Well, gee... the Constructors that take a BreakIterator
|
||||
// all cast the BreakIterator to a UBreakIterator and
|
||||
// pass it to the corresponding usearch_openFromXXX
|
||||
// routine, so there's no reason not to do this.
|
||||
//
|
||||
// Besides, a UBreakIterator is a BreakIterator, so
|
||||
// any subclass of BreakIterator should work fine here...
|
||||
m_search_->breakIter = (UBreakIterator *) breakiter;
|
||||
#endif
|
||||
|
||||
m_breakiterator_ = breakiter;
|
||||
}
|
||||
}
|
||||
|
@ -283,10 +295,16 @@ int32_t SearchIterator::previous(UErrorCode &status)
|
|||
}
|
||||
|
||||
if (matchindex != USEARCH_DONE) {
|
||||
if (m_search_->isOverlap) {
|
||||
matchindex += m_search_->matchedLength - 2;
|
||||
}
|
||||
|
||||
return handlePrev(matchindex, status);
|
||||
}
|
||||
|
||||
return handlePrev(offset, status);
|
||||
}
|
||||
|
||||
return USEARCH_DONE;
|
||||
}
|
||||
|
||||
|
|
|
@ -350,11 +350,13 @@ int32_t StringSearch::handleNext(int32_t position, UErrorCode &status)
|
|||
// looking at usearch.cpp, this part is shifted out to
|
||||
// StringSearch instead of SearchIterator because m_strsrch_ is
|
||||
// not accessible in SearchIterator
|
||||
#if 0
|
||||
if (position + m_strsrch_->pattern.defaultShiftSize
|
||||
> m_search_->textLength) {
|
||||
setMatchNotFound();
|
||||
return USEARCH_DONE;
|
||||
}
|
||||
#endif
|
||||
if (m_search_->matchedLength <= 0) {
|
||||
// the flipping direction issue has already been handled
|
||||
// in next()
|
||||
|
@ -366,6 +368,8 @@ int32_t StringSearch::handleNext(int32_t position, UErrorCode &status)
|
|||
}
|
||||
|
||||
ucol_setOffset(m_strsrch_->textIter, position, &status);
|
||||
|
||||
#if 0
|
||||
for (;;) {
|
||||
if (m_search_->isCanonicalMatch) {
|
||||
// can't use exact here since extra accents are allowed.
|
||||
|
@ -397,6 +401,29 @@ int32_t StringSearch::handleNext(int32_t position, UErrorCode &status)
|
|||
return m_search_->matchedIndex;
|
||||
}
|
||||
}
|
||||
#else
|
||||
// if m_strsrch_->breakIter is always the same as m_breakiterator_
|
||||
// then we don't need to check the match boundaries here because
|
||||
// usearch_handleNextXXX will already have done it.
|
||||
if (m_search_->isCanonicalMatch) {
|
||||
// *could* actually use exact here 'cause no extra accents allowed...
|
||||
usearch_handleNextCanonical(m_strsrch_, &status);
|
||||
} else {
|
||||
usearch_handleNextExact(m_strsrch_, &status);
|
||||
}
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return USEARCH_DONE;
|
||||
}
|
||||
|
||||
if (m_search_->matchedIndex == USEARCH_DONE) {
|
||||
ucol_setOffset(m_strsrch_->textIter, m_search_->textLength, &status);
|
||||
} else {
|
||||
ucol_setOffset(m_strsrch_->textIter, m_search_->matchedIndex, &status);
|
||||
}
|
||||
|
||||
return m_search_->matchedIndex;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
return USEARCH_DONE;
|
||||
|
@ -424,11 +451,13 @@ int32_t StringSearch::handlePrev(int32_t position, UErrorCode &status)
|
|||
// looking at usearch.cpp, this part is shifted out to
|
||||
// StringSearch instead of SearchIterator because m_strsrch_ is
|
||||
// not accessible in SearchIterator
|
||||
#if 0
|
||||
if (!m_search_->isOverlap &&
|
||||
position - m_strsrch_->pattern.defaultShiftSize < 0) {
|
||||
setMatchNotFound();
|
||||
return USEARCH_DONE;
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
if (m_search_->isCanonicalMatch) {
|
||||
// can't use exact here since extra accents are allowed.
|
||||
|
@ -452,6 +481,22 @@ int32_t StringSearch::handlePrev(int32_t position, UErrorCode &status)
|
|||
return m_search_->matchedIndex;
|
||||
}
|
||||
}
|
||||
#else
|
||||
ucol_setOffset(m_strsrch_->textIter, position, &status);
|
||||
|
||||
if (m_search_->isCanonicalMatch) {
|
||||
// *could* use exact match here since extra accents *not* allowed!
|
||||
usearch_handlePreviousCanonical(m_strsrch_, &status);
|
||||
} else {
|
||||
usearch_handlePreviousExact(m_strsrch_, &status);
|
||||
}
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return USEARCH_DONE;
|
||||
}
|
||||
|
||||
return m_search_->matchedIndex;
|
||||
#endif
|
||||
}
|
||||
|
||||
return m_search_->matchedIndex;
|
||||
|
|
|
@ -101,6 +101,10 @@ inline void IInit_collIterate(const UCollator *collator, const UChar *sourceStri
|
|||
(s)->extendCEs = NULL;
|
||||
(s)->extendCEsSize = 0;
|
||||
(s)->CEpos = (s)->toReturn = (s)->CEs;
|
||||
(s)->offsetBuffer = NULL;
|
||||
(s)->offsetBufferSize = 0;
|
||||
(s)->offsetReturn = (s)->offsetStore = NULL;
|
||||
(s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
|
||||
(s)->writableBuffer = (s)->stackWritableBuffer;
|
||||
(s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
|
||||
(s)->coll = (collator);
|
||||
|
@ -175,6 +179,7 @@ inline void loadState(collIterate *data, const collIterateState *backup,
|
|||
}
|
||||
}
|
||||
data->pos = backup->pos;
|
||||
|
||||
if ((data->flags & UCOL_ITER_INNORMBUF) &&
|
||||
data->writableBuffer != backup->bufferaddress) {
|
||||
/*
|
||||
|
@ -1377,6 +1382,7 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
|
|||
}
|
||||
|
||||
UChar ch = 0;
|
||||
collationSource->offsetReturn = NULL;
|
||||
|
||||
for (;;) /* Loop handles case when incremental normalize switches */
|
||||
{ /* to or from the side buffer / original string, and we */
|
||||
|
@ -1586,6 +1592,83 @@ void collPrevIterNormalize(collIterate *data)
|
|||
unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
|
||||
normLen, &status);
|
||||
|
||||
if (data->offsetBuffer == NULL) {
|
||||
int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
|
||||
data->offsetBufferSize = len;
|
||||
data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len);
|
||||
data->offsetStore = data->offsetBuffer;
|
||||
} else if(data->offsetBufferSize < (int32_t) normLen) {
|
||||
int32_t storeIX = data->offsetStore - data->offsetBuffer;
|
||||
int32_t *tob = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1));
|
||||
|
||||
if (tob != NULL) {
|
||||
data->offsetBuffer = tob;
|
||||
data->offsetStore = &data->offsetBuffer[storeIX];
|
||||
data->offsetBufferSize = normLen + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The usual case at this point is that we've got a base
|
||||
* character followed by marks that were normalized. If
|
||||
* fcdPosition is NULL, that means that we backed up to
|
||||
* the beginning of the string and there's no base character.
|
||||
*
|
||||
* Forward processing will usually normalize when it sees
|
||||
* the first mark, so that mark will get its natural offset
|
||||
* and the rest will get the offset of the character following
|
||||
* the marks. The base character will also get its natural offset.
|
||||
*
|
||||
* We write the offset of the base character, if there is one,
|
||||
* followed by the offset of the first mark and then the offsets
|
||||
* of the rest of the marks.
|
||||
*/
|
||||
int32_t firstMarkOffset = 0;
|
||||
int32_t trailOffset = data->pos - data->string + 1;
|
||||
int32_t trailCount = normLen - 1;
|
||||
|
||||
if (data->fcdPosition != NULL) {
|
||||
int32_t baseOffset = data->fcdPosition - data->string;
|
||||
UChar baseChar = *data->fcdPosition;
|
||||
|
||||
firstMarkOffset = baseOffset + 1;
|
||||
|
||||
/*
|
||||
* If the base character is the start of a contraction, forward processing
|
||||
* will normalize the marks while checking for the contraction, which means
|
||||
* that the offset of the first mark will be the same as the other marks.
|
||||
*
|
||||
* **** THIS IS PROBABLY NOT A COMPLETE TEST ****
|
||||
*/
|
||||
if (baseChar >= 0x100) {
|
||||
int32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
|
||||
|
||||
if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
|
||||
baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
|
||||
}
|
||||
|
||||
if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
|
||||
firstMarkOffset = trailOffset;
|
||||
}
|
||||
}
|
||||
|
||||
*(data->offsetStore++) = baseOffset;
|
||||
}
|
||||
|
||||
*(data->offsetStore++) = firstMarkOffset;
|
||||
|
||||
for (int32_t i = 0; i < trailCount; i += 1) {
|
||||
*(data->offsetStore++) = trailOffset;
|
||||
}
|
||||
|
||||
data->offsetRepeatValue = trailOffset;
|
||||
|
||||
data->offsetReturn = data->offsetStore - 1;
|
||||
if (data->offsetReturn == data->offsetBuffer) {
|
||||
data->offsetStore = data->offsetBuffer;
|
||||
}
|
||||
|
||||
data->pos = data->writableBuffer + data->writableBufSize;
|
||||
data->origFlags = data->flags;
|
||||
data->flags |= UCOL_ITER_INNORMBUF;
|
||||
|
@ -1756,10 +1839,24 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
|
|||
UErrorCode *status)
|
||||
{
|
||||
uint32_t result = (uint32_t)UCOL_NULLORDER;
|
||||
|
||||
if (data->offsetReturn != NULL) {
|
||||
if (data->offsetRepeatCount > 0) {
|
||||
data->offsetRepeatCount -= 1;
|
||||
} else {
|
||||
if (data->offsetReturn == data->offsetBuffer) {
|
||||
data->offsetReturn = NULL;
|
||||
data->offsetStore = data->offsetBuffer;
|
||||
} else {
|
||||
data->offsetReturn -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((data->extendCEs && data->toReturn > data->extendCEs) ||
|
||||
(!data->extendCEs && data->toReturn > data->CEs))
|
||||
{
|
||||
data->toReturn --;
|
||||
data->toReturn -= 1;
|
||||
result = *(data->toReturn);
|
||||
if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
|
||||
data->CEpos = data->toReturn;
|
||||
|
@ -1767,6 +1864,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
|
|||
}
|
||||
else {
|
||||
UChar ch = 0;
|
||||
|
||||
/*
|
||||
Loop handles case when incremental normalize switches to or from the
|
||||
side buffer / original string, and we need to start again to get the
|
||||
|
@ -1813,6 +1911,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
|
|||
data->pos = data->fcdPosition + 1;
|
||||
}
|
||||
data->flags = data->origFlags;
|
||||
data->offsetRepeatValue = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -1903,10 +2002,12 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(result == UCOL_NOT_FOUND) {
|
||||
result = getPrevImplicit(ch, data);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -2399,6 +2500,7 @@ inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
|
|||
}
|
||||
uint32_t r = uprv_uca_getImplicitPrimary(cp);
|
||||
*(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
|
||||
collationSource->offsetRepeatCount += 1;
|
||||
return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
|
||||
}
|
||||
|
||||
|
@ -2871,6 +2973,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
{
|
||||
*(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
|
||||
CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
|
||||
source->offsetRepeatCount += 1;
|
||||
return CE;
|
||||
}
|
||||
case EXPANSION_TAG:
|
||||
|
@ -2880,18 +2983,24 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
/* I have to decide where continuations are going to be dealt with */
|
||||
uint32_t size;
|
||||
uint32_t i; /* general counter */
|
||||
|
||||
CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
|
||||
size = getExpansionCount(CE);
|
||||
CE = *CEOffset++;
|
||||
//source->offsetRepeatCount = -1;
|
||||
|
||||
if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
|
||||
for(i = 1; i<size; i++) {
|
||||
*(source->CEpos++) = *CEOffset++;
|
||||
source->offsetRepeatCount += 1;
|
||||
}
|
||||
} else { /* else, we do */
|
||||
while(*CEOffset != 0) {
|
||||
*(source->CEpos++) = *CEOffset++;
|
||||
source->offsetRepeatCount += 1;
|
||||
}
|
||||
}
|
||||
|
||||
return CE;
|
||||
}
|
||||
case DIGIT_TAG:
|
||||
|
@ -3263,6 +3372,29 @@ inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
|
|||
|
||||
*(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
|
||||
collationSource->toReturn = collationSource->CEpos;
|
||||
|
||||
if (collationSource->offsetBuffer == NULL) {
|
||||
collationSource->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
collationSource->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
|
||||
collationSource->offsetStore = collationSource->offsetBuffer;
|
||||
}
|
||||
|
||||
// **** doesn't work if using iterator ****
|
||||
if (collationSource->flags & UCOL_ITER_INNORMBUF) {
|
||||
collationSource->offsetRepeatCount = 1;
|
||||
} else {
|
||||
int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
|
||||
|
||||
*(collationSource->offsetStore++) = firstOffset;
|
||||
*(collationSource->offsetStore++) = firstOffset + 1;
|
||||
|
||||
collationSource->offsetReturn = collationSource->offsetStore - 1;
|
||||
*(collationSource->offsetBuffer) = firstOffset;
|
||||
if (collationSource->offsetReturn == collationSource->offsetBuffer) {
|
||||
collationSource->offsetStore = collationSource->offsetBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
return ((r & 0x0000FFFF)<<16) | 0x000000C0;
|
||||
}
|
||||
|
||||
|
@ -3293,6 +3425,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
{
|
||||
case NOT_FOUND_TAG: /* this tag always returns */
|
||||
return CE;
|
||||
|
||||
case SPEC_PROC_TAG:
|
||||
{
|
||||
// Special processing is getting a CE that is preceded by a certain prefix
|
||||
|
@ -3450,15 +3583,54 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
*(UCharOffset) = schar;
|
||||
noChars++;
|
||||
|
||||
int32_t offsetBias;
|
||||
|
||||
#if 0
|
||||
if (source->offsetReturn != NULL) {
|
||||
source->offsetStore = source->offsetReturn - noChars;
|
||||
}
|
||||
|
||||
// **** doesn't work if using iterator ****
|
||||
if (source->flags & UCOL_ITER_INNORMBUF) {
|
||||
if (source->fcdPosition == NULL) {
|
||||
offsetBias = 0;
|
||||
} else {
|
||||
offsetBias = (int32_t)(source->fcdPosition - source->string);
|
||||
}
|
||||
} else {
|
||||
offsetBias = (int32_t)(source->pos - source->string);
|
||||
}
|
||||
|
||||
#else
|
||||
// **** doesn't work if using iterator ****
|
||||
if (source->flags & UCOL_ITER_INNORMBUF) {
|
||||
#if 1
|
||||
offsetBias = -1;
|
||||
#else
|
||||
if (source->fcdPosition == NULL) {
|
||||
offsetBias = 0;
|
||||
} else {
|
||||
offsetBias = (int32_t)(source->fcdPosition - source->string);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
offsetBias = (int32_t)(source->pos - source->string);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* a new collIterate is used to simplify things, since using the current
|
||||
collIterate will mean that the forward and backwards iteration will
|
||||
share and change the same buffers. we don't want to get into that. */
|
||||
collIterate temp;
|
||||
int32_t rawOffset;
|
||||
|
||||
//IInit_collIterate(coll, UCharOffset, -1, &temp);
|
||||
IInit_collIterate(coll, UCharOffset, noChars, &temp);
|
||||
temp.flags &= ~UCOL_ITER_NORM;
|
||||
|
||||
rawOffset = temp.pos - temp.string; // should always be zero?
|
||||
CE = ucol_IGetNextCE(coll, &temp, status);
|
||||
|
||||
if (source->extendCEs) {
|
||||
endCEBuffer = source->extendCEs + source->extendCEsSize;
|
||||
CECount = (source->CEpos - source->extendCEs)/sizeof(uint32_t);
|
||||
|
@ -3466,8 +3638,20 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
CECount = (source->CEpos - source->CEs)/sizeof(uint32_t);
|
||||
}
|
||||
|
||||
if (source->offsetBuffer == NULL) {
|
||||
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
while (CE != UCOL_NO_MORE_CES) {
|
||||
*(source->CEpos ++) = CE;
|
||||
|
||||
if (offsetBias >= 0) {
|
||||
*(source->offsetStore ++) = rawOffset + offsetBias;
|
||||
}
|
||||
|
||||
CECount++;
|
||||
if (source->CEpos == endCEBuffer) {
|
||||
/* ran out of CE space, reallocate to new buffer.
|
||||
|
@ -3494,43 +3678,135 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
source->extendCEs = tempBufCE;
|
||||
}
|
||||
}
|
||||
|
||||
if (CECount == -1) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
source->extendCEsSize = 0;
|
||||
source->CEpos = source->CEs;
|
||||
freeHeapWritableBuffer(&temp);
|
||||
|
||||
if (strbuffer != buffer) {
|
||||
uprv_free(strbuffer);
|
||||
}
|
||||
|
||||
return (uint32_t)UCOL_NULLORDER;
|
||||
}
|
||||
|
||||
source->CEpos = source->extendCEs + CECount;
|
||||
endCEBuffer = source->extendCEs + source->extendCEsSize;
|
||||
}
|
||||
|
||||
if (offsetBias >= 0 && source->offsetStore >= &source->offsetBuffer[source->offsetBufferSize]) {
|
||||
int32_t storeIX = source->offsetStore - source->offsetBuffer;
|
||||
int32_t *tob = (int32_t *) uprv_realloc(source->offsetBuffer,
|
||||
sizeof(int32_t) * (source->offsetBufferSize + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
|
||||
|
||||
if (tob != NULL) {
|
||||
source->offsetBuffer = tob;
|
||||
source->offsetStore = &source->offsetBuffer[storeIX];
|
||||
source->offsetBufferSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE;
|
||||
} else {
|
||||
// memory error...
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
source->CEpos = source->CEs;
|
||||
freeHeapWritableBuffer(&temp);
|
||||
|
||||
if (strbuffer != buffer) {
|
||||
uprv_free(strbuffer);
|
||||
}
|
||||
|
||||
return (uint32_t) UCOL_NULLORDER;
|
||||
}
|
||||
}
|
||||
|
||||
rawOffset = temp.pos - temp.string;
|
||||
CE = ucol_IGetNextCE(coll, &temp, status);
|
||||
}
|
||||
|
||||
if (source->offsetRepeatValue != 0) {
|
||||
if (CECount > noChars) {
|
||||
source->offsetRepeatCount += temp.offsetRepeatCount;
|
||||
} else {
|
||||
// **** does this really skip the right offsets? ****
|
||||
source->offsetReturn -= (noChars - CECount);
|
||||
}
|
||||
}
|
||||
|
||||
freeHeapWritableBuffer(&temp);
|
||||
|
||||
if (strbuffer != buffer) {
|
||||
uprv_free(strbuffer);
|
||||
}
|
||||
|
||||
if (offsetBias >= 0) {
|
||||
source->offsetReturn = source->offsetStore - 1;
|
||||
if (source->offsetReturn == source->offsetBuffer) {
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
source->toReturn = source->CEpos - 1;
|
||||
if (source->toReturn == source->CEs) {
|
||||
source->CEpos = source->CEs;
|
||||
}
|
||||
|
||||
return *(source->toReturn);
|
||||
|
||||
case LONG_PRIMARY_TAG:
|
||||
{
|
||||
*(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
|
||||
*(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
|
||||
source->toReturn = source->CEpos - 1;
|
||||
|
||||
if (source->offsetBuffer == NULL) {
|
||||
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
if (source->flags & UCOL_ITER_INNORMBUF) {
|
||||
source->offsetRepeatCount = 1;
|
||||
} else {
|
||||
int32_t firstOffset = (int32_t)(source->pos - source->string);
|
||||
|
||||
*(source->offsetStore++) = firstOffset;
|
||||
*(source->offsetStore++) = firstOffset + 1;
|
||||
|
||||
source->offsetReturn = source->offsetStore - 1;
|
||||
*(source->offsetBuffer) = firstOffset;
|
||||
if (source->offsetReturn == source->offsetBuffer) {
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return *(source->toReturn);
|
||||
}
|
||||
|
||||
case EXPANSION_TAG: /* this tag always returns */
|
||||
{
|
||||
/*
|
||||
This should handle expansion.
|
||||
NOTE: we can encounter both continuations and expansions in an expansion!
|
||||
I have to decide where continuations are going to be dealt with
|
||||
*/
|
||||
int32_t firstOffset = (int32_t)(source->pos - source->string);
|
||||
|
||||
// **** doesn't work if using iterator ****
|
||||
if (source->offsetReturn != NULL) {
|
||||
if (source->offsetReturn == source->offsetBuffer) {
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}else {
|
||||
firstOffset = -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (source->offsetBuffer == NULL) {
|
||||
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
/* find the offset to expansion table */
|
||||
CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
|
||||
size = getExpansionCount(CE);
|
||||
|
@ -3539,23 +3815,45 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
if there are less than 16 elements in expansion, we don't terminate
|
||||
*/
|
||||
uint32_t count;
|
||||
|
||||
for (count = 0; count < size; count++) {
|
||||
*(source->CEpos ++) = *CEOffset++;
|
||||
|
||||
if (firstOffset >= 0) {
|
||||
*(source->offsetStore ++) = firstOffset + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
/* else, we do */
|
||||
while (*CEOffset != 0) {
|
||||
*(source->CEpos ++) = *CEOffset ++;
|
||||
|
||||
if (firstOffset >= 0) {
|
||||
*(source->offsetStore ++) = firstOffset + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (firstOffset >= 0) {
|
||||
source->offsetReturn = source->offsetStore - 1;
|
||||
*(source->offsetBuffer) = firstOffset;
|
||||
if (source->offsetReturn == source->offsetBuffer) {
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
} else {
|
||||
source->offsetRepeatCount += size - 1;
|
||||
}
|
||||
|
||||
source->toReturn = source->CEpos - 1;
|
||||
// in case of one element expansion, we
|
||||
// want to immediately return CEpos
|
||||
if(source->toReturn == source->CEs) {
|
||||
source->CEpos = source->CEs;
|
||||
}
|
||||
|
||||
return *(source->toReturn);
|
||||
}
|
||||
|
||||
case DIGIT_TAG:
|
||||
{
|
||||
/*
|
||||
|
@ -3592,7 +3890,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
handle surrogates...
|
||||
*/
|
||||
|
||||
if (U16_IS_TRAIL (ch)){
|
||||
if (U16_IS_TRAIL (ch)) {
|
||||
if (!collIter_bos(source)){
|
||||
UChar lead = getPrevNormalizedChar(source, status);
|
||||
if(U16_IS_LEAD(lead)) {
|
||||
|
@ -3609,12 +3907,11 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
}
|
||||
digVal = u_charDigitValue(char32);
|
||||
|
||||
for(;;){
|
||||
for(;;) {
|
||||
// Make sure we have enough space.
|
||||
if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
|
||||
{
|
||||
if (digIndx >= ((numTempBufSize - 2) * 2) + 1) {
|
||||
numTempBufSize *= 2;
|
||||
if (numTempBuf == stackNumTempBuf){
|
||||
if (numTempBuf == stackNumTempBuf) {
|
||||
numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
|
||||
// Null pointer check
|
||||
if (numTempBuf == NULL) {
|
||||
|
@ -3622,7 +3919,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
return 0;
|
||||
}
|
||||
uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
|
||||
}else {
|
||||
} else {
|
||||
uint8_t *temp = (uint8_t *)uprv_realloc(numTempBuf, numTempBufSize);
|
||||
if (temp == NULL) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
|
@ -3637,7 +3934,8 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
// Skip over trailing zeroes, and keep a count of them.
|
||||
if (digVal != 0)
|
||||
nonZeroValReached = TRUE;
|
||||
if (nonZeroValReached){
|
||||
|
||||
if (nonZeroValReached) {
|
||||
/*
|
||||
We parse the digit string into base 100 numbers (this fits into a byte).
|
||||
We only add to the buffer in twos, thus if we are parsing an odd character,
|
||||
|
@ -3651,7 +3949,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
ones place and the second digit encountered into the tens place.
|
||||
*/
|
||||
|
||||
if ((digIndx + trailingZeroCount) % 2 == 1){
|
||||
if ((digIndx + trailingZeroCount) % 2 == 1) {
|
||||
// High-order digit case (tens place)
|
||||
collateVal += (uint8_t)(digVal * 10);
|
||||
|
||||
|
@ -3665,37 +3963,33 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
|
||||
numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
|
||||
collateVal = 0;
|
||||
}
|
||||
else{
|
||||
} else {
|
||||
// Low-order digit case (ones place)
|
||||
collateVal = (uint8_t)digVal;
|
||||
|
||||
// Check for leading zeroes.
|
||||
if (collateVal == 0)
|
||||
{
|
||||
if (collateVal == 0) {
|
||||
if (!leadingZeroIndex)
|
||||
leadingZeroIndex = (digIndx/2) + 2;
|
||||
}
|
||||
else
|
||||
} else
|
||||
leadingZeroIndex = 0;
|
||||
|
||||
// No need to write to buffer; the case of a last odd digit
|
||||
// is handled below.
|
||||
}
|
||||
++digIndx;
|
||||
}
|
||||
else
|
||||
} else
|
||||
++trailingZeroCount;
|
||||
|
||||
if (!collIter_bos(source)){
|
||||
if (!collIter_bos(source)) {
|
||||
ch = getPrevNormalizedChar(source, status);
|
||||
//goBackOne(source);
|
||||
if (U16_IS_TRAIL(ch)){
|
||||
if (U16_IS_TRAIL(ch)) {
|
||||
backupState(source, &state);
|
||||
if (!collIter_bos(source))
|
||||
{
|
||||
if (!collIter_bos(source)) {
|
||||
goBackOne(source);
|
||||
UChar lead = getPrevNormalizedChar(source, status);
|
||||
|
||||
if(U16_IS_LEAD(lead)) {
|
||||
char32 = U16_GET_SUPPLEMENTARY(lead,ch);
|
||||
} else {
|
||||
|
@ -3703,11 +3997,10 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
char32 = ch;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
} else
|
||||
char32 = ch;
|
||||
|
||||
if ((digVal = u_charDigitValue(char32)) == -1){
|
||||
if ((digVal = u_charDigitValue(char32)) == -1) {
|
||||
if (char32 > 0xFFFF) {// For surrogates.
|
||||
loadState(source, &state, FALSE);
|
||||
}
|
||||
|
@ -3717,22 +4010,23 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
//getNextNormalizedChar(source);
|
||||
break;
|
||||
}
|
||||
|
||||
goBackOne(source);
|
||||
}else
|
||||
break;
|
||||
}
|
||||
|
||||
if (nonZeroValReached == FALSE){
|
||||
if (! nonZeroValReached) {
|
||||
digIndx = 2;
|
||||
trailingZeroCount = 0;
|
||||
numTempBuf[2] = 6;
|
||||
}
|
||||
|
||||
if ((digIndx + trailingZeroCount) % 2 != 0){
|
||||
if ((digIndx + trailingZeroCount) % 2 != 0) {
|
||||
numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
|
||||
digIndx += 1; // The implicit leading zero
|
||||
}
|
||||
if (trailingZeroCount % 2 != 0){
|
||||
if (trailingZeroCount % 2 != 0) {
|
||||
// We had to consume one trailing zero for the low digit
|
||||
// of the least significant byte
|
||||
digIndx += 1; // The trailing zero not in the exponent
|
||||
|
@ -3764,8 +4058,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
(UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
|
||||
UCOL_BYTE_COMMON; // Tertiary weight.
|
||||
i = endIndex - 1; // Reset the index into the buffer.
|
||||
while(i >= 2)
|
||||
{
|
||||
while(i >= 2) {
|
||||
primWeight = numTempBuf[i--] << 8;
|
||||
if ( i >= 2)
|
||||
primWeight |= numTempBuf[i--];
|
||||
|
@ -3776,13 +4069,13 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
|
||||
source->toReturn = source->CEpos -1;
|
||||
return *(source->toReturn);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
|
||||
CE = *(CEOffset++);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
|
||||
{
|
||||
static const uint32_t
|
||||
|
@ -3809,18 +4102,37 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
V += VBase;
|
||||
T += TBase;
|
||||
|
||||
if (source->offsetBuffer == NULL) {
|
||||
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
int32_t firstOffset = (int32_t)(source->pos - source->string);
|
||||
|
||||
*(source->offsetStore++) = firstOffset;
|
||||
|
||||
/*
|
||||
return the first CE, but first put the rest into the expansion buffer
|
||||
*/
|
||||
if (!source->coll->image->jamoSpecial)
|
||||
{
|
||||
* return the first CE, but first put the rest into the expansion buffer
|
||||
*/
|
||||
if (!source->coll->image->jamoSpecial) {
|
||||
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
|
||||
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
|
||||
if (T != TBase)
|
||||
*(source->offsetStore++) = firstOffset + 1;
|
||||
|
||||
if (T != TBase) {
|
||||
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
|
||||
*(source->offsetStore++) = firstOffset + 1;
|
||||
}
|
||||
|
||||
source->toReturn = source->CEpos - 1;
|
||||
return *(source->toReturn);
|
||||
|
||||
source->offsetReturn = source->offsetStore - 1;
|
||||
if (source->offsetReturn == source->offsetBuffer) {
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
return *(source->toReturn);
|
||||
} else {
|
||||
// Since Hanguls pass the FCD check, it is
|
||||
// guaranteed that we won't be in
|
||||
|
@ -3862,18 +4174,46 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
return(UCOL_IGNORABLE);
|
||||
}
|
||||
}
|
||||
|
||||
case IMPLICIT_TAG: /* everything that is not defined otherwise */
|
||||
#if 0
|
||||
if (source->offsetBuffer == NULL) {
|
||||
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
// **** doesn't work if using iterator ****
|
||||
if (source->flags & UCOL_ITER_INNORMBUF) {
|
||||
source->offsetRepeatCount = 1;
|
||||
} else {
|
||||
int32_t firstOffset = (int32_t)(source->pos - source->string);
|
||||
|
||||
*(source->offsetStore++) = firstOffset;
|
||||
*(source->offsetStore++) = firstOffset + 1;
|
||||
|
||||
source->offsetReturn = source->offsetStore - 1;
|
||||
if (source->offsetReturn == source->offsetBuffer) {
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return getPrevImplicit(ch, source);
|
||||
|
||||
// TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
|
||||
case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
|
||||
return getPrevImplicit(ch, source);
|
||||
|
||||
case SURROGATE_TAG: /* This is a surrogate pair */
|
||||
/* essentially an engaged lead surrogate. */
|
||||
/* if you have encountered it here, it means that a */
|
||||
/* broken sequence was encountered and this is an error */
|
||||
return 0;
|
||||
|
||||
case LEAD_SURROGATE_TAG: /* D800-DBFF*/
|
||||
return 0; /* broken surrogate sequence */
|
||||
|
||||
case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
|
||||
{
|
||||
UChar32 cp = 0;
|
||||
|
@ -3897,22 +4237,27 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
} else {
|
||||
return 0; /* completely ignorable */
|
||||
}
|
||||
|
||||
return getPrevImplicit(cp, source);
|
||||
}
|
||||
|
||||
/* UCA is filled with these. Tailorings are NOT_FOUND */
|
||||
/* not yet implemented */
|
||||
case CHARSET_TAG: /* this tag always returns */
|
||||
/* probably after 1.8 */
|
||||
return UCOL_NOT_FOUND;
|
||||
|
||||
default: /* this tag always returns */
|
||||
*status = U_INTERNAL_PROGRAM_ERROR;
|
||||
CE=0;
|
||||
break;
|
||||
}
|
||||
|
||||
if (CE <= UCOL_NOT_FOUND) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return CE;
|
||||
}
|
||||
|
||||
|
|
|
@ -270,6 +270,12 @@ typedef struct collIterate {
|
|||
|
||||
uint32_t *toReturn; /* This is the CE from CEs buffer that should be returned */
|
||||
uint32_t *CEpos; /* This is the position to which we have stored processed CEs */
|
||||
|
||||
int32_t *offsetReturn; /* This is the offset to return, if non-NULL */
|
||||
int32_t *offsetStore; /* This is the pointer for storing offsets */
|
||||
int32_t offsetRepeatCount; /* Repeat stored offset if non-zero */
|
||||
int32_t offsetRepeatValue; /* offset value to repeat */
|
||||
|
||||
UChar *writableBuffer;
|
||||
uint32_t writableBufSize;
|
||||
UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
|
||||
|
@ -280,6 +286,10 @@ typedef struct collIterate {
|
|||
int32_t extendCEsSize; /* Holds the size of the dynamic CEs buffer */
|
||||
uint32_t CEs[UCOL_EXPAND_CE_BUFFER_SIZE]; /* This is where we store CEs */
|
||||
UChar stackWritableBuffer[UCOL_WRITABLE_BUFFER_SIZE]; /* A writable buffer. */
|
||||
|
||||
int32_t *offsetBuffer; /* A dynamic buffer to hold offsets */
|
||||
int32_t offsetBufferSize; /* The size of the offset buffer */
|
||||
|
||||
UCharIterator *iterator;
|
||||
/*int32_t iteratorIndex;*/
|
||||
} collIterate;
|
||||
|
@ -293,6 +303,7 @@ data similar to collIterate.
|
|||
*/
|
||||
struct collIterateState {
|
||||
UChar *pos; /* This is position in the string. Can be to original or writable buf */
|
||||
UChar *returnPos;
|
||||
UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
|
||||
UChar *bufferaddress; /* address of the normalization buffer */
|
||||
uint32_t buffersize;
|
||||
|
@ -305,6 +316,8 @@ struct collIterateState {
|
|||
U_CAPI void U_EXPORT2
|
||||
uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, int32_t sourceLen, collIterate *s);
|
||||
|
||||
struct UCollationPCE;
|
||||
typedef struct UCollationPCE UCollationPCE;
|
||||
|
||||
struct UCollationElements
|
||||
{
|
||||
|
@ -320,9 +333,17 @@ struct UCollationElements
|
|||
* Indicates if the data should be deleted.
|
||||
*/
|
||||
UBool isWritable;
|
||||
|
||||
/**
|
||||
* Data for getNextProcessed, getPreviousProcessed.
|
||||
*/
|
||||
UCollationPCE *pce;
|
||||
};
|
||||
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uprv_init_pce(const struct UCollationElements *elems);
|
||||
|
||||
#define UCOL_LEVELTERMINATOR 1
|
||||
|
||||
/* mask off anything but primary order */
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include "unicode/ucoleitr.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/sortkey.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "ucol_imp.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
|
@ -27,8 +28,269 @@ U_NAMESPACE_USE
|
|||
|
||||
#define BUFFER_LENGTH 100

/* Capacity of the inline CE buffers, and the number of slots added each time one grows. */
#define DEFAULT_BUFFER_SIZE 16
#define BUFFER_GROW 8

/*
 * Helper macros for raw arrays. Arguments and results are fully parenthesized
 * so the macros evaluate correctly when embedded in a larger expression
 * (e.g. 100 / ARRAY_SIZE(a)) or when passed a non-trivial expression.
 */

/* Number of elements in a true array (not a pointer). */
#define ARRAY_SIZE(array) (sizeof (array) / sizeof (array)[0])

/* Copy `count` elements from src to dst; the element size is taken from src. */
#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])

/* Allocate an uninitialized array of `count` elements of `type` with uprv_malloc. */
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))

/* Resize an array to `newSize` elements with uprv_realloc; the result is an untyped pointer. */
#define GROW_ARRAY(array, newSize) (uprv_realloc((void *) (array), (newSize) * sizeof (array)[0]))

/* Release an array obtained from NEW_ARRAY / GROW_ARRAY. */
#define DELETE_ARRAY(array) uprv_free((void *) (array))

typedef struct collIterate collIterator;
|
||||
|
||||
/* One raw (32-bit) collation element together with the two text offsets stored alongside it. */
struct RCEI
{
    uint32_t ce;         /* the raw collation element */
    int32_t  low, high;  /* lower and upper text offsets recorded for this CE */
};
|
||||
|
||||
struct RCEBuffer
|
||||
{
|
||||
RCEI defaultBuffer[DEFAULT_BUFFER_SIZE];
|
||||
RCEI *buffer;
|
||||
int32_t bufferIndex;
|
||||
int32_t bufferSize;
|
||||
|
||||
RCEBuffer();
|
||||
~RCEBuffer();
|
||||
|
||||
UBool empty() const;
|
||||
void put(uint32_t ce, int32_t ixLow, int32_t ixHigh);
|
||||
const RCEI *get();
|
||||
};
|
||||
|
||||
/* Start out empty, backed by the inline default storage. */
RCEBuffer::RCEBuffer()
    : buffer(defaultBuffer),
      bufferIndex(0),
      bufferSize(DEFAULT_BUFFER_SIZE)
{
}
|
||||
|
||||
/* Release the heap array, if put() ever had to move off the inline storage. */
RCEBuffer::~RCEBuffer()
{
    if (buffer == defaultBuffer) {
        return; /* still using the inline storage: nothing to free */
    }

    DELETE_ARRAY(buffer);
}
|
||||
|
||||
/* Returns TRUE when no entries are waiting to be retrieved with get(). */
UBool RCEBuffer::empty() const
{
    const UBool hasEntries = bufferIndex > 0;

    return !hasEntries;
}
|
||||
|
||||
void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh)
|
||||
{
|
||||
if (bufferIndex >= bufferSize) {
|
||||
RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
|
||||
|
||||
ARRAY_COPY(newBuffer, buffer, bufferSize);
|
||||
|
||||
if (buffer != defaultBuffer) {
|
||||
DELETE_ARRAY(buffer);
|
||||
}
|
||||
|
||||
buffer = newBuffer;
|
||||
bufferSize += BUFFER_GROW;
|
||||
}
|
||||
|
||||
buffer[bufferIndex].ce = ce;
|
||||
buffer[bufferIndex].low = ixLow;
|
||||
buffer[bufferIndex].high = ixHigh;
|
||||
|
||||
bufferIndex += 1;
|
||||
}
|
||||
|
||||
/*
 * Pop the most recently stored entry.
 *
 * @return a pointer to the entry inside the buffer's storage, or NULL when
 *         the buffer is empty.
 */
const RCEI *RCEBuffer::get()
{
    if (bufferIndex <= 0) {
        return NULL;
    }

    bufferIndex -= 1;

    return buffer + bufferIndex;
}
|
||||
|
||||
/* One processed (64-bit) collation element together with the two text offsets stored alongside it. */
struct PCEI
{
    uint64_t ce;         /* the processed collation element */
    int32_t  low, high;  /* lower and upper text offsets recorded for this CE */
};
|
||||
|
||||
struct PCEBuffer
|
||||
{
|
||||
PCEI defaultBuffer[DEFAULT_BUFFER_SIZE];
|
||||
PCEI *buffer;
|
||||
int32_t bufferIndex;
|
||||
int32_t bufferSize;
|
||||
|
||||
PCEBuffer();
|
||||
~PCEBuffer();
|
||||
|
||||
void reset();
|
||||
UBool empty() const;
|
||||
void put(uint64_t ce, int32_t ixLow, int32_t ixHigh);
|
||||
const PCEI *get();
|
||||
};
|
||||
|
||||
/* Start out empty, backed by the inline default storage. */
PCEBuffer::PCEBuffer()
    : buffer(defaultBuffer),
      bufferIndex(0),
      bufferSize(DEFAULT_BUFFER_SIZE)
{
}
|
||||
|
||||
/* Release the heap array, if put() ever had to move off the inline storage. */
PCEBuffer::~PCEBuffer()
{
    if (buffer == defaultBuffer) {
        return; /* still using the inline storage: nothing to free */
    }

    DELETE_ARRAY(buffer);
}
|
||||
|
||||
void PCEBuffer::reset()
|
||||
{
|
||||
bufferIndex = 0;
|
||||
}
|
||||
|
||||
/* Returns TRUE when no entries are waiting to be retrieved with get(). */
UBool PCEBuffer::empty() const
{
    const UBool hasEntries = bufferIndex > 0;

    return !hasEntries;
}
|
||||
|
||||
void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh)
|
||||
{
|
||||
if (bufferIndex >= bufferSize) {
|
||||
PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
|
||||
|
||||
ARRAY_COPY(newBuffer, buffer, bufferSize);
|
||||
|
||||
if (buffer != defaultBuffer) {
|
||||
DELETE_ARRAY(buffer);
|
||||
}
|
||||
|
||||
buffer = newBuffer;
|
||||
bufferSize += BUFFER_GROW;
|
||||
}
|
||||
|
||||
buffer[bufferIndex].ce = ce;
|
||||
buffer[bufferIndex].low = ixLow;
|
||||
buffer[bufferIndex].high = ixHigh;
|
||||
|
||||
bufferIndex += 1;
|
||||
}
|
||||
|
||||
/*
 * Pop the most recently stored entry.
 *
 * @return a pointer to the entry inside the buffer's storage, or NULL when
 *         the buffer is empty.
 */
const PCEI *PCEBuffer::get()
{
    if (bufferIndex <= 0) {
        return NULL;
    }

    bufferIndex -= 1;

    return buffer + bufferIndex;
}
|
||||
|
||||
/*
|
||||
* This inherits from UObject so that
|
||||
* it can be allocated by new and the
|
||||
* constructor for PCEBuffer is called.
|
||||
*/
|
||||
struct UCollationPCE : public UObject
|
||||
{
|
||||
PCEBuffer pceBuffer;
|
||||
UCollationStrength strength;
|
||||
UBool toShift;
|
||||
UBool isShifted;
|
||||
uint32_t variableTop;
|
||||
|
||||
UCollationPCE(UCollationElements *elems);
|
||||
~UCollationPCE();
|
||||
|
||||
void init(const UCollator *coll);
|
||||
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
static UClassID getStaticClassID();
|
||||
};
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE)
|
||||
|
||||
/* Build the processed-CE state for an iterator, taking its settings from the iterator's collator. */
UCollationPCE::UCollationPCE(UCollationElements *elems)
{
    const UCollator *collator = elems->iteratordata_.coll;

    init(collator);
}
|
||||
|
||||
void UCollationPCE::init(const UCollator *coll)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
strength = ucol_getStrength(coll);
|
||||
toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
|
||||
isShifted = FALSE;
|
||||
variableTop = coll->variableTopValue << 16;
|
||||
}
|
||||
|
||||
/* No explicit cleanup: the pceBuffer member releases its own storage. */
UCollationPCE::~UCollationPCE()
{
}
|
||||
|
||||
/*
 * Convert a raw 32-bit CE into a 64-bit processed CE laid out as
 *     primary << 48 | secondary << 32 | tertiary << 16 | quaternary
 * keeping only the weights that matter at the iterator's strength.
 *
 * When the CE is treated as shifted, its primary weight moves to the
 * quaternary position (at quaternary strength or above) and the other
 * weights are zeroed. A CE with a zero primary that follows a shifted CE
 * yields UCOL_IGNORABLE.
 *
 * Updates elems->pce->isShifted as a side effect.
 *
 * @param elems iterator whose pce state supplies strength and shifting mode
 * @param ce    the raw collation element
 * @return the processed CE, or UCOL_IGNORABLE
 */
inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
{
    UCollationPCE *pce = elems->pce;
    const UCollationStrength strength = pce->strength;
    uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;

    // This is clean, but somewhat slow...
    // We could apply the mask to ce and then
    // just get all three orders...
    if (strength != UCOL_PRIMARY) {
        if (strength != UCOL_SECONDARY) {
            tertiary = ucol_tertiaryOrder(ce);
        }

        secondary = ucol_secondaryOrder(ce);
    }

    primary = ucol_primaryOrder(ce);

    // Continuation?
    const UBool beginsShift    = pce->toShift && pce->variableTop > ce && primary != 0;
    const UBool continuesShift = pce->isShifted && primary == 0;

    if (!(beginsShift || continuesShift)) {
        if (strength >= UCOL_QUATERNARY) {
            quaternary = 0xFFFF;
        }

        pce->isShifted = FALSE;
    } else {
        if (primary == 0) {
            // follows a shifted CE and has no primary weight: drop it,
            // leaving isShifted untouched
            return UCOL_IGNORABLE;
        }

        if (strength >= UCOL_QUATERNARY) {
            quaternary = primary;
        }

        primary = secondary = tertiary = 0;
        pce->isShifted = TRUE;
    }

    return (primary << 48) | (secondary << 32) | (tertiary << 16) | quaternary;
}
|
||||
|
||||
/*
 * Refresh the processed-CE state of an iterator from its collator.
 * Does nothing when the iterator has no processed-CE state yet.
 */
U_CAPI void U_EXPORT2
uprv_init_pce(const UCollationElements *elems)
{
    UCollationPCE *state = elems->pce;

    if (state == NULL) {
        return;
    }

    state->init(elems->iteratordata_.coll);
}
|
||||
|
||||
/* public methods ---------------------------------------------------- */
|
||||
|
||||
U_CAPI UCollationElements* U_EXPORT2
|
||||
|
@ -50,8 +312,9 @@ ucol_openElements(const UCollator *coll,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
result->reset_ = TRUE;
|
||||
result->isWritable = FALSE;
|
||||
result->reset_ = TRUE;
|
||||
result->isWritable = FALSE;
|
||||
result->pce = NULL;
|
||||
|
||||
if (text == NULL) {
|
||||
textLength = 0;
|
||||
|
@ -64,22 +327,34 @@ ucol_openElements(const UCollator *coll,
|
|||
U_CAPI void U_EXPORT2
|
||||
ucol_closeElements(UCollationElements *elems)
|
||||
{
|
||||
if (elems != NULL) {
|
||||
collIterate *ci = &elems->iteratordata_;
|
||||
if (ci != NULL) {
|
||||
if (ci->writableBuffer != ci->stackWritableBuffer) {
|
||||
uprv_free(ci->writableBuffer);
|
||||
}
|
||||
if (ci->extendCEs) {
|
||||
uprv_free(ci->extendCEs);
|
||||
}
|
||||
}
|
||||
if (elems->isWritable && elems->iteratordata_.string != NULL)
|
||||
{
|
||||
uprv_free(elems->iteratordata_.string);
|
||||
}
|
||||
uprv_free(elems);
|
||||
}
|
||||
if (elems != NULL) {
|
||||
collIterate *ci = &elems->iteratordata_;
|
||||
|
||||
if (ci != NULL) {
|
||||
if (ci->writableBuffer != ci->stackWritableBuffer) {
|
||||
uprv_free(ci->writableBuffer);
|
||||
}
|
||||
|
||||
if (ci->extendCEs) {
|
||||
uprv_free(ci->extendCEs);
|
||||
}
|
||||
|
||||
if (ci->offsetBuffer) {
|
||||
uprv_free(ci->offsetBuffer);
|
||||
}
|
||||
}
|
||||
|
||||
if (elems->isWritable && elems->iteratordata_.string != NULL)
|
||||
{
|
||||
uprv_free(elems->iteratordata_.string);
|
||||
}
|
||||
|
||||
if (elems->pce != NULL) {
|
||||
delete elems->pce;
|
||||
}
|
||||
|
||||
uprv_free(elems);
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
|
@ -103,6 +378,9 @@ ucol_reset(UCollationElements *elems)
|
|||
ci->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
|
||||
}
|
||||
ci->fcdPosition = NULL;
|
||||
|
||||
//ci->offsetReturn = ci->offsetStore = NULL;
|
||||
ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
|
@ -126,6 +404,52 @@ ucol_next(UCollationElements *elems,
|
|||
return result;
|
||||
}
|
||||
|
||||
/*
 * Return the next non-ignorable processed collation element.
 *
 * Raw CEs are fetched and run through processCE() until one produces a
 * result other than UCOL_IGNORABLE or the iterator is exhausted.
 *
 * @param elems  the collation element iterator
 * @param ixLow  if non-NULL, receives the iterator offset before the last CE fetched
 * @param ixHigh if non-NULL, receives the iterator offset after the last CE fetched
 * @param status error code; set to U_MEMORY_ALLOCATION_ERROR if the
 *               processed-CE state cannot be allocated
 * @return the processed CE, or UCOL_PROCESSED_NULLORDER on error or when
 *         there are no more CEs (ixLow / ixHigh are not written on error)
 */
U_CAPI int64_t U_EXPORT2
ucol_nextProcessed(UCollationElements *elems,
                   int32_t            *ixLow,
                   int32_t            *ixHigh,
                   UErrorCode         *status)
{
    const UCollator *coll = elems->iteratordata_.coll;
    int64_t result = UCOL_IGNORABLE;
    int32_t low = 0, high = 0; /* same type as ucol_getOffset() and the out-params */

    if (U_FAILURE(*status)) {
        return UCOL_PROCESSED_NULLORDER;
    }

    if (elems->pce == NULL) {
        elems->pce = new UCollationPCE(elems);

        /* ICU's operator new reports failure by returning NULL, not by throwing */
        if (elems->pce == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return UCOL_PROCESSED_NULLORDER;
        }
    } else {
        elems->pce->pceBuffer.reset();
    }

    elems->reset_ = FALSE;

    do {
        low = ucol_getOffset(elems);
        uint32_t ce = (uint32_t) ucol_getNextCE(coll, &elems->iteratordata_, status);
        high = ucol_getOffset(elems);

        if (ce == UCOL_NO_MORE_CES) {
            result = UCOL_PROCESSED_NULLORDER;
            break;
        }

        result = processCE(elems, ce);
    } while (result == UCOL_IGNORABLE);

    if (ixLow != NULL) {
        *ixLow = low;
    }

    if (ixHigh != NULL) {
        *ixHigh = high;
    }

    return result;
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucol_previous(UCollationElements *elems,
|
||||
UErrorCode *status)
|
||||
|
@ -161,12 +485,162 @@ ucol_previous(UCollationElements *elems,
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Return the previous non-ignorable processed collation element.
 *
 * Raw CEs are read backwards until one with a non-zero primary weight is
 * found. That run is then processed in the opposite order, and the
 * non-ignorable results are pushed onto the iterator's processed-CE buffer.
 * Each call pops one entry; entries left over from an earlier call are
 * returned before any new raw CEs are fetched.
 *
 * @param elems  the collation element iterator
 * @param ixLow  if non-NULL, receives the iterator offset after fetching the CE
 *               (-1 when there are no more CEs)
 * @param ixHigh if non-NULL, receives the iterator offset before fetching the CE
 *               (-1 when there are no more CEs)
 * @param status error code; set to U_MEMORY_ALLOCATION_ERROR if the
 *               processed-CE state cannot be allocated
 * @return the processed CE, or UCOL_PROCESSED_NULLORDER on error or when
 *         the start of the text has been reached (ixLow / ixHigh are not
 *         written on error)
 */
U_CAPI int64_t U_EXPORT2
ucol_previousProcessed(UCollationElements *elems,
                       int32_t            *ixLow,
                       int32_t            *ixHigh,
                       UErrorCode         *status)
{
    const UCollator *coll = elems->iteratordata_.coll;
    int64_t result = UCOL_IGNORABLE;
    int32_t low = 0, high = 0; /* same type as ucol_getOffset() and RCEBuffer::put() */

    if (U_FAILURE(*status)) {
        return UCOL_PROCESSED_NULLORDER;
    }

    /* a freshly reset iterator sitting at the start of the text is moved to the end */
    if (elems->reset_ &&
        (elems->iteratordata_.pos == elems->iteratordata_.string)) {
        if (elems->iteratordata_.endp == NULL) {
            elems->iteratordata_.endp = elems->iteratordata_.string +
                                        u_strlen(elems->iteratordata_.string);
            elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
        }

        elems->iteratordata_.pos = elems->iteratordata_.endp;
        elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
    }

    /*
     * Unlike ucol_nextProcessed(), an existing pceBuffer is NOT reset here:
     * it may still hold processed CEs buffered by a previous call.
     */
    if (elems->pce == NULL) {
        elems->pce = new UCollationPCE(elems);

        /* ICU's operator new reports failure by returning NULL, not by throwing */
        if (elems->pce == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return UCOL_PROCESSED_NULLORDER;
        }
    }

    elems->reset_ = FALSE;

    while (elems->pce->pceBuffer.empty()) {
        // buffer raw CEs up to non-ignorable primary
        RCEBuffer rceb;
        uint32_t ce;

        // **** do we need to reset rceb, or will it always be empty at this point ****
        do {
            high = ucol_getOffset(elems);
            ce   = ucol_getPrevCE(coll, &elems->iteratordata_, status);
            low  = ucol_getOffset(elems);

            if (ce == UCOL_NO_MORE_CES) {
                if (! rceb.empty()) {
                    break; /* still process what was buffered before the text ran out */
                }

                goto finish;
            }

            rceb.put(ce, low, high);
        } while ((ce & UCOL_PRIMARYMASK) == 0);

        // process the raw CEs
        while (! rceb.empty()) {
            const RCEI *rcei = rceb.get();

            result = processCE(elems, rcei->ce);

            if (result != UCOL_IGNORABLE) {
                elems->pce->pceBuffer.put(result, rcei->low, rcei->high);
            }
        }
    }

finish:
    if (elems->pce->pceBuffer.empty()) {
        // **** Is -1 the right value for ixLow, ixHigh? ****
        if (ixLow != NULL) {
            *ixLow = -1;
        }

        if (ixHigh != NULL) {
            *ixHigh = -1;
        }

        return UCOL_PROCESSED_NULLORDER;
    }

    const PCEI *pcei = elems->pce->pceBuffer.get();

    if (ixLow != NULL) {
        *ixLow = pcei->low;
    }

    if (ixHigh != NULL) {
        *ixHigh = pcei->high;
    }

    return pcei->ce;
}
|
||||
|
||||
/*
 * Look up the maximum expansion length recorded for a collation order.
 *
 * The order and the collator's end-of-expansion table entries are both
 * masked down to the weights that matter at the collator's strength before
 * they are compared. The table is binary-searched; on a hit the matching
 * entry of expansionCESize is returned. On a miss the result is 2 when the
 * low 16 bits of the masked order are 0x00C0, and 1 otherwise.
 *
 * @param elems iterator whose collator supplies the expansion tables
 * @param order the collation order to look up
 * @return the maximum expansion length for `order`
 */
U_CAPI int32_t U_EXPORT2
ucol_getMaxExpansion(const UCollationElements *elems,
                           int32_t            order)
{
    const UCollator *coll = elems->iteratordata_.coll;

    /* primary weights always take part; secondary and tertiary depend on strength */
    uint32_t strengthMask = 0;

    if (coll->strength != UCOL_PRIMARY) {
        if (coll->strength != UCOL_SECONDARY) {
            strengthMask |= UCOL_TERTIARYORDERMASK;
        }

        strengthMask |= UCOL_SECONDARYORDERMASK;
    }

    strengthMask |= UCOL_PRIMARYORDERMASK;

    const uint32_t key = ((uint32_t) order) & strengthMask;

    /* binary search; `last` is the final table entry and may itself be the match */
    const uint32_t *first = coll->endExpansionCE;
    const uint32_t *last  = coll->lastEndExpansionCE;

    while (first < last - 1) {
        const uint32_t *probe = first + ((last - first) >> 1);

        if (key <= (*probe & strengthMask)) {
            last = probe;
        } else {
            first = probe;
        }
    }

    // FIXME: with a masked search, there might be more than one hit,
    // so we need to look forward and backward from the match to find all
    // of the hits...
    const uint32_t *hit = NULL;

    if ((*first & strengthMask) == key) {
        hit = first;
    } else if ((*last & strengthMask) == key) {
        hit = last;
    }

    uint8_t result;

    if (hit != NULL) {
        result = *(coll->expansionCESize + (hit - coll->endExpansionCE));
    } else if ((key & 0xFFFF) == 0x00C0) {
        result = 2;
    } else {
        result = 1;
    }

    return result;
}
|
||||
|
||||
|
@ -199,21 +673,30 @@ ucol_setText( UCollationElements *elems,
|
|||
U_CAPI int32_t U_EXPORT2
|
||||
ucol_getOffset(const UCollationElements *elems)
|
||||
{
|
||||
const collIterate *ci = &(elems->iteratordata_);
|
||||
// while processing characters in normalization buffer getOffset will
|
||||
// return the next non-normalized character.
|
||||
// should be inline with the old implementation since the old codes uses
|
||||
// nextDecomp in normalizer which also decomposes the string till the
|
||||
// first base character is found.
|
||||
if (ci->flags & UCOL_ITER_INNORMBUF) {
|
||||
if (ci->fcdPosition == NULL) {
|
||||
return 0;
|
||||
}
|
||||
return (int32_t)(ci->fcdPosition - ci->string);
|
||||
}
|
||||
else {
|
||||
return (int32_t)(ci->pos - ci->string);
|
||||
}
|
||||
const collIterate *ci = &(elems->iteratordata_);
|
||||
|
||||
if (ci->offsetRepeatCount > 0 && ci->offsetRepeatValue != 0) {
|
||||
return ci->offsetRepeatValue;
|
||||
}
|
||||
|
||||
if (ci->offsetReturn != NULL) {
|
||||
return *ci->offsetReturn;
|
||||
}
|
||||
|
||||
// while processing characters in normalization buffer getOffset will
|
||||
// return the next non-normalized character.
|
||||
// should be inline with the old implementation since the old codes uses
|
||||
// nextDecomp in normalizer which also decomposes the string till the
|
||||
// first base character is found.
|
||||
if (ci->flags & UCOL_ITER_INNORMBUF) {
|
||||
if (ci->fcdPosition == NULL) {
|
||||
return 0;
|
||||
}
|
||||
return (int32_t)(ci->fcdPosition - ci->string);
|
||||
}
|
||||
else {
|
||||
return (int32_t)(ci->pos - ci->string);
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
|
@ -239,6 +722,10 @@ ucol_setOffset(UCollationElements *elems,
|
|||
}
|
||||
ci->fcdPosition = NULL;
|
||||
elems->reset_ = FALSE;
|
||||
|
||||
ci->offsetReturn = NULL;
|
||||
ci->offsetStore = ci->offsetBuffer;
|
||||
ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2001-2004, International Business Machines
|
||||
* Copyright (C) 2001-2008, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*
|
||||
|
@ -27,6 +27,14 @@
|
|||
*/
|
||||
#define UCOL_NULLORDER ((int32_t)0xFFFFFFFF)
|
||||
|
||||
/**
|
||||
* This indicates an error has occurred during processing or there are no more CEs
|
||||
* to be returned.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
|
||||
|
||||
#include "unicode/ucol.h"
|
||||
|
||||
/**
|
||||
|
@ -175,6 +183,45 @@ ucol_next(UCollationElements *elems, UErrorCode *status);
|
|||
U_STABLE int32_t U_EXPORT2
|
||||
ucol_previous(UCollationElements *elems, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Get the processed ordering priority of the next collation element in the text.
|
||||
* A single character may contain more than one collation element.
|
||||
*
|
||||
* @param elems The UCollationElements containing the text.
|
||||
* @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
|
||||
* @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
|
||||
* @param status A pointer to an UErrorCode to receive any errors.
|
||||
* @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
|
||||
* if an error has occurred or if the end of string has been reached
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
U_INTERNAL int64_t U_EXPORT2
|
||||
ucol_nextProcessed(UCollationElements *elems, int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Get the processed ordering priority of the previous collation element in the text.
|
||||
* A single character may contain more than one collation element.
|
||||
* Note that internally a stack is used to store buffered collation elements.
|
||||
* It is very rare that the stack will overflow, however if such a case is
|
||||
* encountered, the problem can be solved by increasing the size
|
||||
* UCOL_EXPAND_CE_BUFFER_SIZE in ucol_imp.h.
|
||||
*
|
||||
* @param elems The UCollationElements containing the text.
|
||||
* @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
|
||||
* @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE
|
||||
* @param status A pointer to an UErrorCode to receive any errors. Notably
|
||||
* a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
|
||||
* buffer has been exhausted.
|
||||
* @return The previous collation elements ordering, otherwise returns
|
||||
* UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
|
||||
* string has been reached.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
U_INTERNAL int64_t U_EXPORT2
|
||||
ucol_previousProcessed(UCollationElements *elems, int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Get the maximum length of any expansion sequences that end with the
|
||||
* specified comparison order.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001-2007 IBM and others. All rights reserved.
|
||||
* Copyright (C) 2001-2008 IBM and others. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 06/28/2001 synwee Creation.
|
||||
|
@ -641,6 +641,126 @@ U_STABLE int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
|
|||
*/
|
||||
U_STABLE void U_EXPORT2 usearch_reset(UStringSearch *strsrch);
|
||||
|
||||
/**
|
||||
* Simple forward search for the pattern, starting at a specified index,
|
||||
* and using a default set of search options.
|
||||
*
|
||||
* This is an experimental function, and is not an official part of the
|
||||
* ICU API.
|
||||
*
|
||||
* The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZATION_MODE, are honored.
|
||||
*
|
||||
* The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and
|
||||
* any Break Iterator are ignored.
|
||||
*
|
||||
* Matches obey the following constraints:
|
||||
*
|
||||
* Characters at the start or end positions of a match that are ignorable
|
||||
* for collation are not included as part of the match, unless they
|
||||
* are part of a combining sequence, as described below.
|
||||
*
|
||||
* A match will not include a partial combining sequence. Combining
|
||||
* character sequences are considered to be inseparable units,
|
||||
* and either match the pattern completely, or are considered to not match
|
||||
* at all. Thus, for example, an A followed by a combining accent mark will
|
||||
* not be found when searching for a plain (unaccented) A. (unless
|
||||
* the collation strength has been set to ignore all accents).
|
||||
*
|
||||
* When beginning a search, the initial starting position, startIdx,
|
||||
* is assumed to be an acceptable match boundary with respect to
|
||||
* combining characters. A combining sequence that spans across the
|
||||
* starting point will not suppress a match beginning at startIdx.
|
||||
*
|
||||
* Characters that expand to multiple collation elements
|
||||
* (German sharp-S becoming 'ss', or the composed forms of accented
|
||||
* characters, for example) also must match completely.
|
||||
* Searching for a single 's' in a string containing only a sharp-s will
|
||||
* find no match.
|
||||
*
|
||||
*
|
||||
* @param strsrch the UStringSearch struct, which references both
|
||||
* the text to be searched and the pattern being sought.
|
||||
* @param startIdx The index into the text to begin the search.
|
||||
* @param matchStart An out parameter, the starting index of the matched text.
|
||||
* This parameter may be NULL.
|
||||
* A value of -1 will be returned if no match was found.
|
||||
* @param matchLimit Out parameter, the index of the first position following the matched text.
|
||||
* The matchLimit will be at a suitable position for beginning a subsequent search
|
||||
* in the input text.
|
||||
* This parameter may be NULL.
|
||||
* A value of -1 will be returned if no match was found.
|
||||
*
|
||||
* @param status Report any errors. Note that no match found is not an error.
|
||||
* @return TRUE if a match was found, FALSE otherwise.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
U_INTERNAL UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
|
||||
int32_t startIdx,
|
||||
int32_t *matchStart,
|
||||
int32_t *matchLimit,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Simple backwards search for the pattern, starting at a specified index,
|
||||
* and using a default set of search options.
|
||||
*
|
||||
* This is an experimental function, and is not an official part of the
|
||||
* ICU API.
|
||||
*
|
||||
* The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZATION_MODE, are honored.
|
||||
*
|
||||
* The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and
|
||||
* any Break Iterator are ignored.
|
||||
*
|
||||
* Matches obey the following constraints:
|
||||
*
|
||||
* Characters at the start or end positions of a match that are ignorable
|
||||
* for collation are not included as part of the match, unless they
|
||||
* are part of a combining sequence, as described below.
|
||||
*
|
||||
* A match will not include a partial combining sequence. Combining
|
||||
* character sequences are considered to be inseparable units,
|
||||
* and either match the pattern completely, or are considered to not match
|
||||
* at all. Thus, for example, an A followed by a combining accent mark will
|
||||
* not be found when searching for a plain (unaccented) A. (unless
|
||||
* the collation strength has been set to ignore all accents).
|
||||
*
|
||||
* When beginning a search, the initial starting position, startIdx,
|
||||
* is assumed to be an acceptable match boundary with respect to
|
||||
* combining characters. A combining sequence that spans across the
|
||||
* starting point will not suppress a match beginning at startIdx.
|
||||
*
|
||||
* Characters that expand to multiple collation elements
|
||||
* (German sharp-S becoming 'ss', or the composed forms of accented
|
||||
* characters, for example) also must match completely.
|
||||
* Searching for a single 's' in a string containing only a sharp-s will
|
||||
* find no match.
|
||||
*
|
||||
*
|
||||
* @param strsrch the UStringSearch struct, which references both
|
||||
* the text to be searched and the pattern being sought.
|
||||
* @param startIdx The index into the text to begin the search.
|
||||
* @param matchStart An out parameter, the starting index of the matched text.
|
||||
* This parameter may be NULL.
|
||||
* A value of -1 will be returned if no match was found.
|
||||
* @param matchLimit Out parameter, the index of the first position following the matched text.
|
||||
* The matchLimit will be at a suitable position for beginning a subsequent search
|
||||
* in the input text.
|
||||
* This parameter may be NULL.
|
||||
* A value of -1 will be returned if no match was found.
|
||||
*
|
||||
* @param status Report any errors. Note that no match found is not an error.
|
||||
* @return TRUE if a match was found, FALSE otherwise.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
U_INTERNAL UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
|
||||
int32_t startIdx,
|
||||
int32_t *matchStart,
|
||||
int32_t *matchLimit,
|
||||
UErrorCode *status);
|
||||
|
||||
#endif /* #if !UCONFIG_NO_COLLATION */
|
||||
|
||||
#endif
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001-2007 IBM and others. All rights reserved.
|
||||
* Copyright (C) 2001-2008 IBM and others. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 08/13/2001 synwee Creation.
|
||||
|
@ -31,8 +31,8 @@ struct USearch {
|
|||
// value USEARCH_DONE is the default value
|
||||
// if we are not at the start of the text or the end of the text,
|
||||
// depending on the iteration direction and matchedIndex is USEARCH_DONE
|
||||
// it means that we can find any more matches in that particular direction
|
||||
int32_t matchedIndex;
|
||||
// it means that we can't find any more matches in that particular direction
|
||||
int32_t matchedIndex;
|
||||
int32_t matchedLength;
|
||||
UBool isForwardSearching;
|
||||
UBool reset;
|
||||
|
@ -45,6 +45,9 @@ struct UPattern {
|
|||
int32_t CELength;
|
||||
int32_t *CE;
|
||||
int32_t CEBuffer[INITIAL_ARRAY_SIZE_];
|
||||
int32_t PCELength;
|
||||
int64_t *PCE;
|
||||
int64_t PCEBuffer[INITIAL_ARRAY_SIZE_];
|
||||
UBool hasPrefixAccents;
|
||||
UBool hasSuffixAccents;
|
||||
int16_t defaultShiftSize;
|
||||
|
|
|
@ -54,6 +54,9 @@
|
|||
#include "cmemory.h"
|
||||
#include "ucol_imp.h"
|
||||
|
||||
/* set to 1 to test offsets in backAndForth() */
|
||||
#define TEST_OFFSETS 0
|
||||
|
||||
/* perform test with strength PRIMARY */
|
||||
static void TestPrimary(void);
|
||||
|
||||
|
@ -436,14 +439,15 @@ void doTest(UCollator* myCollation, const UChar source[], const UChar target[],
|
|||
* Return an integer array containing all of the collation orders
|
||||
* returned by calls to next on the specified iterator
|
||||
*/
|
||||
int32_t* getOrders(UCollationElements *iter, int32_t *orderLength)
|
||||
OrderAndOffset* getOrders(UCollationElements *iter, int32_t *orderLength)
|
||||
{
|
||||
UErrorCode status;
|
||||
int32_t order;
|
||||
int32_t maxSize = 100;
|
||||
int32_t size = 0;
|
||||
int32_t *temp;
|
||||
int32_t *orders =(int32_t*)malloc(sizeof(int32_t) * maxSize);
|
||||
int32_t offset = ucol_getOffset(iter);
|
||||
OrderAndOffset *temp;
|
||||
OrderAndOffset *orders =(OrderAndOffset *)malloc(sizeof(OrderAndOffset) * maxSize);
|
||||
status= U_ZERO_ERROR;
|
||||
|
||||
|
||||
|
@ -452,22 +456,26 @@ int32_t* getOrders(UCollationElements *iter, int32_t *orderLength)
|
|||
if (size == maxSize)
|
||||
{
|
||||
maxSize *= 2;
|
||||
temp = (int32_t*)malloc(sizeof(int32_t) * maxSize);
|
||||
temp = (OrderAndOffset *)malloc(sizeof(OrderAndOffset) * maxSize);
|
||||
|
||||
memcpy(temp, orders, size * sizeof(int32_t));
|
||||
memcpy(temp, orders, size * sizeof(OrderAndOffset));
|
||||
free(orders);
|
||||
orders = temp;
|
||||
|
||||
}
|
||||
|
||||
orders[size++] = order;
|
||||
orders[size].order = order;
|
||||
orders[size].offset = offset;
|
||||
|
||||
offset = ucol_getOffset(iter);
|
||||
size += 1;
|
||||
}
|
||||
|
||||
if (maxSize > size && size > 0)
|
||||
{
|
||||
temp = (int32_t*)malloc(sizeof(int32_t) * size);
|
||||
temp = (OrderAndOffset *)malloc(sizeof(OrderAndOffset) * size);
|
||||
|
||||
memcpy(temp, orders, size * sizeof(int32_t));
|
||||
memcpy(temp, orders, size * sizeof(OrderAndOffset));
|
||||
free(orders);
|
||||
orders = temp;
|
||||
|
||||
|
@ -486,8 +494,7 @@ backAndForth(UCollationElements *iter)
|
|||
int32_t index, o;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t orderLength = 0;
|
||||
int32_t *orders;
|
||||
orders= getOrders(iter, &orderLength);
|
||||
OrderAndOffset *orders = getOrders(iter, &orderLength);
|
||||
|
||||
|
||||
/* Now go through it backwards and make sure we get the same values */
|
||||
|
@ -495,49 +502,60 @@ backAndForth(UCollationElements *iter)
|
|||
ucol_reset(iter);
|
||||
|
||||
/* synwee : changed */
|
||||
while ((o = ucol_previous(iter, &status)) != UCOL_NULLORDER)
|
||||
{
|
||||
if (o != orders[-- index])
|
||||
{
|
||||
while ((o = ucol_previous(iter, &status)) != UCOL_NULLORDER) {
|
||||
int32_t offset = ucol_getOffset(iter);
|
||||
|
||||
index -= 1;
|
||||
if (o != orders[index].order) {
|
||||
if (o == 0)
|
||||
index ++;
|
||||
else
|
||||
{
|
||||
while (index > 0 && orders[-- index] == 0)
|
||||
{
|
||||
else {
|
||||
while (index > 0 && orders[-- index].order == 0) {
|
||||
/* nothing... */
|
||||
}
|
||||
if (o != orders[index])
|
||||
{
|
||||
log_err("Mismatch at index : 0x%x\n", index);
|
||||
return;
|
||||
}
|
||||
|
||||
if (o != orders[index].order) {
|
||||
log_err("Mismatched order at index %d: 0x%0:8X vs. 0x%0:8X\n", index,
|
||||
orders[index].order, o);
|
||||
goto bail;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if TEST_OFFSETS
|
||||
if (offset != orders[index].offset) {
|
||||
log_err("Mismatched offset at index %d: %d vs. %d\n", index,
|
||||
orders[index].offset, offset);
|
||||
goto bail;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
while (index != 0 && orders[index - 1] == 0) {
|
||||
index --;
|
||||
while (index != 0 && orders[index - 1].order == 0) {
|
||||
index -= 1;
|
||||
}
|
||||
|
||||
if (index != 0)
|
||||
{
|
||||
if (index != 0) {
|
||||
log_err("Didn't get back to beginning - index is %d\n", index);
|
||||
|
||||
ucol_reset(iter);
|
||||
log_err("\nnext: ");
|
||||
if ((o = ucol_next(iter, &status)) != UCOL_NULLORDER)
|
||||
{
|
||||
|
||||
if ((o = ucol_next(iter, &status)) != UCOL_NULLORDER) {
|
||||
log_err("Error at %x\n", o);
|
||||
}
|
||||
|
||||
log_err("\nprev: ");
|
||||
if ((o = ucol_previous(iter, &status)) != UCOL_NULLORDER)
|
||||
{
|
||||
|
||||
if ((o = ucol_previous(iter, &status)) != UCOL_NULLORDER) {
|
||||
log_err("Error at %x\n", o);
|
||||
}
|
||||
|
||||
log_verbose("\n");
|
||||
}
|
||||
|
||||
bail:
|
||||
free(orders);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2006, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2008, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/********************************************************************************
|
||||
|
@ -29,13 +29,20 @@
|
|||
|
||||
#define RULE_BUFFER_LEN 8192
|
||||
|
||||
struct OrderAndOffset
|
||||
{
|
||||
int32_t order;
|
||||
int32_t offset;
|
||||
};
|
||||
|
||||
typedef struct OrderAndOffset OrderAndOffset;
|
||||
|
||||
/* tests comparison of custom collation with different strengths */
|
||||
void doTest(UCollator*, const UChar* source, const UChar* target, UCollationResult result);
|
||||
/* verify that iterating forward and backwards over the string yields same CEs */
|
||||
void backAndForth(UCollationElements *iter);
|
||||
/* gets an array of CEs for a string in UCollationElements iterator. */
|
||||
int32_t* getOrders(UCollationElements *iter, int32_t *orderLength);
|
||||
OrderAndOffset* getOrders(UCollationElements *iter, int32_t *orderLength);
|
||||
|
||||
void genericOrderingTestWithResult(UCollator *coll, const char * const s[], uint32_t size, UCollationResult result);
|
||||
void genericOrderingTest(UCollator *coll, const char * const s[], uint32_t size);
|
||||
|
|
|
@ -562,7 +562,7 @@ static void TestOffset()
|
|||
UCollator *en_us=NULL;
|
||||
UCollationElements *iter, *pristine;
|
||||
int32_t offset;
|
||||
int32_t *orders;
|
||||
OrderAndOffset *orders;
|
||||
int32_t orderLength=0;
|
||||
int count = 0;
|
||||
UChar test1[50];
|
||||
|
@ -649,7 +649,7 @@ static void TestOffset()
|
|||
switch (count) {
|
||||
case 0:
|
||||
if (ucol_getOffset(iter) != 1) {
|
||||
log_err("ERROR: Offset of iteration should be 0\n");
|
||||
log_err("ERROR: Offset of iteration should be 1\n");
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
|
@ -671,10 +671,16 @@ static void TestOffset()
|
|||
U_SUCCESS(status)) {
|
||||
switch (count) {
|
||||
case 0:
|
||||
case 1:
|
||||
if (ucol_getOffset(iter) != 3) {
|
||||
log_err("ERROR: Offset of iteration should be 3\n");
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
if (ucol_getOffset(iter) != 1) {
|
||||
log_err("ERROR: Offset of iteration should be 1\n");
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if (ucol_getOffset(iter) != 0) {
|
||||
log_err("ERROR: Offset of iteration should be 0\n");
|
||||
|
@ -937,7 +943,7 @@ static void TestSmallBuffer()
|
|||
UCollationElements *testiter,
|
||||
*iter;
|
||||
int32_t count = 0;
|
||||
int32_t *testorders,
|
||||
OrderAndOffset *testorders,
|
||||
*orders;
|
||||
|
||||
UChar teststr[500];
|
||||
|
@ -977,8 +983,8 @@ static void TestSmallBuffer()
|
|||
|
||||
while (count != 0) {
|
||||
/* UCA collation element for 0x0F76 */
|
||||
if ((count > 250 && testorders[-- count] != orders[1]) ||
|
||||
(count <= 250 && testorders[-- count] != orders[0])) {
|
||||
if ((count > 250 && testorders[-- count].order != orders[1].order) ||
|
||||
(count <= 250 && testorders[-- count].order != orders[0].order)) {
|
||||
log_err("Error decomposition does not give the right collation element at %d count\n", count);
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/********************************************************************
|
||||
* Copyright (c) 2001-2007 International Business Machines
|
||||
* Copyright (c) 2001-2008 International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
********************************************************************
|
||||
* File USRCHDAT.H
|
||||
|
@ -19,6 +19,9 @@ Note: This file is included by other C and C++ files. This file should not be di
|
|||
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
|
||||
/* Set to 1 if matches must be on grapheme boundaries */
|
||||
#define GRAPHEME_BOUNDARIES 1
|
||||
|
||||
U_CDECL_BEGIN
|
||||
struct SearchData {
|
||||
const char *text;
|
||||
|
@ -51,9 +54,15 @@ static const SearchData BASIC[] = {
|
|||
{"Scott Ganyo", " ", NULL, UCOL_TERTIARY, NULL, {5, -1}, {1}},
|
||||
{"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1},
|
||||
{2}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
|
||||
{"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
|
||||
#endif
|
||||
|
||||
{"\\u00c9", "e", NULL, UCOL_PRIMARY, NULL, {0, -1}, {1}},
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
@ -74,6 +83,10 @@ static const SearchData BREAKITERATOREXACT[] = {
|
|||
{"testing that string ab\\u00e9cd does not match e", "e", NULL,
|
||||
UCOL_TERTIARY, "characterbreaker", {1, 28, 41, -1}, {1, 1, 1}},
|
||||
{"\\u00c9", "e", "fr", UCOL_PRIMARY, "characterbreaker", {0, -1}, {1}},
|
||||
#if 0
|
||||
/* Problem reported by Dave Bertoni, same as ticket 4279? */
|
||||
{"\\u0043\\u004F\\u0302\\u0054\\u00C9", "\\u004F", NULL, UCOL_TERTIARY, "characterbreaker", {1, -1}, {2}},
|
||||
#endif
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
|
@ -92,6 +105,12 @@ static const SearchData STRENGTH[] = {
|
|||
{7, 7, 7, 7}},
|
||||
{"\\u00c0 should match but not A", "A\\u0300", "en", UCOL_IDENTICAL,
|
||||
NULL, {0, -1}, {1, 0}},
|
||||
|
||||
#if 0
|
||||
/* Ticket 5382 */
|
||||
{"12\\u0171", "\\u0170", NULL, UCOL_SECONDARY, NULL, {2, -1}, {2}},
|
||||
#endif
|
||||
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
|
@ -120,14 +139,19 @@ static const SearchData VARIABLE[] = {
|
|||
};
|
||||
|
||||
static const SearchData NORMEXACT[] = {
|
||||
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1},
|
||||
{2}},
|
||||
{"a\\u0300\\u0325", "a\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
|
||||
#endif
|
||||
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
static const SearchData NONNORMEXACT[] = {
|
||||
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1},
|
||||
{0}},
|
||||
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
|
@ -167,6 +191,15 @@ static const SearchData TEXT[] = {
|
|||
};
|
||||
|
||||
static const SearchData COMPOSITEBOUNDARIES[] = {
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
{"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
|
||||
{"B\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u00C0B", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
#else
|
||||
{"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
{"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}},
|
||||
{"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}},
|
||||
|
@ -175,16 +208,25 @@ static const SearchData COMPOSITEBOUNDARIES[] = {
|
|||
{"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
{"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, 1, -1},
|
||||
{1, 1}},
|
||||
#endif
|
||||
|
||||
{"\\u00C0\\u0300", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
/* A + 030A + 0301 */
|
||||
{"\\u01FA", "\\u01FA", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
{"\\u01FA", "A\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
{"\\u01FA", "\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u01FA", "\\u030AA", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u01FA", "\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u01FA", "A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u01FA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
#endif
|
||||
|
||||
{"A\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u01FAA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u0F73", "\\u0F73", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
|
@ -193,6 +235,15 @@ static const SearchData COMPOSITEBOUNDARIES[] = {
|
|||
{"\\u0F73", "\\u0F71\\u0F72", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
{"A\\u0F73", "A\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u0F73A", "\\u0F72A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
|
||||
/* Ticket 5024 */
|
||||
{"a\\u00e1", "a\\u00e1", NULL, UCOL_SECONDARY, NULL, {0, -1}, {2}},
|
||||
|
||||
/* Ticket 5420 */
|
||||
{"fu\\u00dfball", "fu\\u00df", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}},
|
||||
{"fu\\u00dfball", "fuss", NULL, UCOL_PRIMARY, NULL, {0, -1}, {3}},
|
||||
{"fu\\u00dfball", "uss", NULL, UCOL_PRIMARY, NULL, {1, -1}, {2}},
|
||||
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
|
@ -229,12 +280,24 @@ static const char *CONTRACTIONRULE =
|
|||
static const SearchData CONTRACTION[] = {
|
||||
/* common discontiguous */
|
||||
{"A\\u0300\\u0315", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {1, -1},
|
||||
{2}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
|
||||
#endif
|
||||
|
||||
/* contraction prefix */
|
||||
{"AB\\u0315C", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
{"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {2, -1}, {1}},
|
||||
#endif
|
||||
|
||||
/* discontiguous problem here for backwards iteration.
|
||||
accents not found because discontiguous stores all information */
|
||||
{"X\\u0300\\u0319\\u0315", "\\u0319", NULL, UCOL_TERTIARY, NULL, {-1},
|
||||
|
@ -249,15 +312,37 @@ static const SearchData CONTRACTION[] = {
|
|||
/* blocked discontiguous */
|
||||
{"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL,
|
||||
{-1}, {0}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
/*
|
||||
* "ab" generates a contraction that's an expansion. The "z" matches the
|
||||
* first CE of the expansion but the match fails because it ends in the
|
||||
* middle of an expansion...
|
||||
*/
|
||||
{"ab", "z", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"ab", "z", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
#endif
|
||||
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
static const char *IGNORABLERULE = "&a = \\u0300";
|
||||
|
||||
static const SearchData IGNORABLE[] = {
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
/*
|
||||
* This isn't much of a test when matches have to be on
|
||||
* grapheme boundaries. The match at 0 only works because
|
||||
* it's at the start of the text.
|
||||
*/
|
||||
{"\\u0300\\u0315 \\u0300\\u0315 ", "\\u0300", NULL, UCOL_PRIMARY, NULL,
|
||||
{0, -1}, {2}},
|
||||
#else
|
||||
{"\\u0300\\u0315 \\u0300\\u0315 ", "\\u0300", NULL, UCOL_PRIMARY, NULL,
|
||||
{0, 3, -1}, {2, 2}},
|
||||
#endif
|
||||
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
|
@ -273,6 +358,20 @@ static const SearchData BASICCANONICAL[] = {
|
|||
{6, 6}},
|
||||
{"Scott Ganyo", "c", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
|
||||
{"Scott Ganyo", " ", NULL, UCOL_TERTIARY, NULL, {5, -1}, {1}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"a\\u0300b", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"a\\u0300\\u0325b", "\\u0300b", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u0325\\u0300A\\u0325\\u0300", "\\u0300A\\u0300", NULL, UCOL_TERTIARY,
|
||||
NULL, {-1}, {0}},
|
||||
{"\\u0325\\u0300A\\u0325\\u0300", "\\u0325A\\u0325", NULL, UCOL_TERTIARY,
|
||||
NULL, {-1}, {0}},
|
||||
{"a\\u0300\\u0325b\\u0300\\u0325c \\u0325b\\u0300 \\u0300b\\u0325",
|
||||
"\\u0300b\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
{"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
|
||||
{"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1},
|
||||
|
@ -285,12 +384,28 @@ static const SearchData BASICCANONICAL[] = {
|
|||
NULL, {0, -1}, {5}},
|
||||
{"a\\u0300\\u0325b\\u0300\\u0325c \\u0325b\\u0300 \\u0300b\\u0325",
|
||||
"\\u0300b\\u0325", NULL, UCOL_TERTIARY, NULL, {1, 12, -1}, {5, 3}},
|
||||
#endif
|
||||
|
||||
{"\\u00c4\\u0323", "A\\u0323\\u0308", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
{"\\u0308\\u0323", "\\u0323\\u0308", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
|
||||
static const SearchData NORMCANONICAL[] = {
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
/*
|
||||
* These tests don't really mean anything. With matches restricted to grapheme
|
||||
* boundaries, isCanonicalMatch doesn't mean anything unless normalization is
|
||||
* also turned on...
|
||||
*/
|
||||
{"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"a\\u0300\\u0325", "\\u0300\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"a\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
{"\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
{"a\\u0300\\u0325", "\\u0325\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1},
|
||||
|
@ -299,6 +414,8 @@ static const SearchData NORMCANONICAL[] = {
|
|||
{2}},
|
||||
{"a\\u0300\\u0325", "\\u0325", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
|
||||
{"a\\u0300\\u0325", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
|
||||
#endif
|
||||
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
|
@ -397,6 +514,20 @@ static const SearchData TEXTCANONICAL[] = {
|
|||
};
|
||||
|
||||
static const SearchData COMPOSITEBOUNDARIESCANONICAL[] = {
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
{"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
|
||||
{"B\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u00C0B", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
|
||||
/* first one matches only because it's at the start of the text */
|
||||
{"\\u0300\\u00C0", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
|
||||
/* \\u0300 blocked by \\u0300 */
|
||||
{"\\u00C0\\u0300", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"\\u00C0", "A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
{"A\\u00C0C", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}},
|
||||
{"\\u00C0A", "A", NULL, UCOL_TERTIARY, NULL, {0, 1, -1}, {1, 1}},
|
||||
|
@ -407,26 +538,66 @@ static const SearchData COMPOSITEBOUNDARIESCANONICAL[] = {
|
|||
{1, 1}},
|
||||
/* \\u0300 blocked by \\u0300 */
|
||||
{"\\u00C0\\u0300", "\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
#endif
|
||||
|
||||
/* A + 030A + 0301 */
|
||||
{"\\u01FA", "\\u01FA", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
{"\\u01FA", "A\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"\\u01FA", "\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"\\u01FA", "\\u030A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
{"\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
#endif
|
||||
|
||||
{"\\u01FA", "\\u030AA", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"\\u01FA", "\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"\\u01FA", "\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
#endif
|
||||
|
||||
/* blocked accent */
|
||||
{"\\u01FA", "A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u01FA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"A\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u01FAA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"\\u01FA", "\\u030A\\u0301", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
{"A\\u01FA", "A\\u030A", NULL, UCOL_TERTIARY, NULL, {1, -1}, {1}},
|
||||
{"\\u01FAA", "\\u0301A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
#endif
|
||||
|
||||
{"\\u0F73", "\\u0F73", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"\\u0F73", "\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u0F73", "\\u0F72", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"\\u0F73", "\\u0F71", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
{"\\u0F73", "\\u0F72", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
#endif
|
||||
|
||||
{"\\u0F73", "\\u0F71\\u0F72", NULL, UCOL_TERTIARY, NULL, {0, -1}, {1}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"A\\u0F73", "A\\u0F71", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u0F73A", "\\u0F72A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"\\u01FA A\\u0301\\u030A A\\u030A\\u0301 A\\u030A \\u01FA", "A\\u030A",
|
||||
NULL, UCOL_TERTIARY, NULL, {10, -1}, {2}},
|
||||
#else
|
||||
{"A\\u0F73", "A\\u0F71", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
{"\\u0F73A", "\\u0F72A", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
{"\\u01FA A\\u0301\\u030A A\\u030A\\u0301 A\\u030A \\u01FA", "A\\u030A",
|
||||
NULL, UCOL_TERTIARY, NULL, {0, 6, 10, 13, -1}, {1, 3, 2, 1}},
|
||||
#endif
|
||||
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
|
@ -459,33 +630,59 @@ static const SearchData SUPPLEMENTARYCANONICAL[] = {
|
|||
|
||||
static const SearchData CONTRACTIONCANONICAL[] = {
|
||||
/* common discontiguous */
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"A\\u0300\\u0315", "\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"A\\u0300\\u0315", "\\u0300", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
|
||||
{"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {1, -1},
|
||||
{2}},
|
||||
{"A\\u0300\\u0315", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {1, -1}, {2}},
|
||||
#endif
|
||||
|
||||
/* contraction prefix */
|
||||
{"AB\\u0315C", "A", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
#else
|
||||
{"AB\\u0315C", "AB", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
{"AB\\u0315C", "\\u0315", NULL, UCOL_TERTIARY, NULL, {2, -1}, {1}},
|
||||
#endif
|
||||
|
||||
/* discontiguous problem here for backwards iteration.
|
||||
forwards gives 0, 4 but backwards give 1, 3 */
|
||||
/* {"X\\u0300\\u0319\\u0315", "\\u0319", NULL, UCOL_TERTIARY, NULL, {0, -1},
|
||||
{4}}, */
|
||||
|
||||
/* ends not with a contraction character */
|
||||
{"X\\u0315\\u0300D", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1},
|
||||
{0}},
|
||||
{"X\\u0315\\u0300D", "X\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL,
|
||||
{0, -1}, {3}},
|
||||
{"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL, UCOL_TERTIARY, NULL,
|
||||
{0, -1}, {4}},
|
||||
{"X\\u0315\\u0300D", "\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
{"X\\u0315\\u0300D", "X\\u0300\\u0315", NULL, UCOL_TERTIARY, NULL, {0, -1}, {3}},
|
||||
|
||||
#if GRAPHEME_BOUNDARIES
|
||||
{"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
|
||||
/* blocked discontiguous */
|
||||
{"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL,
|
||||
{1, -1}, {4}},
|
||||
{"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL, {-1}, {0}},
|
||||
|
||||
/*
|
||||
* "ab" generates a contraction that's an expansion. The "z" matches the
|
||||
* first CE of the expansion but the match fails because it ends in the
|
||||
* middle of an expansion...
|
||||
*/
|
||||
{"ab", "z", NULL, UCOL_TERTIARY, NULL, {-1}, {2}},
|
||||
#else
|
||||
{"X\\u0300\\u031A\\u0315D", "X\\u0300", NULL, UCOL_TERTIARY, NULL, {0, -1}, {4}},
|
||||
|
||||
/* blocked discontiguous */
|
||||
{"X\\u0300\\u031A\\u0315D", "\\u031A\\u0315D", NULL, UCOL_TERTIARY, NULL, {1, -1}, {4}},
|
||||
|
||||
{"ab", "z", NULL, UCOL_TERTIARY, NULL, {0, -1}, {2}},
|
||||
#endif
|
||||
|
||||
{NULL, NULL, NULL, UCOL_TERTIARY, NULL, {-1}, {0}}
|
||||
};
|
||||
|
||||
static const SearchData DIACTRICMATCH[] = {
|
||||
static const SearchData DIACRITICMATCH[] = {
|
||||
{"\\u03BA\\u03B1\\u03B9\\u0300\\u0020\\u03BA\\u03B1\\u1F76", "\\u03BA\\u03B1\\u03B9", NULL, UCOL_PRIMARY, NULL, {0, 5,-1}, {4, 3}},
|
||||
{"\\u0061\\u0061\\u00E1", "\\u0061\\u00E1", NULL, UCOL_SECONDARY, NULL, {1, -1}, {2}},
|
||||
{"\\u0020\\u00C2\\u0303\\u0020\\u0041\\u0061\\u1EAA\\u0041\\u0302\\u0303\\u00C2\\u0303\\u1EAB\\u0061\\u0302\\u0303\\u00E2\\u0303\\uD806\\uDC01\\u0300\\u0020",
|
||||
|
|
|
@ -453,28 +453,35 @@ static UBool assertCanonicalEqual(const SearchData search)
|
|||
UCollator *collator = getCollator(search.collator);
|
||||
UBreakIterator *breaker = getBreakIterator(search.breaker);
|
||||
UStringSearch *strsrch;
|
||||
UBool result = TRUE;
|
||||
|
||||
CHECK_BREAK_BOOL(search.breaker);
|
||||
u_unescape(search.text, text, 128);
|
||||
u_unescape(search.pattern, pattern, 32);
|
||||
ucol_setStrength(collator, search.strength);
|
||||
ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
|
||||
strsrch = usearch_openFromCollator(pattern, -1, text, -1, collator,
|
||||
breaker, &status);
|
||||
usearch_setAttribute(strsrch, USEARCH_CANONICAL_MATCH, USEARCH_ON,
|
||||
&status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err("Error opening string search %s\n", u_errorName(status));
|
||||
return FALSE;
|
||||
result = FALSE;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
if (!assertEqualWithUStringSearch(strsrch, search)) {
|
||||
ucol_setStrength(collator, UCOL_TERTIARY);
|
||||
usearch_close(strsrch);
|
||||
return FALSE;
|
||||
result = FALSE;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
bail:
|
||||
ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &status);
|
||||
ucol_setStrength(collator, UCOL_TERTIARY);
|
||||
usearch_close(strsrch);
|
||||
return TRUE;
|
||||
return result;
|
||||
}
|
||||
|
||||
static UBool assertEqualWithAttribute(const SearchData search,
|
||||
|
@ -1537,7 +1544,7 @@ static void TestIgnorable(void)
|
|||
ucol_close(collator);
|
||||
}
|
||||
|
||||
static void TestDiactricMatch(void)
|
||||
static void TestDiacriticMatch(void)
|
||||
{
|
||||
UChar pattern[128];
|
||||
UChar text[128];
|
||||
|
@ -1556,7 +1563,7 @@ static void TestDiactricMatch(void)
|
|||
return;
|
||||
}
|
||||
|
||||
search = DIACTRICMATCH[count];
|
||||
search = DIACRITICMATCH[count];
|
||||
while (search.text != NULL) {
|
||||
if (search.collator != NULL) {
|
||||
coll = ucol_openFromShortString(search.collator, FALSE, NULL, &status);
|
||||
|
@ -1584,7 +1591,7 @@ static void TestDiactricMatch(void)
|
|||
}
|
||||
ucol_close(coll);
|
||||
|
||||
search = DIACTRICMATCH[++count];
|
||||
search = DIACRITICMATCH[++count];
|
||||
}
|
||||
usearch_close(strsrch);
|
||||
}
|
||||
|
@ -2024,6 +2031,7 @@ static void TestGetSetOffsetCanonical(void)
|
|||
UChar text[128];
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UStringSearch *strsrch;
|
||||
UCollator *collator;
|
||||
|
||||
memset(pattern, 0, 32*sizeof(UChar));
|
||||
memset(text, 0, 128*sizeof(UChar));
|
||||
|
@ -2031,8 +2039,13 @@ static void TestGetSetOffsetCanonical(void)
|
|||
open();
|
||||
strsrch = usearch_openFromCollator(pattern, 16, text, 32, EN_US_, NULL,
|
||||
&status);
|
||||
|
||||
collator = usearch_getCollator(strsrch);
|
||||
ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
|
||||
|
||||
usearch_setAttribute(strsrch, USEARCH_CANONICAL_MATCH, USEARCH_ON,
|
||||
&status);
|
||||
|
||||
/* testing out of bounds error */
|
||||
usearch_setOffset(strsrch, -1, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
|
@ -2071,7 +2084,7 @@ static void TestGetSetOffsetCanonical(void)
|
|||
log_err("Error match found at %d %d\n",
|
||||
usearch_getMatchedStart(strsrch),
|
||||
usearch_getMatchedLength(strsrch));
|
||||
return;
|
||||
goto bail;
|
||||
}
|
||||
matchindex = search.offset[count + 1] == -1 ? -1 :
|
||||
search.offset[count + 2];
|
||||
|
@ -2080,7 +2093,7 @@ static void TestGetSetOffsetCanonical(void)
|
|||
&status);
|
||||
if (usearch_getOffset(strsrch) != search.offset[count + 1] + 1) {
|
||||
log_err("Error setting offset\n");
|
||||
return;
|
||||
goto bail;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2095,9 +2108,12 @@ static void TestGetSetOffsetCanonical(void)
|
|||
log_err("Error match found at %d %d\n",
|
||||
usearch_getMatchedStart(strsrch),
|
||||
usearch_getMatchedLength(strsrch));
|
||||
return;
|
||||
goto bail;
|
||||
}
|
||||
}
|
||||
|
||||
bail:
|
||||
ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &status);
|
||||
usearch_close(strsrch);
|
||||
close();
|
||||
}
|
||||
|
@ -2242,7 +2258,7 @@ void addSearchTest(TestNode** root)
|
|||
"tscoll/usrchtst/TestContractionCanonical");
|
||||
addTest(root, &TestEnd, "tscoll/usrchtst/TestEnd");
|
||||
addTest(root, &TestNumeric, "tscoll/usrchtst/TestNumeric");
|
||||
addTest(root, &TestDiactricMatch, "tscoll/usrchtst/TestDiactricMatch");
|
||||
addTest(root, &TestDiacriticMatch, "tscoll/usrchtst/TestDiacriticMatch");
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_COLLATION */
|
||||
|
|
|
@ -56,8 +56,7 @@ jamotest.o srchtest.o reptest.o regextst.o \
|
|||
itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \
|
||||
uobjtest.o idnaref.o idnaconf.o nptrans.o punyref.o testidn.o testidna.o incaltst.o \
|
||||
calcasts.o v32test.o uvectest.o textfile.o tokiter.o utxttest.o \
|
||||
windttst.o winnmtst.o winutil.o csdetest.o tzrulets.o tzoffloc.o tzfmttst.o
|
||||
|
||||
windttst.o winnmtst.o winutil.o csdetest.o tzrulets.o tzoffloc.o tzfmttst.o ssearch.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
|
|
|
@ -365,6 +365,14 @@
|
|||
RelativePath=".\srchtest.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ssearch.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ssearch.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath="svccoll.cpp"
|
||||
>
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2003, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2008, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -188,7 +188,7 @@ void CollationIteratorTest::TestOffset(/* char* par */)
|
|||
|
||||
// Run all the way through the iterator, then get the offset
|
||||
int32_t orderLength = 0;
|
||||
int32_t *orders = getOrders(*iter, orderLength);
|
||||
Order *orders = getOrders(*iter, orderLength);
|
||||
|
||||
int32_t offset = iter->getOffset();
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2001-2006, International Business Machines Corporation
|
||||
* Copyright (C) 2001-2008, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
****************************************************************************/
|
||||
|
||||
|
@ -154,7 +154,7 @@ void StringSearchTest::runIndexedTest(int32_t index, UBool exec,
|
|||
CASE(33, TestUClassID)
|
||||
CASE(34, TestSubclass)
|
||||
CASE(35, TestCoverage)
|
||||
CASE(36, TestDiactricMatch)
|
||||
CASE(36, TestDiacriticMatch)
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
@ -256,8 +256,8 @@ UBool StringSearchTest::assertEqualWithStringSearch(StringSearch *strsrch,
|
|||
char *str = toCharString(strsrch->getText());
|
||||
errln("Text: %s", str);
|
||||
str = toCharString(strsrch->getPattern());
|
||||
errln("Pattern: %s", str);
|
||||
errln("Error following match found at %d %d",
|
||||
infoln("Pattern: %s", str);
|
||||
infoln("Error following match found at %d %d",
|
||||
strsrch->getMatchedStart(), strsrch->getMatchedLength());
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -375,7 +375,7 @@ UBool StringSearchTest::assertEqual(const SearchData *search)
|
|||
if( strsrch2 == strsrch || *strsrch2 != *strsrch ||
|
||||
!assertEqualWithStringSearch(strsrch2, search)
|
||||
) {
|
||||
errln("failure with StringSearch.clone()");
|
||||
infoln("failure with StringSearch.clone()");
|
||||
collator->setStrength(getECollationStrength(UCOL_TERTIARY));
|
||||
delete strsrch;
|
||||
delete strsrch2;
|
||||
|
@ -395,6 +395,7 @@ UBool StringSearchTest::assertCanonicalEqual(const SearchData *search)
|
|||
BreakIterator *breaker = getBreakIterator(search->breaker);
|
||||
StringSearch *strsrch;
|
||||
UChar temp[128];
|
||||
UBool result = TRUE;
|
||||
|
||||
#if UCONFIG_NO_BREAK_ITERATION
|
||||
if(search->breaker) {
|
||||
|
@ -415,22 +416,27 @@ UBool StringSearchTest::assertCanonicalEqual(const SearchData *search)
|
|||
}
|
||||
#endif
|
||||
collator->setStrength(getECollationStrength(search->strength));
|
||||
collator->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
|
||||
strsrch = new StringSearch(pattern, text, (RuleBasedCollator *)collator,
|
||||
breaker, status);
|
||||
strsrch->setAttribute(USEARCH_CANONICAL_MATCH, USEARCH_ON, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Error opening string search %s", u_errorName(status));
|
||||
return FALSE;
|
||||
result = FALSE;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
if (!assertEqualWithStringSearch(strsrch, search)) {
|
||||
collator->setStrength(getECollationStrength(UCOL_TERTIARY));
|
||||
delete strsrch;
|
||||
return FALSE;
|
||||
result = FALSE;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
bail:
|
||||
collator->setStrength(getECollationStrength(UCOL_TERTIARY));
|
||||
collator->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, status);
|
||||
delete strsrch;
|
||||
return TRUE;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
UBool StringSearchTest::assertEqualWithAttribute(const SearchData *search,
|
||||
|
@ -681,7 +687,7 @@ void StringSearchTest::TestBasic()
|
|||
while (BASIC[count].text != NULL) {
|
||||
//printf("count %d", count);
|
||||
if (!assertEqual(&BASIC[count])) {
|
||||
errln("Error at test number %d", count);
|
||||
infoln("Error at test number %d", count);
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
|
@ -698,14 +704,14 @@ void StringSearchTest::TestNormExact()
|
|||
}
|
||||
while (BASIC[count].text != NULL) {
|
||||
if (!assertEqual(&BASIC[count])) {
|
||||
errln("Error at test number %d", count);
|
||||
infoln("Error at test number %d", count);
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
count = 0;
|
||||
while (NORMEXACT[count].text != NULL) {
|
||||
if (!assertEqual(&NORMEXACT[count])) {
|
||||
errln("Error at test number %d", count);
|
||||
infoln("Error at test number %d", count);
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
|
@ -713,7 +719,7 @@ void StringSearchTest::TestNormExact()
|
|||
count = 0;
|
||||
while (NONNORMEXACT[count].text != NULL) {
|
||||
if (!assertEqual(&NONNORMEXACT[count])) {
|
||||
errln("Error at test number %d", count);
|
||||
infoln("Error at test number %d", count);
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
|
@ -724,7 +730,7 @@ void StringSearchTest::TestStrength()
|
|||
int count = 0;
|
||||
while (STRENGTH[count].text != NULL) {
|
||||
if (!assertEqual(&STRENGTH[count])) {
|
||||
errln("Error at test number %d", count);
|
||||
infoln("Error at test number %d", count);
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
|
@ -810,7 +816,7 @@ void StringSearchTest::TestBreakIterator()
|
|||
}
|
||||
strsrch->reset();
|
||||
if (!assertEqualWithStringSearch(strsrch, search)) {
|
||||
errln("Error at test number %d", count);
|
||||
infoln("Error at test number %d", count);
|
||||
}
|
||||
delete strsrch;
|
||||
count += 2;
|
||||
|
@ -818,7 +824,7 @@ void StringSearchTest::TestBreakIterator()
|
|||
count = 0;
|
||||
while (BREAKITERATOREXACT[count].text != NULL) {
|
||||
if (!assertEqual(&BREAKITERATOREXACT[count])) {
|
||||
errln("Error at test number %d", count);
|
||||
infoln("Error at test number %d", count);
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
|
@ -838,7 +844,7 @@ void StringSearchTest::TestVariable()
|
|||
while (VARIABLE[count].text != NULL) {
|
||||
logln("variable %d", count);
|
||||
if (!assertEqual(&VARIABLE[count])) {
|
||||
errln("Error at test number %d", count);
|
||||
infoln("Error at test number %d", count);
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
|
@ -1546,7 +1552,7 @@ void StringSearchTest::TestIgnorable()
|
|||
delete collator;
|
||||
}
|
||||
|
||||
void StringSearchTest::TestDiactricMatch()
|
||||
void StringSearchTest::TestDiacriticMatch()
|
||||
{
|
||||
UChar temp[128];
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
@ -1559,7 +1565,7 @@ void StringSearchTest::TestDiactricMatch()
|
|||
|
||||
const SearchData *search;
|
||||
|
||||
search = &(DIACTRICMATCH[count]);
|
||||
search = &(DIACRITICMATCH[count]);
|
||||
while (search->text != NULL) {
|
||||
coll = getCollator(search->collator);
|
||||
coll->setStrength(getECollationStrength(search->strength));
|
||||
|
@ -1577,7 +1583,7 @@ void StringSearchTest::TestDiactricMatch()
|
|||
if (!assertEqualWithStringSearch(strsrch, search)) {
|
||||
errln("Error at test number %d", count);
|
||||
}
|
||||
search = &(DIACTRICMATCH[++count]);
|
||||
search = &(DIACRITICMATCH[++count]);
|
||||
delete strsrch;
|
||||
}
|
||||
|
||||
|
@ -1818,6 +1824,8 @@ void StringSearchTest::TestCollatorCanonical()
|
|||
if (tailored != NULL) {
|
||||
delete tailored;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
strsrch->setCollator(m_en_us_, status);
|
||||
|
@ -1980,6 +1988,10 @@ void StringSearchTest::TestGetSetOffsetCanonical()
|
|||
UnicodeString pattern("pattern");
|
||||
StringSearch *strsrch = new StringSearch(pattern, text, m_en_us_, NULL,
|
||||
status);
|
||||
Collator *collator = strsrch->getCollator();
|
||||
|
||||
collator->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
|
||||
|
||||
strsrch->setAttribute(USEARCH_CANONICAL_MATCH, USEARCH_ON, status);
|
||||
/* testing out of bounds error */
|
||||
strsrch->setOffset(-1, status);
|
||||
|
@ -2023,7 +2035,7 @@ void StringSearchTest::TestGetSetOffsetCanonical()
|
|||
errln("Error match found at %d %d",
|
||||
strsrch->getMatchedStart(),
|
||||
strsrch->getMatchedLength());
|
||||
return;
|
||||
goto bail;
|
||||
}
|
||||
matchindex = search.offset[count + 1] == -1 ? -1 :
|
||||
search.offset[count + 2];
|
||||
|
@ -2031,7 +2043,7 @@ void StringSearchTest::TestGetSetOffsetCanonical()
|
|||
strsrch->setOffset(search.offset[count + 1] + 1, status);
|
||||
if (strsrch->getOffset() != search.offset[count + 1] + 1) {
|
||||
errln("Error setting offset");
|
||||
return;
|
||||
goto bail;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2045,9 +2057,12 @@ void StringSearchTest::TestGetSetOffsetCanonical()
|
|||
errln("Pattern: %s", str);
|
||||
errln("Error match found at %d %d", strsrch->getMatchedStart(),
|
||||
strsrch->getMatchedLength());
|
||||
return;
|
||||
goto bail;
|
||||
}
|
||||
}
|
||||
|
||||
bail:
|
||||
collator->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, status);
|
||||
delete strsrch;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/****************************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2001-2005, International Business Machines Corporation and others
|
||||
* Copyright (c) 2001-2008, International Business Machines Corporation and others
|
||||
* All Rights Reserved.
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -88,7 +88,7 @@ private:
|
|||
void TestUClassID();
|
||||
void TestSubclass();
|
||||
void TestCoverage();
|
||||
void TestDiactricMatch();
|
||||
void TestDiacriticMatch();
|
||||
};
|
||||
|
||||
#endif /* #if !UCONFIG_NO_COLLATION */
|
||||
|
|
1670
icu4c/source/test/intltest/ssearch.cpp
Normal file
1670
icu4c/source/test/intltest/ssearch.cpp
Normal file
File diff suppressed because it is too large
Load diff
40
icu4c/source/test/intltest/ssearch.h
Normal file
40
icu4c/source/test/intltest/ssearch.h
Normal file
|
@ -0,0 +1,40 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2008, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#ifndef __SSEARCH_H
|
||||
#define __SSEARCH_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/ucol.h"
|
||||
|
||||
#include "intltest.h"
|
||||
|
||||
//
|
||||
// Test of the function usearch_search()
|
||||
//
|
||||
// See srchtest.h for the tests for the rest of the string search functions.
|
||||
//
|
||||
class SSearchTest: public IntlTest {
|
||||
public:
|
||||
|
||||
SSearchTest();
|
||||
virtual ~SSearchTest();
|
||||
|
||||
virtual void runIndexedTest(int32_t index, UBool exec, const char* &name, char* params = NULL );
|
||||
|
||||
virtual void searchTest();
|
||||
virtual void offsetTest();
|
||||
virtual void monkeyTest(char *params);
|
||||
|
||||
private:
|
||||
virtual const char *getPath(char buffer[2048], const char *filename);
|
||||
virtual int32_t monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
|
||||
const char *name, const char *strength, uint32_t seed);
|
||||
};
|
||||
|
||||
#endif
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2007, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2008, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -42,6 +42,7 @@
|
|||
#include "normconf.h"
|
||||
#include "thcoll.h"
|
||||
#include "srchtest.h"
|
||||
#include "ssearch.h"
|
||||
#include "cntabcol.h"
|
||||
#include "lcukocol.h"
|
||||
#include "ucaconf.h"
|
||||
|
@ -49,6 +50,9 @@
|
|||
#include "cmemory.h"
|
||||
//#include "rndmcoll.h"
|
||||
|
||||
// Set to 1 to test offsets in backAndForth()
|
||||
#define TEST_OFFSETS 0
|
||||
|
||||
#define TESTCLASS(n,classname) \
|
||||
case n: \
|
||||
name = #classname; \
|
||||
|
@ -89,6 +93,7 @@ void IntlTestCollator::runIndexedTest( int32_t index, UBool exec, const char* &n
|
|||
TESTCLASS(19, CollationServiceTest);
|
||||
TESTCLASS(20, CollationFinnishTest); // removed by weiv - we have changed Finnish collation
|
||||
//TESTCLASS(21, RandomCollatorTest); // See ticket 5747 about reenabling this test.
|
||||
TESTCLASS(21, SSearchTest);
|
||||
|
||||
default: name = ""; break;
|
||||
}
|
||||
|
@ -392,7 +397,7 @@ void IntlTestCollator::backAndForth(CollationElementIterator &iter)
|
|||
{
|
||||
// Run through the iterator forwards and stick it into an array
|
||||
int32_t orderLength = 0;
|
||||
int32_t *orders = getOrders(iter, orderLength);
|
||||
Order *orders = getOrders(iter, orderLength);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
// Now go through it backwards and make sure we get the same values
|
||||
|
@ -404,6 +409,8 @@ void IntlTestCollator::backAndForth(CollationElementIterator &iter)
|
|||
|
||||
while ((o = iter.previous(status)) != CollationElementIterator::NULLORDER)
|
||||
{
|
||||
int32_t offset = iter.getOffset();
|
||||
|
||||
if (index == 0) {
|
||||
if(o == 0) {
|
||||
continue;
|
||||
|
@ -411,28 +418,39 @@ void IntlTestCollator::backAndForth(CollationElementIterator &iter)
|
|||
// going backwards
|
||||
errln("Backward iteration returned a non ignorable after orders are exhausted");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (o != orders[--index])
|
||||
{
|
||||
|
||||
index -= 1;
|
||||
if (o != orders[index].order) {
|
||||
if (o == 0)
|
||||
index ++;
|
||||
else
|
||||
{
|
||||
while (index > 0 && orders[--index] == 0)
|
||||
{
|
||||
index += 1;
|
||||
else {
|
||||
while (index > 0 && orders[--index].order == 0) {
|
||||
// nothing...
|
||||
}
|
||||
if (o != orders[index])
|
||||
{
|
||||
errln("Mismatch at index %d: 0x%X vs 0x%X", index,
|
||||
orders[index], o);
|
||||
break;
|
||||
|
||||
if (o != orders[index].order) {
|
||||
errln("Mismatched order at index %d: 0x%0:8X vs. 0x%0:8X", index,
|
||||
orders[index].order, o);
|
||||
//break;
|
||||
goto bail;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if TEST_OFFSETS
|
||||
if (offset != orders[index].offset) {
|
||||
errln("Mismatched offset at index %d: %d vs. %d", index,
|
||||
orders[index].offset, offset);
|
||||
//break;
|
||||
goto bail;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
while (index != 0 && orders[index - 1] == 0)
|
||||
while (index != 0 && orders[index - 1].order == 0)
|
||||
{
|
||||
index --;
|
||||
}
|
||||
|
@ -466,6 +484,7 @@ void IntlTestCollator::backAndForth(CollationElementIterator &iter)
|
|||
errln("");
|
||||
}
|
||||
|
||||
bail:
|
||||
delete[] orders;
|
||||
}
|
||||
|
||||
|
@ -474,12 +493,13 @@ void IntlTestCollator::backAndForth(CollationElementIterator &iter)
|
|||
* Return an integer array containing all of the collation orders
|
||||
* returned by calls to next on the specified iterator
|
||||
*/
|
||||
int32_t *IntlTestCollator::getOrders(CollationElementIterator &iter, int32_t &orderLength)
|
||||
IntlTestCollator::Order *IntlTestCollator::getOrders(CollationElementIterator &iter, int32_t &orderLength)
|
||||
{
|
||||
int32_t maxSize = 100;
|
||||
int32_t size = 0;
|
||||
int32_t *orders = new int32_t[maxSize];
|
||||
Order *orders = new Order[maxSize];
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t offset = iter.getOffset();
|
||||
|
||||
int32_t order;
|
||||
while ((order = iter.next(status)) != CollationElementIterator::NULLORDER)
|
||||
|
@ -487,21 +507,25 @@ int32_t *IntlTestCollator::getOrders(CollationElementIterator &iter, int32_t &or
|
|||
if (size == maxSize)
|
||||
{
|
||||
maxSize *= 2;
|
||||
int32_t *temp = new int32_t[maxSize];
|
||||
Order *temp = new Order[maxSize];
|
||||
|
||||
uprv_memcpy(temp, orders, size * sizeof(int32_t));
|
||||
uprv_memcpy(temp, orders, size * sizeof(Order));
|
||||
delete[] orders;
|
||||
orders = temp;
|
||||
}
|
||||
|
||||
orders[size++] = order;
|
||||
orders[size].order = order;
|
||||
orders[size].offset = offset;
|
||||
|
||||
offset = iter.getOffset();
|
||||
size += 1;
|
||||
}
|
||||
|
||||
if (maxSize > size)
|
||||
{
|
||||
int32_t *temp = new int32_t[size];
|
||||
Order *temp = new Order[size];
|
||||
|
||||
uprv_memcpy(temp, orders, size * sizeof(int32_t));
|
||||
uprv_memcpy(temp, orders, size * sizeof(Order));
|
||||
delete[] orders;
|
||||
orders = temp;
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2003, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2008, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -23,6 +23,12 @@
|
|||
class IntlTestCollator: public IntlTest {
|
||||
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL );
|
||||
protected:
|
||||
struct Order
|
||||
{
|
||||
int32_t order;
|
||||
int32_t offset;
|
||||
};
|
||||
|
||||
// These two should probably go down in IntlTest
|
||||
void doTest(Collator* col, const UChar *source, const UChar *target, Collator::EComparisonResult result);
|
||||
|
||||
|
@ -42,7 +48,7 @@ protected:
|
|||
* Return an integer array containing all of the collation orders
|
||||
* returned by calls to next on the specified iterator
|
||||
*/
|
||||
int32_t *getOrders(CollationElementIterator &iter, int32_t &orderLength);
|
||||
Order *getOrders(CollationElementIterator &iter, int32_t &orderLength);
|
||||
UCollationResult compareUsingPartials(UCollator *coll, const UChar source[], int32_t sLen, const UChar target[], int32_t tLen, int32_t pieceSize, UErrorCode &status);
|
||||
|
||||
};
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright (c) 2001-2007 International Business Machines
|
||||
// Copyright (c) 2001-2008 International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
DataDrivenCollationTest:table(nofallback) {
|
||||
Info {
|
||||
|
@ -513,6 +513,25 @@ DataDrivenCollationTest:table(nofallback) {
|
|||
"xj<xSx<xș=xş<xȘ=xŞ<Xș=Xş<XȘ=XŞ<xșx=xşx<xȘx=xŞx<xT<xTx<xț=xţ<xȚ=xŢ<Xț=Xţ<XȚ"
|
||||
"=XŢ<xțx=xţx<xȚx=xŢx<xU"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
testOffsets {
|
||||
Info {
|
||||
Description { "This tests cases where forwards and backwards iteration get different offsets" }
|
||||
}
|
||||
|
||||
Settings {
|
||||
{
|
||||
TestLocale { "en" }
|
||||
Arguments { "[strength 3]" }
|
||||
}
|
||||
}
|
||||
|
||||
Cases {
|
||||
"a\uD800\uDC00\uDC00<b\uD800\uDC00\uDC00",
|
||||
"\u0301A\u0301\u0301<\u0301B\u0301\u0301",
|
||||
"abcd\r\u0301<abce\r\u0301"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
413
icu4c/source/test/testdata/ssearch.xml
vendored
Normal file
413
icu4c/source/test/testdata/ssearch.xml
vendored
Normal file
|
@ -0,0 +1,413 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
<!-- Copyright (c) 2007-2008 IBM Corporation and others. All rights reserved -->
|
||||
|
||||
<!-- Test data file for string search -->
|
||||
<!DOCTYPE stringsearch-tests [
|
||||
<!ELEMENT stringsearch-tests (test-case+)>
|
||||
<!ATTLIST stringsearch-tests debug IDREF #IMPLIED >
|
||||
<!ELEMENT test-case (pattern, pre?, m?, post?)>
|
||||
<!ATTLIST test-case
|
||||
id ID #REQUIRED
|
||||
locale CDATA "en"
|
||||
strength (PRIMARY | SECONDARY | TERTIARY | QUATERNARY | IDENTICAL) "TERTIARY"
|
||||
norm (ON | OFF) "OFF"
|
||||
>
|
||||
|
||||
<!ELEMENT pattern (#PCDATA)>
|
||||
<!ELEMENT pre (#PCDATA)>
|
||||
<!ELEMENT m (#PCDATA)>
|
||||
<!ELEMENT post (#PCDATA)>
|
||||
]>
|
||||
|
||||
<stringsearch-tests debug="test32">
|
||||
<!-- debug="test11" (for copying into the above element) -->
|
||||
|
||||
<!-- Very simple match -->
|
||||
<test-case id="test01" >
|
||||
<pattern>abc</pattern>
|
||||
<pre>xxx</pre><m>abc</m><post>yyy</post>
|
||||
</test-case>
|
||||
|
||||
<!-- Very simple no-match -->
|
||||
<test-case id="test02" >
|
||||
<pattern>abc</pattern>
|
||||
<pre>xxx</pre><post>yyy</post>
|
||||
</test-case>
|
||||
|
||||
<!-- Match after several near-misses. -->
|
||||
<test-case id="test03" >
|
||||
<pattern>string</pattern>
|
||||
<pre>silly spring stling strxng strilg strinx stri</pre><m>string</m><post> fling</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test04" strength="PRIMARY" >
|
||||
<pattern>FUSS</pattern>
|
||||
<pre>abc</pre><m>fuss</m><post>sss</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test05" strength="PRIMARY" >
|
||||
<pattern>FUSS</pattern>
|
||||
<pre>abc</pre><m>fuß</m><post>sss</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test05.5" strength="PRIMARY" >
|
||||
<pattern>fuss</pattern>
|
||||
<pre>a </pre>
|
||||
<m>fuß</m>
|
||||
<post>ball table</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test06" strength="PRIMARY" >
|
||||
<pattern>fuß</pattern>
|
||||
<pre>abc</pre><m>fuss</m><post>xyz</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test07" strength="SECONDARY" >
|
||||
<pattern>fuß</pattern>
|
||||
<pre>abcfussxyz</pre>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test08" strength="PRIMARY" >
|
||||
<pattern>fus</pattern>
|
||||
<pre>abcfuß</pre><post>xyz</post>
|
||||
</test-case>
|
||||
|
||||
<!-- A good match following an initial match that failed because
|
||||
of not ending on a character boundary -->
|
||||
<test-case id="test09" strength="PRIMARY">
|
||||
<pattern>fus</pattern>
|
||||
<pre>fuß </pre><m>fus</m><post>sss</post>
|
||||
</test-case>
|
||||
|
||||
|
||||
<!-- Test cases from usrchdat.c BREAKITERATOREXACT -->
|
||||
|
||||
<test-case id="test10" strength="TERTIARY">
|
||||
<pattern>fox</pattern>
|
||||
<m>fox</m><post>y fox</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test11" strength="PRIMARY" locale="de_DE@collation=phonebook">
|
||||
<pattern>toe</pattern>
|
||||
<pre>This is a </pre><m>Tö</m><post>ne</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test11a" strength="SECONDARY" locale="de_DE@collation=phonebook">
|
||||
<pattern>toe</pattern>
|
||||
<pre>This is a </pre><post>Töne</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test12" strength="TERTIARY">
|
||||
<pattern>e</pattern>
|
||||
<pre>tésting that é doés not match </pre><m>e</m><post></post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test13" strength="PRIMARY" locale="fr">
|
||||
<pattern>e</pattern>
|
||||
<pre></pre><m>É</m><post>É</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test14" strength="PRIMARY" locale="fr">
|
||||
<pattern>O</pattern>
|
||||
<pre>C</pre><m>O\u0302</m><post>TÉ</post>
|
||||
</test-case>
|
||||
|
||||
|
||||
<!-- Test cases from usrchdat.c STRENGTH -->
|
||||
|
||||
|
||||
<test-case id="test15" strength="PRIMARY" locale="en">
|
||||
<pattern>fox</pattern>
|
||||
<pre>The quick brown </pre><m>fox</m><post> jumps over the lazy foxes</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test16" strength="PRIMARY" locale="fr">
|
||||
<pattern>peche</pattern>
|
||||
<pre>blackbirds pat </pre><m>p\u00E9ch\u00E9</m><post> </post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test17" strength="PRIMARY" locale="fr">
|
||||
<pattern>peche</pattern>
|
||||
<pre>blackbirds pat </pre><m>p\u00EAche</m><post> </post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test18" strength="PRIMARY" locale="fr">
|
||||
<pattern>peche</pattern>
|
||||
<pre>blackbirds pat </pre><m>p\u00E9che</m><post>r </post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test19" strength="PRIMARY" locale="fr">
|
||||
<pattern>peche</pattern>
|
||||
<pre>blackbirds pat </pre><m>p\u00EAche</m><post>r </post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test20" strength="PRIMARY" locale="es">
|
||||
<pattern>channel</pattern>
|
||||
<pre>A </pre><m>channel</m><post>, </post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test21" strength="PRIMARY" locale="es">
|
||||
<pattern>channel</pattern>
|
||||
<pre>A </pre><m>CHANNEL</m><post>, </post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test22" strength="PRIMARY" locale="es">
|
||||
<pattern>channel</pattern>
|
||||
<pre>A </pre><m>Channel</m><post>s, </post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test23" strength="PRIMARY" locale="es">
|
||||
<pattern>channel</pattern>
|
||||
<pre>A </pre><m>channel</m><post>... </post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test24" strength="TERTIARY" locale="en">
|
||||
<pattern>A\u0300</pattern>
|
||||
<pre>A miss, and then </pre><m>\u00c0</m><post> should match but not A"</post>
|
||||
</test-case>
|
||||
|
||||
<!-- TODO: In the original test data, this test matched at IDENTICAL strength.
|
||||
Doesn't seem right. The characters are different.
|
||||
-->
|
||||
<test-case id="test24a" strength="IDENTICAL" locale="en">
|
||||
<pattern>A\u0300</pattern>
|
||||
<pre>At IDENTICAL, shoud this match? </pre><m>\u00c0</m><post></post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test25" strength="SECONDARY" locale="en">
|
||||
<pattern>Ű</pattern>
|
||||
<pre>12</pre><m>ű</m><post> Ű</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test26" strength="SECONDARY" locale="en">
|
||||
<pattern>A</pattern>
|
||||
<pre>12</pre><m>a</m><post>...</post>
|
||||
</test-case>
|
||||
|
||||
|
||||
<!-- Test Cases from usrchdat.c, VARIABLE -->
|
||||
<test-case id="test27" strength="TERTIARY" locale="en">
|
||||
<pattern>blackbird</pattern>
|
||||
<pre>black-bird </pre><m>blackbird</m><post>...</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test28" strength="TERTIARY" locale="en">
|
||||
<pattern>go</pattern>
|
||||
<pre> on</pre>
|
||||
</test-case>
|
||||
|
||||
<!-- TODO: this gives an U_ILLEGAL_ARGUMENT error when opening
|
||||
the UStringSearch. How did the original test run? -->
|
||||
<!--
|
||||
<test-case id="test29" strength="PRIMARY" locale="en">
|
||||
<pattern> </pattern>
|
||||
<pre></pre><m></m><post>abc</post>
|
||||
</test-case>
|
||||
-->
|
||||
|
||||
<test-case id="test30" strength="SECONDARY" locale="en">
|
||||
<pattern>abc</pattern>
|
||||
<pre> a bc ab c a bc ab c"</pre>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test31" strength="SECONDARY" locale="en">
|
||||
<pattern>abc</pattern>
|
||||
<pre> ---------------</pre>
|
||||
</test-case>
|
||||
|
||||
|
||||
<!-- Normalization test cases from usrchdat.c -->
|
||||
<test-case id="test32" strength="TERTIARY" norm="ON">
|
||||
<pattern>a\u0325\u0300</pattern>
|
||||
<pre></pre><m>a\u0300\u0325</m>
|
||||
</test-case>
|
||||
|
||||
|
||||
<test-case id="test32a" strength="TERTIARY" norm="OFF">
|
||||
<pattern>a\u0325\u0300</pattern>
|
||||
<pre>a\u0300\u0325</pre>
|
||||
</test-case>
|
||||
|
||||
|
||||
<!-- COMPOSITEBOUNDARIES from usrchdat.c
|
||||
Boundaries are not identical to original test data because
|
||||
of matching only full combining sequences
|
||||
-->
|
||||
<test-case id="test40" strength="TERTIARY">
|
||||
<pattern>A</pattern>
|
||||
<pre>À</pre> <!-- \u00C0 -->
|
||||
</test-case>
|
||||
|
||||
<test-case id="test41" strength="TERTIARY">
|
||||
<pattern>A</pattern>
|
||||
<pre>À</pre><m>A</m><post>C</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test42" strength="TERTIARY">
|
||||
<pattern>A\u030A</pattern>
|
||||
<pre>À\u01FA</pre>
|
||||
</test-case>
|
||||
|
||||
|
||||
|
||||
<!-- SUPPLEMENTARYCANONICAL from usrchdat.c -->
|
||||
<test-case id="test50" strength="TERTIARY">
|
||||
<pattern>\uD800\uDC00</pattern>
|
||||
<pre>abc \uD802\uDC00 \uD800\uDC01 \uD801\uDC00 </pre><m>\uD800\uDC00</m>
|
||||
<post>abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test51" strength="TERTIARY">
|
||||
<pattern>\\uD834\\uDDB9</pattern>
|
||||
<pre>and</pre><m>\\uD834\\uDDB9</m><post>this sentence</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test52" strength="TERTIARY">
|
||||
<pattern> \\uD834\\uDDB9 </pattern>
|
||||
<pre>and</pre><m> \\uD834\\uDDB9 </m><post>this sentence</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test53" strength="TERTIARY">
|
||||
<pattern>-\\uD834\\uDDB9-</pattern>
|
||||
<pre>and</pre><m>-\\uD834\\uDDB9-</m><post>this sentence</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test54" strength="TERTIARY">
|
||||
<pattern>,\\uD834\\uDDB9,</pattern>
|
||||
<pre>and</pre><m>,\\uD834\\uDDB9,</m><post>this sentence</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test55" strength="TERTIARY">
|
||||
<pattern>?\\uD834\\uDDB9?</pattern>
|
||||
<pre>and</pre><m>?\\uD834\\uDDB9?</m><post>this sentence</post>
|
||||
</test-case>
|
||||
|
||||
|
||||
<!-- Long combining sequences -->
|
||||
<test-case id="test60" strength="PRIMARY">
|
||||
<pattern>A\u0301\u0301\u0301\u0301</pattern>
|
||||
<m>A\u0301\u0301\u0301\u0301\u0301</m>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test61" strength="TERTIARY">
|
||||
<pattern>A\u0301\u0301\u0301\u0301</pattern>
|
||||
<pre>A\u0301\u0301\u0301\u0301\u0301</pre>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test62" strength="TERTIARY">
|
||||
<pattern>A\u0301\u0301\u0301\u0301</pattern>
|
||||
<m>A\u0301\u0301\u0301\u0301</m>
|
||||
</test-case>
|
||||
|
||||
<!-- stand-alone combining marks don't match attached marks -->
|
||||
<test-case id="test63" strength="TERTIARY">
|
||||
<pattern>\u0301</pattern>
|
||||
<pre>A\u0301\u0301\u0301\u0301</pre>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test64" strength="TERTIARY">
|
||||
<pattern>\u0301</pattern>
|
||||
<post>\u0301\u0301\u0301\u0301</post>
|
||||
</test-case>
|
||||
|
||||
<!-- stand-alone combining mark does match an un-attached combining mark -->
|
||||
<test-case id="test65" strength="TERTIARY">
|
||||
<pattern>\u0301</pattern>
|
||||
<m>\u0301</m><post>A\u0301\u0301</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test66" strength="TERTIARY">
|
||||
<pattern>\u0301</pattern>
|
||||
<m>\u0301</m>
|
||||
</test-case>
|
||||
|
||||
<!-- stand-alone combining marks at end of the target text -->
|
||||
<test-case id="test67" strength="TERTIARY">
|
||||
<pattern>\u0301</pattern>
|
||||
<pre>abcd\r</pre><m>\u0301</m>
|
||||
</test-case>
|
||||
|
||||
<!-- attached combining marks at end of the target text, no match -->
|
||||
<test-case id="test68" strength="TERTIARY">
|
||||
<pattern>\u0301</pattern>
|
||||
<pre>abcd\u0301</pre>
|
||||
</test-case>
|
||||
|
||||
|
||||
|
||||
<!-- no match within expansions at the start -->
|
||||
<test-case id="test70" strength="PRIMARY">
|
||||
<pattern>Eligature</pattern>
|
||||
<pre>Æligature</pre>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test71" strength="PRIMARY">
|
||||
<pattern>AEligature</pattern>
|
||||
<m>Æligature</m>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test72" strength="PRIMARY">
|
||||
<pattern>AEligature</pattern>
|
||||
<m>Æligature</m>
|
||||
</test-case>
|
||||
|
||||
<!-- unattached combining Tilde will not match a Tilde that is
|
||||
part of a composed Ñ (\u00D1) -->
|
||||
<test-case id="test73" strength="SECONDARY">
|
||||
<pattern>\u0303</pattern> <!-- combining tilde -->
|
||||
<pre>Ñ
</pre><m>\u0303</m>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test74" strength="SECONDARY">
|
||||
<pattern>\u0303</pattern> <!-- combining tilde -->
|
||||
<pre>Ñ 
</pre><m>\u0303</m><post>a</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test75" strength="TERTIARY" locale="fr">
|
||||
<pattern>\u00EA</pattern>
|
||||
<pre>p</pre><m>\u00EA</m><post>che</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test76" strength="TERTIARY" locale="fr">
|
||||
<pattern>\u00EA</pattern>
|
||||
<pre>p</pre><m>e\u0302</m><post>che</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test77" strength="TERTIARY" locale="fr">
|
||||
<pattern>e\u0302</pattern>
|
||||
<pre>p</pre><m>\u00EA</m><post>che</post>
|
||||
</test-case>
|
||||
|
||||
<!-- Test cases from ticket:5382 -->
|
||||
<test-case id="test78" strength="SECONDARY" locale="hu_HU">
|
||||
<pattern>\u0170</pattern>
|
||||
<m>\u0171</m>
|
||||
<post>12</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test79" strength="SECONDARY" locale="hu_HU">
|
||||
<pattern>\u0170</pattern>
|
||||
<pre>1</pre>
|
||||
<m>\u0171</m>
|
||||
<post>2</post>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test80" strength="SECONDARY" locale="hu_HU">
|
||||
<pattern>\u0170</pattern>
|
||||
<pre>12</pre>
|
||||
<m>\u0171</m>
|
||||
</test-case>
|
||||
|
||||
<!-- Test cases from ticket:5959 -->
|
||||
<test-case id="test81" strength="SECONDARY">
|
||||
<pattern>\u2166</pattern>
|
||||
<m>VII</m>
|
||||
</test-case>
|
||||
|
||||
<test-case id="test82" strength="SECONDARY">
|
||||
<pattern>VII</pattern>
|
||||
<m>\u2166</m>
|
||||
</test-case>
|
||||
</stringsearch-tests>
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2004-2006, International Business Machines
|
||||
* Copyright (C) 2004-2008, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -74,10 +74,15 @@ UXMLParser::UXMLParser(UErrorCode &status) :
|
|||
|
||||
// XML Doctype decl production #28
|
||||
// example "<!DOCTYPE foo SYSTEM "somewhere" >
|
||||
// or "<!DOCTYPE foo [internal dtd]>
|
||||
// TODO: we don't actually parse the DOCTYPE or internal subsets.
|
||||
// Some internal dtd subsets could confuse this simple-minded
|
||||
// attempt at skipping over them.
|
||||
mXMLDoctype(UnicodeString("(?s)<!DOCTYPE.+?>"), 0, status),
|
||||
// attempt at skipping over them, specifically, occurrences
|
||||
// of closing square brackets. These could appear in comments,
|
||||
// or in parameter entity declarations, for example.
|
||||
mXMLDoctype(UnicodeString(
|
||||
"(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)"
|
||||
), 0, status),
|
||||
|
||||
// XML PI production #16
|
||||
// example "<?target stuff?>
|
||||
|
|
Loading…
Add table
Reference in a new issue