ICU-3109 more Thai fixes

X-SVN-Rev: 12663
This commit is contained in:
Vladimir Weinstein 2003-07-23 22:49:25 +00:00
parent cc4a354714
commit 2a53fc8a87
3 changed files with 138 additions and 40 deletions

View file

@ -2580,6 +2580,7 @@ inline UChar getPrevNormalizedChar(collIterate *data)
uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
collIterateState entryState;
UChar buffer[UCOL_MAX_BUFFER];
backupState(source, &entryState);
UChar32 cp = ch;
@ -2624,9 +2625,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
case THAI_TAG:
/* Thai/Lao reordering */
if (((source->flags) & UCOL_ITER_INNORMBUF) /* Already Swapped || */
|| (source->iterator && !source->iterator->hasNext(source->iterator))
|| (source->pos && source->endp == source->pos) /* At end of string. No swap possible || */
/*|| UCOL_ISTHAIBASECONSONANT(*(source->pos)) == 0*/) /* next char not Thai base cons.*/ // This is from the old specs - we now rearrange unconditionally
|| collIter_eos(source)) /* At end of string. No swap possible */
{
// Treat Thai as a length one expansion */
CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
@ -2634,36 +2633,37 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
}
else
{
if(!collIter_eos(source)) {
// Move the prevowel and the following base Consonant into the normalization buffer
// with their order swapped
UChar thCh = peekCharacter(source, 0); //getNextNormalizedChar(source);
UChar32 cp = 0;
if(U16_IS_LEAD(thCh)) {
if(!collIter_eos(source)) {
collIterateState thaiState;
backupState(source, &thaiState);
getNextNormalizedChar(source);
UChar trailCh = peekCharacter(source, 0); //getNextNormalizedChar(source);
if(U16_IS_TRAIL(trailCh)) {
cp = U16_GET_SUPPLEMENTARY(thCh, trailCh);
} else {
loadState(source, &thaiState, TRUE);
}
// Move the prevowel and the following base Consonant into the normalization buffer
// with their order swapped
// Note: this operation might activate the normalization buffer. We have to check for
// that and act accordingly.
UChar thCh = getNextNormalizedChar(source);
UChar32 cp = 0;
if(U16_IS_LEAD(thCh)) {
if(!collIter_eos(source)) {
collIterateState thaiState;
backupState(source, &thaiState);
UChar trailCh = getNextNormalizedChar(source);
if(U16_IS_TRAIL(trailCh)) {
cp = U16_GET_SUPPLEMENTARY(thCh, trailCh);
} else {
cp = (UChar32)thCh;
loadState(source, &thaiState, TRUE);
}
} else {
cp = (UChar32)thCh;
cp = (UChar32)thCh;
}
} else {
cp = (UChar32)thCh;
}
// Now we have the character that needs to be decomposed
// if the normalizing buffer was not used, we can just use our structure and be happy.
if(source->flags & UCOL_ITER_INNORMBUF == 0) {
// decompose into writable buffer
int32_t decompLen = unorm_getDecomposition(cp, FALSE, &(source->writableBuffer[1]), UCOL_WRITABLE_BUFFER_SIZE-1);
if(decompLen < 0) {
decompLen = -decompLen;
}
// reorder Thai and the character after it
if(decompLen >= 2 && U16_IS_LEAD(source->writableBuffer[1]) && U16_IS_TRAIL(source->writableBuffer[2])) {
source->writableBuffer[0] = source->writableBuffer[1];
source->writableBuffer[1] = source->writableBuffer[2];
@ -2672,12 +2672,8 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
source->writableBuffer[0] = source->writableBuffer[1];
source->writableBuffer[1] = ch;
}
// zero terminate, since normalization buffer is always zero terminated
source->writableBuffer[decompLen+1] = 0; // we added the prevowel
/*
source->writableBuffer[0] = peekCharacter(source, 0);
source->writableBuffer[1] = peekCharacter(source, -1);
source->writableBuffer[2] = 0;
*/
if(source->pos) {
source->fcdPosition = source->pos+1; // Indicate where to continue in main input string
// after exhausting the writableBuffer
@ -2690,9 +2686,49 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
CE = UCOL_IGNORABLE;
} else {
CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
CE = *CEOffset++;
} else { // stuff is already normalized... what to do here???
int32_t decompLen = unorm_getDecomposition(cp, FALSE, &buffer[1], UCOL_MAX_BUFFER-1);
if(decompLen < 0) {
decompLen = -decompLen;
}
if(decompLen >= 2 && U16_IS_LEAD(buffer[1]) && U16_IS_TRAIL(buffer[2])) {
buffer[0] = buffer[1];
buffer[1] = buffer[2];
buffer[2] = ch;
} else {
buffer[0] = buffer[1];
buffer[1] = ch;
}
buffer[decompLen+1] = 0; // we added the prevowel
// we will construct a new iterator and suck out CEs.
collIterate temp;
// Here is the string initialization. We have decomposed character (decompLen) + 1 Thai + trailing zero
IInit_collIterate(coll, buffer, decompLen+2, &temp);
// We need the trailing zero so that we can tell the iterate function that it is in the normalized and reordered
// buffer. This buffer is always zero terminated.
temp.flags |= UCOL_ITER_INNORMBUF;
// This is where to return after iteration is done. We point at the end of the string
temp.fcdPosition = buffer+decompLen+2;
temp.flags &= ~UCOL_ITER_NORM;
CE = ucol_IGetNextCE(coll, &temp, status);
uint32_t *endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
while (CE != UCOL_NO_MORE_CES) {
*(source->CEpos ++) = CE;
if (source->CEpos == endCEBuffer) {
/* ran out of CE space, bail.
there's no guarantee of the right character position after
this bail*/
*status = U_BUFFER_OVERFLOW_ERROR;
source->CEpos = source->CEs;
freeHeapWritableBuffer(&temp);
return UCOL_NULLORDER;
}
CE = ucol_IGetNextCE(coll, &temp, status);
}
freeHeapWritableBuffer(&temp);
// return the first of CEs so that we save a call
CE = *(source->toReturn++);
}
}
break;
@ -3338,7 +3374,9 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
*(UCharOffset --) = 0;
noChars = 0;
// have to swap thai characters
while (ucol_unsafeCP(schar, coll) || (!collIter_bos(source) && UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1)))) { // this one is problematic
while (ucol_unsafeCP(schar, coll) || UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1))) {
// we might have ended here after trying to reorder Thai, but seeing that there are unsafe points
// in the backward processing
*(UCharOffset) = schar;
noChars++;
UCharOffset --;

View file

@ -72,6 +72,7 @@ void CollationThaiTest::runIndexedTest(int32_t index, UBool exec, const char* &n
CASE(1,TestCornerCases)
CASE(2,TestNamesList)
CASE(3,TestInvalidThai)
CASE(4,TestReordering)
default: name = ""; break;
}
}
@ -332,18 +333,18 @@ void CollationThaiTest::TestCornerCases(void) {
// Internal utilities
//------------------------------------------------------------------------
void CollationThaiTest::compareArray(const Collator& c, const char* tests[],
void CollationThaiTest::compareArray(Collator& c, const char* tests[],
int32_t testsLength) {
UErrorCode status = U_ZERO_ERROR;
for (int32_t i = 0; i < testsLength; i += 3) {
int32_t expect = 0;
Collator::EComparisonResult expect;
if (tests[i+1][0] == '<') {
expect = -1;
expect = Collator::LESS;
} else if (tests[i+1][0] == '>') {
expect = 1;
expect = Collator::GREATER;
} else if (tests[i+1][0] == '=') {
expect = 0;
expect = Collator::EQUAL;
} else {
// expect = Integer.decode(tests[i+1]).intValue();
errln((UnicodeString)"Error: unknown operator " + tests[i+1]);
@ -354,6 +355,8 @@ void CollationThaiTest::compareArray(const Collator& c, const char* tests[],
parseChars(s1, tests[i]);
parseChars(s2, tests[i+2]);
doTest(&c, s1, s2, expect);
#if 0
int32_t result = c.compare(s1, s2);
if (sign(result) != sign(expect))
{
@ -395,6 +398,7 @@ void CollationThaiTest::compareArray(const Collator& c, const char* tests[],
errln((UnicodeString)" " + prettify(k1, t1) + " vs. " + prettify(k2, t2));
}
}
#endif
}
}
@ -481,4 +485,54 @@ void CollationThaiTest::TestInvalidThai(void) {
delete c;
}
/**
 * Exercises comparison of strings that contain the Thai prevowel U+0E41
 * followed by text that stresses the reordering code: combining sequences
 * versus their precomposed forms, supplementary code points (surrogate
 * pairs), supplementary characters with decompositions, and reordered
 * combining marks. Every case is repeated with an "abc" prefix, an "abc"
 * suffix, and both.
 *
 * The table is handed to compareArray(), which walks it three entries at a
 * time: left string, expected relation ("<", "=" or ">"), right string.
 * The strings carry escaped \u sequences and are converted by parseChars()
 * (presumably by unescaping them -- confirm against parseChars()).
 */
void CollationThaiTest::TestReordering(void) {
    const char *tests[] = {
        // Prevowel at the start of the string, nothing after the test sequence
        "\\u0E41c\\u0301",                "=", "\\u0E41\\u0107",                         // composition
        "\\u0E41\\uD834\\uDC00",          "<", "\\u0E41\\uD834\\uDC01",                  // supplementaries
        "\\u0E41\\uD834\\uDD5F",          "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65",    // supplementary composition decomps to supplementary
        "\\u0E41\\uD87E\\uDC02",          "=", "\\u0E41\\u4E41",                         // supplementary composition decomps to BMP
        "\\u0E41\\u0301",                 "=", "\\u0E41\\u0301",                         // unsafe (just checking backwards iteration)
        "\\u0E41\\u0301\\u0316",          "=", "\\u0E41\\u0316\\u0301",

        // Same cases with leading text
        "abc\\u0E41c\\u0301",             "=", "abc\\u0E41\\u0107",                      // composition
        "abc\\u0E41\\uD834\\uDC00",       "<", "abc\\u0E41\\uD834\\uDC01",               // supplementaries
        "abc\\u0E41\\uD834\\uDD5F",       "=", "abc\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary
        "abc\\u0E41\\uD87E\\uDC02",       "=", "abc\\u0E41\\u4E41",                      // supplementary composition decomps to BMP
        "abc\\u0E41\\u0301",              "=", "abc\\u0E41\\u0301",                      // unsafe (just checking backwards iteration)
        "abc\\u0E41\\u0301\\u0316",       "=", "abc\\u0E41\\u0316\\u0301",

        // Same cases with trailing text
        "\\u0E41c\\u0301abc",             "=", "\\u0E41\\u0107abc",                      // composition
        "\\u0E41\\uD834\\uDC00abc",       "<", "\\u0E41\\uD834\\uDC01abc",               // supplementaries
        "\\u0E41\\uD834\\uDD5Fabc",       "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65abc", // supplementary composition decomps to supplementary
        "\\u0E41\\uD87E\\uDC02abc",       "=", "\\u0E41\\u4E41abc",                      // supplementary composition decomps to BMP
        "\\u0E41\\u0301abc",              "=", "\\u0E41\\u0301abc",                      // unsafe (just checking backwards iteration)
        "\\u0E41\\u0301\\u0316abc",       "=", "\\u0E41\\u0316\\u0301abc",

        // Same cases with both leading and trailing text
        "abc\\u0E41c\\u0301abc",          "=", "abc\\u0E41\\u0107abc",                   // composition
        "abc\\u0E41\\uD834\\uDC00abc",    "<", "abc\\u0E41\\uD834\\uDC01abc",            // supplementaries
        "abc\\u0E41\\uD834\\uDD5Fabc",    "=", "abc\\u0E41\\uD834\\uDD58\\uD834\\uDD65abc", // supplementary composition decomps to supplementary
        "abc\\u0E41\\uD87E\\uDC02abc",    "=", "abc\\u0E41\\u4E41abc",                   // supplementary composition decomps to BMP
        "abc\\u0E41\\u0301abc",           "=", "abc\\u0E41\\u0301abc",                   // unsafe (just checking backwards iteration)
        "abc\\u0E41\\u0301\\u0316abc",    "=", "abc\\u0E41\\u0316\\u0301abc",
    };

    compareArray(*coll, tests, sizeof(tests)/sizeof(tests[0]));

    // Build a collator from a rule that defines the contraction "ab".
    // The comparison against it is still disabled below, so for now this
    // only verifies that the rule can be turned into a collator.
    const char *rule = "& c < ab";
    const char *testcontraction[] = { "\\u0E41ab", "<", "\\u0E41c"};
    UnicodeString rules;
    UErrorCode status = U_ZERO_ERROR;
    parseChars(rules, rule);
    RuleBasedCollator *rcoll = new RuleBasedCollator(rules, status);
    if(U_SUCCESS(status)) {
        //compareArray(*rcoll, testcontraction, 3);
    } else {
        errln("Couldn't instantiate collator from rules");
    }
    // Release the collator on both paths: operator new may have handed back
    // an object even though construction reported a failure through status,
    // and deleting a null pointer is a no-op.
    delete rcoll;

    //genericRulesStarter(rule, test10, 2);
}
#endif /* #if !UCONFIG_NO_COLLATION */

View file

@ -53,9 +53,15 @@ private:
* test that invalid Thai sorts properly
*/
void TestInvalidThai(void);
/**
* test that reordering is done properly
*/
void TestReordering(void);
private:
void compareArray(const Collator& c, const char* tests[],
void compareArray(Collator& c, const char* tests[],
int32_t testsLength);
int8_t sign(int32_t i);