mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-3109 more Thai fixes
X-SVN-Rev: 12663
This commit is contained in:
parent
cc4a354714
commit
2a53fc8a87
3 changed files with 138 additions and 40 deletions
|
@ -2580,6 +2580,7 @@ inline UChar getPrevNormalizedChar(collIterate *data)
|
|||
|
||||
uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
|
||||
collIterateState entryState;
|
||||
UChar buffer[UCOL_MAX_BUFFER];
|
||||
backupState(source, &entryState);
|
||||
UChar32 cp = ch;
|
||||
|
||||
|
@ -2624,9 +2625,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
case THAI_TAG:
|
||||
/* Thai/Lao reordering */
|
||||
if (((source->flags) & UCOL_ITER_INNORMBUF) /* Already Swapped || */
|
||||
|| (source->iterator && !source->iterator->hasNext(source->iterator))
|
||||
|| (source->pos && source->endp == source->pos) /* At end of string. No swap possible || */
|
||||
/*|| UCOL_ISTHAIBASECONSONANT(*(source->pos)) == 0*/) /* next char not Thai base cons.*/ // This is from the old specs - we now rearrange unconditionally
|
||||
|| collIter_eos(source)) /* At end of string. No swap possible */
|
||||
{
|
||||
// Treat Thai as a length one expansion */
|
||||
CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
|
||||
|
@ -2634,36 +2633,37 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
}
|
||||
else
|
||||
{
|
||||
if(!collIter_eos(source)) {
|
||||
// Move the prevowel and the following base Consonant into the normalization buffer
|
||||
// with their order swapped
|
||||
|
||||
UChar thCh = peekCharacter(source, 0); //getNextNormalizedChar(source);
|
||||
UChar32 cp = 0;
|
||||
if(U16_IS_LEAD(thCh)) {
|
||||
if(!collIter_eos(source)) {
|
||||
collIterateState thaiState;
|
||||
backupState(source, &thaiState);
|
||||
getNextNormalizedChar(source);
|
||||
UChar trailCh = peekCharacter(source, 0); //getNextNormalizedChar(source);
|
||||
if(U16_IS_TRAIL(trailCh)) {
|
||||
cp = U16_GET_SUPPLEMENTARY(thCh, trailCh);
|
||||
} else {
|
||||
loadState(source, &thaiState, TRUE);
|
||||
}
|
||||
// Move the prevowel and the following base Consonant into the normalization buffer
|
||||
// with their order swapped
|
||||
// Note: this operation might activate the normalization buffer. We have to check for
|
||||
// that and act accordingly.
|
||||
UChar thCh = getNextNormalizedChar(source);
|
||||
UChar32 cp = 0;
|
||||
if(U16_IS_LEAD(thCh)) {
|
||||
if(!collIter_eos(source)) {
|
||||
collIterateState thaiState;
|
||||
backupState(source, &thaiState);
|
||||
UChar trailCh = getNextNormalizedChar(source);
|
||||
if(U16_IS_TRAIL(trailCh)) {
|
||||
cp = U16_GET_SUPPLEMENTARY(thCh, trailCh);
|
||||
} else {
|
||||
cp = (UChar32)thCh;
|
||||
loadState(source, &thaiState, TRUE);
|
||||
}
|
||||
} else {
|
||||
cp = (UChar32)thCh;
|
||||
cp = (UChar32)thCh;
|
||||
}
|
||||
|
||||
|
||||
} else {
|
||||
cp = (UChar32)thCh;
|
||||
}
|
||||
// Now we have the character that needs to be decomposed
|
||||
// if the normalizing buffer was not used, we can just use our structure and be happy.
|
||||
if(source->flags & UCOL_ITER_INNORMBUF == 0) {
|
||||
// decompose into writable buffer
|
||||
int32_t decompLen = unorm_getDecomposition(cp, FALSE, &(source->writableBuffer[1]), UCOL_WRITABLE_BUFFER_SIZE-1);
|
||||
if(decompLen < 0) {
|
||||
decompLen = -decompLen;
|
||||
}
|
||||
|
||||
// reorder Thai and the character after it
|
||||
if(decompLen >= 2 && U16_IS_LEAD(source->writableBuffer[1]) && U16_IS_TRAIL(source->writableBuffer[2])) {
|
||||
source->writableBuffer[0] = source->writableBuffer[1];
|
||||
source->writableBuffer[1] = source->writableBuffer[2];
|
||||
|
@ -2672,12 +2672,8 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
source->writableBuffer[0] = source->writableBuffer[1];
|
||||
source->writableBuffer[1] = ch;
|
||||
}
|
||||
// zero terminate, since normalization buffer is always zero terminated
|
||||
source->writableBuffer[decompLen+1] = 0; // we added the prevowel
|
||||
/*
|
||||
source->writableBuffer[0] = peekCharacter(source, 0);
|
||||
source->writableBuffer[1] = peekCharacter(source, -1);
|
||||
source->writableBuffer[2] = 0;
|
||||
*/
|
||||
if(source->pos) {
|
||||
source->fcdPosition = source->pos+1; // Indicate where to continue in main input string
|
||||
// after exhausting the writableBuffer
|
||||
|
@ -2690,9 +2686,49 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
|
||||
|
||||
CE = UCOL_IGNORABLE;
|
||||
} else {
|
||||
CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
|
||||
CE = *CEOffset++;
|
||||
} else { // stuff is already normalized... what to do here???
|
||||
int32_t decompLen = unorm_getDecomposition(cp, FALSE, &buffer[1], UCOL_MAX_BUFFER-1);
|
||||
if(decompLen < 0) {
|
||||
decompLen = -decompLen;
|
||||
}
|
||||
if(decompLen >= 2 && U16_IS_LEAD(buffer[1]) && U16_IS_TRAIL(buffer[2])) {
|
||||
buffer[0] = buffer[1];
|
||||
buffer[1] = buffer[2];
|
||||
buffer[2] = ch;
|
||||
} else {
|
||||
buffer[0] = buffer[1];
|
||||
buffer[1] = ch;
|
||||
}
|
||||
buffer[decompLen+1] = 0; // we added the prevowel
|
||||
// we will construct a new iterator and suck out CEs.
|
||||
collIterate temp;
|
||||
// Here is the string initialization. We have decomposed character (decompLen) + 1 Thai + trailing zero
|
||||
IInit_collIterate(coll, buffer, decompLen+2, &temp);
|
||||
// We need the trailing zero so that we can tell the iterate function that it is in the normalized and reordered
|
||||
// buffer. This buffer is always zero terminated.
|
||||
temp.flags |= UCOL_ITER_INNORMBUF;
|
||||
// This is where to return after iteration is done. We point at the end of the string
|
||||
temp.fcdPosition = buffer+decompLen+2;
|
||||
temp.flags &= ~UCOL_ITER_NORM;
|
||||
|
||||
CE = ucol_IGetNextCE(coll, &temp, status);
|
||||
uint32_t *endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
while (CE != UCOL_NO_MORE_CES) {
|
||||
*(source->CEpos ++) = CE;
|
||||
if (source->CEpos == endCEBuffer) {
|
||||
/* ran out of CE space, bail.
|
||||
there's no guarantee of the right character position after
|
||||
this bail*/
|
||||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
source->CEpos = source->CEs;
|
||||
freeHeapWritableBuffer(&temp);
|
||||
return UCOL_NULLORDER;
|
||||
}
|
||||
CE = ucol_IGetNextCE(coll, &temp, status);
|
||||
}
|
||||
freeHeapWritableBuffer(&temp);
|
||||
// return the first of CEs so that we save a call
|
||||
CE = *(source->toReturn++);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -3338,7 +3374,9 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
*(UCharOffset --) = 0;
|
||||
noChars = 0;
|
||||
// have to swap thai characters
|
||||
while (ucol_unsafeCP(schar, coll) || (!collIter_bos(source) && UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1)))) { // this one is problematic
|
||||
while (ucol_unsafeCP(schar, coll) || UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1))) {
|
||||
// we might have ended here after trying to reorder Thai, but seeing that there are unsafe points
|
||||
// in the backward processing
|
||||
*(UCharOffset) = schar;
|
||||
noChars++;
|
||||
UCharOffset --;
|
||||
|
|
|
@ -72,6 +72,7 @@ void CollationThaiTest::runIndexedTest(int32_t index, UBool exec, const char* &n
|
|||
CASE(1,TestCornerCases)
|
||||
CASE(2,TestNamesList)
|
||||
CASE(3,TestInvalidThai)
|
||||
CASE(4,TestReordering)
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
@ -332,18 +333,18 @@ void CollationThaiTest::TestCornerCases(void) {
|
|||
// Internal utilities
|
||||
//------------------------------------------------------------------------
|
||||
|
||||
void CollationThaiTest::compareArray(const Collator& c, const char* tests[],
|
||||
void CollationThaiTest::compareArray(Collator& c, const char* tests[],
|
||||
int32_t testsLength) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
for (int32_t i = 0; i < testsLength; i += 3) {
|
||||
|
||||
int32_t expect = 0;
|
||||
Collator::EComparisonResult expect;
|
||||
if (tests[i+1][0] == '<') {
|
||||
expect = -1;
|
||||
expect = Collator::LESS;
|
||||
} else if (tests[i+1][0] == '>') {
|
||||
expect = 1;
|
||||
expect = Collator::GREATER;
|
||||
} else if (tests[i+1][0] == '=') {
|
||||
expect = 0;
|
||||
expect = Collator::EQUAL;
|
||||
} else {
|
||||
// expect = Integer.decode(tests[i+1]).intValue();
|
||||
errln((UnicodeString)"Error: unknown operator " + tests[i+1]);
|
||||
|
@ -354,6 +355,8 @@ void CollationThaiTest::compareArray(const Collator& c, const char* tests[],
|
|||
parseChars(s1, tests[i]);
|
||||
parseChars(s2, tests[i+2]);
|
||||
|
||||
doTest(&c, s1, s2, expect);
|
||||
#if 0
|
||||
int32_t result = c.compare(s1, s2);
|
||||
if (sign(result) != sign(expect))
|
||||
{
|
||||
|
@ -395,6 +398,7 @@ void CollationThaiTest::compareArray(const Collator& c, const char* tests[],
|
|||
errln((UnicodeString)" " + prettify(k1, t1) + " vs. " + prettify(k2, t2));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -481,4 +485,54 @@ void CollationThaiTest::TestInvalidThai(void) {
|
|||
delete c;
|
||||
}
|
||||
|
||||
// Exercises collation of strings containing U+0E41 followed by various kinds
// of characters. The `tests` table is a flat list of triples
// (left string, relation, right string) and is handed to compareArray together
// with `coll`, which is declared outside this function. The same six cases are
// repeated four times: bare, with an "abc" prefix, with an "abc" suffix, and
// with both. Afterwards a RuleBasedCollator is built from a rule string; only
// its construction status is checked.
void CollationThaiTest::TestReordering(void) {
|
||||
// Strings are written with doubled backslashes, so the C++ literals contain the
// text "\uXXXX" rather than the character itself.
// NOTE(review): presumably compareArray/parseChars unescapes them - confirm.
const char *tests[] = {
|
||||
"\\u0E41c\\u0301", "=", "\\u0E41\\u0107", // composition
|
||||
"\\u0E41\\uD834\\uDC00", "<", "\\u0E41\\uD834\\uDC01", // supplementaries
|
||||
"\\u0E41\\uD834\\uDD5F", "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary
|
||||
"\\u0E41\\uD87E\\uDC02", "=", "\\u0E41\\u4E41", // supplementary composition decomps to BMP
|
||||
"\\u0E41\\u0301", "=", "\\u0E41\\u0301", // unsafe (just checking backwards iteration)
|
||||
"\\u0E41\\u0301\\u0316", "=", "\\u0E41\\u0316\\u0301",
|
||||
|
||||
// Same cases with an "abc" prefix.
"abc\\u0E41c\\u0301", "=", "abc\\u0E41\\u0107", // composition
|
||||
"abc\\u0E41\\uD834\\uDC00", "<", "abc\\u0E41\\uD834\\uDC01", // supplementaries
|
||||
"abc\\u0E41\\uD834\\uDD5F", "=", "abc\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary
|
||||
"abc\\u0E41\\uD87E\\uDC02", "=", "abc\\u0E41\\u4E41", // supplementary composition decomps to BMP
|
||||
"abc\\u0E41\\u0301", "=", "abc\\u0E41\\u0301", // unsafe (just checking backwards iteration)
|
||||
"abc\\u0E41\\u0301\\u0316", "=", "abc\\u0E41\\u0316\\u0301",
|
||||
|
||||
// Same cases with an "abc" suffix.
"\\u0E41c\\u0301abc", "=", "\\u0E41\\u0107abc", // composition
|
||||
"\\u0E41\\uD834\\uDC00abc", "<", "\\u0E41\\uD834\\uDC01abc", // supplementaries
|
||||
"\\u0E41\\uD834\\uDD5Fabc", "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65abc", // supplementary composition decomps to supplementary
|
||||
"\\u0E41\\uD87E\\uDC02abc", "=", "\\u0E41\\u4E41abc", // supplementary composition decomps to BMP
|
||||
"\\u0E41\\u0301abc", "=", "\\u0E41\\u0301abc", // unsafe (just checking backwards iteration)
|
||||
"\\u0E41\\u0301\\u0316abc", "=", "\\u0E41\\u0316\\u0301abc",
|
||||
|
||||
// Same cases with both an "abc" prefix and an "abc" suffix.
"abc\\u0E41c\\u0301abc", "=", "abc\\u0E41\\u0107abc", // composition
|
||||
"abc\\u0E41\\uD834\\uDC00abc", "<", "abc\\u0E41\\uD834\\uDC01abc", // supplementaries
|
||||
"abc\\u0E41\\uD834\\uDD5Fabc", "=", "abc\\u0E41\\uD834\\uDD58\\uD834\\uDD65abc", // supplementary composition decomps to supplementary
|
||||
"abc\\u0E41\\uD87E\\uDC02abc", "=", "abc\\u0E41\\u4E41abc", // supplementary composition decomps to BMP
|
||||
"abc\\u0E41\\u0301abc", "=", "abc\\u0E41\\u0301abc", // unsafe (just checking backwards iteration)
|
||||
"abc\\u0E41\\u0301\\u0316abc", "=", "abc\\u0E41\\u0316\\u0301abc",
|
||||
};
|
||||
|
||||
// The length argument is the number of strings in the table (three per case),
// not the number of cases.
compareArray(*coll, tests, sizeof(tests)/sizeof(tests[0]));
|
||||
|
||||
// Tailoring rule used to build a second collator below.
const char *rule = "& c < ab";
|
||||
// Only referenced by the commented-out compareArray call further down.
const char *testcontraction[] = { "\\u0E41ab", "<", "\\u0E41c"};
|
||||
UnicodeString rules;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
parseChars(rules, rule);
|
||||
RuleBasedCollator *rcoll = new RuleBasedCollator(rules, status);
|
||||
if(U_SUCCESS(status)) {
|
||||
// The comparison against the tailored collator is currently disabled, so this
// branch only verifies that construction succeeded and then frees the collator.
//compareArray(*rcoll, testcontraction, 3);
|
||||
delete rcoll;
|
||||
} else {
|
||||
// NOTE(review): rcoll is not deleted on this path - confirm whether that leaks.
errln("Couldn't instantiate collator from rules");
|
||||
}
|
||||
//genericRulesStarter(rule, test10, 2);
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif /* #if !UCONFIG_NO_COLLATION */
|
||||
|
|
|
@ -53,9 +53,15 @@ private:
|
|||
* test that invalid Thai sorts properly
|
||||
*/
|
||||
void TestInvalidThai(void);
|
||||
|
||||
/**
|
||||
* test that reordering is done properly
|
||||
*/
|
||||
void TestReordering(void);
|
||||
|
||||
private:
|
||||
|
||||
void compareArray(const Collator& c, const char* tests[],
|
||||
void compareArray(Collator& c, const char* tests[],
|
||||
int32_t testsLength);
|
||||
|
||||
int8_t sign(int32_t i);
|
||||
|
|
Loading…
Add table
Reference in a new issue