mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-5532 temp fix for crash in RBBI dictionary code with UTF-8 text
X-SVN-Rev: 28361
This commit is contained in:
parent
fa05e3a3d3
commit
a2605b9c83
3 changed files with 72 additions and 2 deletions
|
@ -1562,6 +1562,30 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
|
|||
return (reverse ? startPos : endPos);
|
||||
}
|
||||
|
||||
// Bug 5532. The dictionary code will crash if the input text is UTF-8
|
||||
// because native indexes are different from UTF-16 indexes.
|
||||
// Temporary hack: skip dictionary lookup for UTF-8 encoded text.
|
||||
    // It won't give the right breaks, but it's better than a crash.
|
||||
//
|
||||
// Check the type of the UText by checking its pFuncs field, which
|
||||
// is UText's function dispatch table. It will be the same for all
|
||||
// UTF-8 UTexts and different for any other UText type.
|
||||
//
|
||||
// We have no other type of UText available with non-UTF-16 native indexing.
|
||||
// This whole check will go away once the dictionary code is fixed.
|
||||
static const void *utext_utf8Funcs;
|
||||
if (utext_utf8Funcs == NULL) {
|
||||
// Cache the UTF-8 UText function pointer value.
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UText tempUText = UTEXT_INITIALIZER;
|
||||
utext_openUTF8(&tempUText, NULL, 0, &status);
|
||||
utext_utf8Funcs = tempUText.pFuncs;
|
||||
utext_close(&tempUText);
|
||||
}
|
||||
if (fText->pFuncs == utext_utf8Funcs) {
|
||||
return (reverse ? startPos : endPos);
|
||||
}
|
||||
|
||||
// Starting from the starting point, scan towards the proposed result,
|
||||
// looking for the first dictionary character (which may be the one
|
||||
// we're on, if we're starting in the middle of a range).
|
||||
|
|
|
@ -144,7 +144,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
#endif
|
||||
case 24: name = "TestDictRules";
|
||||
if (exec) TestDictRules(); break;
|
||||
|
||||
case 25: name = "TestBug5532";
|
||||
if (exec) TestBug5532(); break;
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
}
|
||||
|
@ -4697,6 +4698,50 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
|
|||
#endif
|
||||
}
|
||||
|
||||
|
||||
// Bug 5532. UTF-8 based UText fails in dictionary code.
|
||||
// This test checks the initial patch,
|
||||
// which is to just keep it from crashing. Correct word boundaries
|
||||
// await a proper fix to the dictionary code.
|
||||
//
|
||||
void RBBITest::TestBug5532(void) {
|
||||
// Text includes a mixture of Thai and Latin.
|
||||
const unsigned char utf8Data[] = {
|
||||
0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
|
||||
0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
|
||||
0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
|
||||
0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
|
||||
0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
|
||||
0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
|
||||
0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
|
||||
0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
|
||||
0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
|
||||
0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
|
||||
0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UText utext=UTEXT_INITIALIZER;
|
||||
utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
bi->setText(&utext, status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
int32_t breakCount = 0;
|
||||
int32_t previousBreak = -1;
|
||||
for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
|
||||
// For now, just make sure that the break iterator doesn't hang.
|
||||
TEST_ASSERT(previousBreak < bi->current());
|
||||
previousBreak = bi->current();
|
||||
}
|
||||
TEST_ASSERT(breakCount > 0);
|
||||
delete bi;
|
||||
utext_close(&utext);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// TestDebug - A place-holder test for debugging purposes.
|
||||
// For putting in fragments of other tests that can be invoked
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*************************************************************************
|
||||
* Copyright (c) 1999-2009, International Business Machines
|
||||
* Copyright (c) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*************************************************************************
|
||||
* Date Name Description
|
||||
|
@ -71,6 +71,7 @@ public:
|
|||
void TestThaiBreaks();
|
||||
void TestTailoredBreaks();
|
||||
void TestDictRules();
|
||||
void TestBug5532();
|
||||
|
||||
void TestDebug();
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue