From a2605b9c83a0f8a8349c11496e5324238da37964 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Fri, 23 Jul 2010 00:15:37 +0000 Subject: [PATCH] ICU-5532 temp fix for crash in RBBI dictionary code with UTF-8 text X-SVN-Rev: 28361 --- icu4c/source/common/rbbi.cpp | 24 +++++++++++++ icu4c/source/test/intltest/rbbitst.cpp | 47 +++++++++++++++++++++++++- icu4c/source/test/intltest/rbbitst.h | 3 +- 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 1a2cd8fc4af..2615a4b32b5 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -1562,6 +1562,30 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, return (reverse ? startPos : endPos); } + // Bug 5532. The dictionary code will crash if the input text is UTF-8 + // because native indexes are different from UTF-16 indexes. + // Temporary hack: skip dictionary lookup for UTF-8 encoded text. + // It won't give the right breaks, but it's better than a crash. + // + // Check the type of the UText by checking its pFuncs field, which + // is UText's function dispatch table. It will be the same for all + // UTF-8 UTexts and different for any other UText type. + // + // We have no other type of UText available with non-UTF-16 native indexing. + // This whole check will go away once the dictionary code is fixed. + static const void *utext_utf8Funcs; + if (utext_utf8Funcs == NULL) { + // Cache the UTF-8 UText function pointer value. + UErrorCode status = U_ZERO_ERROR; + UText tempUText = UTEXT_INITIALIZER; + utext_openUTF8(&tempUText, NULL, 0, &status); + utext_utf8Funcs = tempUText.pFuncs; + utext_close(&tempUText); + } + if (fText->pFuncs == utext_utf8Funcs) { + return (reverse ? startPos : endPos); + } + // Starting from the starting point, scan towards the proposed result, // looking for the first dictionary character (which may be the one // we're on, if we're starting in the middle of a range). 
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index d9284a1838d..f4a01de30ef 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -144,7 +144,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
 #endif
         case 24: name = "TestDictRules";
             if (exec) TestDictRules(); break;
-
+        case 25: name = "TestBug5532";
+            if (exec) TestBug5532(); break;
         default: name = ""; break; //needed to end loop
     }
 }
@@ -4697,6 +4698,50 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
 #endif
 }
 
+
+// Bug 5532. UTF-8 based UText fails in dictionary code.
+// This test checks the initial patch, which is to just keep it from
+// crashing: iteration must yield at least one break and every break must
+// lie past the previous one. Correct word boundaries await a proper fix
+// to the dictionary code.
+void RBBITest::TestBug5532(void) {
+    // Text includes a mixture of Thai and Latin; NUL-terminated (length is passed as -1).
+    const unsigned char utf8Data[] = {
+        0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
+        0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
+        0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
+        0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
+        0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
+        0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
+        0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
+        0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
+        0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
+        0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
+        0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
+
+    UErrorCode status = U_ZERO_ERROR;
+    UText utext=UTEXT_INITIALIZER;
+    utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
+    TEST_ASSERT_SUCCESS(status);
+
+    BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
+    TEST_ASSERT_SUCCESS(status);
+    if (U_SUCCESS(status)) {    // On failure there is no usable iterator; go straight to cleanup.
+        bi->setText(&utext, status);
+        TEST_ASSERT_SUCCESS(status);
+        int32_t breakCount = 0;
+        int32_t previousBreak = -1;
+        for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
+            // For now, just make sure that the break iterator doesn't hang.
+            TEST_ASSERT(previousBreak < bi->current());
+            previousBreak = bi->current();
+        }
+        TEST_ASSERT(breakCount > 0);
+    }
+    delete bi;
+    utext_close(&utext);
+}
+
 //
 // TestDebug - A place-holder test for debugging purposes.
 // For putting in fragments of other tests that can be invoked
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h
index fbdbbf880e4..d46c9b59976 100644
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 1999-2009, International Business Machines
+ * Copyright (c) 1999-2010, International Business Machines
  * Corporation and others. All Rights Reserved.
  *************************************************************************
  * Date Name Description
@@ -71,6 +71,7 @@ public:
     void TestThaiBreaks();
     void TestTailoredBreaks();
     void TestDictRules();
+    void TestBug5532();
 
     void TestDebug();
 