From a2605b9c83a0f8a8349c11496e5324238da37964 Mon Sep 17 00:00:00 2001
From: Andy Heninger <andy.heninger@gmail.com>
Date: Fri, 23 Jul 2010 00:15:37 +0000
Subject: [PATCH] ICU-5532 temp fix for crash in RBBI dictionary code with
 UTF-8 text

X-SVN-Rev: 28361
---
 icu4c/source/common/rbbi.cpp           | 24 +++++++++++++
 icu4c/source/test/intltest/rbbitst.cpp | 47 +++++++++++++++++++++++++-
 icu4c/source/test/intltest/rbbitst.h   |  3 +-
 3 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp
index 1a2cd8fc4af..2615a4b32b5 100644
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@@ -1562,6 +1562,30 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
         return (reverse ? startPos : endPos);
     }
     
+    // Bug 5532.  The dictionary code will crash if the input text is UTF-8
+    //      because native indexes are different from UTF-16 indexes.
+    //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
+    //      It won't give the right breaks, but it's better than a crash.
+    //
+    //      Check the type of the UText by checking its pFuncs field, which
+    //      is UText's function dispatch table.  It will be the same for all
+    //      UTF-8 UTexts and different for any other UText type.
+    //
+    //      We have no other type of UText available with non-UTF-16 native indexing.
+    //      This whole check will go away once the dictionary code is fixed.
+    static const void *utext_utf8Funcs;
+    if (utext_utf8Funcs == NULL) {
+        // Cache the UTF-8 UText function pointer value.
+        UErrorCode status = U_ZERO_ERROR;
+        UText tempUText = UTEXT_INITIALIZER; 
+        utext_openUTF8(&tempUText, NULL, 0, &status);
+        utext_utf8Funcs = tempUText.pFuncs;
+        utext_close(&tempUText);
+    }
+    if (fText->pFuncs == utext_utf8Funcs) {
+        return (reverse ? startPos : endPos);
+    }
+
     // Starting from the starting point, scan towards the proposed result,
     // looking for the first dictionary character (which may be the one
     // we're on, if we're starting in the middle of a range).
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index d9284a1838d..f4a01de30ef 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -144,7 +144,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
 #endif
         case 24: name = "TestDictRules";
             if (exec) TestDictRules();                         break;
-
+        case 25: name = "TestBug5532";
+            if (exec) TestBug5532();                           break;
         default: name = ""; break; //needed to end loop
     }
 }
@@ -4697,6 +4698,50 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
 #endif
 }
 
+
+//  Bug 5532.  UTF-8 based UText fails in dictionary code.
+//             This test checks the initial patch,
+//             which is to just keep it from crashing.  Correct word boundaries
+//             await a proper fix to the dictionary code.
+//
+void RBBITest::TestBug5532(void)  {
+   // Smoke test for bug 5532: UTF-8 input mixing Thai and Latin text, ending in a 0x00 byte.
+   const unsigned char utf8Data[] = {
+           0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
+           0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 
+           0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
+           0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
+           0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
+           0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 
+           0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 
+           0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 
+           0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 
+           0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 
+           0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
+
+    UErrorCode status = U_ZERO_ERROR;
+    UText utext=UTEXT_INITIALIZER;
+    utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);  // length -1: string is zero terminated (the trailing 0x00)
+    TEST_ASSERT_SUCCESS(status);
+
+    BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
+    TEST_ASSERT_SUCCESS(status);  // NOTE(review): if this macro only logs on failure, bi may be NULL below -- confirm
+    bi->setText(&utext, status);
+    TEST_ASSERT_SUCCESS(status);
+
+    int32_t breakCount = 0;      // number of boundaries returned by next()
+    int32_t previousBreak = -1;  // last boundary seen; starts below any non-negative boundary
+    for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
+        // For now, just make sure that the break iterator doesn't hang: boundaries must strictly increase.
+        TEST_ASSERT(previousBreak < bi->current());
+        previousBreak = bi->current();
+    }
+    TEST_ASSERT(breakCount > 0);  // at least one boundary must have been reported
+    delete bi;
+    utext_close(&utext);
+}
+
+
 //
 //  TestDebug    -  A place-holder test for debugging purposes.
 //                  For putting in fragments of other tests that can be invoked
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h
index fbdbbf880e4..d46c9b59976 100644
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 1999-2009, International Business Machines
+ * Copyright (c) 1999-2010, International Business Machines
  * Corporation and others. All Rights Reserved.
  *************************************************************************
  *   Date        Name        Description
@@ -71,6 +71,7 @@ public:
     void TestThaiBreaks();
     void TestTailoredBreaks();
     void TestDictRules();
+    void TestBug5532();
 
     void TestDebug();