diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp
index dbeab0818a8..35c58053d95 100644
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@@ -486,6 +486,47 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
 }
+/**
+ * Provide a new UText for the input text. Must reference text with contents identical
+ * to the original.
+ * Intended for use with text data originating in Java (garbage collected) environments
+ * where the data may be moved in memory at arbitrary times.
+ *
+ * @param input  UText giving access to the text at its new location. Must not be NULL.
+ * @param status In/out error code. Nothing is done if it already indicates a failure.
+ *               Set to U_ILLEGAL_ARGUMENT_ERROR if input is NULL, or if the previous
+ *               native index cannot be restored in the new text.
+ * @return       *this
+ */
+RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    if (input == NULL) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    // Capture the native index of the current text before it is replaced.
+    int64_t pos = utext_getNativeIndex(fText);
+    // Shallow read-only clone of the new UText into the existing input UText.
+    // utext_clone() is documented to return NULL when it fails, so fText is only
+    // replaced once the clone has succeeded; a failure leaves the pointer intact.
+    UText *refreshed = utext_clone(fText, input, FALSE, TRUE, &status);
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    fText = refreshed;
+    utext_setNativeIndex(fText, pos);
+    if (utext_getNativeIndex(fText) != pos) {
+        // Sanity check. The new input utext is supposed to have the exact same
+        // contents as the old. If we can't set to the same position, it doesn't.
+        // The contents underlying the old utext might be invalid at this point,
+        // so it's not safe to check directly.
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+    }
+    return *this;
+}
+
 /**
  * Sets the current iteration position to the beginning of the text.
diff --git a/icu4c/source/common/ubrk.cpp b/icu4c/source/common/ubrk.cpp
index 944708ab8db..f6a70efeeae 100644
--- a/icu4c/source/common/ubrk.cpp
+++ b/icu4c/source/common/ubrk.cpp
@@ -1,6 +1,6 @@
 /*
 ********************************************************************************
-* Copyright (C) 1996-2008, International Business Machines
+* Copyright (C) 1996-2011, International Business Machines
 * Corporation and others. All Rights Reserved.
********************************************************************************
 */
@@ -290,4 +290,20 @@ ubrk_getLocaleByType(const UBreakIterator *bi,
 }
+/*
+ * C API wrapper around BreakIterator::refreshInputText().
+ * The UBreakIterator handle is treated as a pointer to the underlying C++
+ * BreakIterator object; errors are reported through *status.
+ */
+U_CAPI void U_EXPORT2
+ubrk_refreshUText(UBreakIterator *bi,
+                  UText *text,
+                  UErrorCode *status)
+{
+    BreakIterator *bii = reinterpret_cast<BreakIterator *>(bi);
+    bii->refreshInputText(text, *status);
+}
+
+
+
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/common/unicode/brkiter.h b/icu4c/source/common/unicode/brkiter.h
index bdd3cc700f4..6cae690e66a 100644
--- a/icu4c/source/common/unicode/brkiter.h
+++ b/icu4c/source/common/unicode/brkiter.h
@@ -1,6 +1,6 @@
 /*
 ********************************************************************************
-* Copyright (C) 1997-2010, International Business Machines
+* Copyright (C) 1997-2011, International Business Machines
 * Corporation and others. All Rights Reserved.
 ********************************************************************************
 *
@@ -514,6 +514,33 @@ public:
      */
     const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
+    /**
+     * Set the subject text string upon which the break iterator is operating
+     * without changing any other aspect of the matching state.
+     * The new and previous text strings must have the same content.
+     *
+     * This function is intended for use in environments where ICU is operating on
+     * strings that may move around in memory. It provides a mechanism for notifying
+     * ICU that the string has been relocated, and providing a new UText to access the
+     * string in its new position.
+     *
+     * Note that the break iterator implementation never copies the underlying text
+     * of a string being processed, but always operates directly on the original text
+     * provided by the user. Refreshing simply drops the references to the old text
+     * and replaces them with references to the new.
+     *
+     * Caution: this function is normally used only by very specialized,
+     * system-level code. One example use case is with garbage collection that moves
+     * the text in memory.
+     *
+     * @param input The new (moved) text string.
+     * @param status Receives errors detected by this function.
+     * @return *this
+     *
+     * @draft ICU 5.0
+     */
+    virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
+
 private:
     static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
     static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h
index f93b57766e9..529a5897fda 100644
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@@ -633,6 +633,35 @@ public:
      */
     virtual const uint8_t *getBinaryRules(uint32_t &length);
+    /**
+     * Set the subject text string upon which the break iterator is operating
+     * without changing any other aspect of the matching state.
+     * The new and previous text strings must have the same content.
+     *
+     * This function is intended for use in environments where ICU is operating on
+     * strings that may move around in memory. It provides a mechanism for notifying
+     * ICU that the string has been relocated, and providing a new UText to access the
+     * string in its new position.
+     *
+     * Note that the break iterator implementation never copies the underlying text
+     * of a string being processed, but always operates directly on the original text
+     * provided by the user. Refreshing simply drops the references to the old text
+     * and replaces them with references to the new.
+     *
+     * Caution: this function is normally used only by very specialized,
+     * system-level code. One example use case is with garbage collection that moves
+     * the text in memory.
+     *
+     * @param input The new (moved) text string. Must not be NULL.
+     * @param status Receives errors detected by this function.
+     *               Set to U_ILLEGAL_ARGUMENT_ERROR if input is NULL or if the
+     *               current position cannot be re-established in the new text.
+     * @return *this
+     *
+     * @draft ICU 5.0
+     */
+    virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);
+
 protected:
     //=======================================================================
diff --git a/icu4c/source/common/unicode/ubrk.h b/icu4c/source/common/unicode/ubrk.h
index f8304a6628d..c473fffa7d0 100644
--- a/icu4c/source/common/unicode/ubrk.h
+++ b/icu4c/source/common/unicode/ubrk.h
@@ -1,6 +1,6 @@
 /*
 ******************************************************************************
-* Copyright (C) 1996-2010, International Business Machines Corporation and others.
+* Copyright (C) 1996-2011, International Business Machines Corporation and others.
 * All Rights Reserved.
 ******************************************************************************
 */
@@ -496,6 +496,37 @@ U_STABLE const char* U_EXPORT2 ubrk_getLocaleByType(const UBreakIterator *bi,
                      ULocDataLocaleType type,
                      UErrorCode* status);
+/**
+ * Set the subject text string upon which the break iterator is operating
+ * without changing any other aspect of the state.
+ * The new and previous text strings must have the same content.
+ *
+ * This function is intended for use in environments where ICU is operating on
+ * strings that may move around in memory. It provides a mechanism for notifying
+ * ICU that the string has been relocated, and providing a new UText to access the
+ * string in its new position.
+ *
+ * Note that the break iterator never copies the underlying text
+ * of a string being processed, but always operates directly on the original text
+ * provided by the user. Refreshing simply drops the references to the old text
+ * and replaces them with references to the new.
+ *
+ * Caution: this function is normally used only by very specialized
+ * system-level code. One example use case is with garbage collection
+ * that moves the text in memory.
+ *
+ * @param bi The break iterator.
+ * @param text The new (moved) text string.
+ * @param status Receives errors detected by this function.
+ *
+ * @draft ICU 5.0
+ */
+U_DRAFT void U_EXPORT2
+ubrk_refreshUText(UBreakIterator *bi,
+                  UText *text,
+                  UErrorCode *status);
+
+
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
 #endif
diff --git a/icu4c/source/test/cintltst/cbiapts.c b/icu4c/source/test/cintltst/cbiapts.c
index 8792cc4a65f..886c3bfae2e 100644
--- a/icu4c/source/test/cintltst/cbiapts.c
+++ b/icu4c/source/test/cintltst/cbiapts.c
@@ -44,6 +44,7 @@ static void TestBreakIteratorRuleError(void);
 static void TestBreakIteratorStatusVec(void);
 static void TestBreakIteratorUText(void);
 static void TestBreakIteratorTailoring(void);
+static void TestBreakIteratorRefresh(void);
 void addBrkIterAPITest(TestNode** root);
@@ -58,6 +59,7 @@ void addBrkIterAPITest(TestNode** root)
     addTest(root, &TestBreakIteratorRuleError, "tstxtbd/cbiapts/TestBreakIteratorRuleError");
     addTest(root, &TestBreakIteratorStatusVec, "tstxtbd/cbiapts/TestBreakIteratorStatusVec");
     addTest(root, &TestBreakIteratorTailoring, "tstxtbd/cbiapts/TestBreakIteratorTailoring");
+    addTest(root, &TestBreakIteratorRefresh, "tstxtbd/cbiapts/TestBreakIteratorRefresh");
 }
 #define CLONETEST_ITERATOR_COUNT 2
@@ -823,4 +825,56 @@ static void TestBreakIteratorTailoring(void) {
     }
 }
+
+static void TestBreakIteratorRefresh(void) {
+    /*
+     * RefreshInput changes out the input of a Break Iterator without
+     * changing anything else in the iterator's state. Used with Java JNI,
+     * when Java moves the underlying string storage. This test
+     * runs a ubrk_next() repeatedly, moving the text in the middle of the sequence.
+     * The right set of boundaries should still be found.
+     */
+    UChar testStr[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */
+    UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0};
+    UErrorCode status = U_ZERO_ERROR;
+    UBreakIterator *bi;
+    UText ut1 = UTEXT_INITIALIZER;
+    UText ut2 = UTEXT_INITIALIZER;
+
+    bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status);
+    TEST_ASSERT_SUCCESS(status);
+    if (U_FAILURE(status)) {
+        /* No iterator to test with; the calls below need a valid one. */
+        return;
+    }
+
+    utext_openUChars(&ut1, testStr, -1, &status);
+    TEST_ASSERT_SUCCESS(status);
+    ubrk_setUText(bi, &ut1, &status);
+    TEST_ASSERT_SUCCESS(status);
+
+    /* Line boundaries will occur before each letter in the original string */
+    TEST_ASSERT(1 == ubrk_next(bi));
+    TEST_ASSERT(3 == ubrk_next(bi));
+
+    /* Move the string, kill the original string. */
+    u_strcpy(movedStr, testStr);
+    u_memset(testStr, 0x20, u_strlen(testStr));
+    utext_openUChars(&ut2, movedStr, -1, &status);
+    TEST_ASSERT_SUCCESS(status);
+    ubrk_refreshUText(bi, &ut2, &status);
+    TEST_ASSERT_SUCCESS(status);
+
+    /* Find the following matches, now working in the moved string. */
+    TEST_ASSERT(5 == ubrk_next(bi));
+    TEST_ASSERT(7 == ubrk_next(bi));
+    TEST_ASSERT(8 == ubrk_next(bi));
+    TEST_ASSERT(UBRK_DONE == ubrk_next(bi));
+    TEST_ASSERT_SUCCESS(status);
+
+    ubrk_close(bi);
+    utext_close(&ut1);
+    utext_close(&ut2);
+}
+
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/test/intltest/rbbiapts.cpp b/icu4c/source/test/intltest/rbbiapts.cpp
index 89afdb6dd76..cb40076b4b8 100644
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@@ -1122,6 +1122,59 @@ void RBBIAPITest::TestCreateFromRBBIData() {
     }
 }
+
+void RBBIAPITest::TestRefreshInputText() {
+    /*
+     * RefreshInput changes out the input of a Break Iterator without
+     * changing anything else in the iterator's state. Used with Java JNI,
+     * when Java moves the underlying string storage. This test
+     * runs BreakIterator::next() repeatedly, moving the text in the middle of the sequence.
+     * The right set of boundaries should still be found.
+     */
+    UChar testStr[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */
+    UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0};
+    UErrorCode status = U_ZERO_ERROR;
+    UText ut1 = UTEXT_INITIALIZER;
+    UText ut2 = UTEXT_INITIALIZER;
+    RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
+    TEST_ASSERT_SUCCESS(status);
+    if (U_FAILURE(status)) {
+        /* No iterator to test with; the calls below would dereference it. */
+        delete bi;
+        return;
+    }
+
+    utext_openUChars(&ut1, testStr, -1, &status);
+    TEST_ASSERT_SUCCESS(status);
+    bi->setText(&ut1, status);
+    TEST_ASSERT_SUCCESS(status);
+
+    /* Line boundaries will occur before each letter in the original string */
+    TEST_ASSERT(1 == bi->next());
+    TEST_ASSERT(3 == bi->next());
+
+    /* Move the string, kill the original string. */
+    u_strcpy(movedStr, testStr);
+    u_memset(testStr, 0x20, u_strlen(testStr));
+    utext_openUChars(&ut2, movedStr, -1, &status);
+    TEST_ASSERT_SUCCESS(status);
+    RuleBasedBreakIterator *returnedBI = &bi->refreshInputText(&ut2, status);
+    TEST_ASSERT_SUCCESS(status);
+    TEST_ASSERT(bi == returnedBI);
+
+    /* Find the following matches, now working in the moved string. */
+    TEST_ASSERT(5 == bi->next());
+    TEST_ASSERT(7 == bi->next());
+    TEST_ASSERT(8 == bi->next());
+    TEST_ASSERT(UBRK_DONE == bi->next());
+
+    delete bi;
+    utext_close(&ut1);
+    utext_close(&ut2);
+
+}
+
+
 //---------------------------------------------
 // runIndexedTest
 //---------------------------------------------
@@ -1153,6 +1206,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
 #else
         case 9: case 10: case 11: case 12: case 13: name = "skip"; break;
 #endif
+        case 14: name = "TestRefreshInputText"; if (exec) TestRefreshInputText(); break;
         default: name = ""; break; // needed to end loop
     }
diff --git a/icu4c/source/test/intltest/rbbiapts.h b/icu4c/source/test/intltest/rbbiapts.h
index 0ce64ac3bad..e31b1da1903 100644
--- a/icu4c/source/test/intltest/rbbiapts.h
+++ b/icu4c/source/test/intltest/rbbiapts.h
@@ -1,6 +1,6 @@
 /********************************************************************
  * COPYRIGHT:
- * Copyright (c) 1999-2004,2008 International Business Machines Corporation and
+ * Copyright (c) 1999-2011 International Business Machines Corporation and
  * others. All Rights Reserved.
  ********************************************************************/
 /************************************************************************
@@ -86,6 +86,8 @@ public:
     void TestRegistration();
+    void TestRefreshInputText();
+
     /**
      *Internal subroutines
      **/