From ca5d005978cfe6d6793e3668f71d4e160a69cdbc Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Tue, 10 Jul 2007 01:25:26 +0000 Subject: [PATCH] ICU-5766 Extended Grapheme Clusters for ICU4C X-SVN-Rev: 21933 --- icu4c/source/common/brkiter.cpp | 12 ++++++++++++ icu4c/source/common/unicode/brkiter.h | 16 ++++++++++++++++ icu4c/source/common/unicode/ubrk.h | 4 +++- icu4c/source/data/brkitr/root.txt | 3 ++- icu4c/source/data/xml/brkitr/root.xml | 3 ++- icu4c/source/test/intltest/rbbiapts.cpp | 7 +++++++ icu4c/source/test/intltest/rbbitst.cpp | 7 +++++++ icu4c/source/test/testdata/rbbitst.txt | 24 +++++++++++++++++++++++- 8 files changed, 72 insertions(+), 4 deletions(-) diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp index 70bffcc1695..7a15ee79a6c 100644 --- a/icu4c/source/common/brkiter.cpp +++ b/icu4c/source/common/brkiter.cpp @@ -189,6 +189,15 @@ BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status) // ------------------------------------- +// Creates a break iterator for Extended Grapheme Cluster breaks. +BreakIterator* U_EXPORT2 +BreakIterator::createXGraphemeClusterInstance(const Locale& key, UErrorCode& status) +{ + return createInstance(key, UBRK_X_GRAPHEME_CLUSTER, status); +} + +// ------------------------------------- + // Gets all the available locales that has localized text boundary data. 
const Locale* U_EXPORT2 BreakIterator::getAvailableLocales(int32_t& count) @@ -424,6 +433,9 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) case UBRK_TITLE: result = BreakIterator::buildInstance(loc, "title", kind, status); break; + case UBRK_X_GRAPHEME_CLUSTER: + result = BreakIterator::buildInstance(loc, "xgc", kind, status); + break; default: status = U_ILLEGAL_ARGUMENT_ERROR; } diff --git a/icu4c/source/common/unicode/brkiter.h b/icu4c/source/common/unicode/brkiter.h index c4ed3d2eb27..63fcf7562a1 100644 --- a/icu4c/source/common/unicode/brkiter.h +++ b/icu4c/source/common/unicode/brkiter.h @@ -397,6 +397,22 @@ public: static BreakIterator* U_EXPORT2 createTitleInstance(const Locale& where, UErrorCode& status); + /** + * Create BreakIterator for Extended Grapheme Clusters using specified locale. + * Returns an instance of a BreakIterator for locating XGC boundaries. + * Extended Grapheme Clusters are combining character sequences and other + * sequences that should remain unbroken when iterating over + * "characters" from a user perspective. + * @param loc the locale. + * @param status Receive information regarding any errors or warnings that + * occurred in creating the break iterator. + * @return A BreakIterator for Extended Grapheme Clusters. + * The caller owns the returned object and is responsible for deleting it. + * @draft ICU 3.8 + */ + static BreakIterator* U_EXPORT2 + createXGraphemeClusterInstance(const Locale& loc, UErrorCode& status); + /** * Get the set of Locales for which TextBoundaries are installed. *

Note: this will not return locales added through the register diff --git a/icu4c/source/common/unicode/ubrk.h b/icu4c/source/common/unicode/ubrk.h index 338f0b8f258..abd3ab4fbf3 100644 --- a/icu4c/source/common/unicode/ubrk.h +++ b/icu4c/source/common/unicode/ubrk.h @@ -105,7 +105,9 @@ typedef enum UBreakIteratorType { */ UBRK_TITLE = 4, #endif /* U_HIDE_DEPRECATED_API */ - UBRK_COUNT = 5 + /** Extended Grapheme Cluster breaks @draft ICU 3.8 */ + UBRK_X_GRAPHEME_CLUSTER=6, + UBRK_COUNT = 6 } UBreakIteratorType; /** Value indicating all text boundaries have been returned. diff --git a/icu4c/source/data/brkitr/root.txt b/icu4c/source/data/brkitr/root.txt index 350b0e64ab3..113afbcb993 100644 --- a/icu4c/source/data/brkitr/root.txt +++ b/icu4c/source/data/brkitr/root.txt @@ -1,4 +1,4 @@ -// *************************************************************************** +// *************************************************************************** // * // * Copyright (C) 2007 International Business Machines // * Corporation and others. All Rights Reserved. 
@@ -14,6 +14,7 @@ root{ sentence:process(dependency){"sent.brk"} title:process(dependency){"title.brk"} word:process(dependency){"word.brk"} + xgc:process(dependency){"xgc.brk"} } dictionaries{ Thai:process(dependency){"thaidict.ctd"} diff --git a/icu4c/source/data/xml/brkitr/root.xml b/icu4c/source/data/xml/brkitr/root.xml index e1199bd97a6..0ca2c358ab2 100644 --- a/icu4c/source/data/xml/brkitr/root.xml +++ b/icu4c/source/data/xml/brkitr/root.xml @@ -1,6 +1,6 @@ + diff --git a/icu4c/source/test/intltest/rbbiapts.cpp b/icu4c/source/test/intltest/rbbiapts.cpp index 54f4db99cf3..a135fbc0efe 100644 --- a/icu4c/source/test/intltest/rbbiapts.cpp +++ b/icu4c/source/test/intltest/rbbiapts.cpp @@ -411,6 +411,13 @@ void RBBIAPITest::TestIteration() } delete bi; + status=U_ZERO_ERROR; + bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator:: + createXGraphemeClusterInstance(Locale::getDefault(), status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(bi != NULL); + delete bi; + status=U_ZERO_ERROR; bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status); if (U_FAILURE(status) || bi == NULL) { diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index a3dd0190a64..022c39c0a53 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -1502,6 +1502,13 @@ void RBBITest::TestExtended() { charIdx += 6; break; } + if (testString.compare(charIdx-1, 5, "<xgc>") == 0) { + delete tp.bi; + tp.bi = BreakIterator::createXGraphemeClusterInstance(locale, status); + charIdx += 4; + break; + } + // localeMatcher.reset(testString); if (localeMatcher.lookingAt(charIdx-1, status)) { diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index d336c674e81..eddee5da64b 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2001-2006 International Business
Machines +# Copyright (c) 2001-2006 International Business Machines # Corporation and others. All Rights Reserved. # # RBBI Test Data @@ -91,6 +91,28 @@ # Treat Japanese Half Width voicing marks as combining •A\uff9e•B\uff9f\uff9e\uff9f•C• +######################################################################################## +# +# +# Extended G r a p h e m e C l u s t e r T e s t s +# +# +########################################################################################## + + +# Plain Vanilla grapheme clusters +•a•b•c• +•a\u0301\u0302• •b\u0303\u0304• + +# Assorted Hindi combining marks +•\u0904\u0903• •\u0937\u093E• •\u0904\u093F• •\u0937\u0940• •\u0937\u0949• •\u0937\u094A• •\u0937\u094B• •\u0937\u094C• + +# Thai Clusters +# $Prepend $Extend* $PrependBase $Extend*; +# +•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• • + + ######################################################################################## # #