From 225c380bde7d4fda7f082f12da471b8c1eb45ee4 Mon Sep 17 00:00:00 2001
From: Deborah Goldsmith
Date: Thu, 21 Oct 2004 01:03:01 +0000
Subject: [PATCH] ICU-3561 Locale-based text boundaries

X-SVN-Rev: 16582
---
 icu4c/source/common/brkiter.cpp           | 316 ++++++++--------------
 icu4c/source/common/unicode/brkiter.h     |   7 +-
 icu4c/source/data/brkitr/brkfiles.mk      |   2 +-
 icu4c/source/data/brkitr/word_POSIX.txt   | 243 +++++++++++++++++
 icu4c/source/data/brkitr/word_ja.txt      | 255 ++++++++++++++++++
 icu4c/source/data/locales/en_US_POSIX.txt |   3 +
 icu4c/source/data/locales/ja.txt          |   3 +
 icu4c/source/data/locales/root.txt        |   8 +
 icu4c/source/data/locales/th.txt          |   4 +
 icu4c/source/test/intltest/citrtest.cpp   |   1 +
 icu4c/source/test/intltest/rbbitst.cpp    |  27 ++
 icu4c/source/test/intltest/rbbitst.h      |   1 +
 12 files changed, 667 insertions(+), 203 deletions(-)
 create mode 100644 icu4c/source/data/brkitr/word_POSIX.txt
 create mode 100644 icu4c/source/data/brkitr/word_ja.txt

diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp
index 2663de273fe..ef76634270f 100644
--- a/icu4c/source/common/brkiter.cpp
+++ b/icu4c/source/common/brkiter.cpp
@@ -31,6 +31,7 @@
 #include "mutex.h"
 #include "iculserv.h"
 #include "locbased.h"
+#include "uresimp.h"
 
 // *****************************************************************************
 // class BreakIterator
@@ -45,6 +46,120 @@ const int32_t BreakIterator::DONE = (int32_t)-1;
 
 // -------------------------------------
 
+// Builds the break iterator of the given kind for a locale. The rules file
+// name is read from boundaries/<type> in the locale's resource bundle (looked
+// up with fallback); when dict is TRUE and the bundle also has a
+// BreakDictionaryData entry, a DictionaryBasedBreakIterator is created.
+// On failure status is set and NULL is returned.
+BreakIterator*
+BreakIterator::buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode &status)
+{
+    char fnbuff[256];
+    char actualLocale[ULOC_FULLNAME_CAPACITY];
+    int32_t size = 0;
+    const UChar* brkfname = NULL;
+    UResourceBundle brkrules, brkname;
+    BreakIterator *result = NULL;
+
+    if (U_FAILURE(status))
+        return NULL;
+
+    // Initialize everything that is used or closed unconditionally below, so
+    // that a failure in any lookup step never leaves it indeterminate.
+    ures_initStackObject(&brkrules);
+    ures_initStackObject(&brkname);
+    fnbuff[0] = '\0';
+    actualLocale[0] = '\0';
+
+    // Get the locale
+    UResourceBundle *b = ures_open(NULL, loc.getName(), &status);
+
+    // Get the "boundaries" array.
+    if (U_SUCCESS(status)) {
+        (void) ures_getByKeyWithFallback(b, "boundaries", &brkrules, &status);
+    }
+
+    // Get the string object naming the rules file
+    if (U_SUCCESS(status)) {
+        (void) ures_getByKeyWithFallback(&brkrules, type, &brkname, &status);
+    }
+
+    // Get the actual string; it has to fit in fnbuff with its terminator
+    if (U_SUCCESS(status)) {
+        brkfname = ures_getString(&brkname, &size, &status);
+    }
+    if (U_SUCCESS(status) && (size < 0 || (size_t)size >= sizeof(fnbuff))) {
+        status = U_BUFFER_OVERFLOW_ERROR;
+    }
+
+    // Remember which locale's bundle actually supplied the rules
+    if (U_SUCCESS(status)) {
+        const char *actual = ures_getLocale(&brkname, &status);
+        if (U_SUCCESS(status) && actual != NULL) {
+            // strncpy does not terminate when the source fills the buffer
+            uprv_strncpy(actualLocale, actual, sizeof(actualLocale) - 1);
+            actualLocale[sizeof(actualLocale) - 1] = '\0';
+        }
+    }
+
+    // Use the string if we found it
+    if (U_SUCCESS(status)) {
+        u_UCharsToChars(brkfname, fnbuff, size);
+        fnbuff[size] = '\0';
+    }
+
+    ures_close(&brkrules);
+    ures_close(&brkname);
+
+    UDataMemory* file = udata_open(NULL, "brk", fnbuff, &status);
+    if (U_FAILURE(status)) {
+        ures_close(b);
+        return NULL;
+    }
+
+    // We found the break rules; now see if a dictionary is needed.
+    // Only the presence of BreakDictionaryData is tested here; the name of
+    // the dictionary file itself is hard-coded.
+    if (dict)
+    {
+        UErrorCode localStatus = U_ZERO_ERROR;
+        ures_initStackObject(&brkname);
+        (void) ures_getByKeyWithFallback(b, "BreakDictionaryData", &brkname, &localStatus);
+        if (U_SUCCESS(localStatus)) {
+            result = new DictionaryBasedBreakIterator(file, "thaidict.brk", status);
+        }
+        ures_close(&brkname);
+    }
+
+    // If there is still no result but we haven't had an error, no dictionary,
+    // so make a non-dictionary break iterator
+    if (U_SUCCESS(status) && result == NULL) {
+        result = new RuleBasedBreakIterator(file, status);
+    }
+
+    // If there is a result, set the valid locale and actual locale
+    if (U_SUCCESS(status) && result != NULL) {
+        U_LOCALE_BASED(locBased, *result);
+        locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), actualLocale);
+    }
+
+    ures_close(b);
+
+    if (U_FAILURE(status) && result != NULL) {        // Sometimes redundant check, but simple
+        delete result;
+        return NULL;
+    }
+
+    if (result == NULL) {
+        udata_close(file);
+        if (U_SUCCESS(status)) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+
+    return result;
+}
+
 // Creates a break iterator for word breaks.
 BreakIterator* U_EXPORT2
 BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
@@ -52,50 +167,6 @@ BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
     return createInstance(key, UBRK_WORD, status);
 }
 
-BreakIterator*
-BreakIterator::makeWordInstance(const Locale& key, UErrorCode& status)
-{
-    // WARNING: This routine is currently written specifically to handle only the
-    // default rules files and the alternate rules files for Thai. This function
-    // will have to be made fully general at some time in the future!
-    BreakIterator* result = NULL;
-    const char* filename = "word";
-
-    if (U_FAILURE(status))
-        return NULL;
-
-    if (!uprv_strcmp(key.getLanguage(), "th"))
-    {
-        filename = "word_th";
-    }
-
-    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
-    if (U_FAILURE(status)) {
-        return NULL;
-    }
-    // The UDataMemory is adopted by the break iterator.
-
-    if(!uprv_strcmp(filename, "word_th")) {
-        filename = "thaidict.brk";
-        result = new DictionaryBasedBreakIterator(file, filename, status);
-    }
-    else {
-        result = new RuleBasedBreakIterator(file, status);
-    }
-    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
-        if (result != NULL) {
-            delete result;
-        }
-        return NULL;
-    }
-    if (result == NULL) {
-        udata_close(file);
-        status = U_MEMORY_ALLOCATION_ERROR;
-    }
-
-    return result;
-}
-
 // -------------------------------------
 
 // Creates a break iterator for line breaks.
@@ -105,49 +161,6 @@ BreakIterator::createLineInstance(const Locale& key, UErrorCode& status) return createInstance(key, UBRK_LINE, status); } -BreakIterator* -BreakIterator::makeLineInstance(const Locale& key, UErrorCode& status) -{ - // WARNING: This routine is currently written specifically to handle only the - // default rules files and the alternate rules files for Thai. This function - // will have to be made fully general at some time in the future! - BreakIterator* result = NULL; - const char* filename = "line"; - - if (U_FAILURE(status)) - return NULL; - - if (!uprv_strcmp(key.getLanguage(), "th")) - { - filename = "line_th"; - } - - UDataMemory* file = udata_open(NULL, "brk", filename, &status); - if (U_FAILURE(status)) { - return NULL; - } - // The UDataMemory is adopted by the break iterator. - - if (!uprv_strcmp(key.getLanguage(), "th")) { - filename = "thaidict.brk"; - result = new DictionaryBasedBreakIterator(file, filename, status); - } - else { - result = new RuleBasedBreakIterator(file, status); - } - if (U_FAILURE(status)) { // Sometimes redundant check, but simple. - if (result != NULL) { - delete result; - } - return NULL; - } - if (result == NULL) { - udata_close(file); - status = U_MEMORY_ALLOCATION_ERROR; - } - return result; -} - // ------------------------------------- // Creates a break iterator for character breaks. @@ -157,38 +170,6 @@ BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status) return createInstance(key, UBRK_CHARACTER, status); } -BreakIterator* -BreakIterator::makeCharacterInstance(const Locale& /* key */, UErrorCode& status) -{ - // WARNING: This routine is currently written specifically to handle only the - // default rules files and the alternate rules files for Thai. This function - // will have to be made fully general at some time in the future! 
- BreakIterator* result = NULL; - static const char filename[] = "char"; - - if (U_FAILURE(status)) - return NULL; - UDataMemory* file = udata_open(NULL, "brk", filename, &status); - if (U_FAILURE(status)) { - return NULL; - } - // The UDataMemory is adopted by the break iterator. - - result = new RuleBasedBreakIterator(file, status); - - if (U_FAILURE(status)) { // Sometimes redundant check, but simple. - if (result != NULL) { - delete result; - } - return NULL; - } - if (result == NULL) { - udata_close(file); - status = U_MEMORY_ALLOCATION_ERROR; - } - return result; -} - // ------------------------------------- // Creates a break iterator for sentence breaks. @@ -198,38 +179,6 @@ BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status) return createInstance(key, UBRK_SENTENCE, status); } -BreakIterator* -BreakIterator::makeSentenceInstance(const Locale& /*key */, UErrorCode& status) -{ - // WARNING: This routine is currently written specifically to handle only the - // default rules files and the alternate rules files for Thai. This function - // will have to be made fully general at some time in the future! - BreakIterator* result = NULL; - static const char filename[] = "sent"; - - if (U_FAILURE(status)) - return NULL; - UDataMemory* file = udata_open(NULL, "brk", filename, &status); - if (U_FAILURE(status)) { - return NULL; - } - // The UDataMemory is adopted by the break iterator. - - result = new RuleBasedBreakIterator(file, status); - if (U_FAILURE(status)) { // Sometimes redundant check, but simple. - if (result != NULL) { - delete result; - } - return NULL; - } - if (result == NULL) { - udata_close(file); - status = U_MEMORY_ALLOCATION_ERROR; - } - - return result; -} - // ------------------------------------- // Creates a break iterator for title casing breaks. 
@@ -239,38 +188,6 @@ BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status) return createInstance(key, UBRK_TITLE, status); } -BreakIterator* -BreakIterator::makeTitleInstance(const Locale& /* key */, UErrorCode& status) -{ - // WARNING: This routine is currently written specifically to handle only the - // default rules files. This function will have to be made fully general - // at some time in the future! - BreakIterator* result = NULL; - static const char filename[] = "title"; - - if (U_FAILURE(status)) - return NULL; - UDataMemory* file = udata_open(NULL, "brk", filename, &status); - if (U_FAILURE(status)) { - return NULL; - } - // The UDataMemory is adopted by the break iterator. - - result = new RuleBasedBreakIterator(file, status); - if (U_FAILURE(status)) { // Sometimes redundant check, but simple. - if (result != NULL) { - delete result; - } - return NULL; - } - if (result == NULL) { - udata_close(file); - status = U_MEMORY_ALLOCATION_ERROR; - } - - return result; -} - // ------------------------------------- // Gets all the available locales that has localized text boundary data. 
@@ -495,19 +412,19 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) BreakIterator *result = NULL; switch (kind) { case UBRK_CHARACTER: - result = BreakIterator::makeCharacterInstance(loc, status); + result = BreakIterator::buildInstance(loc, "grapheme", FALSE, status); break; case UBRK_WORD: - result = BreakIterator::makeWordInstance(loc, status); + result = BreakIterator::buildInstance(loc, "word", TRUE, status); break; case UBRK_LINE: - result = BreakIterator::makeLineInstance(loc, status); + result = BreakIterator::buildInstance(loc, "line", TRUE, status); break; case UBRK_SENTENCE: - result = BreakIterator::makeSentenceInstance(loc, status); + result = BreakIterator::buildInstance(loc, "sentence", FALSE, status); break; case UBRK_TITLE: - result = BreakIterator::makeTitleInstance(loc, status); + result = BreakIterator::buildInstance(loc, "title", FALSE, status); break; default: status = U_ILLEGAL_ARGUMENT_ERROR; @@ -517,14 +434,6 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) return NULL; } - // this is more of a placeholder. All the break iterators have the same actual locale: root - // except the Thai one - UResourceBundle *res = ures_open(NULL, loc.getName(), &status); - U_LOCALE_BASED(locBased, *result); - locBased.setLocaleIDs(ures_getLocaleByType(res, ULOC_VALID_LOCALE, &status), - (uprv_strcmp(loc.getLanguage(), "th") == 0) ? 
- "th" : "root"); - ures_close(res); return result; } diff --git a/icu4c/source/common/unicode/brkiter.h b/icu4c/source/common/unicode/brkiter.h index 07f73fa8cbd..9e397128cac 100644 --- a/icu4c/source/common/unicode/brkiter.h +++ b/icu4c/source/common/unicode/brkiter.h @@ -590,12 +590,7 @@ public: const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const; private: - static BreakIterator* makeCharacterInstance(const Locale& loc, UErrorCode& status); - static BreakIterator* makeWordInstance(const Locale& loc, UErrorCode& status); - static BreakIterator* makeLineInstance(const Locale& loc, UErrorCode& status); - static BreakIterator* makeSentenceInstance(const Locale& loc, UErrorCode& status); - static BreakIterator* makeTitleInstance(const Locale& loc, UErrorCode& status); - + static BreakIterator* buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode& status); static BreakIterator* createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status); static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status); diff --git a/icu4c/source/data/brkitr/brkfiles.mk b/icu4c/source/data/brkitr/brkfiles.mk index 3ce2551543f..84863aec53f 100644 --- a/icu4c/source/data/brkitr/brkfiles.mk +++ b/icu4c/source/data/brkitr/brkfiles.mk @@ -28,4 +28,4 @@ # char.txt, title.txt and word.txt are not included so that more tests pass by default, # and so that the makefile rules are simplier. BRK_SOURCE = \ -line.txt sent.txt line_th.txt word_th.txt +line.txt sent.txt line_th.txt word_th.txt word_ja.txt word_POSIX.txt diff --git a/icu4c/source/data/brkitr/word_POSIX.txt b/icu4c/source/data/brkitr/word_POSIX.txt new file mode 100644 index 00000000000..63017dcbc74 --- /dev/null +++ b/icu4c/source/data/brkitr/word_POSIX.txt @@ -0,0 +1,243 @@ +# +# Copyright (C) 2002-2004, +# International Business Machines Corporation and others. +# All Rights Reserved. 
+# +# file: word_POSIX.txt +# +# ICU Word Break Rules +# See Unicode Standard Annex #29. +# These rules are based on Version 4.0.0, dated 2003-04-17 +# + +############################################################################## +# +# Character class definitions from TR 29 +# +############################################################################## + +!!chain; +!!LBCMNoChain; + +$Katakana = [[:Script = KATAKANA:] + [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] + [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] + [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] + [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; + + +$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] + - [:Ideographic:] + - $Katakana + - [:Script = Thai:] + - [:Script = Lao:] + - [:Script = Hiragana:]]; + +$ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]]; +$ACMLetter = [$ALetter & [:Grapheme_Extend = TRUE:]]; + +$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] + [:name = HEBREW PUNCTUATION GERSHAYIM:] + [:name = RIGHT SINGLE QUOTATION MARK:] + [:name = HYPHENATION POINT:]]; + +$MidNumLet = [[:name = FULL STOP:]]; + +$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet]; +$Numeric = [:LineBreak = Numeric:]; + +# +# Character Class Definitions. +# The names are those from TR29. 
+# + +$CR = \u000d; +$LF = \u000a; +$Extend = [[:Grapheme_Extend = TRUE:]]; +$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend]; +$Format = [[:Cf:] - $Extend]; +$Hiragana = [:Hiragana:]; +$Ideographic = [:IDEOGRAPHIC:]; + +## ------------------------------------------------- + +!!forward; + +$CR $LF; + +# rule 3 and 4 + +$ALetterEx = $ALetter $Extend*; +$ABaseLetterEx = $ABaseLetter $Extend*; +$ACMLetterEx = $ACMLetter $Extend*; +$NumericEx = $Numeric $Extend*; +$MidNumEx = $MidNum $Extend*; +$MidNumLetEx = $MidNumLet $Extend*; +$MidLetterEx = $MidLetter $Extend*; +$KatakanaEx = $Katakana $Extend*; + +# see character breaks + +[^$Control] $Extend*; + +# rule 5 + +$ALetterEx ($Format* $ALetterEx)* {200}; + +# rule 6 and 7 + +$MidALetterEx = ($ABaseLetterEx | $Format $ACMLetterEx); + +$ALetterSeq = +$ALetterEx +( + $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx +)*; + +$MidALetterSeq = +$MidALetterEx +( + $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx +)*; + +# rule 8 + +$NumericEx ($Format* $NumericEx)* {100}; + +# rule 9 + +$ALetterSeq ($Format* ($NumericEx | $MidALetterSeq))* {200}; + +# rule 10 + +$NumericEx ($Format* $MidALetterSeq)+ ($Format* $NumericEx)* {200}; + +# rule 11 and 12 + +$NumericEx ($Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx)+ {100}; + +# rule 13 + +$KatakanaEx ($Format* $KatakanaEx)* {300}; +$Hiragana $Extend* {300}; +$Ideographic $Extend* {400}; + +## ------------------------------------------------- + +!!reverse; + +$BackALetterEx = $Extend* $ALetter; +$BackABaseLetterEx = $Extend* $ABaseLetter; +$BackACMLetterEx = $Extend* $ACMLetter; +$BackNumericEx = $Extend* $Numeric; +$BackMidNumEx = $Extend* $MidNum; +$BackMidNumLetEx = $Extend* $MidNumLet; +$BackMidLetterEx = $Extend* $MidLetter; +$BackKatakanaEx = $Extend* $Katakana; + +$LF $CR; + +# see character breaks + +$Extend* [^$Control]; + +# rule 5 + +($BackALetterEx $Format*)* $BackABaseLetterEx; +($BackALetterEx $Format*)* $BackACMLetterEx / 
$Control; + +# rule 6 and 7 + +$BackMidALetterEx = ($BackABaseLetterEx | $BackACMLetterEx $Format); + +$BackALetterSeq = +( + $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* +)* +$BackABaseLetterEx; + +$BackMidALetterSeq = +( + $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* +)* +$BackMidALetterEx; + +# rule 8 + +$BackNumericEx $Format* $BackNumericEx; + +# rule 9 + +(($BackNumericEx | $BackMidALetterSeq) $Format*)* $BackALetterSeq; + +# to handle letter sequences ending with a combining mark +(($BackNumericEx | $BackMidALetterSeq) $Format*)* +( + $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* +)* +$BackACMLetterEx / $Control; + +# rule 10 + +($BackNumericEx $Format*)* ($BackMidALetterSeq $Format*)* $BackNumericEx; + +# rule 11 and 12 + +$BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx; + +# rule 13 + +$BackKatakanaEx $Format* $BackKatakanaEx; + +## ------------------------------------------------- + +!!safe_reverse; + +# rule 3 +$Extend+ [^$Extend]; +$Extend+; # comes into play when buffer _begins_ with an $Extend+. 
+ +# rule 4 +$Format+ $BackABaseLetterEx; +$Format+ $BackACMLetterEx / $Control; +$Format+ $BackNumericEx; +$Format+ $BackMidLetterEx; +$Format+ $BackMidNumLetEx; +$Format+ $BackMidNumEx; +$Format+ $BackKatakanaEx; + + +# rule 6 +($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx; +($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Control; + +# rule 11 +($MidNum | $MidNumLet) $Format* $BackNumericEx; + +## ------------------------------------------------- + +!!safe_forward; + +# rule 3 +$Extend+; + +# rule 4 +$Extend* $Format+ $ALetterEx; +$Extend* $Format+ $NumericEx; +$Extend* $Format+ $MidLetterEx; +$Extend* $Format+ $MidNumLetEx; +$Extend* $Format+ $MidNumEx; +$Extend* $Format+ $KatakanaEx; + +$Extend+ $Format* $ALetterEx; +$Extend+ $Format* $NumericEx; +$Extend+ $Format* $MidLetterEx; +$Extend+ $Format* $MidNumLetEx; +$Extend+ $Format* $MidNumEx; +$Extend+ $Format* $KatakanaEx; + +# rule 6 +($MidLetterEx | $MidNumLetEx) $Format* $ALetterEx; + +# rule 11 +($MidNumEx | $MidNumLetEx) $Format* $NumericEx; diff --git a/icu4c/source/data/brkitr/word_ja.txt b/icu4c/source/data/brkitr/word_ja.txt new file mode 100644 index 00000000000..939d43ce1dd --- /dev/null +++ b/icu4c/source/data/brkitr/word_ja.txt @@ -0,0 +1,255 @@ +# +# Copyright (C) 2002-2004, +# International Business Machines Corporation and others. +# All Rights Reserved. +# +# file: word_ja.txt +# +# ICU Word Break Rules +# See Unicode Standard Annex #29. 
+# These rules are based on Version 4.0.0, dated 2003-04-17 +# + +############################################################################## +# +# Character class definitions from TR 29 +# +############################################################################## + +!!chain; +!!LBCMNoChain; + +$Katakana = [[:Script = KATAKANA:] + [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] + [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] + [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] + [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; + + +$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] + - [:Ideographic:] + - $Katakana + - [:Script = Thai:] + - [:Script = Lao:] + - [:Script = Hiragana:]]; + +$ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]]; +$ACMLetter = [$ALetter & [:Grapheme_Extend = TRUE:]]; + +$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] + [:name = HEBREW PUNCTUATION GERSHAYIM:] + [:name = RIGHT SINGLE QUOTATION MARK:] + [:name = HYPHENATION POINT:]]; + +$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]]; + +$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet]; +$Numeric = [:LineBreak = Numeric:]; + +# +# Character Class Definitions. +# The names are those from TR29. 
+# + +$CR = \u000d; +$LF = \u000a; +$Extend = [[:Grapheme_Extend = TRUE:]]; +$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend]; +$Format = [[:Cf:] - $Extend]; +$Hiragana = [:Hiragana:]; +$Ideographic = [:IDEOGRAPHIC:]; + +## ------------------------------------------------- + +!!forward; + +$CR $LF; + +# rule 3 and 4 + +$ALetterEx = $ALetter $Extend*; +$ABaseLetterEx = $ABaseLetter $Extend*; +$ACMLetterEx = $ACMLetter $Extend*; +$NumericEx = $Numeric $Extend*; +$MidNumEx = $MidNum $Extend*; +$MidNumLetEx = $MidNumLet $Extend*; +$MidLetterEx = $MidLetter $Extend*; +$KatakanaEx = $Katakana $Extend*; +$HiraganaEx = $Hiragana $Extend*; +$IdeographicEx = $Ideographic $Extend*; + +# see character breaks + +[^$Control] $Extend*; + +# rule 5 + +$ALetterEx ($Format* $ALetterEx)* {200}; + +# rule 6 and 7 + +$MidALetterEx = ($ABaseLetterEx | $Format $ACMLetterEx); + +$ALetterSeq = +$ALetterEx +( + $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx +)*; + +$MidALetterSeq = +$MidALetterEx +( + $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx +)*; + +# rule 8 + +$NumericEx ($Format* $NumericEx)* {100}; + +# rule 9 + +$ALetterSeq ($Format* ($NumericEx | $MidALetterSeq))* {200}; + +# rule 10 + +$NumericEx ($Format* $MidALetterSeq)+ ($Format* $NumericEx)* {200}; + +# rule 11 and 12 + +$NumericEx ($Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx)+ {100}; + +# rule 13 + +$KatakanaEx ($Format* $KatakanaEx)* {300}; +$HiraganaEx ($Format* $HiraganaEx)* {300}; +$IdeographicEx ($Format* $IdeographicEx)* {400}; + +## ------------------------------------------------- + +!!reverse; + +$BackALetterEx = $Extend* $ALetter; +$BackABaseLetterEx = $Extend* $ABaseLetter; +$BackACMLetterEx = $Extend* $ACMLetter; +$BackNumericEx = $Extend* $Numeric; +$BackMidNumEx = $Extend* $MidNum; +$BackMidNumLetEx = $Extend* $MidNumLet; +$BackMidLetterEx = $Extend* $MidLetter; +$BackKatakanaEx = $Extend* $Katakana; +$BackHiraganaEx = $Extend* $Hiragana; +$BackIdeographicEx = 
$Extend* $Ideographic; + +$LF $CR; + +# see character breaks + +$Extend* [^$Control]; + +# rule 5 + +($BackALetterEx $Format*)* $BackABaseLetterEx; +($BackALetterEx $Format*)* $BackACMLetterEx / $Control; + +# rule 6 and 7 + +$BackMidALetterEx = ($BackABaseLetterEx | $BackACMLetterEx $Format); + +$BackALetterSeq = +( + $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* +)* +$BackABaseLetterEx; + +$BackMidALetterSeq = +( + $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* +)* +$BackMidALetterEx; + +# rule 8 + +$BackNumericEx $Format* $BackNumericEx; + +# rule 9 + +(($BackNumericEx | $BackMidALetterSeq) $Format*)* $BackALetterSeq; + +# to handle letter sequences ending with a combining mark +(($BackNumericEx | $BackMidALetterSeq) $Format*)* +( + $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format* +)* +$BackACMLetterEx / $Control; + +# rule 10 + +($BackNumericEx $Format*)* ($BackMidALetterSeq $Format*)* $BackNumericEx; + +# rule 11 and 12 + +$BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx; + +# rule 13 + +$BackKatakanaEx $Format* $BackKatakanaEx; +$BackHiraganaEx $Format* $BackHiraganaEx; +$BackIdeographicEx $Format* $BackIdeographicEx; + +## ------------------------------------------------- + +!!safe_reverse; + +# rule 3 +$Extend+ [^$Extend]; +$Extend+; # comes into play when buffer _begins_ with an $Extend+. 
+ +# rule 4 +$Format+ $BackABaseLetterEx; +$Format+ $BackACMLetterEx / $Control; +$Format+ $BackNumericEx; +$Format+ $BackMidLetterEx; +$Format+ $BackMidNumLetEx; +$Format+ $BackMidNumEx; +$Format+ $BackKatakanaEx; +$Format+ $BackHiraganaEx; +$Format+ $BackIdeographicEx; + + +# rule 6 +($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx; +($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Control; + +# rule 11 +($MidNum | $MidNumLet) $Format* $BackNumericEx; + +## ------------------------------------------------- + +!!safe_forward; + +# rule 3 +$Extend+; + +# rule 4 +$Extend* $Format+ $ALetterEx; +$Extend* $Format+ $NumericEx; +$Extend* $Format+ $MidLetterEx; +$Extend* $Format+ $MidNumLetEx; +$Extend* $Format+ $MidNumEx; +$Extend* $Format+ $KatakanaEx; +$Extend* $Format+ $HiraganaEx; +$Extend* $Format+ $IdeographicEx; + +$Extend+ $Format* $ALetterEx; +$Extend+ $Format* $NumericEx; +$Extend+ $Format* $MidLetterEx; +$Extend+ $Format* $MidNumLetEx; +$Extend+ $Format* $MidNumEx; +$Extend+ $Format* $KatakanaEx; +$Extend+ $Format* $HiraganaEx; +$Extend+ $Format* $IdeographicEx; + +# rule 6 +($MidLetterEx | $MidNumLetEx) $Format* $ALetterEx; + +# rule 11 +($MidNumEx | $MidNumLetEx) $Format* $NumericEx; diff --git a/icu4c/source/data/locales/en_US_POSIX.txt b/icu4c/source/data/locales/en_US_POSIX.txt index 221ddc6ef79..da1bba3087f 100644 --- a/icu4c/source/data/locales/en_US_POSIX.txt +++ b/icu4c/source/data/locales/en_US_POSIX.txt @@ -7,6 +7,9 @@ // * // *************************************************************************** en_US_POSIX{ + boundaries { + word { "word_POSIX" } + } NumberElements{ ".", ",", diff --git a/icu4c/source/data/locales/ja.txt b/icu4c/source/data/locales/ja.txt index e4ab1bf9780..33005da688b 100644 --- a/icu4c/source/data/locales/ja.txt +++ b/icu4c/source/data/locales/ja.txt @@ -10,6 +10,9 @@ * ICU source: ../../../locale/icu/main\ja.xml */ ja{ + boundaries { + word { "word_ja" } + } Countries{ AD{"アンドラ"} AE{"アラブ首長国連邦"} diff --git 
a/icu4c/source/data/locales/root.txt b/icu4c/source/data/locales/root.txt
index 7b068a77b09..8f5be264615 100644
--- a/icu4c/source/data/locales/root.txt
+++ b/icu4c/source/data/locales/root.txt
@@ -10,6 +10,14 @@
  * ICU source: ../../../locale/icu/main\root.xml
  */
 root{
+    boundaries {
+        grapheme { "char" }
+        line { "line" }
+        sentence { "sent" }
+        title { "title" }
+        word { "word" }
+    }
+
     Currencies{
         EUR{
             "€",
diff --git a/icu4c/source/data/locales/th.txt b/icu4c/source/data/locales/th.txt
index bf918ecd499..0996138bbc2 100644
--- a/icu4c/source/data/locales/th.txt
+++ b/icu4c/source/data/locales/th.txt
@@ -11,6 +11,10 @@
  */
 th{
     BreakDictionaryData:import { "../brkitr/thaidict.brk" }
+    boundaries {
+        word { "word_th" }
+        line { "line_th" }
+    }
     Countries{
         AD{"อันดอร์รา"}
         AE{"สหรัฐอาหรับเอมิเรตส์"}
diff --git a/icu4c/source/test/intltest/citrtest.cpp b/icu4c/source/test/intltest/citrtest.cpp
index 771859c38aa..344f8acd8c0 100644
--- a/icu4c/source/test/intltest/citrtest.cpp
+++ b/icu4c/source/test/intltest/citrtest.cpp
@@ -15,6 +15,7 @@
 #include "unicode/schriter.h"
 #include "unicode/uchriter.h"
 #include "unicode/uiter.h"
+#include "unicode/putil.h"
 #include "citrtest.h"
 
 
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index ae00df0f638..2e42874074e 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -575,6 +575,31 @@ void RBBITest::TestBug3818() {
 
     delete bi;
 }
+
+// Runs the general iterator test over a short sentence using the "ja" word iterator.
+void RBBITest::TestJapaneseWordBreak() {
+    UErrorCode status = U_ZERO_ERROR;
+    BITestData expectedChunks(status);
+
+    ADD_DATACHUNK(expectedChunks, NULL, 0, status);                          // Break at start of data
+    ADD_DATACHUNK(expectedChunks, "\\u4ECA\\u65E5", 400, status);            // two CJK ideographs
+    ADD_DATACHUNK(expectedChunks, "\\u306F\\u3044\\u3044", 300, status);     // three hiragana
+    ADD_DATACHUNK(expectedChunks, "\\u5929\\u6C17", 400, status);            // two CJK ideographs
+    ADD_DATACHUNK(expectedChunks, "\\u3067\\u3059\\u306D", 300, status);     // three hiragana
+    ADD_DATACHUNK(expectedChunks, "\\u3002", 0, status);                     // ideographic full stop
+    ADD_DATACHUNK(expectedChunks, "\\u000D\\u000A", 0, status);              // CR LF
+
+    BreakIterator *created = BreakIterator::createWordInstance(Locale("ja"), status);
+    if (U_FAILURE(status)) {
+        errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
+        return;
+    }
+
+    RuleBasedBreakIterator *wordIter = (RuleBasedBreakIterator *)created;
+    generalIteratorTest(*wordIter, expectedChunks);
+    delete wordIter;
+}
+
 //---------------------------------------------
 // runIndexedTest
 //---------------------------------------------
@@ -633,6 +658,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
             break;
         case 18: name = "TestBug3818";
             if(exec) TestBug3818(); break;
+        case 19: name = "TestJapaneseWordBreak";
+            if(exec) TestJapaneseWordBreak(); break;
         default: name = ""; break; //needed to end loop
     }
 
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h
index 94772d31939..51ded0fde6d 100644
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -75,6 +75,7 @@ public:
     void TestLineBreaks();
     void TestSentBreaks();
     void TestBug3818();
+    void TestJapaneseWordBreak();
 
 
 /***********************/