ICU-3561 Locale-based text boundaries

X-SVN-Rev: 16582
2025-04-05 21:45:37 +00:00 · 2004-10-21 01:03:01 +00:00 · 2004-10-21 01:03:01 +00:00 · 225c380bde
commit 225c380bde
parent d118393447
12 changed files with 652 additions and 203 deletions
--- a/icu4c/source/common/brkiter.cpp
+++ b/icu4c/source/common/brkiter.cpp
@ -31,6 +31,7 @@
 #include "mutex.h"
 #include "iculserv.h"
 #include "locbased.h"
+#include "uresimp.h"

 // *****************************************************************************
 // class BreakIterator
@ -45,6 +46,105 @@ const int32_t BreakIterator::DONE = (int32_t)-1;

 // -------------------------------------

+// Builds a break iterator for a locale from resource bundle data.
+//
+// The locale's bundle is searched (with fallback) for boundaries/<type>, whose
+// string value names the compiled rules file to load with udata_open().
+//
+// @param loc     Locale whose resource bundle is consulted.
+// @param type    Key under "boundaries" naming the kind of rules wanted
+//                (callers in this file pass "grapheme", "word", "line",
+//                "sentence" or "title").
+// @param dict    If TRUE and the bundle provides "BreakDictionaryData", a
+//                DictionaryBasedBreakIterator is created instead of a plain
+//                RuleBasedBreakIterator.
+// @param status  In/out ICU error code; nothing is done if it already
+//                indicates a failure.
+// @return        A new break iterator, or NULL with status set to a failure.
+BreakIterator*
+BreakIterator::buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode &status)
+{
+    char fnbuff[256];
+    char actualLocale[ULOC_FULLNAME_CAPACITY];
+    int32_t size = 0;
+    const UChar* brkfname = NULL;
+    UResourceBundle brkrules, brkname;
+    BreakIterator *result = NULL;
+    
+    if (U_FAILURE(status))
+        return NULL;
+
+    // Initialize both stack bundles and both buffers up front, so that the
+    // unconditional ures_close() calls below never see uninitialized memory
+    // when one of the lookup steps fails early.
+    ures_initStackObject(&brkrules);
+    ures_initStackObject(&brkname);
+    fnbuff[0] = '\0';
+    actualLocale[0] = '\0';
+
+    // Get the locale
+    UResourceBundle *b = ures_open(NULL, loc.getName(), &status);
+
+    // Get the "boundaries" array.
+    if (U_SUCCESS(status)) {
+        (void) ures_getByKeyWithFallback(b, "boundaries", &brkrules, &status);
+    }
+
+    // Get the string object naming the rules file
+    if (U_SUCCESS(status)) {
+        (void) ures_getByKeyWithFallback(&brkrules, type, &brkname, &status);
+    }
+
+    // Get the actual string
+    if (U_SUCCESS(status)) {
+        brkfname = ures_getString(&brkname, &size, &status);
+    }
+
+    // Remember which locale actually supplied the rules name. Only query it
+    // once the string lookup succeeded, and always NUL-terminate the copy.
+    if (U_SUCCESS(status)) {
+        const char *actual = ures_getLocale(&brkname, &status);
+        if (U_SUCCESS(status) && actual != NULL) {
+            uprv_strncpy(actualLocale, actual, sizeof(actualLocale) - 1);
+            actualLocale[sizeof(actualLocale) - 1] = '\0';
+        }
+    }
+    
+    // Use the string if we found it, refusing names that do not fit the buffer
+    if (U_SUCCESS(status)) {
+        if (brkfname == NULL || size < 0 || (size_t)size >= sizeof(fnbuff)) {
+            status = U_BUFFER_OVERFLOW_ERROR;
+        } else {
+            u_UCharsToChars(brkfname, fnbuff, size);
+            fnbuff[size] = '\0';
+        }
+    }
+    
+    ures_close(&brkrules);
+    ures_close(&brkname);
+    
+    // With a failing status udata_open() is expected to do nothing and return NULL.
+    UDataMemory* file = udata_open(NULL, "brk", fnbuff, &status);
+    if (U_FAILURE(status)) {
+        ures_close(b);
+        return NULL;
+    }
+
+    // We found the break rules; now see if a dictionary is needed
+    if (dict)
+    {
+        // A missing dictionary is not an error, so use a private status.
+        UErrorCode localStatus = U_ZERO_ERROR;
+        ures_initStackObject(&brkname);
+        (void) ures_getByKeyWithFallback(b, "BreakDictionaryData", &brkname, &localStatus);
+        if (U_SUCCESS(localStatus)) {
+            // NOTE: the dictionary file name is still hard-coded; the resource
+            // is only used to decide whether a dictionary exists for the locale.
+            result = new DictionaryBasedBreakIterator(file, "thaidict.brk", status);
+        }
+        ures_close(&brkname);
+    }
+    
+    // If there is still no result but we haven't had an error, no dictionary,
+    // so make a non-dictionary break iterator
+    if (U_SUCCESS(status) && result == NULL) {
+        result = new RuleBasedBreakIterator(file, status);
+    }
+
+    // If there is a result, set the valid locale and actual locale
+    if (U_SUCCESS(status) && result != NULL) {
+        U_LOCALE_BASED(locBased, *result);
+        locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), actualLocale);
+    }
+
+    ures_close(b);
+    
+    if (U_FAILURE(status) && result != NULL) {  // Sometimes redundant check, but simple
+        delete result;
+        return NULL;
+    }
+
+    // No iterator was created, so the data file was never handed over: close
+    // it here and make sure the caller sees a failure.
+    if (result == NULL) {
+        udata_close(file);
+        if (U_SUCCESS(status)) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+
+    return result;
+}
+
 // Creates a break iterator for word breaks.
 BreakIterator* U_EXPORT2
 BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
@ -52,50 +152,6 @@ BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
    return createInstance(key, UBRK_WORD, status);
 }

-BreakIterator*
-BreakIterator::makeWordInstance(const Locale& key, UErrorCode& status)
-{
-    // WARNING: This routine is currently written specifically to handle only the
-    // default rules files and the alternate rules files for Thai.  This function
-    // will have to be made fully general at some time in the future!
-    BreakIterator* result = NULL;
-    const char* filename = "word";
-
-    if (U_FAILURE(status))
-        return NULL;
-
-    if (!uprv_strcmp(key.getLanguage(), "th"))
-    {
-        filename = "word_th";
-    }
-
-    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
-    if (U_FAILURE(status)) {
-        return NULL;
-    }
-    // The UDataMemory is adopted by the break iterator.
-
-    if(!uprv_strcmp(filename, "word_th")) {
-        filename = "thaidict.brk";
-        result = new DictionaryBasedBreakIterator(file, filename, status);
-    }
-    else {
-        result = new RuleBasedBreakIterator(file, status);
-    }
-    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
-        if (result != NULL) {
-            delete result;
-        }
-        return NULL;
-    }
-    if (result == NULL) {
-        udata_close(file);
-        status = U_MEMORY_ALLOCATION_ERROR;
-    }
-    
-    return result;
-}
-
 // -------------------------------------

 // Creates a break iterator  for line breaks.
@ -105,49 +161,6 @@ BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
    return createInstance(key, UBRK_LINE, status);
 }

-BreakIterator*
-BreakIterator::makeLineInstance(const Locale& key, UErrorCode& status)
-{
-    // WARNING: This routine is currently written specifically to handle only the
-    // default rules files and the alternate rules files for Thai.  This function
-    // will have to be made fully general at some time in the future!
-    BreakIterator* result = NULL;
-    const char* filename = "line";
-
-    if (U_FAILURE(status))
-        return NULL;
-
-    if (!uprv_strcmp(key.getLanguage(), "th"))
-    {
-        filename = "line_th";
-    }
-
-    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
-    if (U_FAILURE(status)) {
-        return NULL;
-    }
-    // The UDataMemory is adopted by the break iterator.
-
-    if (!uprv_strcmp(key.getLanguage(), "th")) {
-        filename = "thaidict.brk";
-        result = new DictionaryBasedBreakIterator(file, filename, status);
-    }
-    else {
-        result = new RuleBasedBreakIterator(file, status);
-    }
-    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
-        if (result != NULL) {
-            delete result;
-        }
-        return NULL;
-    }
-    if (result == NULL) {
-        udata_close(file);
-        status = U_MEMORY_ALLOCATION_ERROR;
-    }
-    return result;
-}
-
 // -------------------------------------

 // Creates a break iterator  for character breaks.
@ -157,38 +170,6 @@ BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
    return createInstance(key, UBRK_CHARACTER, status);
 }

-BreakIterator*
-BreakIterator::makeCharacterInstance(const Locale& /* key */, UErrorCode& status)
-{
-    // WARNING: This routine is currently written specifically to handle only the
-    // default rules files and the alternate rules files for Thai.  This function
-    // will have to be made fully general at some time in the future!
-    BreakIterator* result = NULL;
-    static const char filename[] = "char";
-
-    if (U_FAILURE(status))
-        return NULL;
-    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
-    if (U_FAILURE(status)) {
-        return NULL;
-    }
-    // The UDataMemory is adopted by the break iterator.
-
-    result = new RuleBasedBreakIterator(file, status);
-    
-    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
-        if (result != NULL) {
-            delete result;
-        }
-        return NULL;
-    }
-    if (result == NULL) {
-        udata_close(file);
-        status = U_MEMORY_ALLOCATION_ERROR;
-    }
-    return result;
-}
-
 // -------------------------------------

 // Creates a break iterator  for sentence breaks.
@ -198,38 +179,6 @@ BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
    return createInstance(key, UBRK_SENTENCE, status);
 }

-BreakIterator*
-BreakIterator::makeSentenceInstance(const Locale& /*key */, UErrorCode& status)
-{
-    // WARNING: This routine is currently written specifically to handle only the
-    // default rules files and the alternate rules files for Thai.  This function
-    // will have to be made fully general at some time in the future!
-    BreakIterator* result = NULL;
-    static const char filename[] = "sent";
-
-    if (U_FAILURE(status))
-        return NULL;
-    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
-    if (U_FAILURE(status)) {
-        return NULL;
-    }
-    // The UDataMemory is adopted by the break iterator.
-
-    result = new RuleBasedBreakIterator(file, status);
-    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
-        if (result != NULL) {
-            delete result;
-        }
-        return NULL;
-    }
-    if (result == NULL) {
-        udata_close(file);
-        status = U_MEMORY_ALLOCATION_ERROR;
-    }
-
-    return result;
-}
-
 // -------------------------------------

 // Creates a break iterator for title casing breaks.
@ -239,38 +188,6 @@ BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
    return createInstance(key, UBRK_TITLE, status);
 }

-BreakIterator*
-BreakIterator::makeTitleInstance(const Locale& /* key */, UErrorCode& status)
-{
-    // WARNING: This routine is currently written specifically to handle only the
-    // default rules files.  This function will have to be made fully general
-    // at some time in the future!
-    BreakIterator* result = NULL;
-    static const char filename[] = "title";
-
-    if (U_FAILURE(status))
-        return NULL;
-    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
-    if (U_FAILURE(status)) {
-        return NULL;
-    }
-    // The UDataMemory is adopted by the break iterator.
-
-    result = new RuleBasedBreakIterator(file, status);
-    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
-        if (result != NULL) {
-            delete result;
-        }
-        return NULL;
-    }
-    if (result == NULL) {
-        udata_close(file);
-        status = U_MEMORY_ALLOCATION_ERROR;
-    }
-
-    return result;
-}
-
 // -------------------------------------

 // Gets all the available locales that has localized text boundary data.
@ -495,19 +412,19 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
    BreakIterator *result = NULL;
    switch (kind) {
    case UBRK_CHARACTER: 
-        result = BreakIterator::makeCharacterInstance(loc, status);
+        result = BreakIterator::buildInstance(loc, "grapheme", FALSE, status);
        break;
    case UBRK_WORD:
-        result = BreakIterator::makeWordInstance(loc, status);
+        result = BreakIterator::buildInstance(loc, "word", TRUE, status);
        break;
    case UBRK_LINE:
-        result = BreakIterator::makeLineInstance(loc, status);
+        result = BreakIterator::buildInstance(loc, "line", TRUE, status);
        break;
    case UBRK_SENTENCE:
-        result = BreakIterator::makeSentenceInstance(loc, status);
+        result = BreakIterator::buildInstance(loc, "sentence", FALSE, status);
        break;
    case UBRK_TITLE:
-        result = BreakIterator::makeTitleInstance(loc, status);
+        result = BreakIterator::buildInstance(loc, "title", FALSE, status);
        break;
    default:
        status = U_ILLEGAL_ARGUMENT_ERROR;
@ -517,14 +434,6 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
 		return NULL;
 	}

-    // this is more of a placeholder. All the break iterators have the same actual locale: root
-    // except the Thai one
-    UResourceBundle *res = ures_open(NULL, loc.getName(), &status);
-    U_LOCALE_BASED(locBased, *result);
-    locBased.setLocaleIDs(ures_getLocaleByType(res, ULOC_VALID_LOCALE, &status),
-                          (uprv_strcmp(loc.getLanguage(), "th") == 0) ?
-                          "th" : "root");
-    ures_close(res);
    return result;
 }

--- a/icu4c/source/common/unicode/brkiter.h
+++ b/icu4c/source/common/unicode/brkiter.h
@ -590,12 +590,7 @@ public:
    const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;

 private:
-    static BreakIterator* makeCharacterInstance(const Locale& loc, UErrorCode& status);
-    static BreakIterator* makeWordInstance(const Locale& loc, UErrorCode& status);
-    static BreakIterator* makeLineInstance(const Locale& loc, UErrorCode& status);
-    static BreakIterator* makeSentenceInstance(const Locale& loc, UErrorCode& status);
-    static BreakIterator* makeTitleInstance(const Locale& loc, UErrorCode& status);
-
+    static BreakIterator* buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode& status);
    static BreakIterator* createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status);
    static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);

--- a/icu4c/source/data/brkitr/brkfiles.mk
+++ b/icu4c/source/data/brkitr/brkfiles.mk
@ -28,4 +28,4 @@
 # char.txt, title.txt and word.txt are not included so that more tests pass by default,
 # and so that the makefile rules are simplier.
 BRK_SOURCE = \
-line.txt sent.txt line_th.txt word_th.txt
+line.txt sent.txt line_th.txt word_th.txt word_ja.txt word_POSIX.txt
--- a/icu4c/source/data/brkitr/word_POSIX.txt
+++ b/icu4c/source/data/brkitr/word_POSIX.txt
@ -0,0 +1,243 @@
+#
+# Copyright (C) 2002-2004,
+# International Business Machines Corporation and others.
+# All Rights Reserved.
+#
+# file:  word_POSIX.txt
+#
+# ICU Word Break Rules
+#      See Unicode Standard Annex #29.
+#      These rules are based on Version 4.0.0, dated 2003-04-17
+#
+
+##############################################################################
+#
+#  Character class definitions from TR 29
+#
+##############################################################################
+
+!!chain;
+!!LBCMNoChain;
+
+$Katakana  = [[:Script = KATAKANA:]
+			  [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+			  [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+			  [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+			  [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+
+
+$ALetter   = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
+						   - [:Ideographic:]
+						   - $Katakana
+						   - [:Script = Thai:]
+						   - [:Script = Lao:]
+						   - [:Script = Hiragana:]];
+
+$ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]];
+$ACMLetter   = [$ALetter & [:Grapheme_Extend = TRUE:]];
+
+$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
+			  [:name = HEBREW PUNCTUATION GERSHAYIM:]
+			  [:name = RIGHT SINGLE QUOTATION MARK:]
+			  [:name = HYPHENATION POINT:]];
+
+$MidNumLet = [[:name = FULL STOP:]];
+
+$MidNum    = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
+$Numeric   = [:LineBreak = Numeric:];
+
+#
+#  Character Class Definitions.
+#    The names are those from TR29.
+#
+
+$CR      = \u000d;
+$LF      = \u000a;
+$Extend  = [[:Grapheme_Extend = TRUE:]];
+$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
+$Format  = [[:Cf:] - $Extend];
+$Hiragana = [:Hiragana:];
+$Ideographic = [:IDEOGRAPHIC:];
+
+## -------------------------------------------------
+
+!!forward;
+
+$CR $LF;
+
+# rule 3 and 4
+
+$ALetterEx     = $ALetter     $Extend*;
+$ABaseLetterEx = $ABaseLetter $Extend*;
+$ACMLetterEx   = $ACMLetter   $Extend*;
+$NumericEx     = $Numeric     $Extend*;
+$MidNumEx      = $MidNum      $Extend*;
+$MidNumLetEx   = $MidNumLet   $Extend*;
+$MidLetterEx   = $MidLetter   $Extend*;
+$KatakanaEx    = $Katakana    $Extend*;
+
+# see character breaks
+
+[^$Control] $Extend*;
+
+# rule 5
+
+$ALetterEx ($Format* $ALetterEx)* {200};
+
+# rule 6 and 7
+
+$MidALetterEx = ($ABaseLetterEx | $Format $ACMLetterEx);
+
+$ALetterSeq =
+$ALetterEx
+(
+    $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
+)*;
+
+$MidALetterSeq =
+$MidALetterEx
+(
+    $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
+)*;
+
+# rule 8
+
+$NumericEx ($Format* $NumericEx)* {100};
+
+# rule 9
+
+$ALetterSeq ($Format* ($NumericEx | $MidALetterSeq))* {200};
+
+# rule 10
+
+$NumericEx ($Format* $MidALetterSeq)+ ($Format* $NumericEx)* {200};
+
+# rule 11 and 12 
+
+$NumericEx ($Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx)+ {100};
+
+# rule 13
+
+$KatakanaEx ($Format* $KatakanaEx)* {300};
+$Hiragana $Extend* {300};
+$Ideographic $Extend* {400};
+
+## -------------------------------------------------
+
+!!reverse;
+
+$BackALetterEx     = $Extend* $ALetter;
+$BackABaseLetterEx = $Extend* $ABaseLetter;
+$BackACMLetterEx   = $Extend* $ACMLetter;
+$BackNumericEx     = $Extend* $Numeric;
+$BackMidNumEx      = $Extend* $MidNum;
+$BackMidNumLetEx   = $Extend* $MidNumLet;
+$BackMidLetterEx   = $Extend* $MidLetter;
+$BackKatakanaEx    = $Extend* $Katakana;
+
+$LF $CR;
+
+# see character breaks
+
+$Extend* [^$Control];
+
+# rule 5
+
+($BackALetterEx $Format*)* $BackABaseLetterEx;
+($BackALetterEx $Format*)* $BackACMLetterEx / $Control;
+
+# rule 6 and 7
+
+$BackMidALetterEx = ($BackABaseLetterEx | $BackACMLetterEx $Format);
+
+$BackALetterSeq =
+(
+    $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
+)*
+$BackABaseLetterEx;
+
+$BackMidALetterSeq =
+(
+    $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
+)*
+$BackMidALetterEx;
+
+# rule 8
+
+$BackNumericEx $Format* $BackNumericEx;
+
+# rule 9
+
+(($BackNumericEx | $BackMidALetterSeq) $Format*)* $BackALetterSeq;
+
+# to handle letter sequences ending with a combining mark
+(($BackNumericEx | $BackMidALetterSeq) $Format*)* 
+(
+    $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
+)*
+$BackACMLetterEx / $Control;
+
+# rule 10
+
+($BackNumericEx $Format*)* ($BackMidALetterSeq $Format*)* $BackNumericEx;
+
+# rule 11 and 12
+
+$BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx;
+
+# rule 13
+
+$BackKatakanaEx $Format* $BackKatakanaEx;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# rule 3
+$Extend+ [^$Extend];
+$Extend+;               # comes into play when buffer _begins_ with an $Extend+.
+
+# rule 4
+$Format+ $BackABaseLetterEx;
+$Format+ $BackACMLetterEx / $Control;
+$Format+ $BackNumericEx;
+$Format+ $BackMidLetterEx;
+$Format+ $BackMidNumLetEx;
+$Format+ $BackMidNumEx;
+$Format+ $BackKatakanaEx;
+
+
+# rule 6
+($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx;
+($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Control;
+
+# rule 11
+($MidNum | $MidNumLet) $Format* $BackNumericEx;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# rule 3
+$Extend+;
+
+# rule 4
+$Extend* $Format+ $ALetterEx;
+$Extend* $Format+ $NumericEx;
+$Extend* $Format+ $MidLetterEx;
+$Extend* $Format+ $MidNumLetEx;
+$Extend* $Format+ $MidNumEx;
+$Extend* $Format+ $KatakanaEx;
+
+$Extend+ $Format* $ALetterEx;
+$Extend+ $Format* $NumericEx;
+$Extend+ $Format* $MidLetterEx;
+$Extend+ $Format* $MidNumLetEx;
+$Extend+ $Format* $MidNumEx;
+$Extend+ $Format* $KatakanaEx;
+
+# rule 6
+($MidLetterEx | $MidNumLetEx) $Format* $ALetterEx;
+
+# rule 11
+($MidNumEx | $MidNumLetEx) $Format* $NumericEx;
--- a/icu4c/source/data/brkitr/word_ja.txt
+++ b/icu4c/source/data/brkitr/word_ja.txt
@ -0,0 +1,255 @@
+#
+# Copyright (C) 2002-2004,
+# International Business Machines Corporation and others.
+# All Rights Reserved.
+#
+# file:  word_ja.txt
+#
+# ICU Word Break Rules
+#      See Unicode Standard Annex #29.
+#      These rules are based on Version 4.0.0, dated 2003-04-17
+#
+
+##############################################################################
+#
+#  Character class definitions from TR 29
+#
+##############################################################################
+
+!!chain;
+!!LBCMNoChain;
+
+$Katakana  = [[:Script = KATAKANA:]
+			  [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+			  [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+			  [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+			  [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+
+
+$ALetter   = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
+						   - [:Ideographic:]
+						   - $Katakana
+						   - [:Script = Thai:]
+						   - [:Script = Lao:]
+						   - [:Script = Hiragana:]];
+
+$ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]];
+$ACMLetter   = [$ALetter & [:Grapheme_Extend = TRUE:]];
+
+$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
+			  [:name = HEBREW PUNCTUATION GERSHAYIM:]
+			  [:name = RIGHT SINGLE QUOTATION MARK:]
+			  [:name = HYPHENATION POINT:]];
+
+$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];
+
+$MidNum    = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
+$Numeric   = [:LineBreak = Numeric:];
+
+#
+#  Character Class Definitions.
+#    The names are those from TR29.
+#
+
+$CR      = \u000d;
+$LF      = \u000a;
+$Extend  = [[:Grapheme_Extend = TRUE:]];
+$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
+$Format  = [[:Cf:] - $Extend];
+$Hiragana = [:Hiragana:];
+$Ideographic = [:IDEOGRAPHIC:];
+
+## -------------------------------------------------
+
+!!forward;
+
+$CR $LF;
+
+# rule 3 and 4
+
+$ALetterEx     = $ALetter     $Extend*;
+$ABaseLetterEx = $ABaseLetter $Extend*;
+$ACMLetterEx   = $ACMLetter   $Extend*;
+$NumericEx     = $Numeric     $Extend*;
+$MidNumEx      = $MidNum      $Extend*;
+$MidNumLetEx   = $MidNumLet   $Extend*;
+$MidLetterEx   = $MidLetter   $Extend*;
+$KatakanaEx    = $Katakana    $Extend*;
+$HiraganaEx     = $Hiragana    $Extend*;
+$IdeographicEx  = $Ideographic $Extend*;
+
+# see character breaks
+
+[^$Control] $Extend*;
+
+# rule 5
+
+$ALetterEx ($Format* $ALetterEx)* {200};
+
+# rule 6 and 7
+
+$MidALetterEx = ($ABaseLetterEx | $Format $ACMLetterEx);
+
+$ALetterSeq =
+$ALetterEx
+(
+    $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
+)*;
+
+$MidALetterSeq =
+$MidALetterEx
+(
+    $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
+)*;
+
+# rule 8
+
+$NumericEx ($Format* $NumericEx)* {100};
+
+# rule 9
+
+$ALetterSeq ($Format* ($NumericEx | $MidALetterSeq))* {200};
+
+# rule 10
+
+$NumericEx ($Format* $MidALetterSeq)+ ($Format* $NumericEx)* {200};
+
+# rule 11 and 12 
+
+$NumericEx ($Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx)+ {100};
+
+# rule 13
+
+$KatakanaEx ($Format* $KatakanaEx)* {300};
+$HiraganaEx  ($Format* $HiraganaEx)* {300};
+$IdeographicEx ($Format* $IdeographicEx)* {400};
+
+## -------------------------------------------------
+
+!!reverse;
+
+$BackALetterEx     = $Extend* $ALetter;
+$BackABaseLetterEx = $Extend* $ABaseLetter;
+$BackACMLetterEx   = $Extend* $ACMLetter;
+$BackNumericEx     = $Extend* $Numeric;
+$BackMidNumEx      = $Extend* $MidNum;
+$BackMidNumLetEx   = $Extend* $MidNumLet;
+$BackMidLetterEx   = $Extend* $MidLetter;
+$BackKatakanaEx    = $Extend* $Katakana;
+$BackHiraganaEx    = $Extend* $Hiragana;
+$BackIdeographicEx = $Extend* $Ideographic;
+
+$LF $CR;
+
+# see character breaks
+
+$Extend* [^$Control];
+
+# rule 5
+
+($BackALetterEx $Format*)* $BackABaseLetterEx;
+($BackALetterEx $Format*)* $BackACMLetterEx / $Control;
+
+# rule 6 and 7
+
+$BackMidALetterEx = ($BackABaseLetterEx | $BackACMLetterEx $Format);
+
+$BackALetterSeq =
+(
+    $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
+)*
+$BackABaseLetterEx;
+
+$BackMidALetterSeq =
+(
+    $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
+)*
+$BackMidALetterEx;
+
+# rule 8
+
+$BackNumericEx $Format* $BackNumericEx;
+
+# rule 9
+
+(($BackNumericEx | $BackMidALetterSeq) $Format*)* $BackALetterSeq;
+
+# to handle letter sequences ending with a combining mark
+(($BackNumericEx | $BackMidALetterSeq) $Format*)* 
+(
+    $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
+)*
+$BackACMLetterEx / $Control;
+
+# rule 10
+
+($BackNumericEx $Format*)* ($BackMidALetterSeq $Format*)* $BackNumericEx;
+
+# rule 11 and 12
+
+$BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx;
+
+# rule 13
+
+$BackKatakanaEx $Format* $BackKatakanaEx;
+$BackHiraganaEx $Format* $BackHiraganaEx;
+$BackIdeographicEx $Format* $BackIdeographicEx;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# rule 3
+$Extend+ [^$Extend];
+$Extend+;               # comes into play when buffer _begins_ with an $Extend+.
+
+# rule 4
+$Format+ $BackABaseLetterEx;
+$Format+ $BackACMLetterEx / $Control;
+$Format+ $BackNumericEx;
+$Format+ $BackMidLetterEx;
+$Format+ $BackMidNumLetEx;
+$Format+ $BackMidNumEx;
+$Format+ $BackKatakanaEx;
+$Format+ $BackHiraganaEx;
+$Format+ $BackIdeographicEx;
+
+
+# rule 6
+($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx;
+($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Control;
+
+# rule 11
+($MidNum | $MidNumLet) $Format* $BackNumericEx;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# rule 3
+$Extend+;
+
+# rule 4
+$Extend* $Format+ $ALetterEx;
+$Extend* $Format+ $NumericEx;
+$Extend* $Format+ $MidLetterEx;
+$Extend* $Format+ $MidNumLetEx;
+$Extend* $Format+ $MidNumEx;
+$Extend* $Format+ $KatakanaEx;
+$Extend* $Format+ $HiraganaEx;
+$Extend* $Format+ $IdeographicEx;
+
+$Extend+ $Format* $ALetterEx;
+$Extend+ $Format* $NumericEx;
+$Extend+ $Format* $MidLetterEx;
+$Extend+ $Format* $MidNumLetEx;
+$Extend+ $Format* $MidNumEx;
+$Extend+ $Format* $KatakanaEx;
+$Extend+ $Format* $HiraganaEx;
+$Extend+ $Format* $IdeographicEx;
+
+# rule 6
+($MidLetterEx | $MidNumLetEx) $Format* $ALetterEx;
+
+# rule 11
+($MidNumEx | $MidNumLetEx) $Format* $NumericEx;
--- a/icu4c/source/data/locales/en_US_POSIX.txt
+++ b/icu4c/source/data/locales/en_US_POSIX.txt
@ -7,6 +7,9 @@
 // *
 // ***************************************************************************
 en_US_POSIX{
+    boundaries {
+        word { "word_POSIX" }
+    }
    NumberElements{
        ".",
        ",",
--- a/icu4c/source/data/locales/ja.txt
+++ b/icu4c/source/data/locales/ja.txt
@ -10,6 +10,9 @@
 *  ICU <specials> source: ../../../locale/icu/main\ja.xml
 */
 ja{
+    boundaries {
+        word { "word_ja" }
+    }
    Countries{
        AD{"アンドラ"}
        AE{"アラブ首長国連邦"}
--- a/icu4c/source/data/locales/root.txt
+++ b/icu4c/source/data/locales/root.txt
@ -10,6 +10,14 @@
 *  ICU <specials> source: ../../../locale/icu/main\root.xml
 */
 root{
+    boundaries {
+        grapheme { "char" }
+        line { "line" }
+        sentence { "sent" }
+        title { "title" }
+        word { "word" }
+    }
+
    Currencies{
        EUR{
            "€",
--- a/icu4c/source/data/locales/th.txt
+++ b/icu4c/source/data/locales/th.txt
@ -11,6 +11,10 @@
 */
 th{
    BreakDictionaryData:import { "../brkitr/thaidict.brk" }
+    boundaries {
+        word { "word_th" }
+        line { "line_th" }
+    }
    Countries{
        AD{"อันดอร์รา"}
        AE{"สหรัฐอาหรับเอมิเรตส์"}
--- a/icu4c/source/test/intltest/citrtest.cpp
+++ b/icu4c/source/test/intltest/citrtest.cpp
@ -15,6 +15,7 @@
 #include "unicode/schriter.h"
 #include "unicode/uchriter.h"
 #include "unicode/uiter.h"
+#include "unicode/putil.h"
 #include "citrtest.h"


--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -575,6 +575,31 @@ void RBBITest::TestBug3818() {
    delete bi;
 }

+
+// Checks the word break iterator created for the "ja" locale against a short
+// Japanese sentence. Each chunk below is one expected word together with the
+// rule status value expected at its end; the trailing comments give the
+// cumulative break offset.
+void RBBITest::TestJapaneseWordBreak() {
+    UErrorCode status = U_ZERO_ERROR;
+    BITestData   expectedBreaks(status);
+
+    ADD_DATACHUNK(expectedBreaks, NULL, 0, status);           // Break at start of data
+    ADD_DATACHUNK(expectedBreaks, "\\u4ECA\\u65E5", 400, status); // offset 2
+    ADD_DATACHUNK(expectedBreaks, "\\u306F\\u3044\\u3044", 300, status); // offset 5
+    ADD_DATACHUNK(expectedBreaks, "\\u5929\\u6C17", 400, status); // offset 7
+    ADD_DATACHUNK(expectedBreaks, "\\u3067\\u3059\\u306D", 300, status); // offset 10
+    ADD_DATACHUNK(expectedBreaks, "\\u3002", 0, status); // offset 11
+    ADD_DATACHUNK(expectedBreaks, "\\u000D\\u000A", 0, status); // offset 12
+
+    // Ask the factory for the Japanese word iterator, then view it as the
+    // rule-based type that generalIteratorTest() is called with.
+    BreakIterator *created = BreakIterator::createWordInstance(Locale("ja"), status);
+    RuleBasedBreakIterator *wordIter = (RuleBasedBreakIterator *)created;
+
+    if (U_SUCCESS(status)) {
+        generalIteratorTest(*wordIter, expectedBreaks);
+        delete wordIter;
+        return;
+    }
+
+    errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
+}
+
 //---------------------------------------------
 // runIndexedTest
 //---------------------------------------------
@ -633,6 +658,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
                                                               break;
        case 18: name = "TestBug3818";
            if(exec) TestBug3818();                            break;
+        case 19: name = "TestJapaneseWordBreak";
+            if(exec) TestJapaneseWordBreak();                  break;

        default: name = ""; break; //needed to end loop
    }
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -75,6 +75,7 @@ public:
    void TestLineBreaks();
    void TestSentBreaks();
    void TestBug3818();
+    void TestJapaneseWordBreak();
    
    
 /***********************/