ICU-3561 Locale-based text boundaries

X-SVN-Rev: 16582
This commit is contained in:
Deborah Goldsmith 2004-10-21 01:03:01 +00:00
parent d118393447
commit 225c380bde
12 changed files with 652 additions and 203 deletions

View file

@ -31,6 +31,7 @@
#include "mutex.h"
#include "iculserv.h"
#include "locbased.h"
#include "uresimp.h"
// *****************************************************************************
// class BreakIterator
@ -45,6 +46,105 @@ const int32_t BreakIterator::DONE = (int32_t)-1;
// -------------------------------------
// Builds a break iterator for the given locale from resource-bundle data.
//
// The locale's bundle is opened and the string at boundaries/<type> is read
// (with fallback); that string names the compiled rules file, which is then
// opened as "brk" data and handed to the new iterator.
//
// @param loc    Locale whose resource bundle is consulted.
// @param type   Key inside the "boundaries" table (the callers visible in
//               this file pass "grapheme", "word", "line", "sentence", "title").
// @param dict   If TRUE and the bundle has a "BreakDictionaryData" entry, a
//               DictionaryBasedBreakIterator is created instead of a plain
//               RuleBasedBreakIterator.
// @param status In/out error code. Nothing is done if it already indicates a
//               failure. Set to U_BUFFER_OVERFLOW_ERROR if the rules file name
//               does not fit the local buffer, and to U_MEMORY_ALLOCATION_ERROR
//               if no iterator could be allocated.
// @return       The new iterator (owned by the caller), or NULL on failure.
BreakIterator*
BreakIterator::buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode &status)
{
    char fnbuff[256];
    char actualLocale[ULOC_FULLNAME_CAPACITY];
    int32_t size = 0;
    const UChar* brkfname = NULL;
    UResourceBundle brkrules, brkname;
    BreakIterator *result = NULL;

    if (U_FAILURE(status))
        return NULL;

    // Initialize both stack bundles and both buffers up front, so that the
    // unconditional ures_close() calls below never see uninitialized memory,
    // whichever lookup step fails.
    ures_initStackObject(&brkrules);
    ures_initStackObject(&brkname);
    fnbuff[0] = '\0';
    actualLocale[0] = '\0';

    // Get the locale
    UResourceBundle *b = ures_open(NULL, loc.getName(), &status);

    // Get the "boundaries" array.
    if (U_SUCCESS(status)) {
        (void) ures_getByKeyWithFallback(b, "boundaries", &brkrules, &status);
    }

    // Get the string object naming the rules file
    if (U_SUCCESS(status)) {
        (void) ures_getByKeyWithFallback(&brkrules, type, &brkname, &status);
    }

    // Get the actual string, and remember which locale it really came from
    if (U_SUCCESS(status)) {
        brkfname = ures_getString(&brkname, &size, &status);
        const char *foundIn = ures_getLocale(&brkname, &status);
        if (U_SUCCESS(status) && foundIn != NULL) {
            // strncpy does not terminate on truncation; do it explicitly.
            uprv_strncpy(actualLocale, foundIn, sizeof(actualLocale) - 1);
            actualLocale[sizeof(actualLocale) - 1] = '\0';
        }
    }

    // Use the string if we found it and it fits (including the terminator)
    if (U_SUCCESS(status)) {
        if (brkfname == NULL || size < 0 || size >= (int32_t)sizeof(fnbuff)) {
            status = U_BUFFER_OVERFLOW_ERROR;
        } else {
            u_UCharsToChars(brkfname, fnbuff, size);
            fnbuff[size] = '\0';
        }
    }

    ures_close(&brkrules);
    ures_close(&brkname);

    UDataMemory* file = udata_open(NULL, "brk", fnbuff, &status);
    if (U_FAILURE(status)) {
        ures_close(b);
        return NULL;
    }

    // We found the break rules; now see if a dictionary is needed
    if (dict)
    {
        // A missing dictionary entry is not an error for the caller, so the
        // lookup uses its own status.
        UErrorCode localStatus = U_ZERO_ERROR;
        ures_initStackObject(&brkname);
        (void) ures_getByKeyWithFallback(b, "BreakDictionaryData", &brkname, &localStatus);
        if (U_SUCCESS(localStatus)) {
            // The entry is only used as a presence check for now; the
            // dictionary file name itself is still hard-coded.
            result = new DictionaryBasedBreakIterator(file, "thaidict.brk", status);
        }
        ures_close(&brkname);
    }

    // If there is still no result but we haven't had an error, no dictionary,
    // so make a non-dictionary break iterator
    if (U_SUCCESS(status) && result == NULL) {
        result = new RuleBasedBreakIterator(file, status);
    }

    // If there is a result, set the valid locale and actual locale
    if (U_SUCCESS(status) && result != NULL) {
        U_LOCALE_BASED(locBased, *result);
        locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), actualLocale);
    }

    ures_close(b);

    if (U_FAILURE(status) && result != NULL) {        // Sometimes redundant check, but simple
        delete result;
        return NULL;
    }

    if (result == NULL) {
        // No iterator was constructed, so the data file is closed here.
        udata_close(file);
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }

    return result;
}
// Creates a break iterator for word breaks.
BreakIterator* U_EXPORT2
BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
@ -52,50 +152,6 @@ BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
return createInstance(key, UBRK_WORD, status);
}
// Creates the word-break iterator for a locale.
//
// NOTE: only two cases are handled: the default "word" rules, and the Thai
// "word_th" rules paired with the "thaidict.brk" dictionary. This is not a
// general locale-driven lookup.
BreakIterator*
BreakIterator::makeWordInstance(const Locale& key, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    // Thai is the only language with its own rules file here.
    const UBool isThai = (uprv_strcmp(key.getLanguage(), "th") == 0);
    const char* rulesName = isThai ? "word_th" : "word";

    UDataMemory* rulesData = udata_open(NULL, "brk", rulesName, &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = NULL;
    if (isThai) {
        iter = new DictionaryBasedBreakIterator(rulesData, "thaidict.brk", status);
    } else {
        iter = new RuleBasedBreakIterator(rulesData, status);
    }

    // Construction reported an error: discard whatever was built.
    if (U_FAILURE(status)) {
        delete iter;
        return NULL;
    }

    // Allocation failed: the data was never adopted, so close it here.
    if (iter == NULL) {
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    return iter;
}
// -------------------------------------
// Creates a break iterator for line breaks.
@ -105,49 +161,6 @@ BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
return createInstance(key, UBRK_LINE, status);
}
// Creates the line-break iterator for a locale.
//
// NOTE: only two cases are handled: the default "line" rules, and the Thai
// "line_th" rules paired with the "thaidict.brk" dictionary. This is not a
// general locale-driven lookup.
BreakIterator*
BreakIterator::makeLineInstance(const Locale& key, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    // Thai is the only language with its own rules file here.
    const UBool isThai = (uprv_strcmp(key.getLanguage(), "th") == 0);
    const char* rulesName = isThai ? "line_th" : "line";

    UDataMemory* rulesData = udata_open(NULL, "brk", rulesName, &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = NULL;
    if (isThai) {
        iter = new DictionaryBasedBreakIterator(rulesData, "thaidict.brk", status);
    } else {
        iter = new RuleBasedBreakIterator(rulesData, status);
    }

    // Construction reported an error: discard whatever was built.
    if (U_FAILURE(status)) {
        delete iter;
        return NULL;
    }

    // Allocation failed: the data was never adopted, so close it here.
    if (iter == NULL) {
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    return iter;
}
// -------------------------------------
// Creates a break iterator for character breaks.
@ -157,38 +170,6 @@ BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
return createInstance(key, UBRK_CHARACTER, status);
}
// Creates the character-break iterator.
//
// NOTE: the locale argument is ignored; the default "char" rules file is
// always used.
BreakIterator*
BreakIterator::makeCharacterInstance(const Locale& /* key */, UErrorCode& status)
{
    static const char kRulesName[] = "char";

    if (U_FAILURE(status)) {
        return NULL;
    }

    UDataMemory* rulesData = udata_open(NULL, "brk", kRulesName, &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = new RuleBasedBreakIterator(rulesData, status);

    // Construction reported an error: discard whatever was built.
    if (U_FAILURE(status)) {
        delete iter;
        return NULL;
    }

    // Allocation failed: the data was never adopted, so close it here.
    if (iter == NULL) {
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    return iter;
}
// -------------------------------------
// Creates a break iterator for sentence breaks.
@ -198,38 +179,6 @@ BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
return createInstance(key, UBRK_SENTENCE, status);
}
// Creates the sentence-break iterator.
//
// NOTE: the locale argument is ignored; the default "sent" rules file is
// always used.
BreakIterator*
BreakIterator::makeSentenceInstance(const Locale& /*key */, UErrorCode& status)
{
    static const char kRulesName[] = "sent";

    if (U_FAILURE(status)) {
        return NULL;
    }

    UDataMemory* rulesData = udata_open(NULL, "brk", kRulesName, &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = new RuleBasedBreakIterator(rulesData, status);

    // Construction reported an error: discard whatever was built.
    if (U_FAILURE(status)) {
        delete iter;
        return NULL;
    }

    // Allocation failed: the data was never adopted, so close it here.
    if (iter == NULL) {
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    return iter;
}
// -------------------------------------
// Creates a break iterator for title casing breaks.
@ -239,38 +188,6 @@ BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
return createInstance(key, UBRK_TITLE, status);
}
// Creates the title-casing break iterator.
//
// NOTE: the locale argument is ignored; the default "title" rules file is
// always used.
BreakIterator*
BreakIterator::makeTitleInstance(const Locale& /* key */, UErrorCode& status)
{
    static const char kRulesName[] = "title";

    if (U_FAILURE(status)) {
        return NULL;
    }

    UDataMemory* rulesData = udata_open(NULL, "brk", kRulesName, &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = new RuleBasedBreakIterator(rulesData, status);

    // Construction reported an error: discard whatever was built.
    if (U_FAILURE(status)) {
        delete iter;
        return NULL;
    }

    // Allocation failed: the data was never adopted, so close it here.
    if (iter == NULL) {
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    return iter;
}
// -------------------------------------
// Gets all the available locales that has localized text boundary data.
@ -495,19 +412,19 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
BreakIterator *result = NULL;
switch (kind) {
case UBRK_CHARACTER:
result = BreakIterator::makeCharacterInstance(loc, status);
result = BreakIterator::buildInstance(loc, "grapheme", FALSE, status);
break;
case UBRK_WORD:
result = BreakIterator::makeWordInstance(loc, status);
result = BreakIterator::buildInstance(loc, "word", TRUE, status);
break;
case UBRK_LINE:
result = BreakIterator::makeLineInstance(loc, status);
result = BreakIterator::buildInstance(loc, "line", TRUE, status);
break;
case UBRK_SENTENCE:
result = BreakIterator::makeSentenceInstance(loc, status);
result = BreakIterator::buildInstance(loc, "sentence", FALSE, status);
break;
case UBRK_TITLE:
result = BreakIterator::makeTitleInstance(loc, status);
result = BreakIterator::buildInstance(loc, "title", FALSE, status);
break;
default:
status = U_ILLEGAL_ARGUMENT_ERROR;
@ -517,14 +434,6 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
return NULL;
}
// this is more of a placeholder. All the break iterators have the same actual locale: root
// except the Thai one
UResourceBundle *res = ures_open(NULL, loc.getName(), &status);
U_LOCALE_BASED(locBased, *result);
locBased.setLocaleIDs(ures_getLocaleByType(res, ULOC_VALID_LOCALE, &status),
(uprv_strcmp(loc.getLanguage(), "th") == 0) ?
"th" : "root");
ures_close(res);
return result;
}

View file

@ -590,12 +590,7 @@ public:
const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
private:
static BreakIterator* makeCharacterInstance(const Locale& loc, UErrorCode& status);
static BreakIterator* makeWordInstance(const Locale& loc, UErrorCode& status);
static BreakIterator* makeLineInstance(const Locale& loc, UErrorCode& status);
static BreakIterator* makeSentenceInstance(const Locale& loc, UErrorCode& status);
static BreakIterator* makeTitleInstance(const Locale& loc, UErrorCode& status);
static BreakIterator* buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode& status);
static BreakIterator* createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status);
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);

View file

@ -28,4 +28,4 @@
# char.txt, title.txt and word.txt are not included so that more tests pass by default,
# and so that the makefile rules are simpler.
BRK_SOURCE = \
line.txt sent.txt line_th.txt word_th.txt
line.txt sent.txt line_th.txt word_th.txt word_ja.txt word_POSIX.txt

View file

@ -0,0 +1,243 @@
#
# Copyright (C) 2002-2004,
# International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: word_POSIX.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on Version 4.0.0, dated 2003-04-17
#
##############################################################################
#
# Character class definitions from TR 29
#
##############################################################################
!!chain;
!!LBCMNoChain;
$Katakana = [[:Script = KATAKANA:]
[:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
- [:Ideographic:]
- $Katakana
- [:Script = Thai:]
- [:Script = Lao:]
- [:Script = Hiragana:]];
$ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]];
$ACMLetter = [$ALetter & [:Grapheme_Extend = TRUE:]];
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
[:name = HEBREW PUNCTUATION GERSHAYIM:]
[:name = RIGHT SINGLE QUOTATION MARK:]
[:name = HYPHENATION POINT:]];
$MidNumLet = [[:name = FULL STOP:]];
$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
$Numeric = [:LineBreak = Numeric:];
#
# Character Class Definitions.
# The names are those from TR29.
#
$CR = \u000d;
$LF = \u000a;
$Extend = [[:Grapheme_Extend = TRUE:]];
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
$Format = [[:Cf:] - $Extend];
$Hiragana = [:Hiragana:];
$Ideographic = [:IDEOGRAPHIC:];
## -------------------------------------------------
!!forward;
$CR $LF;
# rule 3 and 4
$ALetterEx = $ALetter $Extend*;
$ABaseLetterEx = $ABaseLetter $Extend*;
$ACMLetterEx = $ACMLetter $Extend*;
$NumericEx = $Numeric $Extend*;
$MidNumEx = $MidNum $Extend*;
$MidNumLetEx = $MidNumLet $Extend*;
$MidLetterEx = $MidLetter $Extend*;
$KatakanaEx = $Katakana $Extend*;
# see character breaks
[^$Control] $Extend*;
# rule 5
$ALetterEx ($Format* $ALetterEx)* {200};
# rule 6 and 7
$MidALetterEx = ($ABaseLetterEx | $Format $ACMLetterEx);
$ALetterSeq =
$ALetterEx
(
$Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
)*;
$MidALetterSeq =
$MidALetterEx
(
$Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
)*;
# rule 8
$NumericEx ($Format* $NumericEx)* {100};
# rule 9
$ALetterSeq ($Format* ($NumericEx | $MidALetterSeq))* {200};
# rule 10
$NumericEx ($Format* $MidALetterSeq)+ ($Format* $NumericEx)* {200};
# rule 11 and 12
$NumericEx ($Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx)+ {100};
# rule 13
$KatakanaEx ($Format* $KatakanaEx)* {300};
$Hiragana $Extend* {300};
$Ideographic $Extend* {400};
## -------------------------------------------------
!!reverse;
$BackALetterEx = $Extend* $ALetter;
$BackABaseLetterEx = $Extend* $ABaseLetter;
$BackACMLetterEx = $Extend* $ACMLetter;
$BackNumericEx = $Extend* $Numeric;
$BackMidNumEx = $Extend* $MidNum;
$BackMidNumLetEx = $Extend* $MidNumLet;
$BackMidLetterEx = $Extend* $MidLetter;
$BackKatakanaEx = $Extend* $Katakana;
$LF $CR;
# see character breaks
$Extend* [^$Control];
# rule 5
($BackALetterEx $Format*)* $BackABaseLetterEx;
($BackALetterEx $Format*)* $BackACMLetterEx / $Control;
# rule 6 and 7
$BackMidALetterEx = ($BackABaseLetterEx | $BackACMLetterEx $Format);
$BackALetterSeq =
(
$BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
)*
$BackABaseLetterEx;
$BackMidALetterSeq =
(
$BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
)*
$BackMidALetterEx;
# rule 8
$BackNumericEx $Format* $BackNumericEx;
# rule 9
(($BackNumericEx | $BackMidALetterSeq) $Format*)* $BackALetterSeq;
# to handle letter sequences ending with a combining mark
(($BackNumericEx | $BackMidALetterSeq) $Format*)*
(
$BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
)*
$BackACMLetterEx / $Control;
# rule 10
($BackNumericEx $Format*)* ($BackMidALetterSeq $Format*)* $BackNumericEx;
# rule 11 and 12
$BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx;
# rule 13
$BackKatakanaEx $Format* $BackKatakanaEx;
## -------------------------------------------------
!!safe_reverse;
# rule 3
$Extend+ [^$Extend];
$Extend+; # comes into play when buffer _begins_ with an $Extend+.
# rule 4
$Format+ $BackABaseLetterEx;
$Format+ $BackACMLetterEx / $Control;
$Format+ $BackNumericEx;
$Format+ $BackMidLetterEx;
$Format+ $BackMidNumLetEx;
$Format+ $BackMidNumEx;
$Format+ $BackKatakanaEx;
# rule 6
($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx;
($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Control;
# rule 11
($MidNum | $MidNumLet) $Format* $BackNumericEx;
## -------------------------------------------------
!!safe_forward;
# rule 3
$Extend+;
# rule 4
$Extend* $Format+ $ALetterEx;
$Extend* $Format+ $NumericEx;
$Extend* $Format+ $MidLetterEx;
$Extend* $Format+ $MidNumLetEx;
$Extend* $Format+ $MidNumEx;
$Extend* $Format+ $KatakanaEx;
$Extend+ $Format* $ALetterEx;
$Extend+ $Format* $NumericEx;
$Extend+ $Format* $MidLetterEx;
$Extend+ $Format* $MidNumLetEx;
$Extend+ $Format* $MidNumEx;
$Extend+ $Format* $KatakanaEx;
# rule 6
($MidLetterEx | $MidNumLetEx) $Format* $ALetterEx;
# rule 11
($MidNumEx | $MidNumLetEx) $Format* $NumericEx;

View file

@ -0,0 +1,255 @@
#
# Copyright (C) 2002-2004,
# International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: word_ja.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on Version 4.0.0, dated 2003-04-17
#
##############################################################################
#
# Character class definitions from TR 29
#
##############################################################################
!!chain;
!!LBCMNoChain;
$Katakana = [[:Script = KATAKANA:]
[:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
- [:Ideographic:]
- $Katakana
- [:Script = Thai:]
- [:Script = Lao:]
- [:Script = Hiragana:]];
$ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]];
$ACMLetter = [$ALetter & [:Grapheme_Extend = TRUE:]];
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
[:name = HEBREW PUNCTUATION GERSHAYIM:]
[:name = RIGHT SINGLE QUOTATION MARK:]
[:name = HYPHENATION POINT:]];
$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];
$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
$Numeric = [:LineBreak = Numeric:];
#
# Character Class Definitions.
# The names are those from TR29.
#
$CR = \u000d;
$LF = \u000a;
$Extend = [[:Grapheme_Extend = TRUE:]];
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
$Format = [[:Cf:] - $Extend];
$Hiragana = [:Hiragana:];
$Ideographic = [:IDEOGRAPHIC:];
## -------------------------------------------------
!!forward;
$CR $LF;
# rule 3 and 4
$ALetterEx = $ALetter $Extend*;
$ABaseLetterEx = $ABaseLetter $Extend*;
$ACMLetterEx = $ACMLetter $Extend*;
$NumericEx = $Numeric $Extend*;
$MidNumEx = $MidNum $Extend*;
$MidNumLetEx = $MidNumLet $Extend*;
$MidLetterEx = $MidLetter $Extend*;
$KatakanaEx = $Katakana $Extend*;
$HiraganaEx = $Hiragana $Extend*;
$IdeographicEx = $Ideographic $Extend*;
# see character breaks
[^$Control] $Extend*;
# rule 5
$ALetterEx ($Format* $ALetterEx)* {200};
# rule 6 and 7
$MidALetterEx = ($ABaseLetterEx | $Format $ACMLetterEx);
$ALetterSeq =
$ALetterEx
(
$Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
)*;
$MidALetterSeq =
$MidALetterEx
(
$Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
)*;
# rule 8
$NumericEx ($Format* $NumericEx)* {100};
# rule 9
$ALetterSeq ($Format* ($NumericEx | $MidALetterSeq))* {200};
# rule 10
$NumericEx ($Format* $MidALetterSeq)+ ($Format* $NumericEx)* {200};
# rule 11 and 12
$NumericEx ($Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx)+ {100};
# rule 13
$KatakanaEx ($Format* $KatakanaEx)* {300};
$HiraganaEx ($Format* $HiraganaEx)* {300};
$IdeographicEx ($Format* $IdeographicEx)* {400};
## -------------------------------------------------
!!reverse;
$BackALetterEx = $Extend* $ALetter;
$BackABaseLetterEx = $Extend* $ABaseLetter;
$BackACMLetterEx = $Extend* $ACMLetter;
$BackNumericEx = $Extend* $Numeric;
$BackMidNumEx = $Extend* $MidNum;
$BackMidNumLetEx = $Extend* $MidNumLet;
$BackMidLetterEx = $Extend* $MidLetter;
$BackKatakanaEx = $Extend* $Katakana;
$BackHiraganaEx = $Extend* $Hiragana;
$BackIdeographicEx = $Extend* $Ideographic;
$LF $CR;
# see character breaks
$Extend* [^$Control];
# rule 5
($BackALetterEx $Format*)* $BackABaseLetterEx;
($BackALetterEx $Format*)* $BackACMLetterEx / $Control;
# rule 6 and 7
$BackMidALetterEx = ($BackABaseLetterEx | $BackACMLetterEx $Format);
$BackALetterSeq =
(
$BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
)*
$BackABaseLetterEx;
$BackMidALetterSeq =
(
$BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
)*
$BackMidALetterEx;
# rule 8
$BackNumericEx $Format* $BackNumericEx;
# rule 9
(($BackNumericEx | $BackMidALetterSeq) $Format*)* $BackALetterSeq;
# to handle letter sequences ending with a combining mark
(($BackNumericEx | $BackMidALetterSeq) $Format*)*
(
$BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
)*
$BackACMLetterEx / $Control;
# rule 10
($BackNumericEx $Format*)* ($BackMidALetterSeq $Format*)* $BackNumericEx;
# rule 11 and 12
$BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx;
# rule 13
$BackKatakanaEx $Format* $BackKatakanaEx;
$BackHiraganaEx $Format* $BackHiraganaEx;
$BackIdeographicEx $Format* $BackIdeographicEx;
## -------------------------------------------------
!!safe_reverse;
# rule 3
$Extend+ [^$Extend];
$Extend+; # comes into play when buffer _begins_ with an $Extend+.
# rule 4
$Format+ $BackABaseLetterEx;
$Format+ $BackACMLetterEx / $Control;
$Format+ $BackNumericEx;
$Format+ $BackMidLetterEx;
$Format+ $BackMidNumLetEx;
$Format+ $BackMidNumEx;
$Format+ $BackKatakanaEx;
$Format+ $BackHiraganaEx;
$Format+ $BackIdeographicEx;
# rule 6
($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx;
($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Control;
# rule 11
($MidNum | $MidNumLet) $Format* $BackNumericEx;
## -------------------------------------------------
!!safe_forward;
# rule 3
$Extend+;
# rule 4
$Extend* $Format+ $ALetterEx;
$Extend* $Format+ $NumericEx;
$Extend* $Format+ $MidLetterEx;
$Extend* $Format+ $MidNumLetEx;
$Extend* $Format+ $MidNumEx;
$Extend* $Format+ $KatakanaEx;
$Extend* $Format+ $HiraganaEx;
$Extend* $Format+ $IdeographicEx;
$Extend+ $Format* $ALetterEx;
$Extend+ $Format* $NumericEx;
$Extend+ $Format* $MidLetterEx;
$Extend+ $Format* $MidNumLetEx;
$Extend+ $Format* $MidNumEx;
$Extend+ $Format* $KatakanaEx;
$Extend+ $Format* $HiraganaEx;
$Extend+ $Format* $IdeographicEx;
# rule 6
($MidLetterEx | $MidNumLetEx) $Format* $ALetterEx;
# rule 11
($MidNumEx | $MidNumLetEx) $Format* $NumericEx;

View file

@ -7,6 +7,9 @@
// *
// ***************************************************************************
en_US_POSIX{
boundaries {
word { "word_POSIX" }
}
NumberElements{
".",
",",

View file

@ -10,6 +10,9 @@
* ICU <specials> source: ../../../locale/icu/main\ja.xml
*/
ja{
boundaries {
word { "word_ja" }
}
Countries{
AD{"アンドラ"}
AE{"アラブ首長国連邦"}

View file

@ -10,6 +10,14 @@
* ICU <specials> source: ../../../locale/icu/main\root.xml
*/
root{
boundaries {
grapheme { "char" }
line { "line" }
sentence { "sent" }
title { "title" }
word { "word" }
}
Currencies{
EUR{
"€",

View file

@ -11,6 +11,10 @@
*/
th{
BreakDictionaryData:import { "../brkitr/thaidict.brk" }
boundaries {
word { "word_th" }
line { "line_th" }
}
Countries{
AD{"อันดอร์รา"}
AE{"สหรัฐอาหรับเอมิเรตส์"}

View file

@ -15,6 +15,7 @@
#include "unicode/schriter.h"
#include "unicode/uchriter.h"
#include "unicode/uiter.h"
#include "unicode/putil.h"
#include "citrtest.h"

View file

@ -575,6 +575,31 @@ void RBBITest::TestBug3818() {
delete bi;
}
void RBBITest::TestJapaneseWordBreak() {
UErrorCode status = U_ZERO_ERROR;
BITestData japaneseWordSelection(status);
ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data
ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
Locale("ja"), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
return;
}
generalIteratorTest(*e, japaneseWordSelection);
delete e;
}
//---------------------------------------------
// runIndexedTest
//---------------------------------------------
@ -633,6 +658,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
break;
case 18: name = "TestBug3818";
if(exec) TestBug3818(); break;
case 19: name = "TestJapaneseWordBreak";
if(exec) TestJapaneseWordBreak(); break;
default: name = ""; break; //needed to end loop
}

View file

@ -75,6 +75,7 @@ public:
void TestLineBreaks();
void TestSentBreaks();
void TestBug3818();
void TestJapaneseWordBreak();
/***********************/