mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-3561 Locale-based text boundaries
X-SVN-Rev: 16582
This commit is contained in:
parent
d118393447
commit
225c380bde
12 changed files with 652 additions and 203 deletions
|
@ -31,6 +31,7 @@
|
|||
#include "mutex.h"
|
||||
#include "iculserv.h"
|
||||
#include "locbased.h"
|
||||
#include "uresimp.h"
|
||||
|
||||
// *****************************************************************************
|
||||
// class BreakIterator
|
||||
|
@ -45,6 +46,105 @@ const int32_t BreakIterator::DONE = (int32_t)-1;
|
|||
|
||||
// -------------------------------------
|
||||
|
||||
// Builds a break iterator for a locale from resource-bundle data.
//
// The rules file name is read from the locale bundle's "boundaries" table
// (looked up with fallback) under the key given by `type`; the matching
// "brk" data item is then opened and handed to the new iterator.
//
// @param loc     Locale whose resource bundle is consulted.
// @param type    Key inside the "boundaries" table (e.g. "word", "line").
// @param dict    If TRUE, and the locale bundle provides "BreakDictionaryData",
//                a DictionaryBasedBreakIterator is created instead of a plain
//                RuleBasedBreakIterator.
// @param status  In/out ICU error code. Nothing is done if it already
//                indicates a failure on entry.
// @return        The new iterator, or NULL with `status` set on failure.
BreakIterator*
BreakIterator::buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode &status)
{
    char fnbuff[256];
    char actualLocale[ULOC_FULLNAME_CAPACITY];
    int32_t size = 0;
    const UChar* brkfname = NULL;
    UResourceBundle brkrules, brkname;
    BreakIterator *result = NULL;

    if (U_FAILURE(status))
        return NULL;

    // Initialize both stack bundles and both buffers up front, so that the
    // unconditional ures_close() calls below never see uninitialized memory
    // when an earlier step has failed.
    ures_initStackObject(&brkrules);
    ures_initStackObject(&brkname);
    fnbuff[0] = '\0';
    actualLocale[0] = '\0';

    // Get the locale
    UResourceBundle *b = ures_open(NULL, loc.getName(), &status);

    // Get the "boundaries" array.
    if (U_SUCCESS(status)) {
        (void) ures_getByKeyWithFallback(b, "boundaries", &brkrules, &status);
    }

    // Get the string object naming the rules file
    if (U_SUCCESS(status)) {
        (void) ures_getByKeyWithFallback(&brkrules, type, &brkname, &status);
    }

    // Get the actual string
    if (U_SUCCESS(status)) {
        brkfname = ures_getString(&brkname, &size, &status);
    }

    // Remember which locale actually supplied the rules file name.
    if (U_SUCCESS(status)) {
        const char *foundIn = ures_getLocale(&brkname, &status);
        if (U_SUCCESS(status) && foundIn != NULL) {
            // strncpy does not terminate on truncation; do it explicitly.
            uprv_strncpy(actualLocale, foundIn, sizeof(actualLocale) - 1);
            actualLocale[sizeof(actualLocale) - 1] = '\0';
        }
    }

    // Use the string if we found it
    if (U_SUCCESS(status)) {
        if (brkfname == NULL || size < 0 || size >= (int32_t)sizeof(fnbuff)) {
            // The name (plus terminator) does not fit in the local buffer.
            status = U_BUFFER_OVERFLOW_ERROR;
        } else {
            u_UCharsToChars(brkfname, fnbuff, size);
            fnbuff[size] = '\0';
        }
    }

    ures_close(&brkrules);
    ures_close(&brkname);

    UDataMemory* file = udata_open(NULL, "brk", fnbuff, &status);
    if (U_FAILURE(status)) {
        ures_close(b);
        return NULL;
    }

    // We found the break rules; now see if a dictionary is needed
    if (dict)
    {
        // A missing dictionary entry is not an error, so use a local status.
        UErrorCode localStatus = U_ZERO_ERROR;
        ures_initStackObject(&brkname);
        (void) ures_getByKeyWithFallback(b, "BreakDictionaryData", &brkname, &localStatus);
        if (U_SUCCESS(localStatus)) {
            // NOTE: the dictionary file name is hard-coded; the resource is
            // only used to detect that the locale has dictionary data.
            result = new DictionaryBasedBreakIterator(file, "thaidict.brk", status);
        }
        ures_close(&brkname);
    }

    // If there is still no result but we haven't had an error, no dictionary,
    // so make a non-dictionary break iterator
    if (U_SUCCESS(status) && result == NULL) {
        result = new RuleBasedBreakIterator(file, status);
    }

    // If there is a result, set the valid locale and actual locale
    if (U_SUCCESS(status) && result != NULL) {
        U_LOCALE_BASED(locBased, *result);
        locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), actualLocale);
    }

    ures_close(b);

    if (U_FAILURE(status) && result != NULL) {  // Sometimes redundant check, but simple
        delete result;
        return NULL;
    }

    if (result == NULL) {
        // No iterator took ownership of the data, so release it here.
        udata_close(file);
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }

    return result;
}
|
||||
|
||||
// Creates a break iterator for word breaks.
|
||||
BreakIterator* U_EXPORT2
|
||||
BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
|
||||
|
@ -52,50 +152,6 @@ BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
|
|||
return createInstance(key, UBRK_WORD, status);
|
||||
}
|
||||
|
||||
// Creates the word-break iterator for a locale.
//
// NOTE: only two cases are handled: the default "word" rules, and the Thai
// ("th") rules combined with the Thai dictionary. This is not a general
// locale lookup.
BreakIterator*
BreakIterator::makeWordInstance(const Locale& key, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    // Thai is the only language with its own rules file and a dictionary.
    const UBool isThai = (uprv_strcmp(key.getLanguage(), "th") == 0);

    UDataMemory* rulesData = udata_open(NULL, "brk", isThai ? "word_th" : "word", &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter;
    if (isThai) {
        iter = new DictionaryBasedBreakIterator(rulesData, "thaidict.brk", status);
    } else {
        iter = new RuleBasedBreakIterator(rulesData, status);
    }

    if (U_FAILURE(status)) {
        delete iter;    // no-op when allocation itself failed
        return NULL;
    }

    if (iter == NULL) {
        // Nothing adopted the data; release it and report the failed allocation.
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    return iter;
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// Creates a break iterator for line breaks.
|
||||
|
@ -105,49 +161,6 @@ BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
|
|||
return createInstance(key, UBRK_LINE, status);
|
||||
}
|
||||
|
||||
// Creates the line-break iterator for a locale.
//
// NOTE: only two cases are handled: the default "line" rules, and the Thai
// ("th") rules combined with the Thai dictionary. This is not a general
// locale lookup.
BreakIterator*
BreakIterator::makeLineInstance(const Locale& key, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    // Thai is the only language with its own rules file and a dictionary.
    const UBool isThai = (uprv_strcmp(key.getLanguage(), "th") == 0);

    UDataMemory* rulesData = udata_open(NULL, "brk", isThai ? "line_th" : "line", &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter;
    if (isThai) {
        iter = new DictionaryBasedBreakIterator(rulesData, "thaidict.brk", status);
    } else {
        iter = new RuleBasedBreakIterator(rulesData, status);
    }

    if (U_FAILURE(status)) {
        delete iter;    // no-op when allocation itself failed
        return NULL;
    }

    if (iter == NULL) {
        // Nothing adopted the data; release it and report the failed allocation.
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    return iter;
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// Creates a break iterator for character breaks.
|
||||
|
@ -157,38 +170,6 @@ BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
|
|||
return createInstance(key, UBRK_CHARACTER, status);
|
||||
}
|
||||
|
||||
// Creates the character-break iterator.
//
// NOTE: the locale argument is ignored; the default "char" rules are always
// used.
BreakIterator*
BreakIterator::makeCharacterInstance(const Locale& /* key */, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    UDataMemory* rulesData = udata_open(NULL, "brk", "char", &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = new RuleBasedBreakIterator(rulesData, status);

    if (U_FAILURE(status)) {
        delete iter;    // no-op when allocation itself failed
        return NULL;
    }

    if (iter == NULL) {
        // Nothing adopted the data; release it and report the failed allocation.
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    return iter;
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// Creates a break iterator for sentence breaks.
|
||||
|
@ -198,38 +179,6 @@ BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
|
|||
return createInstance(key, UBRK_SENTENCE, status);
|
||||
}
|
||||
|
||||
// Creates the sentence-break iterator.
//
// NOTE: the locale argument is ignored; the default "sent" rules are always
// used.
BreakIterator*
BreakIterator::makeSentenceInstance(const Locale& /*key */, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    UDataMemory* rulesData = udata_open(NULL, "brk", "sent", &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = new RuleBasedBreakIterator(rulesData, status);

    if (U_FAILURE(status)) {
        delete iter;    // no-op when allocation itself failed
        return NULL;
    }

    if (iter == NULL) {
        // Nothing adopted the data; release it and report the failed allocation.
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    return iter;
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// Creates a break iterator for title casing breaks.
|
||||
|
@ -239,38 +188,6 @@ BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
|
|||
return createInstance(key, UBRK_TITLE, status);
|
||||
}
|
||||
|
||||
// Creates the title-casing break iterator.
//
// NOTE: the locale argument is ignored; the default "title" rules are always
// used.
BreakIterator*
BreakIterator::makeTitleInstance(const Locale& /* key */, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    UDataMemory* rulesData = udata_open(NULL, "brk", "title", &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = new RuleBasedBreakIterator(rulesData, status);

    if (U_FAILURE(status)) {
        delete iter;    // no-op when allocation itself failed
        return NULL;
    }

    if (iter == NULL) {
        // Nothing adopted the data; release it and report the failed allocation.
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    return iter;
}
|
||||
|
||||
// -------------------------------------
|
||||
|
||||
// Gets all the available locales that has localized text boundary data.
|
||||
|
@ -495,19 +412,19 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
|||
BreakIterator *result = NULL;
|
||||
switch (kind) {
|
||||
case UBRK_CHARACTER:
|
||||
result = BreakIterator::makeCharacterInstance(loc, status);
|
||||
result = BreakIterator::buildInstance(loc, "grapheme", FALSE, status);
|
||||
break;
|
||||
case UBRK_WORD:
|
||||
result = BreakIterator::makeWordInstance(loc, status);
|
||||
result = BreakIterator::buildInstance(loc, "word", TRUE, status);
|
||||
break;
|
||||
case UBRK_LINE:
|
||||
result = BreakIterator::makeLineInstance(loc, status);
|
||||
result = BreakIterator::buildInstance(loc, "line", TRUE, status);
|
||||
break;
|
||||
case UBRK_SENTENCE:
|
||||
result = BreakIterator::makeSentenceInstance(loc, status);
|
||||
result = BreakIterator::buildInstance(loc, "sentence", FALSE, status);
|
||||
break;
|
||||
case UBRK_TITLE:
|
||||
result = BreakIterator::makeTitleInstance(loc, status);
|
||||
result = BreakIterator::buildInstance(loc, "title", FALSE, status);
|
||||
break;
|
||||
default:
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
|
@ -517,14 +434,6 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
// this is more of a placeholder. All the break iterators have the same actual locale: root
|
||||
// except the Thai one
|
||||
UResourceBundle *res = ures_open(NULL, loc.getName(), &status);
|
||||
U_LOCALE_BASED(locBased, *result);
|
||||
locBased.setLocaleIDs(ures_getLocaleByType(res, ULOC_VALID_LOCALE, &status),
|
||||
(uprv_strcmp(loc.getLanguage(), "th") == 0) ?
|
||||
"th" : "root");
|
||||
ures_close(res);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
@ -590,12 +590,7 @@ public:
|
|||
const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
|
||||
|
||||
private:
|
||||
static BreakIterator* makeCharacterInstance(const Locale& loc, UErrorCode& status);
|
||||
static BreakIterator* makeWordInstance(const Locale& loc, UErrorCode& status);
|
||||
static BreakIterator* makeLineInstance(const Locale& loc, UErrorCode& status);
|
||||
static BreakIterator* makeSentenceInstance(const Locale& loc, UErrorCode& status);
|
||||
static BreakIterator* makeTitleInstance(const Locale& loc, UErrorCode& status);
|
||||
|
||||
static BreakIterator* buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode& status);
|
||||
static BreakIterator* createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status);
|
||||
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
|
||||
|
||||
|
|
|
@ -28,4 +28,4 @@
|
|||
# char.txt, title.txt and word.txt are not included so that more tests pass by default,
|
||||
# and so that the makefile rules are simpler.
|
||||
BRK_SOURCE = \
|
||||
line.txt sent.txt line_th.txt word_th.txt
|
||||
line.txt sent.txt line_th.txt word_th.txt word_ja.txt word_POSIX.txt
|
||||
|
|
243
icu4c/source/data/brkitr/word_POSIX.txt
Normal file
243
icu4c/source/data/brkitr/word_POSIX.txt
Normal file
|
@ -0,0 +1,243 @@
|
|||
#
|
||||
# Copyright (C) 2002-2004,
|
||||
# International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# file: word_POSIX.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on Version 4.0.0, dated 2003-04-17
|
||||
#
|
||||
|
||||
##############################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
##############################################################################
|
||||
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
|
||||
$Katakana = [[:Script = KATAKANA:]
|
||||
[:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
- [:Ideographic:]
|
||||
- $Katakana
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Script = Hiragana:]];
|
||||
|
||||
$ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]];
|
||||
$ACMLetter = [$ALetter & [:Grapheme_Extend = TRUE:]];
|
||||
|
||||
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
|
||||
[:name = HEBREW PUNCTUATION GERSHAYIM:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:]
|
||||
[:name = HYPHENATION POINT:]];
|
||||
|
||||
$MidNumLet = [[:name = FULL STOP:]];
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
|
||||
$Format = [[:Cf:] - $Extend];
|
||||
$Hiragana = [:Hiragana:];
|
||||
$Ideographic = [:IDEOGRAPHIC:];
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
$CR $LF;
|
||||
|
||||
# rule 3 and 4
|
||||
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$ABaseLetterEx = $ABaseLetter $Extend*;
|
||||
$ACMLetterEx = $ACMLetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidNumLetEx = $MidNumLet $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
|
||||
# see character breaks
|
||||
|
||||
[^$Control] $Extend*;
|
||||
|
||||
# rule 5
|
||||
|
||||
$ALetterEx ($Format* $ALetterEx)* {200};
|
||||
|
||||
# rule 6 and 7
|
||||
|
||||
$MidALetterEx = ($ABaseLetterEx | $Format $ACMLetterEx);
|
||||
|
||||
$ALetterSeq =
|
||||
$ALetterEx
|
||||
(
|
||||
$Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
|
||||
)*;
|
||||
|
||||
$MidALetterSeq =
|
||||
$MidALetterEx
|
||||
(
|
||||
$Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
|
||||
)*;
|
||||
|
||||
# rule 8
|
||||
|
||||
$NumericEx ($Format* $NumericEx)* {100};
|
||||
|
||||
# rule 9
|
||||
|
||||
$ALetterSeq ($Format* ($NumericEx | $MidALetterSeq))* {200};
|
||||
|
||||
# rule 10
|
||||
|
||||
$NumericEx ($Format* $MidALetterSeq)+ ($Format* $NumericEx)* {200};
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$NumericEx ($Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx)+ {100};
|
||||
|
||||
# rule 13
|
||||
|
||||
$KatakanaEx ($Format* $KatakanaEx)* {300};
|
||||
$Hiragana $Extend* {300};
|
||||
$Ideographic $Extend* {400};
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
$BackALetterEx = $Extend* $ALetter;
|
||||
$BackABaseLetterEx = $Extend* $ABaseLetter;
|
||||
$BackACMLetterEx = $Extend* $ACMLetter;
|
||||
$BackNumericEx = $Extend* $Numeric;
|
||||
$BackMidNumEx = $Extend* $MidNum;
|
||||
$BackMidNumLetEx = $Extend* $MidNumLet;
|
||||
$BackMidLetterEx = $Extend* $MidLetter;
|
||||
$BackKatakanaEx = $Extend* $Katakana;
|
||||
|
||||
$LF $CR;
|
||||
|
||||
# see character breaks
|
||||
|
||||
$Extend* [^$Control];
|
||||
|
||||
# rule 5
|
||||
|
||||
($BackALetterEx $Format*)* $BackABaseLetterEx;
|
||||
($BackALetterEx $Format*)* $BackACMLetterEx / $Control;
|
||||
|
||||
# rule 6 and 7
|
||||
|
||||
$BackMidALetterEx = ($BackABaseLetterEx | $BackACMLetterEx $Format);
|
||||
|
||||
$BackALetterSeq =
|
||||
(
|
||||
$BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
|
||||
)*
|
||||
$BackABaseLetterEx;
|
||||
|
||||
$BackMidALetterSeq =
|
||||
(
|
||||
$BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
|
||||
)*
|
||||
$BackMidALetterEx;
|
||||
|
||||
# rule 8
|
||||
|
||||
$BackNumericEx $Format* $BackNumericEx;
|
||||
|
||||
# rule 9
|
||||
|
||||
(($BackNumericEx | $BackMidALetterSeq) $Format*)* $BackALetterSeq;
|
||||
|
||||
# to handle letter sequences ending with a combining mark
|
||||
(($BackNumericEx | $BackMidALetterSeq) $Format*)*
|
||||
(
|
||||
$BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
|
||||
)*
|
||||
$BackACMLetterEx / $Control;
|
||||
|
||||
# rule 10
|
||||
|
||||
($BackNumericEx $Format*)* ($BackMidALetterSeq $Format*)* $BackNumericEx;
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx;
|
||||
|
||||
# rule 13
|
||||
|
||||
$BackKatakanaEx $Format* $BackKatakanaEx;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# rule 3
|
||||
$Extend+ [^$Extend];
|
||||
$Extend+; # comes into play when buffer _begins_ with an $Extend+.
|
||||
|
||||
# rule 4
|
||||
$Format+ $BackABaseLetterEx;
|
||||
$Format+ $BackACMLetterEx / $Control;
|
||||
$Format+ $BackNumericEx;
|
||||
$Format+ $BackMidLetterEx;
|
||||
$Format+ $BackMidNumLetEx;
|
||||
$Format+ $BackMidNumEx;
|
||||
$Format+ $BackKatakanaEx;
|
||||
|
||||
|
||||
# rule 6
|
||||
($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx;
|
||||
($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Control;
|
||||
|
||||
# rule 11
|
||||
($MidNum | $MidNumLet) $Format* $BackNumericEx;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# rule 3
|
||||
$Extend+;
|
||||
|
||||
# rule 4
|
||||
$Extend* $Format+ $ALetterEx;
|
||||
$Extend* $Format+ $NumericEx;
|
||||
$Extend* $Format+ $MidLetterEx;
|
||||
$Extend* $Format+ $MidNumLetEx;
|
||||
$Extend* $Format+ $MidNumEx;
|
||||
$Extend* $Format+ $KatakanaEx;
|
||||
|
||||
$Extend+ $Format* $ALetterEx;
|
||||
$Extend+ $Format* $NumericEx;
|
||||
$Extend+ $Format* $MidLetterEx;
|
||||
$Extend+ $Format* $MidNumLetEx;
|
||||
$Extend+ $Format* $MidNumEx;
|
||||
$Extend+ $Format* $KatakanaEx;
|
||||
|
||||
# rule 6
|
||||
($MidLetterEx | $MidNumLetEx) $Format* $ALetterEx;
|
||||
|
||||
# rule 11
|
||||
($MidNumEx | $MidNumLetEx) $Format* $NumericEx;
|
255
icu4c/source/data/brkitr/word_ja.txt
Normal file
255
icu4c/source/data/brkitr/word_ja.txt
Normal file
|
@ -0,0 +1,255 @@
|
|||
#
|
||||
# Copyright (C) 2002-2004,
|
||||
# International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# file: word_ja.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on Version 4.0.0, dated 2003-04-17
|
||||
#
|
||||
|
||||
##############################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
##############################################################################
|
||||
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
|
||||
$Katakana = [[:Script = KATAKANA:]
|
||||
[:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
- [:Ideographic:]
|
||||
- $Katakana
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Script = Hiragana:]];
|
||||
|
||||
$ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]];
|
||||
$ACMLetter = [$ALetter & [:Grapheme_Extend = TRUE:]];
|
||||
|
||||
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
|
||||
[:name = HEBREW PUNCTUATION GERSHAYIM:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:]
|
||||
[:name = HYPHENATION POINT:]];
|
||||
|
||||
$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
|
||||
$Format = [[:Cf:] - $Extend];
|
||||
$Hiragana = [:Hiragana:];
|
||||
$Ideographic = [:IDEOGRAPHIC:];
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
$CR $LF;
|
||||
|
||||
# rule 3 and 4
|
||||
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$ABaseLetterEx = $ABaseLetter $Extend*;
|
||||
$ACMLetterEx = $ACMLetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidNumLetEx = $MidNumLet $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
$HiraganaEx = $Hiragana $Extend*;
|
||||
$IdeographicEx = $Ideographic $Extend*;
|
||||
|
||||
# see character breaks
|
||||
|
||||
[^$Control] $Extend*;
|
||||
|
||||
# rule 5
|
||||
|
||||
$ALetterEx ($Format* $ALetterEx)* {200};
|
||||
|
||||
# rule 6 and 7
|
||||
|
||||
$MidALetterEx = ($ABaseLetterEx | $Format $ACMLetterEx);
|
||||
|
||||
$ALetterSeq =
|
||||
$ALetterEx
|
||||
(
|
||||
$Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
|
||||
)*;
|
||||
|
||||
$MidALetterSeq =
|
||||
$MidALetterEx
|
||||
(
|
||||
$Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
|
||||
)*;
|
||||
|
||||
# rule 8
|
||||
|
||||
$NumericEx ($Format* $NumericEx)* {100};
|
||||
|
||||
# rule 9
|
||||
|
||||
$ALetterSeq ($Format* ($NumericEx | $MidALetterSeq))* {200};
|
||||
|
||||
# rule 10
|
||||
|
||||
$NumericEx ($Format* $MidALetterSeq)+ ($Format* $NumericEx)* {200};
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$NumericEx ($Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx)+ {100};
|
||||
|
||||
# rule 13
|
||||
|
||||
$KatakanaEx ($Format* $KatakanaEx)* {300};
|
||||
$HiraganaEx ($Format* $HiraganaEx)* {300};
|
||||
$IdeographicEx ($Format* $IdeographicEx)* {400};
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
$BackALetterEx = $Extend* $ALetter;
|
||||
$BackABaseLetterEx = $Extend* $ABaseLetter;
|
||||
$BackACMLetterEx = $Extend* $ACMLetter;
|
||||
$BackNumericEx = $Extend* $Numeric;
|
||||
$BackMidNumEx = $Extend* $MidNum;
|
||||
$BackMidNumLetEx = $Extend* $MidNumLet;
|
||||
$BackMidLetterEx = $Extend* $MidLetter;
|
||||
$BackKatakanaEx = $Extend* $Katakana;
|
||||
$BackHiraganaEx = $Extend* $Hiragana;
|
||||
$BackIdeographicEx = $Extend* $Ideographic;
|
||||
|
||||
$LF $CR;
|
||||
|
||||
# see character breaks
|
||||
|
||||
$Extend* [^$Control];
|
||||
|
||||
# rule 5
|
||||
|
||||
($BackALetterEx $Format*)* $BackABaseLetterEx;
|
||||
($BackALetterEx $Format*)* $BackACMLetterEx / $Control;
|
||||
|
||||
# rule 6 and 7
|
||||
|
||||
$BackMidALetterEx = ($BackABaseLetterEx | $BackACMLetterEx $Format);
|
||||
|
||||
$BackALetterSeq =
|
||||
(
|
||||
$BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
|
||||
)*
|
||||
$BackABaseLetterEx;
|
||||
|
||||
$BackMidALetterSeq =
|
||||
(
|
||||
$BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
|
||||
)*
|
||||
$BackMidALetterEx;
|
||||
|
||||
# rule 8
|
||||
|
||||
$BackNumericEx $Format* $BackNumericEx;
|
||||
|
||||
# rule 9
|
||||
|
||||
(($BackNumericEx | $BackMidALetterSeq) $Format*)* $BackALetterSeq;
|
||||
|
||||
# to handle letter sequences ending with a combining mark
|
||||
(($BackNumericEx | $BackMidALetterSeq) $Format*)*
|
||||
(
|
||||
$BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
|
||||
)*
|
||||
$BackACMLetterEx / $Control;
|
||||
|
||||
# rule 10
|
||||
|
||||
($BackNumericEx $Format*)* ($BackMidALetterSeq $Format*)* $BackNumericEx;
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx;
|
||||
|
||||
# rule 13
|
||||
|
||||
$BackKatakanaEx $Format* $BackKatakanaEx;
|
||||
$BackHiraganaEx $Format* $BackHiraganaEx;
|
||||
$BackIdeographicEx $Format* $BackIdeographicEx;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# rule 3
|
||||
$Extend+ [^$Extend];
|
||||
$Extend+; # comes into play when buffer _begins_ with an $Extend+.
|
||||
|
||||
# rule 4
|
||||
$Format+ $BackABaseLetterEx;
|
||||
$Format+ $BackACMLetterEx / $Control;
|
||||
$Format+ $BackNumericEx;
|
||||
$Format+ $BackMidLetterEx;
|
||||
$Format+ $BackMidNumLetEx;
|
||||
$Format+ $BackMidNumEx;
|
||||
$Format+ $BackKatakanaEx;
|
||||
$Format+ $BackHiraganaEx;
|
||||
$Format+ $BackIdeographicEx;
|
||||
|
||||
|
||||
# rule 6
|
||||
($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx;
|
||||
($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Control;
|
||||
|
||||
# rule 11
|
||||
($MidNum | $MidNumLet) $Format* $BackNumericEx;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# rule 3
|
||||
$Extend+;
|
||||
|
||||
# rule 4
|
||||
$Extend* $Format+ $ALetterEx;
|
||||
$Extend* $Format+ $NumericEx;
|
||||
$Extend* $Format+ $MidLetterEx;
|
||||
$Extend* $Format+ $MidNumLetEx;
|
||||
$Extend* $Format+ $MidNumEx;
|
||||
$Extend* $Format+ $KatakanaEx;
|
||||
$Extend* $Format+ $HiraganaEx;
|
||||
$Extend* $Format+ $IdeographicEx;
|
||||
|
||||
$Extend+ $Format* $ALetterEx;
|
||||
$Extend+ $Format* $NumericEx;
|
||||
$Extend+ $Format* $MidLetterEx;
|
||||
$Extend+ $Format* $MidNumLetEx;
|
||||
$Extend+ $Format* $MidNumEx;
|
||||
$Extend+ $Format* $KatakanaEx;
|
||||
$Extend+ $Format* $HiraganaEx;
|
||||
$Extend+ $Format* $IdeographicEx;
|
||||
|
||||
# rule 6
|
||||
($MidLetterEx | $MidNumLetEx) $Format* $ALetterEx;
|
||||
|
||||
# rule 11
|
||||
($MidNumEx | $MidNumLetEx) $Format* $NumericEx;
|
|
@ -7,6 +7,9 @@
|
|||
// *
|
||||
// ***************************************************************************
|
||||
en_US_POSIX{
|
||||
boundaries {
|
||||
word { "word_POSIX" }
|
||||
}
|
||||
NumberElements{
|
||||
".",
|
||||
",",
|
||||
|
|
|
@ -10,6 +10,9 @@
|
|||
* ICU <specials> source: ../../../locale/icu/main\ja.xml
|
||||
*/
|
||||
ja{
|
||||
boundaries {
|
||||
word { "word_ja" }
|
||||
}
|
||||
Countries{
|
||||
AD{"アンドラ"}
|
||||
AE{"アラブ首長国連邦"}
|
||||
|
|
|
@ -10,6 +10,14 @@
|
|||
* ICU <specials> source: ../../../locale/icu/main\root.xml
|
||||
*/
|
||||
root{
|
||||
boundaries {
|
||||
grapheme { "char" }
|
||||
line { "line" }
|
||||
sentence { "sent" }
|
||||
title { "title" }
|
||||
word { "word" }
|
||||
}
|
||||
|
||||
Currencies{
|
||||
EUR{
|
||||
"€",
|
||||
|
|
|
@ -11,6 +11,10 @@
|
|||
*/
|
||||
th{
|
||||
BreakDictionaryData:import { "../brkitr/thaidict.brk" }
|
||||
boundaries {
|
||||
word { "word_th" }
|
||||
line { "line_th" }
|
||||
}
|
||||
Countries{
|
||||
AD{"อันดอร์รา"}
|
||||
AE{"สหรัฐอาหรับเอมิเรตส์"}
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "unicode/schriter.h"
|
||||
#include "unicode/uchriter.h"
|
||||
#include "unicode/uiter.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "citrtest.h"
|
||||
|
||||
|
||||
|
|
|
@ -575,6 +575,31 @@ void RBBITest::TestBug3818() {
|
|||
delete bi;
|
||||
}
|
||||
|
||||
|
||||
void RBBITest::TestJapaneseWordBreak() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
BITestData japaneseWordSelection(status);
|
||||
|
||||
ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data
|
||||
ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
|
||||
ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
|
||||
ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
|
||||
ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
|
||||
ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
|
||||
ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
|
||||
|
||||
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
|
||||
Locale("ja"), status);
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
generalIteratorTest(*e, japaneseWordSelection);
|
||||
delete e;
|
||||
}
|
||||
|
||||
//---------------------------------------------
|
||||
// runIndexedTest
|
||||
//---------------------------------------------
|
||||
|
@ -633,6 +658,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
break;
|
||||
case 18: name = "TestBug3818";
|
||||
if(exec) TestBug3818(); break;
|
||||
case 19: name = "TestJapaneseWordBreak";
|
||||
if(exec) TestJapaneseWordBreak(); break;
|
||||
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
|
|
|
@ -75,6 +75,7 @@ public:
|
|||
void TestLineBreaks();
|
||||
void TestSentBreaks();
|
||||
void TestBug3818();
|
||||
void TestJapaneseWordBreak();
|
||||
|
||||
|
||||
/***********************/
|
||||
|
|
Loading…
Add table
Reference in a new issue