mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-11770 If locale has ss-standard, sentence break iterator uses suppressions data via FilteredBreakIterator
X-SVN-Rev: 37914
This commit is contained in:
parent
e4d76d4125
commit
cd4634345e
4 changed files with 158 additions and 21 deletions
|
@ -27,6 +27,7 @@
|
|||
#include "unicode/udata.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/filteredbrk.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "cstring.h"
|
||||
#include "umutex.h"
|
||||
|
@ -383,7 +384,7 @@ BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& statu
|
|||
}
|
||||
|
||||
// -------------------------------------
|
||||
enum { kLBTypeLenMax = 32 };
|
||||
enum { kKeyValueLenMax = 32 };
|
||||
|
||||
BreakIterator*
|
||||
BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
||||
|
@ -392,7 +393,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
|||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
char lbType[kLBTypeLenMax];
|
||||
char lbType[kKeyValueLenMax];
|
||||
|
||||
BreakIterator *result = NULL;
|
||||
switch (kind) {
|
||||
|
@ -405,9 +406,9 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
|||
case UBRK_LINE:
|
||||
uprv_strcpy(lbType, "line");
|
||||
{
|
||||
char lbKeyValue[kLBTypeLenMax] = {0};
|
||||
char lbKeyValue[kKeyValueLenMax] = {0};
|
||||
UErrorCode kvStatus = U_ZERO_ERROR;
|
||||
int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kLBTypeLenMax, kvStatus);
|
||||
int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus);
|
||||
if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) {
|
||||
uprv_strcat(lbType, "_");
|
||||
uprv_strcat(lbType, lbKeyValue);
|
||||
|
@ -417,6 +418,18 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
|||
break;
|
||||
case UBRK_SENTENCE:
|
||||
result = BreakIterator::buildInstance(loc, "sentence", kind, status);
|
||||
{
|
||||
char ssKeyValue[kKeyValueLenMax] = {0};
|
||||
UErrorCode kvStatus = U_ZERO_ERROR;
|
||||
int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus);
|
||||
if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) {
|
||||
FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus);
|
||||
if (U_SUCCESS(kvStatus)) {
|
||||
result = fbiBuilder->build(result, status);
|
||||
delete fbiBuilder;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case UBRK_TITLE:
|
||||
result = BreakIterator::buildInstance(loc, "title", kind, status);
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
********************************************************************************
|
||||
* Copyright (C) 1996-2013, International Business Machines
|
||||
* Copyright (C) 1996-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
********************************************************************************
|
||||
*/
|
||||
|
@ -164,10 +164,9 @@ ubrk_setText(UBreakIterator* bi,
|
|||
int32_t textLength,
|
||||
UErrorCode* status)
|
||||
{
|
||||
BreakIterator *brit = (BreakIterator *)bi;
|
||||
UText ut = UTEXT_INITIALIZER;
|
||||
utext_openUChars(&ut, text, textLength, status);
|
||||
brit->setText(&ut, *status);
|
||||
((BreakIterator*)bi)->setText(&ut, *status);
|
||||
// A stack allocated UText wrapping a UChar * string
|
||||
// can be dumped without explicitly closing it.
|
||||
}
|
||||
|
@ -179,8 +178,7 @@ ubrk_setUText(UBreakIterator *bi,
|
|||
UText *text,
|
||||
UErrorCode *status)
|
||||
{
|
||||
RuleBasedBreakIterator *brit = (RuleBasedBreakIterator *)bi;
|
||||
brit->RuleBasedBreakIterator::setText(text, *status);
|
||||
((BreakIterator*)bi)->setText(text, *status);
|
||||
}
|
||||
|
||||
|
||||
|
@ -191,35 +189,35 @@ U_CAPI int32_t U_EXPORT2
|
|||
ubrk_current(const UBreakIterator *bi)
|
||||
{
|
||||
|
||||
return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::current();
|
||||
return ((BreakIterator*)bi)->current();
|
||||
}
|
||||
|
||||
// C API wrapper: forwards to next() on the BreakIterator behind `bi` and
// returns its result unchanged.
U_CAPI int32_t U_EXPORT2
ubrk_next(UBreakIterator *bi)
{
    // Call through the BreakIterator base type with an unqualified call so the
    // actual iterator class handles it. The object is not necessarily a
    // RuleBasedBreakIterator: BreakIterator::makeInstance may hand back the
    // result of FilteredBreakIteratorBuilder::build() for "ss=standard" locales.
    return ((BreakIterator*)bi)->next();
}
|
||||
|
||||
// C API wrapper: forwards to previous() on the BreakIterator behind `bi` and
// returns its result unchanged.
U_CAPI int32_t U_EXPORT2
ubrk_previous(UBreakIterator *bi)
{
    // Unqualified call through the BreakIterator base type: the object may be
    // something other than a RuleBasedBreakIterator (e.g. the iterator built by
    // FilteredBreakIteratorBuilder::build() in BreakIterator::makeInstance).
    return ((BreakIterator*)bi)->previous();
}
|
||||
|
||||
// C API wrapper: forwards to first() on the BreakIterator behind `bi` and
// returns its result unchanged.
U_CAPI int32_t U_EXPORT2
ubrk_first(UBreakIterator *bi)
{
    // Unqualified call through the BreakIterator base type: the object may be
    // something other than a RuleBasedBreakIterator (e.g. the iterator built by
    // FilteredBreakIteratorBuilder::build() in BreakIterator::makeInstance).
    return ((BreakIterator*)bi)->first();
}
|
||||
|
||||
// C API wrapper: forwards to last() on the BreakIterator behind `bi` and
// returns its result unchanged.
U_CAPI int32_t U_EXPORT2
ubrk_last(UBreakIterator *bi)
{
    // Unqualified call through the BreakIterator base type: the object may be
    // something other than a RuleBasedBreakIterator (e.g. the iterator built by
    // FilteredBreakIteratorBuilder::build() in BreakIterator::makeInstance).
    return ((BreakIterator*)bi)->last();
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
|
@ -227,7 +225,7 @@ ubrk_preceding(UBreakIterator *bi,
|
|||
int32_t offset)
|
||||
{
|
||||
|
||||
return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::preceding(offset);
|
||||
return ((BreakIterator*)bi)->preceding(offset);
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
|
@ -235,7 +233,7 @@ ubrk_following(UBreakIterator *bi,
|
|||
int32_t offset)
|
||||
{
|
||||
|
||||
return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::following(offset);
|
||||
return ((BreakIterator*)bi)->following(offset);
|
||||
}
|
||||
|
||||
U_CAPI const char* U_EXPORT2
|
||||
|
@ -256,20 +254,20 @@ ubrk_countAvailable()
|
|||
// C API wrapper: forwards to isBoundary(offset) on the BreakIterator behind
// `bi` and returns its result unchanged.
U_CAPI UBool U_EXPORT2
ubrk_isBoundary(UBreakIterator *bi, int32_t offset)
{
    // Unqualified call through the BreakIterator base type: the object may be
    // something other than a RuleBasedBreakIterator (e.g. the iterator built by
    // FilteredBreakIteratorBuilder::build() in BreakIterator::makeInstance).
    return ((BreakIterator*)bi)->isBoundary(offset);
}
|
||||
|
||||
|
||||
// C API wrapper: forwards to getRuleStatus() on the BreakIterator behind `bi`
// and returns its result unchanged.
U_CAPI int32_t U_EXPORT2
ubrk_getRuleStatus(UBreakIterator *bi)
{
    // Unqualified call through the BreakIterator base type: the object may be
    // something other than a RuleBasedBreakIterator (e.g. the iterator built by
    // FilteredBreakIteratorBuilder::build() in BreakIterator::makeInstance).
    return ((BreakIterator*)bi)->getRuleStatus();
}
|
||||
|
||||
// C API wrapper: forwards to getRuleStatusVec() on the BreakIterator behind
// `bi`, passing the caller's buffer, its capacity and the error code through,
// and returns the result unchanged.
U_CAPI int32_t U_EXPORT2
ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status)
{
    // Unqualified call through the BreakIterator base type: the object may be
    // something other than a RuleBasedBreakIterator (e.g. the iterator built by
    // FilteredBreakIteratorBuilder::build() in BreakIterator::makeInstance).
    return ((BreakIterator*)bi)->getRuleStatusVec(fillInVec, capacity, *status);
}
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and others.
|
||||
* Copyright (C) 1996-2015, International Business Machines Corporation and others.
|
||||
* All Rights Reserved.
|
||||
******************************************************************************
|
||||
*/
|
||||
|
@ -45,10 +45,20 @@
|
|||
* when line-wrapping. The mechanism correctly handles punctuation and
|
||||
* hyphenated words.
|
||||
* <p>
|
||||
* Note: The locale keyword "lb" can be used to modify line break
|
||||
* behavior according to the CSS level 3 line-break options, see
|
||||
* <http://dev.w3.org/csswg/css-text/#line-breaking>. For example:
|
||||
* "ja@lb=strict", "zh@lb=loose".
|
||||
* <p>
|
||||
* Sentence boundary analysis allows selection with correct
|
||||
* interpretation of periods within numbers and abbreviations, and
|
||||
* trailing punctuation marks such as quotation marks and parentheses.
|
||||
* <p>
|
||||
* Note: The locale keyword "ss" can be used to enable use of
|
||||
* segmentation suppression data (preventing breaks in English after
|
||||
* abbreviations such as "Mr." or "Est.", for example), as follows:
|
||||
* "en@ss=standard".
|
||||
* <p>
|
||||
* Word boundary analysis is used by search and replace functions, as
|
||||
* well as within text editing applications that allow the user to
|
||||
* select words with a double click. Word selection provides correct
|
||||
|
@ -202,7 +212,9 @@ typedef enum USentenceBreakTag {
|
|||
* and sentence breaks in text.
|
||||
* @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
|
||||
* UBRK_LINE, UBRK_SENTENCE
|
||||
* @param locale The locale specifying the text-breaking conventions.
|
||||
* @param locale The locale specifying the text-breaking conventions. Note that
|
||||
* locale keys such as "lb" and "ss" may be used to modify text break behavior,
|
||||
* see general discussion of BreakIterator C API.
|
||||
* @param text The text to be iterated over.
|
||||
* @param textLength The number of characters in text, or -1 if null-terminated.
|
||||
* @param status A UErrorCode to receive any errors.
|
||||
|
|
|
@ -49,6 +49,7 @@ static void TestBreakIteratorUText(void);
|
|||
static void TestBreakIteratorTailoring(void);
|
||||
static void TestBreakIteratorRefresh(void);
|
||||
static void TestBug11665(void);
|
||||
static void TestBreakIteratorSuppressions(void);
|
||||
|
||||
void addBrkIterAPITest(TestNode** root);
|
||||
|
||||
|
@ -65,6 +66,7 @@ void addBrkIterAPITest(TestNode** root)
|
|||
addTest(root, &TestBreakIteratorTailoring, "tstxtbd/cbiapts/TestBreakIteratorTailoring");
|
||||
addTest(root, &TestBreakIteratorRefresh, "tstxtbd/cbiapts/TestBreakIteratorRefresh");
|
||||
addTest(root, &TestBug11665, "tstxtbd/cbiapts/TestBug11665");
|
||||
addTest(root, &TestBreakIteratorSuppressions, "tstxtbd/cbiapts/TestBreakIteratorSuppressions");
|
||||
}
|
||||
|
||||
#define CLONETEST_ITERATOR_COUNT 2
|
||||
|
@ -934,5 +936,117 @@ static void TestBug11665(void) {
|
|||
ubrk_close(bi);
|
||||
}
|
||||
|
||||
/*
 * Test data for TestBreakIteratorSuppressions.
 *
 * Each text has four expected-offset lists, every one terminated by -1:
 * forward and reverse sentence boundaries, with and without the "ss=standard"
 * suppressions. Forward lists omit the starting offset 0; reverse lists omit
 * the end-of-text offset, because the test starts from ubrk_first()/ubrk_last()
 * and compares only the offsets reached afterwards.
 */
static const char testSentenceSuppressionsEn[] = "Mr. Jones comes home. Dr. Smith Ph.D. is out. In the U.S.A. it is hot.";
static const int32_t testSentSuppFwdOffsetsEn[] = { 22, 26, 46, 70, -1 };     /* With suppressions, currently not handling Dr. */
static const int32_t testSentFwdOffsetsEn[]     = {  4, 22, 26, 46, 70, -1 }; /* Without suppressions */
static const int32_t testSentSuppRevOffsetsEn[] = { 46, 26, 22,  0, -1 };     /* With suppressions, currently not handling Dr. */
static const int32_t testSentRevOffsetsEn[]     = { 46, 26, 22,  4,  0, -1 }; /* Without suppressions */

/* The \\u escapes are expanded at run time by u_unescape() in the test. */
static const char testSentenceSuppressionsDe[] = "Wenn ich schon h\\u00F6re zu Guttenberg kommt evtl. zur\\u00FCck.";
static const int32_t testSentSuppFwdOffsetsDe[] = { 53, -1 };       /* With suppressions */
static const int32_t testSentFwdOffsetsDe[]     = { 53, -1 };       /* Without suppressions; no break in evtl. zur due to casing */
static const int32_t testSentSuppRevOffsetsDe[] = {  0, -1 };       /* With suppressions */
static const int32_t testSentRevOffsetsDe[]     = {  0, -1 };       /* Without suppressions */

static const char testSentenceSuppressionsEs[] = "Te esperamos todos los miercoles en Bravo 416, Col. El Pueblo a las 7 PM.";
static const int32_t testSentSuppFwdOffsetsEs[] = { 73, -1 };       /* With suppressions */
static const int32_t testSentFwdOffsetsEs[]     = { 52, 73, -1 };   /* Without suppressions */
static const int32_t testSentSuppRevOffsetsEs[] = {  0, -1 };       /* With suppressions */
static const int32_t testSentRevOffsetsEs[]     = { 52,  0, -1 };   /* Without suppressions */

/* Capacity, in UChars, of the buffer the test unescapes each text into. */
enum { kTextULenMax = 128 };

/* One test case: a locale ID, the text to segment, and the expected offsets. */
typedef struct {
    const char * locale;            /* locale ID passed to ubrk_open; NULL marks the end of the table */
    const char * text;              /* invariant-character text, may contain \\u escapes */
    const int32_t * expFwdOffsets;  /* expected ubrk_next / ubrk_following results, -1 terminated */
    const int32_t * expRevOffsets;  /* expected ubrk_previous / ubrk_preceding results, -1 terminated */
} TestBISuppressionsItem;

static const TestBISuppressionsItem testBISuppressionsItems[] = {
    { "en@ss=standard",      testSentenceSuppressionsEn, testSentSuppFwdOffsetsEn, testSentSuppRevOffsetsEn },
    { "en",                  testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     },
    { "fr@ss=standard",      testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     },
    { "af@ss=standard",      testSentenceSuppressionsEn, testSentSuppFwdOffsetsEn, testSentSuppRevOffsetsEn }, /* no brkiter data => en suppressions? */
    { "zh@ss=standard",      testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     }, /* brkiter data, no suppressions data => no suppressions */
    { "zh_Hant@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     }, /* brkiter data, no suppressions data => no suppressions */
    { "fi@ss=standard",      testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     }, /* brkiter data, no suppressions data => no suppressions */
    { "ja@ss=standard",      testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     }, /* brkiter data, no suppressions data => no suppressions */
    { "de@ss=standard",      testSentenceSuppressionsDe, testSentSuppFwdOffsetsDe, testSentSuppRevOffsetsDe },
    { "de",                  testSentenceSuppressionsDe, testSentFwdOffsetsDe,     testSentRevOffsetsDe     },
    { "es@ss=standard",      testSentenceSuppressionsEs, testSentSuppFwdOffsetsEs, testSentSuppRevOffsetsEs },
    { "es",                  testSentenceSuppressionsEs, testSentFwdOffsetsEs,     testSentRevOffsetsEs     },
    { NULL, NULL, NULL, NULL } /* terminator: one initializer per field */
};
|
||||
|
||||
static void TestBreakIteratorSuppressions(void) {
|
||||
const TestBISuppressionsItem * itemPtr;
|
||||
|
||||
for (itemPtr = testBISuppressionsItems; itemPtr->locale != NULL; itemPtr++) {
|
||||
UChar textU[kTextULenMax];
|
||||
int32_t textULen = u_unescape(itemPtr->text, textU, kTextULenMax);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UBreakIterator *bi = ubrk_open(UBRK_SENTENCE, itemPtr->locale, textU, textULen, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
int32_t offset, start;
|
||||
const int32_t * expOffsetPtr;
|
||||
|
||||
expOffsetPtr = itemPtr->expFwdOffsets;
|
||||
ubrk_first(bi);
|
||||
for (; (offset = ubrk_next(bi)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
|
||||
if (offset != *expOffsetPtr) {
|
||||
log_err("FAIL: ubrk_next loc \"%s\", expected %d, got %d\n", itemPtr->locale, *expOffsetPtr, offset);
|
||||
}
|
||||
}
|
||||
if (offset != UBRK_DONE || *expOffsetPtr >= 0) {
|
||||
log_err("FAIL: ubrk_next loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", itemPtr->locale, offset, *expOffsetPtr);
|
||||
}
|
||||
|
||||
expOffsetPtr = itemPtr->expFwdOffsets;
|
||||
start = ubrk_first(bi) + 1;
|
||||
for (; (offset = ubrk_following(bi, start)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
|
||||
if (offset != *expOffsetPtr) {
|
||||
log_err("FAIL: ubrk_following(%d) loc \"%s\", expected %d, got %d\n", start, itemPtr->locale, *expOffsetPtr, offset);
|
||||
}
|
||||
start = *expOffsetPtr + 1;
|
||||
}
|
||||
if (offset != UBRK_DONE || *expOffsetPtr >= 0) {
|
||||
log_err("FAIL: ubrk_following(%d) loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", start, itemPtr->locale, offset, *expOffsetPtr);
|
||||
}
|
||||
|
||||
expOffsetPtr = itemPtr->expRevOffsets;
|
||||
ubrk_last(bi);
|
||||
for (; (offset = ubrk_previous(bi)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
|
||||
if (offset != *expOffsetPtr) {
|
||||
log_err("FAIL: ubrk_previous loc \"%s\", expected %d, got %d\n", itemPtr->locale, *expOffsetPtr, offset);
|
||||
}
|
||||
}
|
||||
if (offset == UBRK_DONE && expOffsetPtr == itemPtr->expRevOffsets &&
|
||||
log_knownIssue("11786", "Filtered break iterator issues at beginning/end of text")) {
|
||||
// skip this test for problem cases until the fix for #11786 is complete
|
||||
} else
|
||||
if (offset != UBRK_DONE || *expOffsetPtr >= 0) {
|
||||
log_err("FAIL: ubrk_previous loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", itemPtr->locale, offset, *expOffsetPtr);
|
||||
}
|
||||
|
||||
expOffsetPtr = itemPtr->expRevOffsets;
|
||||
start = ubrk_last(bi) - 1;
|
||||
for (; (offset = ubrk_preceding(bi, start)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
|
||||
if (offset != *expOffsetPtr) {
|
||||
log_err("FAIL: ubrk_preceding(%d) loc \"%s\", expected %d, got %d\n", start, itemPtr->locale, *expOffsetPtr, offset);
|
||||
}
|
||||
start = *expOffsetPtr - 1;
|
||||
}
|
||||
if (start >=0 && (offset != UBRK_DONE || *expOffsetPtr >= 0)) {
|
||||
log_err("FAIL: ubrk_preceding loc(%d) \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", start, itemPtr->locale, offset, *expOffsetPtr);
|
||||
}
|
||||
|
||||
ubrk_close(bi);
|
||||
} else {
|
||||
log_data_err("FAIL: ubrk_open(UBRK_SENTENCE, \"%s\", ...) status %s (Are you missing data?)\n", itemPtr->locale, u_errorName(status));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
|
Loading…
Add table
Reference in a new issue