ICU-9198 add titlecasing options: wholeString, sentences, adjustToCased

X-SVN-Rev: 40164
This commit is contained in:
Markus Scherer 2017-06-09 23:04:03 +00:00
parent 06a03303cb
commit cfef2fb339
19 changed files with 831 additions and 243 deletions

View file

@ -69,10 +69,16 @@ enum {
/**
* Bit mask for getting just the options from a string compare options word
* that are relevant for case folding (of a single string or code point).
*
* Currently only bit 0 for U_FOLD_CASE_EXCLUDE_SPECIAL_I.
* It is conceivable that at some point we might use one more bit for using uppercase sharp s.
* It is conceivable that at some point we might want the option to use only simple case foldings
* when operating on strings.
*
* See stringoptions.h.
* @internal
*/
#define _FOLD_CASE_OPTIONS_MASK 0xff
#define _FOLD_CASE_OPTIONS_MASK 7
/* single-code point functions */

View file

@ -381,7 +381,7 @@ ucasemap_internalUTF8ToTitle(
const uint8_t *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
return 0;
}
@ -408,45 +408,38 @@ ucasemap_internalUTF8ToTitle(
}
/*
* Unicode 4 & 5 section 3.13 Default Case Operations:
*
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
* cased character F. If F exists, map F to default_title(F); then map each
* subsequent character C to default_lower(C).
*
* In this implementation, segment [prev..index[ into 3 parts:
* a) uncased characters (copy as-is) [prev..titleStart[
* b) first case letter (titlecase) [titleStart..titleLimit[
* Segment [prev..index[ into 3 parts:
* a) skipped characters (copy as-is) [prev..titleStart[
* b) first letter (titlecase) [titleStart..titleLimit[
* c) subsequent characters (lowercase) [titleLimit..index[
*/
if(prev<index) {
/* find and copy uncased characters [prev..titleStart[ */
/* find and copy skipped characters [prev..titleStart[ */
int32_t titleStart=prev;
int32_t titleLimit=prev;
UChar32 c;
U8_NEXT(src, titleLimit, index, c);
if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
/* Adjust the titlecasing index (titleStart) to the next cased character. */
for(;;) {
if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
// Adjust the titlecasing index to the next cased character,
// or to the next letter/number/symbol/private use.
// Stop with titleStart<titleLimit<=index
// if there is a character to be titlecased,
// or else stop with titleStart==titleLimit==index.
UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
titleStart=titleLimit;
if(titleLimit==index) {
/*
* only uncased characters in [prev..index[
* stop with titleStart==titleLimit==index
*/
break;
}
U8_NEXT(src, titleLimit, index, c);
if(UCASE_NONE!=ucase_getType(c)) {
break; /* cased letter at [titleStart..titleLimit[ */
}
}
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+prev, titleStart-prev, options, edits);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
if (prev < titleStart) {
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+prev, titleStart-prev, options, edits);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
}
}

View file

@ -9,8 +9,27 @@
#include "unicode/utypes.h"
#include "unicode/ucasemap.h"
#include "unicode/uchar.h"
#include "ucase.h"
/**
* Bit mask for the titlecasing iterator options bit field.
* Currently only 3 out of 8 values are used:
* 0 (words), U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
* See stringoptions.h.
* @internal
*/
#define U_TITLECASE_ITERATOR_MASK 0xe0
/**
* Bit mask for the titlecasing index adjustment options bit set.
* Currently two bits are defined:
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED.
* See stringoptions.h.
* @internal
*/
#define U_TITLECASE_ADJUSTMENT_MASK 0x600
/**
* Internal API, used by u_strcasecmp() etc.
* Compare strings case-insensitively,
@ -23,7 +42,7 @@ u_strcmpFold(const UChar *s1, int32_t length1,
UErrorCode *pErrorCode);
/**
* Interanl API, used for detecting length of
* Internal API, used for detecting length of
* shared prefix case-insensitively.
* @param s1 input string 1
* @param length1 length of string 1, or -1 (NULL terminated)
@ -52,6 +71,40 @@ uprv_haveProperties(UErrorCode *pErrorCode);
#ifdef __cplusplus
U_NAMESPACE_BEGIN
/**
 * Validates the titlecasing index adjustment bits of an options word.
 *
 * @param options   Options bit set; only the bits in U_TITLECASE_ADJUSTMENT_MASK are inspected.
 * @param errorCode In/out error code. If it already indicates a failure it is left unchanged.
 *                  Set to U_ILLEGAL_ARGUMENT_ERROR if all adjustment bits are set together.
 * @return TRUE if errorCode was a success code and the adjustment options do not conflict,
 *         otherwise FALSE.
 */
inline UBool ustrcase_checkTitleAdjustmentOptions(uint32_t options, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return FALSE;
    }
    const uint32_t adjustmentBits = options & U_TITLECASE_ADJUSTMENT_MASK;
    if (adjustmentBits != U_TITLECASE_ADJUSTMENT_MASK) {
        return TRUE;
    }
    // Every adjustment bit is set: the mutually exclusive options were combined.
    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    return FALSE;
}
/**
 * Tests whether a code point can be the target of the default titlecasing index adjustment.
 *
 * Accepts letters, numbers, symbols, and private use code points
 * (private use because those are typically used as letters or numbers).
 * A modifier letter is accepted only if it is cased.
 *
 * @param c The code point to test.
 * @return TRUE if c is in one of the accepted categories.
 */
inline UBool ustrcase_isLNS(UChar32 c) {
    const int32_t category = u_charType(c);
    if (category == U_MODIFIER_LETTER) {
        // Modifier letters are excluded from the category mask below;
        // they count only when they have a case type.
        return ucase_getType(c) != UCASE_NONE;
    }
    const uint32_t acceptedMask =
        (U_GC_L_MASK|U_GC_N_MASK|U_GC_S_MASK|U_GC_CO_MASK) & ~U_GC_LM_MASK;
    return (U_MASK(category) & acceptedMask) != 0;
}
#if !UCONFIG_NO_BREAK_ITERATION
/** Returns nullptr if error. Pass in either locale or locID, not both. */
U_CFUNC
BreakIterator *ustrcase_getTitleBreakIterator(
const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode);
#endif
U_NAMESPACE_END
#include "unicode/unistr.h" // for UStringCaseMapper
/*

View file

@ -42,11 +42,8 @@ int32_t CaseMap::utf8ToTitle(
UText utext=UTEXT_INITIALIZER;
utext_openUTF8(&utext, src, srcLength, &errorCode);
LocalPointer<BreakIterator> ownedIter;
iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, ownedIter, errorCode);
if(iter==NULL) {
iter=BreakIterator::createWordInstance(Locale(locale), errorCode);
ownedIter.adoptInstead(iter);
}
if(U_FAILURE(errorCode)) {
utext_close(&utext);
return 0;
}
@ -88,12 +85,19 @@ ucasemap_utf8ToTitle(UCaseMap *csm,
}
UText utext=UTEXT_INITIALIZER;
utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode);
if(csm->iter==NULL) {
csm->iter=BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode);
}
if (U_FAILURE(*pErrorCode)) {
return 0;
}
if(csm->iter==NULL) {
LocalPointer<BreakIterator> ownedIter;
BreakIterator *iter = ustrcase_getTitleBreakIterator(
nullptr, csm->locale, csm->options, nullptr, ownedIter, *pErrorCode);
if (iter == nullptr) {
utext_close(&utext);
return 0;
}
csm->iter = ownedIter.orphan();
}
csm->iter->setText(&utext, *pErrorCode);
int32_t length=ucasemap_mapUTF8(
csm->caseLocale, csm->options, csm->iter,

View file

@ -113,7 +113,9 @@ public:
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
* U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT.
* U_TITLECASE_NO_LOWERCASE,
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
* @param iter A break iterator to find the first characters of words that are to be titlecased.
* It is set to the source string (setText())
* and used one or more times for iteration (first() and next()).
@ -272,9 +274,11 @@ public:
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
* U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT.
* U_TITLECASE_NO_LOWERCASE,
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
* @param iter A break iterator to find the first characters of words that are to be titlecased.
* It is set to the source string (setText())
* It is set to the source string (setUText())
* and used one or more times for iteration (first() and next()).
* If NULL, then a word break iterator for the locale is used
* (or something equivalent).

View file

@ -39,49 +39,101 @@
*/
#define U_FOLD_CASE_EXCLUDE_SPECIAL_I 1
#ifndef U_HIDE_DRAFT_API
/**
* Titlecase the string as a whole rather than each word.
* (Titlecase only the character at index 0, possibly adjusted.)
* Option bits value for titlecasing APIs that take an options bit set.
*
* It is an error to specify multiple titlecasing iterator options together,
* including both an options bit and an explicit BreakIterator.
*
* @see U_TITLECASE_ADJUST_TO_CASED
* @draft ICU 60
*/
#define U_TITLECASE_WHOLE_STRING 0x20
/**
* Titlecase sentences rather than words.
* (Titlecase only the first character of each sentence, possibly adjusted.)
* Option bits value for titlecasing APIs that take an options bit set.
*
* It is an error to specify multiple titlecasing iterator options together,
* including both an options bit and an explicit BreakIterator.
*
* @see U_TITLECASE_ADJUST_TO_CASED
* @draft ICU 60
*/
#define U_TITLECASE_SENTENCES 0x40
#endif // U_HIDE_DRAFT_API
/**
* Do not lowercase non-initial parts of words when titlecasing.
* Option bit for titlecasing APIs that take an options bit set.
*
* By default, titlecasing will titlecase the first cased character
* of a word and lowercase all other characters.
* By default, titlecasing will titlecase the character at each
* (possibly adjusted) BreakIterator index and
* lowercase all other characters up to the next iterator index.
* With this option, the other characters will not be modified.
*
* @see U_TITLECASE_ADJUST_TO_CASED
* @see UnicodeString::toTitle
* @see CaseMap::toTitle
* @see ucasemap_setOptions
* @see ucasemap_toTitle
* @see ucasemap_utf8ToTitle
* @see UnicodeString::toTitle
* @stable ICU 3.8
*/
#define U_TITLECASE_NO_LOWERCASE 0x100
/**
* Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
* Do not adjust the titlecasing BreakIterator indexes;
* titlecase exactly the characters at breaks from the iterator.
* Option bit for titlecasing APIs that take an options bit set.
*
* By default, titlecasing will take each break iterator index,
* adjust it by looking for the next cased character, and titlecase that one.
* adjust it to the next relevant character (see U_TITLECASE_ADJUST_TO_CASED),
* and titlecase that one.
*
* Other characters are lowercased.
*
* This follows Unicode 4 & 5 section 3.13 Default Case Operations:
*
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
* cased character F. If F exists, map F to default_title(F); then map each
* subsequent character C to default_lower(C).
* It is an error to specify multiple titlecasing adjustment options together.
*
* @see U_TITLECASE_ADJUST_TO_CASED
* @see U_TITLECASE_NO_LOWERCASE
* @see UnicodeString::toTitle
* @see CaseMap::toTitle
* @see ucasemap_setOptions
* @see ucasemap_toTitle
* @see ucasemap_utf8ToTitle
* @see UnicodeString::toTitle
* @see U_TITLECASE_NO_LOWERCASE
* @stable ICU 3.8
*/
#define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
#ifndef U_HIDE_DRAFT_API
/**
* Adjust each titlecasing BreakIterator index to the next cased character.
* (See the Unicode Standard, chapter 3, Default Case Conversion, R3 toTitlecase(X).)
* Option bit for titlecasing APIs that take an options bit set.
*
* This used to be the default index adjustment in ICU.
* Since ICU 60, the default index adjustment is to the next character that is
* a letter, number, symbol, or private use code point.
* (Uncased modifier letters are skipped.)
* The difference in behavior is small for word titlecasing,
* but the new adjustment is much better for whole-string and sentence titlecasing:
* It yields "49ers" and "«丰(abc)»" instead of "49Ers" and "«丰(Abc)»".
*
* It is an error to specify multiple titlecasing adjustment options together.
*
* @see U_TITLECASE_NO_BREAK_ADJUSTMENT
* @draft ICU 60
*/
#define U_TITLECASE_ADJUST_TO_CASED 0x400
/**
* Omit unchanged text when recording how source substrings
* relate to changed and unchanged result substrings.
@ -126,7 +178,9 @@
//
// Internal: (may change or be removed)
// ucase.h #define _STRCASECMP_OPTIONS_MASK 0xffff
// ucase.h #define _FOLD_CASE_OPTIONS_MASK 0xff
// ucase.h #define _FOLD_CASE_OPTIONS_MASK 7
// ucasemap_imp.h #define U_TITLECASE_ITERATOR_MASK 0xe0
// ucasemap_imp.h #define U_TITLECASE_ADJUSTMENT_MASK 0x600
// ustr_imp.h #define _STRNCMP_STYLE 0x1000
// unormcmp.cpp #define _COMPARE_EQUIV 0x80000

View file

@ -202,7 +202,7 @@ ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode
* The standard titlecase iterator for the root locale implements the
* algorithm of Unicode TR 21.
*
* This function uses only the setUText(), first(), next() and close() methods of the
* This function uses only the setText(), first() and next() methods of the
* provided break iterator.
*
* The result may be longer or shorter than the original.

View file

@ -2775,11 +2775,11 @@ public:
* break iterator is opened.
* Otherwise the provided iterator is set to the string's text.
* @param locale The locale to consider.
* @param options Options bit set, usually 0. See U_TITLECASE_NO_LOWERCASE,
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
* @param options Options bit set, see ucasemap_open().
* @return A reference to this.
* @see U_TITLECASE_NO_LOWERCASE
* @see U_TITLECASE_NO_BREAK_ADJUSTMENT
* @see ucasemap_open
* @stable ICU 3.8
*/
UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options);

View file

@ -30,31 +30,26 @@
U_NAMESPACE_BEGIN
UnicodeString &
UnicodeString::toTitle(BreakIterator *titleIter) {
return toTitle(titleIter, Locale::getDefault(), 0);
UnicodeString::toTitle(BreakIterator *iter) {
return toTitle(iter, Locale::getDefault(), 0);
}
UnicodeString &
UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale) {
return toTitle(titleIter, locale, 0);
UnicodeString::toTitle(BreakIterator *iter, const Locale &locale) {
return toTitle(iter, locale, 0);
}
UnicodeString &
UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options) {
BreakIterator *bi=titleIter;
if(bi==NULL) {
UErrorCode errorCode=U_ZERO_ERROR;
bi=BreakIterator::createWordInstance(locale, errorCode);
if(U_FAILURE(errorCode)) {
setToBogus();
return *this;
UnicodeString::toTitle(BreakIterator *iter, const Locale &locale, uint32_t options) {
LocalPointer<BreakIterator> ownedIter;
UErrorCode errorCode = U_ZERO_ERROR;
iter = ustrcase_getTitleBreakIterator(&locale, "", options, iter, ownedIter, errorCode);
if (iter == nullptr) {
setToBogus();
return *this;
}
}
caseMap(ustrcase_getCaseLocale(locale.getBaseName()), options, bi, ustrcase_internalToTitle);
if(titleIter==NULL) {
delete bi;
}
return *this;
caseMap(ustrcase_getCaseLocale(locale.getBaseName()), options, iter, ustrcase_internalToTitle);
return *this;
}
U_NAMESPACE_END

View file

@ -23,46 +23,153 @@
#include "unicode/brkiter.h"
#include "unicode/casemap.h"
#include "unicode/chariter.h"
#include "unicode/localpointer.h"
#include "unicode/ubrk.h"
#include "unicode/ucasemap.h"
#include "unicode/utext.h"
#include "cmemory.h"
#include "uassert.h"
#include "ucase.h"
#include "ucasemap_imp.h"
U_NAMESPACE_USE
U_NAMESPACE_BEGIN
/* functions available in the common library (for unistr_case.cpp) */
/**
* Whole-string BreakIterator.
* Treats the entire text as a single segment: first() yields 0 and next() yields
* the text length recorded by the most recent setText() call.
* Titlecasing only calls setText(), first(), and next().
* We implement the rest only to satisfy the abstract interface.
*/
class WholeStringBreakIterator : public BreakIterator {
public:
WholeStringBreakIterator() : BreakIterator(), length(0) {}
~WholeStringBreakIterator() override;
UBool operator==(const BreakIterator&) const override;
BreakIterator *clone() const override;
static UClassID U_EXPORT2 getStaticClassID();
UClassID getDynamicClassID() const override;
CharacterIterator &getText() const override;
UText *getUText(UText *fillIn, UErrorCode &errorCode) const override;
void setText(const UnicodeString &text) override;
void setText(UText *text, UErrorCode &errorCode) override;
void adoptText(CharacterIterator* it) override;
int32_t first() override;
int32_t last() override;
int32_t previous() override;
int32_t next() override;
int32_t current() const override;
int32_t following(int32_t offset) override;
int32_t preceding(int32_t offset) override;
UBool isBoundary(int32_t offset) override;
int32_t next(int32_t n) override;
BreakIterator *createBufferClone(void *stackBuffer, int32_t &BufferSize,
UErrorCode &errorCode) override;
BreakIterator &refreshInputText(UText *input, UErrorCode &errorCode) override;
/* The text itself is not retained; only its length is stored. */
private:
int32_t length;
};
U_CAPI int32_t U_EXPORT2
u_strToTitle(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBreakIterator *titleIter,
const char *locale,
UErrorCode *pErrorCode) {
LocalPointer<BreakIterator> ownedIter;
BreakIterator *iter;
if(titleIter!=NULL) {
iter=reinterpret_cast<BreakIterator *>(titleIter);
} else {
iter=BreakIterator::createWordInstance(Locale(locale), *pErrorCode);
ownedIter.adoptInstead(iter);
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(WholeStringBreakIterator)
WholeStringBreakIterator::~WholeStringBreakIterator() {}
UBool WholeStringBreakIterator::operator==(const BreakIterator&) const { return FALSE; }
BreakIterator *WholeStringBreakIterator::clone() const { return nullptr; }
// Placeholder required by the abstract interface; titlecasing never calls it.
// This iterator does not keep a CharacterIterator, so there is no object to return.
// NOTE(review): the return statement dereferences a null pointer, which is undefined
// behavior in C++. Debug builds are presumably stopped by U_ASSERT first (confirm);
// consider aborting or returning a real dummy iterator instead.
CharacterIterator &WholeStringBreakIterator::getText() const {
U_ASSERT(FALSE); // really should not be called
// Returns a null reference.
// Otherwise we would have to define a dummy CharacterIterator,
// and either have it as a field and const_cast it to a non-const reference,
// or have it via a pointer and return a reference to that.
CharacterIterator *none = nullptr;
return *none;
}
UText *WholeStringBreakIterator::getUText(UText * /*fillIn*/, UErrorCode &errorCode) const {
if (U_SUCCESS(errorCode)) {
errorCode = U_UNSUPPORTED_ERROR;
}
if(U_FAILURE(*pErrorCode)) {
return 0;
}
UnicodeString s(srcLength<0, src, srcLength);
iter->setText(s);
return ustrcase_mapWithOverlap(
ustrcase_getCaseLocale(locale), 0, iter,
dest, destCapacity,
src, srcLength,
ustrcase_internalToTitle, *pErrorCode);
return nullptr;
}
U_NAMESPACE_BEGIN
// Records the length of the given string as the single break position returned by next().
// The string contents are not stored.
void WholeStringBreakIterator::setText(const UnicodeString &text) {
length = text.length();
}
// Records the native length of the UText as the text length.
// Does nothing if errorCode already indicates a failure.
// Sets U_INDEX_OUTOFBOUNDS_ERROR, and leaves the stored length unchanged,
// if the native length does not fit into an int32_t.
void WholeStringBreakIterator::setText(UText *text, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    const int64_t nativeLength = utext_nativeLength(text);
    if (nativeLength > INT32_MAX) {
        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return;
    }
    length = (int32_t)nativeLength;
}
// Not expected to be called by titlecasing (asserted below).
// If it is called anyway, records the iterator's length and deletes the iterator,
// since only the length is kept by this class.
void WholeStringBreakIterator::adoptText(CharacterIterator* it) {
U_ASSERT(FALSE); // should not be called
length = it->getLength();
delete it;
}
// Navigation methods. The iterator keeps no current position:
// every "backward" query answers 0 and every "forward" query answers the text length,
// regardless of the offset or count passed in.
int32_t WholeStringBreakIterator::first() { return 0; }
int32_t WholeStringBreakIterator::last() { return length; }
int32_t WholeStringBreakIterator::previous() { return 0; }
int32_t WholeStringBreakIterator::next() { return length; }
int32_t WholeStringBreakIterator::current() const { return 0; }
int32_t WholeStringBreakIterator::following(int32_t /*offset*/) { return length; }
int32_t WholeStringBreakIterator::preceding(int32_t /*offset*/) { return 0; }
// Always answers FALSE, for any offset.
UBool WholeStringBreakIterator::isBoundary(int32_t /*offset*/) { return FALSE; }
int32_t WholeStringBreakIterator::next(int32_t /*n*/) { return length; }
// Cloning into a caller buffer is not supported: always returns nullptr.
// Reports U_UNSUPPORTED_ERROR unless errorCode already holds an earlier failure.
BreakIterator *WholeStringBreakIterator::createBufferClone(
        void * /*stackBuffer*/, int32_t & /*BufferSize*/, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;  // keep the earlier failure code
    }
    errorCode = U_UNSUPPORTED_ERROR;
    return nullptr;
}
// Refreshing the input text is not supported: the iterator is returned unchanged.
// Reports U_UNSUPPORTED_ERROR unless errorCode already holds an earlier failure.
BreakIterator &WholeStringBreakIterator::refreshInputText(
        UText * /*input*/, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return *this;  // keep the earlier failure code
    }
    errorCode = U_UNSUPPORTED_ERROR;
    return *this;
}
/**
 * Selects the BreakIterator to use for titlecasing.
 *
 * If the caller supplies an iterator, it is returned as is, provided that no
 * titlecasing iterator option bit is set. Otherwise a new iterator is created according
 * to the iterator option (words, whole string, or sentences), handed to ownedIter,
 * and returned.
 *
 * @param locale    Locale for a newly created iterator, or nullptr to use locID instead.
 * @param locID     Locale ID used when locale is nullptr.
 * @param options   Options bit set; only the U_TITLECASE_ITERATOR_MASK bits are inspected.
 * @param iter      Caller-supplied iterator, or nullptr.
 * @param ownedIter Receives ownership of a newly created iterator; not touched when the
 *                  caller-supplied iterator is used.
 * @param errorCode In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR for an iterator option
 *                  combined with an explicit iterator or for an unknown iterator option value,
 *                  and to U_MEMORY_ALLOCATION_ERROR if the whole-string iterator
 *                  cannot be allocated.
 * @return The iterator to use, or nullptr if there was an error.
 */
U_CFUNC
BreakIterator *ustrcase_getTitleBreakIterator(
        const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
        LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const uint32_t iterOption = options & U_TITLECASE_ITERATOR_MASK;

    if (iter != nullptr) {
        // An explicit iterator must not be combined with an iterator option.
        if (iterOption != 0) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return nullptr;
        }
        return iter;
    }

    BreakIterator *created = nullptr;
    if (iterOption == 0) {
        created = BreakIterator::createWordInstance(
            locale != nullptr ? *locale : Locale(locID), errorCode);
    } else if (iterOption == U_TITLECASE_WHOLE_STRING) {
        created = new WholeStringBreakIterator();
        if (created == nullptr) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
        }
    } else if (iterOption == U_TITLECASE_SENTENCES) {
        created = BreakIterator::createSentenceInstance(
            locale != nullptr ? *locale : Locale(locID), errorCode);
    } else {
        // Several iterator option bits together, or a value that is not defined.
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    }
    ownedIter.adoptInstead(created);
    return created;
}
int32_t CaseMap::toTitle(
const char *locale, uint32_t options, BreakIterator *iter,
@ -70,11 +177,8 @@ int32_t CaseMap::toTitle(
UChar *dest, int32_t destCapacity, Edits *edits,
UErrorCode &errorCode) {
LocalPointer<BreakIterator> ownedIter;
iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, ownedIter, errorCode);
if(iter==NULL) {
iter=BreakIterator::createWordInstance(Locale(locale), errorCode);
ownedIter.adoptInstead(iter);
}
if(U_FAILURE(errorCode)) {
return 0;
}
UnicodeString s(srcLength<0, src, srcLength);
@ -88,6 +192,30 @@ int32_t CaseMap::toTitle(
U_NAMESPACE_END
U_NAMESPACE_USE
// C API entry point for titlecasing a UTF-16 string.
// Uses the caller's break iterator if one is given, otherwise obtains one from
// ustrcase_getTitleBreakIterator() with no option bits set; an iterator created here
// is released when ownedIter goes out of scope.
// Returns 0 if no iterator could be obtained; otherwise returns the result of
// ustrcase_mapWithOverlap().
U_CAPI int32_t U_EXPORT2
u_strToTitle(UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             UBreakIterator *titleIter,
             const char *locale,
             UErrorCode *pErrorCode) {
    LocalPointer<BreakIterator> ownedIter;
    BreakIterator *callerIter = reinterpret_cast<BreakIterator *>(titleIter);
    BreakIterator *iter = ustrcase_getTitleBreakIterator(
        nullptr, locale, 0, callerIter, ownedIter, *pErrorCode);
    if (iter == nullptr) {
        return 0;
    }

    // Point the iterator at the source text before mapping.
    UnicodeString s(srcLength<0, src, srcLength);
    iter->setText(s);

    const int32_t caseLocale = ustrcase_getCaseLocale(locale);
    return ustrcase_mapWithOverlap(
        caseLocale, 0, iter,
        dest, destCapacity,
        src, srcLength,
        ustrcase_internalToTitle, *pErrorCode);
}
U_CAPI int32_t U_EXPORT2
ucasemap_toTitle(UCaseMap *csm,
UChar *dest, int32_t destCapacity,
@ -97,10 +225,13 @@ ucasemap_toTitle(UCaseMap *csm,
return 0;
}
if (csm->iter == NULL) {
csm->iter = BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode);
}
if (U_FAILURE(*pErrorCode)) {
return 0;
LocalPointer<BreakIterator> ownedIter;
BreakIterator *iter = ustrcase_getTitleBreakIterator(
nullptr, csm->locale, csm->options, nullptr, ownedIter, *pErrorCode);
if (iter == nullptr) {
return 0;
}
csm->iter = ownedIter.orphan();
}
UnicodeString s(srcLength<0, src, srcLength);
csm->iter->setText(s);

View file

@ -237,7 +237,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
return 0;
}
@ -264,45 +264,38 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
}
/*
* Unicode 4 & 5 section 3.13 Default Case Operations:
*
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
* cased character F. If F exists, map F to default_title(F); then map each
* subsequent character C to default_lower(C).
*
* In this implementation, segment [prev..index[ into 3 parts:
* a) uncased characters (copy as-is) [prev..titleStart[
* b) first case letter (titlecase) [titleStart..titleLimit[
* Segment [prev..index[ into 3 parts:
* a) skipped characters (copy as-is) [prev..titleStart[
* b) first letter (titlecase) [titleStart..titleLimit[
* c) subsequent characters (lowercase) [titleLimit..index[
*/
if(prev<index) {
/* find and copy uncased characters [prev..titleStart[ */
// Find and copy skipped characters [prev..titleStart[
int32_t titleStart=prev;
int32_t titleLimit=prev;
UChar32 c;
U16_NEXT(src, titleLimit, index, c);
if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
/* Adjust the titlecasing index (titleStart) to the next cased character. */
for(;;) {
if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
// Adjust the titlecasing index to the next cased character,
// or to the next letter/number/symbol/private use.
// Stop with titleStart<titleLimit<=index
// if there is a character to be titlecased,
// or else stop with titleStart==titleLimit==index.
UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
titleStart=titleLimit;
if(titleLimit==index) {
/*
* only uncased characters in [prev..index[
* stop with titleStart==titleLimit==index
*/
break;
}
U16_NEXT(src, titleLimit, index, c);
if(UCASE_NONE!=ucase_getType(c)) {
break; /* cased letter at [titleStart..titleLimit[ */
}
}
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+prev, titleStart-prev, options, edits);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
if (prev < titleStart) {
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+prev, titleStart-prev, options, edits);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
}
}

View file

@ -19,6 +19,7 @@
*/
#include "unicode/std_string.h"
#include "unicode/brkiter.h"
#include "unicode/casemap.h"
#include "unicode/edits.h"
#include "unicode/uchar.h"
@ -49,6 +50,7 @@ public:
int32_t whichCase,
void *iter, const char *localeID, uint32_t options);
void TestCasing();
void TestTitleOptions();
void TestFullCaseFoldingIterator();
void TestGreekUpper();
void TestLongUpper();
@ -84,6 +86,7 @@ StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
TESTCASE_AUTO(TestCaseConversion);
#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION
TESTCASE_AUTO(TestCasing);
TESTCASE_AUTO(TestTitleOptions);
#endif
TESTCASE_AUTO(TestFullCaseFoldingIterator);
TESTCASE_AUTO(TestGreekUpper);
@ -593,6 +596,59 @@ StringCaseTest::TestCasing() {
#endif
}
// Exercises the titlecasing options added in ICU 60:
// the iterator options (whole string, sentences), the index adjustment options,
// and the error reporting for conflicting option combinations.
void
StringCaseTest::TestTitleOptions() {
// New options in ICU 60.
// Each call passes what appear to be an input string and the expected result,
// with no explicit break iterator (nullptr), a locale ID, and the options under test.
TestCasingImpl(u"ʻcAt! ʻeTc.", u"ʻCat! ʻetc.", TEST_TITLE,
nullptr, "", U_TITLECASE_WHOLE_STRING);
TestCasingImpl(u"a ʻCaT. A ʻdOg! ʻeTc.", u"A ʻCaT. A ʻdOg! ʻETc.", TEST_TITLE,
nullptr, "", U_TITLECASE_SENTENCES|U_TITLECASE_NO_LOWERCASE);
// Default adjustment (to letter/number/symbol) versus adjustment to the next cased character.
TestCasingImpl(u"49eRs", u"49ers", TEST_TITLE,
nullptr, "", U_TITLECASE_WHOLE_STRING);
TestCasingImpl(u"«丰(aBc)»", u"«丰(abc)»", TEST_TITLE,
nullptr, "", U_TITLECASE_WHOLE_STRING);
TestCasingImpl(u"49eRs", u"49Ers", TEST_TITLE,
nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_ADJUST_TO_CASED);
TestCasingImpl(u"«丰(aBc)»", u"«丰(Abc)»", TEST_TITLE,
nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_ADJUST_TO_CASED);
// Whole-string titlecasing combined with the older no-lowercase / no-adjustment options.
TestCasingImpl(u" john. Smith", u" John. Smith", TEST_TITLE,
nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_NO_LOWERCASE);
TestCasingImpl(u" john. Smith", u" john. smith", TEST_TITLE,
nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_NO_BREAK_ADJUSTMENT);
// Locale-sensitive titlecasing (Dutch and Turkish locale IDs) after index adjustment.
TestCasingImpl(u"«ijs»", u"«IJs»", TEST_TITLE,
nullptr, "nl-BE", U_TITLECASE_WHOLE_STRING);
TestCasingImpl(u"«ijs»", u"«İjs»", TEST_TITLE,
nullptr, "tr-DE", U_TITLECASE_WHOLE_STRING);
// Test conflicting settings.
// If & when we add more options, then the ORed combinations may become
// indistinguishable from valid values.
IcuTestErrorCode errorCode(*this, "TestTitleOptions");
// Both index adjustment options together must be rejected.
CaseMap::toTitle("", U_TITLECASE_NO_BREAK_ADJUSTMENT|U_TITLECASE_ADJUST_TO_CASED, nullptr,
u"", 0, nullptr, 0, nullptr, errorCode);
if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) {
errln("CaseMap::toTitle(multiple adjustment options) -> %s not illegal argument",
errorCode.errorName());
}
errorCode.reset();
// Both iterator options together must be rejected.
CaseMap::toTitle("", U_TITLECASE_WHOLE_STRING|U_TITLECASE_SENTENCES, nullptr,
u"", 0, nullptr, 0, nullptr, errorCode);
if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) {
errln("CaseMap::toTitle(multiple iterator options) -> %s not illegal argument",
errorCode.errorName());
}
errorCode.reset();
// An iterator option together with an explicit iterator must be rejected.
LocalPointer<BreakIterator> iter(
BreakIterator::createCharacterInstance(Locale::getRoot(), errorCode));
CaseMap::toTitle("", U_TITLECASE_WHOLE_STRING, iter.getAlias(),
u"", 0, nullptr, 0, nullptr, errorCode);
if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) {
errln("CaseMap::toTitle(iterator option + iterator) -> %s not illegal argument",
errorCode.errorName());
}
// Clear the expected error so that it is not reported when errorCode goes out of scope
// (presumably IcuTestErrorCode reports unhandled failures — confirm).
errorCode.reset();
}
void
StringCaseTest::TestFullCaseFoldingIterator() {
UnicodeString ffi=UNICODE_STRING_SIMPLE("ffi");

View file

@ -3,11 +3,15 @@
package com.ibm.icu.impl;
import java.io.IOException;
import java.text.CharacterIterator;
import java.util.Locale;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.Edits;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.ULocale;
public final class CaseMapImpl {
/**
@ -134,11 +138,192 @@ public final class CaseMapImpl {
protected int dir; // 0=initial state >0=forward <0=backward
}
public static final int TITLECASE_WHOLE_STRING = 0x20;
public static final int TITLECASE_SENTENCES = 0x40;
/**
* Bit mask for the titlecasing iterator options bit field.
* Currently only 3 out of 8 values are used:
* 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES.
* See stringoptions.h.
* @internal
*/
private static final int TITLECASE_ITERATOR_MASK = 0xe0;
public static final int TITLECASE_ADJUST_TO_CASED = 0x400;
/**
* Bit mask for the titlecasing index adjustment options bit set.
* Currently two bits are defined:
* TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED.
* See stringoptions.h.
* @internal
*/
private static final int TITLECASE_ADJUSTMENT_MASK = 0x600;
/**
 * Adds a titlecasing index adjustment option to an options word.
 *
 * @param options   the current options bit set
 * @param newOption the adjustment option to add
 * @return options with newOption set
 * @throws IllegalArgumentException if options already contains
 *         adjustment bits that differ from newOption
 */
public static int addTitleAdjustmentOption(int options, int newOption) {
    final int existing = options & TITLECASE_ADJUSTMENT_MASK;
    final boolean conflicts = existing != 0 && existing != newOption;
    if (conflicts) {
        throw new IllegalArgumentException("multiple titlecasing index adjustment options");
    }
    return options | newOption;
}
private static final int LNS =
(1 << UCharacterCategory.UPPERCASE_LETTER) |
(1 << UCharacterCategory.LOWERCASE_LETTER) |
(1 << UCharacterCategory.TITLECASE_LETTER) |
// Not MODIFIER_LETTER: We count only cased modifier letters.
(1 << UCharacterCategory.OTHER_LETTER) |
(1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
(1 << UCharacterCategory.LETTER_NUMBER) |
(1 << UCharacterCategory.OTHER_NUMBER) |
(1 << UCharacterCategory.MATH_SYMBOL) |
(1 << UCharacterCategory.CURRENCY_SYMBOL) |
(1 << UCharacterCategory.MODIFIER_SYMBOL) |
(1 << UCharacterCategory.OTHER_SYMBOL) |
(1 << UCharacterCategory.PRIVATE_USE);
/**
 * Tests whether a code point can be the target of the default titlecasing index adjustment:
 * a letter, number, symbol, or private use code point
 * (private use because those are typically used as letters or numbers).
 * A modifier letter qualifies only if it is cased.
 */
private static boolean isLNS(int c) {
    final int category = UCharacterProperty.INSTANCE.getType(c);
    if (category == UCharacterCategory.MODIFIER_LETTER) {
        // MODIFIER_LETTER is not part of the LNS mask; it counts only when cased.
        return UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE;
    }
    return (LNS & (1 << category)) != 0;
}
/**
 * Adds a titlecasing iterator option to an options word.
 *
 * @param options   the current options bit set
 * @param newOption the iterator option to add
 * @return options with newOption set
 * @throws IllegalArgumentException if options already contains
 *         iterator bits that differ from newOption
 */
public static int addTitleIteratorOption(int options, int newOption) {
    final int existing = options & TITLECASE_ITERATOR_MASK;
    final boolean conflicts = existing != 0 && existing != newOption;
    if (conflicts) {
        throw new IllegalArgumentException("multiple titlecasing iterator options");
    }
    return options | newOption;
}
/**
 * Selects the BreakIterator to use for titlecasing.
 *
 * @param locale  the locale for a newly created word or sentence iterator
 * @param options options bit set; only the titlecasing iterator bits are inspected
 * @param iter    an explicit iterator, or null
 * @return iter if it is not null, otherwise a new iterator for the requested iterator option
 * @throws IllegalArgumentException if an iterator option is combined with an explicit
 *         iterator, or if the iterator option value is unknown
 */
public static BreakIterator getTitleBreakIterator(
        Locale locale, int options, BreakIterator iter) {
    final int iterOption = options & TITLECASE_ITERATOR_MASK;
    if (iter != null) {
        if (iterOption != 0) {
            throw new IllegalArgumentException(
                    "titlecasing iterator option together with an explicit iterator");
        }
        return iter;
    }
    if (iterOption == 0) {
        return BreakIterator.getWordInstance(locale);
    }
    if (iterOption == TITLECASE_WHOLE_STRING) {
        return new WholeStringBreakIterator();
    }
    if (iterOption == TITLECASE_SENTENCES) {
        return BreakIterator.getSentenceInstance(locale);
    }
    throw new IllegalArgumentException("unknown titlecasing iterator option");
}
/**
 * ULocale variant: resolves the BreakIterator to use for titlecasing.
 *
 * @param locale locale passed to the word/sentence iterator factories
 * @param options options word; only the TITLECASE_ITERATOR_MASK bits are examined
 * @param iter an explicit iterator, or null to create one according to the options
 * @return iter if it is non-null; otherwise a word iterator (no option),
 *         a whole-string iterator, or a sentence iterator
 * @throws IllegalArgumentException if an iterator option is combined with an
 *         explicit iterator, or if the iterator option bits are not recognized
 */
public static BreakIterator getTitleBreakIterator(
        ULocale locale, int options, BreakIterator iter) {
    final int iterOption = options & TITLECASE_ITERATOR_MASK;
    final boolean haveExplicitIterator = iter != null;
    if (haveExplicitIterator && iterOption != 0) {
        throw new IllegalArgumentException(
                "titlecasing iterator option together with an explicit iterator");
    }
    if (haveExplicitIterator) {
        return iter;
    }
    switch (iterOption) {
    case 0:
        return BreakIterator.getWordInstance(locale);
    case TITLECASE_WHOLE_STRING:
        return new WholeStringBreakIterator();
    case TITLECASE_SENTENCES:
        return BreakIterator.getSentenceInstance(locale);
    default:
        throw new IllegalArgumentException("unknown titlecasing iterator option");
    }
}
/**
 * Option bit: omit unchanged text when case-mapping with Edits.
 * It travels in the same int options word as the titlecasing options.
 * NOTE(review): presumably unchanged spans are then only recorded in the Edits
 * and not written to the destination -- confirm against appendUnchanged().
 */
public static final int OMIT_UNCHANGED_TEXT = 0x4000;
/**
 * Minimal BreakIterator that treats the whole text as a single segment:
 * first() returns 0 and next() returns the length of the text.
 * Only first(), next() and the setText() overloads are supported;
 * every other operation throws UnsupportedOperationException.
 */
private static final class WholeStringBreakIterator extends BreakIterator {
    /** Length of the most recently set text, which is the only boundary after 0. */
    private int textLength;

    /** Builds the exception thrown by all operations that titlecasing does not use. */
    private static UnsupportedOperationException unsupported() {
        return new UnsupportedOperationException("should not occur");
    }

    // ---- supported operations ----

    @Override
    public void setText(String newText) {
        textLength = newText.length();
    }

    @Override
    public void setText(CharacterIterator newText) {
        textLength = newText.getEndIndex();
    }

    @Override
    public int first() {
        return 0;
    }

    @Override
    public int next() {
        // Always reports the end of the text.
        return textLength;
    }

    // ---- unsupported operations ----

    @Override
    public int last() {
        throw unsupported();
    }

    @Override
    public int next(int n) {
        throw unsupported();
    }

    @Override
    public int previous() {
        throw unsupported();
    }

    @Override
    public int following(int offset) {
        throw unsupported();
    }

    @Override
    public int current() {
        throw unsupported();
    }

    @Override
    public CharacterIterator getText() {
        throw unsupported();
    }
}
private static int appendCodePoint(Appendable a, int c) throws IOException {
if (c <= Character.MAX_VALUE) {
a.append((char)c);
@ -266,32 +451,33 @@ public final class CaseMapImpl {
}
/*
* Unicode 4 & 5 section 3.13 Default Case Operations:
*
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
* cased character F. If F exists, map F to default_title(F); then map each
* subsequent character C to default_lower(C).
*
* In this implementation, segment [prev..index[ into 3 parts:
* a) uncased characters (copy as-is) [prev..titleStart[
* b) first case letter (titlecase) [titleStart..titleLimit[
* Segment [prev..index[ into 3 parts:
* a) skipped characters (copy as-is) [prev..titleStart[
* b) first letter (titlecase) [titleStart..titleLimit[
* c) subsequent characters (lowercase) [titleLimit..index[
*/
if(prev<index) {
// find and copy uncased characters [prev..titleStart[
// Find and copy skipped characters [prev..titleStart[
int titleStart=prev;
iter.setLimit(index);
int c=iter.nextCaseMapCP();
if((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0
&& UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {
// Adjust the titlecasing index (titleStart) to the next cased character.
while((c=iter.nextCaseMapCP())>=0
&& UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {}
if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
// Adjust the titlecasing index to the next cased character,
// or to the next letter/number/symbol/private use.
// Stop with titleStart<titleLimit<=index
// if there is a character to be titlecased,
// or else stop with titleStart==titleLimit==index.
boolean toCased = (options&CaseMapImpl.TITLECASE_ADJUST_TO_CASED) != 0;
while ((toCased ?
UCaseProps.NONE==UCaseProps.INSTANCE.getType(c) :
!CaseMapImpl.isLNS(c)) &&
(c=iter.nextCaseMapCP())>=0) {}
// If c<0 then we have only uncased characters in [prev..index[
// and stopped with titleStart==titleLimit==index.
titleStart=iter.getCPStart();
appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
if (prev < titleStart) {
appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
}
}
if(titleStart<index) {

View file

@ -26,6 +26,7 @@ import com.ibm.icu.impl.locale.AsciiUtil;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.DisplayContext;
import com.ibm.icu.text.DisplayContext.Type;
import com.ibm.icu.text.LocaleDisplayNames;
@ -86,6 +87,13 @@ public class LocaleDisplayNamesImpl extends LocaleDisplayNames {
*/
private transient BreakIterator capitalizationBrkIter = null;
/** Shared titlecasing options: whole-string titlecasing without lowercasing the rest. */
private static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE =
        CaseMap.toTitle().wholeString().noLowercase();

/**
 * Titlecases s as a whole string, without lowercasing the other characters.
 *
 * @param locale the locale whose casing rules apply (converted with toLocale())
 * @param s the text to titlecase
 * @return the titlecased text as a new String
 */
private static String toTitleWholeStringNoLowercase(ULocale locale, String s) {
    // A null BreakIterator lets the CaseMap.Title options determine the iterator.
    final StringBuilder titled = TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(
            locale.toLocale(), null, s, new StringBuilder(), null);
    return titled.toString();
}
public static LocaleDisplayNames getInstance(ULocale locale, DialectHandling dialectHandling) {
synchronized (cache) {
@ -602,9 +610,12 @@ public class LocaleDisplayNamesImpl extends LocaleDisplayNames {
ULocale minimized = ULocale.minimizeSubtags(modified, ULocale.Minimize.FAVOR_SCRIPT);
String tempName = modified.getDisplayName(locale);
boolean titlecase = capContext == DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU;
String nameInDisplayLocale = titlecase ? UCharacter.toTitleFirst(locale, tempName) : tempName;
String nameInDisplayLocale =
titlecase ? toTitleWholeStringNoLowercase(locale, tempName) : tempName;
tempName = modified.getDisplayName(modified);
String nameInSelf = capContext == DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU ? UCharacter.toTitleFirst(modified, tempName) : tempName;
String nameInSelf = capContext ==
DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU ?
toTitleWholeStringNoLowercase(modified, tempName) : tempName;
return new UiListItem(minimized, modified, nameInDisplayLocale, nameInSelf);
}

View file

@ -1124,9 +1124,15 @@ public final class UCaseProps {
/**
* Bit mask for getting just the options from a string compare options word
* that are relevant for case folding (of a single string or code point).
*
* Currently only bit 0 for FOLD_CASE_EXCLUDE_SPECIAL_I.
* It is conceivable that at some point we might use one more bit for using uppercase sharp s.
* It is conceivable that at some point we might want the option to use only simple case foldings
* when operating on strings.
*
* @internal
*/
private static final int FOLD_CASE_OPTIONS_MASK = 0xff;
private static final int FOLD_CASE_OPTIONS_MASK = 7;
/* return the simple case folding mapping for c */
public final int fold(int c, int options) {

View file

@ -5185,22 +5185,14 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
*/
public static String toTitleCase(ULocale locale, String str,
BreakIterator titleIter, int options) {
if(titleIter == null) {
if (locale == null) {
locale = ULocale.getDefault();
}
titleIter = BreakIterator.getWordInstance(locale);
if (titleIter == null && locale == null) {
locale = ULocale.getDefault();
}
titleIter = CaseMapImpl.getTitleBreakIterator(locale, options, titleIter);
titleIter.setText(str);
return toTitleCase(getCaseLocale(locale), options, titleIter, str);
}
private static final int BREAK_MASK =
(1<<UCharacterCategory.DECIMAL_DIGIT_NUMBER)
| (1<<UCharacterCategory.OTHER_LETTER)
| (1<<UCharacterCategory.MODIFIER_LETTER);
/**
* Return a string with just the first word titlecased, for menus and UI, etc. This does not affect most of the string,
* and sometimes has no effect at all; the original string is returned whenever casing
@ -5225,49 +5217,14 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
*/
@Deprecated
public static String toTitleFirst(ULocale locale, String str) {
int c = 0;
for (int i = 0; i < str.length(); i += UCharacter.charCount(c)) {
c = UCharacter.codePointAt(str, i);
int propertyMask = UCharacter.getIntPropertyValue(c, UProperty.GENERAL_CATEGORY_MASK);
if ((propertyMask & BREAK_MASK) != 0) { // handle "49ers", initial CJK
break;
}
if (UCaseProps.INSTANCE.getType(c) == UCaseProps.NONE) {
continue;
}
// we now have the first cased character
// What we really want is something like:
// String titled = UCharacter.toTitleCase(locale, str, i, outputCharsTaken);
// That is, just give us the titlecased string, for the locale, at i and following,
// and tell us how many characters are replaced.
// The following won't work completely: it needs some more substantial changes to UCaseProps
String substring = str.substring(i, i+UCharacter.charCount(c));
String titled = UCharacter.toTitleCase(locale, substring, BreakIterator.getSentenceInstance(locale), 0);
// skip if no change
if (titled.codePointAt(0) == c) {
// Using 0 is safe, since any change in titling will not have first initial character
break;
}
StringBuilder result = new StringBuilder(str.length()).append(str, 0, i);
int startOfSuffix;
// handle dutch, but check first for 'i', since that's faster. Should be built into UCaseProps.
if (c == 'i' && locale.getLanguage().equals("nl") && i < str.length() && str.charAt(i+1) == 'j') {
result.append("IJ");
startOfSuffix = 2;
} else {
result.append(titled);
startOfSuffix = i + UCharacter.charCount(c);
}
// add the remainder, and return
return result.append(str, startOfSuffix, str.length()).toString();
}
return str; // no change
return toTitleCase(locale, str, null,
CaseMapImpl.TITLECASE_WHOLE_STRING|TITLECASE_NO_LOWERCASE);
// TODO: Remove this function.
// Move something like the following helper function into CLDR.
// private static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE =
// CaseMap.toTitle().wholeString().noLowercase();
// return TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(
// locale.toLocale(), null, str, new StringBuilder(), null).toString();
}
/**
@ -5295,9 +5252,10 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
public static String toTitleCase(Locale locale, String str,
BreakIterator titleIter,
int options) {
if(titleIter == null) {
titleIter = BreakIterator.getWordInstance(locale);
if (titleIter == null && locale == null) {
locale = Locale.getDefault();
}
titleIter = CaseMapImpl.getTitleBreakIterator(locale, options, titleIter);
titleIter.setText(str);
return toTitleCase(getCaseLocale(locale), options, titleIter, str);
}

View file

@ -174,6 +174,42 @@ public abstract class CaseMap {
private static final Title OMIT_UNCHANGED = new Title(CaseMapImpl.OMIT_UNCHANGED_TEXT);
private Title(int opt) { super(opt); }
/**
 * Returns an instance that behaves like this one but
 * treats the string as one unit for titlecasing instead of titlecasing each word.
 * (Only the character at index 0, possibly adjusted, is titlecased.)
 *
 * <p>Combining several titlecasing iterator options is an error,
 * and so is combining an iterator option with an explicit BreakIterator.
 *
 * @return an options object with this option.
 * @see #adjustToCased()
 * @draft ICU 60
 * @provisional This API might change or be removed in a future release.
 */
public Title wholeString() {
    final int combined = CaseMapImpl.addTitleIteratorOption(
            internalOptions, CaseMapImpl.TITLECASE_WHOLE_STRING);
    return new Title(combined);
}
/**
 * Returns an instance that behaves like this one but
 * titlecases per sentence instead of per word.
 * (Only the first character of each sentence, possibly adjusted, is titlecased.)
 *
 * <p>Combining several titlecasing iterator options is an error,
 * and so is combining an iterator option with an explicit BreakIterator.
 *
 * @return an options object with this option.
 * @see #adjustToCased()
 * @draft ICU 60
 * @provisional This API might change or be removed in a future release.
 */
public Title sentences() {
    final int combined = CaseMapImpl.addTitleIteratorOption(
            internalOptions, CaseMapImpl.TITLECASE_SENTENCES);
    return new Title(combined);
}
/**
* {@inheritDoc}
* @draft ICU 59
@ -191,12 +227,14 @@ public abstract class CaseMap {
* Returns an instance that behaves like this one but
* does not lowercase non-initial parts of words when titlecasing.
*
* <p>By default, titlecasing will titlecase the first cased character
* of a word and lowercase all other characters.
* <p>By default, titlecasing will titlecase the character at each
* (possibly adjusted) BreakIterator index and
* lowercase all other characters up to the next iterator index.
* With this option, the other characters will not be modified.
*
* @return an options object with this option.
* @see UCharacter#TITLECASE_NO_LOWERCASE
* @see #adjustToCased()
* @draft ICU 59
* @provisional This API might change or be removed in a future release.
*/
@ -204,22 +242,16 @@ public abstract class CaseMap {
return new Title(internalOptions | UCharacter.TITLECASE_NO_LOWERCASE);
}
// TODO: update references to the Unicode Standard for recent version
/**
* Returns an instance that behaves like this one but
* does not adjust the titlecasing indexes from BreakIterator::next() indexes;
* does not adjust the titlecasing BreakIterator indexes;
* titlecases exactly the characters at breaks from the iterator.
*
* <p>By default, titlecasing will take each break iterator index,
* adjust it by looking for the next cased character, and titlecase that one.
* Other characters are lowercased.
* adjust it to the next relevant character (see {@link #adjustToCased()}),
* and titlecase that one.
*
* <p>This follows Unicode 4 &amp; 5 section 3.13 Default Case Operations:
*
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
* cased character F. If F exists, map F to default_title(F); then map each
* subsequent character C to default_lower(C).
* <p>Other characters are lowercased.
*
* @return an options object with this option.
* @see UCharacter#TITLECASE_NO_BREAK_ADJUSTMENT
@ -227,7 +259,33 @@ public abstract class CaseMap {
* @provisional This API might change or be removed in a future release.
*/
public Title noBreakAdjustment() {
return new Title(internalOptions | UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT);
return new Title(CaseMapImpl.addTitleAdjustmentOption(
internalOptions, UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT));
}
/**
 * Returns an instance that behaves like this one but
 * moves each titlecasing BreakIterator index forward to the next cased character.
 * (See the Unicode Standard, chapter 3, Default Case Conversion, R3 toTitlecase(X).)
 *
 * <p>ICU used to adjust indexes this way by default.
 * Since ICU 60, the default is to adjust to the next character that is
 * a letter, number, symbol, or private use code point.
 * (Uncased modifier letters are skipped.)
 * For word titlecasing the two adjustments rarely differ,
 * but for whole-string and sentence titlecasing the new default works much better:
 * It yields "49ers" and "«丰(abc)»" instead of "49Ers" and "«丰(Abc)»".
 *
 * <p>It is an error to specify multiple titlecasing adjustment options together.
 *
 * @return an options object with this option.
 * @see #noBreakAdjustment()
 * @draft ICU 60
 * @provisional This API might change or be removed in a future release.
 */
public Title adjustToCased() {
    final int combined = CaseMapImpl.addTitleAdjustmentOption(
            internalOptions, CaseMapImpl.TITLECASE_ADJUST_TO_CASED);
    return new Title(combined);
}
/**
@ -259,9 +317,10 @@ public abstract class CaseMap {
*/
public <A extends Appendable> A apply(
Locale locale, BreakIterator iter, CharSequence src, A dest, Edits edits) {
if (iter == null) {
iter = BreakIterator.getWordInstance(locale);
if (iter == null && locale == null) {
locale = Locale.getDefault();
}
iter = CaseMapImpl.getTitleBreakIterator(locale, internalOptions, iter);
iter.setText(src.toString());
return CaseMapImpl.toTitle(
getCaseLocale(locale), internalOptions, iter, src, dest, edits);

View file

@ -343,6 +343,63 @@ public final class UCharacterCaseTest extends TestFmwk
}
}
// Not a @Test. See ICU4C intltest strcase.cpp TestCasingImpl().
// Helper: titlecases input with the given options and locale (no explicit
// BreakIterator, no Edits) and asserts that the result equals output.
void TestCasingImpl(String input, String output, CaseMap.Title toTitle, Locale locale) {
    StringBuilder sink = new StringBuilder();
    String actual = toTitle.apply(locale, null, input, sink, null).toString();
    assertEquals("toTitle(" + input + ')', output, actual);
}
// Exercises the titlecasing options added in ICU 60 (wholeString, sentences,
// adjustToCased) alone and combined with noLowercase/noBreakAdjustment,
// then verifies that conflicting option combinations are rejected.
@Test
public void TestTitleOptions() {
Locale root = Locale.ROOT;
// New options in ICU 60.
// Whole string: only the first letter is titlecased; the leading uncased
// modifier letter is skipped and everything else is lowercased.
TestCasingImpl("ʻcAt! ʻeTc.", "ʻCat! ʻetc.",
CaseMap.toTitle().wholeString(), root);
// Sentences + noLowercase: the first letter of each sentence is titlecased,
// all other characters are left unchanged.
TestCasingImpl("a ʻCaT. A ʻdOg! ʻeTc.", "A ʻCaT. A ʻdOg! ʻETc.",
CaseMap.toTitle().sentences().noLowercase(), root);
// Default adjustment stops at the first letter/number/symbol: the digit or the
// symbol is the "titlecased" character, so the letters are only lowercased.
TestCasingImpl("49eRs", "49ers",
CaseMap.toTitle().wholeString(), root);
TestCasingImpl("«丰(aBc)»", "«丰(abc)»",
CaseMap.toTitle().wholeString(), root);
// adjustToCased: skip ahead to the first cased character and titlecase that one.
TestCasingImpl("49eRs", "49Ers",
CaseMap.toTitle().wholeString().adjustToCased(), root);
TestCasingImpl("«丰(aBc)»", "«丰(Abc)»",
CaseMap.toTitle().wholeString().adjustToCased(), root);
// Leading space: with adjustment the first letter is titlecased;
// with noBreakAdjustment the space at index 0 is the titlecasing position,
// so no letter is capitalized and the rest is lowercased.
TestCasingImpl(" john. Smith", " John. Smith",
CaseMap.toTitle().wholeString().noLowercase(), root);
TestCasingImpl(" john. Smith", " john. smith",
CaseMap.toTitle().wholeString().noBreakAdjustment(), root);
// Locale-specific titlecasing still applies after index adjustment:
// Dutch "ij" -> "IJ", Turkish "i" -> dotted capital I.
TestCasingImpl("«ijs»", "«IJs»",
CaseMap.toTitle().wholeString(), new Locale("nl", "BE"));
TestCasingImpl("«ijs»", "«İjs»",
CaseMap.toTitle().wholeString(), new Locale("tr", "DE"));
// Test conflicting settings.
// If & when we add more options, then the ORed combinations may become
// indistinguishable from valid values.
// Two different index adjustment options must be rejected.
try {
CaseMap.toTitle().noBreakAdjustment().adjustToCased().
apply(root, null, "", new StringBuilder(), null);
fail("CaseMap.toTitle(multiple adjustment options) " +
"did not throw an IllegalArgumentException");
} catch(IllegalArgumentException expected) {
}
// Two different iterator options must be rejected.
try {
CaseMap.toTitle().wholeString().sentences().
apply(root, null, "", new StringBuilder(), null);
fail("CaseMap.toTitle(multiple iterator options) " +
"did not throw an IllegalArgumentException");
} catch(IllegalArgumentException expected) {
}
// An iterator option together with an explicit BreakIterator must be rejected.
BreakIterator iter = BreakIterator.getCharacterInstance(root);
try {
CaseMap.toTitle().wholeString().apply(root, iter, "", new StringBuilder(), null);
fail("CaseMap.toTitle(iterator option + iterator) " +
"did not throw an IllegalArgumentException");
} catch(IllegalArgumentException expected) {
}
}
@Test
public void TestDutchTitle() {
ULocale LOC_DUTCH = new ULocale("nl");

View file

@ -133,7 +133,7 @@ public class TransliteratorTest extends TestFmwk {
Transliterator hanLatin = Transliterator.getInstance("Han-Latin");
assertTransform("Transform", "z\u00E0o Unicode", hanLatin, "\u9020Unicode");
assertTransform("Transform", "z\u00E0i chu\u00E0ng z\u00E0o Unicode zh\u012B qi\u00E1n", hanLatin, "\u5728\u5275\u9020Unicode\u4E4B\u524D");
}
}
@Test
public void TestRegistry() {
@ -510,15 +510,19 @@ public class TransliteratorTest extends TestFmwk {
Transliterator hex = Transliterator.getInstance("Any-Hex");
hex.setFilter(new UnicodeFilter() {
@Override
public boolean contains(int c) {
return c != 'c';
}
@Override
public String toPattern(boolean escapeUnprintable) {
return "";
}
@Override
public boolean matchesIndexValue(int v) {
return false;
}
@Override
public void addMatchSetTo(UnicodeSet toUnionTo) {}
});
String s = "abcde";
@ -1561,6 +1565,7 @@ public class TransliteratorTest extends TestFmwk {
public NameableNullTrans(String id) {
super(id, null);
}
@Override
protected void handleTransliterate(Replaceable text,
Position offsets, boolean incremental) {
offsets.start = offsets.limit;
@ -1570,6 +1575,7 @@ public class TransliteratorTest extends TestFmwk {
public TestFact(String theID) {
id = theID;
}
@Override
public Transliterator getInstance(String ignoredID) {
return new NameableNullTrans(id);
}
@ -1873,8 +1879,8 @@ public class TransliteratorTest extends TestFmwk {
t.setFilter(new UnicodeSet("[:Ll:]"));
expect(t, "aAaA", "bAbA");
} finally {
Transliterator.unregister("a_to_A");
Transliterator.unregister("A_to_b");
Transliterator.unregister("a_to_A");
Transliterator.unregister("A_to_b");
}
}
@ -2731,6 +2737,7 @@ public class TransliteratorTest extends TestFmwk {
//System.out.println("Registering: " + ID + ", " + t.toRules(true));
Transliterator.registerFactory(ID, singleton);
}
@Override
public Transliterator getInstance(String ID) {
return (Transliterator) m.get(ID);
}
@ -2751,8 +2758,17 @@ public class TransliteratorTest extends TestFmwk {
String casefold = UCharacter.foldCase(s, true);
assertEquals("Casefold", casefold, toCasefold.transform(s));
String title = UCharacter.toTitleCase(ULocale.ROOT, s, null);
assertEquals("Title", title, toTitle.transform(s));
if (i != 0x0345) {
// ICU 60 changes the default titlecasing index adjustment.
// For word breaks it is mostly the same as before,
// but it is different for the iota subscript (the only cased combining mark).
// This should be ok because the iota subscript is not supposed to appear
// at the start of a word.
// The title Transliterator is far below feature parity with the
// UCharacter and CaseMap titlecasing functions.
String title = UCharacter.toTitleCase(ULocale.ROOT, s, null);
assertEquals("Title", title, toTitle.transform(s));
}
String upper = UCharacter.toUpperCase(ULocale.ROOT, s);
assertEquals("Upper", upper, toUpper.transform(s));
@ -3008,6 +3024,7 @@ public class TransliteratorTest extends TestFmwk {
Transliterator.registerFactory(ID, singleton);
}
@Override
public Transliterator getInstance(String ID) {
return (Transliterator) m.get(new CaseInsensitiveString(ID));
}
@ -3040,7 +3057,7 @@ public class TransliteratorTest extends TestFmwk {
*/
@Test
public void TestAny() {
UnicodeSet alphabetic = (UnicodeSet) new UnicodeSet("[:alphabetic:]").freeze();
UnicodeSet alphabetic = new UnicodeSet("[:alphabetic:]").freeze();
StringBuffer testString = new StringBuffer();
for (int i = 0; i < UScript.CODE_LIMIT; ++i) {
UnicodeSet sample = new UnicodeSet().applyPropertyAlias("script", UScript.getShortName(i)).retainAll(alphabetic);
@ -3142,7 +3159,7 @@ public class TransliteratorTest extends TestFmwk {
// add all the trail characters
if (!nonStarters.containsSome(trailString)) {
continue;
continue;
}
UnicodeSet trailSet = leadToTrail.get(first);
if (trailSet == null) {
@ -3190,7 +3207,7 @@ public class TransliteratorTest extends TestFmwk {
// disorderedMarks.add(s);
// disorderedMarks.add(nfc.normalize(s));
// addDerivedStrings(nfc, disorderedMarks, s);
// }
// }
// s = nfd.getDecomposition(i);
// if (s != null) {
// disorderedMarks.add(s);
@ -3292,6 +3309,10 @@ public class TransliteratorTest extends TestFmwk {
addSourceTarget(s, empiricalSource, t, empiricalTarget);
}
}
if (rule.contains("title")) {
// See the comment in TestCasing() about the iota subscript.
empiricalSource.remove(0x345);
}
assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK);
assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK);
}
@ -3336,8 +3357,8 @@ public class TransliteratorTest extends TestFmwk {
String direction = t == t0 ? "FORWARD\t" : "REVERSE\t";
targetIndex++;
UnicodeSet expectedTarget = testPair.length <= targetIndex ? expectedSource
: testPair[targetIndex] == null ? expectedSource
: testPair[targetIndex].length() == 0 ? expectedSource
: testPair[targetIndex] == null ? expectedSource
: testPair[targetIndex].length() == 0 ? expectedSource
: new UnicodeSet(testPair[targetIndex]);
ok = assertEquals(direction + "getSource\t\"" + test + '"', expectedSource, source);
if (!ok) { // for debugging
@ -3410,7 +3431,7 @@ public class TransliteratorTest extends TestFmwk {
};
for (String[] row : startTests) {
int actual = findSharedStartLength(row[1], row[2]);
assertEquals("findSharedStartLength(" + row[1] + "," + row[2] + ")",
assertEquals("findSharedStartLength(" + row[1] + "," + row[2] + ")",
Integer.parseInt(row[0]),
actual);
}
@ -3423,8 +3444,8 @@ public class TransliteratorTest extends TestFmwk {
};
for (String[] row : endTests) {
int actual = findSharedEndLength(row[1], row[2]);
assertEquals("findSharedEndLength(" + row[1] + "," + row[2] + ")",
Integer.parseInt(row[0]),
assertEquals("findSharedEndLength(" + row[1] + "," + row[2] + ")",
Integer.parseInt(row[0]),
actual);
}
}
@ -3916,7 +3937,7 @@ the ::BEGIN/::END stuff)
@Test
public void TestThai() {
Transliterator tr = Transliterator.getInstance("Any-Latin", Transliterator.FORWARD);
String thaiText =
String thaiText =
"\u0e42\u0e14\u0e22\u0e1e\u0e37\u0e49\u0e19\u0e10\u0e32\u0e19\u0e41\u0e25\u0e49\u0e27, \u0e04\u0e2d" +
"\u0e21\u0e1e\u0e34\u0e27\u0e40\u0e15\u0e2d\u0e23\u0e4c\u0e08\u0e30\u0e40\u0e01\u0e35\u0e48\u0e22" +
"\u0e27\u0e02\u0e49\u0e2d\u0e07\u0e01\u0e31\u0e1a\u0e40\u0e23\u0e37\u0e48\u0e2d\u0e07\u0e02\u0e2d" +
@ -3948,7 +3969,7 @@ the ::BEGIN/::END stuff)
"\u0e17\u0e04\u0e19\u0e34\u0e04\u0e17\u0e35\u0e48\u0e43\u0e0a\u0e49\u0e01\u0e31\u0e19\u0e2d\u0e22" +
"\u0e39\u0e48\u0e17\u0e31\u0e48\u0e27\u0e44\u0e1b.";
String latinText =
String latinText =
"doy ph\u1ee5\u0304\u0302n \u1e6d\u0304h\u0101n l\u00e6\u0302w, khxmphiwtexr\u0312 ca ke\u012b\u0300" +
"ywk\u0304\u0125xng k\u1ea1b re\u1ee5\u0304\u0300xng k\u0304hxng t\u1ea1wlek\u0304h. khxmphiwtexr" +
"\u0312 c\u1ea1d k\u0115b t\u1ea1w x\u1ea1ks\u0304\u02b9r l\u00e6a x\u1ea1kk\u0304h ra x\u1ee5\u0304" +
@ -4041,6 +4062,7 @@ the ::BEGIN/::END stuff)
this.expectedData = expectedData;
}
@Override
public void run() {
errorMsg = null;
StringBuffer inBuf = new StringBuffer(testData);