mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-9198 add titlecasing options: wholeString, sentences, adjustToCased
X-SVN-Rev: 40164
This commit is contained in:
parent
06a03303cb
commit
cfef2fb339
19 changed files with 831 additions and 243 deletions
|
@ -69,10 +69,16 @@ enum {
|
|||
/**
|
||||
* Bit mask for getting just the options from a string compare options word
|
||||
* that are relevant for case folding (of a single string or code point).
|
||||
*
|
||||
* Currently only bit 0 for U_FOLD_CASE_EXCLUDE_SPECIAL_I.
|
||||
* It is conceivable that at some point we might use one more bit for using uppercase sharp s.
|
||||
* It is conceivable that at some point we might want the option to use only simple case foldings
|
||||
* when operating on strings.
|
||||
*
|
||||
* See stringoptions.h.
|
||||
* @internal
|
||||
*/
|
||||
#define _FOLD_CASE_OPTIONS_MASK 0xff
|
||||
#define _FOLD_CASE_OPTIONS_MASK 7
|
||||
|
||||
/* single-code point functions */
|
||||
|
||||
|
|
|
@ -381,7 +381,7 @@ ucasemap_internalUTF8ToTitle(
|
|||
const uint8_t *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -408,45 +408,38 @@ ucasemap_internalUTF8ToTitle(
|
|||
}
|
||||
|
||||
/*
|
||||
* Unicode 4 & 5 section 3.13 Default Case Operations:
|
||||
*
|
||||
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
|
||||
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
|
||||
* cased character F. If F exists, map F to default_title(F); then map each
|
||||
* subsequent character C to default_lower(C).
|
||||
*
|
||||
* In this implementation, segment [prev..index[ into 3 parts:
|
||||
* a) uncased characters (copy as-is) [prev..titleStart[
|
||||
* b) first case letter (titlecase) [titleStart..titleLimit[
|
||||
* Segment [prev..index[ into 3 parts:
|
||||
* a) skipped characters (copy as-is) [prev..titleStart[
|
||||
* b) first letter (titlecase) [titleStart..titleLimit[
|
||||
* c) subsequent characters (lowercase) [titleLimit..index[
|
||||
*/
|
||||
if(prev<index) {
|
||||
/* find and copy uncased characters [prev..titleStart[ */
|
||||
/* find and copy skipped characters [prev..titleStart[ */
|
||||
int32_t titleStart=prev;
|
||||
int32_t titleLimit=prev;
|
||||
UChar32 c;
|
||||
U8_NEXT(src, titleLimit, index, c);
|
||||
if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
|
||||
/* Adjust the titlecasing index (titleStart) to the next cased character. */
|
||||
for(;;) {
|
||||
if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
|
||||
// Adjust the titlecasing index to the next cased character,
|
||||
// or to the next letter/number/symbol/private use.
|
||||
// Stop with titleStart<titleLimit<=index
|
||||
// if there is a character to be titlecased,
|
||||
// or else stop with titleStart==titleLimit==index.
|
||||
UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
|
||||
while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
|
||||
titleStart=titleLimit;
|
||||
if(titleLimit==index) {
|
||||
/*
|
||||
* only uncased characters in [prev..index[
|
||||
* stop with titleStart==titleLimit==index
|
||||
*/
|
||||
break;
|
||||
}
|
||||
U8_NEXT(src, titleLimit, index, c);
|
||||
if(UCASE_NONE!=ucase_getType(c)) {
|
||||
break; /* cased letter at [titleStart..titleLimit[ */
|
||||
}
|
||||
}
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+prev, titleStart-prev, options, edits);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
if (prev < titleStart) {
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+prev, titleStart-prev, options, edits);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -9,8 +9,27 @@
|
|||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "ucase.h"
|
||||
|
||||
/**
|
||||
* Bit mask for the titlecasing iterator options bit field.
|
||||
* Currently only 3 out of 8 values are used:
|
||||
* 0 (words), U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
|
||||
* See stringoptions.h.
|
||||
* @internal
|
||||
*/
|
||||
#define U_TITLECASE_ITERATOR_MASK 0xe0
|
||||
|
||||
/**
|
||||
* Bit mask for the titlecasing index adjustment options bit set.
|
||||
* Currently two bits are defined:
|
||||
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED.
|
||||
* See stringoptions.h.
|
||||
* @internal
|
||||
*/
|
||||
#define U_TITLECASE_ADJUSTMENT_MASK 0x600
|
||||
|
||||
/**
|
||||
* Internal API, used by u_strcasecmp() etc.
|
||||
* Compare strings case-insensitively,
|
||||
|
@ -23,7 +42,7 @@ u_strcmpFold(const UChar *s1, int32_t length1,
|
|||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Interanl API, used for detecting length of
|
||||
* Internal API, used for detecting length of
|
||||
* shared prefix case-insensitively.
|
||||
* @param s1 input string 1
|
||||
* @param length1 length of string 1, or -1 (NULL terminated)
|
||||
|
@ -52,6 +71,40 @@ uprv_haveProperties(UErrorCode *pErrorCode);
|
|||
|
||||
#ifdef __cplusplus
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Validates the titlecasing index-adjustment bits of an options word.
 *
 * @param options   Options bit set; only the bits in U_TITLECASE_ADJUSTMENT_MASK are examined.
 * @param errorCode In/out error code. If it already indicates a failure on entry,
 *                  the function returns FALSE without looking at options.
 * @return TRUE if the options are valid. Otherwise FALSE; when the failure is detected here
 *         (all bits of the adjustment mask set at once), errorCode is set to
 *         U_ILLEGAL_ARGUMENT_ERROR.
 */
|
||||
inline UBool ustrcase_checkTitleAdjustmentOptions(uint32_t options, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return FALSE; }
|
||||
// The mask covers exactly the adjustment option bits, so "all mask bits set"
// means that mutually exclusive adjustment options were combined.
if ((options & U_TITLECASE_ADJUSTMENT_MASK) == U_TITLECASE_ADJUSTMENT_MASK) {
|
||||
// Both options together.
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/**
 * Tests whether a code point is one that the titlecasing index adjustment may stop at.
 *
 * @param c The code point to classify.
 * @return TRUE if the general category of c is a letter (other than a modifier letter),
 *         a number, a symbol, or private use; also TRUE for a modifier letter
 *         whose ucase_getType() is not UCASE_NONE. FALSE otherwise.
 */
inline UBool ustrcase_isLNS(UChar32 c) {
|
||||
// Letter, number, symbol,
|
||||
// or a private use code point because those are typically used as letters or numbers.
|
||||
// Consider modifier letters only if they are cased.
|
||||
// Lm is removed from the mask here so that the plain category test below never accepts it.
const uint32_t LNS = (U_GC_L_MASK|U_GC_N_MASK|U_GC_S_MASK|U_GC_CO_MASK) & ~U_GC_LM_MASK;
|
||||
int gc = u_charType(c);
|
||||
// First clause: category is in the L/N/S/Co set (minus Lm).
// Second clause: re-admits modifier letters, but only when they are cased.
return (U_MASK(gc) & LNS) != 0 || (gc == U_MODIFIER_LETTER && ucase_getType(c) != UCASE_NONE);
|
||||
}
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/** Returns nullptr if error. Pass in either locale or locID, not both. */
|
||||
U_CFUNC
|
||||
BreakIterator *ustrcase_getTitleBreakIterator(
|
||||
const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
|
||||
LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode);
|
||||
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#include "unicode/unistr.h" // for UStringCaseMapper
|
||||
|
||||
/*
|
||||
|
|
|
@ -42,11 +42,8 @@ int32_t CaseMap::utf8ToTitle(
|
|||
UText utext=UTEXT_INITIALIZER;
|
||||
utext_openUTF8(&utext, src, srcLength, &errorCode);
|
||||
LocalPointer<BreakIterator> ownedIter;
|
||||
iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, ownedIter, errorCode);
|
||||
if(iter==NULL) {
|
||||
iter=BreakIterator::createWordInstance(Locale(locale), errorCode);
|
||||
ownedIter.adoptInstead(iter);
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
utext_close(&utext);
|
||||
return 0;
|
||||
}
|
||||
|
@ -88,12 +85,19 @@ ucasemap_utf8ToTitle(UCaseMap *csm,
|
|||
}
|
||||
UText utext=UTEXT_INITIALIZER;
|
||||
utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode);
|
||||
if(csm->iter==NULL) {
|
||||
csm->iter=BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode);
|
||||
}
|
||||
if (U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if(csm->iter==NULL) {
|
||||
LocalPointer<BreakIterator> ownedIter;
|
||||
BreakIterator *iter = ustrcase_getTitleBreakIterator(
|
||||
nullptr, csm->locale, csm->options, nullptr, ownedIter, *pErrorCode);
|
||||
if (iter == nullptr) {
|
||||
utext_close(&utext);
|
||||
return 0;
|
||||
}
|
||||
csm->iter = ownedIter.orphan();
|
||||
}
|
||||
csm->iter->setText(&utext, *pErrorCode);
|
||||
int32_t length=ucasemap_mapUTF8(
|
||||
csm->caseLocale, csm->options, csm->iter,
|
||||
|
|
|
@ -113,7 +113,9 @@ public:
|
|||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
|
||||
* U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT.
|
||||
* U_TITLECASE_NO_LOWERCASE,
|
||||
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
|
||||
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
|
||||
* @param iter A break iterator to find the first characters of words that are to be titlecased.
|
||||
* It is set to the source string (setText())
|
||||
* and used one or more times for iteration (first() and next()).
|
||||
|
@ -272,9 +274,11 @@ public:
|
|||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
|
||||
* U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT.
|
||||
* U_TITLECASE_NO_LOWERCASE,
|
||||
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
|
||||
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
|
||||
* @param iter A break iterator to find the first characters of words that are to be titlecased.
|
||||
* It is set to the source string (setText())
|
||||
* It is set to the source string (setUText())
|
||||
* and used one or more times for iteration (first() and next()).
|
||||
* If NULL, then a word break iterator for the locale is used
|
||||
* (or something equivalent).
|
||||
|
|
|
@ -39,49 +39,101 @@
|
|||
*/
|
||||
#define U_FOLD_CASE_EXCLUDE_SPECIAL_I 1
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Titlecase the string as a whole rather than each word.
|
||||
* (Titlecase only the character at index 0, possibly adjusted.)
|
||||
* Option bits value for titlecasing APIs that take an options bit set.
|
||||
*
|
||||
* It is an error to specify multiple titlecasing iterator options together,
|
||||
* including both an options bit and an explicit BreakIterator.
|
||||
*
|
||||
* @see U_TITLECASE_ADJUST_TO_CASED
|
||||
* @draft ICU 60
|
||||
*/
|
||||
#define U_TITLECASE_WHOLE_STRING 0x20
|
||||
|
||||
/**
|
||||
* Titlecase sentences rather than words.
|
||||
* (Titlecase only the first character of each sentence, possibly adjusted.)
|
||||
* Option bits value for titlecasing APIs that take an options bit set.
|
||||
*
|
||||
* It is an error to specify multiple titlecasing iterator options together,
|
||||
* including both an options bit and an explicit BreakIterator.
|
||||
*
|
||||
* @see U_TITLECASE_ADJUST_TO_CASED
|
||||
* @draft ICU 60
|
||||
*/
|
||||
#define U_TITLECASE_SENTENCES 0x40
|
||||
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Do not lowercase non-initial parts of words when titlecasing.
|
||||
* Option bit for titlecasing APIs that take an options bit set.
|
||||
*
|
||||
* By default, titlecasing will titlecase the first cased character
|
||||
* of a word and lowercase all other characters.
|
||||
* By default, titlecasing will titlecase the character at each
|
||||
* (possibly adjusted) BreakIterator index and
|
||||
* lowercase all other characters up to the next iterator index.
|
||||
* With this option, the other characters will not be modified.
|
||||
*
|
||||
* @see U_TITLECASE_ADJUST_TO_CASED
|
||||
* @see UnicodeString::toTitle
|
||||
* @see CaseMap::toTitle
|
||||
* @see ucasemap_setOptions
|
||||
* @see ucasemap_toTitle
|
||||
* @see ucasemap_utf8ToTitle
|
||||
* @see UnicodeString::toTitle
|
||||
* @stable ICU 3.8
|
||||
*/
|
||||
#define U_TITLECASE_NO_LOWERCASE 0x100
|
||||
|
||||
/**
|
||||
* Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
|
||||
* Do not adjust the titlecasing BreakIterator indexes;
|
||||
* titlecase exactly the characters at breaks from the iterator.
|
||||
* Option bit for titlecasing APIs that take an options bit set.
|
||||
*
|
||||
* By default, titlecasing will take each break iterator index,
|
||||
* adjust it by looking for the next cased character, and titlecase that one.
|
||||
* adjust it to the next relevant character (see U_TITLECASE_ADJUST_TO_CASED),
|
||||
* and titlecase that one.
|
||||
*
|
||||
* Other characters are lowercased.
|
||||
*
|
||||
* This follows Unicode 4 & 5 section 3.13 Default Case Operations:
|
||||
*
|
||||
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
|
||||
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
|
||||
* cased character F. If F exists, map F to default_title(F); then map each
|
||||
* subsequent character C to default_lower(C).
|
||||
* It is an error to specify multiple titlecasing adjustment options together.
|
||||
*
|
||||
* @see U_TITLECASE_ADJUST_TO_CASED
|
||||
* @see U_TITLECASE_NO_LOWERCASE
|
||||
* @see UnicodeString::toTitle
|
||||
* @see CaseMap::toTitle
|
||||
* @see ucasemap_setOptions
|
||||
* @see ucasemap_toTitle
|
||||
* @see ucasemap_utf8ToTitle
|
||||
* @see UnicodeString::toTitle
|
||||
* @see U_TITLECASE_NO_LOWERCASE
|
||||
* @stable ICU 3.8
|
||||
*/
|
||||
#define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Adjust each titlecasing BreakIterator index to the next cased character.
|
||||
* (See the Unicode Standard, chapter 3, Default Case Conversion, R3 toTitlecase(X).)
|
||||
* Option bit for titlecasing APIs that take an options bit set.
|
||||
*
|
||||
* This used to be the default index adjustment in ICU.
|
||||
* Since ICU 60, the default index adjustment is to the next character that is
|
||||
* a letter, number, symbol, or private use code point.
|
||||
* (Uncased modifier letters are skipped.)
|
||||
* The difference in behavior is small for word titlecasing,
|
||||
* but the new adjustment is much better for whole-string and sentence titlecasing:
|
||||
* It yields "49ers" and "«丰(abc)»" instead of "49Ers" and "«丰(Abc)»".
|
||||
*
|
||||
* It is an error to specify multiple titlecasing adjustment options together.
|
||||
*
|
||||
* @see U_TITLECASE_NO_BREAK_ADJUSTMENT
|
||||
* @draft ICU 60
|
||||
*/
|
||||
#define U_TITLECASE_ADJUST_TO_CASED 0x400
|
||||
|
||||
/**
|
||||
* Omit unchanged text when recording how source substrings
|
||||
* relate to changed and unchanged result substrings.
|
||||
|
@ -126,7 +178,9 @@
|
|||
//
|
||||
// Internal: (may change or be removed)
|
||||
// ucase.h #define _STRCASECMP_OPTIONS_MASK 0xffff
|
||||
// ucase.h #define _FOLD_CASE_OPTIONS_MASK 0xff
|
||||
// ucase.h #define _FOLD_CASE_OPTIONS_MASK 7
|
||||
// ucasemap_imp.h #define U_TITLECASE_ITERATOR_MASK 0xe0
|
||||
// ucasemap_imp.h #define U_TITLECASE_ADJUSTMENT_MASK 0x600
|
||||
// ustr_imp.h #define _STRNCMP_STYLE 0x1000
|
||||
// unormcmp.cpp #define _COMPARE_EQUIV 0x80000
|
||||
|
||||
|
|
|
@ -202,7 +202,7 @@ ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode
|
|||
* The standard titlecase iterator for the root locale implements the
|
||||
* algorithm of Unicode TR 21.
|
||||
*
|
||||
* This function uses only the setUText(), first(), next() and close() methods of the
|
||||
* This function uses only the setText(), first() and next() methods of the
|
||||
* provided break iterator.
|
||||
*
|
||||
* The result may be longer or shorter than the original.
|
||||
|
|
|
@ -2775,11 +2775,11 @@ public:
|
|||
* break iterator is opened.
|
||||
* Otherwise the provided iterator is set to the string's text.
|
||||
* @param locale The locale to consider.
|
||||
* @param options Options bit set, usually 0. See U_TITLECASE_NO_LOWERCASE,
|
||||
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
|
||||
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
|
||||
* @param options Options bit set, see ucasemap_open().
|
||||
* @return A reference to this.
|
||||
* @see U_TITLECASE_NO_LOWERCASE
|
||||
* @see U_TITLECASE_NO_BREAK_ADJUSTMENT
|
||||
* @see ucasemap_open
|
||||
* @stable ICU 3.8
|
||||
*/
|
||||
UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options);
|
||||
|
|
|
@ -30,31 +30,26 @@
|
|||
U_NAMESPACE_BEGIN
|
||||
|
||||
UnicodeString &
|
||||
UnicodeString::toTitle(BreakIterator *titleIter) {
|
||||
return toTitle(titleIter, Locale::getDefault(), 0);
|
||||
UnicodeString::toTitle(BreakIterator *iter) {
|
||||
return toTitle(iter, Locale::getDefault(), 0);
|
||||
}
|
||||
|
||||
UnicodeString &
|
||||
UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale) {
|
||||
return toTitle(titleIter, locale, 0);
|
||||
UnicodeString::toTitle(BreakIterator *iter, const Locale &locale) {
|
||||
return toTitle(iter, locale, 0);
|
||||
}
|
||||
|
||||
UnicodeString &
|
||||
UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options) {
|
||||
BreakIterator *bi=titleIter;
|
||||
if(bi==NULL) {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
bi=BreakIterator::createWordInstance(locale, errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
setToBogus();
|
||||
return *this;
|
||||
UnicodeString::toTitle(BreakIterator *iter, const Locale &locale, uint32_t options) {
|
||||
LocalPointer<BreakIterator> ownedIter;
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
iter = ustrcase_getTitleBreakIterator(&locale, "", options, iter, ownedIter, errorCode);
|
||||
if (iter == nullptr) {
|
||||
setToBogus();
|
||||
return *this;
|
||||
}
|
||||
}
|
||||
caseMap(ustrcase_getCaseLocale(locale.getBaseName()), options, bi, ustrcase_internalToTitle);
|
||||
if(titleIter==NULL) {
|
||||
delete bi;
|
||||
}
|
||||
return *this;
|
||||
caseMap(ustrcase_getCaseLocale(locale.getBaseName()), options, iter, ustrcase_internalToTitle);
|
||||
return *this;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -23,46 +23,153 @@
|
|||
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/casemap.h"
|
||||
#include "unicode/chariter.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/ubrk.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "unicode/utext.h"
|
||||
#include "cmemory.h"
|
||||
#include "uassert.h"
|
||||
#include "ucase.h"
|
||||
#include "ucasemap_imp.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/* functions available in the common library (for unistr_case.cpp) */
|
||||
/**
|
||||
* Whole-string BreakIterator.
|
||||
* Titlecasing only calls setText(), first(), and next().
|
||||
* We implement the rest only to satisfy the abstract interface.
|
||||
*/
|
||||
class WholeStringBreakIterator : public BreakIterator {
|
||||
public:
|
||||
WholeStringBreakIterator() : BreakIterator(), length(0) {}
|
||||
~WholeStringBreakIterator() override;
|
||||
UBool operator==(const BreakIterator&) const override;
|
||||
BreakIterator *clone() const override;
|
||||
static UClassID U_EXPORT2 getStaticClassID();
|
||||
UClassID getDynamicClassID() const override;
|
||||
CharacterIterator &getText() const override;
|
||||
UText *getUText(UText *fillIn, UErrorCode &errorCode) const override;
|
||||
void setText(const UnicodeString &text) override;
|
||||
void setText(UText *text, UErrorCode &errorCode) override;
|
||||
void adoptText(CharacterIterator* it) override;
|
||||
int32_t first() override;
|
||||
int32_t last() override;
|
||||
int32_t previous() override;
|
||||
int32_t next() override;
|
||||
int32_t current() const override;
|
||||
int32_t following(int32_t offset) override;
|
||||
int32_t preceding(int32_t offset) override;
|
||||
UBool isBoundary(int32_t offset) override;
|
||||
int32_t next(int32_t n) override;
|
||||
BreakIterator *createBufferClone(void *stackBuffer, int32_t &BufferSize,
|
||||
UErrorCode &errorCode) override;
|
||||
BreakIterator &refreshInputText(UText *input, UErrorCode &errorCode) override;
|
||||
|
||||
/* public API functions */
|
||||
private:
|
||||
int32_t length;
|
||||
};
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_strToTitle(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UBreakIterator *titleIter,
|
||||
const char *locale,
|
||||
UErrorCode *pErrorCode) {
|
||||
LocalPointer<BreakIterator> ownedIter;
|
||||
BreakIterator *iter;
|
||||
if(titleIter!=NULL) {
|
||||
iter=reinterpret_cast<BreakIterator *>(titleIter);
|
||||
} else {
|
||||
iter=BreakIterator::createWordInstance(Locale(locale), *pErrorCode);
|
||||
ownedIter.adoptInstead(iter);
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(WholeStringBreakIterator)
|
||||
|
||||
WholeStringBreakIterator::~WholeStringBreakIterator() {}
|
||||
UBool WholeStringBreakIterator::operator==(const BreakIterator&) const { return FALSE; }
|
||||
BreakIterator *WholeStringBreakIterator::clone() const { return nullptr; }
|
||||
|
||||
// Stub required by the abstract BreakIterator interface; this iterator keeps no text,
// so there is no CharacterIterator to hand out.
// NOTE(review): if the assertion below does not stop execution (presumably when
// assertions are compiled out - confirm), the function forms a reference by
// dereferencing a null pointer, which is undefined behavior in C++.
// Consider terminating explicitly or returning a reference to a real dummy object.
CharacterIterator &WholeStringBreakIterator::getText() const {
|
||||
U_ASSERT(FALSE); // really should not be called
|
||||
// Returns a null reference.
|
||||
// Otherwise we would have to define a dummy CharacterIterator,
|
||||
// and either have it as a field and const_cast it to a non-const reference,
|
||||
// or have it via a pointer and return a reference to that.
|
||||
CharacterIterator *none = nullptr;
|
||||
return *none;
|
||||
}
|
||||
UText *WholeStringBreakIterator::getUText(UText * /*fillIn*/, UErrorCode &errorCode) const {
|
||||
if (U_SUCCESS(errorCode)) {
|
||||
errorCode = U_UNSUPPORTED_ERROR;
|
||||
}
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
UnicodeString s(srcLength<0, src, srcLength);
|
||||
iter->setText(s);
|
||||
return ustrcase_mapWithOverlap(
|
||||
ustrcase_getCaseLocale(locale), 0, iter,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ustrcase_internalToTitle, *pErrorCode);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
// Records only the length of the text; the string itself is not retained.
// The stored length is what last()/next()/following() report as the single end boundary.
void WholeStringBreakIterator::setText(const UnicodeString &text) {
|
||||
length = text.length();
|
||||
}
|
||||
// UText variant of setText(): records utext_nativeLength(text) as the end boundary.
// Does nothing if errorCode already indicates a failure.
// If the native length does not fit into an int32_t, errorCode is set to
// U_INDEX_OUTOFBOUNDS_ERROR and the previously stored length is left unchanged.
void WholeStringBreakIterator::setText(UText *text, UErrorCode &errorCode) {
|
||||
if (U_SUCCESS(errorCode)) {
|
||||
// The native length is 64-bit, but this class stores a 32-bit length.
int64_t length64 = utext_nativeLength(text);
|
||||
if (length64 <= INT32_MAX) {
|
||||
length = (int32_t)length64;
|
||||
} else {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Stub required by the abstract BreakIterator interface; not expected to be called.
// If it is reached anyway, it still honors the "adopt" contract visible here:
// it records the iterator's length and then deletes the iterator.
// NOTE(review): `it` is dereferenced without a null check.
void WholeStringBreakIterator::adoptText(CharacterIterator* it) {
|
||||
U_ASSERT(FALSE); // should not be called
|
||||
length = it->getLength();
|
||||
// The iterator was handed over to us, and nothing else keeps it, so release it now.
delete it;
|
||||
}
|
||||
|
||||
// Iteration functions of the whole-string iterator.
// The iterator has no position state: the "start" functions
// (first, previous, current, preceding) always return 0, and the "end" functions
// (last, next, following, next(n)) always return the stored length,
// regardless of any offset or count argument.
// NOTE(review): isBoundary() returns FALSE for every offset, including 0 and length;
// that looks acceptable only because titlecasing does not call it - confirm.
int32_t WholeStringBreakIterator::first() { return 0; }
|
||||
int32_t WholeStringBreakIterator::last() { return length; }
|
||||
int32_t WholeStringBreakIterator::previous() { return 0; }
|
||||
int32_t WholeStringBreakIterator::next() { return length; }
|
||||
int32_t WholeStringBreakIterator::current() const { return 0; }
|
||||
int32_t WholeStringBreakIterator::following(int32_t /*offset*/) { return length; }
|
||||
int32_t WholeStringBreakIterator::preceding(int32_t /*offset*/) { return 0; }
|
||||
UBool WholeStringBreakIterator::isBoundary(int32_t /*offset*/) { return FALSE; }
|
||||
int32_t WholeStringBreakIterator::next(int32_t /*n*/) { return length; }
|
||||
|
||||
// Unsupported operation: never creates a clone and ignores the buffer arguments.
// Sets errorCode to U_UNSUPPORTED_ERROR unless it already indicates a failure
// (an existing failure code is preserved). Always returns nullptr.
BreakIterator *WholeStringBreakIterator::createBufferClone(
|
||||
void * /*stackBuffer*/, int32_t & /*BufferSize*/, UErrorCode &errorCode) {
|
||||
if (U_SUCCESS(errorCode)) {
|
||||
errorCode = U_UNSUPPORTED_ERROR;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
// Unsupported operation: the input text is ignored and the stored length is not updated.
// Sets errorCode to U_UNSUPPORTED_ERROR unless it already indicates a failure
// (an existing failure code is preserved). Always returns *this.
BreakIterator &WholeStringBreakIterator::refreshInputText(
|
||||
UText * /*input*/, UErrorCode &errorCode) {
|
||||
if (U_SUCCESS(errorCode)) {
|
||||
errorCode = U_UNSUPPORTED_ERROR;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
 * Selects the BreakIterator to use for titlecasing.
 *
 * @param locale    Locale for a newly created iterator; used when not nullptr.
 * @param locID     Locale ID used to construct a Locale when locale is nullptr.
 * @param options   Options bit set; only the bits in U_TITLECASE_ITERATOR_MASK are examined.
 * @param iter      Caller-provided iterator, or nullptr to have one created.
 *                  A provided iterator is returned unchanged and is not adopted.
 * @param ownedIter Receives ownership of an iterator created here;
 *                  not touched when iter was provided by the caller.
 * @param errorCode In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR if an iterator option
 *                  is combined with a caller-provided iterator, or if the iterator bits do
 *                  not form one of the supported values; set to U_MEMORY_ALLOCATION_ERROR
 *                  if the whole-string iterator cannot be allocated.
 * @return The iterator to use. nullptr if errorCode indicates a failure on entry
 *         or for the errors detected here.
 *         NOTE(review): when a BreakIterator::create...Instance() call fails, its return value
 *         is passed through as-is - presumably nullptr; confirm.
 */
U_CFUNC
|
||||
BreakIterator *ustrcase_getTitleBreakIterator(
|
||||
const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
|
||||
LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return nullptr; }
|
||||
// From here on, options holds only the iterator-selection bits.
options &= U_TITLECASE_ITERATOR_MASK;
|
||||
// An explicit iterator and an iterator option bit are mutually exclusive.
if (options != 0 && iter != nullptr) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return nullptr;
|
||||
}
|
||||
if (iter == nullptr) {
|
||||
switch (options) {
|
||||
// No iterator option: word iterator.
case 0:
|
||||
iter = BreakIterator::createWordInstance(
|
||||
locale != nullptr ? *locale : Locale(locID), errorCode);
|
||||
break;
|
||||
case U_TITLECASE_WHOLE_STRING:
|
||||
iter = new WholeStringBreakIterator();
|
||||
if (iter == nullptr) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
break;
|
||||
case U_TITLECASE_SENTENCES:
|
||||
iter = BreakIterator::createSentenceInstance(
|
||||
locale != nullptr ? *locale : Locale(locID), errorCode);
|
||||
break;
|
||||
// Any other value of the iterator bits, including combinations of the defined
// options, is rejected; iter stays nullptr.
default:
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
break;
|
||||
}
|
||||
// Hand whatever was created (possibly nullptr) to the caller's owner object.
ownedIter.adoptInstead(iter);
|
||||
}
|
||||
return iter;
|
||||
}
|
||||
|
||||
int32_t CaseMap::toTitle(
|
||||
const char *locale, uint32_t options, BreakIterator *iter,
|
||||
|
@ -70,11 +177,8 @@ int32_t CaseMap::toTitle(
|
|||
UChar *dest, int32_t destCapacity, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
LocalPointer<BreakIterator> ownedIter;
|
||||
iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, ownedIter, errorCode);
|
||||
if(iter==NULL) {
|
||||
iter=BreakIterator::createWordInstance(Locale(locale), errorCode);
|
||||
ownedIter.adoptInstead(iter);
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return 0;
|
||||
}
|
||||
UnicodeString s(srcLength<0, src, srcLength);
|
||||
|
@ -88,6 +192,30 @@ int32_t CaseMap::toTitle(
|
|||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
// C API entry point for titlecasing a UTF-16 string.
// Obtains the break iterator through ustrcase_getTitleBreakIterator() with options 0:
// a non-NULL titleIter is used as-is; otherwise an iterator is created for the locale ID
// and held by ownedIter for the duration of this call.
// Returns 0 if no iterator could be obtained (including when *pErrorCode already
// indicates a failure on entry); otherwise returns the result of
// ustrcase_mapWithOverlap() using ustrcase_internalToTitle.
U_CAPI int32_t U_EXPORT2
|
||||
u_strToTitle(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UBreakIterator *titleIter,
|
||||
const char *locale,
|
||||
UErrorCode *pErrorCode) {
|
||||
LocalPointer<BreakIterator> ownedIter;
|
||||
BreakIterator *iter = ustrcase_getTitleBreakIterator(
|
||||
nullptr, locale, 0, reinterpret_cast<BreakIterator *>(titleIter),
|
||||
ownedIter, *pErrorCode);
|
||||
if (iter == nullptr) {
|
||||
return 0;
|
||||
}
|
||||
// Wrap src so that the iterator can be given the text; the first argument tells the
// constructor whether src is NUL-terminated (srcLength<0).
// NOTE(review): presumably the read-only aliasing constructor, i.e. no copy of src - confirm.
UnicodeString s(srcLength<0, src, srcLength);
|
||||
iter->setText(s);
|
||||
return ustrcase_mapWithOverlap(
|
||||
ustrcase_getCaseLocale(locale), 0, iter,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ustrcase_internalToTitle, *pErrorCode);
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucasemap_toTitle(UCaseMap *csm,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
|
@ -97,10 +225,13 @@ ucasemap_toTitle(UCaseMap *csm,
|
|||
return 0;
|
||||
}
|
||||
if (csm->iter == NULL) {
|
||||
csm->iter = BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode);
|
||||
}
|
||||
if (U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
LocalPointer<BreakIterator> ownedIter;
|
||||
BreakIterator *iter = ustrcase_getTitleBreakIterator(
|
||||
nullptr, csm->locale, csm->options, nullptr, ownedIter, *pErrorCode);
|
||||
if (iter == nullptr) {
|
||||
return 0;
|
||||
}
|
||||
csm->iter = ownedIter.orphan();
|
||||
}
|
||||
UnicodeString s(srcLength<0, src, srcLength);
|
||||
csm->iter->setText(s);
|
||||
|
|
|
@ -237,7 +237,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
|
|||
const UChar *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -264,45 +264,38 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
|
|||
}
|
||||
|
||||
/*
|
||||
* Unicode 4 & 5 section 3.13 Default Case Operations:
|
||||
*
|
||||
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
|
||||
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
|
||||
* cased character F. If F exists, map F to default_title(F); then map each
|
||||
* subsequent character C to default_lower(C).
|
||||
*
|
||||
* In this implementation, segment [prev..index[ into 3 parts:
|
||||
* a) uncased characters (copy as-is) [prev..titleStart[
|
||||
* b) first case letter (titlecase) [titleStart..titleLimit[
|
||||
* Segment [prev..index[ into 3 parts:
|
||||
* a) skipped characters (copy as-is) [prev..titleStart[
|
||||
* b) first letter (titlecase) [titleStart..titleLimit[
|
||||
* c) subsequent characters (lowercase) [titleLimit..index[
|
||||
*/
|
||||
if(prev<index) {
|
||||
/* find and copy uncased characters [prev..titleStart[ */
|
||||
// Find and copy skipped characters [prev..titleStart[
|
||||
int32_t titleStart=prev;
|
||||
int32_t titleLimit=prev;
|
||||
UChar32 c;
|
||||
U16_NEXT(src, titleLimit, index, c);
|
||||
if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
|
||||
/* Adjust the titlecasing index (titleStart) to the next cased character. */
|
||||
for(;;) {
|
||||
if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
|
||||
// Adjust the titlecasing index to the next cased character,
|
||||
// or to the next letter/number/symbol/private use.
|
||||
// Stop with titleStart<titleLimit<=index
|
||||
// if there is a character to be titlecased,
|
||||
// or else stop with titleStart==titleLimit==index.
|
||||
UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
|
||||
while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
|
||||
titleStart=titleLimit;
|
||||
if(titleLimit==index) {
|
||||
/*
|
||||
* only uncased characters in [prev..index[
|
||||
* stop with titleStart==titleLimit==index
|
||||
*/
|
||||
break;
|
||||
}
|
||||
U16_NEXT(src, titleLimit, index, c);
|
||||
if(UCASE_NONE!=ucase_getType(c)) {
|
||||
break; /* cased letter at [titleStart..titleLimit[ */
|
||||
}
|
||||
}
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+prev, titleStart-prev, options, edits);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
if (prev < titleStart) {
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+prev, titleStart-prev, options, edits);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
*/
|
||||
|
||||
#include "unicode/std_string.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/casemap.h"
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/uchar.h"
|
||||
|
@ -49,6 +50,7 @@ public:
|
|||
int32_t whichCase,
|
||||
void *iter, const char *localeID, uint32_t options);
|
||||
void TestCasing();
|
||||
void TestTitleOptions();
|
||||
void TestFullCaseFoldingIterator();
|
||||
void TestGreekUpper();
|
||||
void TestLongUpper();
|
||||
|
@ -84,6 +86,7 @@ StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
|
|||
TESTCASE_AUTO(TestCaseConversion);
|
||||
#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION
|
||||
TESTCASE_AUTO(TestCasing);
|
||||
TESTCASE_AUTO(TestTitleOptions);
|
||||
#endif
|
||||
TESTCASE_AUTO(TestFullCaseFoldingIterator);
|
||||
TESTCASE_AUTO(TestGreekUpper);
|
||||
|
@ -593,6 +596,59 @@ StringCaseTest::TestCasing() {
|
|||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
StringCaseTest::TestTitleOptions() {
|
||||
// New options in ICU 60.
|
||||
TestCasingImpl(u"ʻcAt! ʻeTc.", u"ʻCat! ʻetc.", TEST_TITLE,
|
||||
nullptr, "", U_TITLECASE_WHOLE_STRING);
|
||||
TestCasingImpl(u"a ʻCaT. A ʻdOg! ʻeTc.", u"A ʻCaT. A ʻdOg! ʻETc.", TEST_TITLE,
|
||||
nullptr, "", U_TITLECASE_SENTENCES|U_TITLECASE_NO_LOWERCASE);
|
||||
TestCasingImpl(u"49eRs", u"49ers", TEST_TITLE,
|
||||
nullptr, "", U_TITLECASE_WHOLE_STRING);
|
||||
TestCasingImpl(u"«丰(aBc)»", u"«丰(abc)»", TEST_TITLE,
|
||||
nullptr, "", U_TITLECASE_WHOLE_STRING);
|
||||
TestCasingImpl(u"49eRs", u"49Ers", TEST_TITLE,
|
||||
nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_ADJUST_TO_CASED);
|
||||
TestCasingImpl(u"«丰(aBc)»", u"«丰(Abc)»", TEST_TITLE,
|
||||
nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_ADJUST_TO_CASED);
|
||||
TestCasingImpl(u" john. Smith", u" John. Smith", TEST_TITLE,
|
||||
nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_NO_LOWERCASE);
|
||||
TestCasingImpl(u" john. Smith", u" john. smith", TEST_TITLE,
|
||||
nullptr, "", U_TITLECASE_WHOLE_STRING|U_TITLECASE_NO_BREAK_ADJUSTMENT);
|
||||
TestCasingImpl(u"«ijs»", u"«IJs»", TEST_TITLE,
|
||||
nullptr, "nl-BE", U_TITLECASE_WHOLE_STRING);
|
||||
TestCasingImpl(u"«ijs»", u"«İjs»", TEST_TITLE,
|
||||
nullptr, "tr-DE", U_TITLECASE_WHOLE_STRING);
|
||||
|
||||
// Test conflicting settings.
|
||||
// If & when we add more options, then the ORed combinations may become
|
||||
// indistinguishable from valid values.
|
||||
IcuTestErrorCode errorCode(*this, "TestTitleOptions");
|
||||
CaseMap::toTitle("", U_TITLECASE_NO_BREAK_ADJUSTMENT|U_TITLECASE_ADJUST_TO_CASED, nullptr,
|
||||
u"", 0, nullptr, 0, nullptr, errorCode);
|
||||
if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) {
|
||||
errln("CaseMap::toTitle(multiple adjustment options) -> %s not illegal argument",
|
||||
errorCode.errorName());
|
||||
}
|
||||
errorCode.reset();
|
||||
CaseMap::toTitle("", U_TITLECASE_WHOLE_STRING|U_TITLECASE_SENTENCES, nullptr,
|
||||
u"", 0, nullptr, 0, nullptr, errorCode);
|
||||
if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) {
|
||||
errln("CaseMap::toTitle(multiple iterator options) -> %s not illegal argument",
|
||||
errorCode.errorName());
|
||||
}
|
||||
errorCode.reset();
|
||||
LocalPointer<BreakIterator> iter(
|
||||
BreakIterator::createCharacterInstance(Locale::getRoot(), errorCode));
|
||||
CaseMap::toTitle("", U_TITLECASE_WHOLE_STRING, iter.getAlias(),
|
||||
u"", 0, nullptr, 0, nullptr, errorCode);
|
||||
if (errorCode.get() != U_ILLEGAL_ARGUMENT_ERROR) {
|
||||
errln("CaseMap::toTitle(iterator option + iterator) -> %s not illegal argument",
|
||||
errorCode.errorName());
|
||||
}
|
||||
errorCode.reset();
|
||||
}
|
||||
|
||||
void
|
||||
StringCaseTest::TestFullCaseFoldingIterator() {
|
||||
UnicodeString ffi=UNICODE_STRING_SIMPLE("ffi");
|
||||
|
|
|
@ -3,11 +3,15 @@
|
|||
package com.ibm.icu.impl;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.CharacterIterator;
|
||||
import java.util.Locale;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UCharacterCategory;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.Edits;
|
||||
import com.ibm.icu.util.ICUUncheckedIOException;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
public final class CaseMapImpl {
|
||||
/**
|
||||
|
@ -134,11 +138,192 @@ public final class CaseMapImpl {
|
|||
protected int dir; // 0=initial state >0=forward <0=backward
|
||||
}
|
||||
|
||||
public static final int TITLECASE_WHOLE_STRING = 0x20;
|
||||
public static final int TITLECASE_SENTENCES = 0x40;
|
||||
|
||||
/**
|
||||
* Bit mask for the titlecasing iterator options bit field.
|
||||
* Currently only 3 out of 8 values are used:
|
||||
* 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES.
|
||||
* See stringoptions.h.
|
||||
* @internal
|
||||
*/
|
||||
private static final int TITLECASE_ITERATOR_MASK = 0xe0;
|
||||
|
||||
public static final int TITLECASE_ADJUST_TO_CASED = 0x400;
|
||||
|
||||
/**
|
||||
* Bit mask for the titlecasing index adjustment options bit set.
|
||||
* Currently two bits are defined:
|
||||
* TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED.
|
||||
* See stringoptions.h.
|
||||
* @internal
|
||||
*/
|
||||
private static final int TITLECASE_ADJUSTMENT_MASK = 0x600;
|
||||
|
||||
public static int addTitleAdjustmentOption(int options, int newOption) {
|
||||
int adjOptions = options & TITLECASE_ADJUSTMENT_MASK;
|
||||
if (adjOptions !=0 && adjOptions != newOption) {
|
||||
throw new IllegalArgumentException("multiple titlecasing index adjustment options");
|
||||
}
|
||||
return options | newOption;
|
||||
}
|
||||
|
||||
private static final int LNS =
|
||||
(1 << UCharacterCategory.UPPERCASE_LETTER) |
|
||||
(1 << UCharacterCategory.LOWERCASE_LETTER) |
|
||||
(1 << UCharacterCategory.TITLECASE_LETTER) |
|
||||
// Not MODIFIER_LETTER: We count only cased modifier letters.
|
||||
(1 << UCharacterCategory.OTHER_LETTER) |
|
||||
|
||||
(1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
|
||||
(1 << UCharacterCategory.LETTER_NUMBER) |
|
||||
(1 << UCharacterCategory.OTHER_NUMBER) |
|
||||
|
||||
(1 << UCharacterCategory.MATH_SYMBOL) |
|
||||
(1 << UCharacterCategory.CURRENCY_SYMBOL) |
|
||||
(1 << UCharacterCategory.MODIFIER_SYMBOL) |
|
||||
(1 << UCharacterCategory.OTHER_SYMBOL) |
|
||||
|
||||
(1 << UCharacterCategory.PRIVATE_USE);
|
||||
|
||||
private static boolean isLNS(int c) {
|
||||
// Letter, number, symbol,
|
||||
// or a private use code point because those are typically used as letters or numbers.
|
||||
// Consider modifier letters only if they are cased.
|
||||
int gc = UCharacterProperty.INSTANCE.getType(c);
|
||||
return ((1 << gc) & LNS) != 0 ||
|
||||
(gc == UCharacterCategory.MODIFIER_LETTER &&
|
||||
UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE);
|
||||
}
|
||||
|
||||
public static int addTitleIteratorOption(int options, int newOption) {
|
||||
int iterOptions = options & TITLECASE_ITERATOR_MASK;
|
||||
if (iterOptions !=0 && iterOptions != newOption) {
|
||||
throw new IllegalArgumentException("multiple titlecasing iterator options");
|
||||
}
|
||||
return options | newOption;
|
||||
}
|
||||
|
||||
public static BreakIterator getTitleBreakIterator(
|
||||
Locale locale, int options, BreakIterator iter) {
|
||||
options &= TITLECASE_ITERATOR_MASK;
|
||||
if (options != 0 && iter != null) {
|
||||
throw new IllegalArgumentException(
|
||||
"titlecasing iterator option together with an explicit iterator");
|
||||
}
|
||||
if (iter == null) {
|
||||
switch (options) {
|
||||
case 0:
|
||||
iter = BreakIterator.getWordInstance(locale);
|
||||
break;
|
||||
case TITLECASE_WHOLE_STRING:
|
||||
iter = new WholeStringBreakIterator();
|
||||
break;
|
||||
case TITLECASE_SENTENCES:
|
||||
iter = BreakIterator.getSentenceInstance(locale);
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("unknown titlecasing iterator option");
|
||||
}
|
||||
}
|
||||
return iter;
|
||||
}
|
||||
|
||||
public static BreakIterator getTitleBreakIterator(
|
||||
ULocale locale, int options, BreakIterator iter) {
|
||||
options &= TITLECASE_ITERATOR_MASK;
|
||||
if (options != 0 && iter != null) {
|
||||
throw new IllegalArgumentException(
|
||||
"titlecasing iterator option together with an explicit iterator");
|
||||
}
|
||||
if (iter == null) {
|
||||
switch (options) {
|
||||
case 0:
|
||||
iter = BreakIterator.getWordInstance(locale);
|
||||
break;
|
||||
case TITLECASE_WHOLE_STRING:
|
||||
iter = new WholeStringBreakIterator();
|
||||
break;
|
||||
case TITLECASE_SENTENCES:
|
||||
iter = BreakIterator.getSentenceInstance(locale);
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("unknown titlecasing iterator option");
|
||||
}
|
||||
}
|
||||
return iter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Omit unchanged text when case-mapping with Edits.
|
||||
*/
|
||||
public static final int OMIT_UNCHANGED_TEXT = 0x4000;
|
||||
|
||||
private static final class WholeStringBreakIterator extends BreakIterator {
|
||||
private int length;
|
||||
|
||||
private static void notImplemented() {
|
||||
throw new UnsupportedOperationException("should not occur");
|
||||
}
|
||||
|
||||
@Override
|
||||
public int first() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int last() {
|
||||
notImplemented();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next(int n) {
|
||||
notImplemented();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next() {
|
||||
return length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int previous() {
|
||||
notImplemented();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int following(int offset) {
|
||||
notImplemented();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int current() {
|
||||
notImplemented();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CharacterIterator getText() {
|
||||
notImplemented();
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setText(CharacterIterator newText) {
|
||||
length = newText.getEndIndex();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setText(String newText) {
|
||||
length = newText.length();
|
||||
}
|
||||
}
|
||||
|
||||
private static int appendCodePoint(Appendable a, int c) throws IOException {
|
||||
if (c <= Character.MAX_VALUE) {
|
||||
a.append((char)c);
|
||||
|
@ -266,32 +451,33 @@ public final class CaseMapImpl {
|
|||
}
|
||||
|
||||
/*
|
||||
* Unicode 4 & 5 section 3.13 Default Case Operations:
|
||||
*
|
||||
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
|
||||
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
|
||||
* cased character F. If F exists, map F to default_title(F); then map each
|
||||
* subsequent character C to default_lower(C).
|
||||
*
|
||||
* In this implementation, segment [prev..index[ into 3 parts:
|
||||
* a) uncased characters (copy as-is) [prev..titleStart[
|
||||
* b) first case letter (titlecase) [titleStart..titleLimit[
|
||||
* Segment [prev..index[ into 3 parts:
|
||||
* a) skipped characters (copy as-is) [prev..titleStart[
|
||||
* b) first letter (titlecase) [titleStart..titleLimit[
|
||||
* c) subsequent characters (lowercase) [titleLimit..index[
|
||||
*/
|
||||
if(prev<index) {
|
||||
// find and copy uncased characters [prev..titleStart[
|
||||
// Find and copy skipped characters [prev..titleStart[
|
||||
int titleStart=prev;
|
||||
iter.setLimit(index);
|
||||
int c=iter.nextCaseMapCP();
|
||||
if((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0
|
||||
&& UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {
|
||||
// Adjust the titlecasing index (titleStart) to the next cased character.
|
||||
while((c=iter.nextCaseMapCP())>=0
|
||||
&& UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {}
|
||||
if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
|
||||
// Adjust the titlecasing index to the next cased character,
|
||||
// or to the next letter/number/symbol/private use.
|
||||
// Stop with titleStart<titleLimit<=index
|
||||
// if there is a character to be titlecased,
|
||||
// or else stop with titleStart==titleLimit==index.
|
||||
boolean toCased = (options&CaseMapImpl.TITLECASE_ADJUST_TO_CASED) != 0;
|
||||
while ((toCased ?
|
||||
UCaseProps.NONE==UCaseProps.INSTANCE.getType(c) :
|
||||
!CaseMapImpl.isLNS(c)) &&
|
||||
(c=iter.nextCaseMapCP())>=0) {}
|
||||
// If c<0 then we have only uncased characters in [prev..index[
|
||||
// and stopped with titleStart==titleLimit==index.
|
||||
titleStart=iter.getCPStart();
|
||||
appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
|
||||
if (prev < titleStart) {
|
||||
appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
|
||||
}
|
||||
}
|
||||
|
||||
if(titleStart<index) {
|
||||
|
|
|
@ -26,6 +26,7 @@ import com.ibm.icu.impl.locale.AsciiUtil;
|
|||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.CaseMap;
|
||||
import com.ibm.icu.text.DisplayContext;
|
||||
import com.ibm.icu.text.DisplayContext.Type;
|
||||
import com.ibm.icu.text.LocaleDisplayNames;
|
||||
|
@ -86,6 +87,13 @@ public class LocaleDisplayNamesImpl extends LocaleDisplayNames {
|
|||
*/
|
||||
private transient BreakIterator capitalizationBrkIter = null;
|
||||
|
||||
private static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE =
|
||||
CaseMap.toTitle().wholeString().noLowercase();
|
||||
|
||||
private static String toTitleWholeStringNoLowercase(ULocale locale, String s) {
|
||||
return TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(
|
||||
locale.toLocale(), null, s, new StringBuilder(), null).toString();
|
||||
}
|
||||
|
||||
public static LocaleDisplayNames getInstance(ULocale locale, DialectHandling dialectHandling) {
|
||||
synchronized (cache) {
|
||||
|
@ -602,9 +610,12 @@ public class LocaleDisplayNamesImpl extends LocaleDisplayNames {
|
|||
ULocale minimized = ULocale.minimizeSubtags(modified, ULocale.Minimize.FAVOR_SCRIPT);
|
||||
String tempName = modified.getDisplayName(locale);
|
||||
boolean titlecase = capContext == DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU;
|
||||
String nameInDisplayLocale = titlecase ? UCharacter.toTitleFirst(locale, tempName) : tempName;
|
||||
String nameInDisplayLocale =
|
||||
titlecase ? toTitleWholeStringNoLowercase(locale, tempName) : tempName;
|
||||
tempName = modified.getDisplayName(modified);
|
||||
String nameInSelf = capContext == DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU ? UCharacter.toTitleFirst(modified, tempName) : tempName;
|
||||
String nameInSelf = capContext ==
|
||||
DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU ?
|
||||
toTitleWholeStringNoLowercase(modified, tempName) : tempName;
|
||||
return new UiListItem(minimized, modified, nameInDisplayLocale, nameInSelf);
|
||||
}
|
||||
|
||||
|
|
|
@ -1124,9 +1124,15 @@ public final class UCaseProps {
|
|||
/**
|
||||
* Bit mask for getting just the options from a string compare options word
|
||||
* that are relevant for case folding (of a single string or code point).
|
||||
*
|
||||
* Currently only bit 0 for FOLD_CASE_EXCLUDE_SPECIAL_I.
|
||||
* It is conceivable that at some point we might use one more bit for using uppercase sharp s.
|
||||
* It is conceivable that at some point we might want the option to use only simple case foldings
|
||||
* when operating on strings.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
private static final int FOLD_CASE_OPTIONS_MASK = 0xff;
|
||||
private static final int FOLD_CASE_OPTIONS_MASK = 7;
|
||||
|
||||
/* return the simple case folding mapping for c */
|
||||
public final int fold(int c, int options) {
|
||||
|
|
|
@ -5185,22 +5185,14 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
|||
*/
|
||||
public static String toTitleCase(ULocale locale, String str,
|
||||
BreakIterator titleIter, int options) {
|
||||
if(titleIter == null) {
|
||||
if (locale == null) {
|
||||
locale = ULocale.getDefault();
|
||||
}
|
||||
titleIter = BreakIterator.getWordInstance(locale);
|
||||
if (titleIter == null && locale == null) {
|
||||
locale = ULocale.getDefault();
|
||||
}
|
||||
titleIter = CaseMapImpl.getTitleBreakIterator(locale, options, titleIter);
|
||||
titleIter.setText(str);
|
||||
return toTitleCase(getCaseLocale(locale), options, titleIter, str);
|
||||
}
|
||||
|
||||
|
||||
private static final int BREAK_MASK =
|
||||
(1<<UCharacterCategory.DECIMAL_DIGIT_NUMBER)
|
||||
| (1<<UCharacterCategory.OTHER_LETTER)
|
||||
| (1<<UCharacterCategory.MODIFIER_LETTER);
|
||||
|
||||
/**
|
||||
* Return a string with just the first word titlecased, for menus and UI, etc. This does not affect most of the string,
|
||||
* and sometimes has no effect at all; the original string is returned whenever casing
|
||||
|
@ -5225,49 +5217,14 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
|||
*/
|
||||
@Deprecated
|
||||
public static String toTitleFirst(ULocale locale, String str) {
|
||||
int c = 0;
|
||||
for (int i = 0; i < str.length(); i += UCharacter.charCount(c)) {
|
||||
c = UCharacter.codePointAt(str, i);
|
||||
int propertyMask = UCharacter.getIntPropertyValue(c, UProperty.GENERAL_CATEGORY_MASK);
|
||||
if ((propertyMask & BREAK_MASK) != 0) { // handle "49ers", initial CJK
|
||||
break;
|
||||
}
|
||||
if (UCaseProps.INSTANCE.getType(c) == UCaseProps.NONE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// we now have the first cased character
|
||||
// What we really want is something like:
|
||||
// String titled = UCharacter.toTitleCase(locale, str, i, outputCharsTaken);
|
||||
// That is, just give us the titlecased string, for the locale, at i and following,
|
||||
// and tell us how many characters are replaced.
|
||||
// The following won't work completely: it needs some more substantial changes to UCaseProps
|
||||
|
||||
String substring = str.substring(i, i+UCharacter.charCount(c));
|
||||
String titled = UCharacter.toTitleCase(locale, substring, BreakIterator.getSentenceInstance(locale), 0);
|
||||
|
||||
// skip if no change
|
||||
if (titled.codePointAt(0) == c) {
|
||||
// Using 0 is safe, since any change in titling will not have first initial character
|
||||
break;
|
||||
}
|
||||
StringBuilder result = new StringBuilder(str.length()).append(str, 0, i);
|
||||
int startOfSuffix;
|
||||
|
||||
// handle dutch, but check first for 'i', since that's faster. Should be built into UCaseProps.
|
||||
|
||||
if (c == 'i' && locale.getLanguage().equals("nl") && i < str.length() && str.charAt(i+1) == 'j') {
|
||||
result.append("IJ");
|
||||
startOfSuffix = 2;
|
||||
} else {
|
||||
result.append(titled);
|
||||
startOfSuffix = i + UCharacter.charCount(c);
|
||||
}
|
||||
|
||||
// add the remainder, and return
|
||||
return result.append(str, startOfSuffix, str.length()).toString();
|
||||
}
|
||||
return str; // no change
|
||||
return toTitleCase(locale, str, null,
|
||||
CaseMapImpl.TITLECASE_WHOLE_STRING|TITLECASE_NO_LOWERCASE);
|
||||
// TODO: Remove this function.
|
||||
// Move something like the following helper function into CLDR.
|
||||
// private static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE =
|
||||
// CaseMap.toTitle().wholeString().noLowercase();
|
||||
// return TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(
|
||||
// locale.toLocale(), null, str, new StringBuilder(), null).toString();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -5295,9 +5252,10 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
|||
public static String toTitleCase(Locale locale, String str,
|
||||
BreakIterator titleIter,
|
||||
int options) {
|
||||
if(titleIter == null) {
|
||||
titleIter = BreakIterator.getWordInstance(locale);
|
||||
if (titleIter == null && locale == null) {
|
||||
locale = Locale.getDefault();
|
||||
}
|
||||
titleIter = CaseMapImpl.getTitleBreakIterator(locale, options, titleIter);
|
||||
titleIter.setText(str);
|
||||
return toTitleCase(getCaseLocale(locale), options, titleIter, str);
|
||||
}
|
||||
|
|
|
@ -174,6 +174,42 @@ public abstract class CaseMap {
|
|||
private static final Title OMIT_UNCHANGED = new Title(CaseMapImpl.OMIT_UNCHANGED_TEXT);
|
||||
private Title(int opt) { super(opt); }
|
||||
|
||||
/**
|
||||
* Returns an instance that behaves like this one but
|
||||
* titlecases the string as a whole rather than each word.
|
||||
* (Titlecases only the character at index 0, possibly adjusted.)
|
||||
*
|
||||
* <p>It is an error to specify multiple titlecasing iterator options together,
|
||||
* including both an option and an explicit BreakIterator.
|
||||
*
|
||||
* @return an options object with this option.
|
||||
* @see #adjustToCased()
|
||||
* @draft ICU 60
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public Title wholeString() {
|
||||
return new Title(CaseMapImpl.addTitleIteratorOption(
|
||||
internalOptions, CaseMapImpl.TITLECASE_WHOLE_STRING));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an instance that behaves like this one but
|
||||
* titlecases sentences rather than words.
|
||||
* (Titlecases only the first character of each sentence, possibly adjusted.)
|
||||
*
|
||||
* <p>It is an error to specify multiple titlecasing iterator options together,
|
||||
* including both an option and an explicit BreakIterator.
|
||||
*
|
||||
* @return an options object with this option.
|
||||
* @see #adjustToCased()
|
||||
* @draft ICU 60
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public Title sentences() {
|
||||
return new Title(CaseMapImpl.addTitleIteratorOption(
|
||||
internalOptions, CaseMapImpl.TITLECASE_SENTENCES));
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* @draft ICU 59
|
||||
|
@ -191,12 +227,14 @@ public abstract class CaseMap {
|
|||
* Returns an instance that behaves like this one but
|
||||
* does not lowercase non-initial parts of words when titlecasing.
|
||||
*
|
||||
* <p>By default, titlecasing will titlecase the first cased character
|
||||
* of a word and lowercase all other characters.
|
||||
* <p>By default, titlecasing will titlecase the character at each
|
||||
* (possibly adjusted) BreakIterator index and
|
||||
* lowercase all other characters up to the next iterator index.
|
||||
* With this option, the other characters will not be modified.
|
||||
*
|
||||
* @return an options object with this option.
|
||||
* @see UCharacter#TITLECASE_NO_LOWERCASE
|
||||
* @see #adjustToCased()
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
|
@ -204,22 +242,16 @@ public abstract class CaseMap {
|
|||
return new Title(internalOptions | UCharacter.TITLECASE_NO_LOWERCASE);
|
||||
}
|
||||
|
||||
// TODO: update references to the Unicode Standard for recent version
|
||||
/**
|
||||
* Returns an instance that behaves like this one but
|
||||
* does not adjust the titlecasing indexes from BreakIterator::next() indexes;
|
||||
* does not adjust the titlecasing BreakIterator indexes;
|
||||
* titlecases exactly the characters at breaks from the iterator.
|
||||
*
|
||||
* <p>By default, titlecasing will take each break iterator index,
|
||||
* adjust it by looking for the next cased character, and titlecase that one.
|
||||
* Other characters are lowercased.
|
||||
* adjust it to the next relevant character (see {@link #adjustToCased()}),
|
||||
* and titlecase that one.
|
||||
*
|
||||
* <p>This follows Unicode 4 & 5 section 3.13 Default Case Operations:
|
||||
*
|
||||
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
|
||||
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
|
||||
* cased character F. If F exists, map F to default_title(F); then map each
|
||||
* subsequent character C to default_lower(C).
|
||||
* <p>Other characters are lowercased.
|
||||
*
|
||||
* @return an options object with this option.
|
||||
* @see UCharacter#TITLECASE_NO_BREAK_ADJUSTMENT
|
||||
|
@ -227,7 +259,33 @@ public abstract class CaseMap {
|
|||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public Title noBreakAdjustment() {
|
||||
return new Title(internalOptions | UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT);
|
||||
return new Title(CaseMapImpl.addTitleAdjustmentOption(
|
||||
internalOptions, UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an instance that behaves like this one but
|
||||
* adjusts each titlecasing BreakIterator index to the next cased character.
|
||||
* (See the Unicode Standard, chapter 3, Default Case Conversion, R3 toTitlecase(X).)
|
||||
*
|
||||
* <p>This used to be the default index adjustment in ICU.
|
||||
* Since ICU 60, the default index adjustment is to the next character that is
|
||||
* a letter, number, symbol, or private use code point.
|
||||
* (Uncased modifier letters are skipped.)
|
||||
* The difference in behavior is small for word titlecasing,
|
||||
* but the new adjustment is much better for whole-string and sentence titlecasing:
|
||||
* It yields "49ers" and "«丰(abc)»" instead of "49Ers" and "«丰(Abc)»".
|
||||
*
|
||||
* <p>It is an error to specify multiple titlecasing adjustment options together.
|
||||
*
|
||||
* @return an options object with this option.
|
||||
* @see #noBreakAdjustment()
|
||||
* @draft ICU 60
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public Title adjustToCased() {
|
||||
return new Title(CaseMapImpl.addTitleAdjustmentOption(
|
||||
internalOptions, CaseMapImpl.TITLECASE_ADJUST_TO_CASED));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -259,9 +317,10 @@ public abstract class CaseMap {
|
|||
*/
|
||||
public <A extends Appendable> A apply(
|
||||
Locale locale, BreakIterator iter, CharSequence src, A dest, Edits edits) {
|
||||
if (iter == null) {
|
||||
iter = BreakIterator.getWordInstance(locale);
|
||||
if (iter == null && locale == null) {
|
||||
locale = Locale.getDefault();
|
||||
}
|
||||
iter = CaseMapImpl.getTitleBreakIterator(locale, internalOptions, iter);
|
||||
iter.setText(src.toString());
|
||||
return CaseMapImpl.toTitle(
|
||||
getCaseLocale(locale), internalOptions, iter, src, dest, edits);
|
||||
|
|
|
@ -343,6 +343,63 @@ public final class UCharacterCaseTest extends TestFmwk
|
|||
}
|
||||
}
|
||||
|
||||
// Not a @Test. See ICU4C intltest strcase.cpp TestCasingImpl().
|
||||
void TestCasingImpl(String input, String output, CaseMap.Title toTitle, Locale locale) {
|
||||
String result = toTitle.apply(locale, null, input, new StringBuilder(), null).toString();
|
||||
assertEquals("toTitle(" + input + ')', output, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestTitleOptions() {
|
||||
Locale root = Locale.ROOT;
|
||||
// New options in ICU 60.
|
||||
TestCasingImpl("ʻcAt! ʻeTc.", "ʻCat! ʻetc.",
|
||||
CaseMap.toTitle().wholeString(), root);
|
||||
TestCasingImpl("a ʻCaT. A ʻdOg! ʻeTc.", "A ʻCaT. A ʻdOg! ʻETc.",
|
||||
CaseMap.toTitle().sentences().noLowercase(), root);
|
||||
TestCasingImpl("49eRs", "49ers",
|
||||
CaseMap.toTitle().wholeString(), root);
|
||||
TestCasingImpl("«丰(aBc)»", "«丰(abc)»",
|
||||
CaseMap.toTitle().wholeString(), root);
|
||||
TestCasingImpl("49eRs", "49Ers",
|
||||
CaseMap.toTitle().wholeString().adjustToCased(), root);
|
||||
TestCasingImpl("«丰(aBc)»", "«丰(Abc)»",
|
||||
CaseMap.toTitle().wholeString().adjustToCased(), root);
|
||||
TestCasingImpl(" john. Smith", " John. Smith",
|
||||
CaseMap.toTitle().wholeString().noLowercase(), root);
|
||||
TestCasingImpl(" john. Smith", " john. smith",
|
||||
CaseMap.toTitle().wholeString().noBreakAdjustment(), root);
|
||||
TestCasingImpl("«ijs»", "«IJs»",
|
||||
CaseMap.toTitle().wholeString(), new Locale("nl", "BE"));
|
||||
TestCasingImpl("«ijs»", "«İjs»",
|
||||
CaseMap.toTitle().wholeString(), new Locale("tr", "DE"));
|
||||
|
||||
// Test conflicting settings.
|
||||
// If & when we add more options, then the ORed combinations may become
|
||||
// indistinguishable from valid values.
|
||||
try {
|
||||
CaseMap.toTitle().noBreakAdjustment().adjustToCased().
|
||||
apply(root, null, "", new StringBuilder(), null);
|
||||
fail("CaseMap.toTitle(multiple adjustment options) " +
|
||||
"did not throw an IllegalArgumentException");
|
||||
} catch(IllegalArgumentException expected) {
|
||||
}
|
||||
try {
|
||||
CaseMap.toTitle().wholeString().sentences().
|
||||
apply(root, null, "", new StringBuilder(), null);
|
||||
fail("CaseMap.toTitle(multiple iterator options) " +
|
||||
"did not throw an IllegalArgumentException");
|
||||
} catch(IllegalArgumentException expected) {
|
||||
}
|
||||
BreakIterator iter = BreakIterator.getCharacterInstance(root);
|
||||
try {
|
||||
CaseMap.toTitle().wholeString().apply(root, iter, "", new StringBuilder(), null);
|
||||
fail("CaseMap.toTitle(iterator option + iterator) " +
|
||||
"did not throw an IllegalArgumentException");
|
||||
} catch(IllegalArgumentException expected) {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestDutchTitle() {
|
||||
ULocale LOC_DUTCH = new ULocale("nl");
|
||||
|
|
|
@ -133,7 +133,7 @@ public class TransliteratorTest extends TestFmwk {
|
|||
Transliterator hanLatin = Transliterator.getInstance("Han-Latin");
|
||||
assertTransform("Transform", "z\u00E0o Unicode", hanLatin, "\u9020Unicode");
|
||||
assertTransform("Transform", "z\u00E0i chu\u00E0ng z\u00E0o Unicode zh\u012B qi\u00E1n", hanLatin, "\u5728\u5275\u9020Unicode\u4E4B\u524D");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestRegistry() {
|
||||
|
@ -510,15 +510,19 @@ public class TransliteratorTest extends TestFmwk {
|
|||
|
||||
Transliterator hex = Transliterator.getInstance("Any-Hex");
|
||||
hex.setFilter(new UnicodeFilter() {
|
||||
@Override
|
||||
public boolean contains(int c) {
|
||||
return c != 'c';
|
||||
}
|
||||
@Override
|
||||
public String toPattern(boolean escapeUnprintable) {
|
||||
return "";
|
||||
}
|
||||
@Override
|
||||
public boolean matchesIndexValue(int v) {
|
||||
return false;
|
||||
}
|
||||
@Override
|
||||
public void addMatchSetTo(UnicodeSet toUnionTo) {}
|
||||
});
|
||||
String s = "abcde";
|
||||
|
@ -1561,6 +1565,7 @@ public class TransliteratorTest extends TestFmwk {
|
|||
public NameableNullTrans(String id) {
|
||||
super(id, null);
|
||||
}
|
||||
@Override
|
||||
protected void handleTransliterate(Replaceable text,
|
||||
Position offsets, boolean incremental) {
|
||||
offsets.start = offsets.limit;
|
||||
|
@ -1570,6 +1575,7 @@ public class TransliteratorTest extends TestFmwk {
|
|||
public TestFact(String theID) {
|
||||
id = theID;
|
||||
}
|
||||
@Override
|
||||
public Transliterator getInstance(String ignoredID) {
|
||||
return new NameableNullTrans(id);
|
||||
}
|
||||
|
@ -1873,8 +1879,8 @@ public class TransliteratorTest extends TestFmwk {
|
|||
t.setFilter(new UnicodeSet("[:Ll:]"));
|
||||
expect(t, "aAaA", "bAbA");
|
||||
} finally {
|
||||
Transliterator.unregister("a_to_A");
|
||||
Transliterator.unregister("A_to_b");
|
||||
Transliterator.unregister("a_to_A");
|
||||
Transliterator.unregister("A_to_b");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2731,6 +2737,7 @@ public class TransliteratorTest extends TestFmwk {
|
|||
//System.out.println("Registering: " + ID + ", " + t.toRules(true));
|
||||
Transliterator.registerFactory(ID, singleton);
|
||||
}
|
||||
@Override
|
||||
public Transliterator getInstance(String ID) {
|
||||
return (Transliterator) m.get(ID);
|
||||
}
|
||||
|
@ -2751,8 +2758,17 @@ public class TransliteratorTest extends TestFmwk {
|
|||
String casefold = UCharacter.foldCase(s, true);
|
||||
assertEquals("Casefold", casefold, toCasefold.transform(s));
|
||||
|
||||
String title = UCharacter.toTitleCase(ULocale.ROOT, s, null);
|
||||
assertEquals("Title", title, toTitle.transform(s));
|
||||
if (i != 0x0345) {
|
||||
// ICU 60 changes the default titlecasing index adjustment.
|
||||
// For word breaks it is mostly the same as before,
|
||||
// but it is different for the iota subscript (the only cased combining mark).
|
||||
// This should be ok because the iota subscript is not supposed to appear
|
||||
// at the start of a word.
|
||||
// The title Transliterator is far below feature parity with the
|
||||
// UCharacter and CaseMap titlecasing functions.
|
||||
String title = UCharacter.toTitleCase(ULocale.ROOT, s, null);
|
||||
assertEquals("Title", title, toTitle.transform(s));
|
||||
}
|
||||
|
||||
String upper = UCharacter.toUpperCase(ULocale.ROOT, s);
|
||||
assertEquals("Upper", upper, toUpper.transform(s));
|
||||
|
@ -3008,6 +3024,7 @@ public class TransliteratorTest extends TestFmwk {
|
|||
Transliterator.registerFactory(ID, singleton);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Transliterator getInstance(String ID) {
|
||||
return (Transliterator) m.get(new CaseInsensitiveString(ID));
|
||||
}
|
||||
|
@ -3040,7 +3057,7 @@ public class TransliteratorTest extends TestFmwk {
|
|||
*/
|
||||
@Test
|
||||
public void TestAny() {
|
||||
UnicodeSet alphabetic = (UnicodeSet) new UnicodeSet("[:alphabetic:]").freeze();
|
||||
UnicodeSet alphabetic = new UnicodeSet("[:alphabetic:]").freeze();
|
||||
StringBuffer testString = new StringBuffer();
|
||||
for (int i = 0; i < UScript.CODE_LIMIT; ++i) {
|
||||
UnicodeSet sample = new UnicodeSet().applyPropertyAlias("script", UScript.getShortName(i)).retainAll(alphabetic);
|
||||
|
@ -3142,7 +3159,7 @@ public class TransliteratorTest extends TestFmwk {
|
|||
|
||||
// add all the trail characters
|
||||
if (!nonStarters.containsSome(trailString)) {
|
||||
continue;
|
||||
continue;
|
||||
}
|
||||
UnicodeSet trailSet = leadToTrail.get(first);
|
||||
if (trailSet == null) {
|
||||
|
@ -3190,7 +3207,7 @@ public class TransliteratorTest extends TestFmwk {
|
|||
// disorderedMarks.add(s);
|
||||
// disorderedMarks.add(nfc.normalize(s));
|
||||
// addDerivedStrings(nfc, disorderedMarks, s);
|
||||
// }
|
||||
// }
|
||||
// s = nfd.getDecomposition(i);
|
||||
// if (s != null) {
|
||||
// disorderedMarks.add(s);
|
||||
|
@ -3292,6 +3309,10 @@ public class TransliteratorTest extends TestFmwk {
|
|||
addSourceTarget(s, empiricalSource, t, empiricalTarget);
|
||||
}
|
||||
}
|
||||
if (rule.contains("title")) {
|
||||
// See the comment in TestCasing() about the iota subscript.
|
||||
empiricalSource.remove(0x345);
|
||||
}
|
||||
assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK);
|
||||
assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK);
|
||||
}
|
||||
|
@ -3336,8 +3357,8 @@ public class TransliteratorTest extends TestFmwk {
|
|||
String direction = t == t0 ? "FORWARD\t" : "REVERSE\t";
|
||||
targetIndex++;
|
||||
UnicodeSet expectedTarget = testPair.length <= targetIndex ? expectedSource
|
||||
: testPair[targetIndex] == null ? expectedSource
|
||||
: testPair[targetIndex].length() == 0 ? expectedSource
|
||||
: testPair[targetIndex] == null ? expectedSource
|
||||
: testPair[targetIndex].length() == 0 ? expectedSource
|
||||
: new UnicodeSet(testPair[targetIndex]);
|
||||
ok = assertEquals(direction + "getSource\t\"" + test + '"', expectedSource, source);
|
||||
if (!ok) { // for debugging
|
||||
|
@ -3410,7 +3431,7 @@ public class TransliteratorTest extends TestFmwk {
|
|||
};
|
||||
for (String[] row : startTests) {
|
||||
int actual = findSharedStartLength(row[1], row[2]);
|
||||
assertEquals("findSharedStartLength(" + row[1] + "," + row[2] + ")",
|
||||
assertEquals("findSharedStartLength(" + row[1] + "," + row[2] + ")",
|
||||
Integer.parseInt(row[0]),
|
||||
actual);
|
||||
}
|
||||
|
@ -3423,8 +3444,8 @@ public class TransliteratorTest extends TestFmwk {
|
|||
};
|
||||
for (String[] row : endTests) {
|
||||
int actual = findSharedEndLength(row[1], row[2]);
|
||||
assertEquals("findSharedEndLength(" + row[1] + "," + row[2] + ")",
|
||||
Integer.parseInt(row[0]),
|
||||
assertEquals("findSharedEndLength(" + row[1] + "," + row[2] + ")",
|
||||
Integer.parseInt(row[0]),
|
||||
actual);
|
||||
}
|
||||
}
|
||||
|
@ -3916,7 +3937,7 @@ the ::BEGIN/::END stuff)
|
|||
@Test
|
||||
public void TestThai() {
|
||||
Transliterator tr = Transliterator.getInstance("Any-Latin", Transliterator.FORWARD);
|
||||
String thaiText =
|
||||
String thaiText =
|
||||
"\u0e42\u0e14\u0e22\u0e1e\u0e37\u0e49\u0e19\u0e10\u0e32\u0e19\u0e41\u0e25\u0e49\u0e27, \u0e04\u0e2d" +
|
||||
"\u0e21\u0e1e\u0e34\u0e27\u0e40\u0e15\u0e2d\u0e23\u0e4c\u0e08\u0e30\u0e40\u0e01\u0e35\u0e48\u0e22" +
|
||||
"\u0e27\u0e02\u0e49\u0e2d\u0e07\u0e01\u0e31\u0e1a\u0e40\u0e23\u0e37\u0e48\u0e2d\u0e07\u0e02\u0e2d" +
|
||||
|
@ -3948,7 +3969,7 @@ the ::BEGIN/::END stuff)
|
|||
"\u0e17\u0e04\u0e19\u0e34\u0e04\u0e17\u0e35\u0e48\u0e43\u0e0a\u0e49\u0e01\u0e31\u0e19\u0e2d\u0e22" +
|
||||
"\u0e39\u0e48\u0e17\u0e31\u0e48\u0e27\u0e44\u0e1b.";
|
||||
|
||||
String latinText =
|
||||
String latinText =
|
||||
"doy ph\u1ee5\u0304\u0302n \u1e6d\u0304h\u0101n l\u00e6\u0302w, khxmphiwtexr\u0312 ca ke\u012b\u0300" +
|
||||
"ywk\u0304\u0125xng k\u1ea1b re\u1ee5\u0304\u0300xng k\u0304hxng t\u1ea1wlek\u0304h. khxmphiwtexr" +
|
||||
"\u0312 c\u1ea1d k\u0115b t\u1ea1w x\u1ea1ks\u0304\u02b9r l\u00e6a x\u1ea1kk\u0304h ra x\u1ee5\u0304" +
|
||||
|
@ -4041,6 +4062,7 @@ the ::BEGIN/::END stuff)
|
|||
this.expectedData = expectedData;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
errorMsg = null;
|
||||
StringBuffer inBuf = new StringBuffer(testData);
|
||||
|
|
Loading…
Add table
Reference in a new issue