ICU-4935 add U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT, ucasemap_get/setBreakIterator, ucasemap_toTitle, ucasemap_utf8ToTitle, ucasemap_utf8FoldCase, UnicodeString::toTitle(...options)

X-SVN-Rev: 22170
This commit is contained in:
Markus Scherer 2007-07-27 06:37:08 +00:00
parent 2531662042
commit c12f6712f9
11 changed files with 1031 additions and 278 deletions

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2005, International Business Machines
* Copyright (C) 2005-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -20,6 +20,10 @@
#include "unicode/uloc.h"
#include "unicode/ustring.h"
#include "unicode/ucasemap.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/ubrk.h"
#include "unicode/utext.h"
#endif
#include "cmemory.h"
#include "cstring.h"
#include "ucase.h"
@ -27,13 +31,6 @@
/* UCaseMap service object -------------------------------------------------- */
struct UCaseMap {
const UCaseProps *csp;
char locale[32];
int32_t locCache;
uint32_t options;
};
U_DRAFT UCaseMap * U_EXPORT2
ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
UCaseMap *csm;
@ -62,6 +59,9 @@ ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
/*
 * Release a UCaseMap service object.
 * A NULL pointer is accepted and ignored. When break iteration is compiled
 * in, the break iterator held by the object is closed before the object's
 * memory is freed.
 */
U_DRAFT void U_EXPORT2
ucasemap_close(UCaseMap *csm) {
    if(csm==NULL) {
        return; /* nothing to release */
    }
#if !UCONFIG_NO_BREAK_ITERATION
    /* close the iterator first: it is reached through csm */
    ubrk_close(csm->iter);
#endif
    uprv_free(csm);
}
@ -106,8 +106,25 @@ ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
csm->options=options;
}
#if !UCONFIG_NO_BREAK_ITERATION
/*
 * Return the break iterator currently stored in the UCaseMap.
 * The pointer is returned as-is (it may be NULL if none has been stored yet);
 * the UCaseMap keeps ownership.
 */
U_DRAFT const UBreakIterator * U_EXPORT2
ucasemap_getBreakIterator(const UCaseMap *csm) {
    const UBreakIterator *titleIter=csm->iter;
    return titleIter;
}
/*
 * Replace the break iterator used for titlecasing.
 *
 * The UCaseMap adopts iterToAdopt: the previously stored iterator is closed
 * and the new one will be closed by a later ucasemap_setBreakIterator() or
 * by ucasemap_close().
 *
 * @param csm         UCaseMap service object.
 * @param iterToAdopt Break iterator to adopt; may be NULL to drop the current one.
 * @param pErrorCode  ICU error code. If it is NULL or already indicates a
 *                    failure, the function does nothing: the stored iterator
 *                    is kept and iterToAdopt is NOT adopted.
 */
U_DRAFT void U_EXPORT2
ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode) {
    /* follow the usual ICU convention: no work on a pre-existing failure */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }
    /*
     * Re-setting the iterator that is already owned must not close it:
     * that would leave csm->iter pointing at a released object.
     */
    if(csm->iter!=iterToAdopt) {
        ubrk_close(csm->iter);
        csm->iter=iterToAdopt;
    }
}
#endif
/* UTF-8 string case mappings ----------------------------------------------- */
/* TODO(markus): Move to a new, separate utf8case.c file. */
/* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
static U_INLINE int32_t
appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
@ -146,7 +163,7 @@ appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
(char *)(dest+destIndex), destCapacity-destIndex, &destLength,
s, length,
&errorCode);
destIndex+=length;
destIndex+=destLength;
/* we might have an overflow, but we know the actual length */
}
} else {
@ -159,7 +176,7 @@ appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
NULL, 0, &destLength,
s, length,
&errorCode);
destIndex+=length;
destIndex+=destLength;
}
}
return destIndex;
@ -197,12 +214,6 @@ utf8_caseContextIterator(void *context, int8_t dir) {
return U_SENTINEL;
}
typedef int32_t U_CALLCONV
UCaseMapFull(const UCaseProps *csp, UChar32 c,
UCaseContextIterator *iter, void *context,
const UChar **pString,
const char *locale, int32_t *locCache);
/*
* Case-maps [srcStart..srcLimit[ but takes
* context [0..srcLength[ into account.
@ -214,7 +225,7 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map,
int32_t srcStart, int32_t srcLimit,
UErrorCode *pErrorCode) {
const UChar *s;
UChar32 c;
UChar32 c, c2;
int32_t srcIndex, destIndex;
int32_t locCache;
@ -227,8 +238,192 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map,
csc->cpStart=srcIndex;
U8_NEXT(src, srcIndex, srcLimit, c);
csc->cpLimit=srcIndex;
if(c<0) {
int32_t i=csc->cpStart;
while(destIndex<destCapacity && i<srcIndex) {
dest[destIndex++]=src[i++];
}
continue;
}
c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
destIndex=appendResult(dest, destIndex, destCapacity, c, s);
if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
/* fast path version of appendResult() for ASCII results */
dest[destIndex++]=(uint8_t)c2;
} else {
destIndex=appendResult(dest, destIndex, destCapacity, c, s);
}
}
if(destIndex>destCapacity) {
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
return destIndex;
}
#if !UCONFIG_NO_BREAK_ITERATION
/*
* Internal titlecasing function for UTF-8 text.
*
* Titlecases src[0..srcLength[ into dest, segment by segment, where the
* segment boundaries come from the break iterator stored in csm->iter.
* If csm->iter is NULL, a UBRK_WORD iterator for csm->locale is opened and
* stored in csm->iter (which is why csm is non-const here).
*
* csm->options is consulted for U_TITLECASE_NO_BREAK_ADJUSTMENT and
* U_TITLECASE_NO_LOWERCASE.
*
* @param csm          Case map object; csp, locale, locCache, options and iter are used.
* @param dest         Output buffer; only written while the output still fits.
* @param destCapacity Capacity of dest, in bytes.
* @param src          UTF-8 source bytes.
* @param csc          Case context handed to utf8_caseContextIterator; cpStart/cpLimit
*                     are set here for the character being titlecased.
* @param srcLength    Number of source bytes.
* @param pErrorCode   ICU error code; set to U_BUFFER_OVERFLOW_ERROR if the
*                     result length exceeds destCapacity.
* @return The length of the result (may exceed destCapacity), or 0 if opening
*         the UText or setting the iterator's text failed.
*/
static int32_t
_toTitle(UCaseMap *csm,
uint8_t *dest, int32_t destCapacity,
const uint8_t *src, UCaseContext *csc,
int32_t srcLength,
UErrorCode *pErrorCode) {
/* stack-based UText that wraps the UTF-8 source for the break iterator */
UText utext=UTEXT_INITIALIZER;
const UChar *s;
UChar32 c;
int32_t prev, titleStart, titleLimit, index, destIndex, length;
UBool isFirstIndex;
utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return 0;
}
/* lazily create the default word break iterator; it stays in csm for reuse */
if(csm->iter==NULL) {
csm->iter=ubrk_open(UBRK_WORD, csm->locale,
NULL, 0,
pErrorCode);
}
/*
* NOTE(review): ubrk_setUText() is also reached when ubrk_open() above has just
* failed; this presumably relies on it doing nothing for a failed error code --
* confirm against the ubrk implementation.
*/
ubrk_setUText(csm->iter, &utext, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
utext_close(&utext);
return 0;
}
/* set up local variables */
destIndex=0;
prev=0;
isFirstIndex=TRUE;
/* titlecasing loop */
while(prev<srcLength) {
/* find next index where to titlecase */
if(isFirstIndex) {
isFirstIndex=FALSE;
index=ubrk_first(csm->iter);
} else {
index=ubrk_next(csm->iter);
}
/* clamp: treat "no more boundaries" or an out-of-range index as end of text */
if(index==UBRK_DONE || index>srcLength) {
index=srcLength;
}
/*
* Unicode 4 & 5 section 3.13 Default Case Operations:
*
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
* cased character F. If F exists, map F to default_title(F); then map each
* subsequent character C to default_lower(C).
*
* In this implementation, segment [prev..index[ into 3 parts:
* a) uncased characters (copy as-is) [prev..titleStart[
* b) first case letter (titlecase) [titleStart..titleLimit[
* c) subsequent characters (lowercase) [titleLimit..index[
*/
/* an empty segment (index==prev) is skipped */
if(prev<index) {
/* find and copy uncased characters [prev..titleStart[ */
titleStart=titleLimit=prev;
U8_NEXT(src, titleLimit, index, c);
/*
* With U_TITLECASE_NO_BREAK_ADJUSTMENT set, the first character of the
* segment is titlecased regardless of its case type.
*/
if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
/* Adjust the titlecasing index (titleStart) to the next cased character. */
for(;;) {
titleStart=titleLimit;
if(titleLimit==index) {
/*
* only uncased characters in [prev..index[
* stop with titleStart==titleLimit==index
*/
break;
}
U8_NEXT(src, titleLimit, index, c);
if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
break; /* cased letter at [titleStart..titleLimit[ */
}
}
/* copy the skipped uncased prefix; always count it, even if it no longer fits */
length=titleStart-prev;
if(length>0) {
if((destIndex+length)<=destCapacity) {
uprv_memcpy(dest+destIndex, src+prev, length);
}
destIndex+=length;
}
}
if(titleStart<titleLimit) {
/* titlecase c which is from [titleStart..titleLimit[ */
csc->cpStart=titleStart;
csc->cpLimit=titleLimit;
c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &csm->locCache);
destIndex=appendResult(dest, destIndex, destCapacity, c, s);
/* lowercase [titleLimit..index[ */
if(titleLimit<index) {
if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
/* Normal operation: Lowercase the rest of the word. */
/* _caseMap() writes relative to dest+destIndex; its return value is added to destIndex */
destIndex+=
_caseMap(
csm, ucase_toFullLower,
dest+destIndex, destCapacity-destIndex,
src, csc,
titleLimit, index,
pErrorCode);
} else {
/* Optionally just copy the rest of the word unchanged. */
length=index-titleLimit;
if((destIndex+length)<=destCapacity) {
uprv_memcpy(dest+destIndex, src+titleLimit, length);
}
destIndex+=length;
}
}
}
}
prev=index;
}
/* destIndex kept counting past the capacity, so it is the full required length */
if(destIndex>destCapacity) {
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
utext_close(&utext);
return destIndex;
}
#endif
U_CFUNC int32_t
utf8_foldCase(const UCaseProps *csp,
uint8_t *dest, int32_t destCapacity,
const uint8_t *src, int32_t srcLength,
uint32_t options,
UErrorCode *pErrorCode) {
int32_t srcIndex, destIndex;
const UChar *s;
UChar32 c, c2;
int32_t start;
/* case mapping loop */
srcIndex=destIndex=0;
while(srcIndex<srcLength) {
start=srcIndex;
U8_NEXT(src, srcIndex, srcLength, c);
if(c<0) {
while(destIndex<destCapacity && start<srcIndex) {
dest[destIndex++]=src[start++];
}
continue;
}
c=ucase_toFullFolding(csp, c, &s, options);
if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
/* fast path version of appendResult() for ASCII results */
dest[destIndex++]=(uint8_t)c2;
} else {
destIndex=appendResult(dest, destIndex, destCapacity, c, s);
}
}
if(destIndex>destCapacity) {
@ -241,12 +436,6 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map,
* Implement argument checking and buffer handling
* for string case mapping as a common function.
*/
enum {
TO_LOWER,
TO_UPPER,
TO_TITLE,
FOLD_CASE
};
/* common internal function for public API functions */
@ -256,7 +445,6 @@ caseMap(const UCaseMap *csm,
const uint8_t *src, int32_t srcLength,
int32_t toWhichCase,
UErrorCode *pErrorCode) {
UCaseContext csc={ NULL };
int32_t destLength;
/* check argument values */
@ -288,21 +476,37 @@ caseMap(const UCaseMap *csm,
destLength=0;
csc.p=(void *)src;
csc.limit=srcLength;
if(toWhichCase==FOLD_CASE) {
destLength=utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength,
csm->options, pErrorCode);
} else {
UCaseContext csc={ NULL };
if(toWhichCase==TO_LOWER) {
destLength=_caseMap(csm, ucase_toFullLower,
dest, destCapacity,
src, &csc,
0, srcLength,
pErrorCode);
} else /* if(toWhichCase==TO_UPPER) */ {
destLength=_caseMap(csm, ucase_toFullUpper,
dest, destCapacity,
src, &csc,
0, srcLength,
pErrorCode);
csc.p=(void *)src;
csc.limit=srcLength;
if(toWhichCase==TO_LOWER) {
destLength=_caseMap(csm, ucase_toFullLower,
dest, destCapacity,
src, &csc,
0, srcLength,
pErrorCode);
} else if(toWhichCase==TO_UPPER) {
destLength=_caseMap(csm, ucase_toFullUpper,
dest, destCapacity,
src, &csc,
0, srcLength,
pErrorCode);
} else /* if(toWhichCase==TO_TITLE) */ {
#if UCONFIG_NO_BREAK_ITERATION
*pErrorCode=U_UNSUPPORTED_ERROR;
#else
/* UCaseMap is actually non-const in toTitle() APIs. */
destLength=_toTitle((UCaseMap *)csm, dest, destCapacity,
src, &csc, srcLength,
pErrorCode);
#endif
}
}
return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode);
@ -331,3 +535,29 @@ ucasemap_utf8ToUpper(const UCaseMap *csm,
(const uint8_t *)src, srcLength,
TO_UPPER, pErrorCode);
}
#if !UCONFIG_NO_BREAK_ITERATION
/*
 * Public UTF-8 titlecasing entry point.
 * Reinterprets the char buffers as byte buffers and forwards to the common
 * caseMap() function with the TO_TITLE selector.
 */
U_DRAFT int32_t U_EXPORT2
ucasemap_utf8ToTitle(UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode) {
    uint8_t *destBytes=(uint8_t *)dest;
    const uint8_t *srcBytes=(const uint8_t *)src;

    return caseMap(csm, destBytes, destCapacity, srcBytes, srcLength, TO_TITLE, pErrorCode);
}
#endif
/*
 * Public UTF-8 case folding entry point.
 * Reinterprets the char buffers as byte buffers and forwards to the common
 * caseMap() function with the FOLD_CASE selector.
 */
U_DRAFT int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap *csm,
                      char *dest, int32_t destCapacity,
                      const char *src, int32_t srcLength,
                      UErrorCode *pErrorCode) {
    uint8_t *destBytes=(uint8_t *)dest;
    const uint8_t *srcBytes=(const uint8_t *)src;

    return caseMap(csm, destBytes, destCapacity, srcBytes, srcLength, FOLD_CASE, pErrorCode);
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2005, International Business Machines
* Copyright (C) 2005-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -30,10 +30,9 @@
* for the attributes, as usual.
*
* Currently, the functionality provided here does not overlap with uchar.h
* and ustring.h.
* and ustring.h, except for ucasemap_toTitle().
*
* ucasemap_utf8ToLower() and ucasemap_utf8ToUpper() operate directly on
* UTF-8 strings.
* ucasemap_utf8XYZ() functions operate directly on UTF-8 strings.
*/
/**
@ -60,6 +59,10 @@ typedef struct UCaseMap UCaseMap; /**< C typedef for struct UCaseMap. @draft ICU
* which must not indicate a failure before the function call.
* @return Pointer to a UCaseMap service object, if successful.
*
* @see U_FOLD_CASE_DEFAULT
* @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
* @see U_TITLECASE_NO_LOWERCASE
* @see U_TITLECASE_NO_BREAK_ADJUSTMENT
* @draft ICU 3.4
*/
U_DRAFT UCaseMap * U_EXPORT2
@ -119,6 +122,135 @@ ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode);
U_DRAFT void U_EXPORT2
ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode);
#ifndef U_HIDE_DRAFT_API
/**
* Do not lowercase non-initial parts of words when titlecasing.
* Option bit for titlecasing APIs that take an options bit set.
*
* By default, titlecasing will titlecase the first cased character
* of a word and lowercase all other characters.
* With this option, the other characters will not be modified.
*
* @see ucasemap_setOptions
* @see ucasemap_toTitle
* @see ucasemap_utf8ToTitle
* @see UnicodeString::toTitle
* @draft ICU 3.8
*/
#define U_TITLECASE_NO_LOWERCASE 0x100
/**
* Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
* titlecase exactly the characters at breaks from the iterator.
* Option bit for titlecasing APIs that take an options bit set.
*
* By default, titlecasing will take each break iterator index,
* adjust it by looking for the next cased character, and titlecase that one.
* Other characters are lowercased.
*
* This follows Unicode 4 & 5 section 3.13 Default Case Operations:
*
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
* cased character F. If F exists, map F to default_title(F); then map each
* subsequent character C to default_lower(C).
*
* @see ucasemap_setOptions
* @see ucasemap_toTitle
* @see ucasemap_utf8ToTitle
* @see UnicodeString::toTitle
* @see U_TITLECASE_NO_LOWERCASE
* @draft ICU 3.8
*/
#define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
#endif
#if !UCONFIG_NO_BREAK_ITERATION
/**
* Get the break iterator that is used for titlecasing.
* Do not modify the returned break iterator.
* @param csm UCaseMap service object.
* @return titlecasing break iterator
* @draft ICU 3.8
*/
U_DRAFT const UBreakIterator * U_EXPORT2
ucasemap_getBreakIterator(const UCaseMap *csm);
/**
* Set the break iterator that is used for titlecasing.
* The UCaseMap service object releases a previously set break iterator
* and "adopts" this new one, taking ownership of it.
* It will be released in a subsequent call to ucasemap_setBreakIterator()
* or ucasemap_close().
*
* Break iterator operations are not thread-safe. Therefore, titlecasing
* functions use non-const UCaseMap objects. It is not possible to titlecase
* strings concurrently using the same UCaseMap.
*
* @param csm UCaseMap service object.
* @param iterToAdopt Break iterator to be adopted for titlecasing.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
*
* @see ucasemap_toTitle
* @see ucasemap_utf8ToTitle
* @draft ICU 3.8
*/
U_DRAFT void U_EXPORT2
ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode);
/**
* Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(),
* except that it takes ucasemap_setOptions() into account and has performance
* advantages from being able to use a UCaseMap object for multiple case mapping
* operations, saving setup time.
*
* Casing is locale-dependent and context-sensitive.
* Titlecasing uses a break iterator to find the first characters of words
* that are to be titlecased. It titlecases those characters and lowercases
* all others. (This can be modified with ucasemap_setOptions().)
*
* The titlecase break iterator can be provided to customize for arbitrary
* styles, using rules and dictionaries beyond the standard iterators.
* It may be more efficient to always provide an iterator to avoid
* opening and closing one for each string.
* The standard titlecase iterator for the root locale implements the
* algorithm of Unicode TR 21.
*
* This function uses only the setText(), first() and next() methods of the
* provided break iterator.
*
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param csm UCaseMap service object.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
* dest may be NULL and the function will only return the length of the result
* without writing any of the result string.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
*
* @see u_strToTitle
* @draft ICU 3.8
*/
U_DRAFT int32_t U_EXPORT2
ucasemap_toTitle(UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UErrorCode *pErrorCode);
#endif
/**
* Lowercase the characters in a UTF-8 string.
* Casing is locale-dependent and context-sensitive.
@ -132,7 +264,7 @@ ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode);
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be NULL and the function will only return the length of the result
* without writing any of the result string.
* @param src The original string
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
@ -161,7 +293,7 @@ ucasemap_utf8ToLower(const UCaseMap *csm,
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be NULL and the function will only return the length of the result
* without writing any of the result string.
* @param src The original string
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
@ -177,4 +309,87 @@ ucasemap_utf8ToUpper(const UCaseMap *csm,
const char *src, int32_t srcLength,
UErrorCode *pErrorCode);
#if !UCONFIG_NO_BREAK_ITERATION
/**
* Titlecase a UTF-8 string.
* Casing is locale-dependent and context-sensitive.
* Titlecasing uses a break iterator to find the first characters of words
* that are to be titlecased. It titlecases those characters and lowercases
* all others. (This can be modified with ucasemap_setOptions().)
*
* The titlecase break iterator can be provided to customize for arbitrary
* styles, using rules and dictionaries beyond the standard iterators.
* It may be more efficient to always provide an iterator to avoid
* opening and closing one for each string.
* The standard titlecase iterator for the root locale implements the
* algorithm of Unicode TR 21.
*
* This function uses only the setText(), first() and next() methods of the
* provided break iterator.
*
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param csm UCaseMap service object.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be NULL and the function will only return the length of the result
* without writing any of the result string.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
*
* @see u_strToTitle
* @see U_TITLECASE_NO_LOWERCASE
* @see U_TITLECASE_NO_BREAK_ADJUSTMENT
* @draft ICU 3.8
*/
U_DRAFT int32_t U_EXPORT2
ucasemap_utf8ToTitle(UCaseMap *csm,
char *dest, int32_t destCapacity,
const char *src, int32_t srcLength,
UErrorCode *pErrorCode);
#endif
/**
* Case-fold the characters in a UTF-8 string.
* Case-folding is locale-independent and not context-sensitive,
* but there is an option for whether to include or exclude mappings for dotted I
* and dotless i that are marked with 'I' in CaseFolding.txt.
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param csm UCaseMap service object.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be NULL and the function will only return the length of the result
* without writing any of the result string.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
*
* @see u_strFoldCase
* @see ucasemap_setOptions
* @see U_FOLD_CASE_DEFAULT
* @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
* @draft ICU 3.8
*/
U_DRAFT int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap *csm,
char *dest, int32_t destCapacity,
const char *src, int32_t srcLength,
UErrorCode *pErrorCode);
#endif

View file

@ -2430,7 +2430,7 @@ public:
* The standard titlecase iterator for the root locale implements the
* algorithm of Unicode TR 21.
*
* This function uses only the first() and next() methods of the
* This function uses only the setText(), first() and next() methods of the
* provided break iterator.
*
* @param titleIter A break iterator to find the first characters of words
@ -2458,7 +2458,7 @@ public:
* The standard titlecase iterator for the root locale implements the
* algorithm of Unicode TR 21.
*
* This function uses only the first() and next() methods of the
* This function uses only the setText(), first() and next() methods of the
* provided break iterator.
*
* @param titleIter A break iterator to find the first characters of words
@ -2472,6 +2472,37 @@ public:
*/
UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale);
/**
* Titlecase this string, with options.
*
* Casing is locale-dependent and context-sensitive.
* Titlecasing uses a break iterator to find the first characters of words
* that are to be titlecased. It titlecases those characters and lowercases
* all others. (This can be modified with options.)
*
* The titlecase break iterator can be provided to customize for arbitrary
* styles, using rules and dictionaries beyond the standard iterators.
* It may be more efficient to always provide an iterator to avoid
* opening and closing one for each string.
* The standard titlecase iterator for the root locale implements the
* algorithm of Unicode TR 21.
*
* This function uses only the setText(), first() and next() methods of the
* provided break iterator.
*
* @param titleIter A break iterator to find the first characters of words
* that are to be titlecased.
* If none is provided (0), then a standard titlecase
* break iterator is opened.
* Otherwise the provided iterator is set to the string's text.
* @param locale The locale to consider.
* @param options Options bit set; 0 selects the default behavior
* (the same as toTitle(titleIter, locale)).
* See U_TITLECASE_NO_LOWERCASE and U_TITLECASE_NO_BREAK_ADJUSTMENT.
* @return A reference to this.
* @see U_TITLECASE_NO_LOWERCASE
* @see U_TITLECASE_NO_BREAK_ADJUSTMENT
* @draft ICU 3.8
*/
UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options);
#endif
/**

View file

@ -1105,7 +1105,7 @@ u_strToLower(UChar *dest, int32_t destCapacity,
* The standard titlecase iterator for the root locale implements the
* algorithm of Unicode TR 21.
*
* This function uses only the first() and next() methods of the
* This function uses only the setText(), first() and next() methods of the
* provided break iterator.
*
* The result may be longer or shorter than the original.

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2005, International Business Machines
* Copyright (C) 1999-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -138,28 +138,6 @@ UnicodeString::caseMap(BreakIterator *titleIter,
return *this;
}
#if !UCONFIG_NO_BREAK_ITERATION
// set up the titlecasing break iterator
UBreakIterator *cTitleIter = 0;
if(toWhichCase == TO_TITLE) {
errorCode = U_ZERO_ERROR;
if(titleIter != 0) {
cTitleIter = (UBreakIterator *)titleIter;
ubrk_setText(cTitleIter, oldArray, oldLength, &errorCode);
} else {
cTitleIter = ubrk_open(UBRK_WORD, locale,
oldArray, oldLength,
&errorCode);
}
if(U_FAILURE(errorCode)) {
uprv_free(bufferToDelete);
setToBogus();
return *this;
}
}
#endif
// Case-map, and if the result is too long, then reallocate and repeat.
do {
errorCode = U_ZERO_ERROR;
@ -177,7 +155,7 @@ UnicodeString::caseMap(BreakIterator *titleIter,
#else
fLength = ustr_toTitle(csp, fArray, fCapacity,
oldArray, oldLength,
cTitleIter, locale, &errorCode);
(UBreakIterator *)titleIter, locale, options, &errorCode);
#endif
} else {
fLength = ustr_foldCase(csp, fArray, fCapacity,
@ -187,12 +165,6 @@ UnicodeString::caseMap(BreakIterator *titleIter,
}
} while(errorCode==U_BUFFER_OVERFLOW_ERROR && cloneArrayIfNeeded(fLength, fLength, FALSE));
#if !UCONFIG_NO_BREAK_ITERATION
if(cTitleIter != 0 && titleIter == 0) {
ubrk_close(cTitleIter);
}
#endif
if (bufferToDelete) {
uprv_free(bufferToDelete);
}
@ -234,6 +206,11 @@ UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale) {
return caseMap(titleIter, locale.getName(), 0, TO_TITLE);
}
// Titlecase this string in place, honoring the given titlecasing option bits.
// Delegates to the common caseMap() implementation with the TO_TITLE selector.
UnicodeString &
UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options) {
    const char *localeID = locale.getName();
    return caseMap(titleIter, localeID, options, TO_TITLE);
}
#endif
UnicodeString &

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2006, International Business Machines
* Copyright (C) 1999-2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ustr_imp.h
@ -98,6 +98,42 @@ u_growBufferFromStatic(void *context,
* ustring.h/ustrcase.c and UnicodeString case mapping functions.
*/
/**
* Case mapping service object: bundles the parameters that the string
* case mapping functions need, so that they can be set up once and reused.
* Field order matters for positional initializers such as { NULL }.
* @internal
*/
struct UCaseMap {
/* case properties data, passed as the first argument to the ucase_ mapping functions */
const UCaseProps *csp;
#if !UCONFIG_NO_BREAK_ITERATION
/* titlecasing break iterator; may be NULL until one is set or opened on demand */
UBreakIterator *iter; /* We adopt the iterator, so we own it. */
#endif
/* NUL-terminated locale ID string handed to the mapping functions */
char locale[32];
/* locale cache value; the mapping functions receive a pointer to it */
int32_t locCache;
/* option bit set, e.g. U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT */
uint32_t options;
};
#ifndef __UCASEMAP_H__
typedef struct UCaseMap UCaseMap;
#endif
/**
* Selector constants for the kind of case mapping to perform.
* They are passed as the "toWhichCase" argument of the common internal
* case mapping functions, which dispatch on them.
* @internal
*/
enum {
TO_LOWER, /* lowercase */
TO_UPPER, /* uppercase */
TO_TITLE, /* titlecase (needs break iteration) */
FOLD_CASE /* case folding */
};
/**
* Function type for a full case mapping of a single code point.
* ucase_toFullLower and ucase_toFullUpper are passed to the internal
* _caseMap() functions as this type.
*
* Arguments as used by the callers: the case properties, the code point c,
* a context iterator function with its context object, an output pointer
* (pString) for a result string, and the locale ID with a pointer to its
* cache value.
*
* NOTE(review): callers treat a negative result r as "single code point ~r",
* a result greater than UCASE_MAX_STRING_LENGTH as a single code point, and
* presumably anything else as the length of the string in *pString --
* confirm against ucase.h.
* @internal
*/
typedef int32_t U_CALLCONV
UCaseMapFull(const UCaseProps *csp, UChar32 c,
UCaseContextIterator *iter, void *context,
const UChar **pString,
const char *locale, int32_t *locCache);
/**
* @internal
*/
@ -128,7 +164,7 @@ ustr_toTitle(const UCaseProps *csp,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBreakIterator *titleIter,
const char *locale,
const char *locale, uint32_t options,
UErrorCode *pErrorCode);
#endif

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2006, International Business Machines
* Copyright (C) 2001-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -21,6 +21,7 @@
#include "unicode/utypes.h"
#include "unicode/uloc.h"
#include "unicode/ustring.h"
#include "unicode/ucasemap.h"
#include "unicode/ubrk.h"
#include "cmemory.h"
#include "ucase.h"
@ -114,26 +115,22 @@ utf16_caseContextIterator(void *context, int8_t dir) {
return U_SENTINEL;
}
typedef int32_t U_CALLCONV
UCaseMapFull(const UCaseProps *csp, UChar32 c,
UCaseContextIterator *iter, void *context,
const UChar **pString,
const char *locale, int32_t *locCache);
/*
* Case-maps [srcStart..srcLimit[ but takes
* context [0..srcLength[ into account.
*/
static int32_t
_caseMap(const UCaseProps *csp, UCaseMapFull *map,
_caseMap(const UCaseMap *csm, UCaseMapFull *map,
UChar *dest, int32_t destCapacity,
const UChar *src, UCaseContext *csc,
int32_t srcStart, int32_t srcLimit,
const char *locale, int32_t *locCache,
UErrorCode *pErrorCode) {
const UChar *s;
UChar32 c, c2;
int32_t srcIndex, destIndex;
int32_t locCache;
locCache=csm->locCache;
/* case mapping loop */
srcIndex=srcStart;
@ -142,7 +139,7 @@ _caseMap(const UCaseProps *csp, UCaseMapFull *map,
csc->cpStart=srcIndex;
U16_NEXT(src, srcIndex, srcLimit, c);
csc->cpLimit=srcIndex;
c=map(csp, c, utf16_caseContextIterator, csc, &s, locale, locCache);
c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
/* fast path version of appendResult() for BMP results */
dest[destIndex++]=(UChar)c2;
@ -157,26 +154,83 @@ _caseMap(const UCaseProps *csp, UCaseMapFull *map,
return destIndex;
}
/*
 * Store the initial language subtag of "locale" in csm->locale.
 *
 * This is a lightweight alternative to ucasemap_setLocale(): only the
 * leading subtag (the characters before the first '-' or '_') matters here,
 * the stored string need not be the real locale ID, and the locCache is
 * deliberately left alone because it only pays off when a UCaseMap is
 * reused for many operations.
 *
 * A NULL locale means the default locale. A leading subtag longer than
 * 3 characters is ignored, leaving an empty locale string.
 * pErrorCode is not used.
 */
static void
setTempCaseMapLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
    int n=0;
    char ch;

    /* the internal functions require locale!=NULL */
    if(locale==NULL) {
        locale=uloc_getDefault();
    }

    /* copy at most 4 characters, stopping at the end of the string or a separator */
    while(n<4) {
        ch=locale[n];
        if(ch==0 || ch=='-' || ch=='_') {
            break;
        }
        csm->locale[n]=ch;
        ++n;
    }

    if(n>3) {
        /* Longer-than-3 initial subtag: Ignore. */
        csm->locale[0]=0;
    } else {
        /* Up to 3 non-separator characters: terminate what was copied. */
        csm->locale[n]=0;
    }
}
/*
 * Fill in a temporary, otherwise empty UCaseMap for the API functions that
 * do not take a UCaseMap from the caller.
 * Kept cheap because it runs on every such function call: the case
 * properties are fetched only if they are not set yet, and an explicitly
 * empty locale string bypasses the locale parsing.
 */
static U_INLINE void
setTempCaseMap(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
    if(csm->csp==NULL) {
        csm->csp=ucase_getSingleton(pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return;
        }
    }

    if(locale==NULL || locale[0]!=0) {
        /* NULL (default locale) or a non-empty ID: extract the language subtag */
        setTempCaseMapLocale(csm, locale, pErrorCode);
    } else {
        /* "" is the root locale: nothing to parse */
        csm->locale[0]=0;
    }
}
#if !UCONFIG_NO_BREAK_ITERATION
/*
* Internal titlecasing function.
*
* Must get titleIter!=NULL.
*/
static int32_t
_toTitle(const UCaseProps *csp,
_toTitle(UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, UCaseContext *csc,
int32_t srcLength,
UBreakIterator *titleIter,
const char *locale, int32_t *locCache,
UErrorCode *pErrorCode) {
const UChar *s;
UChar32 c;
int32_t prev, titleStart, titleLimit, index, destIndex, length;
UBool isFirstIndex;
if(csm->iter!=NULL) {
ubrk_setText(csm->iter, src, srcLength, pErrorCode);
} else {
csm->iter=ubrk_open(UBRK_WORD, csm->locale,
src, srcLength,
pErrorCode);
}
if(U_FAILURE(*pErrorCode)) {
return 0;
}
/* set up local variables */
destIndex=0;
prev=0;
@ -187,9 +241,9 @@ _toTitle(const UCaseProps *csp,
/* find next index where to titlecase */
if(isFirstIndex) {
isFirstIndex=FALSE;
index=ubrk_first(titleIter);
index=ubrk_first(csm->iter);
} else {
index=ubrk_next(titleIter);
index=ubrk_next(csm->iter);
}
if(index==UBRK_DONE || index>srcLength) {
index=srcLength;
@ -211,45 +265,58 @@ _toTitle(const UCaseProps *csp,
if(prev<index) {
/* find and copy uncased characters [prev..titleStart[ */
titleStart=titleLimit=prev;
for(;;) {
U16_NEXT(src, titleLimit, srcLength, c);
if(UCASE_NONE!=ucase_getType(csp, c)) {
break; /* cased letter at [titleStart..titleLimit[ */
U16_NEXT(src, titleLimit, index, c);
if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
/* Adjust the titlecasing index (titleStart) to the next cased character. */
for(;;) {
titleStart=titleLimit;
if(titleLimit==index) {
/*
* only uncased characters in [prev..index[
* stop with titleStart==titleLimit==index
*/
break;
}
U16_NEXT(src, titleLimit, index, c);
if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
break; /* cased letter at [titleStart..titleLimit[ */
}
}
titleStart=titleLimit;
if(titleLimit==index) {
/*
* only uncased characters in [prev..index[
* stop with titleStart==titleLimit==index
*/
break;
length=titleStart-prev;
if(length>0) {
if((destIndex+length)<=destCapacity) {
uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
}
destIndex+=length;
}
}
length=titleStart-prev;
if(length>0) {
if((destIndex+length)<=destCapacity) {
uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
}
destIndex+=length;
}
if(titleStart<titleLimit) {
/* titlecase c which is from [titleStart..titleLimit[ */
csc->cpStart=titleStart;
csc->cpLimit=titleLimit;
c=ucase_toFullTitle(csp, c, utf16_caseContextIterator, csc, &s, locale, locCache);
c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &csm->locCache);
destIndex=appendResult(dest, destIndex, destCapacity, c, s);
/* lowercase [titleLimit..index[ */
if(titleLimit<index) {
destIndex+=
_caseMap(
csp, ucase_toFullLower,
dest+destIndex, destCapacity-destIndex,
src, csc,
titleLimit, index,
locale, locCache,
pErrorCode);
if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
/* Normal operation: Lowercase the rest of the word. */
destIndex+=
_caseMap(
csm, ucase_toFullLower,
dest+destIndex, destCapacity-destIndex,
src, csc,
titleLimit, index,
pErrorCode);
} else {
/* Optionally just copy the rest of the word unchanged. */
length=index-titleLimit;
if((destIndex+length)<=destCapacity) {
uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR);
}
destIndex+=length;
}
}
}
}
@ -263,26 +330,6 @@ _toTitle(const UCaseProps *csp,
return destIndex;
}
U_CFUNC int32_t
ustr_toTitle(const UCaseProps *csp,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBreakIterator *titleIter,
const char *locale,
UErrorCode *pErrorCode) {
UCaseContext csc={ NULL };
int32_t locCache;
csc.p=(void *)src;
csc.limit=srcLength;
locCache=0;
return _toTitle(csp,
dest, destCapacity,
src, &csc, srcLength,
titleIter, locale, &locCache, pErrorCode);
}
#endif
/* functions available in the common library (for unistr_case.cpp) */
@ -293,17 +340,18 @@ ustr_toLower(const UCaseProps *csp,
const UChar *src, int32_t srcLength,
const char *locale,
UErrorCode *pErrorCode) {
UCaseMap csm={ NULL };
UCaseContext csc={ NULL };
int32_t locCache;
csm.csp=csp;
setTempCaseMap(&csm, locale, pErrorCode);
csc.p=(void *)src;
csc.limit=srcLength;
locCache=0;
return _caseMap(csp, ucase_toFullLower,
return _caseMap(&csm, ucase_toFullLower,
dest, destCapacity,
src, &csc, 0, srcLength,
locale, &locCache, pErrorCode);
pErrorCode);
}
U_CFUNC int32_t
@ -312,19 +360,52 @@ ustr_toUpper(const UCaseProps *csp,
const UChar *src, int32_t srcLength,
const char *locale,
UErrorCode *pErrorCode) {
UCaseMap csm={ NULL };
UCaseContext csc={ NULL };
int32_t locCache;
csm.csp=csp;
setTempCaseMap(&csm, locale, pErrorCode);
csc.p=(void *)src;
csc.limit=srcLength;
locCache=0;
return _caseMap(csp, ucase_toFullUpper,
return _caseMap(&csm, ucase_toFullUpper,
dest, destCapacity,
src, &csc, 0, srcLength,
locale, &locCache, pErrorCode);
pErrorCode);
}
#if !UCONFIG_NO_BREAK_ITERATION
/*
 * Titlecase src[0..srcLength[ into dest, for callers that have case properties,
 * an optional break iterator, a locale ID and options but no UCaseMap.
 *
 * A temporary UCaseMap on the stack carries csp, titleIter and options into
 * _toTitle(). If the caller passed titleIter==NULL, any iterator that ended up
 * in the temporary map is closed here before returning; a caller-supplied
 * iterator is never closed.
 *
 * Returns the length reported by _toTitle().
 */
U_CFUNC int32_t
ustr_toTitle(const UCaseProps *csp,
             UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             UBreakIterator *titleIter,
             const char *locale, uint32_t options,
             UErrorCode *pErrorCode) {
    UCaseMap tempMap={ NULL };
    UCaseContext context={ NULL };
    UBool callerOwnsIter=(UBool)(titleIter!=NULL);
    int32_t destLength;

    /* the context iterator walks the whole source string */
    context.p=(void *)src;
    context.limit=srcLength;

    /* set the caller's values before setTempCaseMap() fills in the rest */
    tempMap.options=options;
    tempMap.iter=titleIter;
    tempMap.csp=csp;
    setTempCaseMap(&tempMap, locale, pErrorCode);

    destLength=_toTitle(&tempMap,
                        dest, destCapacity,
                        src, &context, srcLength,
                        pErrorCode);

    /* release only an iterator that did not come from the caller */
    if(!callerOwnsIter && tempMap.iter!=NULL) {
        ubrk_close(tempMap.iter);
    }
    return destLength;
}
#endif
U_CFUNC int32_t
ustr_foldCase(const UCaseProps *csp,
UChar *dest, int32_t destCapacity,
@ -359,30 +440,19 @@ ustr_foldCase(const UCaseProps *csp,
* Implement argument checking and buffer handling
* for string case mapping as a common function.
*/
/* Selector values for the toWhichCase argument of the common caseMap() function. */
enum {
/* map with ucase_toFullLower */
TO_LOWER,
/* map with ucase_toFullUpper */
TO_UPPER,
/* titlecase via _toTitle(); needs break iteration */
TO_TITLE,
/* case-fold via ustr_foldCase() */
FOLD_CASE
};
/* common internal function for public API functions */
static int32_t
caseMap(UChar *dest, int32_t destCapacity,
caseMap(const UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBreakIterator *titleIter,
const char *locale,
uint32_t options,
int32_t toWhichCase,
UErrorCode *pErrorCode) {
UChar buffer[300];
UChar *temp;
const UCaseProps *csp;
int32_t destLength;
UBool ownTitleIter;
/* check argument values */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
@ -397,11 +467,6 @@ caseMap(UChar *dest, int32_t destCapacity,
return 0;
}
csp=ucase_getSingleton(pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return 0;
}
/* get the string length */
if(srcLength==-1) {
srcLength=u_strlen(src);
@ -428,53 +493,38 @@ caseMap(UChar *dest, int32_t destCapacity,
temp=dest;
}
ownTitleIter=FALSE;
destLength=0;
if(toWhichCase==FOLD_CASE) {
destLength=ustr_foldCase(csp, temp, destCapacity, src, srcLength,
options, pErrorCode);
destLength=ustr_foldCase(csm->csp, temp, destCapacity, src, srcLength,
csm->options, pErrorCode);
} else {
UCaseContext csc={ NULL };
int32_t locCache;
csc.p=(void *)src;
csc.limit=srcLength;
locCache=0;
/* the internal functions require locale!=NULL */
if(locale==NULL) {
locale=uloc_getDefault();
}
if(toWhichCase==TO_LOWER) {
destLength=_caseMap(csp, ucase_toFullLower,
destLength=_caseMap(csm, ucase_toFullLower,
temp, destCapacity,
src, &csc,
0, srcLength,
locale, &locCache, pErrorCode);
pErrorCode);
} else if(toWhichCase==TO_UPPER) {
destLength=_caseMap(csp, ucase_toFullUpper,
destLength=_caseMap(csm, ucase_toFullUpper,
temp, destCapacity,
src, &csc,
0, srcLength,
locale, &locCache, pErrorCode);
pErrorCode);
} else /* if(toWhichCase==TO_TITLE) */ {
#if UCONFIG_NO_BREAK_ITERATION
#if UCONFIG_NO_BREAK_ITERATION
*pErrorCode=U_UNSUPPORTED_ERROR;
#else
if(titleIter==NULL) {
titleIter=ubrk_open(UBRK_WORD, locale,
src, srcLength,
pErrorCode);
ownTitleIter=(UBool)U_SUCCESS(*pErrorCode);
}
if(U_SUCCESS(*pErrorCode)) {
destLength=_toTitle(csp, temp, destCapacity,
src, &csc, srcLength,
titleIter, locale, &locCache, pErrorCode);
}
#endif
#else
/* UCaseMap is actually non-const in toTitle() APIs. */
destLength=_toTitle((UCaseMap *)csm, temp, destCapacity,
src, &csc, srcLength,
pErrorCode);
#endif
}
}
if(temp!=dest) {
@ -490,12 +540,6 @@ caseMap(UChar *dest, int32_t destCapacity,
}
}
#if !UCONFIG_NO_BREAK_ITERATION
if(ownTitleIter) {
ubrk_close(titleIter);
}
#endif
return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
@ -506,9 +550,11 @@ u_strToLower(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
const char *locale,
UErrorCode *pErrorCode) {
return caseMap(dest, destCapacity,
UCaseMap csm={ NULL };
setTempCaseMap(&csm, locale, pErrorCode);
return caseMap(&csm,
dest, destCapacity,
src, srcLength,
NULL, locale, 0,
TO_LOWER, pErrorCode);
}
@ -517,9 +563,11 @@ u_strToUpper(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
const char *locale,
UErrorCode *pErrorCode) {
return caseMap(dest, destCapacity,
UCaseMap csm={ NULL };
setTempCaseMap(&csm, locale, pErrorCode);
return caseMap(&csm,
dest, destCapacity,
src, srcLength,
NULL, locale, 0,
TO_UPPER, pErrorCode);
}
@ -531,9 +579,29 @@ u_strToTitle(UChar *dest, int32_t destCapacity,
UBreakIterator *titleIter,
const char *locale,
UErrorCode *pErrorCode) {
return caseMap(dest, destCapacity,
UCaseMap csm={ NULL };
int32_t length;
csm.iter=titleIter;
setTempCaseMap(&csm, locale, pErrorCode);
length=caseMap(&csm,
dest, destCapacity,
src, srcLength,
TO_TITLE, pErrorCode);
if(titleIter==NULL && csm.iter!=NULL) {
ubrk_close(csm.iter);
}
return length;
}
/*
 * Public UTF-16 titlecasing function that works with a caller-supplied UCaseMap.
 * It forwards its arguments to the common caseMap() function with TO_TITLE
 * and returns that function's result.
 */
U_DRAFT int32_t U_EXPORT2
ucasemap_toTitle(UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UErrorCode *pErrorCode) {
return caseMap(csm,
dest, destCapacity,
src, srcLength,
titleIter, locale, 0,
TO_TITLE, pErrorCode);
}
@ -544,9 +612,12 @@ u_strFoldCase(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
uint32_t options,
UErrorCode *pErrorCode) {
return caseMap(dest, destCapacity,
UCaseMap csm={ NULL };
csm.csp=ucase_getSingleton(pErrorCode);
csm.options=options;
return caseMap(&csm,
dest, destCapacity,
src, srcLength,
NULL, NULL, options,
FOLD_CASE, pErrorCode);
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2006, International Business Machines
* Copyright (C) 2002-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -26,6 +26,8 @@
#include "cmemory.h"
#include "cintltst.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
/* test string case mapping functions --------------------------------------- */
static void
@ -776,10 +778,153 @@ TestUCaseMap(void) {
log_err("ucasemap_utf8ToUpper(overflow) failed\n");
}
/* C API coverage for case folding. More thorough test via C++ intltest's StringCaseTest::TestCasing(). */
errorCode=U_ZERO_ERROR;
utf8Out[0]=0;
length=ucasemap_utf8FoldCase(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, 3, &errorCode);
if(U_FAILURE(errorCode) || length!=3 || 0!=strcmp(abc, utf8Out)) {
log_err("ucasemap_utf8FoldCase(aBc) failed\n");
}
ucasemap_close(csm);
}
void addCaseTest(TestNode** root);
#if !UCONFIG_NO_BREAK_ITERATION
/*
 * Try titlecasing with options.
 *
 * Steps:
 * 1. ucasemap_toTitle() with no iterator set (default: word breaks).
 * 2. The same with U_TITLECASE_NO_BREAK_ADJUSTMENT.
 * 3. An adopted sentence break iterator plus U_TITLECASE_NO_LOWERCASE,
 *    first preflighting, then with a real buffer.
 * 4. ucasemap_utf8ToTitle() with the settings from step 3.
 *
 * The UCaseMap is closed on every exit path, including the early returns.
 */
static void
TestUCaseMapToTitle(void) {
    /* "a 'CaT. A 'dOg! 'eTc." where '=U+02BB */
    /*
     * Note: The sentence BreakIterator does not recognize a '.'
     * as a sentence terminator if it is followed by lowercase.
     * That is why the example has the '!'.
     */
    static const UChar
    beforeTitle[]= { 0x61, 0x20, 0x2bb, 0x43, 0x61, 0x54, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x4f, 0x67, 0x21, 0x20, 0x2bb, 0x65, 0x54, 0x63, 0x2e },
    titleWord[]= { 0x41, 0x20, 0x2bb, 0x43, 0x61, 0x74, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x44, 0x6f, 0x67, 0x21, 0x20, 0x2bb, 0x45, 0x74, 0x63, 0x2e },
    titleWordNoAdjust[]={ 0x41, 0x20, 0x2bb, 0x63, 0x61, 0x74, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x6f, 0x67, 0x21, 0x20, 0x2bb, 0x65, 0x74, 0x63, 0x2e },
    titleSentNoLower[]= { 0x41, 0x20, 0x2bb, 0x43, 0x61, 0x54, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x4f, 0x67, 0x21, 0x20, 0x2bb, 0x45, 0x54, 0x63, 0x2e };
    UChar buffer[32];
    UCaseMap *csm;
    UBreakIterator *sentenceIter;
    const UBreakIterator *iter;
    int32_t length;
    UErrorCode errorCode;

    errorCode=U_ZERO_ERROR;
    csm=ucasemap_open("", 0, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("ucasemap_open(\"\") failed - %s\n", u_errorName(errorCode));
        return;
    }
    iter=ucasemap_getBreakIterator(csm);
    if(iter!=NULL) {
        log_err("ucasemap_getBreakIterator() returns %p!=NULL before setting any iterator or titlecasing\n", iter);
    }

    /* Use default UBreakIterator: Word breaks. */
    length=ucasemap_toTitle(csm, buffer, LENGTHOF(buffer), beforeTitle, LENGTHOF(beforeTitle), &errorCode);
    if( U_FAILURE(errorCode) ||
        length!=LENGTHOF(titleWord) ||
        0!=u_memcmp(buffer, titleWord, length) ||
        buffer[length]!=0
    ) {
        log_err("ucasemap_toTitle(default iterator)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
    }
    iter=ucasemap_getBreakIterator(csm);
    if(iter==NULL) {
        log_err("ucasemap_getBreakIterator() returns NULL after titlecasing\n");
    }

    /* Try U_TITLECASE_NO_BREAK_ADJUSTMENT. */
    ucasemap_setOptions(csm, U_TITLECASE_NO_BREAK_ADJUSTMENT, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("error: ucasemap_setOptions(U_TITLECASE_NO_BREAK_ADJUSTMENT) failed - %s\n", u_errorName(errorCode));
        ucasemap_close(csm); /* do not leak the UCaseMap on this early exit */
        return;
    }
    length=ucasemap_toTitle(csm, buffer, LENGTHOF(buffer), beforeTitle, LENGTHOF(beforeTitle), &errorCode);
    if( U_FAILURE(errorCode) ||
        length!=LENGTHOF(titleWordNoAdjust) ||
        0!=u_memcmp(buffer, titleWordNoAdjust, length) ||
        buffer[length]!=0
    ) {
        log_err("ucasemap_toTitle(default iterator, no break adjustment)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
    }

    /* Set a sentence break iterator. */
    errorCode=U_ZERO_ERROR;
    sentenceIter=ubrk_open(UBRK_SENTENCE, "", NULL, 0, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("error: ubrk_open(UBRK_SENTENCE) failed - %s\n", u_errorName(errorCode));
        ucasemap_close(csm);
        return;
    }
    ucasemap_setBreakIterator(csm, sentenceIter, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("error: ucasemap_setBreakIterator(sentence iterator) failed - %s\n", u_errorName(errorCode));
        /* NOTE(review): assumes a failed ucasemap_setBreakIterator() did not adopt sentenceIter - confirm. */
        ubrk_close(sentenceIter);
        ucasemap_close(csm);
        return;
    }
    iter=ucasemap_getBreakIterator(csm);
    if(iter!=sentenceIter) {
        log_err("ucasemap_getBreakIterator() returns %p!=%p after setting the iterator\n", iter, sentenceIter);
    }
    ucasemap_setOptions(csm, U_TITLECASE_NO_LOWERCASE, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("error: ucasemap_setOptions(U_TITLECASE_NO_LOWERCASE) failed - %s\n", u_errorName(errorCode));
        /* csm now holds sentenceIter; ucasemap_close() closes the map's iterator too */
        ucasemap_close(csm);
        return;
    }

    /* Use the sentence break iterator with the option. Preflight first. */
    length=ucasemap_toTitle(csm, NULL, 0, beforeTitle, LENGTHOF(beforeTitle), &errorCode);
    if( errorCode!=U_BUFFER_OVERFLOW_ERROR ||
        length!=LENGTHOF(titleSentNoLower)
    ) {
        log_err("ucasemap_toTitle(preflight sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
    }
    errorCode=U_ZERO_ERROR;
    buffer[0]=0;
    length=ucasemap_toTitle(csm, buffer, LENGTHOF(buffer), beforeTitle, LENGTHOF(beforeTitle), &errorCode);
    if( U_FAILURE(errorCode) ||
        length!=LENGTHOF(titleSentNoLower) ||
        0!=u_memcmp(buffer, titleSentNoLower, length) ||
        buffer[length]!=0
    ) {
        log_err("ucasemap_toTitle(sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
    }

    /* UTF-8 C API coverage. More thorough test via C++ intltest's StringCaseTest::TestCasing(). */
    {
        char utf8BeforeTitle[64], utf8TitleSentNoLower[64], utf8[64];
        int32_t utf8BeforeTitleLength, utf8TitleSentNoLowerLength;

        errorCode=U_ZERO_ERROR;
        u_strToUTF8(utf8BeforeTitle, (int32_t)sizeof(utf8BeforeTitle), &utf8BeforeTitleLength, beforeTitle, LENGTHOF(beforeTitle), &errorCode);
        u_strToUTF8(utf8TitleSentNoLower, (int32_t)sizeof(utf8TitleSentNoLower), &utf8TitleSentNoLowerLength, titleSentNoLower, LENGTHOF(titleSentNoLower), &errorCode);
        length=ucasemap_utf8ToTitle(csm, utf8, (int32_t)sizeof(utf8), utf8BeforeTitle, utf8BeforeTitleLength, &errorCode);
        if( U_FAILURE(errorCode) ||
            length!=utf8TitleSentNoLowerLength ||
            0!=uprv_memcmp(utf8, utf8TitleSentNoLower, length) ||
            utf8[length]!=0
        ) {
            log_err("ucasemap_utf8ToTitle(sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
        }
    }

    ucasemap_close(csm);
}
#endif
void addCaseTest(TestNode** root) {
/* cstrcase.c functions, declared in cucdtst.h */
@ -791,4 +936,7 @@ void addCaseTest(TestNode** root) {
addTest(root, &TestCaseFolding, "tsutil/cstrcase/TestCaseFolding");
addTest(root, &TestCaseCompare, "tsutil/cstrcase/TestCaseCompare");
addTest(root, &TestUCaseMap, "tsutil/cstrcase/TestUCaseMap");
#if !UCONFIG_NO_BREAK_ITERATION
addTest(root, &TestUCaseMapToTitle, "tsutil/cstrcase/TestUCaseMapToTitle");
#endif
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2005, International Business Machines
* Copyright (C) 2002-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -26,6 +26,8 @@
#include "ustrtest.h"
#include "unicode/tstdtmod.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
StringCaseTest::~StringCaseTest() {}
void
@ -325,9 +327,8 @@ StringCaseTest::TestCaseConversion()
enum {
TEST_LOWER,
TEST_UPPER,
#if !UCONFIG_NO_BREAK_ITERATION
TEST_TITLE,
#endif
TEST_FOLD,
TEST_COUNT
};
@ -335,9 +336,8 @@ enum {
static const char *const dataNames[TEST_COUNT+1]={
"lowercasing",
"uppercasing",
#if !UCONFIG_NO_BREAK_ITERATION
"titlecasing",
#endif
"casefolding",
""
};
@ -345,20 +345,31 @@ void
StringCaseTest::TestCasingImpl(const UnicodeString &input,
const UnicodeString &output,
int32_t whichCase,
const char *localeID, uint32_t options) {
void *iter, const char *localeID, uint32_t options) {
// UnicodeString
UnicodeString result;
const char *name;
Locale locale(localeID);
result=input;
switch(whichCase) {
case TEST_LOWER:
name="toLower";
result.toLower(Locale(localeID));
result.toLower(locale);
break;
case TEST_UPPER:
name="toUpper";
result.toUpper(Locale(localeID));
result.toUpper(locale);
break;
#if !UCONFIG_NO_BREAK_ITERATION
case TEST_TITLE:
name="toTitle";
result.toTitle((BreakIterator *)iter, locale, options);
break;
#endif
case TEST_FOLD:
name="foldCase";
result.foldCase(options);
break;
default:
name="";
@ -367,6 +378,15 @@ StringCaseTest::TestCasingImpl(const UnicodeString &input,
if(result!=output) {
errln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name);
}
#if !UCONFIG_NO_BREAK_ITERATION
if(whichCase==TEST_TITLE && options==0) {
result=input;
result.toTitle((BreakIterator *)iter, locale);
if(result!=output) {
errln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res");
}
}
#endif
// UTF-8
char utf8In[100], utf8Out[100];
@ -378,6 +398,14 @@ StringCaseTest::TestCasingImpl(const UnicodeString &input,
errorCode=U_ZERO_ERROR;
csm=ucasemap_open(localeID, options, &errorCode);
#if !UCONFIG_NO_BREAK_ITERATION
if(iter!=NULL) {
// Clone the break iterator so that the UCaseMap can safely adopt it.
int32_t size=1; // Not 0 because that only gives preflighting.
UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, &size, &errorCode);
ucasemap_setBreakIterator(csm, clone, &errorCode);
}
#endif
u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), &errorCode);
switch(whichCase) {
@ -393,6 +421,18 @@ StringCaseTest::TestCasingImpl(const UnicodeString &input,
utf8Out, (int32_t)sizeof(utf8Out),
utf8In, utf8InLength, &errorCode);
break;
case TEST_TITLE:
name="ucasemap_utf8ToTitle";
utf8OutLength=ucasemap_utf8ToTitle(csm,
utf8Out, (int32_t)sizeof(utf8Out),
utf8In, utf8InLength, &errorCode);
break;
case TEST_FOLD:
name="ucasemap_utf8FoldCase";
utf8OutLength=ucasemap_utf8FoldCase(csm,
utf8Out, (int32_t)sizeof(utf8Out),
utf8In, utf8InLength, &errorCode);
break;
default:
name="";
utf8OutLength=0;
@ -410,36 +450,22 @@ StringCaseTest::TestCasingImpl(const UnicodeString &input,
ucasemap_close(csm);
}
#if !UCONFIG_NO_BREAK_ITERATION
// Titlecases a copy of input with UnicodeString::toTitle(), using the given
// break iterator and a Locale built from localeID, and reports an error
// if the result differs from output.
void
StringCaseTest::TestTitleCasing(const UnicodeString &input,
const UnicodeString &output,
const char *localeID,
UBreakIterator *iter) {
UnicodeString result;
result=input;
// iter arrives as a C UBreakIterator * and is cast to the C++ BreakIterator * that toTitle() takes.
result.toTitle((BreakIterator *)iter, Locale(localeID));
if(result!=output) {
errln("error: UnicodeString.toTitle() got a wrong result for a test case from casing.res");
}
}
#endif
void
StringCaseTest::TestCasing() {
UErrorCode status = U_ZERO_ERROR;
#if !UCONFIG_NO_BREAK_ITERATION
UBreakIterator *iter;
#endif
void *iter;
char cLocaleID[100];
UnicodeString locale, input, output, result;
UnicodeString locale, input, output, optionsString, result;
uint32_t options;
int32_t whichCase, type;
TestDataModule *driver = TestDataModule::getTestDataModule("casing", *this, status);
if(U_SUCCESS(status)) {
for(whichCase=0; whichCase<TEST_COUNT; ++whichCase) {
#if UCONFIG_NO_BREAK_ITERATION
if(whichCase==TEST_TITLE) {
continue;
}
#endif
TestData *casingTest = driver->createTestData(dataNames[whichCase], status);
if(U_FAILURE(status)) {
errln("TestCasing failed to createTestData(%s) - %s", dataNames[whichCase], u_errorName(status));
@ -447,39 +473,48 @@ StringCaseTest::TestCasing() {
}
const DataMap *myCase = NULL;
while(casingTest->nextCase(myCase, status)) {
locale = myCase->getString("Locale", status);
locale.extract(0, 0x7fffffff, cLocaleID, sizeof(cLocaleID), "");
input = myCase->getString("Input", status);
output = myCase->getString("Output", status);
#if !UCONFIG_NO_BREAK_ITERATION
if(whichCase!=TEST_FOLD) {
locale = myCase->getString("Locale", status);
}
locale.extract(0, 0x7fffffff, cLocaleID, sizeof(cLocaleID), "");
iter=NULL;
#if !UCONFIG_NO_BREAK_ITERATION
if(whichCase==TEST_TITLE) {
type = myCase->getInt("Type", status);
if(type>=0) {
iter=ubrk_open((UBreakIteratorType)type, cLocaleID, NULL, 0, &status);
} else if(type==-2) {
// Open a trivial break iterator that only delivers { 0, length }
// or even just { 0 } as boundaries.
static const UChar rules[] = { 0x2e, 0x2a, 0x3b }; // ".*;"
UParseError parseError;
iter=ubrk_openRules(rules, LENGTHOF(rules), NULL, 0, &parseError, &status);
}
}
#endif
if(whichCase==TEST_TITLE || whichCase==TEST_FOLD) {
optionsString = myCase->getString("Options", status);
options = 0;
if(optionsString.indexOf((UChar)0x54)>=0) { // T
options|=U_FOLD_CASE_EXCLUDE_SPECIAL_I;
}
if(optionsString.indexOf((UChar)0x4c)>=0) { // L
options|=U_TITLECASE_NO_LOWERCASE;
}
if(optionsString.indexOf((UChar)0x41)>=0) { // A
options|=U_TITLECASE_NO_BREAK_ADJUSTMENT;
}
}
if(U_FAILURE(status)) {
errln("error: TestCasing() setup failed for %s test case from casing.res: %s", dataNames[whichCase], u_errorName(status));
status = U_ZERO_ERROR;
} else {
switch(whichCase) {
case TEST_LOWER:
case TEST_UPPER:
TestCasingImpl(input, output, whichCase, cLocaleID, 0);
break;
#if !UCONFIG_NO_BREAK_ITERATION
case TEST_TITLE:
TestTitleCasing(input, output, cLocaleID, iter);
break;
#endif
default:
break; // won't happen
}
TestCasingImpl(input, output, whichCase, iter, cLocaleID, options);
}
#if !UCONFIG_NO_BREAK_ITERATION

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2006, International Business Machines Corporation and
* Copyright (c) 1997-2007, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -8,7 +8,6 @@
#define UNICODESTRINGTEST_H
#include "unicode/unistr.h"
#include "unicode/ubrk.h"
#include "intltest.h"
/**
@ -90,13 +89,7 @@ public:
void TestCasingImpl(const UnicodeString &input,
const UnicodeString &output,
int32_t whichCase,
const char *localeID, uint32_t options);
#if !UCONFIG_NO_BREAK_ITERATION
void TestTitleCasing(const UnicodeString &input,
const UnicodeString &output,
const char *localeID,
UBreakIterator *iter);
#endif
void *iter, const char *localeID, uint32_t options);
void TestCasing();
};

View file

@ -1,6 +1,6 @@
//*******************************************************************************
//*
//* Copyright (C) 2002-2006, International Business Machines
//* Copyright (C) 2002-2007, International Business Machines
//* Corporation and others. All Rights Reserved.
//*
//*******************************************************************************
@ -13,7 +13,8 @@ casing:table(nofallback) {
"each item is an array with\n"
"input string, result string, locale ID[, break iterator]\n"
"the break iterator (only for titlecasing) is specified as an int, same as in UBreakIteratorType:\n"
"0=UBRK_CHARACTER 1=UBRK_WORD 2=UBRK_LINE 3=UBRK_SENTENCE 4=UBRK_TITLE -1=default\n"
"0=UBRK_CHARACTER 1=UBRK_WORD 2=UBRK_LINE 3=UBRK_SENTENCE 4=UBRK_TITLE -1=default (NULL=words) -2=no breaks (.*)\n"
"options: T=U_FOLD_CASE_EXCLUDE_SPECIAL_I L=U_TITLECASE_NO_LOWERCASE A=U_TITLECASE_NO_BREAK_ADJUSTMENT\n"
}
}
TestData {
@ -34,13 +35,29 @@ casing:table(nofallback) {
}
}
titlecasing {
Headers { "Input", "Output", "Locale", "Type" }
Headers { "Input", "Output", "Locale", "Type", "Options" }
Cases {
{ "ʻaMeLikA huI Pū ʻʻʻiA", "ʻAmelika Hui Pū ʻʻʻIa", "", "-1" }, // titlecase first _cased_ letter, j4933
{ " tHe QUIcK bRoWn", " The Quick Brown", "", "4" },
{ "DŽDždžLJLjljNJNjnj", "DžDžDžLjLjLjNjNjNj", "", "0" }, // UBRK_CHARACTER
{ "ljubav ljubav", "Ljubav Ljubav", "", "-1" }, // Lj vs. L+j
{ "'oH dOn'T tItLeCaSe AfTeR lEtTeR+'", "'Oh Don't Titlecase After Letter+'", "", "-1" }
{ "ʻaMeLikA huI Pū ʻʻʻiA", "ʻAmelika Hui Pū ʻʻʻIa", "", "-1", "" }, // titlecase first _cased_ letter, j4933
{ " tHe QUIcK bRoWn", " The Quick Brown", "", "4", "" },
{ "DŽDždžLJLjljNJNjnj", "DžDžDžLjLjLjNjNjNj", "", "0", "" }, // UBRK_CHARACTER
{ "ljubav ljubav", "Ljubav Ljubav", "", "-1", "" }, // Lj vs. L+j
{ "'oH dOn'T tItLeCaSe AfTeR lEtTeR+'", "'Oh Don't Titlecase After Letter+'", "", "-1", "" },
{ "a ʻCaT. A ʻdOg! ʻeTc.", "A ʻCat. A ʻDog! ʻEtc.", "", "-1", "" }, // default
{ "a ʻCaT. A ʻdOg! ʻeTc.", "A ʻcat. A ʻdog! ʻetc.", "", "-1", "A" }, // U_TITLECASE_NO_BREAK_ADJUSTMENT
{ "a ʻCaT. A ʻdOg! ʻeTc.", "A ʻCaT. A ʻdOg! ʻETc.", "", "3", "L" }, // UBRK_SENTENCE and U_TITLECASE_NO_LOWERCASE
{ "ʻcAt! ʻeTc.", "ʻCat! ʻetc.", "", "-2", "" }, // -2=Trivial break iterator
{ "ʻcAt! ʻeTc.", "ʻcat! ʻetc.", "", "-2", "A" }, // U_TITLECASE_NO_BREAK_ADJUSTMENT
{ "ʻcAt! ʻeTc.", "ʻCAt! ʻeTc.", "", "-2", "L" }, // U_TITLECASE_NO_LOWERCASE
{ "ʻcAt! ʻeTc.", "ʻcAt! ʻeTc.", "", "-2", "AL" } // Both options
}
}
casefolding {
Headers { "Input", "Output", "Options" }
Cases {
{ "aBİIıϐßffi񟿿", "abi̇iıβssffi񟿿", "" },
{ "aBİIıϐßffi񟿿", "abiııβssffi񟿿", "T" } // U_FOLD_CASE_EXCLUDE_SPECIAL_I
}
}
}