ICU-12647 make string case mapping functions faster

X-SVN-Rev: 40921
This commit is contained in:
Markus Scherer 2018-02-15 06:43:56 +00:00
parent a3d84405e5
commit e8bb1bb9c2
9 changed files with 1014 additions and 199 deletions

View file

@ -92,20 +92,16 @@ ByteSinkUtil::appendTwoBytes(UChar32 c, ByteSink &sink) {
sink.Append(s8, 2);
}
UBool
ByteSinkUtil::appendUnchanged(const uint8_t *s, int32_t length,
ByteSink &sink, uint32_t options, Edits *edits,
UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return FALSE; }
if (length > 0) {
if (edits != nullptr) {
edits->addUnchanged(length);
}
if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
sink.Append(reinterpret_cast<const char *>(s), length);
}
void
ByteSinkUtil::appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
ByteSink &sink, uint32_t options, Edits *edits) {
U_ASSERT(length > 0);
if (edits != nullptr) {
edits->addUnchanged(length);
}
if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
sink.Append(reinterpret_cast<const char *>(s), length);
}
return TRUE;
}
UBool
@ -117,7 +113,11 @@ ByteSinkUtil::appendUnchanged(const uint8_t *s, const uint8_t *limit,
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
return appendUnchanged(s, (int32_t)(limit - s), sink, options, edits, errorCode);
int32_t length = (int32_t)(limit - s);
if (length > 0) {
appendNonEmptyUnchanged(s, length, sink, options, edits);
}
return TRUE;
}
U_NAMESPACE_END

View file

@ -43,11 +43,19 @@ public:
static UBool appendUnchanged(const uint8_t *s, int32_t length,
ByteSink &sink, uint32_t options, Edits *edits,
UErrorCode &errorCode);
UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return FALSE; }
if (length > 0) { appendNonEmptyUnchanged(s, length, sink, options, edits); }
return TRUE;
}
static UBool appendUnchanged(const uint8_t *s, const uint8_t *limit,
ByteSink &sink, uint32_t options, Edits *edits,
UErrorCode &errorCode);
private:
static void appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
ByteSink &sink, uint32_t options, Edits *edits);
};
U_NAMESPACE_END

View file

@ -77,9 +77,12 @@ ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
/* data access primitives --------------------------------------------------- */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
/* Returns a pointer to the trie member of the case properties singleton. */
U_CFUNC const UTrie2 * U_EXPORT2
ucase_getTrie() {
return &ucase_props_singleton.trie;
}
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
/* number of bits in an 8-bit integer value */
static const uint8_t flagsOffset[256]={
@ -128,8 +131,8 @@ static const uint8_t flagsOffset[256]={
U_CAPI UChar32 U_EXPORT2
ucase_tolower(UChar32 c) {
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!PROPS_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_IS_UPPER_OR_TITLE(props)) {
c+=UCASE_GET_DELTA(props);
}
} else {
@ -145,7 +148,7 @@ ucase_tolower(UChar32 c) {
U_CAPI UChar32 U_EXPORT2
ucase_toupper(UChar32 c) {
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!PROPS_HAS_EXCEPTION(props)) {
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
c+=UCASE_GET_DELTA(props);
}
@ -162,7 +165,7 @@ ucase_toupper(UChar32 c) {
U_CAPI UChar32 U_EXPORT2
ucase_totitle(UChar32 c) {
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!PROPS_HAS_EXCEPTION(props)) {
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
c+=UCASE_GET_DELTA(props);
}
@ -223,7 +226,7 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
}
props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!PROPS_HAS_EXCEPTION(props)) {
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
/* add the one simple case mapping, no matter what type it is */
int32_t delta=UCASE_GET_DELTA(props);
@ -419,6 +422,138 @@ FullCaseFoldingIterator::next(UnicodeString &full) {
return c;
}
// Definitions of the Latin fast-path delta tables.
// Each table has 24 rows of 16 entries, one entry per code point in order.
// An entry is either 0 (no mapping), EXC (caller must take the slow path),
// or a signed delta that is added to the code point to get the mapped one.
namespace LatinCase {
// Lowercasing deltas; selected by the toLower() code for locales other than
// Turkish/Lithuanian and for default case folding.
const int8_t TO_LOWER_NORMAL[LIMIT] = {
// U+0000..U+007F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// U+0080..U+00FF
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// U+0100..U+017F
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};
// Lowercasing deltas used when the normal table is not selected
// (Turkish/Lithuanian-style locales and non-default case folding).
// Differs from TO_LOWER_NORMAL only by additional EXC entries.
const int8_t TO_LOWER_TR_LT[LIMIT] = {
// U+0000..U+007F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// U+0080..U+00FF
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// U+0100..U+017F
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};
// Uppercasing deltas; selected by the toUpper() code for every locale
// except Turkish.
const int8_t TO_UPPER_NORMAL[LIMIT] = {
// U+0000..U+007F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
// U+0080..U+00FF
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
-32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
// U+0100..U+017F
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
-1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};
// Uppercasing deltas selected for the Turkish case locale.
// Differs from TO_UPPER_NORMAL only at U+0069 (EXC instead of -32).
const int8_t TO_UPPER_TR[LIMIT] = {
// U+0000..U+007F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
// U+0080..U+00FF
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
-32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
// U+0100..U+017F
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
-1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};
} // namespace LatinCase
U_NAMESPACE_END
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
@ -439,7 +574,7 @@ ucase_getTypeOrIgnorable(UChar32 c) {
static inline int32_t
getDotType(UChar32 c) {
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!PROPS_HAS_EXCEPTION(props)) {
if(!UCASE_HAS_EXCEPTION(props)) {
return props&UCASE_DOT_MASK;
} else {
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
@ -878,8 +1013,8 @@ ucase_toFullLower(UChar32 c,
U_ASSERT(c >= 0);
UChar32 result=c;
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!PROPS_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_IS_UPPER_OR_TITLE(props)) {
result=c+UCASE_GET_DELTA(props);
}
} else {
@ -1024,7 +1159,7 @@ toUpperOrTitle(UChar32 c,
U_ASSERT(c >= 0);
UChar32 result=c;
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!PROPS_HAS_EXCEPTION(props)) {
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
result=c+UCASE_GET_DELTA(props);
}
@ -1169,8 +1304,8 @@ ucase_toFullTitle(UChar32 c,
U_CAPI UChar32 U_EXPORT2
ucase_fold(UChar32 c, uint32_t options) {
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!PROPS_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_IS_UPPER_OR_TITLE(props)) {
c+=UCASE_GET_DELTA(props);
}
} else {
@ -1234,8 +1369,8 @@ ucase_toFullFolding(UChar32 c,
U_ASSERT(c >= 0);
UChar32 result=c;
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!PROPS_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_IS_UPPER_OR_TITLE(props)) {
result=c+UCASE_GET_DELTA(props);
}
} else {

View file

@ -26,6 +26,7 @@
#include "putilimp.h"
#include "uset_imp.h"
#include "udataswp.h"
#include "utrie2.h"
#ifdef __cplusplus
U_NAMESPACE_BEGIN
@ -148,6 +149,33 @@ private:
int32_t rowCpIndex;
};
/**
 * Fast case mapping data for ASCII/Latin.
 * Linear arrays of delta bytes: 0=no mapping; EXC=exception.
 * Each array has LIMIT entries and is indexed directly by code point;
 * a non-zero, non-EXC entry is added to the code point to get the mapped one.
 * Deltas must not cross the ASCII boundary, or else they cannot be easily used
 * in simple UTF-8 code.
 */
namespace LatinCase {
/** Case mapping/folding data for code points up to U+017F. */
constexpr UChar LIMIT = 0x180;
/** U+017F case-folds and uppercases crossing the ASCII boundary. */
constexpr UChar LONG_S = 0x17f;
/**
 * Exception: Complex mapping, or too-large delta.
 * The value -0x80 is the minimum of int8_t, so it cannot collide with a delta
 * that is small enough to be used from these tables.
 */
constexpr int8_t EXC = -0x80;
/** Deltas for lowercasing for most locales, and default case folding. */
extern const int8_t TO_LOWER_NORMAL[LIMIT];
/** Deltas for lowercasing for tr/az/lt, and Turkic case folding. */
extern const int8_t TO_LOWER_TR_LT[LIMIT];
/** Deltas for uppercasing for most locales. */
extern const int8_t TO_UPPER_NORMAL[LIMIT];
/** Deltas for uppercasing for tr/az. */
extern const int8_t TO_UPPER_TR[LIMIT];
} // namespace LatinCase
U_NAMESPACE_END
#endif
@ -308,6 +336,9 @@ enum {
/* definitions for 16-bit case properties word ------------------------------ */
U_CFUNC const UTrie2 * U_EXPORT2
ucase_getTrie();
/* 2-bit constants for types of cased characters */
#define UCASE_TYPE_MASK 3
enum {
@ -320,10 +351,14 @@ enum {
#define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
#define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7)
#define UCASE_IS_UPPER_OR_TITLE(props) ((props)&2)
#define UCASE_IGNORABLE 4
#define UCASE_SENSITIVE 8
#define UCASE_EXCEPTION 0x10
#define UCASE_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
#define UCASE_DOT_MASK 0x60
enum {
UCASE_NO_DOT=0, /* normal characters with cc=0 */

View file

@ -165,9 +165,7 @@ appendResult(int32_t cpLength, int32_t result, const UChar *s,
inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
} // namespace
static UChar32 U_CALLCONV
UChar32 U_CALLCONV
utf8_caseContextIterator(void *context, int8_t dir) {
UCaseContext *csc=(UCaseContext *)context;
UChar32 c;
@ -199,36 +197,227 @@ utf8_caseContextIterator(void *context, int8_t dir) {
return U_SENTINEL;
}
/*
* Case-maps [srcStart..srcLimit[ but takes
* context [0..srcLength[ into account.
/**
* caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
* caseLocale < 0: Case-folds [srcStart..srcLimit[.
*/
static void
_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
const uint8_t *src, UCaseContext *csc,
int32_t srcStart, int32_t srcLimit,
icu::ByteSink &sink, icu::Edits *edits,
UErrorCode &errorCode) {
/* case mapping loop */
int32_t srcIndex=srcStart;
while (U_SUCCESS(errorCode) && srcIndex<srcLimit) {
void toLower(int32_t caseLocale, uint32_t options,
const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
const int8_t *latinToLower;
if (caseLocale == UCASE_LOC_ROOT ||
(caseLocale >= 0 ?
!(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
(options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
latinToLower = LatinCase::TO_LOWER_NORMAL;
} else {
latinToLower = LatinCase::TO_LOWER_TR_LT;
}
const UTrie2 *trie = ucase_getTrie();
int32_t prev = srcStart;
int32_t srcIndex = srcStart;
for (;;) {
// fast path for simple cases
int32_t cpStart;
csc->cpStart=cpStart=srcIndex;
UChar32 c;
U8_NEXT(src, srcIndex, srcLimit, c);
csc->cpLimit=srcIndex;
if(c<0) {
// Malformed UTF-8.
ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
for (;;) {
if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
c = U_SENTINEL;
break;
}
uint8_t lead = src[srcIndex++];
if (lead <= 0x7f) {
int8_t d = latinToLower[lead];
if (d == LatinCase::EXC) {
cpStart = srcIndex - 1;
c = lead;
break;
}
if (d == 0) { continue; }
ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
sink, options, edits, errorCode);
char ascii = (char)(lead + d);
sink.Append(&ascii, 1);
if (edits != nullptr) {
edits->addReplace(1, 1);
}
prev = srcIndex;
continue;
} else if (lead < 0xe3) {
uint8_t t;
if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
(t = src[srcIndex] - 0x80) <= 0x3f) {
// U+0080..U+017F
++srcIndex;
c = ((lead - 0xc0) << 6) | t;
int8_t d = latinToLower[c];
if (d == LatinCase::EXC) {
cpStart = srcIndex - 2;
break;
}
if (d == 0) { continue; }
ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
sink, options, edits, errorCode);
ByteSinkUtil::appendTwoBytes(c + d, sink);
if (edits != nullptr) {
edits->addReplace(2, 2);
}
prev = srcIndex;
continue;
}
} else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
(srcIndex + 2) <= srcLimit &&
U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
// most of CJK: no case mappings
srcIndex += 2;
continue;
}
cpStart = --srcIndex;
U8_NEXT(src, srcIndex, srcLimit, c);
if (c < 0) {
// ill-formed UTF-8
continue;
}
uint16_t props = UTRIE2_GET16(trie, c);
if (UCASE_HAS_EXCEPTION(props)) { break; }
int32_t delta;
if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
continue;
}
ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
sink, options, edits, errorCode);
ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
prev = srcIndex;
}
if (c < 0) {
break;
}
// slow path
const UChar *s;
if (caseLocale >= 0) {
csc->cpStart = cpStart;
csc->cpLimit = srcIndex;
c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
} else {
const UChar *s;
c=map(c, utf8_caseContextIterator, csc, &s, caseLocale);
c = ucase_toFullFolding(c, &s, options);
}
if (c >= 0) {
ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
sink, options, edits, errorCode);
appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
prev = srcIndex;
}
}
ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
sink, options, edits, errorCode);
}
// Uppercases the UTF-8 text src[0..srcLength[ into sink.
//
// Unchanged text is not copied byte by byte: `prev` marks the start of the
// pending run of unchanged bytes, which is flushed with appendUnchanged()
// just before each changed code point and once more at the end.
//
// caseLocale: selects the Turkish delta table or the normal one, and is
//             passed through to ucase_toFullUpper() on the slow path.
// options:    passed through to appendUnchanged()/appendResult().
// csc:        case context; cpStart/cpLimit are set before each slow-path call.
// edits:      optional (may be nullptr); receives unchanged/replace records.
// errorCode:  checked at the top of every fast-path iteration; once it
//             indicates failure the loops stop.
void toUpper(int32_t caseLocale, uint32_t options,
const uint8_t *src, UCaseContext *csc, int32_t srcLength,
icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
const int8_t *latinToUpper;
if (caseLocale == UCASE_LOC_TURKISH) {
latinToUpper = LatinCase::TO_UPPER_TR;
} else {
latinToUpper = LatinCase::TO_UPPER_NORMAL;
}
const UTrie2 *trie = ucase_getTrie();
// Start of the not-yet-written run of unchanged bytes.
int32_t prev = 0;
int32_t srcIndex = 0;
for (;;) {
// fast path for simple cases
int32_t cpStart;
UChar32 c;
// The inner loop exits either with c<0 (end of input or error)
// or with c/cpStart/srcIndex describing a code point for the slow path.
for (;;) {
if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
c = U_SENTINEL;
break;
}
uint8_t lead = src[srcIndex++];
if (lead <= 0x7f) {
// ASCII: direct table lookup by byte value.
int8_t d = latinToUpper[lead];
if (d == LatinCase::EXC) {
cpStart = srcIndex - 1;
c = lead;
break;
}
if (d == 0) { continue; }
// Flush the unchanged run before this byte, then write the mapped byte.
ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
sink, options, edits, errorCode);
char ascii = (char)(lead + d);
sink.Append(&ascii, 1);
if (edits != nullptr) {
edits->addReplace(1, 1);
}
prev = srcIndex;
continue;
} else if (lead < 0xe3) {
uint8_t t;
// Two-byte sequence with lead C2..C5 and a trail byte 80..BF.
if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
(t = src[srcIndex] - 0x80) <= 0x3f) {
// U+0080..U+017F
++srcIndex;
c = ((lead - 0xc0) << 6) | t;
int8_t d = latinToUpper[c];
if (d == LatinCase::EXC) {
cpStart = srcIndex - 2;
break;
}
if (d == 0) { continue; }
ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
sink, options, edits, errorCode);
ByteSinkUtil::appendTwoBytes(c + d, sink);
if (edits != nullptr) {
edits->addReplace(2, 2);
}
prev = srcIndex;
continue;
}
// Other leads below E3 fall through to the generic decoding below.
} else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
(srcIndex + 2) <= srcLength &&
U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
// most of CJK: no case mappings
srcIndex += 2;
continue;
}
// Generic path: back up to the lead byte and decode a full code point.
cpStart = --srcIndex;
U8_NEXT(src, srcIndex, srcLength, c);
if (c < 0) {
// ill-formed UTF-8
// The bytes stay part of the pending unchanged run.
continue;
}
uint16_t props = UTRIE2_GET16(trie, c);
if (UCASE_HAS_EXCEPTION(props)) { break; }
int32_t delta;
// Only lowercase-typed code points with a non-zero delta are mapped here.
if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
continue;
}
ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
sink, options, edits, errorCode);
ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
prev = srcIndex;
}
if (c < 0) {
break;
}
// slow path
csc->cpStart = cpStart;
csc->cpLimit = srcIndex;
const UChar *s;
c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
// A negative result writes nothing here; the code point stays in the
// pending unchanged run (presumably "no change" -- confirm against
// ucase_toFullUpper()).
if (c >= 0) {
ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
sink, options, edits, errorCode);
appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
prev = srcIndex;
}
}
// Flush whatever unchanged text remains after the last change.
ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
sink, options, edits, errorCode);
}
} // namespace
#if !UCONFIG_NO_BREAK_ITERATION
U_CFUNC void U_CALLCONV
@ -335,10 +524,9 @@ ucasemap_internalUTF8ToTitle(
if(titleLimit<index) {
if((options&U_TITLECASE_NO_LOWERCASE)==0) {
/* Normal operation: Lowercase the rest of the word. */
_caseMap(caseLocale, options, ucase_toFullLower,
src, &csc,
titleLimit, index,
sink, edits, errorCode);
toLower(caseLocale, options,
src, &csc, titleLimit, index,
sink, edits, errorCode);
if(U_FAILURE(errorCode)) {
return;
}
@ -538,8 +726,8 @@ ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREA
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
_caseMap(
caseLocale, options, ucase_toFullLower,
toLower(
caseLocale, options,
src, &csc, 0, srcLength,
sink, edits, errorCode);
}
@ -555,9 +743,9 @@ ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREA
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
_caseMap(
caseLocale, options, ucase_toFullUpper,
src, &csc, 0, srcLength,
toUpper(
caseLocale, options,
src, &csc, srcLength,
sink, edits, errorCode);
}
}
@ -567,22 +755,10 @@ ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_B
const uint8_t *src, int32_t srcLength,
icu::ByteSink &sink, icu::Edits *edits,
UErrorCode &errorCode) {
/* case mapping loop */
int32_t srcIndex = 0;
while (U_SUCCESS(errorCode) && srcIndex < srcLength) {
int32_t cpStart = srcIndex;
UChar32 c;
U8_NEXT(src, srcIndex, srcLength, c);
if(c<0) {
// Malformed UTF-8.
ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
sink, options, edits, errorCode);
} else {
const UChar *s;
c = ucase_toFullFolding(c, &s, options);
appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
}
}
toLower(
-1, options,
src, nullptr, 0, srcLength,
sink, edits, errorCode);
}
void

View file

@ -52,16 +52,8 @@ int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
return destIndex;
}
} // namespace
U_NAMESPACE_END
U_NAMESPACE_USE
/* string casing ------------------------------------------------------------ */
/* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
static inline int32_t
inline int32_t
appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
int32_t result, const UChar *s,
int32_t cpLength, uint32_t options, icu::Edits *edits) {
@ -134,7 +126,7 @@ appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
return destIndex;
}
static inline int32_t
inline int32_t
appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
if(destIndex<destCapacity) {
dest[destIndex]=c;
@ -144,28 +136,34 @@ appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
return destIndex+1;
}
static inline int32_t
appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
if(length>0) {
if(edits!=NULL) {
edits->addUnchanged(length);
}
if(options & U_OMIT_UNCHANGED_TEXT) {
return destIndex;
}
if(length>(INT32_MAX-destIndex)) {
return -1; // integer overflow
}
if((destIndex+length)<=destCapacity) {
u_memcpy(dest+destIndex, s, length);
}
destIndex+=length;
int32_t
appendNonEmptyUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
if(edits!=NULL) {
edits->addUnchanged(length);
}
return destIndex;
if(options & U_OMIT_UNCHANGED_TEXT) {
return destIndex;
}
if(length>(INT32_MAX-destIndex)) {
return -1; // integer overflow
}
if((destIndex+length)<=destCapacity) {
u_memcpy(dest+destIndex, s, length);
}
return destIndex + length;
}
static UChar32 U_CALLCONV
/*
 * Appends an unchanged run of `length` code units starting at `s`.
 * An empty (or negative-length) run is a no-op that returns destIndex as is;
 * everything else is delegated to appendNonEmptyUnchanged().
 */
inline int32_t
appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
                const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
    return length > 0
        ? appendNonEmptyUnchanged(dest, destIndex, destCapacity, s, length, options, edits)
        : destIndex;
}
UChar32 U_CALLCONV
utf16_caseContextIterator(void *context, int8_t dir) {
UCaseContext *csc=(UCaseContext *)context;
UChar32 c;
@ -197,39 +195,205 @@ utf16_caseContextIterator(void *context, int8_t dir) {
return U_SENTINEL;
}
/*
* Case-maps [srcStart..srcLimit[ but takes
* context [0..srcLength[ into account.
/**
* caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
* caseLocale < 0: Case-folds [srcStart..srcLimit[.
*/
static int32_t
_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
UChar *dest, int32_t destCapacity,
const UChar *src, UCaseContext *csc,
int32_t srcStart, int32_t srcLimit,
icu::Edits *edits,
UErrorCode &errorCode) {
/* case mapping loop */
int32_t srcIndex=srcStart;
int32_t destIndex=0;
while(srcIndex<srcLimit) {
int32_t cpStart;
csc->cpStart=cpStart=srcIndex;
int32_t toLower(int32_t caseLocale, uint32_t options,
UChar *dest, int32_t destCapacity,
const UChar *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
icu::Edits *edits, UErrorCode &errorCode) {
const int8_t *latinToLower;
if (caseLocale == UCASE_LOC_ROOT ||
(caseLocale >= 0 ?
!(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
(options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
latinToLower = LatinCase::TO_LOWER_NORMAL;
} else {
latinToLower = LatinCase::TO_LOWER_TR_LT;
}
const UTrie2 *trie = ucase_getTrie();
int32_t destIndex = 0;
int32_t prev = srcStart;
int32_t srcIndex = srcStart;
for (;;) {
// fast path for simple cases
UChar lead;
while (srcIndex < srcLimit) {
lead = src[srcIndex];
int32_t delta;
if (lead < LatinCase::LONG_S) {
int8_t d = latinToLower[lead];
if (d == LatinCase::EXC) { break; }
++srcIndex;
if (d == 0) { continue; }
delta = d;
} else if (lead >= 0xd800) {
break; // surrogate or higher
} else {
uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
if (UCASE_HAS_EXCEPTION(props)) { break; }
++srcIndex;
if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
continue;
}
}
lead += delta;
destIndex = appendUnchanged(dest, destIndex, destCapacity,
src + prev, srcIndex - 1 - prev, options, edits);
if (destIndex >= 0) {
destIndex = appendUChar(dest, destIndex, destCapacity, lead);
if (edits != nullptr) {
edits->addReplace(1, 1);
}
}
if (destIndex < 0) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
prev = srcIndex;
}
if (srcIndex >= srcLimit) {
break;
}
// slow path
int32_t cpStart = srcIndex++;
UChar trail;
UChar32 c;
U16_NEXT(src, srcIndex, srcLimit, c);
csc->cpLimit=srcIndex;
if (U16_IS_LEAD(lead) && srcIndex < srcLimit && U16_IS_TRAIL(trail = src[srcIndex])) {
c = U16_GET_SUPPLEMENTARY(lead, trail);
++srcIndex;
} else {
c = lead;
}
const UChar *s;
c=map(c, utf16_caseContextIterator, csc, &s, caseLocale);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
srcIndex - cpStart, options, edits);
if (destIndex < 0) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
if (caseLocale >= 0) {
csc->cpStart = cpStart;
csc->cpLimit = srcIndex;
c = ucase_toFullLower(c, utf16_caseContextIterator, csc, &s, caseLocale);
} else {
c = ucase_toFullFolding(c, &s, options);
}
if (c >= 0) {
destIndex = appendUnchanged(dest, destIndex, destCapacity,
src + prev, cpStart - prev, options, edits);
if (destIndex >= 0) {
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
srcIndex - cpStart, options, edits);
}
if (destIndex < 0) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
prev = srcIndex;
}
}
destIndex = appendUnchanged(dest, destIndex, destCapacity,
src + prev, srcIndex - prev, options, edits);
if (destIndex < 0) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
return destIndex;
}
// Uppercases the UTF-16 text src[0..srcLength[ into dest.
//
// Unchanged text is copied in runs: `prev` marks the start of the pending
// run of unchanged code units, which is flushed with appendUnchanged()
// just before each changed code point and once more at the end.
//
// Returns the destination index after the last append. Whenever an append
// helper returns a negative index, errorCode is set to
// U_INDEX_OUTOFBOUNDS_ERROR and 0 is returned.
//
// caseLocale: selects the Turkish delta table or the normal one, and is
//             passed through to ucase_toFullUpper() on the slow path.
// csc:        case context; cpStart/cpLimit are set before each slow-path call.
// edits:      optional (may be nullptr); receives unchanged/replace records.
int32_t toUpper(int32_t caseLocale, uint32_t options,
UChar *dest, int32_t destCapacity,
const UChar *src, UCaseContext *csc, int32_t srcLength,
icu::Edits *edits, UErrorCode &errorCode) {
const int8_t *latinToUpper;
if (caseLocale == UCASE_LOC_TURKISH) {
latinToUpper = LatinCase::TO_UPPER_TR;
} else {
latinToUpper = LatinCase::TO_UPPER_NORMAL;
}
const UTrie2 *trie = ucase_getTrie();
int32_t destIndex = 0;
// Start of the not-yet-written run of unchanged code units.
int32_t prev = 0;
int32_t srcIndex = 0;
for (;;) {
// fast path for simple cases
UChar lead;
// Leaves the loop via break with srcIndex still pointing at `lead`
// when the slow path is needed, or by running off the end of the input.
while (srcIndex < srcLength) {
lead = src[srcIndex];
int32_t delta;
if (lead < LatinCase::LONG_S) {
// Below U+017F: direct table lookup by code unit value.
int8_t d = latinToUpper[lead];
if (d == LatinCase::EXC) { break; }
++srcIndex;
if (d == 0) { continue; }
delta = d;
} else if (lead >= 0xd800) {
break; // surrogate or higher
} else {
// U+017F..U+D7FF: look up the properties in the trie.
uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
if (UCASE_HAS_EXCEPTION(props)) { break; }
++srcIndex;
// Only lowercase-typed code units with a non-zero delta are mapped here.
if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
continue;
}
}
lead += delta;
// Flush the unchanged run before this code unit, then write the mapped one.
destIndex = appendUnchanged(dest, destIndex, destCapacity,
src + prev, srcIndex - 1 - prev, options, edits);
if (destIndex >= 0) {
destIndex = appendUChar(dest, destIndex, destCapacity, lead);
if (edits != nullptr) {
edits->addReplace(1, 1);
}
}
if (destIndex < 0) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
prev = srcIndex;
}
if (srcIndex >= srcLength) {
break;
}
// slow path
int32_t cpStart;
csc->cpStart = cpStart = srcIndex++;
UChar trail;
UChar32 c;
// Combine a lead surrogate with a following trail surrogate, if present;
// otherwise the single code unit is the code point.
if (U16_IS_LEAD(lead) && srcIndex < srcLength && U16_IS_TRAIL(trail = src[srcIndex])) {
c = U16_GET_SUPPLEMENTARY(lead, trail);
++srcIndex;
} else {
c = lead;
}
csc->cpLimit = srcIndex;
const UChar *s;
c = ucase_toFullUpper(c, utf16_caseContextIterator, csc, &s, caseLocale);
// A negative result writes nothing here; the code point stays in the
// pending unchanged run (presumably "no change" -- confirm against
// ucase_toFullUpper()).
if (c >= 0) {
destIndex = appendUnchanged(dest, destIndex, destCapacity,
src + prev, cpStart - prev, options, edits);
if (destIndex >= 0) {
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
srcIndex - cpStart, options, edits);
}
if (destIndex < 0) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
prev = srcIndex;
}
}
// Flush whatever unchanged text remains after the last change.
destIndex = appendUnchanged(dest, destIndex, destCapacity,
src + prev, srcIndex - prev, options, edits);
if (destIndex < 0) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
return destIndex;
}
} // namespace
U_NAMESPACE_END
U_NAMESPACE_USE
#if !UCONFIG_NO_BREAK_ITERATION
U_CFUNC int32_t U_CALLCONV
@ -344,11 +508,10 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
if((options&U_TITLECASE_NO_LOWERCASE)==0) {
/* Normal operation: Lowercase the rest of the word. */
destIndex+=
_caseMap(
caseLocale, options, ucase_toFullLower,
toLower(
caseLocale, options,
dest+destIndex, destCapacity-destIndex,
src, &csc,
titleLimit, index,
src, &csc, titleLimit, index,
edits, errorCode);
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
errorCode=U_ZERO_ERROR;
@ -1013,8 +1176,8 @@ ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_IT
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
int32_t destIndex = _caseMap(
caseLocale, options, ucase_toFullLower,
int32_t destIndex = toLower(
caseLocale, options,
dest, destCapacity,
src, &csc, 0, srcLength,
edits, errorCode);
@ -1035,10 +1198,10 @@ ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_IT
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
destIndex = _caseMap(
caseLocale, options, ucase_toFullUpper,
destIndex = toUpper(
caseLocale, options,
dest, destCapacity,
src, &csc, 0, srcLength,
src, &csc, srcLength,
edits, errorCode);
}
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
@ -1050,23 +1213,11 @@ ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK
const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode &errorCode) {
/* case mapping loop */
int32_t srcIndex = 0;
int32_t destIndex = 0;
while (srcIndex < srcLength) {
int32_t cpStart = srcIndex;
UChar32 c;
U16_NEXT(src, srcIndex, srcLength, c);
const UChar *s;
c = ucase_toFullFolding(c, &s, options);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
srcIndex - cpStart, options, edits);
if (destIndex < 0) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
}
int32_t destIndex = toLower(
-1, options,
dest, destCapacity,
src, nullptr, 0, srcLength,
edits, errorCode);
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
}

View file

@ -30,6 +30,21 @@ public final class CaseMapImpl {
dir=0;
}
/**
 * Creates an iterator over all of {@code src} with the current code point
 * already set to the range [cpStart, cpLimit).
 * The iteration index starts at 0, the iteration limit is the length of
 * {@code src}, and the context direction field is cleared to 0.
 *
 * @param src String to iterate over.
 * @param cpStart Start index of the current code point.
 * @param cpLimit Limit index of the current code point.
 */
public StringContextIterator(CharSequence src, int cpStart, int cpLimit) {
    // Bounds of the whole string.
    this.s = src;
    this.limit = src.length();
    this.index = 0;
    // Position of the current code point, supplied by the caller.
    this.cpStart = cpStart;
    this.cpLimit = cpLimit;
    this.dir = 0;
}
/**
* Set the iteration limit for nextCaseMapCP() to an index within the string.
* If the limit parameter is negative or past the string, then the
@ -77,6 +92,11 @@ public final class CaseMapImpl {
}
}
/**
 * Sets the current code point range to [s, l) and resets {@code dir} to 0.
 * Note that the parameter {@code s} shadows the field of the same name
 * inside this method; only the code point fields are written.
 *
 * @param s new start index of the current code point
 * @param l new limit index of the current code point
 */
public void setCPStartAndLimit(int s, int l) {
    this.dir = 0;
    this.cpStart = s;
    this.cpLimit = l;
}
/**
* Returns the start of the code point that was last returned
* by nextCaseMapCP().
@ -400,13 +420,162 @@ public final class CaseMapImpl {
return result.toString();
}
private static void internalToLower(int caseLocale, int options, StringContextIterator iter,
private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie();
/**
 * Lowercases or case-folds a range of {@code src} and appends the output to {@code dest}.
 * <ul>
 * <li>caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
 * <li>caseLocale < 0: Case-folds [srcStart..srcLimit[.
 * </ul>
 * The method alternates between a fast path, which handles single UTF-16 code units
 * with a simple delta (from a Latin byte table or from the case trie), and a slow path,
 * which calls the full mapping functions for everything else.
 * Unchanged text is not written immediately: {@code prev} marks the start of the pending
 * unchanged run, which is flushed via appendUnchanged() before each changed piece
 * and once more at the end.
 *
 * @param caseLocale case locale constant; negative selects case folding
 * @param options passed through to appendUnchanged(), appendResult() and toFullFolding();
 *                for case folding, its FOLD_CASE_OPTIONS_MASK bits also select the Latin table
 * @param src the source text
 * @param srcStart start index of the range to map
 * @param srcLimit limit index of the range to map
 * @param iter context iterator for toFullLower(); when null, one is created
 *             the first time the slow path needs it with caseLocale >= 0
 * @param dest receives the output
 * @param edits if not null, records each fast-path replacement via addReplace(1, 1);
 *              it is also passed to appendUnchanged() and appendResult()
 * @throws IOException declared because {@code dest} is an Appendable
 */
private static void internalToLower(int caseLocale, int options,
CharSequence src, int srcStart, int srcLimit, StringContextIterator iter,
Appendable dest, Edits edits) throws IOException {
// NOTE(review): the next four statements dereference iter unconditionally and open a
// while block, which does not fit the table-driven loop below. They look like the
// superseded iterator-based loop; confirm whether they belong in this method.
int c;
while ((c = iter.nextCaseMapCP()) >= 0) {
c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
appendResult(c, dest, iter.getCPLength(), options, edits);
// Pick the Latin delta table. The "normal" table is used for the root locale,
// for any other non-negative locale except Turkish and Lithuanian,
// and for case folding (caseLocale < 0) with the default folding options.
byte[] latinToLower;
if (caseLocale == UCaseProps.LOC_ROOT ||
(caseLocale >= 0 ?
!(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) :
(options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) {
latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL;
} else {
latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT;
}
// prev: start of the unchanged text that has been scanned but not yet appended.
int prev = srcStart;
int srcIndex = srcStart;
outerLoop:
for (;;) {
// fast path for simple cases
char lead;
for (;;) {
if (srcIndex >= srcLimit) {
break outerLoop;
}
lead = src.charAt(srcIndex);
int delta;
if (lead < UCaseProps.LatinCase.LONG_S) {
// Below U+017F: one table lookup yields the delta, 0, or the EXC marker.
byte d = latinToLower[lead];
// EXC: leave srcIndex on this code unit and let the slow path handle it.
if (d == UCaseProps.LatinCase.EXC) { break; }
++srcIndex;
if (d == 0) { continue; }
delta = d;
} else if (lead >= 0xd800) {
break; // surrogate or higher
} else {
// Other BMP code units below U+D800: read the properties word from the trie.
int props = CASE_TRIE.getFromU16SingleLead(lead);
if (UCaseProps.propsHasException(props)) { break; }
++srcIndex;
// Only upper/title-case code units with a non-zero delta change;
// delta is assigned only when the first test lets the second one run.
if (!UCaseProps.isUpperOrTitleFromProps(props) ||
(delta = UCaseProps.getDelta(props)) == 0) {
continue;
}
}
// Compound assignment narrows the int sum back to char.
lead += delta;
// srcIndex is already past the mapped code unit, so the unchanged run ends at srcIndex - 1.
appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
dest.append(lead);
if (edits != null) {
edits.addReplace(1, 1);
}
prev = srcIndex;
}
// slow path
int cpStart = srcIndex++;
char trail;
int c;
// Combine a lead surrogate with a following trail surrogate inside the range;
// otherwise map the single code unit as is.
if (Character.isHighSurrogate(lead) && srcIndex < srcLimit &&
Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
c = Character.toCodePoint(lead, trail);
++srcIndex;
} else {
c = lead;
}
if (caseLocale >= 0) {
// Lowercasing needs the context iterator positioned on the current code point.
if (iter == null) {
iter = new StringContextIterator(src, cpStart, srcIndex);
} else {
iter.setCPStartAndLimit(cpStart, srcIndex);
}
c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
} else {
c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
}
// A negative result is treated as "no change" here: nothing is appended and
// the code point stays in the pending unchanged run.
if (c >= 0) {
appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
appendResult(c, dest, srcIndex - cpStart, options, edits);
prev = srcIndex;
}
}
// Flush the unchanged text after the last mapped piece.
appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
}
/**
 * Uppercases all of {@code src} and appends the output to {@code dest}.
 * The method alternates between a fast path, which handles single UTF-16 code units
 * with a simple delta (from a Latin byte table or from the case trie), and a slow path,
 * which calls UCaseProps.toFullUpper() for everything else.
 * Unchanged text is not written immediately: {@code prev} marks the start of the pending
 * unchanged run, which is flushed via appendUnchanged() before each changed piece
 * and once more at the end.
 *
 * @param caseLocale case locale constant; LOC_TURKISH selects the Turkish Latin table
 * @param options passed through to appendUnchanged() and appendResult()
 * @param src the source text
 * @param dest receives the output
 * @param edits if not null, records each fast-path replacement via addReplace(1, 1);
 *              it is also passed to appendUnchanged() and appendResult()
 * @throws IOException declared because {@code dest} is an Appendable
 */
private static void internalToUpper(int caseLocale, int options,
CharSequence src, Appendable dest, Edits edits) throws IOException {
// Created lazily: only the slow path needs a context iterator.
StringContextIterator iter = null;
byte[] latinToUpper;
if (caseLocale == UCaseProps.LOC_TURKISH) {
latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR;
} else {
latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL;
}
// prev: start of the unchanged text that has been scanned but not yet appended.
int prev = 0;
int srcIndex = 0;
int srcLength = src.length();
outerLoop:
for (;;) {
// fast path for simple cases
char lead;
for (;;) {
if (srcIndex >= srcLength) {
break outerLoop;
}
lead = src.charAt(srcIndex);
int delta;
if (lead < UCaseProps.LatinCase.LONG_S) {
// Below U+017F: one table lookup yields the delta, 0, or the EXC marker.
byte d = latinToUpper[lead];
// EXC: leave srcIndex on this code unit and let the slow path handle it.
if (d == UCaseProps.LatinCase.EXC) { break; }
++srcIndex;
if (d == 0) { continue; }
delta = d;
} else if (lead >= 0xd800) {
break; // surrogate or higher
} else {
// Other BMP code units below U+D800: read the properties word from the trie.
int props = CASE_TRIE.getFromU16SingleLead(lead);
if (UCaseProps.propsHasException(props)) { break; }
++srcIndex;
// Only lowercase code units with a non-zero delta change;
// delta is assigned only when the first test lets the second one run.
if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER ||
(delta = UCaseProps.getDelta(props)) == 0) {
continue;
}
}
// Compound assignment narrows the int sum back to char.
lead += delta;
// srcIndex is already past the mapped code unit, so the unchanged run ends at srcIndex - 1.
appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
dest.append(lead);
if (edits != null) {
edits.addReplace(1, 1);
}
prev = srcIndex;
}
// slow path
int cpStart = srcIndex++;
char trail;
int c;
// Combine a lead surrogate with a following trail surrogate;
// otherwise map the single code unit as is.
if (Character.isHighSurrogate(lead) && srcIndex < srcLength &&
Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
c = Character.toCodePoint(lead, trail);
++srcIndex;
} else {
c = lead;
}
// Position the context iterator on the current code point before the full mapping.
if (iter == null) {
iter = new StringContextIterator(src, cpStart, srcIndex);
} else {
iter.setCPStartAndLimit(cpStart, srcIndex);
}
c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
// A negative result is treated as "no change" here: nothing is appended and
// the code point stays in the pending unchanged run.
if (c >= 0) {
appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
appendResult(c, dest, srcIndex - cpStart, options, edits);
prev = srcIndex;
}
}
// Flush the unchanged text after the last mapped piece.
appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
}
public static String toLower(int caseLocale, int options, CharSequence src) {
@ -432,8 +601,7 @@ public final class CaseMapImpl {
if (edits != null) {
edits.reset();
}
StringContextIterator iter = new StringContextIterator(src);
internalToLower(caseLocale, options, iter, dest, edits);
internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits);
return dest;
} catch (IOException e) {
throw new ICUUncheckedIOException(e);
@ -466,12 +634,7 @@ public final class CaseMapImpl {
if (caseLocale == UCaseProps.LOC_GREEK) {
return GreekUpper.toUpper(options, src, dest, edits);
}
StringContextIterator iter = new StringContextIterator(src);
int c;
while ((c = iter.nextCaseMapCP()) >= 0) {
c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
appendResult(c, dest, iter.getCPLength(), options, edits);
}
internalToUpper(caseLocale, options, src, dest, edits);
return dest;
} catch (IOException e) {
throw new ICUUncheckedIOException(e);
@ -589,12 +752,13 @@ public final class CaseMapImpl {
if(titleLimit<index) {
if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) {
// Normal operation: Lowercase the rest of the word.
internalToLower(caseLocale, options, iter, dest, edits);
internalToLower(caseLocale, options,
src, titleLimit, index, iter, dest, edits);
} else {
// Optionally just copy the rest of the word unchanged.
appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits);
iter.moveToLimit();
}
iter.moveToLimit();
}
}
}
@ -629,14 +793,7 @@ public final class CaseMapImpl {
if (edits != null) {
edits.reset();
}
int length = src.length();
for (int i = 0; i < length;) {
int c = Character.codePointAt(src, i);
int cpLength = Character.charCount(c);
i += cpLength;
c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
appendResult(c, dest, cpLength, options, edits);
}
internalToLower(-1, options, src, 0, src.length(), null, dest, edits);
return dest;
} catch (IOException e) {
throw new ICUUncheckedIOException(e);

View file

@ -115,7 +115,7 @@ public final class UCaseProps {
return props>>EXC_SHIFT;
}
private static final boolean propsHasException(int props) {
static final boolean propsHasException(int props) {
return (props&EXCEPTION)!=0;
}
@ -187,7 +187,7 @@ public final class UCaseProps {
public final int tolower(int c) {
int props=trie.get(c);
if(!propsHasException(props)) {
if(getTypeFromProps(props)>=UPPER) {
if(isUpperOrTitleFromProps(props)) {
c+=getDelta(props);
}
} else {
@ -591,6 +591,153 @@ public final class UCaseProps {
public int next();
}
/**
 * Fast case mapping data for ASCII/Latin.
 * Linear arrays of delta bytes: 0=no mapping; EXC=exception.
 * Deltas must not cross the ASCII boundary, or else they cannot be easily used
 * in simple UTF-8 code.
 *
 * <p>Each table is indexed by code point and is written 16 entries per row,
 * 24 rows, for 0x180 entries in total (U+0000..U+017F).
 * For an entry that is neither 0 nor EXC, the mapped code point is the
 * original code point plus the entry. Code using a table must test for EXC
 * before applying the value as a delta.
 */
static final class LatinCase {
/** Case mapping/folding data for code points up to U+017F. */
static final char LIMIT = 0x180;
/** U+017F case-folds and uppercases crossing the ASCII boundary. */
static final char LONG_S = 0x17f;
/** Exception: Complex mapping, or too-large delta. */
// -0x80 is the smallest byte value, so it cannot collide with the deltas used in the tables.
static final byte EXC = -0x80;
/** Deltas for lowercasing for most locales, and default case folding. */
// Positive deltas: +32 for U+0041..U+005A and most of U+00C0..U+00DE, +1 for the
// alternating upper/lower pairs from U+0100 on, and -121 at U+0178 (maps to U+00FF).
// EXC entries: U+00B5, U+00DF, U+0130, U+0149, U+017F.
static final byte[] TO_LOWER_NORMAL = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};
/** Deltas for lowercasing for tr/az/lt, and Turkic case folding. */
// Same as TO_LOWER_NORMAL except that these entries are EXC instead of a delta:
// U+0049, U+004A, U+00CC, U+00CD, U+0128, U+012E.
static final byte[] TO_LOWER_TR_LT = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};
/** Deltas for uppercasing for most locales. */
// Negative deltas: -32 for U+0061..U+007A and most of U+00E0..U+00FE, -1 for the
// alternating upper/lower pairs from U+0100 on, and +121 at U+00FF (maps to U+0178).
// EXC entries: U+00B5, U+00DF, U+0131, U+0149, U+017F.
static final byte[] TO_UPPER_NORMAL = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
-32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
-1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};
/** Deltas for uppercasing for tr/az. */
// Same as TO_UPPER_NORMAL except that U+0069 is EXC instead of -32.
static final byte[] TO_UPPER_TR = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
-32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
-1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};
}
/**
* For string case mappings, a single character (a code point) is mapped
* either to itself (in which case in-place mapping functions do nothing),
@ -609,8 +756,8 @@ public final class UCaseProps {
//ivate static final int LOC_UNKNOWN=0;
public static final int LOC_ROOT=1;
private static final int LOC_TURKISH=2;
private static final int LOC_LITHUANIAN=3;
static final int LOC_TURKISH=2;
static final int LOC_LITHUANIAN=3;
static final int LOC_GREEK=4;
public static final int LOC_DUTCH=5;
@ -823,7 +970,7 @@ public final class UCaseProps {
result=c;
props=trie.get(c);
if(!propsHasException(props)) {
if(getTypeFromProps(props)>=UPPER) {
if(isUpperOrTitleFromProps(props)) {
result=c+getDelta(props);
}
} else {
@ -1132,13 +1279,13 @@ public final class UCaseProps {
*
* @internal
*/
private static final int FOLD_CASE_OPTIONS_MASK = 7;
static final int FOLD_CASE_OPTIONS_MASK = 7;
/* return the simple case folding mapping for c */
public final int fold(int c, int options) {
int props=trie.get(c);
if(!propsHasException(props)) {
if(getTypeFromProps(props)>=UPPER) {
if(isUpperOrTitleFromProps(props)) {
c+=getDelta(props);
}
} else {
@ -1201,7 +1348,7 @@ public final class UCaseProps {
result=c;
props=trie.get(c);
if(!propsHasException(props)) {
if(getTypeFromProps(props)>=UPPER) {
if(isUpperOrTitleFromProps(props)) {
result=c+getDelta(props);
}
} else {
@ -1361,6 +1508,10 @@ public final class UCaseProps {
// definitions for 16-bit case properties word ------------------------- ***
/**
 * Gives package-level code direct access to the trie of {@code INSTANCE}.
 *
 * @return the {@code trie} field of {@code INSTANCE}
 */
static Trie2_16 getTrie() {
    final Trie2_16 caseTrie = INSTANCE.trie;
    return caseTrie;
}
/* 2-bit constants for types of cased characters */
public static final int TYPE_MASK=3;
public static final int NONE=0;
@ -1369,7 +1520,7 @@ public final class UCaseProps {
public static final int TITLE=3;
/** @return NONE, LOWER, UPPER, TITLE */
private static final int getTypeFromProps(int props) {
static final int getTypeFromProps(int props) {
return props&TYPE_MASK;
}
@ -1378,6 +1529,10 @@ public final class UCaseProps {
return props&7;
}
/**
 * Tests bit 1 (value 2) of a case properties word.
 * Within the 2-bit type field (TYPE_MASK=3) that bit is set exactly for the
 * type values 2 and 3; TITLE is 3, and 2 is presumably UPPER (confirm against
 * the type constants).
 *
 * @param props a case properties word
 * @return true if bit 1 of {@code props} is set
 */
static final boolean isUpperOrTitleFromProps(int props) {
    final int upperOrTitleBit = 2;
    final int masked = props & upperOrTitleBit;
    return masked != 0;
}
static final int IGNORABLE=4;
private static final int SENSITIVE= 8;
private static final int EXCEPTION= 0x10;
@ -1394,7 +1549,7 @@ public final class UCaseProps {
//private static final int MAX_DELTA= 0xff;
//private static final int MIN_DELTA= (-MAX_DELTA-1);
private static final int getDelta(int props) {
static final int getDelta(int props) {
return (short)props>>DELTA_SHIFT;
}

View file

@ -480,19 +480,17 @@ public final class UCharacterCaseTest extends TestFmwk
}
}
else {
if (!SPECIAL_DATA_[j + 1].equals(
UCharacter.toLowerCase(str))) {
String lower = UCharacter.toLowerCase(str);
if (!SPECIAL_DATA_[j + 1].equals(lower)) {
errln("error lowercasing special characters " +
hex(str) + " expected " + SPECIAL_DATA_[j + 1] +
" but got " +
hex(UCharacter.toLowerCase(locale, str)));
" but got " + hex(lower));
}
if (!SPECIAL_DATA_[j + 2].equals(
UCharacter.toUpperCase(locale, str))) {
String upper = UCharacter.toUpperCase(str);
if (!SPECIAL_DATA_[j + 2].equals(upper)) {
errln("error uppercasing special characters " +
hex(str) + " expected " + SPECIAL_DATA_[j + 2] +
" but got " +
hex(UCharacter.toUpperCase(locale, str)));
" but got " + hex(upper));
}
}
}