mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-12647 make string case mapping functions faster
X-SVN-Rev: 40921
This commit is contained in:
parent
a3d84405e5
commit
e8bb1bb9c2
9 changed files with 1014 additions and 199 deletions
|
@ -92,20 +92,16 @@ ByteSinkUtil::appendTwoBytes(UChar32 c, ByteSink &sink) {
|
|||
sink.Append(s8, 2);
|
||||
}
|
||||
|
||||
UBool
|
||||
ByteSinkUtil::appendUnchanged(const uint8_t *s, int32_t length,
|
||||
ByteSink &sink, uint32_t options, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return FALSE; }
|
||||
if (length > 0) {
|
||||
if (edits != nullptr) {
|
||||
edits->addUnchanged(length);
|
||||
}
|
||||
if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
|
||||
sink.Append(reinterpret_cast<const char *>(s), length);
|
||||
}
|
||||
void
|
||||
ByteSinkUtil::appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
|
||||
ByteSink &sink, uint32_t options, Edits *edits) {
|
||||
U_ASSERT(length > 0);
|
||||
if (edits != nullptr) {
|
||||
edits->addUnchanged(length);
|
||||
}
|
||||
if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
|
||||
sink.Append(reinterpret_cast<const char *>(s), length);
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
UBool
|
||||
|
@ -117,7 +113,11 @@ ByteSinkUtil::appendUnchanged(const uint8_t *s, const uint8_t *limit,
|
|||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
return appendUnchanged(s, (int32_t)(limit - s), sink, options, edits, errorCode);
|
||||
int32_t length = (int32_t)(limit - s);
|
||||
if (length > 0) {
|
||||
appendNonEmptyUnchanged(s, length, sink, options, edits);
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -43,11 +43,19 @@ public:
|
|||
|
||||
static UBool appendUnchanged(const uint8_t *s, int32_t length,
|
||||
ByteSink &sink, uint32_t options, Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return FALSE; }
|
||||
if (length > 0) { appendNonEmptyUnchanged(s, length, sink, options, edits); }
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static UBool appendUnchanged(const uint8_t *s, const uint8_t *limit,
|
||||
ByteSink &sink, uint32_t options, Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
private:
|
||||
static void appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
|
||||
ByteSink &sink, uint32_t options, Edits *edits);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -77,9 +77,12 @@ ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
|
|||
|
||||
/* data access primitives --------------------------------------------------- */
|
||||
|
||||
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
|
||||
/** Returns the address of the UTrie2 stored in ucase_props_singleton. */
U_CFUNC const UTrie2 * U_EXPORT2
ucase_getTrie() {
    const UTrie2 *caseTrie=&ucase_props_singleton.trie;
    return caseTrie;
}
|
||||
|
||||
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
|
||||
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
|
||||
|
||||
/* number of bits in an 8-bit integer value */
|
||||
static const uint8_t flagsOffset[256]={
|
||||
|
@ -128,8 +131,8 @@ static const uint8_t flagsOffset[256]={
|
|||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_tolower(UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_IS_UPPER_OR_TITLE(props)) {
|
||||
c+=UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
|
@ -145,7 +148,7 @@ ucase_tolower(UChar32 c) {
|
|||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_toupper(UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
c+=UCASE_GET_DELTA(props);
|
||||
}
|
||||
|
@ -162,7 +165,7 @@ ucase_toupper(UChar32 c) {
|
|||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_totitle(UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
c+=UCASE_GET_DELTA(props);
|
||||
}
|
||||
|
@ -223,7 +226,7 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
|
|||
}
|
||||
|
||||
props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
|
||||
/* add the one simple case mapping, no matter what type it is */
|
||||
int32_t delta=UCASE_GET_DELTA(props);
|
||||
|
@ -419,6 +422,138 @@ FullCaseFoldingIterator::next(UnicodeString &full) {
|
|||
return c;
|
||||
}
|
||||
|
||||
namespace LatinCase {
|
||||
|
||||
const int8_t TO_LOWER_NORMAL[LIMIT] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
|
||||
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
|
||||
32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
|
||||
|
||||
0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
|
||||
};
|
||||
|
||||
const int8_t TO_LOWER_TR_LT[LIMIT] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
|
||||
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
|
||||
32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
|
||||
EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
|
||||
|
||||
0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
|
||||
};
|
||||
|
||||
const int8_t TO_UPPER_NORMAL[LIMIT] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
||||
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
|
||||
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
||||
-32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
|
||||
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
|
||||
|
||||
-1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
|
||||
};
|
||||
|
||||
const int8_t TO_UPPER_TR[LIMIT] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
|
||||
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
|
||||
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
||||
-32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
|
||||
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
|
||||
|
||||
-1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
|
||||
};
|
||||
|
||||
} // namespace LatinCase
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
|
||||
|
@ -439,7 +574,7 @@ ucase_getTypeOrIgnorable(UChar32 c) {
|
|||
static inline int32_t
|
||||
getDotType(UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
return props&UCASE_DOT_MASK;
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
|
@ -878,8 +1013,8 @@ ucase_toFullLower(UChar32 c,
|
|||
U_ASSERT(c >= 0);
|
||||
UChar32 result=c;
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_IS_UPPER_OR_TITLE(props)) {
|
||||
result=c+UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
|
@ -1024,7 +1159,7 @@ toUpperOrTitle(UChar32 c,
|
|||
U_ASSERT(c >= 0);
|
||||
UChar32 result=c;
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
result=c+UCASE_GET_DELTA(props);
|
||||
}
|
||||
|
@ -1169,8 +1304,8 @@ ucase_toFullTitle(UChar32 c,
|
|||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_fold(UChar32 c, uint32_t options) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_IS_UPPER_OR_TITLE(props)) {
|
||||
c+=UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
|
@ -1234,8 +1369,8 @@ ucase_toFullFolding(UChar32 c,
|
|||
U_ASSERT(c >= 0);
|
||||
UChar32 result=c;
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_IS_UPPER_OR_TITLE(props)) {
|
||||
result=c+UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include "putilimp.h"
|
||||
#include "uset_imp.h"
|
||||
#include "udataswp.h"
|
||||
#include "utrie2.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
U_NAMESPACE_BEGIN
|
||||
|
@ -148,6 +149,33 @@ private:
|
|||
int32_t rowCpIndex;
|
||||
};
|
||||
|
||||
/**
|
||||
* Fast case mapping data for ASCII/Latin.
|
||||
* Linear arrays of delta bytes: 0=no mapping; EXC=exception.
|
||||
* Deltas must not cross the ASCII boundary, or else they cannot be easily used
|
||||
* in simple UTF-8 code.
|
||||
*/
|
||||
namespace LatinCase {
|
||||
|
||||
/** Case mapping/folding data for code points up to U+017F. */
|
||||
constexpr UChar LIMIT = 0x180;
|
||||
/** U+017F case-folds and uppercases crossing the ASCII boundary. */
|
||||
constexpr UChar LONG_S = 0x17f;
|
||||
/** Exception: Complex mapping, or too-large delta. */
|
||||
constexpr int8_t EXC = -0x80;
|
||||
|
||||
/** Deltas for lowercasing for most locales, and default case folding. */
|
||||
extern const int8_t TO_LOWER_NORMAL[LIMIT];
|
||||
/** Deltas for lowercasing for tr/az/lt, and Turkic case folding. */
|
||||
extern const int8_t TO_LOWER_TR_LT[LIMIT];
|
||||
|
||||
/** Deltas for uppercasing for most locales. */
|
||||
extern const int8_t TO_UPPER_NORMAL[LIMIT];
|
||||
/** Deltas for uppercasing for tr/az. */
|
||||
extern const int8_t TO_UPPER_TR[LIMIT];
|
||||
|
||||
} // namespace LatinCase
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif
|
||||
|
||||
|
@ -308,6 +336,9 @@ enum {
|
|||
|
||||
/* definitions for 16-bit case properties word ------------------------------ */
|
||||
|
||||
U_CFUNC const UTrie2 * U_EXPORT2
|
||||
ucase_getTrie();
|
||||
|
||||
/* 2-bit constants for types of cased characters */
|
||||
#define UCASE_TYPE_MASK 3
|
||||
enum {
|
||||
|
@ -320,10 +351,14 @@ enum {
|
|||
#define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
|
||||
#define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7)
|
||||
|
||||
/*
 * Nonzero when the bit with value 2 is set in the properties word.
 * NOTE(review): with the 2-bit type field (UCASE_TYPE_MASK 3) this presumably
 * selects exactly UCASE_UPPER and UCASE_TITLE -- confirm against the type enum.
 */
#define UCASE_IS_UPPER_OR_TITLE(props) ((props)&2)
|
||||
|
||||
#define UCASE_IGNORABLE 4
|
||||
#define UCASE_SENSITIVE 8
|
||||
#define UCASE_EXCEPTION 0x10
|
||||
|
||||
/* Nonzero when the UCASE_EXCEPTION bit (0x10) is set in the properties word. */
#define UCASE_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
|
||||
|
||||
#define UCASE_DOT_MASK 0x60
|
||||
enum {
|
||||
UCASE_NO_DOT=0, /* normal characters with cc=0 */
|
||||
|
|
|
@ -165,9 +165,7 @@ appendResult(int32_t cpLength, int32_t result, const UChar *s,
|
|||
/** Returns c shifted right by six bits, OR-ed with 0xc0, truncated to one byte. */
inline uint8_t getTwoByteLead(UChar32 c) {
    const UChar32 upperBits = c >> 6;
    return static_cast<uint8_t>(upperBits | 0xc0);
}
|
||||
/** Returns the low six bits of c, OR-ed with 0x80, as one byte. */
inline uint8_t getTwoByteTrail(UChar32 c) {
    const UChar32 lowSixBits = c & 0x3f;
    return static_cast<uint8_t>(lowSixBits | 0x80);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
static UChar32 U_CALLCONV
|
||||
UChar32 U_CALLCONV
|
||||
utf8_caseContextIterator(void *context, int8_t dir) {
|
||||
UCaseContext *csc=(UCaseContext *)context;
|
||||
UChar32 c;
|
||||
|
@ -199,36 +197,227 @@ utf8_caseContextIterator(void *context, int8_t dir) {
|
|||
return U_SENTINEL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Case-maps [srcStart..srcLimit[ but takes
|
||||
* context [0..srcLength[ into account.
|
||||
/**
|
||||
* caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
|
||||
* caseLocale < 0: Case-folds [srcStart..srcLimit[.
|
||||
*/
|
||||
static void
|
||||
_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
|
||||
const uint8_t *src, UCaseContext *csc,
|
||||
int32_t srcStart, int32_t srcLimit,
|
||||
icu::ByteSink &sink, icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
/* case mapping loop */
|
||||
int32_t srcIndex=srcStart;
|
||||
while (U_SUCCESS(errorCode) && srcIndex<srcLimit) {
|
||||
void toLower(int32_t caseLocale, uint32_t options,
|
||||
const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
|
||||
icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
|
||||
const int8_t *latinToLower;
|
||||
if (caseLocale == UCASE_LOC_ROOT ||
|
||||
(caseLocale >= 0 ?
|
||||
!(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
|
||||
(options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
|
||||
latinToLower = LatinCase::TO_LOWER_NORMAL;
|
||||
} else {
|
||||
latinToLower = LatinCase::TO_LOWER_TR_LT;
|
||||
}
|
||||
const UTrie2 *trie = ucase_getTrie();
|
||||
int32_t prev = srcStart;
|
||||
int32_t srcIndex = srcStart;
|
||||
for (;;) {
|
||||
// fast path for simple cases
|
||||
int32_t cpStart;
|
||||
csc->cpStart=cpStart=srcIndex;
|
||||
UChar32 c;
|
||||
U8_NEXT(src, srcIndex, srcLimit, c);
|
||||
csc->cpLimit=srcIndex;
|
||||
if(c<0) {
|
||||
// Malformed UTF-8.
|
||||
ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
|
||||
for (;;) {
|
||||
if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
|
||||
c = U_SENTINEL;
|
||||
break;
|
||||
}
|
||||
uint8_t lead = src[srcIndex++];
|
||||
if (lead <= 0x7f) {
|
||||
int8_t d = latinToLower[lead];
|
||||
if (d == LatinCase::EXC) {
|
||||
cpStart = srcIndex - 1;
|
||||
c = lead;
|
||||
break;
|
||||
}
|
||||
if (d == 0) { continue; }
|
||||
ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
|
||||
sink, options, edits, errorCode);
|
||||
char ascii = (char)(lead + d);
|
||||
sink.Append(&ascii, 1);
|
||||
if (edits != nullptr) {
|
||||
edits->addReplace(1, 1);
|
||||
}
|
||||
prev = srcIndex;
|
||||
continue;
|
||||
} else if (lead < 0xe3) {
|
||||
uint8_t t;
|
||||
if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
|
||||
(t = src[srcIndex] - 0x80) <= 0x3f) {
|
||||
// U+0080..U+017F
|
||||
++srcIndex;
|
||||
c = ((lead - 0xc0) << 6) | t;
|
||||
int8_t d = latinToLower[c];
|
||||
if (d == LatinCase::EXC) {
|
||||
cpStart = srcIndex - 2;
|
||||
break;
|
||||
}
|
||||
if (d == 0) { continue; }
|
||||
ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
|
||||
sink, options, edits, errorCode);
|
||||
ByteSinkUtil::appendTwoBytes(c + d, sink);
|
||||
if (edits != nullptr) {
|
||||
edits->addReplace(2, 2);
|
||||
}
|
||||
prev = srcIndex;
|
||||
continue;
|
||||
}
|
||||
} else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
|
||||
(srcIndex + 2) <= srcLimit &&
|
||||
U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
|
||||
// most of CJK: no case mappings
|
||||
srcIndex += 2;
|
||||
continue;
|
||||
}
|
||||
cpStart = --srcIndex;
|
||||
U8_NEXT(src, srcIndex, srcLimit, c);
|
||||
if (c < 0) {
|
||||
// ill-formed UTF-8
|
||||
continue;
|
||||
}
|
||||
uint16_t props = UTRIE2_GET16(trie, c);
|
||||
if (UCASE_HAS_EXCEPTION(props)) { break; }
|
||||
int32_t delta;
|
||||
if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
|
||||
continue;
|
||||
}
|
||||
ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
|
||||
sink, options, edits, errorCode);
|
||||
ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
|
||||
prev = srcIndex;
|
||||
}
|
||||
if (c < 0) {
|
||||
break;
|
||||
}
|
||||
// slow path
|
||||
const UChar *s;
|
||||
if (caseLocale >= 0) {
|
||||
csc->cpStart = cpStart;
|
||||
csc->cpLimit = srcIndex;
|
||||
c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
|
||||
} else {
|
||||
const UChar *s;
|
||||
c=map(c, utf8_caseContextIterator, csc, &s, caseLocale);
|
||||
c = ucase_toFullFolding(c, &s, options);
|
||||
}
|
||||
if (c >= 0) {
|
||||
ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
|
||||
sink, options, edits, errorCode);
|
||||
appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
|
||||
prev = srcIndex;
|
||||
}
|
||||
}
|
||||
ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
|
||||
sink, options, edits, errorCode);
|
||||
}
|
||||
|
||||
void toUpper(int32_t caseLocale, uint32_t options,
|
||||
const uint8_t *src, UCaseContext *csc, int32_t srcLength,
|
||||
icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
|
||||
const int8_t *latinToUpper;
|
||||
if (caseLocale == UCASE_LOC_TURKISH) {
|
||||
latinToUpper = LatinCase::TO_UPPER_TR;
|
||||
} else {
|
||||
latinToUpper = LatinCase::TO_UPPER_NORMAL;
|
||||
}
|
||||
const UTrie2 *trie = ucase_getTrie();
|
||||
int32_t prev = 0;
|
||||
int32_t srcIndex = 0;
|
||||
for (;;) {
|
||||
// fast path for simple cases
|
||||
int32_t cpStart;
|
||||
UChar32 c;
|
||||
for (;;) {
|
||||
if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
|
||||
c = U_SENTINEL;
|
||||
break;
|
||||
}
|
||||
uint8_t lead = src[srcIndex++];
|
||||
if (lead <= 0x7f) {
|
||||
int8_t d = latinToUpper[lead];
|
||||
if (d == LatinCase::EXC) {
|
||||
cpStart = srcIndex - 1;
|
||||
c = lead;
|
||||
break;
|
||||
}
|
||||
if (d == 0) { continue; }
|
||||
ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
|
||||
sink, options, edits, errorCode);
|
||||
char ascii = (char)(lead + d);
|
||||
sink.Append(&ascii, 1);
|
||||
if (edits != nullptr) {
|
||||
edits->addReplace(1, 1);
|
||||
}
|
||||
prev = srcIndex;
|
||||
continue;
|
||||
} else if (lead < 0xe3) {
|
||||
uint8_t t;
|
||||
if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
|
||||
(t = src[srcIndex] - 0x80) <= 0x3f) {
|
||||
// U+0080..U+017F
|
||||
++srcIndex;
|
||||
c = ((lead - 0xc0) << 6) | t;
|
||||
int8_t d = latinToUpper[c];
|
||||
if (d == LatinCase::EXC) {
|
||||
cpStart = srcIndex - 2;
|
||||
break;
|
||||
}
|
||||
if (d == 0) { continue; }
|
||||
ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
|
||||
sink, options, edits, errorCode);
|
||||
ByteSinkUtil::appendTwoBytes(c + d, sink);
|
||||
if (edits != nullptr) {
|
||||
edits->addReplace(2, 2);
|
||||
}
|
||||
prev = srcIndex;
|
||||
continue;
|
||||
}
|
||||
} else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
|
||||
(srcIndex + 2) <= srcLength &&
|
||||
U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
|
||||
// most of CJK: no case mappings
|
||||
srcIndex += 2;
|
||||
continue;
|
||||
}
|
||||
cpStart = --srcIndex;
|
||||
U8_NEXT(src, srcIndex, srcLength, c);
|
||||
if (c < 0) {
|
||||
// ill-formed UTF-8
|
||||
continue;
|
||||
}
|
||||
uint16_t props = UTRIE2_GET16(trie, c);
|
||||
if (UCASE_HAS_EXCEPTION(props)) { break; }
|
||||
int32_t delta;
|
||||
if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
|
||||
continue;
|
||||
}
|
||||
ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
|
||||
sink, options, edits, errorCode);
|
||||
ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
|
||||
prev = srcIndex;
|
||||
}
|
||||
if (c < 0) {
|
||||
break;
|
||||
}
|
||||
// slow path
|
||||
csc->cpStart = cpStart;
|
||||
csc->cpLimit = srcIndex;
|
||||
const UChar *s;
|
||||
c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
|
||||
if (c >= 0) {
|
||||
ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
|
||||
sink, options, edits, errorCode);
|
||||
appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
|
||||
prev = srcIndex;
|
||||
}
|
||||
}
|
||||
ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
|
||||
sink, options, edits, errorCode);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
U_CFUNC void U_CALLCONV
|
||||
|
@ -335,10 +524,9 @@ ucasemap_internalUTF8ToTitle(
|
|||
if(titleLimit<index) {
|
||||
if((options&U_TITLECASE_NO_LOWERCASE)==0) {
|
||||
/* Normal operation: Lowercase the rest of the word. */
|
||||
_caseMap(caseLocale, options, ucase_toFullLower,
|
||||
src, &csc,
|
||||
titleLimit, index,
|
||||
sink, edits, errorCode);
|
||||
toLower(caseLocale, options,
|
||||
src, &csc, titleLimit, index,
|
||||
sink, edits, errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
|
@ -538,8 +726,8 @@ ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREA
|
|||
UCaseContext csc=UCASECONTEXT_INITIALIZER;
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
_caseMap(
|
||||
caseLocale, options, ucase_toFullLower,
|
||||
toLower(
|
||||
caseLocale, options,
|
||||
src, &csc, 0, srcLength,
|
||||
sink, edits, errorCode);
|
||||
}
|
||||
|
@ -555,9 +743,9 @@ ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREA
|
|||
UCaseContext csc=UCASECONTEXT_INITIALIZER;
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
_caseMap(
|
||||
caseLocale, options, ucase_toFullUpper,
|
||||
src, &csc, 0, srcLength,
|
||||
toUpper(
|
||||
caseLocale, options,
|
||||
src, &csc, srcLength,
|
||||
sink, edits, errorCode);
|
||||
}
|
||||
}
|
||||
|
@ -567,22 +755,10 @@ ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_B
|
|||
const uint8_t *src, int32_t srcLength,
|
||||
icu::ByteSink &sink, icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
/* case mapping loop */
|
||||
int32_t srcIndex = 0;
|
||||
while (U_SUCCESS(errorCode) && srcIndex < srcLength) {
|
||||
int32_t cpStart = srcIndex;
|
||||
UChar32 c;
|
||||
U8_NEXT(src, srcIndex, srcLength, c);
|
||||
if(c<0) {
|
||||
// Malformed UTF-8.
|
||||
ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
|
||||
sink, options, edits, errorCode);
|
||||
} else {
|
||||
const UChar *s;
|
||||
c = ucase_toFullFolding(c, &s, options);
|
||||
appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
|
||||
}
|
||||
}
|
||||
toLower(
|
||||
-1, options,
|
||||
src, nullptr, 0, srcLength,
|
||||
sink, edits, errorCode);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -52,16 +52,8 @@ int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
|
|||
return destIndex;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
/* string casing ------------------------------------------------------------ */
|
||||
|
||||
/* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
|
||||
static inline int32_t
|
||||
inline int32_t
|
||||
appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
|
||||
int32_t result, const UChar *s,
|
||||
int32_t cpLength, uint32_t options, icu::Edits *edits) {
|
||||
|
@ -134,7 +126,7 @@ appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
|
|||
return destIndex;
|
||||
}
|
||||
|
||||
static inline int32_t
|
||||
inline int32_t
|
||||
appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
|
||||
if(destIndex<destCapacity) {
|
||||
dest[destIndex]=c;
|
||||
|
@ -144,28 +136,34 @@ appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
|
|||
return destIndex+1;
|
||||
}
|
||||
|
||||
static inline int32_t
|
||||
appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
|
||||
const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
|
||||
if(length>0) {
|
||||
if(edits!=NULL) {
|
||||
edits->addUnchanged(length);
|
||||
}
|
||||
if(options & U_OMIT_UNCHANGED_TEXT) {
|
||||
return destIndex;
|
||||
}
|
||||
if(length>(INT32_MAX-destIndex)) {
|
||||
return -1; // integer overflow
|
||||
}
|
||||
if((destIndex+length)<=destCapacity) {
|
||||
u_memcpy(dest+destIndex, s, length);
|
||||
}
|
||||
destIndex+=length;
|
||||
int32_t
|
||||
appendNonEmptyUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
|
||||
const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
|
||||
if(edits!=NULL) {
|
||||
edits->addUnchanged(length);
|
||||
}
|
||||
return destIndex;
|
||||
if(options & U_OMIT_UNCHANGED_TEXT) {
|
||||
return destIndex;
|
||||
}
|
||||
if(length>(INT32_MAX-destIndex)) {
|
||||
return -1; // integer overflow
|
||||
}
|
||||
if((destIndex+length)<=destCapacity) {
|
||||
u_memcpy(dest+destIndex, s, length);
|
||||
}
|
||||
return destIndex + length;
|
||||
}
|
||||
|
||||
static UChar32 U_CALLCONV
|
||||
inline int32_t
|
||||
appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
|
||||
const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
|
||||
if (length <= 0) {
|
||||
return destIndex;
|
||||
}
|
||||
return appendNonEmptyUnchanged(dest, destIndex, destCapacity, s, length, options, edits);
|
||||
}
|
||||
|
||||
UChar32 U_CALLCONV
|
||||
utf16_caseContextIterator(void *context, int8_t dir) {
|
||||
UCaseContext *csc=(UCaseContext *)context;
|
||||
UChar32 c;
|
||||
|
@ -197,39 +195,205 @@ utf16_caseContextIterator(void *context, int8_t dir) {
|
|||
return U_SENTINEL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Case-maps [srcStart..srcLimit[ but takes
|
||||
* context [0..srcLength[ into account.
|
||||
/**
|
||||
* caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
|
||||
* caseLocale < 0: Case-folds [srcStart..srcLimit[.
|
||||
*/
|
||||
static int32_t
|
||||
_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, UCaseContext *csc,
|
||||
int32_t srcStart, int32_t srcLimit,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
/* case mapping loop */
|
||||
int32_t srcIndex=srcStart;
|
||||
int32_t destIndex=0;
|
||||
while(srcIndex<srcLimit) {
|
||||
int32_t cpStart;
|
||||
csc->cpStart=cpStart=srcIndex;
|
||||
int32_t toLower(int32_t caseLocale, uint32_t options,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
|
||||
icu::Edits *edits, UErrorCode &errorCode) {
|
||||
const int8_t *latinToLower;
|
||||
if (caseLocale == UCASE_LOC_ROOT ||
|
||||
(caseLocale >= 0 ?
|
||||
!(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
|
||||
(options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
|
||||
latinToLower = LatinCase::TO_LOWER_NORMAL;
|
||||
} else {
|
||||
latinToLower = LatinCase::TO_LOWER_TR_LT;
|
||||
}
|
||||
const UTrie2 *trie = ucase_getTrie();
|
||||
int32_t destIndex = 0;
|
||||
int32_t prev = srcStart;
|
||||
int32_t srcIndex = srcStart;
|
||||
for (;;) {
|
||||
// fast path for simple cases
|
||||
UChar lead;
|
||||
while (srcIndex < srcLimit) {
|
||||
lead = src[srcIndex];
|
||||
int32_t delta;
|
||||
if (lead < LatinCase::LONG_S) {
|
||||
int8_t d = latinToLower[lead];
|
||||
if (d == LatinCase::EXC) { break; }
|
||||
++srcIndex;
|
||||
if (d == 0) { continue; }
|
||||
delta = d;
|
||||
} else if (lead >= 0xd800) {
|
||||
break; // surrogate or higher
|
||||
} else {
|
||||
uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
|
||||
if (UCASE_HAS_EXCEPTION(props)) { break; }
|
||||
++srcIndex;
|
||||
if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
lead += delta;
|
||||
destIndex = appendUnchanged(dest, destIndex, destCapacity,
|
||||
src + prev, srcIndex - 1 - prev, options, edits);
|
||||
if (destIndex >= 0) {
|
||||
destIndex = appendUChar(dest, destIndex, destCapacity, lead);
|
||||
if (edits != nullptr) {
|
||||
edits->addReplace(1, 1);
|
||||
}
|
||||
}
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
prev = srcIndex;
|
||||
}
|
||||
if (srcIndex >= srcLimit) {
|
||||
break;
|
||||
}
|
||||
// slow path
|
||||
int32_t cpStart = srcIndex++;
|
||||
UChar trail;
|
||||
UChar32 c;
|
||||
U16_NEXT(src, srcIndex, srcLimit, c);
|
||||
csc->cpLimit=srcIndex;
|
||||
if (U16_IS_LEAD(lead) && srcIndex < srcLimit && U16_IS_TRAIL(trail = src[srcIndex])) {
|
||||
c = U16_GET_SUPPLEMENTARY(lead, trail);
|
||||
++srcIndex;
|
||||
} else {
|
||||
c = lead;
|
||||
}
|
||||
const UChar *s;
|
||||
c=map(c, utf16_caseContextIterator, csc, &s, caseLocale);
|
||||
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
|
||||
srcIndex - cpStart, options, edits);
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
if (caseLocale >= 0) {
|
||||
csc->cpStart = cpStart;
|
||||
csc->cpLimit = srcIndex;
|
||||
c = ucase_toFullLower(c, utf16_caseContextIterator, csc, &s, caseLocale);
|
||||
} else {
|
||||
c = ucase_toFullFolding(c, &s, options);
|
||||
}
|
||||
if (c >= 0) {
|
||||
destIndex = appendUnchanged(dest, destIndex, destCapacity,
|
||||
src + prev, cpStart - prev, options, edits);
|
||||
if (destIndex >= 0) {
|
||||
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
|
||||
srcIndex - cpStart, options, edits);
|
||||
}
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
prev = srcIndex;
|
||||
}
|
||||
}
|
||||
|
||||
destIndex = appendUnchanged(dest, destIndex, destCapacity,
|
||||
src + prev, srcIndex - prev, options, edits);
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
return destIndex;
|
||||
}
|
||||
|
||||
int32_t toUpper(int32_t caseLocale, uint32_t options,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, UCaseContext *csc, int32_t srcLength,
|
||||
icu::Edits *edits, UErrorCode &errorCode) {
|
||||
const int8_t *latinToUpper;
|
||||
if (caseLocale == UCASE_LOC_TURKISH) {
|
||||
latinToUpper = LatinCase::TO_UPPER_TR;
|
||||
} else {
|
||||
latinToUpper = LatinCase::TO_UPPER_NORMAL;
|
||||
}
|
||||
const UTrie2 *trie = ucase_getTrie();
|
||||
int32_t destIndex = 0;
|
||||
int32_t prev = 0;
|
||||
int32_t srcIndex = 0;
|
||||
for (;;) {
|
||||
// fast path for simple cases
|
||||
UChar lead;
|
||||
while (srcIndex < srcLength) {
|
||||
lead = src[srcIndex];
|
||||
int32_t delta;
|
||||
if (lead < LatinCase::LONG_S) {
|
||||
int8_t d = latinToUpper[lead];
|
||||
if (d == LatinCase::EXC) { break; }
|
||||
++srcIndex;
|
||||
if (d == 0) { continue; }
|
||||
delta = d;
|
||||
} else if (lead >= 0xd800) {
|
||||
break; // surrogate or higher
|
||||
} else {
|
||||
uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
|
||||
if (UCASE_HAS_EXCEPTION(props)) { break; }
|
||||
++srcIndex;
|
||||
if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
lead += delta;
|
||||
destIndex = appendUnchanged(dest, destIndex, destCapacity,
|
||||
src + prev, srcIndex - 1 - prev, options, edits);
|
||||
if (destIndex >= 0) {
|
||||
destIndex = appendUChar(dest, destIndex, destCapacity, lead);
|
||||
if (edits != nullptr) {
|
||||
edits->addReplace(1, 1);
|
||||
}
|
||||
}
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
prev = srcIndex;
|
||||
}
|
||||
if (srcIndex >= srcLength) {
|
||||
break;
|
||||
}
|
||||
// slow path
|
||||
int32_t cpStart;
|
||||
csc->cpStart = cpStart = srcIndex++;
|
||||
UChar trail;
|
||||
UChar32 c;
|
||||
if (U16_IS_LEAD(lead) && srcIndex < srcLength && U16_IS_TRAIL(trail = src[srcIndex])) {
|
||||
c = U16_GET_SUPPLEMENTARY(lead, trail);
|
||||
++srcIndex;
|
||||
} else {
|
||||
c = lead;
|
||||
}
|
||||
csc->cpLimit = srcIndex;
|
||||
const UChar *s;
|
||||
c = ucase_toFullUpper(c, utf16_caseContextIterator, csc, &s, caseLocale);
|
||||
if (c >= 0) {
|
||||
destIndex = appendUnchanged(dest, destIndex, destCapacity,
|
||||
src + prev, cpStart - prev, options, edits);
|
||||
if (destIndex >= 0) {
|
||||
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
|
||||
srcIndex - cpStart, options, edits);
|
||||
}
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
prev = srcIndex;
|
||||
}
|
||||
}
|
||||
destIndex = appendUnchanged(dest, destIndex, destCapacity,
|
||||
src + prev, srcIndex - prev, options, edits);
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
return destIndex;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
|
@ -344,11 +508,10 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
|
|||
if((options&U_TITLECASE_NO_LOWERCASE)==0) {
|
||||
/* Normal operation: Lowercase the rest of the word. */
|
||||
destIndex+=
|
||||
_caseMap(
|
||||
caseLocale, options, ucase_toFullLower,
|
||||
toLower(
|
||||
caseLocale, options,
|
||||
dest+destIndex, destCapacity-destIndex,
|
||||
src, &csc,
|
||||
titleLimit, index,
|
||||
src, &csc, titleLimit, index,
|
||||
edits, errorCode);
|
||||
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
|
@ -1013,8 +1176,8 @@ ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_IT
|
|||
UCaseContext csc=UCASECONTEXT_INITIALIZER;
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
int32_t destIndex = _caseMap(
|
||||
caseLocale, options, ucase_toFullLower,
|
||||
int32_t destIndex = toLower(
|
||||
caseLocale, options,
|
||||
dest, destCapacity,
|
||||
src, &csc, 0, srcLength,
|
||||
edits, errorCode);
|
||||
|
@ -1035,10 +1198,10 @@ ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_IT
|
|||
UCaseContext csc=UCASECONTEXT_INITIALIZER;
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
destIndex = _caseMap(
|
||||
caseLocale, options, ucase_toFullUpper,
|
||||
destIndex = toUpper(
|
||||
caseLocale, options,
|
||||
dest, destCapacity,
|
||||
src, &csc, 0, srcLength,
|
||||
src, &csc, srcLength,
|
||||
edits, errorCode);
|
||||
}
|
||||
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
|
||||
|
@ -1050,23 +1213,11 @@ ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK
|
|||
const UChar *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
/* case mapping loop */
|
||||
int32_t srcIndex = 0;
|
||||
int32_t destIndex = 0;
|
||||
while (srcIndex < srcLength) {
|
||||
int32_t cpStart = srcIndex;
|
||||
UChar32 c;
|
||||
U16_NEXT(src, srcIndex, srcLength, c);
|
||||
const UChar *s;
|
||||
c = ucase_toFullFolding(c, &s, options);
|
||||
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
|
||||
srcIndex - cpStart, options, edits);
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t destIndex = toLower(
|
||||
-1, options,
|
||||
dest, destCapacity,
|
||||
src, nullptr, 0, srcLength,
|
||||
edits, errorCode);
|
||||
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
|
||||
}
|
||||
|
||||
|
|
|
@ -30,6 +30,21 @@ public final class CaseMapImpl {
|
|||
dir=0;
|
||||
}
|
||||
|
||||
/**
 * Creates an iterator over the whole of {@code src} that is already
 * positioned on one code point.
 *
 * @param src String to iterate over.
 * @param cpStart Start index of the current code point.
 * @param cpLimit Limit index of the current code point.
 */
public StringContextIterator(CharSequence src, int cpStart, int cpLimit) {
    this.s = src;
    this.index = 0;
    this.limit = src.length();
    this.cpStart = cpStart;
    this.cpLimit = cpLimit;
    this.dir = 0;
}
|
||||
|
||||
/**
|
||||
* Set the iteration limit for nextCaseMapCP() to an index within the string.
|
||||
* If the limit parameter is negative or past the string, then the
|
||||
|
@ -77,6 +92,11 @@ public final class CaseMapImpl {
|
|||
}
|
||||
}
|
||||
|
||||
public void setCPStartAndLimit(int s, int l) {
|
||||
cpStart = s;
|
||||
cpLimit = l;
|
||||
dir = 0;
|
||||
}
|
||||
/**
|
||||
* Returns the start of the code point that was last returned
|
||||
* by nextCaseMapCP().
|
||||
|
@ -400,13 +420,162 @@ public final class CaseMapImpl {
|
|||
return result.toString();
|
||||
}
|
||||
|
||||
private static void internalToLower(int caseLocale, int options, StringContextIterator iter,
|
||||
private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie();
|
||||
|
||||
/**
|
||||
* caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
|
||||
* caseLocale < 0: Case-folds [srcStart..srcLimit[.
|
||||
*/
|
||||
private static void internalToLower(int caseLocale, int options,
|
||||
CharSequence src, int srcStart, int srcLimit, StringContextIterator iter,
|
||||
Appendable dest, Edits edits) throws IOException {
|
||||
int c;
|
||||
while ((c = iter.nextCaseMapCP()) >= 0) {
|
||||
c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
|
||||
appendResult(c, dest, iter.getCPLength(), options, edits);
|
||||
byte[] latinToLower;
|
||||
if (caseLocale == UCaseProps.LOC_ROOT ||
|
||||
(caseLocale >= 0 ?
|
||||
!(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) :
|
||||
(options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) {
|
||||
latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL;
|
||||
} else {
|
||||
latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT;
|
||||
}
|
||||
int prev = srcStart;
|
||||
int srcIndex = srcStart;
|
||||
outerLoop:
|
||||
for (;;) {
|
||||
// fast path for simple cases
|
||||
char lead;
|
||||
for (;;) {
|
||||
if (srcIndex >= srcLimit) {
|
||||
break outerLoop;
|
||||
}
|
||||
lead = src.charAt(srcIndex);
|
||||
int delta;
|
||||
if (lead < UCaseProps.LatinCase.LONG_S) {
|
||||
byte d = latinToLower[lead];
|
||||
if (d == UCaseProps.LatinCase.EXC) { break; }
|
||||
++srcIndex;
|
||||
if (d == 0) { continue; }
|
||||
delta = d;
|
||||
} else if (lead >= 0xd800) {
|
||||
break; // surrogate or higher
|
||||
} else {
|
||||
int props = CASE_TRIE.getFromU16SingleLead(lead);
|
||||
if (UCaseProps.propsHasException(props)) { break; }
|
||||
++srcIndex;
|
||||
if (!UCaseProps.isUpperOrTitleFromProps(props) ||
|
||||
(delta = UCaseProps.getDelta(props)) == 0) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
lead += delta;
|
||||
appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
|
||||
dest.append(lead);
|
||||
if (edits != null) {
|
||||
edits.addReplace(1, 1);
|
||||
}
|
||||
prev = srcIndex;
|
||||
}
|
||||
// slow path
|
||||
int cpStart = srcIndex++;
|
||||
char trail;
|
||||
int c;
|
||||
if (Character.isHighSurrogate(lead) && srcIndex < srcLimit &&
|
||||
Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
|
||||
c = Character.toCodePoint(lead, trail);
|
||||
++srcIndex;
|
||||
} else {
|
||||
c = lead;
|
||||
}
|
||||
if (caseLocale >= 0) {
|
||||
if (iter == null) {
|
||||
iter = new StringContextIterator(src, cpStart, srcIndex);
|
||||
} else {
|
||||
iter.setCPStartAndLimit(cpStart, srcIndex);
|
||||
}
|
||||
c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
|
||||
} else {
|
||||
c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
|
||||
}
|
||||
if (c >= 0) {
|
||||
appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
|
||||
appendResult(c, dest, srcIndex - cpStart, options, edits);
|
||||
prev = srcIndex;
|
||||
}
|
||||
}
|
||||
appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
|
||||
}
|
||||
|
||||
private static void internalToUpper(int caseLocale, int options,
|
||||
CharSequence src, Appendable dest, Edits edits) throws IOException {
|
||||
StringContextIterator iter = null;
|
||||
byte[] latinToUpper;
|
||||
if (caseLocale == UCaseProps.LOC_TURKISH) {
|
||||
latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR;
|
||||
} else {
|
||||
latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL;
|
||||
}
|
||||
int prev = 0;
|
||||
int srcIndex = 0;
|
||||
int srcLength = src.length();
|
||||
outerLoop:
|
||||
for (;;) {
|
||||
// fast path for simple cases
|
||||
char lead;
|
||||
for (;;) {
|
||||
if (srcIndex >= srcLength) {
|
||||
break outerLoop;
|
||||
}
|
||||
lead = src.charAt(srcIndex);
|
||||
int delta;
|
||||
if (lead < UCaseProps.LatinCase.LONG_S) {
|
||||
byte d = latinToUpper[lead];
|
||||
if (d == UCaseProps.LatinCase.EXC) { break; }
|
||||
++srcIndex;
|
||||
if (d == 0) { continue; }
|
||||
delta = d;
|
||||
} else if (lead >= 0xd800) {
|
||||
break; // surrogate or higher
|
||||
} else {
|
||||
int props = CASE_TRIE.getFromU16SingleLead(lead);
|
||||
if (UCaseProps.propsHasException(props)) { break; }
|
||||
++srcIndex;
|
||||
if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER ||
|
||||
(delta = UCaseProps.getDelta(props)) == 0) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
lead += delta;
|
||||
appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
|
||||
dest.append(lead);
|
||||
if (edits != null) {
|
||||
edits.addReplace(1, 1);
|
||||
}
|
||||
prev = srcIndex;
|
||||
}
|
||||
// slow path
|
||||
int cpStart = srcIndex++;
|
||||
char trail;
|
||||
int c;
|
||||
if (Character.isHighSurrogate(lead) && srcIndex < srcLength &&
|
||||
Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
|
||||
c = Character.toCodePoint(lead, trail);
|
||||
++srcIndex;
|
||||
} else {
|
||||
c = lead;
|
||||
}
|
||||
if (iter == null) {
|
||||
iter = new StringContextIterator(src, cpStart, srcIndex);
|
||||
} else {
|
||||
iter.setCPStartAndLimit(cpStart, srcIndex);
|
||||
}
|
||||
c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
|
||||
if (c >= 0) {
|
||||
appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
|
||||
appendResult(c, dest, srcIndex - cpStart, options, edits);
|
||||
prev = srcIndex;
|
||||
}
|
||||
}
|
||||
appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
|
||||
}
|
||||
|
||||
public static String toLower(int caseLocale, int options, CharSequence src) {
|
||||
|
@ -432,8 +601,7 @@ public final class CaseMapImpl {
|
|||
if (edits != null) {
|
||||
edits.reset();
|
||||
}
|
||||
StringContextIterator iter = new StringContextIterator(src);
|
||||
internalToLower(caseLocale, options, iter, dest, edits);
|
||||
internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits);
|
||||
return dest;
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
|
@ -466,12 +634,7 @@ public final class CaseMapImpl {
|
|||
if (caseLocale == UCaseProps.LOC_GREEK) {
|
||||
return GreekUpper.toUpper(options, src, dest, edits);
|
||||
}
|
||||
StringContextIterator iter = new StringContextIterator(src);
|
||||
int c;
|
||||
while ((c = iter.nextCaseMapCP()) >= 0) {
|
||||
c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
|
||||
appendResult(c, dest, iter.getCPLength(), options, edits);
|
||||
}
|
||||
internalToUpper(caseLocale, options, src, dest, edits);
|
||||
return dest;
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
|
@ -589,12 +752,13 @@ public final class CaseMapImpl {
|
|||
if(titleLimit<index) {
|
||||
if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) {
|
||||
// Normal operation: Lowercase the rest of the word.
|
||||
internalToLower(caseLocale, options, iter, dest, edits);
|
||||
internalToLower(caseLocale, options,
|
||||
src, titleLimit, index, iter, dest, edits);
|
||||
} else {
|
||||
// Optionally just copy the rest of the word unchanged.
|
||||
appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits);
|
||||
iter.moveToLimit();
|
||||
}
|
||||
iter.moveToLimit();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -629,14 +793,7 @@ public final class CaseMapImpl {
|
|||
if (edits != null) {
|
||||
edits.reset();
|
||||
}
|
||||
int length = src.length();
|
||||
for (int i = 0; i < length;) {
|
||||
int c = Character.codePointAt(src, i);
|
||||
int cpLength = Character.charCount(c);
|
||||
i += cpLength;
|
||||
c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
|
||||
appendResult(c, dest, cpLength, options, edits);
|
||||
}
|
||||
internalToLower(-1, options, src, 0, src.length(), null, dest, edits);
|
||||
return dest;
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
|
|
|
@ -115,7 +115,7 @@ public final class UCaseProps {
|
|||
return props>>EXC_SHIFT;
|
||||
}
|
||||
|
||||
private static final boolean propsHasException(int props) {
|
||||
static final boolean propsHasException(int props) {
|
||||
return (props&EXCEPTION)!=0;
|
||||
}
|
||||
|
||||
|
@ -187,7 +187,7 @@ public final class UCaseProps {
|
|||
public final int tolower(int c) {
|
||||
int props=trie.get(c);
|
||||
if(!propsHasException(props)) {
|
||||
if(getTypeFromProps(props)>=UPPER) {
|
||||
if(isUpperOrTitleFromProps(props)) {
|
||||
c+=getDelta(props);
|
||||
}
|
||||
} else {
|
||||
|
@ -591,6 +591,153 @@ public final class UCaseProps {
|
|||
public int next();
|
||||
}
|
||||
|
||||
/**
|
||||
* Fast case mapping data for ASCII/Latin.
|
||||
* Linear arrays of delta bytes: 0=no mapping; EXC=exception.
|
||||
* Deltas must not cross the ASCII boundary, or else they cannot be easily used
|
||||
* in simple UTF-8 code.
|
||||
*/
|
||||
static final class LatinCase {
|
||||
/** Case mapping/folding data for code points up to U+017F. */
|
||||
static final char LIMIT = 0x180;
|
||||
/** U+017F case-folds and uppercases crossing the ASCII boundary. */
|
||||
static final char LONG_S = 0x17f;
|
||||
/** Exception: Complex mapping, or too-large delta. */
|
||||
static final byte EXC = -0x80;
|
||||
|
||||
/** Deltas for lowercasing for most locales, and default case folding. */
|
||||
static final byte[] TO_LOWER_NORMAL = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
|
||||
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
|
||||
32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
|
||||
|
||||
0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
|
||||
};
|
||||
|
||||
/** Deltas for lowercasing for tr/az/lt, and Turkic case folding. */
|
||||
static final byte[] TO_LOWER_TR_LT = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
|
||||
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
|
||||
32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
|
||||
EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
|
||||
|
||||
0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
|
||||
1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
|
||||
};
|
||||
|
||||
/** Deltas for uppercasing for most locales. */
|
||||
static final byte[] TO_UPPER_NORMAL = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
||||
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
|
||||
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
||||
-32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
|
||||
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
|
||||
|
||||
-1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
|
||||
};
|
||||
|
||||
/** Deltas for uppercasing for tr/az. */
|
||||
static final byte[] TO_UPPER_TR = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
|
||||
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
|
||||
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
||||
-32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
|
||||
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
|
||||
|
||||
-1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* For string case mappings, a single character (a code point) is mapped
|
||||
* either to itself (in which case in-place mapping functions do nothing),
|
||||
|
@ -609,8 +756,8 @@ public final class UCaseProps {
|
|||
|
||||
//private static final int LOC_UNKNOWN=0;
|
||||
public static final int LOC_ROOT=1;
|
||||
private static final int LOC_TURKISH=2;
|
||||
private static final int LOC_LITHUANIAN=3;
|
||||
static final int LOC_TURKISH=2;
|
||||
static final int LOC_LITHUANIAN=3;
|
||||
static final int LOC_GREEK=4;
|
||||
public static final int LOC_DUTCH=5;
|
||||
|
||||
|
@ -823,7 +970,7 @@ public final class UCaseProps {
|
|||
result=c;
|
||||
props=trie.get(c);
|
||||
if(!propsHasException(props)) {
|
||||
if(getTypeFromProps(props)>=UPPER) {
|
||||
if(isUpperOrTitleFromProps(props)) {
|
||||
result=c+getDelta(props);
|
||||
}
|
||||
} else {
|
||||
|
@ -1132,13 +1279,13 @@ public final class UCaseProps {
|
|||
*
|
||||
* @internal
|
||||
*/
|
||||
private static final int FOLD_CASE_OPTIONS_MASK = 7;
|
||||
static final int FOLD_CASE_OPTIONS_MASK = 7;
|
||||
|
||||
/* return the simple case folding mapping for c */
|
||||
public final int fold(int c, int options) {
|
||||
int props=trie.get(c);
|
||||
if(!propsHasException(props)) {
|
||||
if(getTypeFromProps(props)>=UPPER) {
|
||||
if(isUpperOrTitleFromProps(props)) {
|
||||
c+=getDelta(props);
|
||||
}
|
||||
} else {
|
||||
|
@ -1201,7 +1348,7 @@ public final class UCaseProps {
|
|||
result=c;
|
||||
props=trie.get(c);
|
||||
if(!propsHasException(props)) {
|
||||
if(getTypeFromProps(props)>=UPPER) {
|
||||
if(isUpperOrTitleFromProps(props)) {
|
||||
result=c+getDelta(props);
|
||||
}
|
||||
} else {
|
||||
|
@ -1361,6 +1508,10 @@ public final class UCaseProps {
|
|||
|
||||
// definitions for 16-bit case properties word ------------------------- ***
|
||||
|
||||
/** Returns the trie field of INSTANCE. */
static Trie2_16 getTrie() {
|
||||
return INSTANCE.trie;
|
||||
}
|
||||
|
||||
/* 2-bit constants for types of cased characters */
|
||||
public static final int TYPE_MASK=3;
|
||||
public static final int NONE=0;
|
||||
|
@ -1369,7 +1520,7 @@ public final class UCaseProps {
|
|||
public static final int TITLE=3;
|
||||
|
||||
/** @return NONE, LOWER, UPPER, TITLE */
|
||||
private static final int getTypeFromProps(int props) {
|
||||
static final int getTypeFromProps(int props) {
|
||||
return props&TYPE_MASK;
|
||||
}
|
||||
|
||||
|
@ -1378,6 +1529,10 @@ public final class UCaseProps {
|
|||
return props&7;
|
||||
}
|
||||
|
||||
/**
 * Returns true if the bit with value 2 is set in props.
 * NOTE(review): presumably UPPER and TITLE are exactly the type values with this bit set,
 * making this equivalent to getTypeFromProps(props)>=UPPER; confirm against the type constants.
 */
static final boolean isUpperOrTitleFromProps(int props) {
|
||||
return (props & 2) != 0;
|
||||
}
|
||||
|
||||
static final int IGNORABLE=4;
|
||||
private static final int SENSITIVE= 8;
|
||||
private static final int EXCEPTION= 0x10;
|
||||
|
@ -1394,7 +1549,7 @@ public final class UCaseProps {
|
|||
//private static final int MAX_DELTA= 0xff;
|
||||
//private static final int MIN_DELTA= (-MAX_DELTA-1);
|
||||
|
||||
private static final int getDelta(int props) {
|
||||
static final int getDelta(int props) {
|
||||
return (short)props>>DELTA_SHIFT;
|
||||
}
|
||||
|
||||
|
|
|
@ -480,19 +480,17 @@ public final class UCharacterCaseTest extends TestFmwk
|
|||
}
|
||||
}
|
||||
else {
|
||||
if (!SPECIAL_DATA_[j + 1].equals(
|
||||
UCharacter.toLowerCase(str))) {
|
||||
String lower = UCharacter.toLowerCase(str);
|
||||
if (!SPECIAL_DATA_[j + 1].equals(lower)) {
|
||||
errln("error lowercasing special characters " +
|
||||
hex(str) + " expected " + SPECIAL_DATA_[j + 1] +
|
||||
" but got " +
|
||||
hex(UCharacter.toLowerCase(locale, str)));
|
||||
" but got " + hex(lower));
|
||||
}
|
||||
if (!SPECIAL_DATA_[j + 2].equals(
|
||||
UCharacter.toUpperCase(locale, str))) {
|
||||
String upper = UCharacter.toUpperCase(str);
|
||||
if (!SPECIAL_DATA_[j + 2].equals(upper)) {
|
||||
errln("error uppercasing special characters " +
|
||||
hex(str) + " expected " + SPECIAL_DATA_[j + 2] +
|
||||
" but got " +
|
||||
hex(UCharacter.toUpperCase(locale, str)));
|
||||
" but got " + hex(upper));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue