mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
parent
2864379937
commit
79ab90b5f9
14 changed files with 1228 additions and 391 deletions
icu4c/source
common
test/intltest
icu4j/main
classes/core/src/com/ibm/icu
tests/core/src/com/ibm/icu/dev/test/lang
|
@ -377,22 +377,30 @@ UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
|
|||
|
||||
} // namespace
|
||||
|
||||
U_NAMESPACE_USE
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
U_CAPI const USet * U_EXPORT2
|
||||
u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
|
||||
if (U_FAILURE(*pErrorCode)) { return nullptr; }
|
||||
const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return nullptr; }
|
||||
if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
|
||||
*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return nullptr;
|
||||
}
|
||||
Mutex m(&cpMutex);
|
||||
UnicodeSet *set = sets[property];
|
||||
if (set == nullptr) {
|
||||
sets[property] = set = makeSet(property, *pErrorCode);
|
||||
sets[property] = set = makeSet(property, errorCode);
|
||||
}
|
||||
if (U_FAILURE(*pErrorCode)) { return nullptr; }
|
||||
return set->toUSet();
|
||||
return set;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
// C API entry point for the binary-property set lookup.
// Forwards to CharacterProperties::getBinaryPropertySet() with *pErrorCode
// and converts the resulting UnicodeSet to a USet.
// Returns nullptr when *pErrorCode indicates a failure after the call.
// pErrorCode is dereferenced unconditionally, so it must not be null.
U_CAPI const USet * U_EXPORT2
|
||||
u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
|
||||
const UnicodeSet *set = CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
|
||||
// set may be nullptr on failure, so it is only dereferenced on success.
return U_SUCCESS(*pErrorCode) ? set->toUSet() : nullptr;
|
||||
}
|
||||
|
||||
U_CAPI const UCPMap * U_EXPORT2
|
||||
|
|
|
@ -205,37 +205,7 @@ static const char16_t iDotTilde[3] = { 0x69, 0x307, 0x303 };
|
|||
|
||||
U_CFUNC void U_EXPORT2
|
||||
ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
|
||||
uint16_t props;
|
||||
|
||||
/*
|
||||
* Hardcode the case closure of i and its relatives and ignore the
|
||||
* data file data for these characters.
|
||||
* The Turkic dotless i and dotted I with their case mapping conditions
|
||||
* and case folding option make the related characters behave specially.
|
||||
* This code matches their closure behavior to their case folding behavior.
|
||||
*/
|
||||
|
||||
switch(c) {
|
||||
case 0x49:
|
||||
/* regular i and I are in one equivalence class */
|
||||
sa->add(sa->set, 0x69);
|
||||
return;
|
||||
case 0x69:
|
||||
sa->add(sa->set, 0x49);
|
||||
return;
|
||||
case 0x130:
|
||||
/* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
|
||||
sa->addString(sa->set, iDot, 2);
|
||||
return;
|
||||
case 0x131:
|
||||
/* dotless i is in a class by itself */
|
||||
return;
|
||||
default:
|
||||
/* otherwise use the data file data */
|
||||
break;
|
||||
}
|
||||
|
||||
props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
|
||||
/* add the one simple case mapping, no matter what type it is */
|
||||
|
@ -249,19 +219,42 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
|
|||
* c has exceptions, so there may be multiple simple and/or
|
||||
* full case mappings. Add them all.
|
||||
*/
|
||||
const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
const char16_t *closure;
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
uint16_t excWord=*pe++;
|
||||
int32_t idx, closureLength, fullLength, length;
|
||||
const uint16_t *pe0=pe;
|
||||
|
||||
pe0=pe;
|
||||
// Hardcode the case closure of i and its relatives and ignore the
|
||||
// data file data for these characters.
|
||||
// The Turkic dotless i and dotted I with their case mapping conditions
|
||||
// and case folding option make the related characters behave specially.
|
||||
// This code matches their closure behavior to their case folding behavior.
|
||||
if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
|
||||
// These characters have Turkic case foldings. Hardcode their closure.
|
||||
if (c == 0x49) {
|
||||
// Regular i and I are in one equivalence class.
|
||||
sa->add(sa->set, 0x69);
|
||||
return;
|
||||
} else if (c == 0x130) {
|
||||
// Dotted I is in a class with <0069 0307>
|
||||
// (for canonical equivalence with <0049 0307>).
|
||||
sa->addString(sa->set, iDot, 2);
|
||||
return;
|
||||
}
|
||||
} else if (c == 0x69) {
|
||||
sa->add(sa->set, 0x49);
|
||||
return;
|
||||
} else if (c == 0x131) {
|
||||
// Dotless i is in a class by itself.
|
||||
return;
|
||||
}
|
||||
|
||||
/* add all simple case mappings */
|
||||
for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
|
||||
for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
|
||||
if(HAS_SLOT(excWord, idx)) {
|
||||
pe=pe0;
|
||||
GET_SLOT_VALUE(excWord, idx, pe, c);
|
||||
sa->add(sa->set, c);
|
||||
UChar32 mapping;
|
||||
GET_SLOT_VALUE(excWord, idx, pe, mapping);
|
||||
sa->add(sa->set, mapping);
|
||||
}
|
||||
}
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
|
||||
|
@ -272,6 +265,8 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
|
|||
}
|
||||
|
||||
/* get the closure string pointer & length */
|
||||
const char16_t *closure;
|
||||
int32_t closureLength;
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
|
||||
pe=pe0;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
|
||||
|
@ -285,6 +280,7 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
|
|||
/* add the full case folding */
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
|
||||
pe=pe0;
|
||||
int32_t fullLength;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
|
||||
|
||||
/* start of full case mapping strings */
|
||||
|
@ -297,7 +293,7 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
|
|||
fullLength>>=4;
|
||||
|
||||
/* add the full case folding string */
|
||||
length=fullLength&0xf;
|
||||
int32_t length=fullLength&0xf;
|
||||
if(length!=0) {
|
||||
sa->addString(sa->set, (const char16_t *)pe, length);
|
||||
pe+=length;
|
||||
|
@ -313,9 +309,146 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
|
|||
}
|
||||
|
||||
/* add each code point in the closure string */
|
||||
for(idx=0; idx<closureLength;) {
|
||||
U16_NEXT_UNSAFE(closure, idx, c);
|
||||
sa->add(sa->set, c);
|
||||
for(int32_t idx=0; idx<closureLength;) {
|
||||
UChar32 mapping;
|
||||
U16_NEXT_UNSAFE(closure, idx, mapping);
|
||||
sa->add(sa->set, mapping);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
/**
|
||||
* Add the simple case closure mapping,
|
||||
* except if there is not actually an scf relationship between the two characters.
|
||||
* TODO: Unicode should probably add the corresponding scf mappings.
|
||||
* See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
|
||||
* If & when those scf mappings are added, we should be able to remove all of these exceptions.
|
||||
*/
|
||||
// c:  the code point whose closure is being collected.
// t:  a candidate closure member found for c.
// sa: receives t via sa->add() unless (c, t) is one of the hard-coded pairs below.
void addOneSimpleCaseClosure(UChar32 c, UChar32 t, const USetAdder *sa) {
|
||||
// Suppressed pairs, each listed in both directions:
// U+0390 <-> U+1FD3, U+03B0 <-> U+1FE3, U+FB05 <-> U+FB06.
switch (c) {
|
||||
case 0x0390:
|
||||
if (t == 0x1FD3) { return; }
|
||||
break;
|
||||
case 0x03B0:
|
||||
if (t == 0x1FE3) { return; }
|
||||
break;
|
||||
case 0x1FD3:
|
||||
if (t == 0x0390) { return; }
|
||||
break;
|
||||
case 0x1FE3:
|
||||
if (t == 0x03B0) { return; }
|
||||
break;
|
||||
case 0xFB05:
|
||||
if (t == 0xFB06) { return; }
|
||||
break;
|
||||
case 0xFB06:
|
||||
if (t == 0xFB05) { return; }
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
// Not a suppressed pair: add the mapping.
sa->add(sa->set, t);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Adds the simple-case-folding closure members of c to the set behind sa.
// Only single code points are added (via sa->add(), directly or through
// addOneSimpleCaseClosure()); sa->addString() is never called here.
// c:  the code point to look up in ucase_props_singleton.
// sa: the adder that receives the closure members.
U_CFUNC void U_EXPORT2
|
||||
ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
|
||||
/* add the one simple case mapping, no matter what type it is */
|
||||
int32_t delta=UCASE_GET_DELTA(props);
|
||||
// A zero delta means there is no other code point to add.
if(delta!=0) {
|
||||
sa->add(sa->set, c+delta);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
// Read the first exception word; pe0 remembers the position just after it
// so that pe can be reset to it before each GET_SLOT_VALUE() below.
uint16_t excWord=*pe++;
|
||||
const uint16_t *pe0=pe;
|
||||
|
||||
// Hardcode the case closure of i and its relatives and ignore the
|
||||
// data file data for these characters, like in ucase_addCaseClosure().
|
||||
if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
|
||||
// These characters have Turkic case foldings. Hardcode their closure.
|
||||
if (c == 0x49) {
|
||||
// Regular i and I are in one equivalence class.
|
||||
sa->add(sa->set, 0x69);
|
||||
return;
|
||||
} else if (c == 0x130) {
|
||||
// For scf=Simple_Case_Folding, dotted I is in a class by itself.
|
||||
return;
|
||||
}
|
||||
} else if (c == 0x69) {
|
||||
sa->add(sa->set, 0x49);
|
||||
return;
|
||||
} else if (c == 0x131) {
|
||||
// Dotless i is in a class by itself.
|
||||
return;
|
||||
}
|
||||
|
||||
// Add all simple case mappings.
|
||||
for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
|
||||
if(HAS_SLOT(excWord, idx)) {
|
||||
pe=pe0;
|
||||
UChar32 mapping;
|
||||
GET_SLOT_VALUE(excWord, idx, pe, mapping);
|
||||
addOneSimpleCaseClosure(c, mapping, sa);
|
||||
}
|
||||
}
|
||||
// The delta slot value is applied relative to c; the
// UCASE_EXC_DELTA_IS_NEGATIVE bit in excWord selects subtraction.
if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
|
||||
pe=pe0;
|
||||
int32_t delta;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
|
||||
UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
addOneSimpleCaseClosure(c, mapping, sa);
|
||||
}
|
||||
|
||||
/* get the closure string pointer & length */
|
||||
const char16_t *closure;
|
||||
int32_t closureLength;
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
|
||||
pe=pe0;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
|
||||
closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
|
||||
closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
|
||||
} else {
|
||||
closureLength=0;
|
||||
closure=nullptr;
|
||||
}
|
||||
|
||||
// Skip the full case mappings.
|
||||
// This is only needed when there is a closure string to locate (closureLength > 0).
if(closureLength > 0 && HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
|
||||
pe=pe0;
|
||||
int32_t fullLength;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
|
||||
|
||||
/* start of full case mapping strings */
|
||||
++pe;
|
||||
|
||||
fullLength&=0xffff; /* bits 16 and higher are reserved */
|
||||
|
||||
// Skip all 4 full case mappings.
|
||||
// The 16 remaining bits are consumed as four length fields, 4 bits at a time
// (UCASE_FULL_LOWER is presumably the low-nibble mask -- confirm in ucase.h).
pe+=fullLength&UCASE_FULL_LOWER;
|
||||
fullLength>>=4;
|
||||
pe+=fullLength&0xf;
|
||||
fullLength>>=4;
|
||||
pe+=fullLength&0xf;
|
||||
fullLength>>=4;
|
||||
pe+=fullLength;
|
||||
|
||||
closure=(const char16_t *)pe; /* behind full case mappings */
|
||||
}
|
||||
|
||||
// Add each code point in the closure string whose scf maps back to c.
|
||||
for(int32_t idx=0; idx<closureLength;) {
|
||||
UChar32 mapping;
|
||||
U16_NEXT_UNSAFE(closure, idx, mapping);
|
||||
addOneSimpleCaseClosure(c, mapping, sa);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -108,6 +108,10 @@ ucase_fold(UChar32 c, uint32_t options);
|
|||
U_CFUNC void U_EXPORT2
|
||||
ucase_addCaseClosure(UChar32 c, const USetAdder *sa);
|
||||
|
||||
/** Case closure with only scf=Simple_Case_Folding. */
|
||||
U_CFUNC void U_EXPORT2
|
||||
ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa);
|
||||
|
||||
/**
|
||||
* Maps the string to single code points and adds the associated case closure
|
||||
* mappings.
|
||||
|
|
|
@ -430,7 +430,9 @@ public:
|
|||
* description for the syntax of the pattern language.
|
||||
* @param pattern a string specifying what characters are in the set
|
||||
* @param options bitmask for options to apply to the pattern.
|
||||
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
|
||||
* Valid options are USET_IGNORE_SPACE and
|
||||
* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
|
||||
* These case options are mutually exclusive.
|
||||
* @param symbols a symbol table mapping variable names to values
|
||||
* and stand-in characters to UnicodeSets; may be nullptr
|
||||
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
|
||||
|
@ -450,7 +452,9 @@ public:
|
|||
* @param pos on input, the position in pattern at which to start parsing.
|
||||
* On output, the position after the last character parsed.
|
||||
* @param options bitmask for options to apply to the pattern.
|
||||
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
|
||||
* Valid options are USET_IGNORE_SPACE and
|
||||
* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
|
||||
* These case options are mutually exclusive.
|
||||
* @param symbols a symbol table mapping variable names to values
|
||||
* and stand-in characters to UnicodeSets; may be nullptr
|
||||
* @param status input-output error code
|
||||
|
@ -645,7 +649,9 @@ public:
|
|||
* A frozen set will not be modified.
|
||||
* @param pattern a string specifying what characters are in the set
|
||||
* @param options bitmask for options to apply to the pattern.
|
||||
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
|
||||
* Valid options are USET_IGNORE_SPACE and
|
||||
* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
|
||||
* These case options are mutually exclusive.
|
||||
* @param symbols a symbol table mapping variable names to
|
||||
* values and stand-ins to UnicodeSets; may be nullptr
|
||||
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
|
||||
|
@ -683,7 +689,9 @@ public:
|
|||
* pattern.length() if the closing ']' is the last character of
|
||||
* the pattern string.
|
||||
* @param options bitmask for options to apply to the pattern.
|
||||
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
|
||||
* Valid options are USET_IGNORE_SPACE and
|
||||
* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
|
||||
* These case options are mutually exclusive.
|
||||
* @param symbols a symbol table mapping variable names to
|
||||
* values and stand-ins to UnicodeSets; may be nullptr
|
||||
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
|
||||
|
@ -1390,7 +1398,7 @@ public:
|
|||
|
||||
/**
|
||||
* Close this set over the given attribute. For the attribute
|
||||
* USET_CASE, the result is to modify this set so that:
|
||||
* USET_CASE_INSENSITIVE, the result is to modify this set so that:
|
||||
*
|
||||
* 1. For each character or string 'a' in this set, all strings or
|
||||
* characters 'b' such that foldCase(a) == foldCase(b) are added
|
||||
|
@ -1408,8 +1416,10 @@ public:
|
|||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param attribute bitmask for attributes to close over.
|
||||
* Currently only the USET_CASE bit is supported. Any undefined bits
|
||||
* are ignored.
|
||||
* Valid options:
|
||||
* At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
|
||||
* These case options are mutually exclusive.
|
||||
* Unrelated option bits are ignored.
|
||||
* @return a reference to this set.
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
|
@ -1579,6 +1589,9 @@ private:
|
|||
int32_t depth,
|
||||
UErrorCode& ec);
|
||||
|
||||
void closeOverCaseInsensitive(bool simple);
|
||||
void closeOverAddCaseMappings();
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Implementation: Utility methods
|
||||
//----------------------------------------------------------------
|
||||
|
|
|
@ -53,6 +53,12 @@ typedef struct USet USet;
|
|||
/**
|
||||
* Bitmask values to be passed to uset_openPatternOptions() or
|
||||
* uset_applyPattern() taking an option parameter.
|
||||
*
|
||||
* Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
|
||||
* These case options are mutually exclusive.
|
||||
*
|
||||
* Undefined option bits are ignored and are reserved for future use.
|
||||
*
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
enum {
|
||||
|
@ -60,13 +66,13 @@ enum {
|
|||
* Ignore white space within patterns unless quoted or escaped.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
USET_IGNORE_SPACE = 1,
|
||||
USET_IGNORE_SPACE = 1,
|
||||
|
||||
/**
|
||||
* Enable case insensitive matching. E.g., "[ab]" with this flag
|
||||
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
|
||||
* match all except 'a', 'A', 'b', and 'B'. This performs a full
|
||||
* closure over case mappings, e.g. U+017F for s.
|
||||
* closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
|
||||
*
|
||||
* The resulting set is a superset of the input for the code points but
|
||||
* not for the strings.
|
||||
|
@ -88,17 +94,36 @@ enum {
|
|||
*
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
USET_CASE_INSENSITIVE = 2,
|
||||
USET_CASE_INSENSITIVE = 2,
|
||||
|
||||
/**
|
||||
* Enable case insensitive matching. E.g., "[ab]" with this flag
|
||||
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
|
||||
* match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
|
||||
* title-, and uppercase mappings as well as the case folding
|
||||
* Adds all case mappings for each element in the set.
|
||||
* This adds the full lower-, title-, and uppercase mappings as well as the full case folding
|
||||
* of each existing element in the set.
|
||||
*
|
||||
* Unlike the “case insensitive” options, this does not perform a closure.
|
||||
* For example, it does not add 'ſ' (U+017F long s) for 's',
|
||||
* 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
|
||||
*
|
||||
* @stable ICU 3.2
|
||||
*/
|
||||
USET_ADD_CASE_MAPPINGS = 4
|
||||
USET_ADD_CASE_MAPPINGS = 4,
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Enable case insensitive matching.
|
||||
* Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings,
|
||||
* which map each code point to one code point,
|
||||
* not full Case_Folding (cf) mappings, which map some code points to multiple code points.
|
||||
*
|
||||
* This is designed for case-insensitive matches, for example in certain
|
||||
* regular expression implementations where only Simple_Case_Folding mappings are used,
|
||||
* such as in ECMAScript (JavaScript) regular expressions.
|
||||
*
|
||||
* @draft ICU 73
|
||||
*/
|
||||
USET_SIMPLE_CASE_INSENSITIVE = 6
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -299,7 +324,9 @@ uset_openPattern(const UChar* pattern, int32_t patternLength,
|
|||
* @param patternLength the length of the pattern, or -1 if null
|
||||
* terminated
|
||||
* @param options bitmask for options to apply to the pattern.
|
||||
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
|
||||
* Valid options are USET_IGNORE_SPACE and
|
||||
* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
|
||||
* These case options are mutually exclusive.
|
||||
* @param ec the error code
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
|
@ -414,7 +441,10 @@ uset_set(USet* set,
|
|||
* The character at pattern[0] must be a '['.
|
||||
* @param patternLength The length of the UChar string. -1 if NUL terminated.
|
||||
* @param options A bitmask for options to apply to the pattern.
|
||||
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
|
||||
* Valid options are USET_IGNORE_SPACE and
|
||||
* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS,
|
||||
* USET_SIMPLE_CASE_INSENSITIVE.
|
||||
* These case options are mutually exclusive.
|
||||
* @param status Returns an error if the pattern cannot be parsed.
|
||||
* @return Upon successful parse, the value is either
|
||||
* the index of the character after the closing ']'
|
||||
|
@ -804,7 +834,7 @@ uset_clear(USet* set);
|
|||
|
||||
/**
|
||||
* Close this set over the given attribute. For the attribute
|
||||
* USET_CASE, the result is to modify this set so that:
|
||||
* USET_CASE_INSENSITIVE, the result is to modify this set so that:
|
||||
*
|
||||
* 1. For each character or string 'a' in this set, all strings or
|
||||
* characters 'b' such that foldCase(a) == foldCase(b) are added
|
||||
|
@ -824,8 +854,10 @@ uset_clear(USet* set);
|
|||
* @param set the set
|
||||
*
|
||||
* @param attributes bitmask for attributes to close over.
|
||||
* Currently only the USET_CASE bit is supported. Any undefined bits
|
||||
* are ignored.
|
||||
* Valid options:
|
||||
* At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
|
||||
* These case options are mutually exclusive.
|
||||
* Unrelated option bits are ignored.
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
|
|
|
@ -25,9 +25,11 @@
|
|||
#include "unicode/locid.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "cmemory.h"
|
||||
#include "ruleiter.h"
|
||||
#include "ucase.h"
|
||||
#include "uprops.h"
|
||||
#include "util.h"
|
||||
#include "uvector.h"
|
||||
|
||||
|
@ -149,102 +151,208 @@ addCaseMapping(UnicodeSet &set, int32_t result, const char16_t *full, UnicodeStr
|
|||
// see ucase.h
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
/** For case closure on a large set, look only at code points with relevant properties. */
|
||||
// src:    the set whose code points the caller is about to iterate.
// subset: scratch set owned by the caller; must contain all of U+0000..U+10FFFF on entry.
//         It is only modified when the intersection is actually computed.
// Returns src itself when src.size() is below 30 or the Case_Sensitive set cannot be
// obtained; otherwise returns subset, holding the code points of src that are also
// in the Case_Sensitive set.
const UnicodeSet &maybeOnlyCaseSensitive(const UnicodeSet &src, UnicodeSet &subset) {
|
||||
// The subset must have been constructed with all code points,
|
||||
// so that the retainAll() intersection effectively copies all single code points from src.
|
||||
U_ASSERT(subset.contains(0, 0x10ffff));
|
||||
// Small sets are returned as-is, without computing the intersection.
if (src.size() < 30) {
|
||||
return src;
|
||||
}
|
||||
// Return the intersection of the src code points with Case_Sensitive ones.
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
const UnicodeSet *sensitive =
|
||||
CharacterProperties::getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode);
|
||||
// The error is not reported to the caller; fall back to the unfiltered set.
if (U_FAILURE(errorCode)) {
|
||||
return src;
|
||||
}
|
||||
// Start by copying the "smaller" set.
|
||||
// (We "copy" by intersecting all Unicode *code points* with the first set,
|
||||
// which omits any strings.)
|
||||
// "Smaller" is judged by range count, not by number of code points.
if (src.getRangeCount() > sensitive->getRangeCount()) {
|
||||
subset.retainAll(*sensitive);
|
||||
subset.retainAll(src);
|
||||
} else {
|
||||
subset.retainAll(src);
|
||||
subset.retainAll(*sensitive);
|
||||
}
|
||||
return subset;
|
||||
}
|
||||
|
||||
// Per-character scf = Simple_Case_Folding of a string.
|
||||
// (Normally when we case-fold a string we use full case foldings.)
|
||||
// s:   the input string; it is only read.
// scf: output; set to the per-code-point u_foldCase(c, U_FOLD_CASE_DEFAULT) of s,
//      but only when at least one code point changes.
// Returns true if scf was written (some code point of s folds to a different one),
// false if no code point changes, in which case scf is left untouched.
bool scfString(const UnicodeString &s, UnicodeString &scf) {
|
||||
// Iterate over the raw buffer for best performance.
|
||||
const char16_t *p = s.getBuffer();
|
||||
int32_t length = s.length();
|
||||
// Loop while not needing modification.
|
||||
for (int32_t i = 0; i < length;) {
|
||||
UChar32 c;
|
||||
U16_NEXT(p, i, length, c); // post-increments i
|
||||
UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
|
||||
if (scfChar != c) {
|
||||
// Copy the characters before c.
|
||||
// i is already past c, so i - U16_LENGTH(c) is the index where c starts.
scf.setTo(p, i - U16_LENGTH(c));
|
||||
// Loop over the rest of the string and keep case-folding.
|
||||
// From here on every code point is appended in folded form, changed or not.
for (;;) {
|
||||
scf.append(scfChar);
|
||||
if (i == length) {
|
||||
return true;
|
||||
}
|
||||
U16_NEXT(p, i, length, c); // post-increments i
|
||||
scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
|
||||
}
|
||||
}
|
||||
}
|
||||
// No code point changed under folding.
return false;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
|
||||
if (isFrozen() || isBogus()) {
|
||||
return *this;
|
||||
}
|
||||
if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
|
||||
{
|
||||
UnicodeSet foldSet(*this);
|
||||
UnicodeString str;
|
||||
USetAdder sa = {
|
||||
foldSet.toUSet(),
|
||||
_set_add,
|
||||
_set_addRange,
|
||||
_set_addString,
|
||||
nullptr, // don't need remove()
|
||||
nullptr // don't need removeRange()
|
||||
};
|
||||
|
||||
// start with input set to guarantee inclusion
|
||||
// USET_CASE: remove strings because the strings will actually be reduced (folded);
|
||||
// therefore, start with no strings and add only those needed
|
||||
if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) {
|
||||
foldSet.strings->removeAllElements();
|
||||
}
|
||||
|
||||
int32_t n = getRangeCount();
|
||||
UChar32 result;
|
||||
const char16_t *full;
|
||||
|
||||
for (int32_t i=0; i<n; ++i) {
|
||||
UChar32 start = getRangeStart(i);
|
||||
UChar32 end = getRangeEnd(i);
|
||||
|
||||
if (attribute & USET_CASE_INSENSITIVE) {
|
||||
// full case closure
|
||||
for (UChar32 cp=start; cp<=end; ++cp) {
|
||||
ucase_addCaseClosure(cp, &sa);
|
||||
}
|
||||
} else {
|
||||
// add case mappings
|
||||
// (does not add long s for regular s, or Kelvin for k, for example)
|
||||
for (UChar32 cp=start; cp<=end; ++cp) {
|
||||
result = ucase_toFullLower(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
|
||||
result = ucase_toFullTitle(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
|
||||
result = ucase_toFullUpper(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
|
||||
result = ucase_toFullFolding(cp, &full, 0);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (hasStrings()) {
|
||||
if (attribute & USET_CASE_INSENSITIVE) {
|
||||
for (int32_t j=0; j<strings->size(); ++j) {
|
||||
str = *(const UnicodeString *) strings->elementAt(j);
|
||||
str.foldCase();
|
||||
if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) {
|
||||
foldSet.add(str); // does not map to code points: add the folded string itself
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Locale root("");
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(root, status);
|
||||
if (U_SUCCESS(status)) {
|
||||
#endif
|
||||
const UnicodeString *pStr;
|
||||
|
||||
for (int32_t j=0; j<strings->size(); ++j) {
|
||||
pStr = (const UnicodeString *) strings->elementAt(j);
|
||||
(str = *pStr).toLower(root);
|
||||
foldSet.add(str);
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
(str = *pStr).toTitle(bi, root);
|
||||
foldSet.add(str);
|
||||
#endif
|
||||
(str = *pStr).toUpper(root);
|
||||
foldSet.add(str);
|
||||
(str = *pStr).foldCase();
|
||||
foldSet.add(str);
|
||||
}
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
}
|
||||
delete bi;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
*this = foldSet;
|
||||
}
|
||||
switch (attribute & USET_CASE_MASK) {
|
||||
case 0:
|
||||
break;
|
||||
case USET_CASE_INSENSITIVE:
|
||||
closeOverCaseInsensitive(/* simple= */ false);
|
||||
break;
|
||||
case USET_ADD_CASE_MAPPINGS:
|
||||
closeOverAddCaseMappings();
|
||||
break;
|
||||
case USET_SIMPLE_CASE_INSENSITIVE:
|
||||
closeOverCaseInsensitive(/* simple= */ true);
|
||||
break;
|
||||
default:
|
||||
// bad option (unreachable)
|
||||
break;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Replaces this set with its case-insensitive closure.
// simple == false: per code point ucase_addCaseClosure(); per string foldCase()
//                  followed by ucase_addStringCaseClosure().
// simple == true:  per code point ucase_addSimpleCaseClosure(); per string scfString().
// The result is built in a copy (foldSet) and assigned to *this at the end.
void UnicodeSet::closeOverCaseInsensitive(bool simple) {
|
||||
// Start with input set to guarantee inclusion.
|
||||
UnicodeSet foldSet(*this);
|
||||
// Full case mappings closure:
|
||||
// Remove strings because the strings will actually be reduced (folded);
|
||||
// therefore, start with no strings and add only those needed.
|
||||
// Do this before processing code points, because they may add strings.
|
||||
if (!simple && foldSet.hasStrings()) {
|
||||
foldSet.strings->removeAllElements();
|
||||
}
|
||||
|
||||
// The adder targets foldSet, so the closure functions below write into the copy.
USetAdder sa = {
|
||||
foldSet.toUSet(),
|
||||
_set_add,
|
||||
_set_addRange,
|
||||
_set_addString,
|
||||
nullptr, // don't need remove()
|
||||
nullptr // don't need removeRange()
|
||||
};
|
||||
|
||||
// subset starts as all code points, as maybeOnlyCaseSensitive() requires.
// codePoints refers either to *this or to subset.
UnicodeSet subset(0, 0x10ffff);
|
||||
const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset);
|
||||
|
||||
// Iterate over the ranges of single code points. Nested loop for each code point.
|
||||
int32_t n = codePoints.getRangeCount();
|
||||
|
||||
for (int32_t i=0; i<n; ++i) {
|
||||
UChar32 start = codePoints.getRangeStart(i);
|
||||
UChar32 end = codePoints.getRangeEnd(i);
|
||||
|
||||
if (simple) {
|
||||
for (UChar32 cp=start; cp<=end; ++cp) {
|
||||
ucase_addSimpleCaseClosure(cp, &sa);
|
||||
}
|
||||
} else {
|
||||
for (UChar32 cp=start; cp<=end; ++cp) {
|
||||
ucase_addCaseClosure(cp, &sa);
|
||||
}
|
||||
}
|
||||
}
|
||||
// The strings are read from this set, while all changes go to foldSet.
if (hasStrings()) {
|
||||
UnicodeString str;
|
||||
for (int32_t j=0; j<strings->size(); ++j) {
|
||||
const UnicodeString *pStr = (const UnicodeString *) strings->elementAt(j);
|
||||
if (simple) {
|
||||
// If folding changes the string, replace the original with its folded form;
// otherwise the copy of the original already in foldSet is kept.
if (scfString(*pStr, str)) {
|
||||
foldSet.remove(*pStr).add(str);
|
||||
}
|
||||
} else {
|
||||
str = *pStr;
|
||||
str.foldCase();
|
||||
if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) {
|
||||
foldSet.add(str); // does not map to code points: add the folded string itself
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Commit the result.
*this = foldSet;
|
||||
}
|
||||
|
||||
// Adds to this set the full lowercase, titlecase, uppercase and case-folding
// mappings of each of its code points (possibly limited to the Case_Sensitive
// ones by maybeOnlyCaseSensitive()) and of each of its strings.
// The result is built in a copy (foldSet) and assigned to *this at the end.
// When UCONFIG_NO_BREAK_ITERATION is set, strings are not title-cased.
void UnicodeSet::closeOverAddCaseMappings() {
|
||||
// Start with input set to guarantee inclusion.
|
||||
UnicodeSet foldSet(*this);
|
||||
|
||||
// subset starts as all code points, as maybeOnlyCaseSensitive() requires.
UnicodeSet subset(0, 0x10ffff);
|
||||
const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset);
|
||||
|
||||
// Iterate over the ranges of single code points. Nested loop for each code point.
|
||||
int32_t n = codePoints.getRangeCount();
|
||||
UChar32 result;
|
||||
const char16_t *full;
|
||||
// str is scratch storage, reused by addCaseMapping() and by the string loop below.
UnicodeString str;
|
||||
|
||||
for (int32_t i=0; i<n; ++i) {
|
||||
UChar32 start = codePoints.getRangeStart(i);
|
||||
UChar32 end = codePoints.getRangeEnd(i);
|
||||
|
||||
// add case mappings
|
||||
// (does not add long s for regular s, or Kelvin for k, for example)
|
||||
for (UChar32 cp=start; cp<=end; ++cp) {
|
||||
result = ucase_toFullLower(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
|
||||
result = ucase_toFullTitle(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
|
||||
result = ucase_toFullUpper(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
|
||||
result = ucase_toFullFolding(cp, &full, 0);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
}
|
||||
}
|
||||
// The strings are read from this set, while all additions go to foldSet.
if (hasStrings()) {
|
||||
Locale root("");
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
// toTitle() below is given a word break iterator. If it cannot be created,
// the whole string loop is skipped and no string mappings are added.
UErrorCode status = U_ZERO_ERROR;
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(root, status);
|
||||
if (U_SUCCESS(status)) {
|
||||
#endif
|
||||
for (int32_t j=0; j<strings->size(); ++j) {
|
||||
const UnicodeString *pStr = (const UnicodeString *) strings->elementAt(j);
|
||||
// Each mapping starts again from the original string *pStr.
(str = *pStr).toLower(root);
|
||||
foldSet.add(str);
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
(str = *pStr).toTitle(bi, root);
|
||||
foldSet.add(str);
|
||||
#endif
|
||||
(str = *pStr).toUpper(root);
|
||||
foldSet.add(str);
|
||||
(str = *pStr).foldCase();
|
||||
foldSet.add(str);
|
||||
}
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
}
|
||||
// bi is deleted whether or not its creation succeeded.
delete bi;
|
||||
#endif
|
||||
}
|
||||
// Commit the result.
*this = foldSet;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -631,11 +631,8 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
|
|||
* to close over case BEFORE COMPLEMENTING. This makes
|
||||
* patterns like /[^abc]/i work.
|
||||
*/
|
||||
if ((options & USET_CASE_INSENSITIVE) != 0) {
|
||||
(this->*caseClosure)(USET_CASE_INSENSITIVE);
|
||||
}
|
||||
else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
|
||||
(this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
|
||||
if ((options & USET_CASE_MASK) != 0) {
|
||||
(this->*caseClosure)(options);
|
||||
}
|
||||
if (invert) {
|
||||
complement().removeAllStrings(); // code point complement
|
||||
|
|
|
@ -441,6 +441,7 @@ class CharacterProperties {
|
|||
public:
|
||||
CharacterProperties() = delete;
|
||||
static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode);
|
||||
static const UnicodeSet *getBinaryPropertySet(UProperty property, UErrorCode &errorCode);
|
||||
};
|
||||
|
||||
// implemented in uniset_props.cpp
|
||||
|
|
|
@ -58,5 +58,14 @@ typedef struct USetAdder USetAdder;
|
|||
|
||||
U_CDECL_END
|
||||
|
||||
#endif
|
||||
#ifdef __cplusplus
|
||||
|
||||
namespace {
|
||||
|
||||
// Mask of the case-option bits: the bitwise OR of USET_CASE_INSENSITIVE and
// USET_ADD_CASE_MAPPINGS. Code that receives an options word can test
// (options & USET_CASE_MASK) != 0 to see whether any case option was requested.
constexpr int32_t USET_CASE_MASK = USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS;
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#include <stdio.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <unordered_map>
|
||||
#include "unicode/utypes.h"
|
||||
#include "usettest.h"
|
||||
#include "unicode/ucnv.h"
|
||||
|
@ -85,6 +86,8 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
|
|||
TESTCASE_AUTO(TestStrings);
|
||||
TESTCASE_AUTO(Testj2268);
|
||||
TESTCASE_AUTO(TestCloseOver);
|
||||
TESTCASE_AUTO(TestCloseOverSimpleCaseFolding);
|
||||
TESTCASE_AUTO(TestCloseOverLargeSets);
|
||||
TESTCASE_AUTO(TestEscapePattern);
|
||||
TESTCASE_AUTO(TestInvalidCodePoint);
|
||||
TESTCASE_AUTO(TestSymbolTable);
|
||||
|
@ -1243,27 +1246,38 @@ void UnicodeSetTest::TestIndexOf() {
|
|||
* Test closure API.
|
||||
*/
|
||||
void UnicodeSetTest::TestCloseOver() {
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
|
||||
char CASE[] = {(char)USET_CASE_INSENSITIVE};
|
||||
char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
|
||||
const char* DATA[] = {
|
||||
static constexpr char CASE[] = {(char)USET_CASE_INSENSITIVE};
|
||||
static constexpr char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
|
||||
static constexpr char SIMPLE_CASE_INSENSITIVE[] = {(char)USET_SIMPLE_CASE_INSENSITIVE};
|
||||
static const char* DATA[] = {
|
||||
// selector, input, output
|
||||
CASE,
|
||||
"[aq\\u00DF{Bc}{bC}{Fi}]",
|
||||
"[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
|
||||
|
||||
SIMPLE_CASE_INSENSITIVE,
|
||||
"[aq\\u00DF{Bc}{bC}{Fi}]",
|
||||
"[aAqQ\\u00DF\\u1E9E{bc}{fi}]",
|
||||
|
||||
CASE,
|
||||
"[\\u01F1]", // 'DZ'
|
||||
"[\\u01F1\\u01F2\\u01F3]",
|
||||
|
||||
SIMPLE_CASE_INSENSITIVE,
|
||||
"[\\u01F1]", // 'DZ'
|
||||
"[\\u01F1\\u01F2\\u01F3]",
|
||||
|
||||
CASE,
|
||||
"[\\u1FB4]",
|
||||
"[\\u1FB4{\\u03AC\\u03B9}]",
|
||||
|
||||
SIMPLE_CASE_INSENSITIVE,
|
||||
"[\\u1FB4]",
|
||||
"[\\u1FB4]",
|
||||
|
||||
CASE,
|
||||
"[{F\\uFB01}]",
|
||||
"[\\uFB03{ffi}]",
|
||||
"[\\uFB03{ffi}]",
|
||||
|
||||
CASE, // make sure binary search finds limits
|
||||
"[a\\uFF3A]",
|
||||
|
@ -1271,6 +1285,10 @@ void UnicodeSetTest::TestCloseOver() {
|
|||
|
||||
CASE,
|
||||
"[a-z]","[A-Za-z\\u017F\\u212A]",
|
||||
|
||||
SIMPLE_CASE_INSENSITIVE,
|
||||
"[a-z]","[A-Za-z\\u017F\\u212A]",
|
||||
|
||||
CASE,
|
||||
"[abc]","[A-Ca-c]",
|
||||
CASE,
|
||||
|
@ -1311,7 +1329,7 @@ void UnicodeSetTest::TestCloseOver() {
|
|||
CASE_MAPPINGS,
|
||||
"[\\u01F1]", // 'DZ'
|
||||
"[\\u01F1\\u01F2\\u01F3]",
|
||||
|
||||
|
||||
CASE_MAPPINGS,
|
||||
"[a-z]",
|
||||
"[A-Za-z]",
|
||||
|
@ -1326,6 +1344,8 @@ void UnicodeSetTest::TestCloseOver() {
|
|||
int32_t selector = DATA[i][0];
|
||||
UnicodeString pat(DATA[i+1], -1, US_INV);
|
||||
UnicodeString exp(DATA[i+2], -1, US_INV);
|
||||
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
s.applyPattern(pat, ec);
|
||||
s.closeOver(selector);
|
||||
t.applyPattern(exp, ec);
|
||||
|
@ -1341,68 +1361,8 @@ void UnicodeSetTest::TestCloseOver() {
|
|||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
/*
|
||||
* Unused test code.
|
||||
* This was used to compare the old implementation (using USET_CASE)
|
||||
* with the new one (using 0x100 temporarily)
|
||||
* while transitioning from hardcoded case closure tables in uniset.cpp
|
||||
* (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
|
||||
* and using ucase.c functions for closure.
|
||||
* See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
|
||||
*
|
||||
* Note: The old and new implementation never fully matched because
|
||||
* the old implementation turned out to not map U+0130 and U+0131 correctly
|
||||
* (dotted I and dotless i) and because the old implementation's data tables
|
||||
* were outdated compared to Unicode 4.0.1 at the time of the change to the
|
||||
* new implementation. (So sigmas and some other characters were not handled
|
||||
* according to the newer Unicode version.)
|
||||
*/
|
||||
UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
|
||||
UnicodeSetIterator si(sens);
|
||||
UnicodeString str, buf2;
|
||||
const UnicodeString *pStr;
|
||||
UChar32 c;
|
||||
while(si.next()) {
|
||||
if(!si.isString()) {
|
||||
c=si.getCodepoint();
|
||||
s.clear();
|
||||
s.add(c);
|
||||
|
||||
str.setTo(c);
|
||||
str.foldCase();
|
||||
sens2.add(str);
|
||||
|
||||
t=s;
|
||||
s.closeOver(USET_CASE);
|
||||
t.closeOver(0x100);
|
||||
if(s!=t) {
|
||||
errln("FAIL: closeOver(U+%04x) differs: ", c);
|
||||
errln((UnicodeString)"old "+s.toPattern(buf, true)+" new: "+t.toPattern(buf2, true));
|
||||
}
|
||||
}
|
||||
}
|
||||
// remove all code points
|
||||
// should contain all full case folding mapping strings
|
||||
sens2.remove(0, 0x10ffff);
|
||||
si.reset(sens2);
|
||||
while(si.next()) {
|
||||
if(si.isString()) {
|
||||
pStr=&si.getString();
|
||||
s.clear();
|
||||
s.add(*pStr);
|
||||
t=s2=s;
|
||||
s.closeOver(USET_CASE);
|
||||
t.closeOver(0x100);
|
||||
if(s!=t) {
|
||||
errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, true)+") differs: ");
|
||||
errln((UnicodeString)"old "+s.toPattern(buf, true)+" new: "+t.toPattern(buf2, true));
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Test the pattern API
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
s.applyPattern("[abc]", USET_CASE_INSENSITIVE, nullptr, ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
errln("FAIL: applyPattern failed");
|
||||
|
@ -1423,6 +1383,123 @@ void UnicodeSetTest::TestCloseOver() {
|
|||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
/**
 * Records the mapping c->t in `additions` unless `closure` already contains it.
 *
 * @param closure   the mappings collected so far; only read here
 * @param c         source code point of the candidate mapping
 * @param t         target code point of the candidate mapping
 * @param additions receives {c, t} when closure has no such entry
 */
void addIfAbsent(const std::unordered_multimap<UChar32, UChar32> &closure, UChar32 c, UChar32 t,
                 std::unordered_multimap<UChar32, UChar32> &additions) {
    // Use equal_range() rather than walking forward from find():
    // find() only promises *some* element with key c, not the first one of its group,
    // so a forward walk from it could miss an existing c->t entry.
    const auto range = closure.equal_range(c);
    for (auto it = range.first; it != range.second; ++it) {
        if (it->second == t) {
            return;  // present
        }
    }
    additions.insert({c, t});  // absent
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void UnicodeSetTest::TestCloseOverSimpleCaseFolding() {
|
||||
IcuTestErrorCode errorCode(*this, "TestCloseOverSimpleCaseFolding");
|
||||
const UnicodeSet *sensitive =
|
||||
UnicodeSet::fromUSet(u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode));
|
||||
if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE) failed")) {
|
||||
return;
|
||||
}
|
||||
// Compute the scf=Simple_Case_Folding closure:
|
||||
// For each scf(c)=t, start with mappings c->t and t->c.
|
||||
std::unordered_multimap<UChar32, UChar32> closure;
|
||||
UnicodeSetIterator iter(*sensitive);
|
||||
while (iter.next()) {
|
||||
UChar32 c = iter.getCodepoint();
|
||||
UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
|
||||
if (scfChar != c) {
|
||||
closure.insert({c, scfChar});
|
||||
closure.insert({scfChar, c});
|
||||
}
|
||||
}
|
||||
// Complete the closure: Add mappings of mappings.
|
||||
for (;;) {
|
||||
std::unordered_multimap<UChar32, UChar32> additions;
|
||||
// for each mapping c->t
|
||||
for (auto mapping : closure) {
|
||||
UChar32 c = mapping.first;
|
||||
UChar32 t = mapping.second;
|
||||
// enumerate each t->u
|
||||
for (auto it = closure.find(t); it != closure.end() && it->first == t; ++it) {
|
||||
UChar32 u = it->second;
|
||||
if (u != c) {
|
||||
addIfAbsent(closure, c, u, additions);
|
||||
addIfAbsent(closure, u, c, additions);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (additions.empty()) {
|
||||
break; // The closure is complete.
|
||||
}
|
||||
closure.insert(additions.begin(), additions.end());
|
||||
}
|
||||
// Compare closeOver(USET_SIMPLE_CASE_INSENSITIVE) with an unoptimized implementation.
|
||||
// Here we focus on single code points as input.
|
||||
// Other examples, including strings, are tested in TestCloseOver().
|
||||
int32_t errors = 0;
|
||||
iter.reset();
|
||||
UnicodeSet set, expected;
|
||||
while (iter.next()) {
|
||||
UChar32 c = iter.getCodepoint();
|
||||
// closeOver()
|
||||
set.clear().add(c);
|
||||
set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
|
||||
// From-first-principles implementation.
|
||||
expected.clear().add(c);
|
||||
for (auto it = closure.find(c); it != closure.end() && it->first == c; ++it) {
|
||||
expected.add(it->second);
|
||||
}
|
||||
// compare
|
||||
if (!checkEqual(expected, set, "closeOver() vs. test impl")) {
|
||||
errln(" c=U+%04X", c);
|
||||
if (++errors == 10) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestCloseOverLargeSets() {
|
||||
IcuTestErrorCode errorCode(*this, "TestCloseOverLargeSets");
|
||||
// Check that an optimization for large sets does not change the result.
|
||||
|
||||
// Most code points except ones that are boring for case mappings.
|
||||
UnicodeSet manyCp(u"[^[:C:][:Ideographic:][:Hang:]]", errorCode);
|
||||
// Main Unihan block.
|
||||
constexpr UChar32 LARGE_START = 0x4E00;
|
||||
constexpr UChar32 LARGE_END = 0x9FFF;
|
||||
|
||||
static constexpr int32_t OPTIONS[] = {
|
||||
USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE
|
||||
};
|
||||
UnicodeSet input, small, large;
|
||||
for (int32_t option : OPTIONS) {
|
||||
UnicodeSetIterator iter(manyCp);
|
||||
while (iter.next()) {
|
||||
UChar32 c = iter.getCodepoint();
|
||||
input.clear().add(c);
|
||||
small = input;
|
||||
small.closeOver(option);
|
||||
large = input;
|
||||
large.add(LARGE_START, LARGE_END);
|
||||
large.closeOver(option);
|
||||
large.remove(LARGE_START, LARGE_END);
|
||||
if (!checkEqual(small, large, "small != large")) {
|
||||
errln(" option=%d c=U+%04X", option, c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestEscapePattern() {
|
||||
const char pattern[] =
|
||||
"[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
|
||||
|
|
|
@ -74,6 +74,8 @@ private:
|
|||
void TestExhaustive(void);
|
||||
|
||||
void TestCloseOver(void);
|
||||
void TestCloseOverSimpleCaseFolding();
|
||||
void TestCloseOverLargeSets();
|
||||
|
||||
void TestEscapePattern(void);
|
||||
|
||||
|
|
|
@ -260,34 +260,6 @@ public final class UCaseProps {
|
|||
* - for k include the Kelvin sign
|
||||
*/
|
||||
public final void addCaseClosure(int c, UnicodeSet set) {
|
||||
/*
|
||||
* Hardcode the case closure of i and its relatives and ignore the
|
||||
* data file data for these characters.
|
||||
* The Turkic dotless i and dotted I with their case mapping conditions
|
||||
* and case folding option make the related characters behave specially.
|
||||
* This code matches their closure behavior to their case folding behavior.
|
||||
*/
|
||||
|
||||
switch(c) {
|
||||
case 0x49:
|
||||
/* regular i and I are in one equivalence class */
|
||||
set.add(0x69);
|
||||
return;
|
||||
case 0x69:
|
||||
set.add(0x49);
|
||||
return;
|
||||
case 0x130:
|
||||
/* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
|
||||
set.add(iDot);
|
||||
return;
|
||||
case 0x131:
|
||||
/* dotless i is in a class by itself */
|
||||
return;
|
||||
default:
|
||||
/* otherwise use the data file data */
|
||||
break;
|
||||
}
|
||||
|
||||
int props=trie.get(c);
|
||||
if(!propsHasException(props)) {
|
||||
if(getTypeFromProps(props)!=NONE) {
|
||||
|
@ -302,19 +274,41 @@ public final class UCaseProps {
|
|||
* c has exceptions, so there may be multiple simple and/or
|
||||
* full case mappings. Add them all.
|
||||
*/
|
||||
int excOffset0, excOffset=getExceptionsOffset(props);
|
||||
int closureOffset;
|
||||
int excOffset=getExceptionsOffset(props);
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
int index, closureLength, fullLength, length;
|
||||
int excOffset0=excOffset;
|
||||
|
||||
excOffset0=excOffset;
|
||||
// Hardcode the case closure of i and its relatives and ignore the
|
||||
// data file data for these characters.
|
||||
// The Turkic dotless i and dotted I with their case mapping conditions
|
||||
// and case folding option make the related characters behave specially.
|
||||
// This code matches their closure behavior to their case folding behavior.
|
||||
if ((excWord&EXC_CONDITIONAL_FOLD) != 0) {
|
||||
// These characters have Turkic case foldings. Hardcode their closure.
|
||||
if (c == 0x49) {
|
||||
// Regular i and I are in one equivalence class.
|
||||
set.add(0x69);
|
||||
return;
|
||||
} else if (c == 0x130) {
|
||||
// Dotted I is in a class with <0069 0307>
|
||||
// (for canonical equivalence with <0049 0307>).
|
||||
set.add(iDot);
|
||||
return;
|
||||
}
|
||||
} else if (c == 0x69) {
|
||||
set.add(0x49);
|
||||
return;
|
||||
} else if (c == 0x131) {
|
||||
// Dotless i is in a class by itself.
|
||||
return;
|
||||
}
|
||||
|
||||
/* add all simple case mappings */
|
||||
for(index=EXC_LOWER; index<=EXC_TITLE; ++index) {
|
||||
for(int index=EXC_LOWER; index<=EXC_TITLE; ++index) {
|
||||
if(hasSlot(excWord, index)) {
|
||||
excOffset=excOffset0;
|
||||
c=getSlotValue(excWord, index, excOffset);
|
||||
set.add(c);
|
||||
int mapping=getSlotValue(excWord, index, excOffset);
|
||||
set.add(mapping);
|
||||
}
|
||||
}
|
||||
if(hasSlot(excWord, EXC_DELTA)) {
|
||||
|
@ -324,6 +318,7 @@ public final class UCaseProps {
|
|||
}
|
||||
|
||||
/* get the closure string pointer & length */
|
||||
int closureOffset, closureLength;
|
||||
if(hasSlot(excWord, EXC_CLOSURE)) {
|
||||
excOffset=excOffset0;
|
||||
long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
|
||||
|
@ -338,7 +333,7 @@ public final class UCaseProps {
|
|||
if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
|
||||
excOffset=excOffset0;
|
||||
long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
|
||||
fullLength=(int)value;
|
||||
int fullLength=(int)value;
|
||||
|
||||
/* start of full case mapping strings */
|
||||
excOffset=(int)(value>>32)+1;
|
||||
|
@ -350,7 +345,7 @@ public final class UCaseProps {
|
|||
fullLength>>=4;
|
||||
|
||||
/* add the full case folding string */
|
||||
length=fullLength&0xf;
|
||||
int length=fullLength&0xf;
|
||||
if(length!=0) {
|
||||
set.add(exceptions.substring(excOffset, excOffset+length));
|
||||
excOffset+=length;
|
||||
|
@ -367,9 +362,137 @@ public final class UCaseProps {
|
|||
|
||||
/* add each code point in the closure string */
|
||||
int limit=closureOffset+closureLength;
|
||||
for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
|
||||
c=exceptions.codePointAt(index);
|
||||
set.add(c);
|
||||
for(int index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
|
||||
int mapping=exceptions.codePointAt(index);
|
||||
set.add(mapping);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the simple case closure mapping,
|
||||
* except if there is not actually an scf relationship between the two characters.
|
||||
* TODO: Unicode should probably add the corresponding scf mappings.
|
||||
* See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
|
||||
* If & when those scf mappings are added, we should be able to remove all of these exceptions.
|
||||
*/
|
||||
private static void addOneSimpleCaseClosure(int c, int t, UnicodeSet set) {
|
||||
switch (c) {
|
||||
case 0x0390:
|
||||
if (t == 0x1FD3) { return; }
|
||||
break;
|
||||
case 0x03B0:
|
||||
if (t == 0x1FE3) { return; }
|
||||
break;
|
||||
case 0x1FD3:
|
||||
if (t == 0x0390) { return; }
|
||||
break;
|
||||
case 0x1FE3:
|
||||
if (t == 0x03B0) { return; }
|
||||
break;
|
||||
case 0xFB05:
|
||||
if (t == 0xFB06) { return; }
|
||||
break;
|
||||
case 0xFB06:
|
||||
if (t == 0xFB05) { return; }
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
set.add(t);
|
||||
}
|
||||
|
||||
public final void addSimpleCaseClosure(int c, UnicodeSet set) {
|
||||
int props=trie.get(c);
|
||||
if(!propsHasException(props)) {
|
||||
if(getTypeFromProps(props)!=NONE) {
|
||||
/* add the one simple case mapping, no matter what type it is */
|
||||
int delta=getDelta(props);
|
||||
if(delta!=0) {
|
||||
set.add(c+delta);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
|
||||
int excOffset=getExceptionsOffset(props);
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
int excOffset0=excOffset;
|
||||
|
||||
// Hardcode the case closure of i and its relatives and ignore the
|
||||
// data file data for these characters, like in ucase_addCaseClosure().
|
||||
if ((excWord&EXC_CONDITIONAL_FOLD) != 0) {
|
||||
// These characters have Turkic case foldings. Hardcode their closure.
|
||||
if (c == 0x49) {
|
||||
// Regular i and I are in one equivalence class.
|
||||
set.add(0x69);
|
||||
return;
|
||||
} else if (c == 0x130) {
|
||||
// For scf=Simple_Case_Folding, dotted I is in a class by itself.
|
||||
return;
|
||||
}
|
||||
} else if (c == 0x69) {
|
||||
set.add(0x49);
|
||||
return;
|
||||
} else if (c == 0x131) {
|
||||
// Dotless i is in a class by itself.
|
||||
return;
|
||||
}
|
||||
|
||||
// Add all simple case mappings.
|
||||
for(int index=EXC_LOWER; index<=EXC_TITLE; ++index) {
|
||||
if(hasSlot(excWord, index)) {
|
||||
excOffset=excOffset0;
|
||||
int mapping=getSlotValue(excWord, index, excOffset);
|
||||
addOneSimpleCaseClosure(c, mapping, set);
|
||||
}
|
||||
}
|
||||
if(hasSlot(excWord, EXC_DELTA)) {
|
||||
excOffset=excOffset0;
|
||||
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
|
||||
int mapping = (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
addOneSimpleCaseClosure(c, mapping, set);
|
||||
}
|
||||
|
||||
/* get the closure string pointer & length */
|
||||
int closureOffset, closureLength;
|
||||
if(hasSlot(excWord, EXC_CLOSURE)) {
|
||||
excOffset=excOffset0;
|
||||
long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
|
||||
closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */
|
||||
closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */
|
||||
} else {
|
||||
closureLength=0;
|
||||
closureOffset=0;
|
||||
}
|
||||
|
||||
// Skip the full case mappings.
|
||||
if(closureLength > 0 && hasSlot(excWord, EXC_FULL_MAPPINGS)) {
|
||||
excOffset=excOffset0;
|
||||
long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
|
||||
int fullLength=(int)value;
|
||||
|
||||
/* start of full case mapping strings */
|
||||
excOffset=(int)(value>>32)+1;
|
||||
|
||||
fullLength&=0xffff; /* bits 16 and higher are reserved */
|
||||
|
||||
// Skip all 4 full case mappings.
|
||||
excOffset+=fullLength&FULL_LOWER;
|
||||
fullLength>>=4;
|
||||
excOffset+=fullLength&0xf;
|
||||
fullLength>>=4;
|
||||
excOffset+=fullLength&0xf;
|
||||
fullLength>>=4;
|
||||
excOffset+=fullLength;
|
||||
|
||||
closureOffset=excOffset; /* behind full case mappings */
|
||||
}
|
||||
|
||||
// Add each code point in the closure string whose scf maps back to c.
|
||||
int limit=closureOffset+closureLength;
|
||||
for(int index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
|
||||
int mapping=exceptions.codePointAt(index);
|
||||
addOneSimpleCaseClosure(c, mapping, set);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -459,7 +459,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* for the syntax of the pattern language.
|
||||
* @param pattern a string specifying what characters are in the set
|
||||
* @param options a bitmask indicating which options to apply.
|
||||
* Valid options are IGNORE_SPACE and CASE.
|
||||
* Valid options are {@link #IGNORE_SPACE} and
|
||||
* at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
|
||||
* {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
|
||||
* @exception java.lang.IllegalArgumentException if the pattern contains
|
||||
* a syntax error.
|
||||
* @stable ICU 3.8
|
||||
|
@ -495,7 +497,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* @param symbols a symbol table mapping variables to char[] arrays
|
||||
* and chars to UnicodeSets
|
||||
* @param options a bitmask indicating which options to apply.
|
||||
* Valid options are IGNORE_SPACE and CASE.
|
||||
* Valid options are {@link #IGNORE_SPACE} and
|
||||
* at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
|
||||
* {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
|
||||
* @exception java.lang.IllegalArgumentException if the pattern
|
||||
* contains a syntax error.
|
||||
* @stable ICU 3.2
|
||||
|
@ -587,7 +591,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* See the class description for the syntax of the pattern language.
|
||||
* @param pattern a string specifying what characters are in the set
|
||||
* @param options a bitmask indicating which options to apply.
|
||||
* Valid options are IGNORE_SPACE and CASE.
|
||||
* Valid options are {@link #IGNORE_SPACE} and
|
||||
* at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
|
||||
* {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
|
||||
* @exception java.lang.IllegalArgumentException if the pattern
|
||||
* contains a syntax error.
|
||||
* @stable ICU 3.8
|
||||
|
@ -2584,8 +2590,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* variables, or null if none.
|
||||
* @param rebuiltPat the pattern that was parsed, rebuilt or
|
||||
* copied from the input pattern, as appropriate.
|
||||
* @param options a bit mask of zero or more of the following:
|
||||
* IGNORE_SPACE, CASE.
|
||||
* @param options a bit mask.
|
||||
* Valid options are {@link #IGNORE_SPACE} and
|
||||
* at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
|
||||
* {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
|
||||
*/
|
||||
private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
|
||||
Appendable rebuiltPat, int options, int depth) {
|
||||
|
@ -2965,8 +2973,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
* to close over case BEFORE COMPLEMENTING. This makes
|
||||
* patterns like /[^abc]/i work.
|
||||
*/
|
||||
if ((options & CASE) != 0) {
|
||||
closeOver(CASE);
|
||||
if ((options & CASE_MASK) != 0) {
|
||||
closeOver(options);
|
||||
}
|
||||
if (invert) {
|
||||
complement().removeAllStrings(); // code point complement
|
||||
|
@ -3861,58 +3869,81 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
public static final int IGNORE_SPACE = 1;
|
||||
|
||||
/**
|
||||
* Bitmask for constructor, applyPattern(), and closeOver()
|
||||
* indicating letter case. This may be ORed together with other
|
||||
* selectors.
|
||||
* Alias for {@link #CASE_INSENSITIVE}.
|
||||
*
|
||||
* Enable case insensitive matching. E.g., "[ab]" with this flag
|
||||
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
|
||||
* match all except 'a', 'A', 'b', and 'B'. This performs a full
|
||||
* closure over case mappings, e.g. U+017F for s.
|
||||
*
|
||||
* The resulting set is a superset of the input for the code points but
|
||||
* not for the strings.
|
||||
* It performs a case mapping closure of the code points and adds
|
||||
* full case folding strings for the code points, and reduces strings of
|
||||
* the original set to their full case folding equivalents.
|
||||
*
|
||||
* This is designed for case-insensitive matches, for example
|
||||
* in regular expressions. The full code point case closure allows checking of
|
||||
* an input character directly against the closure set.
|
||||
* Strings are matched by comparing the case-folded form from the closure
|
||||
* set with an incremental case folding of the string in question.
|
||||
*
|
||||
* The closure set will also contain single code points if the original
|
||||
* set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
|
||||
* This is not necessary (that is, redundant) for the above matching method
|
||||
* but results in the same closure sets regardless of whether the original
|
||||
* set contained the code point or a string.
|
||||
* @stable ICU 3.8
|
||||
*/
|
||||
public static final int CASE = 2;
|
||||
|
||||
/**
|
||||
* Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
|
||||
* also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
|
||||
* @see #CASE
|
||||
* Enable case insensitive matching. E.g., "[ab]" with this flag
|
||||
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
|
||||
* match all except 'a', 'A', 'b', and 'B'. This performs a full
|
||||
* closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
|
||||
*
|
||||
* <p>This value is an options bit set value for some
|
||||
* constructors, applyPattern(), and closeOver().
|
||||
* It can be ORed together with other, unrelated options.
|
||||
*
|
||||
* <p>The resulting set is a superset of the input for the code points but
|
||||
* not for the strings.
|
||||
* It performs a case mapping closure of the code points and adds
|
||||
* full case folding strings for the code points, and reduces strings of
|
||||
* the original set to their full case folding equivalents.
|
||||
*
|
||||
* <p>This is designed for case-insensitive matches, for example
|
||||
* in regular expressions. The full code point case closure allows checking of
|
||||
* an input character directly against the closure set.
|
||||
* Strings are matched by comparing the case-folded form from the closure
|
||||
* set with an incremental case folding of the string in question.
|
||||
*
|
||||
* <p>The closure set will also contain single code points if the original
|
||||
* set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
|
||||
* This is not necessary (that is, redundant) for the above matching method
|
||||
* but results in the same closure sets regardless of whether the original
|
||||
* set contained the code point or a string.
|
||||
*
|
||||
* @stable ICU 3.4
|
||||
*/
|
||||
public static final int CASE_INSENSITIVE = 2;
|
||||
|
||||
/**
|
||||
* Bitmask for constructor, applyPattern(), and closeOver()
|
||||
* indicating letter case. This may be ORed together with other
|
||||
* selectors.
|
||||
*
|
||||
* Enable case insensitive matching. E.g., "[ab]" with this flag
|
||||
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
|
||||
* match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
|
||||
* title-, and uppercase mappings as well as the case folding
|
||||
* Adds all case mappings for each element in the set.
|
||||
* This adds the full lower-, title-, and uppercase mappings as well as the full case folding
|
||||
* of each existing element in the set.
|
||||
*
|
||||
* <p>This value is an options bit set value for some
|
||||
* constructors, applyPattern(), and closeOver().
|
||||
* It can be ORed together with other, unrelated options.
|
||||
*
|
||||
* <p>Unlike the “case insensitive” options, this does not perform a closure.
|
||||
* For example, it does not add 'ſ' (U+017F long s) for 's',
|
||||
* 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
|
||||
*
|
||||
* @stable ICU 3.4
|
||||
*/
|
||||
public static final int ADD_CASE_MAPPINGS = 4;
|
||||
|
||||
/**
|
||||
* Enable case insensitive matching.
|
||||
* Same as {@link #CASE_INSENSITIVE} but using only Simple_Case_Folding (scf) mappings,
|
||||
* which map each code point to one code point,
|
||||
* not full Case_Folding (cf) mappings, which map some code points to multiple code points.
|
||||
*
|
||||
* <p>This is designed for case-insensitive matches, for example in certain
|
||||
* regular expression implementations where only Simple_Case_Folding mappings are used,
|
||||
* such as in ECMAScript (JavaScript) regular expressions.
|
||||
*
|
||||
* <p>This value is an options bit set value for some
|
||||
* constructors, applyPattern(), and closeOver().
|
||||
* It can be ORed together with other, unrelated options.
|
||||
*
|
||||
* @draft ICU 73
|
||||
*/
|
||||
public static final int SIMPLE_CASE_INSENSITIVE = 6;
|
||||
|
||||
private static final int CASE_MASK = CASE_INSENSITIVE | ADD_CASE_MAPPINGS;
|
||||
|
||||
// add the result of a full case mapping to the set
|
||||
// use str as a temporary string to avoid constructing one
|
||||
private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) {
|
||||
|
@ -3930,99 +3961,193 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
// see UCaseProps
|
||||
}
|
||||
|
||||
/** For case closure on a large set, look only at code points with relevant properties. */
|
||||
UnicodeSet maybeOnlyCaseSensitive(UnicodeSet src) {
|
||||
if (src.size() < 30) {
|
||||
return src;
|
||||
}
|
||||
// Return the intersection of the src code points with Case_Sensitive ones.
|
||||
UnicodeSet sensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE);
|
||||
// Start by cloning the "smaller" set. Try not to copy the strings, if there are any in src.
|
||||
if (src.hasStrings() || src.getRangeCount() > sensitive.getRangeCount()) {
|
||||
return sensitive.cloneAsThawed().retainAll(src);
|
||||
} else {
|
||||
return ((UnicodeSet) src.clone()).retainAll(sensitive);
|
||||
}
|
||||
}
|
||||
|
||||
// Per-character scf = Simple_Case_Folding of a string.
|
||||
// (Normally when we case-fold a string we use full case foldings.)
|
||||
private static final boolean scfString(CharSequence s, StringBuilder scf) {
|
||||
int length = s.length();
|
||||
// Loop while not needing modification.
|
||||
for (int i = 0; i < length;) {
|
||||
int c = Character.codePointAt(s, i);
|
||||
int scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT);
|
||||
if (scfChar != c) {
|
||||
// Copy the characters before c.
|
||||
scf.setLength(0);
|
||||
scf.append(s, 0, i);
|
||||
// Loop over the rest of the string and keep case-folding.
|
||||
for (;;) {
|
||||
scf.appendCodePoint(scfChar);
|
||||
i += Character.charCount(c);
|
||||
if (i == length) {
|
||||
return true;
|
||||
}
|
||||
c = Character.codePointAt(s, i);
|
||||
scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT);
|
||||
}
|
||||
}
|
||||
i += Character.charCount(c);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close this set over the given attribute. For the attribute
|
||||
* CASE, the result is to modify this set so that:
|
||||
* {@link #CASE_INSENSITIVE}, the result is to modify this set so that:
|
||||
*
|
||||
* 1. For each character or string 'a' in this set, all strings
|
||||
* <ol>
|
||||
* <li>For each character or string 'a' in this set, all strings
|
||||
* 'b' such that foldCase(a) == foldCase(b) are added to this set.
|
||||
* (For most 'a' that are single characters, 'b' will have
|
||||
* b.length() == 1.)
|
||||
*
|
||||
* 2. For each string 'e' in the resulting set, if e !=
|
||||
* <li>For each string 'e' in the resulting set, if e !=
|
||||
* foldCase(e), 'e' will be removed.
|
||||
* </ol>
|
||||
*
|
||||
* Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
|
||||
* <p>Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
|
||||
*
|
||||
* (Here foldCase(x) refers to the operation
|
||||
* <p>(Here foldCase(x) refers to the operation
|
||||
* UCharacter.foldCase(x, true), and a == b actually denotes
|
||||
* a.equals(b), not pointer comparison.)
|
||||
*
|
||||
* @param attribute bitmask for attributes to close over.
|
||||
* Currently only the CASE bit is supported. Any undefined bits
|
||||
* are ignored.
|
||||
* Valid options:
|
||||
* At most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
|
||||
* {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
|
||||
* Unrelated options bits are ignored.
|
||||
* @return a reference to this set.
|
||||
* @stable ICU 3.8
|
||||
*/
|
||||
public UnicodeSet closeOver(int attribute) {
|
||||
checkFrozen();
|
||||
if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) {
|
||||
UCaseProps csp = UCaseProps.INSTANCE;
|
||||
UnicodeSet foldSet = new UnicodeSet(this);
|
||||
ULocale root = ULocale.ROOT;
|
||||
|
||||
// start with input set to guarantee inclusion
|
||||
// CASE: remove strings because the strings will actually be reduced (folded);
|
||||
// therefore, start with no strings and add only those needed
|
||||
if((attribute & CASE) != 0 && foldSet.hasStrings()) {
|
||||
foldSet.strings.clear();
|
||||
}
|
||||
|
||||
int n = getRangeCount();
|
||||
int result;
|
||||
StringBuilder full = new StringBuilder();
|
||||
|
||||
for (int i=0; i<n; ++i) {
|
||||
int start = getRangeStart(i);
|
||||
int end = getRangeEnd(i);
|
||||
|
||||
if((attribute & CASE) != 0) {
|
||||
// full case closure
|
||||
for (int cp=start; cp<=end; ++cp) {
|
||||
csp.addCaseClosure(cp, foldSet);
|
||||
}
|
||||
} else {
|
||||
// add case mappings
|
||||
// (does not add long s for regular s, or Kelvin for k, for example)
|
||||
for (int cp=start; cp<=end; ++cp) {
|
||||
result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full);
|
||||
|
||||
result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full);
|
||||
|
||||
result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full);
|
||||
|
||||
result = csp.toFullFolding(cp, full, 0);
|
||||
addCaseMapping(foldSet, result, full);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (hasStrings()) {
|
||||
if ((attribute & CASE) != 0) {
|
||||
for (String s : strings) {
|
||||
String str = UCharacter.foldCase(s, 0);
|
||||
if(!csp.addStringCaseClosure(str, foldSet)) {
|
||||
foldSet.add(str); // does not map to code points: add the folded string itself
|
||||
}
|
||||
}
|
||||
} else {
|
||||
BreakIterator bi = BreakIterator.getWordInstance(root);
|
||||
for (String str : strings) {
|
||||
// TODO: call lower-level functions
|
||||
foldSet.add(UCharacter.toLowerCase(root, str));
|
||||
foldSet.add(UCharacter.toTitleCase(root, str, bi));
|
||||
foldSet.add(UCharacter.toUpperCase(root, str));
|
||||
foldSet.add(UCharacter.foldCase(str, 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
set(foldSet);
|
||||
switch (attribute & CASE_MASK) {
|
||||
case 0:
|
||||
break;
|
||||
case CASE_INSENSITIVE:
|
||||
closeOverCaseInsensitive(/* simple= */ false);
|
||||
break;
|
||||
case ADD_CASE_MAPPINGS:
|
||||
closeOverAddCaseMappings();
|
||||
break;
|
||||
case SIMPLE_CASE_INSENSITIVE:
|
||||
closeOverCaseInsensitive(/* simple= */ true);
|
||||
break;
|
||||
default:
|
||||
// bad option (unreachable)
|
||||
break;
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
private void closeOverCaseInsensitive(boolean simple) {
|
||||
UCaseProps csp = UCaseProps.INSTANCE;
|
||||
// Start with input set to guarantee inclusion.
|
||||
UnicodeSet foldSet = new UnicodeSet(this);
|
||||
|
||||
// Full case mappings closure:
|
||||
// Remove strings because the strings will actually be reduced (folded);
|
||||
// therefore, start with no strings and add only those needed.
|
||||
// Do this before processing code points, because they may add strings.
|
||||
if (!simple && foldSet.hasStrings()) {
|
||||
foldSet.strings.clear();
|
||||
}
|
||||
|
||||
UnicodeSet codePoints = maybeOnlyCaseSensitive(this);
|
||||
|
||||
// Iterate over the ranges of single code points. Nested loop for each code point.
|
||||
int n = codePoints.getRangeCount();
|
||||
for (int i=0; i<n; ++i) {
|
||||
int start = codePoints.getRangeStart(i);
|
||||
int end = codePoints.getRangeEnd(i);
|
||||
|
||||
if (simple) {
|
||||
for (int cp=start; cp<=end; ++cp) {
|
||||
csp.addSimpleCaseClosure(cp, foldSet);
|
||||
}
|
||||
} else {
|
||||
for (int cp=start; cp<=end; ++cp) {
|
||||
csp.addCaseClosure(cp, foldSet);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (hasStrings()) {
|
||||
StringBuilder sb = simple ? new StringBuilder() : null;
|
||||
for (String s : strings) {
|
||||
if (simple) {
|
||||
if (scfString(s, sb)) {
|
||||
foldSet.remove(s).add(sb);
|
||||
}
|
||||
} else {
|
||||
String str = UCharacter.foldCase(s, 0);
|
||||
if(!csp.addStringCaseClosure(str, foldSet)) {
|
||||
foldSet.add(str); // does not map to code points: add the folded string itself
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
set(foldSet);
|
||||
}
|
||||
|
||||
private void closeOverAddCaseMappings() {
|
||||
UCaseProps csp = UCaseProps.INSTANCE;
|
||||
// Start with input set to guarantee inclusion.
|
||||
UnicodeSet foldSet = new UnicodeSet(this);
|
||||
|
||||
UnicodeSet codePoints = maybeOnlyCaseSensitive(this);
|
||||
|
||||
// Iterate over the ranges of single code points. Nested loop for each code point.
|
||||
int n = codePoints.getRangeCount();
|
||||
int result;
|
||||
StringBuilder full = new StringBuilder();
|
||||
|
||||
for (int i=0; i<n; ++i) {
|
||||
int start = codePoints.getRangeStart(i);
|
||||
int end = codePoints.getRangeEnd(i);
|
||||
|
||||
// add case mappings
|
||||
// (does not add long s for regular s, or Kelvin for k, for example)
|
||||
for (int cp=start; cp<=end; ++cp) {
|
||||
result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full);
|
||||
|
||||
result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full);
|
||||
|
||||
result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full);
|
||||
|
||||
result = csp.toFullFolding(cp, full, 0);
|
||||
addCaseMapping(foldSet, result, full);
|
||||
}
|
||||
}
|
||||
if (hasStrings()) {
|
||||
ULocale root = ULocale.ROOT;
|
||||
BreakIterator bi = BreakIterator.getWordInstance(root);
|
||||
for (String str : strings) {
|
||||
// TODO: call lower-level functions
|
||||
foldSet.add(UCharacter.toLowerCase(root, str));
|
||||
foldSet.add(UCharacter.toTitleCase(root, str, bi));
|
||||
foldSet.add(UCharacter.toUpperCase(root, str));
|
||||
foldSet.add(UCharacter.foldCase(str, 0));
|
||||
}
|
||||
}
|
||||
set(foldSet);
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal class for customizing UnicodeSet parsing of properties.
|
||||
* TODO: extend to allow customizing of codepoint ranges
|
||||
|
|
|
@ -20,6 +20,7 @@ import java.util.HashSet;
|
|||
import java.util.Iterator;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
|
@ -32,6 +33,7 @@ import com.ibm.icu.dev.test.TestFmwk;
|
|||
import com.ibm.icu.dev.util.CollectionUtilities;
|
||||
import com.ibm.icu.impl.SortedSetRelation;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.CharacterProperties;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
|
@ -1323,38 +1325,98 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
@Test
|
||||
public void TestCloseOver() {
|
||||
String CASE = String.valueOf(UnicodeSet.CASE);
|
||||
String CASE_MAPPINGS = String.valueOf(UnicodeSet.ADD_CASE_MAPPINGS);
|
||||
String SIMPLE_CASE_INSENSITIVE = String.valueOf(UnicodeSet.SIMPLE_CASE_INSENSITIVE);
|
||||
String[] DATA = {
|
||||
// selector, input, output
|
||||
CASE,
|
||||
"[aq\u00DF{Bc}{bC}{Fi}]",
|
||||
"[aAqQ\u00DF\u1E9E\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
|
||||
|
||||
SIMPLE_CASE_INSENSITIVE,
|
||||
"[aq\u00DF{Bc}{bC}{Fi}]",
|
||||
"[aAqQ\u00DF\u1E9E{bc}{fi}]",
|
||||
|
||||
CASE,
|
||||
"[\u01F1]", // 'DZ'
|
||||
"[\u01F1\u01F2\u01F3]",
|
||||
|
||||
SIMPLE_CASE_INSENSITIVE,
|
||||
"[\u01F1]", // 'DZ'
|
||||
"[\u01F1\u01F2\u01F3]",
|
||||
|
||||
CASE,
|
||||
"[\u1FB4]",
|
||||
"[\u1FB4{\u03AC\u03B9}]",
|
||||
|
||||
SIMPLE_CASE_INSENSITIVE,
|
||||
"[\u1FB4]",
|
||||
"[\u1FB4]",
|
||||
|
||||
CASE,
|
||||
"[{F\uFB01}]",
|
||||
"[\uFB03{ffi}]",
|
||||
|
||||
CASE, // make sure binary search finds limits
|
||||
"[a\uFF3A]",
|
||||
"[aA\uFF3A\uFF5A]",
|
||||
|
||||
CASE,
|
||||
"[a-z]","[A-Za-z\u017F\u212A]",
|
||||
|
||||
SIMPLE_CASE_INSENSITIVE,
|
||||
"[a-z]","[A-Za-z\u017F\u212A]",
|
||||
|
||||
CASE,
|
||||
"[abc]","[A-Ca-c]",
|
||||
CASE,
|
||||
"[ABC]","[A-Ca-c]",
|
||||
|
||||
CASE, "[i]", "[iI]",
|
||||
|
||||
CASE, "[\u0130]", "[\u0130{i\u0307}]", // dotted I
|
||||
CASE, "[{i\u0307}]", "[\u0130{i\u0307}]", // i with dot
|
||||
|
||||
CASE, "[\u0131]", "[\u0131]", // dotless i
|
||||
|
||||
CASE, "[\u0390]", "[\u0390\u1FD3{\u03B9\u0308\u0301}]",
|
||||
|
||||
CASE, "[\u03c2]", "[\u03a3\u03c2\u03c3]", // sigmas
|
||||
|
||||
CASE, "[\u03f2]", "[\u03f2\u03f9]", // lunate sigmas
|
||||
|
||||
CASE, "[\u03f7]", "[\u03f7\u03f8]",
|
||||
|
||||
CASE, "[\u1fe3]", "[\u03b0\u1fe3{\u03c5\u0308\u0301}]",
|
||||
|
||||
CASE, "[\ufb05]", "[\ufb05\ufb06{st}]",
|
||||
CASE, "[{st}]", "[\ufb05\ufb06{st}]",
|
||||
|
||||
CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
|
||||
|
||||
CASE, "[{a\u02BE}]", "[\u1E9A{a\u02BE}]", // first in sorted table
|
||||
|
||||
CASE, "[{\u1f7c\u03b9}]", "[\u1ff2{\u1f7c\u03b9}]", // last in sorted table
|
||||
|
||||
CASE_MAPPINGS,
|
||||
"[aq\u00DF{Bc}{bC}{Fi}]",
|
||||
"[aAqQ\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
|
||||
|
||||
CASE_MAPPINGS,
|
||||
"[\u01F1]", // 'DZ'
|
||||
"[\u01F1\u01F2\u01F3]",
|
||||
|
||||
CASE_MAPPINGS,
|
||||
"[a-z]",
|
||||
"[A-Za-z]",
|
||||
};
|
||||
|
||||
UnicodeSet s = new UnicodeSet();
|
||||
UnicodeSet t = new UnicodeSet();
|
||||
for (int i=0; i<DATA.length; i+=3) {
|
||||
int selector = Integer.parseInt(DATA[i]);
|
||||
String pat = DATA[i+1];
|
||||
String exp = DATA[i+2];
|
||||
String pat = Utility.unescape(DATA[i+1]);
|
||||
String exp = Utility.unescape(DATA[i+2]);
|
||||
s.applyPattern(pat);
|
||||
s.closeOver(selector);
|
||||
t.applyPattern(exp);
|
||||
|
@ -1371,6 +1433,149 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
expectContainment(s, "abcABC", "defDEF");
|
||||
s = new UnicodeSet("[^abc]", UnicodeSet.CASE);
|
||||
expectContainment(s, "defDEF", "abcABC");
|
||||
s = new UnicodeSet("[abck]", UnicodeSet.ADD_CASE_MAPPINGS);
|
||||
expectContainment(s, "abckABCK", "defDEF\u212A");
|
||||
}
|
||||
|
||||
private void add(Map<Integer, Collection<Integer>> closure, Integer c, Integer t) {
|
||||
Collection<Integer> values = closure.get(c);
|
||||
if (values == null) {
|
||||
values = new TreeSet<>();
|
||||
closure.put(c, values);
|
||||
}
|
||||
values.add(t);
|
||||
}
|
||||
|
||||
private void addIfAbsent(Map<Integer, Collection<Integer>> closure, Integer c, Integer t,
|
||||
Map<Integer, Collection<Integer>> additions) {
|
||||
Collection<Integer> values = closure.get(c);
|
||||
if (values == null || !values.contains(t)) {
|
||||
if (additions != closure) {
|
||||
values = additions.get(c);
|
||||
}
|
||||
if (values == null) {
|
||||
values = new TreeSet<>();
|
||||
additions.put(c, values);
|
||||
}
|
||||
values.add(t);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestCloseOverSimpleCaseFolding() {
|
||||
UnicodeSet sensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE);
|
||||
// Compute the scf=Simple_Case_Folding closure:
|
||||
// For each scf(c)=t, start with mappings c->t and t->c.
|
||||
|
||||
// Poor man's multimap from code points to code points.
|
||||
Map<Integer, Collection<Integer>> closure = new HashMap<>();
|
||||
UnicodeSetIterator iter = new UnicodeSetIterator(sensitive);
|
||||
while (iter.next()) {
|
||||
int c = iter.codepoint;
|
||||
int scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT);
|
||||
if (scfChar != c) {
|
||||
add(closure, c, scfChar);
|
||||
add(closure, scfChar, c);
|
||||
}
|
||||
}
|
||||
// Complete the closure: Add mappings of mappings.
|
||||
Map<Integer, Collection<Integer>> additions = new HashMap<>();
|
||||
for (;;) {
|
||||
// for each mapping c->t
|
||||
for (Map.Entry<Integer, Collection<Integer>> entry : closure.entrySet()) {
|
||||
Integer c = entry.getKey();
|
||||
Collection<Integer> cValues = entry.getValue();
|
||||
for (Integer t : cValues) {
|
||||
// enumerate each t->u
|
||||
Collection<Integer> tValues = closure.get(t);
|
||||
if (tValues != null) {
|
||||
for (Integer u : tValues) {
|
||||
if (!u.equals(c)) {
|
||||
addIfAbsent(closure, c, u, additions);
|
||||
addIfAbsent(closure, u, c, additions);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
if (additions.isEmpty()) {
|
||||
break; // The closure is complete.
|
||||
}
|
||||
// Add all of the additions back into the closure.
|
||||
for (Map.Entry<Integer, Collection<Integer>> entry : additions.entrySet()) {
|
||||
Integer c = entry.getKey();
|
||||
Collection<Integer> cValues = entry.getValue();
|
||||
Collection<Integer> closureValues = closure.get(c);
|
||||
if (closureValues == null) {
|
||||
closureValues = new TreeSet<>();
|
||||
closure.put(c, closureValues);
|
||||
}
|
||||
closureValues.addAll(cValues);
|
||||
}
|
||||
additions.clear();
|
||||
}
|
||||
// Compare closeOver(USET_SIMPLE_CASE_INSENSITIVE) with an unoptimized implementation.
|
||||
// Here we focus on single code points as input.
|
||||
// Other examples, including strings, are tested in TestCloseOver().
|
||||
int errors = 0;
|
||||
iter.reset();
|
||||
UnicodeSet set = new UnicodeSet(), expected = new UnicodeSet();
|
||||
while (iter.next()) {
|
||||
int c = iter.codepoint;
|
||||
// closeOver()
|
||||
set.clear().add(c);
|
||||
set.closeOver(UnicodeSet.SIMPLE_CASE_INSENSITIVE);
|
||||
// From-first-principles implementation.
|
||||
expected.clear().add(c);
|
||||
Collection<Integer> values = closure.get(c);
|
||||
if (values != null) {
|
||||
for (Integer t : values) {
|
||||
expected.add(t);
|
||||
}
|
||||
}
|
||||
// compare
|
||||
if (!checkEqual(expected, set, "closeOver() vs. test impl")) {
|
||||
errln(" c=U+" + Utility.hex(c));
|
||||
if (++errors == 10) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestCloseOverLargeSets() {
|
||||
// Check that an optimization for large sets does not change the result.
|
||||
|
||||
// Most code points except ones that are boring for case mappings.
|
||||
UnicodeSet manyCp = new UnicodeSet("[^[:C:][:Ideographic:][:Hang:]]");
|
||||
// Main Unihan block.
|
||||
int LARGE_START = 0x4E00;
|
||||
int LARGE_END = 0x9FFF;
|
||||
|
||||
int OPTIONS[] = {
|
||||
UnicodeSet.CASE_INSENSITIVE, UnicodeSet.ADD_CASE_MAPPINGS,
|
||||
UnicodeSet.SIMPLE_CASE_INSENSITIVE
|
||||
};
|
||||
UnicodeSet input = new UnicodeSet(), small, large;
|
||||
for (int option : OPTIONS) {
|
||||
UnicodeSetIterator iter = new UnicodeSetIterator(manyCp);
|
||||
while (iter.next()) {
|
||||
int c = iter.codepoint;
|
||||
input.clear().add(c);
|
||||
small = (UnicodeSet) input.clone();
|
||||
small.closeOver(option);
|
||||
large = (UnicodeSet) input.clone();
|
||||
large.add(LARGE_START, LARGE_END);
|
||||
large.closeOver(option);
|
||||
large.remove(LARGE_START, LARGE_END);
|
||||
if (!checkEqual(small, large, "small != large")) {
|
||||
errln(" option=" + option + " c=U+" + Utility.hex(c));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -1709,8 +1914,8 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
test2.add("a" + (max - i)); // add in reverse order
|
||||
}
|
||||
assertNotEquals("compare iterable test", test1, test2);
|
||||
TreeSet<CharSequence> sortedTest1 = new TreeSet<CharSequence>(test1);
|
||||
TreeSet<CharSequence> sortedTest2 = new TreeSet<CharSequence>(test2);
|
||||
TreeSet<CharSequence> sortedTest1 = new TreeSet<>(test1);
|
||||
TreeSet<CharSequence> sortedTest2 = new TreeSet<>(test2);
|
||||
assertEquals("compare iterable test", sortedTest1, sortedTest2);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue