ICU-6065 UnicodeSet::closeOver(simple case folding)

See 
This commit is contained in:
Markus Scherer 2023-03-02 00:25:11 +00:00
parent 2864379937
commit 79ab90b5f9
14 changed files with 1228 additions and 391 deletions
icu4c/source
icu4j/main
classes/core/src/com/ibm/icu
tests/core/src/com/ibm/icu/dev/test/lang

View file

@ -377,22 +377,30 @@ UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
} // namespace
U_NAMESPACE_USE
U_NAMESPACE_BEGIN
U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
if (U_FAILURE(*pErrorCode)) { return nullptr; }
const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return nullptr; }
if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return nullptr;
}
Mutex m(&cpMutex);
UnicodeSet *set = sets[property];
if (set == nullptr) {
sets[property] = set = makeSet(property, *pErrorCode);
sets[property] = set = makeSet(property, errorCode);
}
if (U_FAILURE(*pErrorCode)) { return nullptr; }
return set->toUSet();
return set;
}
U_NAMESPACE_END
U_NAMESPACE_USE
U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
const UnicodeSet *set = CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
return U_SUCCESS(*pErrorCode) ? set->toUSet() : nullptr;
}
U_CAPI const UCPMap * U_EXPORT2

View file

@ -205,37 +205,7 @@ static const char16_t iDotTilde[3] = { 0x69, 0x307, 0x303 };
U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
uint16_t props;
/*
* Hardcode the case closure of i and its relatives and ignore the
* data file data for these characters.
* The Turkic dotless i and dotted I with their case mapping conditions
* and case folding option make the related characters behave specially.
* This code matches their closure behavior to their case folding behavior.
*/
switch(c) {
case 0x49:
/* regular i and I are in one equivalence class */
sa->add(sa->set, 0x69);
return;
case 0x69:
sa->add(sa->set, 0x49);
return;
case 0x130:
/* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
sa->addString(sa->set, iDot, 2);
return;
case 0x131:
/* dotless i is in a class by itself */
return;
default:
/* otherwise use the data file data */
break;
}
props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
/* add the one simple case mapping, no matter what type it is */
@ -249,19 +219,42 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
* c has exceptions, so there may be multiple simple and/or
* full case mappings. Add them all.
*/
const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
const char16_t *closure;
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
uint16_t excWord=*pe++;
int32_t idx, closureLength, fullLength, length;
const uint16_t *pe0=pe;
pe0=pe;
// Hardcode the case closure of i and its relatives and ignore the
// data file data for these characters.
// The Turkic dotless i and dotted I with their case mapping conditions
// and case folding option make the related characters behave specially.
// This code matches their closure behavior to their case folding behavior.
if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
// These characters have Turkic case foldings. Hardcode their closure.
if (c == 0x49) {
// Regular i and I are in one equivalence class.
sa->add(sa->set, 0x69);
return;
} else if (c == 0x130) {
// Dotted I is in a class with <0069 0307>
// (for canonical equivalence with <0049 0307>).
sa->addString(sa->set, iDot, 2);
return;
}
} else if (c == 0x69) {
sa->add(sa->set, 0x49);
return;
} else if (c == 0x131) {
// Dotless i is in a class by itself.
return;
}
/* add all simple case mappings */
for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
if(HAS_SLOT(excWord, idx)) {
pe=pe0;
GET_SLOT_VALUE(excWord, idx, pe, c);
sa->add(sa->set, c);
UChar32 mapping;
GET_SLOT_VALUE(excWord, idx, pe, mapping);
sa->add(sa->set, mapping);
}
}
if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
@ -272,6 +265,8 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
}
/* get the closure string pointer & length */
const char16_t *closure;
int32_t closureLength;
if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
pe=pe0;
GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
@ -285,6 +280,7 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
/* add the full case folding */
if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
pe=pe0;
int32_t fullLength;
GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
/* start of full case mapping strings */
@ -297,7 +293,7 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
fullLength>>=4;
/* add the full case folding string */
length=fullLength&0xf;
int32_t length=fullLength&0xf;
if(length!=0) {
sa->addString(sa->set, (const char16_t *)pe, length);
pe+=length;
@ -313,9 +309,146 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
}
/* add each code point in the closure string */
for(idx=0; idx<closureLength;) {
U16_NEXT_UNSAFE(closure, idx, c);
sa->add(sa->set, c);
for(int32_t idx=0; idx<closureLength;) {
UChar32 mapping;
U16_NEXT_UNSAFE(closure, idx, mapping);
sa->add(sa->set, mapping);
}
}
}
namespace {
/**
 * Adds t to the set as a simple case closure mapping for c,
 * unless c and t form one of the hardcoded pairs below for which there is
 * not actually an scf relationship between the two characters.
 * TODO: Unicode should probably add the corresponding scf mappings.
 * See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
 * If & when those scf mappings are added, we should be able to remove all of these exceptions.
 */
void addOneSimpleCaseClosure(UChar32 c, UChar32 t, const USetAdder *sa) {
    // Each pair is suppressed in both directions (c->t and t->c).
    static const UChar32 kPairsWithoutScf[][2] = {
        { 0x0390, 0x1FD3 },
        { 0x03B0, 0x1FE3 },
        { 0xFB05, 0xFB06 }
    };
    for (const auto &pair : kPairsWithoutScf) {
        const bool forward = (c == pair[0] && t == pair[1]);
        const bool backward = (c == pair[1] && t == pair[0]);
        if (forward || backward) {
            return;
        }
    }
    sa->add(sa->set, t);
}
} // namespace
/*
 * Adds, through the USetAdder callbacks in sa, the code points that belong to
 * the scf=Simple_Case_Folding closure of c.
 *
 * - If c has no exception data: adds c+delta when c has a case type and a non-zero delta.
 * - If c has exception data: handles I, i, dotted I (U+0130) and dotless i (U+0131)
 *   with hardcoded results, otherwise adds the simple mappings, the delta mapping,
 *   and the code points of the closure string. All of these go through
 *   addOneSimpleCaseClosure(), which filters out a few hardcoded pairs.
 *
 * Unlike ucase_addCaseClosure(), this function never calls sa->addString():
 * the full case mapping strings are only skipped over in order to locate the closure string.
 */
U_CFUNC void U_EXPORT2
ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
/* add the one simple case mapping, no matter what type it is */
int32_t delta=UCASE_GET_DELTA(props);
if(delta!=0) {
sa->add(sa->set, c+delta);
}
}
} else {
// c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
uint16_t excWord=*pe++;
// pe0 remembers the position right after the exception word;
// pe is reset to it before each slot lookup below.
const uint16_t *pe0=pe;
// Hardcode the case closure of i and its relatives and ignore the
// data file data for these characters, like in ucase_addCaseClosure().
if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
// These characters have Turkic case foldings. Hardcode their closure.
if (c == 0x49) {
// Regular i and I are in one equivalence class.
sa->add(sa->set, 0x69);
return;
} else if (c == 0x130) {
// For scf=Simple_Case_Folding, dotted I is in a class by itself.
return;
}
} else if (c == 0x69) {
sa->add(sa->set, 0x49);
return;
} else if (c == 0x131) {
// Dotless i is in a class by itself.
return;
}
// Add all simple case mappings.
for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
if(HAS_SLOT(excWord, idx)) {
pe=pe0;
UChar32 mapping;
GET_SLOT_VALUE(excWord, idx, pe, mapping);
addOneSimpleCaseClosure(c, mapping, sa);
}
}
// The delta slot stores a magnitude; UCASE_EXC_DELTA_IS_NEGATIVE selects the sign.
if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
pe=pe0;
int32_t delta;
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
addOneSimpleCaseClosure(c, mapping, sa);
}
/* get the closure string pointer & length */
const char16_t *closure;
int32_t closureLength;
if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
pe=pe0;
GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
} else {
closureLength=0;
closure=nullptr;
}
// Skip the full case mappings.
// This is only needed to find the start of the closure string,
// so it is not done when there is no closure string.
if(closureLength > 0 && HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
pe=pe0;
int32_t fullLength;
GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
/* start of full case mapping strings */
++pe;
fullLength&=0xffff; /* bits 16 and higher are reserved */
// Skip all 4 full case mappings.
// Each step takes the low bits of fullLength as one string length,
// then shifts fullLength right by 4 bits to get to the next length.
pe+=fullLength&UCASE_FULL_LOWER;
fullLength>>=4;
pe+=fullLength&0xf;
fullLength>>=4;
pe+=fullLength&0xf;
fullLength>>=4;
pe+=fullLength;
closure=(const char16_t *)pe; /* behind full case mappings */
}
// Add each code point in the closure string whose scf maps back to c.
// (The filtering is done by addOneSimpleCaseClosure().)
for(int32_t idx=0; idx<closureLength;) {
UChar32 mapping;
U16_NEXT_UNSAFE(closure, idx, mapping);
addOneSimpleCaseClosure(c, mapping, sa);
}
}
}

View file

@ -108,6 +108,10 @@ ucase_fold(UChar32 c, uint32_t options);
U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c, const USetAdder *sa);
/** Case closure with only scf=Simple_Case_Folding. */
U_CFUNC void U_EXPORT2
ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa);
/**
* Maps the string to single code points and adds the associated case closure
* mappings.

View file

@ -430,7 +430,9 @@ public:
* description for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param options bitmask for options to apply to the pattern.
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
* Valid options are USET_IGNORE_SPACE and
* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
* These case options are mutually exclusive.
* @param symbols a symbol table mapping variable names to values
* and stand-in characters to UnicodeSets; may be nullptr
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
@ -450,7 +452,9 @@ public:
* @param pos on input, the position in pattern at which to start parsing.
* On output, the position after the last character parsed.
* @param options bitmask for options to apply to the pattern.
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
* Valid options are USET_IGNORE_SPACE and
* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
* These case options are mutually exclusive.
* @param symbols a symbol table mapping variable names to values
* and stand-in characters to UnicodeSets; may be nullptr
* @param status input-output error code
@ -645,7 +649,9 @@ public:
* A frozen set will not be modified.
* @param pattern a string specifying what characters are in the set
* @param options bitmask for options to apply to the pattern.
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
* Valid options are USET_IGNORE_SPACE and
* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
* These case options are mutually exclusive.
* @param symbols a symbol table mapping variable names to
* values and stand-ins to UnicodeSets; may be nullptr
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
@ -683,7 +689,9 @@ public:
* pattern.length() if the closing ']' is the last character of
* the pattern string.
* @param options bitmask for options to apply to the pattern.
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
* Valid options are USET_IGNORE_SPACE and
* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
* These case options are mutually exclusive.
* @param symbols a symbol table mapping variable names to
* values and stand-ins to UnicodeSets; may be nullptr
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
@ -1390,7 +1398,7 @@ public:
/**
* Close this set over the given attribute. For the attribute
* USET_CASE, the result is to modify this set so that:
* USET_CASE_INSENSITIVE, the result is to modify this set so that:
*
* 1. For each character or string 'a' in this set, all strings or
* characters 'b' such that foldCase(a) == foldCase(b) are added
@ -1408,8 +1416,10 @@ public:
* A frozen set will not be modified.
*
* @param attribute bitmask for attributes to close over.
* Currently only the USET_CASE bit is supported. Any undefined bits
* are ignored.
* Valid options:
* At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
* These case options are mutually exclusive.
* Unrelated option bits are ignored.
* @return a reference to this set.
* @stable ICU 4.2
*/
@ -1579,6 +1589,9 @@ private:
int32_t depth,
UErrorCode& ec);
void closeOverCaseInsensitive(bool simple);
void closeOverAddCaseMappings();
//----------------------------------------------------------------
// Implementation: Utility methods
//----------------------------------------------------------------

View file

@ -53,6 +53,12 @@ typedef struct USet USet;
/**
* Bitmask values to be passed to uset_openPatternOptions() or
* uset_applyPattern() taking an option parameter.
*
* Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
* These case options are mutually exclusive.
*
* Undefined option bits are ignored, and reserved for future use.
*
* @stable ICU 2.4
*/
enum {
@ -60,13 +66,13 @@ enum {
* Ignore white space within patterns unless quoted or escaped.
* @stable ICU 2.4
*/
USET_IGNORE_SPACE = 1,
USET_IGNORE_SPACE = 1,
/**
* Enable case insensitive matching. E.g., "[ab]" with this flag
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
* match all except 'a', 'A', 'b', and 'B'. This performs a full
* closure over case mappings, e.g. U+017F for s.
* closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
*
* The resulting set is a superset of the input for the code points but
* not for the strings.
@ -88,17 +94,36 @@ enum {
*
* @stable ICU 2.4
*/
USET_CASE_INSENSITIVE = 2,
USET_CASE_INSENSITIVE = 2,
/**
* Enable case insensitive matching. E.g., "[ab]" with this flag
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
* match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
* title-, and uppercase mappings as well as the case folding
* Adds all case mappings for each element in the set.
* This adds the full lower-, title-, and uppercase mappings as well as the full case folding
* of each existing element in the set.
*
* Unlike the case insensitive options, this does not perform a closure.
* For example, it does not add 'ſ' (U+017F long s) for 's',
* 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
*
* @stable ICU 3.2
*/
USET_ADD_CASE_MAPPINGS = 4
USET_ADD_CASE_MAPPINGS = 4,
#ifndef U_HIDE_DRAFT_API
/**
* Enable case insensitive matching.
* Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings,
* which map each code point to one code point,
* not full Case_Folding (cf) mappings, which map some code points to multiple code points.
*
* This is designed for case-insensitive matches, for example in certain
* regular expression implementations where only Simple_Case_Folding mappings are used,
* such as in ECMAScript (JavaScript) regular expressions.
*
* @draft ICU 73
*/
USET_SIMPLE_CASE_INSENSITIVE = 6
#endif // U_HIDE_DRAFT_API
};
/**
@ -299,7 +324,9 @@ uset_openPattern(const UChar* pattern, int32_t patternLength,
* @param patternLength the length of the pattern, or -1 if null
* terminated
* @param options bitmask for options to apply to the pattern.
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
* Valid options are USET_IGNORE_SPACE and
* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
* These case options are mutually exclusive.
* @param ec the error code
* @stable ICU 2.4
*/
@ -414,7 +441,10 @@ uset_set(USet* set,
* The character at pattern[0] must be a '['.
* @param patternLength The length of the UChar string. -1 if NUL terminated.
* @param options A bitmask for options to apply to the pattern.
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
* Valid options are USET_IGNORE_SPACE and
* at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS,
* USET_SIMPLE_CASE_INSENSITIVE.
* These case options are mutually exclusive.
* @param status Returns an error if the pattern cannot be parsed.
* @return Upon successful parse, the value is either
* the index of the character after the closing ']'
@ -804,7 +834,7 @@ uset_clear(USet* set);
/**
* Close this set over the given attribute. For the attribute
* USET_CASE, the result is to modify this set so that:
* USET_CASE_INSENSITIVE, the result is to modify this set so that:
*
* 1. For each character or string 'a' in this set, all strings or
* characters 'b' such that foldCase(a) == foldCase(b) are added
@ -824,8 +854,10 @@ uset_clear(USet* set);
* @param set the set
*
* @param attributes bitmask for attributes to close over.
* Currently only the USET_CASE bit is supported. Any undefined bits
* are ignored.
* Valid options:
* At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
* These case options are mutually exclusive.
* Unrelated option bits are ignored.
* @stable ICU 4.2
*/
U_CAPI void U_EXPORT2

View file

@ -25,9 +25,11 @@
#include "unicode/locid.h"
#include "unicode/parsepos.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "cmemory.h"
#include "ruleiter.h"
#include "ucase.h"
#include "uprops.h"
#include "util.h"
#include "uvector.h"
@ -149,102 +151,208 @@ addCaseMapping(UnicodeSet &set, int32_t result, const char16_t *full, UnicodeStr
// see ucase.h
}
namespace {
/**
 * For case closure on a large set, look only at code points with relevant properties.
 *
 * @param src    the set whose code points are to be closed over
 * @param subset scratch set owned by the caller; must contain all code points on input
 * @return src itself if it has fewer than 30 elements or if the Case_Sensitive set
 *         cannot be obtained; otherwise subset, which then holds the code points
 *         of src that are Case_Sensitive (and no strings)
 */
const UnicodeSet &maybeOnlyCaseSensitive(const UnicodeSet &src, UnicodeSet &subset) {
    // subset has to start out with every code point, so that the retainAll()
    // intersections below effectively copy the single code points of their arguments.
    U_ASSERT(subset.contains(0, 0x10ffff));
    if (src.size() < 30) {
        return src;
    }
    UErrorCode errorCode = U_ZERO_ERROR;
    const UnicodeSet *sensitive =
        CharacterProperties::getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode);
    if (U_FAILURE(errorCode)) {
        return src;
    }
    // Intersect with the "smaller" set (fewer ranges) first, then with the other one.
    // Intersecting the all-code-points subset with a set copies that set's
    // code points and omits any strings.
    const bool sensitiveIsSmaller = src.getRangeCount() > sensitive->getRangeCount();
    const UnicodeSet &first = sensitiveIsSmaller ? *sensitive : src;
    const UnicodeSet &second = sensitiveIsSmaller ? src : *sensitive;
    subset.retainAll(first);
    subset.retainAll(second);
    return subset;
}
// Per-character scf = Simple_Case_Folding of a string.
// (Normally when we case-fold a string we use full case foldings.)
bool scfString(const UnicodeString &s, UnicodeString &scf) {
// Iterate over the raw buffer for best performance.
const char16_t *p = s.getBuffer();
int32_t length = s.length();
// Loop while not needing modification.
for (int32_t i = 0; i < length;) {
UChar32 c;
U16_NEXT(p, i, length, c); // post-increments i
UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
if (scfChar != c) {
// Copy the characters before c.
scf.setTo(p, i - U16_LENGTH(c));
// Loop over the rest of the string and keep case-folding.
for (;;) {
scf.append(scfChar);
if (i == length) {
return true;
}
U16_NEXT(p, i, length, c); // post-increments i
scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
}
}
}
return false;
}
} // namespace
UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
if (isFrozen() || isBogus()) {
return *this;
}
if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
{
UnicodeSet foldSet(*this);
UnicodeString str;
USetAdder sa = {
foldSet.toUSet(),
_set_add,
_set_addRange,
_set_addString,
nullptr, // don't need remove()
nullptr // don't need removeRange()
};
// start with input set to guarantee inclusion
// USET_CASE: remove strings because the strings will actually be reduced (folded);
// therefore, start with no strings and add only those needed
if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) {
foldSet.strings->removeAllElements();
}
int32_t n = getRangeCount();
UChar32 result;
const char16_t *full;
for (int32_t i=0; i<n; ++i) {
UChar32 start = getRangeStart(i);
UChar32 end = getRangeEnd(i);
if (attribute & USET_CASE_INSENSITIVE) {
// full case closure
for (UChar32 cp=start; cp<=end; ++cp) {
ucase_addCaseClosure(cp, &sa);
}
} else {
// add case mappings
// (does not add long s for regular s, or Kelvin for k, for example)
for (UChar32 cp=start; cp<=end; ++cp) {
result = ucase_toFullLower(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
addCaseMapping(foldSet, result, full, str);
result = ucase_toFullTitle(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
addCaseMapping(foldSet, result, full, str);
result = ucase_toFullUpper(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
addCaseMapping(foldSet, result, full, str);
result = ucase_toFullFolding(cp, &full, 0);
addCaseMapping(foldSet, result, full, str);
}
}
}
if (hasStrings()) {
if (attribute & USET_CASE_INSENSITIVE) {
for (int32_t j=0; j<strings->size(); ++j) {
str = *(const UnicodeString *) strings->elementAt(j);
str.foldCase();
if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) {
foldSet.add(str); // does not map to code points: add the folded string itself
}
}
} else {
Locale root("");
#if !UCONFIG_NO_BREAK_ITERATION
UErrorCode status = U_ZERO_ERROR;
BreakIterator *bi = BreakIterator::createWordInstance(root, status);
if (U_SUCCESS(status)) {
#endif
const UnicodeString *pStr;
for (int32_t j=0; j<strings->size(); ++j) {
pStr = (const UnicodeString *) strings->elementAt(j);
(str = *pStr).toLower(root);
foldSet.add(str);
#if !UCONFIG_NO_BREAK_ITERATION
(str = *pStr).toTitle(bi, root);
foldSet.add(str);
#endif
(str = *pStr).toUpper(root);
foldSet.add(str);
(str = *pStr).foldCase();
foldSet.add(str);
}
#if !UCONFIG_NO_BREAK_ITERATION
}
delete bi;
#endif
}
}
*this = foldSet;
}
switch (attribute & USET_CASE_MASK) {
case 0:
break;
case USET_CASE_INSENSITIVE:
closeOverCaseInsensitive(/* simple= */ false);
break;
case USET_ADD_CASE_MAPPINGS:
closeOverAddCaseMappings();
break;
case USET_SIMPLE_CASE_INSENSITIVE:
closeOverCaseInsensitive(/* simple= */ true);
break;
default:
// bad option (unreachable)
break;
}
return *this;
}
void UnicodeSet::closeOverCaseInsensitive(bool simple) {
// Start with input set to guarantee inclusion.
UnicodeSet foldSet(*this);
// Full case mappings closure:
// Remove strings because the strings will actually be reduced (folded);
// therefore, start with no strings and add only those needed.
// Do this before processing code points, because they may add strings.
if (!simple && foldSet.hasStrings()) {
foldSet.strings->removeAllElements();
}
USetAdder sa = {
foldSet.toUSet(),
_set_add,
_set_addRange,
_set_addString,
nullptr, // don't need remove()
nullptr // don't need removeRange()
};
UnicodeSet subset(0, 0x10ffff);
const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset);
// Iterate over the ranges of single code points. Nested loop for each code point.
int32_t n = codePoints.getRangeCount();
for (int32_t i=0; i<n; ++i) {
UChar32 start = codePoints.getRangeStart(i);
UChar32 end = codePoints.getRangeEnd(i);
if (simple) {
for (UChar32 cp=start; cp<=end; ++cp) {
ucase_addSimpleCaseClosure(cp, &sa);
}
} else {
for (UChar32 cp=start; cp<=end; ++cp) {
ucase_addCaseClosure(cp, &sa);
}
}
}
if (hasStrings()) {
UnicodeString str;
for (int32_t j=0; j<strings->size(); ++j) {
const UnicodeString *pStr = (const UnicodeString *) strings->elementAt(j);
if (simple) {
if (scfString(*pStr, str)) {
foldSet.remove(*pStr).add(str);
}
} else {
str = *pStr;
str.foldCase();
if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) {
foldSet.add(str); // does not map to code points: add the folded string itself
}
}
}
}
*this = foldSet;
}
/**
 * Implements closeOver() for USET_ADD_CASE_MAPPINGS.
 * For each code point considered, adds its full lowercase, titlecase and uppercase
 * mappings (root locale) and its full case folding.
 * For each string, adds its lowercased, titlecased (only when break iteration is
 * compiled in), uppercased and case-folded versions.
 * The original elements are kept because the result is built in a copy of this set.
 */
void UnicodeSet::closeOverAddCaseMappings() {
// Start with input set to guarantee inclusion.
UnicodeSet foldSet(*this);
// subset starts with all code points, as required by maybeOnlyCaseSensitive().
UnicodeSet subset(0, 0x10ffff);
const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset);
// Iterate over the ranges of single code points. Nested loop for each code point.
int32_t n = codePoints.getRangeCount();
UChar32 result;
const char16_t *full;
UnicodeString str;
for (int32_t i=0; i<n; ++i) {
UChar32 start = codePoints.getRangeStart(i);
UChar32 end = codePoints.getRangeEnd(i);
// add case mappings
// (does not add long s for regular s, or Kelvin for k, for example)
for (UChar32 cp=start; cp<=end; ++cp) {
result = ucase_toFullLower(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
addCaseMapping(foldSet, result, full, str);
result = ucase_toFullTitle(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
addCaseMapping(foldSet, result, full, str);
result = ucase_toFullUpper(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
addCaseMapping(foldSet, result, full, str);
result = ucase_toFullFolding(cp, &full, 0);
addCaseMapping(foldSet, result, full, str);
}
}
// The strings are taken from this set, not from foldSet, which is being modified.
if (hasStrings()) {
Locale root("");
#if !UCONFIG_NO_BREAK_ITERATION
// Titlecasing uses a word break iterator.
// NOTE: if it cannot be created, the whole string loop below is skipped,
// so no string mappings (not even lower/upper/fold) are added in that case.
UErrorCode status = U_ZERO_ERROR;
BreakIterator *bi = BreakIterator::createWordInstance(root, status);
if (U_SUCCESS(status)) {
#endif
for (int32_t j=0; j<strings->size(); ++j) {
const UnicodeString *pStr = (const UnicodeString *) strings->elementAt(j);
(str = *pStr).toLower(root);
foldSet.add(str);
#if !UCONFIG_NO_BREAK_ITERATION
(str = *pStr).toTitle(bi, root);
foldSet.add(str);
#endif
(str = *pStr).toUpper(root);
foldSet.add(str);
(str = *pStr).foldCase();
foldSet.add(str);
}
#if !UCONFIG_NO_BREAK_ITERATION
}
// Deleted outside the success check, i.e. also when creation reported a failure.
delete bi;
#endif
}
*this = foldSet;
}
U_NAMESPACE_END

View file

@ -631,11 +631,8 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
* to close over case BEFORE COMPLEMENTING. This makes
* patterns like /[^abc]/i work.
*/
if ((options & USET_CASE_INSENSITIVE) != 0) {
(this->*caseClosure)(USET_CASE_INSENSITIVE);
}
else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
(this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
if ((options & USET_CASE_MASK) != 0) {
(this->*caseClosure)(options);
}
if (invert) {
complement().removeAllStrings(); // code point complement

View file

@ -441,6 +441,7 @@ class CharacterProperties {
public:
CharacterProperties() = delete;
static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode);
static const UnicodeSet *getBinaryPropertySet(UProperty property, UErrorCode &errorCode);
};
// implemented in uniset_props.cpp

View file

@ -58,5 +58,14 @@ typedef struct USetAdder USetAdder;
U_CDECL_END
#endif
#ifdef __cplusplus
namespace {
// Mask for the case-related option bits.
// USET_CASE_INSENSITIVE (2) and USET_ADD_CASE_MAPPINGS (4) are single bits;
// USET_SIMPLE_CASE_INSENSITIVE (6) is the combination of both bits,
// so this mask covers all three options without naming the draft constant.
constexpr int32_t USET_CASE_MASK = USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS;
} // namespace
#endif // __cplusplus
#endif

View file

@ -14,6 +14,7 @@
#include <stdio.h>
#include <string.h>
#include <unordered_map>
#include "unicode/utypes.h"
#include "usettest.h"
#include "unicode/ucnv.h"
@ -85,6 +86,8 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE_AUTO(TestStrings);
TESTCASE_AUTO(Testj2268);
TESTCASE_AUTO(TestCloseOver);
TESTCASE_AUTO(TestCloseOverSimpleCaseFolding);
TESTCASE_AUTO(TestCloseOverLargeSets);
TESTCASE_AUTO(TestEscapePattern);
TESTCASE_AUTO(TestInvalidCodePoint);
TESTCASE_AUTO(TestSymbolTable);
@ -1243,27 +1246,38 @@ void UnicodeSetTest::TestIndexOf() {
* Test closure API.
*/
void UnicodeSetTest::TestCloseOver() {
UErrorCode ec = U_ZERO_ERROR;
char CASE[] = {(char)USET_CASE_INSENSITIVE};
char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
const char* DATA[] = {
static constexpr char CASE[] = {(char)USET_CASE_INSENSITIVE};
static constexpr char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
static constexpr char SIMPLE_CASE_INSENSITIVE[] = {(char)USET_SIMPLE_CASE_INSENSITIVE};
static const char* DATA[] = {
// selector, input, output
CASE,
"[aq\\u00DF{Bc}{bC}{Fi}]",
"[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
SIMPLE_CASE_INSENSITIVE,
"[aq\\u00DF{Bc}{bC}{Fi}]",
"[aAqQ\\u00DF\\u1E9E{bc}{fi}]",
CASE,
"[\\u01F1]", // 'DZ'
"[\\u01F1\\u01F2\\u01F3]",
SIMPLE_CASE_INSENSITIVE,
"[\\u01F1]", // 'DZ'
"[\\u01F1\\u01F2\\u01F3]",
CASE,
"[\\u1FB4]",
"[\\u1FB4{\\u03AC\\u03B9}]",
SIMPLE_CASE_INSENSITIVE,
"[\\u1FB4]",
"[\\u1FB4]",
CASE,
"[{F\\uFB01}]",
"[\\uFB03{ffi}]",
"[\\uFB03{ffi}]",
CASE, // make sure binary search finds limits
"[a\\uFF3A]",
@ -1271,6 +1285,10 @@ void UnicodeSetTest::TestCloseOver() {
CASE,
"[a-z]","[A-Za-z\\u017F\\u212A]",
SIMPLE_CASE_INSENSITIVE,
"[a-z]","[A-Za-z\\u017F\\u212A]",
CASE,
"[abc]","[A-Ca-c]",
CASE,
@ -1311,7 +1329,7 @@ void UnicodeSetTest::TestCloseOver() {
CASE_MAPPINGS,
"[\\u01F1]", // 'DZ'
"[\\u01F1\\u01F2\\u01F3]",
CASE_MAPPINGS,
"[a-z]",
"[A-Za-z]",
@ -1326,6 +1344,8 @@ void UnicodeSetTest::TestCloseOver() {
int32_t selector = DATA[i][0];
UnicodeString pat(DATA[i+1], -1, US_INV);
UnicodeString exp(DATA[i+2], -1, US_INV);
UErrorCode ec = U_ZERO_ERROR;
s.applyPattern(pat, ec);
s.closeOver(selector);
t.applyPattern(exp, ec);
@ -1341,68 +1361,8 @@ void UnicodeSetTest::TestCloseOver() {
}
}
#if 0
/*
* Unused test code.
* This was used to compare the old implementation (using USET_CASE)
* with the new one (using 0x100 temporarily)
* while transitioning from hardcoded case closure tables in uniset.cpp
* (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
* and using ucase.c functions for closure.
* See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
*
* Note: The old and new implementation never fully matched because
* the old implementation turned out to not map U+0130 and U+0131 correctly
* (dotted I and dotless i) and because the old implementation's data tables
* were outdated compared to Unicode 4.0.1 at the time of the change to the
* new implementation. (So sigmas and some other characters were not handled
* according to the newer Unicode version.)
*/
UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
UnicodeSetIterator si(sens);
UnicodeString str, buf2;
const UnicodeString *pStr;
UChar32 c;
while(si.next()) {
if(!si.isString()) {
c=si.getCodepoint();
s.clear();
s.add(c);
str.setTo(c);
str.foldCase();
sens2.add(str);
t=s;
s.closeOver(USET_CASE);
t.closeOver(0x100);
if(s!=t) {
errln("FAIL: closeOver(U+%04x) differs: ", c);
errln((UnicodeString)"old "+s.toPattern(buf, true)+" new: "+t.toPattern(buf2, true));
}
}
}
// remove all code points
// should contain all full case folding mapping strings
sens2.remove(0, 0x10ffff);
si.reset(sens2);
while(si.next()) {
if(si.isString()) {
pStr=&si.getString();
s.clear();
s.add(*pStr);
t=s2=s;
s.closeOver(USET_CASE);
t.closeOver(0x100);
if(s!=t) {
errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, true)+") differs: ");
errln((UnicodeString)"old "+s.toPattern(buf, true)+" new: "+t.toPattern(buf2, true));
}
}
}
#endif
// Test the pattern API
UErrorCode ec = U_ZERO_ERROR;
s.applyPattern("[abc]", USET_CASE_INSENSITIVE, nullptr, ec);
if (U_FAILURE(ec)) {
errln("FAIL: applyPattern failed");
@ -1423,6 +1383,123 @@ void UnicodeSetTest::TestCloseOver() {
}
}
namespace {
void addIfAbsent(const std::unordered_multimap<UChar32, UChar32> &closure, UChar32 c, UChar32 t,
std::unordered_multimap<UChar32, UChar32> &additions) {
for (auto it = closure.find(c);; ++it) {
if (it == closure.end() || it->first != c) {
// absent
additions.insert({c, t});
break;
} else if (it->second == t) {
// present
break;
}
}
}
} // namespace
void UnicodeSetTest::TestCloseOverSimpleCaseFolding() {
IcuTestErrorCode errorCode(*this, "TestCloseOverSimpleCaseFolding");
const UnicodeSet *sensitive =
UnicodeSet::fromUSet(u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode));
if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE) failed")) {
return;
}
// Compute the scf=Simple_Case_Folding closure:
// For each scf(c)=t, start with mappings c->t and t->c.
std::unordered_multimap<UChar32, UChar32> closure;
UnicodeSetIterator iter(*sensitive);
while (iter.next()) {
UChar32 c = iter.getCodepoint();
UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
if (scfChar != c) {
closure.insert({c, scfChar});
closure.insert({scfChar, c});
}
}
// Complete the closure: Add mappings of mappings.
for (;;) {
std::unordered_multimap<UChar32, UChar32> additions;
// for each mapping c->t
for (auto mapping : closure) {
UChar32 c = mapping.first;
UChar32 t = mapping.second;
// enumerate each t->u
for (auto it = closure.find(t); it != closure.end() && it->first == t; ++it) {
UChar32 u = it->second;
if (u != c) {
addIfAbsent(closure, c, u, additions);
addIfAbsent(closure, u, c, additions);
}
}
}
if (additions.empty()) {
break; // The closure is complete.
}
closure.insert(additions.begin(), additions.end());
}
// Compare closeOver(USET_SIMPLE_CASE_INSENSITIVE) with an unoptimized implementation.
// Here we focus on single code points as input.
// Other examples, including strings, are tested in TestCloseOver().
int32_t errors = 0;
iter.reset();
UnicodeSet set, expected;
while (iter.next()) {
UChar32 c = iter.getCodepoint();
// closeOver()
set.clear().add(c);
set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
// From-first-principles implementation.
expected.clear().add(c);
for (auto it = closure.find(c); it != closure.end() && it->first == c; ++it) {
expected.add(it->second);
}
// compare
if (!checkEqual(expected, set, "closeOver() vs. test impl")) {
errln(" c=U+%04X", c);
if (++errors == 10) {
break;
}
}
}
}
void UnicodeSetTest::TestCloseOverLargeSets() {
IcuTestErrorCode errorCode(*this, "TestCloseOverLargeSets");
// Check that an optimization for large sets does not change the result.
// Most code points except ones that are boring for case mappings.
UnicodeSet manyCp(u"[^[:C:][:Ideographic:][:Hang:]]", errorCode);
// Main Unihan block.
constexpr UChar32 LARGE_START = 0x4E00;
constexpr UChar32 LARGE_END = 0x9FFF;
static constexpr int32_t OPTIONS[] = {
USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE
};
UnicodeSet input, small, large;
for (int32_t option : OPTIONS) {
UnicodeSetIterator iter(manyCp);
while (iter.next()) {
UChar32 c = iter.getCodepoint();
input.clear().add(c);
small = input;
small.closeOver(option);
large = input;
large.add(LARGE_START, LARGE_END);
large.closeOver(option);
large.remove(LARGE_START, LARGE_END);
if (!checkEqual(small, large, "small != large")) {
errln(" option=%d c=U+%04X", option, c);
break;
}
}
}
}
void UnicodeSetTest::TestEscapePattern() {
const char pattern[] =
"[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";

View file

@ -74,6 +74,8 @@ private:
void TestExhaustive(void);
void TestCloseOver(void);
// Compares closeOver(USET_SIMPLE_CASE_INSENSITIVE) on single code points
// with a closure computed in the test from u_foldCase().
void TestCloseOverSimpleCaseFolding();
// Checks that closeOver() results do not change when a large block of
// extra code points is part of the input set.
void TestCloseOverLargeSets();
void TestEscapePattern(void);

View file

@ -260,34 +260,6 @@ public final class UCaseProps {
* - for k include the Kelvin sign
*/
public final void addCaseClosure(int c, UnicodeSet set) {
/*
* Hardcode the case closure of i and its relatives and ignore the
* data file data for these characters.
* The Turkic dotless i and dotted I with their case mapping conditions
* and case folding option make the related characters behave specially.
* This code matches their closure behavior to their case folding behavior.
*/
switch(c) {
case 0x49:
/* regular i and I are in one equivalence class */
set.add(0x69);
return;
case 0x69:
set.add(0x49);
return;
case 0x130:
/* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
set.add(iDot);
return;
case 0x131:
/* dotless i is in a class by itself */
return;
default:
/* otherwise use the data file data */
break;
}
int props=trie.get(c);
if(!propsHasException(props)) {
if(getTypeFromProps(props)!=NONE) {
@ -302,19 +274,41 @@ public final class UCaseProps {
* c has exceptions, so there may be multiple simple and/or
* full case mappings. Add them all.
*/
int excOffset0, excOffset=getExceptionsOffset(props);
int closureOffset;
int excOffset=getExceptionsOffset(props);
int excWord=exceptions.charAt(excOffset++);
int index, closureLength, fullLength, length;
int excOffset0=excOffset;
excOffset0=excOffset;
// Hardcode the case closure of i and its relatives and ignore the
// data file data for these characters.
// The Turkic dotless i and dotted I with their case mapping conditions
// and case folding option make the related characters behave specially.
// This code matches their closure behavior to their case folding behavior.
if ((excWord&EXC_CONDITIONAL_FOLD) != 0) {
// These characters have Turkic case foldings. Hardcode their closure.
if (c == 0x49) {
// Regular i and I are in one equivalence class.
set.add(0x69);
return;
} else if (c == 0x130) {
// Dotted I is in a class with <0069 0307>
// (for canonical equivalence with <0049 0307>).
set.add(iDot);
return;
}
} else if (c == 0x69) {
set.add(0x49);
return;
} else if (c == 0x131) {
// Dotless i is in a class by itself.
return;
}
/* add all simple case mappings */
for(index=EXC_LOWER; index<=EXC_TITLE; ++index) {
for(int index=EXC_LOWER; index<=EXC_TITLE; ++index) {
if(hasSlot(excWord, index)) {
excOffset=excOffset0;
c=getSlotValue(excWord, index, excOffset);
set.add(c);
int mapping=getSlotValue(excWord, index, excOffset);
set.add(mapping);
}
}
if(hasSlot(excWord, EXC_DELTA)) {
@ -324,6 +318,7 @@ public final class UCaseProps {
}
/* get the closure string pointer & length */
int closureOffset, closureLength;
if(hasSlot(excWord, EXC_CLOSURE)) {
excOffset=excOffset0;
long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
@ -338,7 +333,7 @@ public final class UCaseProps {
if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
excOffset=excOffset0;
long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
fullLength=(int)value;
int fullLength=(int)value;
/* start of full case mapping strings */
excOffset=(int)(value>>32)+1;
@ -350,7 +345,7 @@ public final class UCaseProps {
fullLength>>=4;
/* add the full case folding string */
length=fullLength&0xf;
int length=fullLength&0xf;
if(length!=0) {
set.add(exceptions.substring(excOffset, excOffset+length));
excOffset+=length;
@ -367,9 +362,137 @@ public final class UCaseProps {
/* add each code point in the closure string */
int limit=closureOffset+closureLength;
for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
c=exceptions.codePointAt(index);
set.add(c);
for(int index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
int mapping=exceptions.codePointAt(index);
set.add(mapping);
}
}
}
/**
 * Adds t to the set as a simple case closure mapping of c,
 * unless (c, t) is one of the hardcoded pairs for which
 * there is not actually an scf relationship between the two characters.
 * TODO: Unicode should probably add the corresponding scf mappings.
 * See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
 * If &amp; when those scf mappings are added, we should be able to remove all of these exceptions.
 *
 * @param c the code point whose closure is being built
 * @param t a candidate closure mapping of c
 * @param set receives t unless the pair is excluded
 */
private static void addOneSimpleCaseClosure(int c, int t, UnicodeSet set) {
    // Pairs that are excluded, in both directions.
    final boolean excluded =
            (c == 0x0390 && t == 0x1FD3) ||
            (c == 0x1FD3 && t == 0x0390) ||
            (c == 0x03B0 && t == 0x1FE3) ||
            (c == 0x1FE3 && t == 0x03B0) ||
            (c == 0xFB05 && t == 0xFB06) ||
            (c == 0xFB06 && t == 0xFB05);
    if (!excluded) {
        set.add(t);
    }
}
/**
 * Adds to the set the simple case closure mappings of c, that is,
 * the mappings relevant for scf=Simple_Case_Folding.
 * Full case mapping strings are skipped, not added.
 * The closure of i and its relatives (U+0049, U+0069, U+0130, U+0131) is hardcoded.
 *
 * @param c the code point whose closure mappings are added
 * @param set the set that receives the mappings; c itself is not added here
 */
public final void addSimpleCaseClosure(int c, UnicodeSet set) {
    int props=trie.get(c);
    if(!propsHasException(props)) {
        if(getTypeFromProps(props)!=NONE) {
            /* add the one simple case mapping, no matter what type it is */
            int delta=getDelta(props);
            if(delta!=0) {
                set.add(c+delta);
            }
        }
    } else {
        // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
        int excOffset=getExceptionsOffset(props);
        int excWord=exceptions.charAt(excOffset++);
        int excOffset0=excOffset;

        // Hardcode the case closure of i and its relatives and ignore the
        // data file data for these characters, like in ucase_addCaseClosure().
        if ((excWord&EXC_CONDITIONAL_FOLD) != 0) {
            // These characters have Turkic case foldings. Hardcode their closure.
            if (c == 0x49) {
                // Regular i and I are in one equivalence class.
                set.add(0x69);
                return;
            } else if (c == 0x130) {
                // For scf=Simple_Case_Folding, dotted I is in a class by itself.
                return;
            }
        } else if (c == 0x69) {
            set.add(0x49);
            return;
        } else if (c == 0x131) {
            // Dotless i is in a class by itself.
            return;
        }

        // Add all simple case mappings.
        for(int index=EXC_LOWER; index<=EXC_TITLE; ++index) {
            if(hasSlot(excWord, index)) {
                excOffset=excOffset0;
                int mapping=getSlotValue(excWord, index, excOffset);
                addOneSimpleCaseClosure(c, mapping, set);
            }
        }
        if(hasSlot(excWord, EXC_DELTA)) {
            excOffset=excOffset0;
            int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
            int mapping = (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
            addOneSimpleCaseClosure(c, mapping, set);
        }

        /* get the closure string pointer & length */
        int closureOffset, closureLength;
        if(hasSlot(excWord, EXC_CLOSURE)) {
            excOffset=excOffset0;
            long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
            closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */
            closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */
        } else {
            closureLength=0;
            closureOffset=0;
        }

        // Skip the full case mappings.
        if(closureLength > 0 && hasSlot(excWord, EXC_FULL_MAPPINGS)) {
            excOffset=excOffset0;
            long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
            int fullLength=(int)value;

            /* start of full case mapping strings */
            excOffset=(int)(value>>32)+1;

            fullLength&=0xffff; /* bits 16 and higher are reserved */

            // Skip all 4 full case mappings.
            excOffset+=fullLength&FULL_LOWER;
            fullLength>>=4;
            excOffset+=fullLength&0xf;
            fullLength>>=4;
            excOffset+=fullLength&0xf;
            fullLength>>=4;
            excOffset+=fullLength;

            closureOffset=excOffset; /* behind full case mappings */
        }

        // Add each code point in the closure string whose scf maps back to c.
        // Advance by the UTF-16 length of the code point that was just read,
        // not by the length of c: the two may differ (BMP vs. supplementary).
        int limit=closureOffset+closureLength;
        for(int index=closureOffset; index<limit;) {
            int mapping=exceptions.codePointAt(index);
            index+=UTF16.getCharCount(mapping);
            addOneSimpleCaseClosure(c, mapping, set);
        }
    }
}

View file

@ -459,7 +459,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param options a bitmask indicating which options to apply.
* Valid options are IGNORE_SPACE and CASE.
* Valid options are {@link #IGNORE_SPACE} and
* at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
* {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
* @exception java.lang.IllegalArgumentException if the pattern contains
* a syntax error.
* @stable ICU 3.8
@ -495,7 +497,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @param symbols a symbol table mapping variables to char[] arrays
* and chars to UnicodeSets
* @param options a bitmask indicating which options to apply.
* Valid options are IGNORE_SPACE and CASE.
* Valid options are {@link #IGNORE_SPACE} and
* at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
* {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
* @exception java.lang.IllegalArgumentException if the pattern
* contains a syntax error.
* @stable ICU 3.2
@ -587,7 +591,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* See the class description for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param options a bitmask indicating which options to apply.
* Valid options are IGNORE_SPACE and CASE.
* Valid options are {@link #IGNORE_SPACE} and
* at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
* {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
* @exception java.lang.IllegalArgumentException if the pattern
* contains a syntax error.
* @stable ICU 3.8
@ -2584,8 +2590,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* variables, or null if none.
* @param rebuiltPat the pattern that was parsed, rebuilt or
* copied from the input pattern, as appropriate.
* @param options a bit mask of zero or more of the following:
* IGNORE_SPACE, CASE.
* @param options a bit mask.
* Valid options are {@link #IGNORE_SPACE} and
* at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
* {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
*/
private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
Appendable rebuiltPat, int options, int depth) {
@ -2965,8 +2973,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* to close over case BEFORE COMPLEMENTING. This makes
* patterns like /[^abc]/i work.
*/
if ((options & CASE) != 0) {
closeOver(CASE);
if ((options & CASE_MASK) != 0) {
closeOver(options);
}
if (invert) {
complement().removeAllStrings(); // code point complement
@ -3861,58 +3869,81 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
public static final int IGNORE_SPACE = 1;
/**
* Bitmask for constructor, applyPattern(), and closeOver()
* indicating letter case. This may be ORed together with other
* selectors.
* Alias for {@link #CASE_INSENSITIVE}.
*
* Enable case insensitive matching. E.g., "[ab]" with this flag
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
* match all except 'a', 'A', 'b', and 'B'. This performs a full
* closure over case mappings, e.g. U+017F for s.
*
* The resulting set is a superset of the input for the code points but
* not for the strings.
* It performs a case mapping closure of the code points and adds
* full case folding strings for the code points, and reduces strings of
* the original set to their full case folding equivalents.
*
* This is designed for case-insensitive matches, for example
* in regular expressions. The full code point case closure allows checking of
* an input character directly against the closure set.
* Strings are matched by comparing the case-folded form from the closure
* set with an incremental case folding of the string in question.
*
* The closure set will also contain single code points if the original
* set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
* This is not necessary (that is, redundant) for the above matching method
* but results in the same closure sets regardless of whether the original
* set contained the code point or a string.
* @stable ICU 3.8
*/
public static final int CASE = 2;
/**
* Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
* also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
* @see #CASE
* Enable case insensitive matching. E.g., "[ab]" with this flag
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
* match all except 'a', 'A', 'b', and 'B'. This performs a full
* closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
*
* <p>This value is an options bit set value for some
* constructors, applyPattern(), and closeOver().
* It can be ORed together with other, unrelated options.
*
* <p>The resulting set is a superset of the input for the code points but
* not for the strings.
* It performs a case mapping closure of the code points and adds
* full case folding strings for the code points, and reduces strings of
* the original set to their full case folding equivalents.
*
* <p>This is designed for case-insensitive matches, for example
* in regular expressions. The full code point case closure allows checking of
* an input character directly against the closure set.
* Strings are matched by comparing the case-folded form from the closure
* set with an incremental case folding of the string in question.
*
* <p>The closure set will also contain single code points if the original
* set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
* This is not necessary (that is, redundant) for the above matching method
* but results in the same closure sets regardless of whether the original
* set contained the code point or a string.
*
* @stable ICU 3.4
*/
public static final int CASE_INSENSITIVE = 2;
/**
* Bitmask for constructor, applyPattern(), and closeOver()
* indicating letter case. This may be ORed together with other
* selectors.
*
* Enable case insensitive matching. E.g., "[ab]" with this flag
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
* match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
* title-, and uppercase mappings as well as the case folding
* Adds all case mappings for each element in the set.
* This adds the full lower-, title-, and uppercase mappings as well as the full case folding
* of each existing element in the set.
*
* <p>This value is an options bit set value for some
* constructors, applyPattern(), and closeOver().
* It can be ORed together with other, unrelated options.
*
* <p>Unlike the case insensitive options, this does not perform a closure.
* For example, it does not add 'ſ' (U+017F long s) for 's',
* 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
*
* @stable ICU 3.4
*/
public static final int ADD_CASE_MAPPINGS = 4;
/**
* Enable case insensitive matching.
* Same as {@link #CASE_INSENSITIVE} but using only Simple_Case_Folding (scf) mappings,
* which map each code point to one code point,
* not full Case_Folding (cf) mappings, which map some code points to multiple code points.
*
* <p>This is designed for case-insensitive matches, for example in certain
* regular expression implementations where only Simple_Case_Folding mappings are used,
* such as in ECMAScript (JavaScript) regular expressions.
*
* <p>This value is an options bit set value for some
* constructors, applyPattern(), and closeOver().
* It can be ORed together with other, unrelated options.
*
* @draft ICU 73
*/
public static final int SIMPLE_CASE_INSENSITIVE = 6;
// Mask of the option bits that select a case option.
// CASE_INSENSITIVE (2) | ADD_CASE_MAPPINGS (4) yields 6, the value of SIMPLE_CASE_INSENSITIVE,
// so (options & CASE_MASK) is one of 0, 2, 4 or 6 and identifies which case option, if any, is set.
private static final int CASE_MASK = CASE_INSENSITIVE | ADD_CASE_MAPPINGS;
// add the result of a full case mapping to the set
// use str as a temporary string to avoid constructing one
private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) {
@ -3930,99 +3961,193 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
// see UCaseProps
}
/**
 * For case closure on a large set, narrows the code points to look at
 * to those with the Case_Sensitive property.
 *
 * @param src the input set
 * @return src itself if src.size() is below 30; otherwise a new set holding
 *         the intersection of src with the Case_Sensitive set
 */
UnicodeSet maybeOnlyCaseSensitive(UnicodeSet src) {
    final boolean large = src.size() >= 30;
    if (!large) {
        return src;
    }
    UnicodeSet caseSensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE);
    // Clone the "smaller" of the two sets and intersect it with the other one.
    // Cloning src is avoided when it has strings, so that they need not be copied.
    final boolean cloneSrc =
            !src.hasStrings() && src.getRangeCount() <= caseSensitive.getRangeCount();
    if (cloneSrc) {
        return ((UnicodeSet) src.clone()).retainAll(caseSensitive);
    }
    return caseSensitive.cloneAsThawed().retainAll(src);
}
// Applies scf = Simple_Case_Folding to each code point of a string.
// (Normally when we case-fold a string we use full case foldings.)
// Returns false and leaves scf untouched if no code point of s changes;
// otherwise replaces the contents of scf with the folded string and returns true.
private static final boolean scfString(CharSequence s, StringBuilder scf) {
    final int length = s.length();
    // Find the first code point that folding would change.
    int pos = 0;
    int folded = 0;
    boolean changed = false;
    while (pos < length) {
        int cp = Character.codePointAt(s, pos);
        folded = UCharacter.foldCase(cp, UCharacter.FOLD_CASE_DEFAULT);
        if (folded != cp) {
            changed = true;
            break;
        }
        pos += Character.charCount(cp);
    }
    if (!changed) {
        return false;
    }
    // Copy the unchanged prefix, then fold the rest code point by code point.
    scf.setLength(0);
    scf.append(s, 0, pos);
    int cp = Character.codePointAt(s, pos);
    while (true) {
        scf.appendCodePoint(folded);
        pos += Character.charCount(cp);
        if (pos == length) {
            return true;
        }
        cp = Character.codePointAt(s, pos);
        folded = UCharacter.foldCase(cp, UCharacter.FOLD_CASE_DEFAULT);
    }
}
/**
* Close this set over the given attribute. For the attribute
* CASE, the result is to modify this set so that:
* {@link #CASE_INSENSITIVE}, the result is to modify this set so that:
*
* 1. For each character or string 'a' in this set, all strings
* <ol>
* <li>For each character or string 'a' in this set, all strings
* 'b' such that foldCase(a) == foldCase(b) are added to this set.
* (For most 'a' that are single characters, 'b' will have
* b.length() == 1.)
*
* 2. For each string 'e' in the resulting set, if e !=
* <li>For each string 'e' in the resulting set, if e !=
* foldCase(e), 'e' will be removed.
* </ol>
*
* Example: [aq\u00DF{Bc}{bC}{Fi}] =&gt; [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
* <p>Example: [aq\u00DF{Bc}{bC}{Fi}] =&gt; [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
*
* (Here foldCase(x) refers to the operation
* <p>(Here foldCase(x) refers to the operation
* UCharacter.foldCase(x, true), and a == b actually denotes
* a.equals(b), not pointer comparison.)
*
* @param attribute bitmask for attributes to close over.
* Currently only the CASE bit is supported. Any undefined bits
* are ignored.
* Valid options:
* At most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
* {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
* Unrelated options bits are ignored.
* @return a reference to this set.
* @stable ICU 3.8
*/
public UnicodeSet closeOver(int attribute) {
checkFrozen();
if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) {
UCaseProps csp = UCaseProps.INSTANCE;
UnicodeSet foldSet = new UnicodeSet(this);
ULocale root = ULocale.ROOT;
// start with input set to guarantee inclusion
// CASE: remove strings because the strings will actually be reduced (folded);
// therefore, start with no strings and add only those needed
if((attribute & CASE) != 0 && foldSet.hasStrings()) {
foldSet.strings.clear();
}
int n = getRangeCount();
int result;
StringBuilder full = new StringBuilder();
for (int i=0; i<n; ++i) {
int start = getRangeStart(i);
int end = getRangeEnd(i);
if((attribute & CASE) != 0) {
// full case closure
for (int cp=start; cp<=end; ++cp) {
csp.addCaseClosure(cp, foldSet);
}
} else {
// add case mappings
// (does not add long s for regular s, or Kelvin for k, for example)
for (int cp=start; cp<=end; ++cp) {
result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
addCaseMapping(foldSet, result, full);
result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
addCaseMapping(foldSet, result, full);
result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
addCaseMapping(foldSet, result, full);
result = csp.toFullFolding(cp, full, 0);
addCaseMapping(foldSet, result, full);
}
}
}
if (hasStrings()) {
if ((attribute & CASE) != 0) {
for (String s : strings) {
String str = UCharacter.foldCase(s, 0);
if(!csp.addStringCaseClosure(str, foldSet)) {
foldSet.add(str); // does not map to code points: add the folded string itself
}
}
} else {
BreakIterator bi = BreakIterator.getWordInstance(root);
for (String str : strings) {
// TODO: call lower-level functions
foldSet.add(UCharacter.toLowerCase(root, str));
foldSet.add(UCharacter.toTitleCase(root, str, bi));
foldSet.add(UCharacter.toUpperCase(root, str));
foldSet.add(UCharacter.foldCase(str, 0));
}
}
}
set(foldSet);
switch (attribute & CASE_MASK) {
case 0:
break;
case CASE_INSENSITIVE:
closeOverCaseInsensitive(/* simple= */ false);
break;
case ADD_CASE_MAPPINGS:
closeOverAddCaseMappings();
break;
case SIMPLE_CASE_INSENSITIVE:
closeOverCaseInsensitive(/* simple= */ true);
break;
default:
// bad option (unreachable)
break;
}
return this;
}
/**
 * Replaces this set with its case-insensitive closure.
 *
 * @param simple true to use only Simple_Case_Folding mappings,
 *               false to use full case mappings
 */
private void closeOverCaseInsensitive(boolean simple) {
    final UCaseProps caseProps = UCaseProps.INSTANCE;
    // The result starts out as a copy of the input, to guarantee inclusion.
    UnicodeSet result = new UnicodeSet(this);
    // Full case mappings closure:
    // The strings will actually be reduced (folded), so drop them all here
    // and add back only those needed.
    // This must happen before processing code points, because they may add strings.
    if (!simple && result.hasStrings()) {
        result.strings.clear();
    }

    // Code points: visit each range, and each code point inside it.
    UnicodeSet candidates = maybeOnlyCaseSensitive(this);
    final int rangeCount = candidates.getRangeCount();
    for (int r = 0; r < rangeCount; ++r) {
        final int first = candidates.getRangeStart(r);
        final int last = candidates.getRangeEnd(r);
        for (int cp = first; cp <= last; ++cp) {
            if (simple) {
                caseProps.addSimpleCaseClosure(cp, result);
            } else {
                caseProps.addCaseClosure(cp, result);
            }
        }
    }

    // Strings of the original set.
    if (hasStrings()) {
        if (simple) {
            // Replace each string that changes under per-character simple folding.
            StringBuilder folded = new StringBuilder();
            for (String s : strings) {
                if (scfString(s, folded)) {
                    result.remove(s).add(folded);
                }
            }
        } else {
            for (String s : strings) {
                String folded = UCharacter.foldCase(s, 0);
                if (!caseProps.addStringCaseClosure(folded, result)) {
                    // does not map to code points: add the folded string itself
                    result.add(folded);
                }
            }
        }
    }
    set(result);
}
/**
 * Adds to this set the full lower-, title- and uppercase mappings and the full
 * case folding of each code point and string in it.
 */
private void closeOverAddCaseMappings() {
    final UCaseProps caseProps = UCaseProps.INSTANCE;
    // The result starts out as a copy of the input, to guarantee inclusion.
    UnicodeSet expanded = new UnicodeSet(this);

    // Code points: visit each range, and each code point inside it.
    UnicodeSet candidates = maybeOnlyCaseSensitive(this);
    final int rangeCount = candidates.getRangeCount();
    final StringBuilder buffer = new StringBuilder();
    for (int r = 0; r < rangeCount; ++r) {
        final int first = candidates.getRangeStart(r);
        final int last = candidates.getRangeEnd(r);
        // add case mappings
        // (does not add long s for regular s, or Kelvin for k, for example)
        for (int cp = first; cp <= last; ++cp) {
            int mapped = caseProps.toFullLower(cp, null, buffer, UCaseProps.LOC_ROOT);
            addCaseMapping(expanded, mapped, buffer);

            mapped = caseProps.toFullTitle(cp, null, buffer, UCaseProps.LOC_ROOT);
            addCaseMapping(expanded, mapped, buffer);

            mapped = caseProps.toFullUpper(cp, null, buffer, UCaseProps.LOC_ROOT);
            addCaseMapping(expanded, mapped, buffer);

            mapped = caseProps.toFullFolding(cp, buffer, 0);
            addCaseMapping(expanded, mapped, buffer);
        }
    }

    // Strings: add each string's root-locale case mappings and its case folding.
    if (hasStrings()) {
        ULocale rootLocale = ULocale.ROOT;
        BreakIterator wordBreaks = BreakIterator.getWordInstance(rootLocale);
        for (String s : strings) {
            // TODO: call lower-level functions
            expanded.add(UCharacter.toLowerCase(rootLocale, s));
            expanded.add(UCharacter.toTitleCase(rootLocale, s, wordBreaks));
            expanded.add(UCharacter.toUpperCase(rootLocale, s));
            expanded.add(UCharacter.foldCase(s, 0));
        }
    }
    set(expanded);
}
/**
* Internal class for customizing UnicodeSet parsing of properties.
* TODO: extend to allow customizing of codepoint ranges

View file

@ -20,6 +20,7 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
@ -32,6 +33,7 @@ import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.impl.SortedSetRelation;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.CharacterProperties;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory;
import com.ibm.icu.lang.UProperty;
@ -1323,38 +1325,98 @@ public class UnicodeSetTest extends TestFmwk {
@Test
public void TestCloseOver() {
String CASE = String.valueOf(UnicodeSet.CASE);
String CASE_MAPPINGS = String.valueOf(UnicodeSet.ADD_CASE_MAPPINGS);
String SIMPLE_CASE_INSENSITIVE = String.valueOf(UnicodeSet.SIMPLE_CASE_INSENSITIVE);
String[] DATA = {
// selector, input, output
CASE,
"[aq\u00DF{Bc}{bC}{Fi}]",
"[aAqQ\u00DF\u1E9E\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
SIMPLE_CASE_INSENSITIVE,
"[aq\u00DF{Bc}{bC}{Fi}]",
"[aAqQ\u00DF\u1E9E{bc}{fi}]",
CASE,
"[\u01F1]", // 'DZ'
"[\u01F1\u01F2\u01F3]",
SIMPLE_CASE_INSENSITIVE,
"[\u01F1]", // 'DZ'
"[\u01F1\u01F2\u01F3]",
CASE,
"[\u1FB4]",
"[\u1FB4{\u03AC\u03B9}]",
SIMPLE_CASE_INSENSITIVE,
"[\u1FB4]",
"[\u1FB4]",
CASE,
"[{F\uFB01}]",
"[\uFB03{ffi}]",
CASE, // make sure binary search finds limits
"[a\uFF3A]",
"[aA\uFF3A\uFF5A]",
CASE,
"[a-z]","[A-Za-z\u017F\u212A]",
SIMPLE_CASE_INSENSITIVE,
"[a-z]","[A-Za-z\u017F\u212A]",
CASE,
"[abc]","[A-Ca-c]",
CASE,
"[ABC]","[A-Ca-c]",
CASE, "[i]", "[iI]",
CASE, "[\u0130]", "[\u0130{i\u0307}]", // dotted I
CASE, "[{i\u0307}]", "[\u0130{i\u0307}]", // i with dot
CASE, "[\u0131]", "[\u0131]", // dotless i
CASE, "[\u0390]", "[\u0390\u1FD3{\u03B9\u0308\u0301}]",
CASE, "[\u03c2]", "[\u03a3\u03c2\u03c3]", // sigmas
CASE, "[\u03f2]", "[\u03f2\u03f9]", // lunate sigmas
CASE, "[\u03f7]", "[\u03f7\u03f8]",
CASE, "[\u1fe3]", "[\u03b0\u1fe3{\u03c5\u0308\u0301}]",
CASE, "[\ufb05]", "[\ufb05\ufb06{st}]",
CASE, "[{st}]", "[\ufb05\ufb06{st}]",
CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
CASE, "[{a\u02BE}]", "[\u1E9A{a\u02BE}]", // first in sorted table
CASE, "[{\u1f7c\u03b9}]", "[\u1ff2{\u1f7c\u03b9}]", // last in sorted table
CASE_MAPPINGS,
"[aq\u00DF{Bc}{bC}{Fi}]",
"[aAqQ\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
CASE_MAPPINGS,
"[\u01F1]", // 'DZ'
"[\u01F1\u01F2\u01F3]",
CASE_MAPPINGS,
"[a-z]",
"[A-Za-z]",
};
UnicodeSet s = new UnicodeSet();
UnicodeSet t = new UnicodeSet();
for (int i=0; i<DATA.length; i+=3) {
int selector = Integer.parseInt(DATA[i]);
String pat = DATA[i+1];
String exp = DATA[i+2];
String pat = Utility.unescape(DATA[i+1]);
String exp = Utility.unescape(DATA[i+2]);
s.applyPattern(pat);
s.closeOver(selector);
t.applyPattern(exp);
@ -1371,6 +1433,149 @@ public class UnicodeSetTest extends TestFmwk {
expectContainment(s, "abcABC", "defDEF");
s = new UnicodeSet("[^abc]", UnicodeSet.CASE);
expectContainment(s, "defDEF", "abcABC");
s = new UnicodeSet("[abck]", UnicodeSet.ADD_CASE_MAPPINGS);
expectContainment(s, "abckABCK", "defDEF\u212A");
}
/**
 * Adds the mapping c->t to the multimap, creating the value collection for c if needed.
 */
private void add(Map<Integer, Collection<Integer>> closure, Integer c, Integer t) {
    Collection<Integer> targets = closure.get(c);
    if (targets != null) {
        targets.add(t);
        return;
    }
    // First mapping for c.
    targets = new TreeSet<>();
    closure.put(c, targets);
    targets.add(t);
}
/**
 * Adds the mapping c->t to additions unless closure already contains it.
 * additions may be the same map as closure.
 */
private void addIfAbsent(Map<Integer, Collection<Integer>> closure, Integer c, Integer t,
        Map<Integer, Collection<Integer>> additions) {
    Collection<Integer> known = closure.get(c);
    if (known != null && known.contains(t)) {
        return;  // already present
    }
    // Pick the collection to extend: the one in additions,
    // which is the already fetched one if both maps are the same object.
    Collection<Integer> pending = (additions != closure) ? additions.get(c) : known;
    if (pending == null) {
        pending = new TreeSet<>();
        additions.put(c, pending);
    }
    pending.add(t);
}
/**
 * Checks closeOver(SIMPLE_CASE_INSENSITIVE) for every code point of the
 * Case_Sensitive property set against a closure that this test derives on its own
 * from UCharacter.foldCase(). Stops after 10 mismatches.
 */
@Test
public void TestCloseOverSimpleCaseFolding() {
    UnicodeSet sensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE);

    // Seed the scf=Simple_Case_Folding relation:
    // every scf(cp)=folded contributes both cp->folded and folded->cp.
    // Poor man's multimap from code points to code points.
    Map<Integer, Collection<Integer>> closure = new HashMap<>();
    UnicodeSetIterator iter = new UnicodeSetIterator(sensitive);
    while (iter.next()) {
        int cp = iter.codepoint;
        int folded = UCharacter.foldCase(cp, UCharacter.FOLD_CASE_DEFAULT);
        if (folded == cp) {
            continue;
        }
        add(closure, cp, folded);
        add(closure, folded, cp);
    }

    // Make the relation transitive: keep adding mappings of mappings
    // until a full pass yields nothing new.
    Map<Integer, Collection<Integer>> additions = new HashMap<>();
    boolean complete = false;
    while (!complete) {
        // For each mapping from->via, look at each via->to.
        for (Map.Entry<Integer, Collection<Integer>> entry : closure.entrySet()) {
            Integer from = entry.getKey();
            for (Integer via : entry.getValue()) {
                Collection<Integer> onward = closure.get(via);
                if (onward == null) {
                    continue;
                }
                for (Integer to : onward) {
                    if (to.equals(from)) {
                        continue;
                    }
                    addIfAbsent(closure, from, to, additions);
                    addIfAbsent(closure, to, from, additions);
                }
            }
        }
        complete = additions.isEmpty();
        // Merge this pass's additions back into the closure (nothing to do once complete).
        for (Map.Entry<Integer, Collection<Integer>> entry : additions.entrySet()) {
            Integer from = entry.getKey();
            Collection<Integer> targets = closure.get(from);
            if (targets == null) {
                targets = new TreeSet<>();
                closure.put(from, targets);
            }
            targets.addAll(entry.getValue());
        }
        additions.clear();
    }

    // Compare closeOver(USET_SIMPLE_CASE_INSENSITIVE) with the unoptimized closure above.
    // Only single code points are used as input here;
    // other examples, including strings, are tested in TestCloseOver().
    int failures = 0;
    iter.reset();
    UnicodeSet actual = new UnicodeSet();
    UnicodeSet expected = new UnicodeSet();
    while (iter.next()) {
        int cp = iter.codepoint;
        // Library result.
        actual.clear().add(cp);
        actual.closeOver(UnicodeSet.SIMPLE_CASE_INSENSITIVE);
        // Result from the closure computed in this test.
        expected.clear().add(cp);
        Collection<Integer> equivalents = closure.get(cp);
        if (equivalents != null) {
            for (Integer other : equivalents) {
                expected.add(other);
            }
        }
        if (checkEqual(expected, actual, "closeOver() vs. test impl")) {
            continue;
        }
        errln(" c=U+" + Utility.hex(cp));
        if (++failures == 10) {
            break;
        }
    }
}
/**
 * Checks that an optimization for large sets does not change the closeOver() result:
 * for each option and each tested code point, closing over the single code point
 * must equal closing over the code point plus a large range, with that range
 * removed again afterwards.
 * For each option, iteration stops at the first mismatch.
 */
@Test
public void TestCloseOverLargeSets() {
    // Most code points except ones that are boring for case mappings.
    UnicodeSet manyCp = new UnicodeSet("[^[:C:][:Ideographic:][:Hang:]]");
    // Main Unihan block.
    final int largeStart = 0x4E00;
    final int largeEnd = 0x9FFF;
    int[] options = {
        UnicodeSet.CASE_INSENSITIVE,
        UnicodeSet.ADD_CASE_MAPPINGS,
        UnicodeSet.SIMPLE_CASE_INSENSITIVE
    };
    UnicodeSet input = new UnicodeSet();
    for (int option : options) {
        for (UnicodeSetIterator iter = new UnicodeSetIterator(manyCp); iter.next();) {
            int c = iter.codepoint;
            input.clear().add(c);

            // Close over just the one code point.
            UnicodeSet small = (UnicodeSet) input.clone();
            small.closeOver(option);

            // Close over the code point together with the large range,
            // then take the large range back out.
            UnicodeSet large = (UnicodeSet) input.clone();
            large.add(largeStart, largeEnd);
            large.closeOver(option);
            large.remove(largeStart, largeEnd);

            if (checkEqual(small, large, "small != large")) {
                continue;
            }
            errln(" option=" + option + " c=U+" + Utility.hex(c));
            break;  // Move on to the next option after the first mismatch.
        }
    }
}
@Test
@ -1709,8 +1914,8 @@ public class UnicodeSetTest extends TestFmwk {
test2.add("a" + (max - i)); // add in reverse order
}
assertNotEquals("compare iterable test", test1, test2);
TreeSet<CharSequence> sortedTest1 = new TreeSet<CharSequence>(test1);
TreeSet<CharSequence> sortedTest2 = new TreeSet<CharSequence>(test2);
TreeSet<CharSequence> sortedTest1 = new TreeSet<>(test1);
TreeSet<CharSequence> sortedTest2 = new TreeSet<>(test2);
assertEquals("compare iterable test", sortedTest1, sortedTest2);
}