ICU-6001 UCNV_ROUNDTRIP_AND_FALLBACK_SET for ucnv_getUnicodeSet(); improve set filtering; fix some bugs

X-SVN-Rev: 22847
This commit is contained in:
Markus Scherer 2007-10-24 21:15:41 +00:00
parent 56171edcdc
commit 31b85e745d
12 changed files with 353 additions and 38 deletions

View file

@ -3399,11 +3399,19 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
/* include ASCII for JP */
sa->addRange(sa->set, 0, 0x7f);
}
if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
/*
* TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
* we need to include half-width Katakana for all JP variants because
* JIS X 0208 has hardcoded fallbacks for them.
* Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
* because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
* use half-width Katakana.
* This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
* half-width Katakana via the ESC ( I sequence.
* However, we only emit (fromUnicode) half-width Katakana according to the
* definition of each variant.
*
* When including fallbacks,
* we need to include half-width Katakana Unicode code points for all JP variants because
* JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
*/
/* include half-width Katakana for JP */
sa->addRange(sa->set, HWKANA_START, HWKANA_END);
@ -3457,6 +3465,12 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
* corresponding to JIS X 0208.
*/
filter=UCNV_SET_FILTER_SJIS;
} else if(i==KSC5601) {
/*
* Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
* are broader than GR94.
*/
filter=UCNV_SET_FILTER_GR94DBCS;
} else {
filter=UCNV_SET_FILTER_NONE;
}
@ -3472,6 +3486,9 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
sa->remove(sa->set, 0x0e);
sa->remove(sa->set, 0x0f);
sa->remove(sa->set, 0x1b);
/* ISO 2022 converters do not convert C1 controls either */
sa->removeRange(sa->set, 0x80, 0x9f);
}
static const UConverterImpl _ISO2022Impl={

View file

@ -946,7 +946,7 @@ static void
ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
const int32_t *cx,
const USetAdder *sa,
UConverterUnicodeSet which,
UBool useFallback,
int32_t minLength,
UChar32 c,
UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
@ -966,7 +966,7 @@ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
value=*fromUSectionValues++;
if( value!=0 &&
UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&
(UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) &&
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
) {
if(c>=0) {
@ -987,12 +987,14 @@ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
/* no mapping, do nothing */
} else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
ucnv_extGetUnicodeSetString(
sharedData, cx, sa, which, minLength,
sharedData, cx, sa, useFallback, minLength,
U_SENTINEL, s, length+1,
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
pErrorCode);
} else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
} else if((useFallback ?
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
) {
sa->addString(sa->set, s, length+1);
@ -1004,6 +1006,7 @@ U_CFUNC void
ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
const USetAdder *sa,
UConverterUnicodeSet which,
UConverterSetFilter filter,
UErrorCode *pErrorCode) {
const int32_t *cx;
const uint16_t *stage12, *stage3, *ps2, *ps3;
@ -1011,6 +1014,7 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
uint32_t value;
int32_t st1, stage1Length, st2, st3, minLength;
UBool useFallback;
UChar s[UCNV_EXT_MAX_UCHARS];
UChar32 c;
@ -1027,12 +1031,20 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
/* enumerate the from-Unicode trie table */
c=0; /* keep track of the current code point while enumerating */
if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) {
if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
filter==UCNV_SET_FILTER_DBCS_ONLY ||
filter==UCNV_SET_FILTER_SJIS ||
filter==UCNV_SET_FILTER_GR94DBCS
) {
/* DBCS-only, ignore single-byte results */
minLength=2;
} else if(filter==UCNV_SET_FILTER_2022_CN) {
minLength=3;
} else {
minLength=1;
}
@ -1064,14 +1076,41 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
length=0;
U16_APPEND_UNSAFE(s, length, c);
ucnv_extGetUnicodeSetString(
sharedData, cx, sa, which, minLength,
sharedData, cx, sa, useFallback, minLength,
c, s, length,
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
pErrorCode);
} else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
} else if((useFallback ?
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
) {
switch(filter) {
case UCNV_SET_FILTER_2022_CN:
if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
continue;
}
break;
case UCNV_SET_FILTER_SJIS:
if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
continue;
}
break;
case UCNV_SET_FILTER_GR94DBCS:
if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
(uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe-0xa1a1) &&
(uint8_t)(value-0xa1)<=(0xfe-0xa1))) {
continue;
}
break;
default:
/*
* UCNV_SET_FILTER_NONE,
* or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
*/
break;
}
sa->add(sa->set, c);
}
} while((++c&0xf)!=0);

View file

@ -382,10 +382,20 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
UErrorCode *pErrorCode);
/*
* Add code points and strings to the set according to the extension mappings.
* Limitation on the UConverterSetFilter:
* The filters currently assume that they are used with 1:1 mappings.
* They only apply to single input code points, and then they pass through
* only mappings with single-charset-code results.
* For example, the Shift-JIS filter only works for 2-byte results and tests
* that those 2 bytes are in the JIS X 0208 range of Shift-JIS.
*/
U_CFUNC void
ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
const USetAdder *sa,
UConverterUnicodeSet which,
UConverterSetFilter filter,
UErrorCode *pErrorCode);
/* toUnicode helpers -------------------------------------------------------- */

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2003-2005, International Business Machines
* Copyright (C) 2003-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -52,7 +52,8 @@ ucnv_getUnicodeSet(const UConverter *cnv,
uset_add,
uset_addRange,
uset_addString,
uset_remove
uset_remove,
uset_removeRange
};
sa.set=setFillIn;

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2000-2006, International Business Machines
* Copyright (C) 2000-2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnvhz.c
@ -528,6 +528,7 @@ _HZ_GetUnicodeSet(const UConverter *cnv,
sa->add(sa->set, 0x7e);
/* add all of the code points that the sub-converter handles */
/* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */
((UConverterDataHZ*)cnv->extraInfo)->
gbConverter->sharedData->impl->
getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,

View file

@ -485,9 +485,23 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
if(mbcsTable->outputType==MBCS_OUTPUT_1) {
const uint16_t *stage2, *stage3, *results;
uint16_t minValue;
results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
/*
* Set a threshold variable for selecting which mappings to use.
* See ucnv_MBCSSingleFromBMPWithOffsets() and
* MBCS_SINGLE_RESULT_FROM_U() for details.
*/
if(which==UCNV_ROUNDTRIP_SET) {
/* use only roundtrips */
minValue=0xf00;
} else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
/* use all roundtrip and fallback results */
minValue=0x800;
}
for(st1=0; st1<maxStage1; ++st1) {
st2=table[st1];
if(st2>maxStage1) {
@ -497,15 +511,8 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
/* read the stage 3 block */
stage3=results+st3;
/*
* Add code points for which the roundtrip flag is set.
* Once we get a set for fallback mappings, we have to use
* a threshold variable with a value of 0x800.
* See ucnv_MBCSSingleFromBMPWithOffsets() and
* MBCS_SINGLE_RESULT_FROM_U() for details.
*/
do {
if(*stage3++>=0xf00) {
if(*stage3++>=minValue) {
sa->add(sa->set, c);
}
} while((++c&0xf)!=0);
@ -522,9 +529,12 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
const uint8_t *stage3, *bytes;
uint32_t st3Multiplier;
uint32_t value;
UBool useFallback;
bytes=mbcsTable->fromUnicodeBytes;
useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
switch(mbcsTable->outputType) {
case MBCS_OUTPUT_3:
case MBCS_OUTPUT_4_EUC:
@ -551,9 +561,8 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
st3>>=16;
/*
* Add code points for which the roundtrip flag is set.
* Once we get a set for fallback mappings, we have to check
* non-roundtrip stage 3 results for whether they are 0.
* Add code points for which the roundtrip flag is set,
* or which map to non-zero bytes if we use fallbacks.
* See ucnv_MBCSFromUnicodeWithOffsets() for details.
*/
switch(filter) {
@ -561,6 +570,23 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
do {
if(st3&1) {
sa->add(sa->set, c);
stage3+=st3Multiplier;
} else if(useFallback) {
uint8_t b=0;
switch(st3Multiplier) {
case 4:
b|=*stage3++;
case 3:
b|=*stage3++;
case 2:
b|=stage3[0]|stage3[1];
stage3+=2;
default:
break;
}
if(b!=0) {
sa->add(sa->set, c);
}
}
st3>>=1;
} while((++c&0xf)!=0);
@ -568,7 +594,7 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
case UCNV_SET_FILTER_DBCS_ONLY:
/* Ignore single-byte results (<0x100). */
do {
if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {
if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
sa->add(sa->set, c);
}
st3>>=1;
@ -578,7 +604,7 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
case UCNV_SET_FILTER_2022_CN:
/* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
do {
if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {
if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
sa->add(sa->set, c);
}
st3>>=1;
@ -588,7 +614,20 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
case UCNV_SET_FILTER_SJIS:
/* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
do {
if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
sa->add(sa->set, c);
}
st3>>=1;
stage3+=2; /* +=st3Multiplier */
} while((++c&0xf)!=0);
break;
case UCNV_SET_FILTER_GR94DBCS:
/* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
do {
if( ((st3&1)!=0 || useFallback) &&
(uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfefe-0xa1a1) &&
(uint8_t)(value-0xa1)<=(0xfe-0xa1)
) {
sa->add(sa->set, c);
}
st3>>=1;
@ -609,7 +648,7 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
}
}
ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
}
U_CFUNC void

View file

@ -492,6 +492,7 @@ typedef enum UConverterSetFilter {
UCNV_SET_FILTER_DBCS_ONLY,
UCNV_SET_FILTER_2022_CN,
UCNV_SET_FILTER_SJIS,
UCNV_SET_FILTER_GR94DBCS,
UCNV_SET_FILTER_COUNT
} UConverterSetFilter;

View file

@ -870,6 +870,8 @@ ucnv_getStarters(const UConverter* converter,
typedef enum UConverterUnicodeSet {
/** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */
UCNV_ROUNDTRIP_SET,
/**
 * Select the set of Unicode code points with roundtrip or fallback mappings
 * (reverse fallbacks are not included). @draft ICU 4.0
 */
UCNV_ROUNDTRIP_AND_FALLBACK_SET,
/**
 * Number of UConverterUnicodeSet selectors; usable as the exclusive upper
 * bound when iterating over all selectors. @stable ICU 2.6
 */
UCNV_SET_COUNT
} UConverterUnicodeSet;
@ -878,11 +880,16 @@ typedef enum UConverterUnicodeSet {
/**
* Returns the set of Unicode code points that can be converted by an ICU converter.
*
* The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET):
* Returns one of several kinds of set:
*
* 1. UCNV_ROUNDTRIP_SET
*
* The set of all Unicode code points that can be roundtrip-converted
* (converted without any data loss) with the converter.
* (converted without any data loss) with the converter (ucnv_fromUnicode()).
* This set will not include code points that have fallback mappings
* or are only the result of reverse fallback mappings.
* This set will also not include PUA code points with fallbacks, although
* ucnv_fromUnicode() will always use those mappings despite ucnv_setFallback().
* See UTR #22 "Character Mapping Markup Language"
* at http://www.unicode.org/reports/tr22/
*
@ -893,6 +900,12 @@ typedef enum UConverterUnicodeSet {
* by comparing its roundtrip set with the set of ExemplarCharacters from
* ICU's locale data or other sources
*
* 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET
*
* The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode())
* when fallbacks are turned on (see ucnv_setFallback()).
* This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks).
*
* In the future, there may be more UConverterUnicodeSet choices to select
* sets with different properties.
*

View file

@ -36,6 +36,9 @@ USetAddString(USet *set, const UChar *str, int32_t length);
typedef void U_CALLCONV
USetRemove(USet *set, UChar32 c);
typedef void U_CALLCONV
USetRemoveRange(USet *set, UChar32 start, UChar32 end);
/**
* Interface for adding items to a USet, to keep low-level code from
* statically depending on the USet implementation.
@ -47,6 +50,7 @@ struct USetAdder {
USetAddRange *addRange;
USetAddString *addString;
USetRemove *remove;
USetRemoveRange *removeRange;
};
typedef struct USetAdder USetAdder;

View file

@ -70,6 +70,7 @@ ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
default: name=""; break; //needed to end loop
}
}
@ -465,6 +466,181 @@ ConversionTest::TestGetUnicodeSet() {
}
}
// From-Unicode callback (installed with ucnv_setFromUCallBack()) used to find
// out which code points a converter cannot convert.
// context must point to a UnicodeSet; every code point that the converter
// reports with a reason up to and including UCNV_IRREGULAR is removed from
// that set, and the error is cleared so that the conversion skips the input
// and continues.
static void U_EXPORT2
getUnicodeSetCallback(const void *context,
                      UConverterFromUnicodeArgs *fromUArgs,
                      const UChar* codeUnits,
                      int32_t length,
                      UChar32 codePoint,
                      UConverterCallbackReason reason,
                      UErrorCode *pErrorCode) {
    if(reason>UCNV_IRREGULAR) {
        // Reset, close and clone notifications: nothing to record.
        return;
    }

    // The converter cannot convert this code point: drop it from the set.
    UnicodeSet *convertible=(UnicodeSet *)context;
    convertible->remove(codePoint);

    // Clear the error so that the converter skips this input.
    *pErrorCode=U_ZERO_ERROR;
}
// Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
//
// For every converter in cnvNames and for every UConverterUnicodeSet selector:
// 1. Convert a string that contains all code points (BMP only in quick mode)
//    with a callback that removes each unconvertible code point from the
//    "expected" set.
// 2. Ask ucnv_getUnicodeSet() for the same kind of set.
// 3. Report the differences between the two sets (ignoring strings, and
//    ignoring PUA code points for the roundtrip set).
void
ConversionTest::TestGetUnicodeSet2() {
    // Build a string with all code points.
    UChar32 cpLimit;
    int32_t s0Length;
    if(quick) {
        cpLimit=s0Length=0x10000; // BMP only
    } else {
        cpLimit=0x110000;
        s0Length=0x10000+0x200000; // BMP + surrogate pairs
    }
    UChar *s0=new UChar[s0Length];
    if(s0==NULL) {
        return;
    }
    UChar *s=s0;
    UChar32 c;
    UChar c2;
    // low BMP
    for(c=0; c<=0xd7ff; ++c) {
        *s++=(UChar)c;
    }
    // trail surrogates
    for(c=0xdc00; c<=0xdfff; ++c) {
        *s++=(UChar)c;
    }
    // lead surrogates
    // (after trails so that there is not even one surrogate pair in between)
    for(c=0xd800; c<=0xdbff; ++c) {
        *s++=(UChar)c;
    }
    // high BMP
    for(c=0xe000; c<=0xffff; ++c) {
        *s++=(UChar)c;
    }
    // supplementary code points = surrogate pairs
    if(cpLimit==0x110000) {
        for(c=0xd800; c<=0xdbff; ++c) {
            for(c2=0xdc00; c2<=0xdfff; ++c2) {
                *s++=(UChar)c;
                *s++=c2;
            }
        }
    }

    static const char *const cnvNames[]={
        "UTF-8",
        "UTF-7",
        "UTF-16",
        "US-ASCII",
        "ISO-8859-1",
        "windows-1252",
        "Shift-JIS",
        "ibm-1390", // EBCDIC_STATEFUL table
        "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table
        // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...]
        "ISO-2022-JP",
        "JIS7",
        "ISO-2022-CN",
        "ISO-2022-CN-EXT",
        // "LMBCS" TODO(markus): known bug, the fallback set is said to be missing [\uF600-\uF6FF]
    };

    char buffer[1024];
    int32_t i;
    for(i=0; i<LENGTHOF(cnvNames); ++i) {
        UErrorCode errorCode=U_ZERO_ERROR;
        UConverter *cnv=cnv_open(cnvNames[i], errorCode);
        if(U_FAILURE(errorCode)) {
            errln("failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
            continue;
        }

        // The callback removes every unconvertible code point from "expected".
        UnicodeSet expected;
        ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
        if(U_FAILURE(errorCode)) {
            errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
            ucnv_close(cnv);
            continue;
        }

        UConverterUnicodeSet which;
        for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
            if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
                ucnv_setFallback(cnv, TRUE);
            }

            // Start from all code points; the conversion below whittles the set down.
            expected.add(0, cpLimit-1);
            s=s0;
            UBool flush;
            do {
                // The output is discarded: restart at the beginning of the buffer each time.
                char *t=buffer;
                flush=(UBool)(s==s0+s0Length);
                ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
                if(U_FAILURE(errorCode)) {
                    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                        errorCode=U_ZERO_ERROR;
                        continue;
                    } else {
                        break; // unexpected error, should not occur
                    }
                }
            } while(!flush);
            if(U_FAILURE(errorCode)) {
                // Do not compare sets that are based on an incomplete conversion.
                errln("ucnv_fromUnicode() failed for converter %s - which set: %d - %s",
                      cnvNames[i], (int)which, u_errorName(errorCode));
                break;
            }

            UnicodeSet set;
            ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode);
            if(U_FAILURE(errorCode)) {
                errln("ucnv_getUnicodeSet(\"%s\") failed - which set: %d - %s",
                      cnvNames[i], (int)which, u_errorName(errorCode));
                break;
            }
            if(cpLimit<0x110000) {
                set.remove(cpLimit, 0x10ffff);
            }
            if(which==UCNV_ROUNDTRIP_SET) {
                // ignore PUA code points because they will be converted even if they
                // are fallbacks and when other fallbacks are turned off,
                // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
                expected.remove(0xe000, 0xf8ff);
                expected.remove(0xf0000, 0xffffd);
                expected.remove(0x100000, 0x10fffd);
                set.remove(0xe000, 0xf8ff);
                set.remove(0xf0000, 0xffffd);
                set.remove(0x100000, 0x10fffd);
            }
            if(set!=expected) {
                // First try to see if we have different sets because ucnv_getUnicodeSet()
                // added strings: The above conversion method does not tell us what strings might be convertible.
                // Remove strings from the set and compare again.
                // Unfortunately, there are no good, direct set methods for finding out whether there are strings
                // in the set, nor for enumerating or removing just them.
                // Intersect all code points with the set. The intersection will not contain strings.
                UnicodeSet temp(0, 0x10ffff);
                temp.retainAll(set);
                set=temp;
            }
            if(set!=expected) {
                UnicodeSet diffSet;
                UnicodeString out;

                // are there items that must be in the set but are not?
                (diffSet=expected).removeAll(set);
                if(!diffSet.isEmpty()) {
                    diffSet.toPattern(out, TRUE);
                    if(out.length()>100) {
                        out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
                    }
                    errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
                          cnvNames[i], which);
                    errln(out);
                }

                // are there items that must not be in the set but are?
                (diffSet=set).removeAll(expected);
                if(!diffSet.isEmpty()) {
                    diffSet.toPattern(out, TRUE);
                    if(out.length()>100) {
                        out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
                    }
                    errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
                          cnvNames[i], which);
                    errln(out);
                }
            }
        }

        // Release the converter; it is opened anew for each name.
        ucnv_close(cnv);
    }

    delete [] s0;
}
// open testdata or ICU data converter ------------------------------------- ***
UConverter *

View file

@ -72,6 +72,7 @@ public:
void TestToUnicode();
void TestFromUnicode();
void TestGetUnicodeSet();
void TestGetUnicodeSet2();
private:
UBool

View file

@ -1311,16 +1311,29 @@ conversion:table(nofallback) {
// versions of ISO-2022-JP
{
"ISO-2022-JP",
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",
"[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]",
"[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]",
:int{0}
}
}
{
"ISO-2022-JP-2",
"[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
"[\x0e\x0f\x1b\uffe7-\U0010ffff]",
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]",
"[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]",
:int{0}
}
{
"JIS7",
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]",
"[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]",
:int{0}
}
// with fallbacks
{
"ISO-2022-JP",
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]",
"[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]",
:int{1}
}
// versions of ISO-2022-CN
{