mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-6001 UCNV_ROUNDTRIP_AND_FALLBACK_SET for ucnv_getUnicodeSet(); improve set filtering; fix some bugs
X-SVN-Rev: 22847
This commit is contained in:
parent
56171edcdc
commit
31b85e745d
12 changed files with 353 additions and 38 deletions
|
@ -3399,11 +3399,19 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
|||
/* include ASCII for JP */
|
||||
sa->addRange(sa->set, 0, 0x7f);
|
||||
}
|
||||
if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
|
||||
if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
|
||||
/*
|
||||
* TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
|
||||
* we need to include half-width Katakana for all JP variants because
|
||||
* JIS X 0208 has hardcoded fallbacks for them.
|
||||
* Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
|
||||
* because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
|
||||
* use half-width Katakana.
|
||||
* This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
|
||||
* half-width Katakana via the ESC ( I sequence.
|
||||
* However, we only emit (fromUnicode) half-width Katakana according to the
|
||||
* definition of each variant.
|
||||
*
|
||||
* When including fallbacks,
|
||||
* we need to include half-width Katakana Unicode code points for all JP variants because
|
||||
* JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
|
||||
*/
|
||||
/* include half-width Katakana for JP */
|
||||
sa->addRange(sa->set, HWKANA_START, HWKANA_END);
|
||||
|
@ -3457,6 +3465,12 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
|||
* corresponding to JIS X 0208.
|
||||
*/
|
||||
filter=UCNV_SET_FILTER_SJIS;
|
||||
} else if(i==KSC5601) {
|
||||
/*
|
||||
* Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
|
||||
* are broader than GR94.
|
||||
*/
|
||||
filter=UCNV_SET_FILTER_GR94DBCS;
|
||||
} else {
|
||||
filter=UCNV_SET_FILTER_NONE;
|
||||
}
|
||||
|
@ -3472,6 +3486,9 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
|||
sa->remove(sa->set, 0x0e);
|
||||
sa->remove(sa->set, 0x0f);
|
||||
sa->remove(sa->set, 0x1b);
|
||||
|
||||
/* ISO 2022 converters do not convert C1 controls either */
|
||||
sa->removeRange(sa->set, 0x80, 0x9f);
|
||||
}
|
||||
|
||||
static const UConverterImpl _ISO2022Impl={
|
||||
|
|
|
@ -946,7 +946,7 @@ static void
|
|||
ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
|
||||
const int32_t *cx,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UBool useFallback,
|
||||
int32_t minLength,
|
||||
UChar32 c,
|
||||
UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
|
||||
|
@ -966,7 +966,7 @@ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
|
|||
value=*fromUSectionValues++;
|
||||
|
||||
if( value!=0 &&
|
||||
UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&
|
||||
(UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) &&
|
||||
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
|
||||
) {
|
||||
if(c>=0) {
|
||||
|
@ -987,12 +987,14 @@ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
|
|||
/* no mapping, do nothing */
|
||||
} else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
|
||||
ucnv_extGetUnicodeSetString(
|
||||
sharedData, cx, sa, which, minLength,
|
||||
sharedData, cx, sa, useFallback, minLength,
|
||||
U_SENTINEL, s, length+1,
|
||||
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
|
||||
pErrorCode);
|
||||
} else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
|
||||
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
|
||||
} else if((useFallback ?
|
||||
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
|
||||
((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
|
||||
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
|
||||
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
|
||||
) {
|
||||
sa->addString(sa->set, s, length+1);
|
||||
|
@ -1004,6 +1006,7 @@ U_CFUNC void
|
|||
ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UConverterSetFilter filter,
|
||||
UErrorCode *pErrorCode) {
|
||||
const int32_t *cx;
|
||||
const uint16_t *stage12, *stage3, *ps2, *ps3;
|
||||
|
@ -1011,6 +1014,7 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
|||
|
||||
uint32_t value;
|
||||
int32_t st1, stage1Length, st2, st3, minLength;
|
||||
UBool useFallback;
|
||||
|
||||
UChar s[UCNV_EXT_MAX_UCHARS];
|
||||
UChar32 c;
|
||||
|
@ -1027,12 +1031,20 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
|||
|
||||
stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
|
||||
|
||||
useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
|
||||
|
||||
/* enumerate the from-Unicode trie table */
|
||||
c=0; /* keep track of the current code point while enumerating */
|
||||
|
||||
if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) {
|
||||
if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
|
||||
filter==UCNV_SET_FILTER_DBCS_ONLY ||
|
||||
filter==UCNV_SET_FILTER_SJIS ||
|
||||
filter==UCNV_SET_FILTER_GR94DBCS
|
||||
) {
|
||||
/* DBCS-only, ignore single-byte results */
|
||||
minLength=2;
|
||||
} else if(filter==UCNV_SET_FILTER_2022_CN) {
|
||||
minLength=3;
|
||||
} else {
|
||||
minLength=1;
|
||||
}
|
||||
|
@ -1064,14 +1076,41 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
|||
length=0;
|
||||
U16_APPEND_UNSAFE(s, length, c);
|
||||
ucnv_extGetUnicodeSetString(
|
||||
sharedData, cx, sa, which, minLength,
|
||||
sharedData, cx, sa, useFallback, minLength,
|
||||
c, s, length,
|
||||
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
|
||||
pErrorCode);
|
||||
} else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
|
||||
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
|
||||
} else if((useFallback ?
|
||||
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
|
||||
((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
|
||||
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
|
||||
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
|
||||
) {
|
||||
switch(filter) {
|
||||
case UCNV_SET_FILTER_2022_CN:
|
||||
if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case UCNV_SET_FILTER_SJIS:
|
||||
if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case UCNV_SET_FILTER_GR94DBCS:
|
||||
if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
|
||||
(uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe-0xa1a1) &&
|
||||
(uint8_t)(value-0xa1)<=(0xfe-0xa1))) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
/*
|
||||
* UCNV_SET_FILTER_NONE,
|
||||
* or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
|
||||
*/
|
||||
break;
|
||||
}
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
} while((++c&0xf)!=0);
|
||||
|
|
|
@ -382,10 +382,20 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
|
|||
UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/*
|
||||
* Add code points and strings to the set according to the extension mappings.
|
||||
* Limitation on the UConverterSetFilter:
|
||||
* The filters currently assume that they are used with 1:1 mappings.
|
||||
* They only apply to single input code points, and then they pass through
|
||||
* only mappings with single-charset-code results.
|
||||
* For example, the Shift-JIS filter only works for 2-byte results and tests
|
||||
* that those 2 bytes are in the JIS X 0208 range of Shift-JIS.
|
||||
*/
|
||||
U_CFUNC void
|
||||
ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UConverterSetFilter filter,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/* toUnicode helpers -------------------------------------------------------- */
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2005, International Business Machines
|
||||
* Copyright (C) 2003-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -52,7 +52,8 @@ ucnv_getUnicodeSet(const UConverter *cnv,
|
|||
uset_add,
|
||||
uset_addRange,
|
||||
uset_addString,
|
||||
uset_remove
|
||||
uset_remove,
|
||||
uset_removeRange
|
||||
};
|
||||
sa.set=setFillIn;
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2000-2006, International Business Machines
|
||||
* Copyright (C) 2000-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: ucnvhz.c
|
||||
|
@ -528,6 +528,7 @@ _HZ_GetUnicodeSet(const UConverter *cnv,
|
|||
sa->add(sa->set, 0x7e);
|
||||
|
||||
/* add all of the code points that the sub-converter handles */
|
||||
/* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */
|
||||
((UConverterDataHZ*)cnv->extraInfo)->
|
||||
gbConverter->sharedData->impl->
|
||||
getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
|
||||
|
|
|
@ -485,9 +485,23 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
|||
|
||||
if(mbcsTable->outputType==MBCS_OUTPUT_1) {
|
||||
const uint16_t *stage2, *stage3, *results;
|
||||
uint16_t minValue;
|
||||
|
||||
results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
|
||||
|
||||
/*
|
||||
* Set a threshold variable for selecting which mappings to use.
|
||||
* See ucnv_MBCSSingleFromBMPWithOffsets() and
|
||||
* MBCS_SINGLE_RESULT_FROM_U() for details.
|
||||
*/
|
||||
if(which==UCNV_ROUNDTRIP_SET) {
|
||||
/* use only roundtrips */
|
||||
minValue=0xf00;
|
||||
} else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
|
||||
/* use all roundtrip and fallback results */
|
||||
minValue=0x800;
|
||||
}
|
||||
|
||||
for(st1=0; st1<maxStage1; ++st1) {
|
||||
st2=table[st1];
|
||||
if(st2>maxStage1) {
|
||||
|
@ -497,15 +511,8 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
|||
/* read the stage 3 block */
|
||||
stage3=results+st3;
|
||||
|
||||
/*
|
||||
* Add code points for which the roundtrip flag is set.
|
||||
* Once we get a set for fallback mappings, we have to use
|
||||
* a threshold variable with a value of 0x800.
|
||||
* See ucnv_MBCSSingleFromBMPWithOffsets() and
|
||||
* MBCS_SINGLE_RESULT_FROM_U() for details.
|
||||
*/
|
||||
do {
|
||||
if(*stage3++>=0xf00) {
|
||||
if(*stage3++>=minValue) {
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
} while((++c&0xf)!=0);
|
||||
|
@ -522,9 +529,12 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
|||
const uint8_t *stage3, *bytes;
|
||||
uint32_t st3Multiplier;
|
||||
uint32_t value;
|
||||
UBool useFallback;
|
||||
|
||||
bytes=mbcsTable->fromUnicodeBytes;
|
||||
|
||||
useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
|
||||
|
||||
switch(mbcsTable->outputType) {
|
||||
case MBCS_OUTPUT_3:
|
||||
case MBCS_OUTPUT_4_EUC:
|
||||
|
@ -551,9 +561,8 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
|||
st3>>=16;
|
||||
|
||||
/*
|
||||
* Add code points for which the roundtrip flag is set.
|
||||
* Once we get a set for fallback mappings, we have to check
|
||||
* non-roundtrip stage 3 results for whether they are 0.
|
||||
* Add code points for which the roundtrip flag is set,
|
||||
* or which map to non-zero bytes if we use fallbacks.
|
||||
* See ucnv_MBCSFromUnicodeWithOffsets() for details.
|
||||
*/
|
||||
switch(filter) {
|
||||
|
@ -561,6 +570,23 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
|||
do {
|
||||
if(st3&1) {
|
||||
sa->add(sa->set, c);
|
||||
stage3+=st3Multiplier;
|
||||
} else if(useFallback) {
|
||||
uint8_t b=0;
|
||||
switch(st3Multiplier) {
|
||||
case 4:
|
||||
b|=*stage3++;
|
||||
case 3:
|
||||
b|=*stage3++;
|
||||
case 2:
|
||||
b|=stage3[0]|stage3[1];
|
||||
stage3+=2;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if(b!=0) {
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
}
|
||||
st3>>=1;
|
||||
} while((++c&0xf)!=0);
|
||||
|
@ -568,7 +594,7 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
|||
case UCNV_SET_FILTER_DBCS_ONLY:
|
||||
/* Ignore single-byte results (<0x100). */
|
||||
do {
|
||||
if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {
|
||||
if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
st3>>=1;
|
||||
|
@ -578,7 +604,7 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
|||
case UCNV_SET_FILTER_2022_CN:
|
||||
/* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
|
||||
do {
|
||||
if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {
|
||||
if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
st3>>=1;
|
||||
|
@ -588,7 +614,20 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
|||
case UCNV_SET_FILTER_SJIS:
|
||||
/* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
|
||||
do {
|
||||
if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
|
||||
if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
st3>>=1;
|
||||
stage3+=2; /* +=st3Multiplier */
|
||||
} while((++c&0xf)!=0);
|
||||
break;
|
||||
case UCNV_SET_FILTER_GR94DBCS:
|
||||
/* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
|
||||
do {
|
||||
if( ((st3&1)!=0 || useFallback) &&
|
||||
(uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfefe-0xa1a1) &&
|
||||
(uint8_t)(value-0xa1)<=(0xfe-0xa1)
|
||||
) {
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
st3>>=1;
|
||||
|
@ -609,7 +648,7 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
|||
}
|
||||
}
|
||||
|
||||
ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
|
||||
ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
|
||||
}
|
||||
|
||||
U_CFUNC void
|
||||
|
|
|
@ -492,6 +492,7 @@ typedef enum UConverterSetFilter {
|
|||
UCNV_SET_FILTER_DBCS_ONLY,
|
||||
UCNV_SET_FILTER_2022_CN,
|
||||
UCNV_SET_FILTER_SJIS,
|
||||
UCNV_SET_FILTER_GR94DBCS,
|
||||
UCNV_SET_FILTER_COUNT
|
||||
} UConverterSetFilter;
|
||||
|
||||
|
|
|
@ -870,6 +870,8 @@ ucnv_getStarters(const UConverter* converter,
|
|||
typedef enum UConverterUnicodeSet {
|
||||
/** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */
|
||||
UCNV_ROUNDTRIP_SET,
|
||||
/** Select the set of Unicode code points with roundtrip or fallback mappings. @draft ICU 4.0 */
|
||||
UCNV_ROUNDTRIP_AND_FALLBACK_SET,
|
||||
/** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */
|
||||
UCNV_SET_COUNT
|
||||
} UConverterUnicodeSet;
|
||||
|
@ -878,11 +880,16 @@ typedef enum UConverterUnicodeSet {
|
|||
/**
|
||||
* Returns the set of Unicode code points that can be converted by an ICU converter.
|
||||
*
|
||||
* The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET):
|
||||
* Returns one of several kinds of set:
|
||||
*
|
||||
* 1. UCNV_ROUNDTRIP_SET
|
||||
*
|
||||
* The set of all Unicode code points that can be roundtrip-converted
|
||||
* (converted without any data loss) with the converter.
|
||||
* (converted without any data loss) with the converter (ucnv_fromUnicode()).
|
||||
* This set will not include code points that have fallback mappings
|
||||
* or are only the result of reverse fallback mappings.
|
||||
* This set will also not include PUA code points with fallbacks, although
|
||||
* ucnv_fromUnicode() will always use those mappings despite ucnv_setFallback().
|
||||
* See UTR #22 "Character Mapping Markup Language"
|
||||
* at http://www.unicode.org/reports/tr22/
|
||||
*
|
||||
|
@ -893,6 +900,12 @@ typedef enum UConverterUnicodeSet {
|
|||
* by comparing its roundtrip set with the set of ExemplarCharacters from
|
||||
* ICU's locale data or other sources
|
||||
*
|
||||
* 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET
|
||||
*
|
||||
* The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode())
|
||||
* when fallbacks are turned on (see ucnv_setFallback()).
|
||||
* This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks).
|
||||
*
|
||||
* In the future, there may be more UConverterUnicodeSet choices to select
|
||||
* sets with different properties.
|
||||
*
|
||||
|
|
|
@ -36,6 +36,9 @@ USetAddString(USet *set, const UChar *str, int32_t length);
|
|||
typedef void U_CALLCONV
|
||||
USetRemove(USet *set, UChar32 c);
|
||||
|
||||
typedef void U_CALLCONV
|
||||
USetRemoveRange(USet *set, UChar32 start, UChar32 end);
|
||||
|
||||
/**
|
||||
* Interface for adding items to a USet, to keep low-level code from
|
||||
* statically depending on the USet implementation.
|
||||
|
@ -47,6 +50,7 @@ struct USetAdder {
|
|||
USetAddRange *addRange;
|
||||
USetAddString *addString;
|
||||
USetRemove *remove;
|
||||
USetRemoveRange *removeRange;
|
||||
};
|
||||
typedef struct USetAdder USetAdder;
|
||||
|
||||
|
|
|
@ -70,6 +70,7 @@ ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
|
|||
case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
|
||||
case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
|
||||
case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
|
||||
case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
|
||||
default: name=""; break; //needed to end loop
|
||||
}
|
||||
}
|
||||
|
@ -465,6 +466,181 @@ ConversionTest::TestGetUnicodeSet() {
|
|||
}
|
||||
}
|
||||
|
||||
static void U_EXPORT2
|
||||
getUnicodeSetCallback(const void *context,
|
||||
UConverterFromUnicodeArgs *fromUArgs,
|
||||
const UChar* codeUnits,
|
||||
int32_t length,
|
||||
UChar32 codePoint,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode *pErrorCode) {
|
||||
if(reason<=UCNV_IRREGULAR) {
|
||||
((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point
|
||||
*pErrorCode=U_ZERO_ERROR; // skip
|
||||
} // else ignore the reset, close and clone calls.
|
||||
}
|
||||
|
||||
// Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
|
||||
void
|
||||
ConversionTest::TestGetUnicodeSet2() {
|
||||
// Build a string with all code points.
|
||||
UChar32 cpLimit;
|
||||
int32_t s0Length;
|
||||
if(quick) {
|
||||
cpLimit=s0Length=0x10000; // BMP only
|
||||
} else {
|
||||
cpLimit=0x110000;
|
||||
s0Length=0x10000+0x200000; // BMP + surrogate pairs
|
||||
}
|
||||
UChar *s0=new UChar[s0Length];
|
||||
if(s0==NULL) {
|
||||
return;
|
||||
}
|
||||
UChar *s=s0;
|
||||
UChar32 c;
|
||||
UChar c2;
|
||||
// low BMP
|
||||
for(c=0; c<=0xd7ff; ++c) {
|
||||
*s++=(UChar)c;
|
||||
}
|
||||
// trail surrogates
|
||||
for(c=0xdc00; c<=0xdfff; ++c) {
|
||||
*s++=(UChar)c;
|
||||
}
|
||||
// lead surrogates
|
||||
// (after trails so that there is not even one surrogate pair in between)
|
||||
for(c=0xd800; c<=0xdbff; ++c) {
|
||||
*s++=(UChar)c;
|
||||
}
|
||||
// high BMP
|
||||
for(c=0xe000; c<=0xffff; ++c) {
|
||||
*s++=(UChar)c;
|
||||
}
|
||||
// supplementary code points = surrogate pairs
|
||||
if(cpLimit==0x110000) {
|
||||
for(c=0xd800; c<=0xdbff; ++c) {
|
||||
for(c2=0xdc00; c2<=0xdfff; ++c2) {
|
||||
*s++=(UChar)c;
|
||||
*s++=c2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const char *const cnvNames[]={
|
||||
"UTF-8",
|
||||
"UTF-7",
|
||||
"UTF-16",
|
||||
"US-ASCII",
|
||||
"ISO-8859-1",
|
||||
"windows-1252",
|
||||
"Shift-JIS",
|
||||
"ibm-1390", // EBCDIC_STATEFUL table
|
||||
"ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table
|
||||
// "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...]
|
||||
"ISO-2022-JP",
|
||||
"JIS7",
|
||||
"ISO-2022-CN",
|
||||
"ISO-2022-CN-EXT",
|
||||
// "LMBCS" TODO(markus): known bug, the fallback set is said to be missing [\uF600-\uF6FF]
|
||||
};
|
||||
char buffer[1024];
|
||||
int32_t i;
|
||||
for(i=0; i<LENGTHOF(cnvNames); ++i) {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
UConverter *cnv=cnv_open(cnvNames[i], errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
errln("failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
|
||||
continue;
|
||||
}
|
||||
UnicodeSet expected;
|
||||
ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
|
||||
ucnv_close(cnv);
|
||||
continue;
|
||||
}
|
||||
UConverterUnicodeSet which;
|
||||
for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
|
||||
if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
|
||||
ucnv_setFallback(cnv, TRUE);
|
||||
}
|
||||
expected.add(0, cpLimit-1);
|
||||
s=s0;
|
||||
UBool flush;
|
||||
do {
|
||||
char *t=buffer;
|
||||
flush=(UBool)(s==s0+s0Length);
|
||||
ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
continue;
|
||||
} else {
|
||||
break; // unexpected error, should not occur
|
||||
}
|
||||
}
|
||||
} while(!flush);
|
||||
UnicodeSet set;
|
||||
ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode);
|
||||
if(cpLimit<0x110000) {
|
||||
set.remove(cpLimit, 0x10ffff);
|
||||
}
|
||||
if(which==UCNV_ROUNDTRIP_SET) {
|
||||
// ignore PUA code points because they will be converted even if they
|
||||
// are fallbacks and when other fallbacks are turned off,
|
||||
// but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
|
||||
expected.remove(0xe000, 0xf8ff);
|
||||
expected.remove(0xf0000, 0xffffd);
|
||||
expected.remove(0x100000, 0x10fffd);
|
||||
set.remove(0xe000, 0xf8ff);
|
||||
set.remove(0xf0000, 0xffffd);
|
||||
set.remove(0x100000, 0x10fffd);
|
||||
}
|
||||
if(set!=expected) {
|
||||
// First try to see if we have different sets because ucnv_getUnicodeSet()
|
||||
// added strings: The above conversion method does not tell us what strings might be convertible.
|
||||
// Remove strings from the set and compare again.
|
||||
// Unfortunately, there are no good, direct set methods for finding out whether there are strings
|
||||
// in the set, nor for enumerating or removing just them.
|
||||
// Intersect all code points with the set. The intersection will not contain strings.
|
||||
UnicodeSet temp(0, 0x10ffff);
|
||||
temp.retainAll(set);
|
||||
set=temp;
|
||||
}
|
||||
if(set!=expected) {
|
||||
UnicodeSet diffSet;
|
||||
UnicodeString out;
|
||||
|
||||
// are there items that must be in the set but are not?
|
||||
(diffSet=expected).removeAll(set);
|
||||
if(!diffSet.isEmpty()) {
|
||||
diffSet.toPattern(out, TRUE);
|
||||
if(out.length()>100) {
|
||||
out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
|
||||
}
|
||||
errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
|
||||
cnvNames[i], which);
|
||||
errln(out);
|
||||
}
|
||||
|
||||
// are there items that must not be in the set but are?
|
||||
(diffSet=set).removeAll(expected);
|
||||
if(!diffSet.isEmpty()) {
|
||||
diffSet.toPattern(out, TRUE);
|
||||
if(out.length()>100) {
|
||||
out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
|
||||
}
|
||||
errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
|
||||
cnvNames[i], which);
|
||||
errln(out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
delete [] s0;
|
||||
}
|
||||
|
||||
// open testdata or ICU data converter ------------------------------------- ***
|
||||
|
||||
UConverter *
|
||||
|
|
|
@ -72,6 +72,7 @@ public:
|
|||
void TestToUnicode();
|
||||
void TestFromUnicode();
|
||||
void TestGetUnicodeSet();
|
||||
void TestGetUnicodeSet2();
|
||||
|
||||
private:
|
||||
UBool
|
||||
|
|
23
icu4c/source/test/testdata/conversion.txt
vendored
23
icu4c/source/test/testdata/conversion.txt
vendored
|
@ -1311,16 +1311,29 @@ conversion:table(nofallback) {
|
|||
// versions of ISO-2022-JP
|
||||
{
|
||||
"ISO-2022-JP",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",
|
||||
"[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]",
|
||||
"[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]",
|
||||
:int{0}
|
||||
}
|
||||
}
|
||||
{
|
||||
"ISO-2022-JP-2",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
|
||||
"[\x0e\x0f\x1b\uffe7-\U0010ffff]",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]",
|
||||
"[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]",
|
||||
:int{0}
|
||||
}
|
||||
{
|
||||
"JIS7",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]",
|
||||
"[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]",
|
||||
:int{0}
|
||||
}
|
||||
// with fallbacks
|
||||
{
|
||||
"ISO-2022-JP",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]",
|
||||
"[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]",
|
||||
:int{1}
|
||||
}
|
||||
|
||||
// versions of ISO-2022-CN
|
||||
{
|
||||
|
|
Loading…
Add table
Reference in a new issue