ICU-1439 implement ucnv_getUnicodeSet() for roundtrippable code points

X-SVN-Rev: 11464
This commit is contained in:
Markus Scherer 2003-04-05 01:33:02 +00:00
parent 45065374f1
commit a6213ee1c0
15 changed files with 418 additions and 29 deletions

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 1998-2001, International Business Machines
* Copyright (C) 1998-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -24,6 +24,7 @@
#include "unicode/ures.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
#include "unicode/uset.h"
#include "cmemory.h"
#include "cstring.h"
#include "umutex.h"
@ -669,6 +670,34 @@ ucnv_getPlatform (const UConverter * converter,
return (UConverterPlatform)converter->sharedData->staticData->platform;
}
/**
 * Fills a USet with the Unicode code points that a converter supports
 * for the requested kind of set.
 *
 * On success the set is emptied first and then handed to the converter
 * implementation's getUnicodeSet function, which adds code points to it.
 *
 * @param cnv        the converter to query; must not be NULL
 * @param set        the set to fill; must not be NULL
 * @param which      selects the kind of set; must satisfy
 *                   UCNV_ROUNDTRIP_SET<=which<UCNV_SET_COUNT
 * @param pErrorCode in/out error code. The function does nothing if this is
 *                   NULL or already indicates a failure. It is set to
 *                   U_ILLEGAL_ARGUMENT_ERROR for a bad argument and to
 *                   U_UNSUPPORTED_ERROR if the converter implementation
 *                   has no getUnicodeSet function; in both cases the set
 *                   is left untouched.
 * ### TODO @draft ICU 2.6
 */
U_CAPI void U_EXPORT2
ucnv_getUnicodeSet(const UConverter *cnv,
                   USet *set,
                   UConverterUnicodeSet which,
                   UErrorCode *pErrorCode) {
    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
        if(cnv==NULL || set==NULL || which<UCNV_ROUNDTRIP_SET || which>=UCNV_SET_COUNT) {
            /* bad argument */
            *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        } else if(cnv->sharedData->impl->getUnicodeSet==NULL) {
            /* this converter type does not implement the function */
            *pErrorCode=U_UNSUPPORTED_ERROR;
        } else {
            /* start from an empty set, then let the converter add its code points */
            uset_clear(set);
            cnv->sharedData->impl->getUnicodeSet(cnv, set, which, pErrorCode);
        }
    }
}
U_CAPI void U_EXPORT2
ucnv_getToUCallBack (const UConverter * converter,
UConverterToUCallback *action,

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2000-2001, International Business Machines
* Copyright (C) 2000-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -17,8 +17,9 @@
#include "unicode/utypes.h"
#include "unicode/ucnv_err.h"
#include "ucnv_cnv.h"
#include "unicode/ucnv.h"
#include "unicode/uset.h"
#include "ucnv_cnv.h"
#include "cmemory.h"
/*Empties the internal unicode output buffer */
@ -239,3 +240,20 @@ ucnv_getNextUCharFromToUImpl(UConverterToUnicodeArgs *pArgs,
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0xffff;
}
/*
 * Shared getUnicodeSet implementation: adds every code point
 * U+0000..U+10FFFF to the set.
 * Only the set parameter is used; cnv, which and pErrorCode are ignored.
 */
U_CFUNC void
ucnv_getCompleteUnicodeSet(const UConverter *cnv,
                           USet *set,
                           UConverterUnicodeSet which,
                           UErrorCode *pErrorCode) {
    (void)cnv;
    (void)which;
    (void)pErrorCode;

    uset_addRange(set, 0x0000, 0x10ffff);
}
/*
 * Shared getUnicodeSet implementation: adds all code points except the
 * surrogate range, i.e. U+0000..U+D7FF and U+E000..U+10FFFF.
 * Only the set parameter is used; cnv, which and pErrorCode are ignored.
 */
U_CFUNC void
ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
                               USet *set,
                               UConverterUnicodeSet which,
                               UErrorCode *pErrorCode) {
    (void)cnv;
    (void)which;
    (void)pErrorCode;

    /* below the surrogates */
    uset_addRange(set, 0x0000, 0xd7ff);
    /* above the surrogates */
    uset_addRange(set, 0xe000, 0x10ffff);
}

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2002, International Business Machines
* Copyright (C) 1999-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@ -19,6 +19,7 @@
#define UCNV_CNV_H
#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
#include "ucnv_bld.h"
#include "ucnvmbcs.h"
@ -97,6 +98,12 @@ typedef UConverter * (*UConverterSafeClone) (const UConverter *cnv,
int32_t *pBufferSize,
UErrorCode *status);
/**
 * Per-converter-type function that adds the code points supported by cnv
 * to set. ucnv_getUnicodeSet() calls it through UConverterImpl.getUnicodeSet
 * after checking its arguments and emptying the set, so implementations
 * only add to the set.
 * A NULL function pointer makes ucnv_getUnicodeSet() report
 * U_UNSUPPORTED_ERROR.
 * ### TODO @draft ICU 2.6
 */
typedef void (*UConverterGetUnicodeSet) (const UConverter *cnv,
USet *set,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);
UBool CONVERSION_U_SUCCESS (UErrorCode err);
void ucnv_flushInternalUnicodeBuffer (UConverter * _this,
@ -149,6 +156,7 @@ struct UConverterImpl {
UConverterGetName getName;
UConverterWriteSub writeSub;
UConverterSafeClone safeClone;
UConverterGetUnicodeSet getUnicodeSet; /* ### TODO ICU 2.6 */
};
extern const UConverterSharedData
@ -231,4 +239,16 @@ ucnv_getNextUCharFromToUImpl(UConverterToUnicodeArgs *pArgs,
UBool collectPairs,
UErrorCode *pErrorCode);
/*
 * UConverterGetUnicodeSet implementation shared by several converters:
 * adds the whole code point range U+0000..U+10FFFF to set.
 */
U_CFUNC void
ucnv_getCompleteUnicodeSet(const UConverter *cnv,
USet *set,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);
/*
 * UConverterGetUnicodeSet implementation that adds all code points except
 * surrogates to set: U+0000..U+D7FF and U+E000..U+10FFFF.
 */
U_CFUNC void
ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
USet *set,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);
#endif /* UCNV_CNV */

View file

@ -519,7 +519,11 @@ static const UConverterImpl _LMBCSImpl##n={\
_LMBCSFromUnicode,\
_LMBCSFromUnicode,\
_LMBCSGetNextUChar,\
NULL\
NULL,\
NULL,\
NULL,\
NULL,\
ucnv_getCompleteUnicodeSet\
};\
static const UConverterStaticData _LMBCSStaticData##n={\
sizeof(UConverterStaticData),\

View file

@ -439,7 +439,10 @@ static const UConverterImpl _UTF16BEImpl={
T_UConverter_getNextUChar_UTF16_BE,
NULL,
NULL
NULL,
NULL,
NULL,
ucnv_getCompleteUnicodeSet
};
/* The 1200 CCSID refers to any version of Unicode with any endianness of UTF-16 */
@ -532,7 +535,10 @@ static const UConverterImpl _UTF16LEImpl={
T_UConverter_getNextUChar_UTF16_LE,
NULL,
NULL
NULL,
NULL,
NULL,
ucnv_getCompleteUnicodeSet
};
@ -761,7 +767,8 @@ static const UConverterImpl _UTF16Impl = {
NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
NULL,
NULL,
NULL
NULL,
ucnv_getCompleteUnicodeSet
};
static const UConverterStaticData _UTF16StaticData = {

View file

@ -551,7 +551,10 @@ static const UConverterImpl _UTF32BEImpl = {
T_UConverter_getNextUChar_UTF32_BE,
NULL,
NULL
NULL,
NULL,
NULL,
ucnv_getCompleteUnicodeSet
};
/* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
@ -1035,7 +1038,10 @@ static const UConverterImpl _UTF32LEImpl = {
T_UConverter_getNextUChar_UTF32_LE,
NULL,
NULL
NULL,
NULL,
NULL,
ucnv_getCompleteUnicodeSet
};
/* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
@ -1292,7 +1298,8 @@ static const UConverterImpl _UTF32Impl = {
NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
NULL,
NULL,
NULL
NULL,
ucnv_getCompleteUnicodeSet
};
static const UConverterStaticData _UTF32StaticData = {

View file

@ -792,7 +792,9 @@ static const UConverterImpl _UTF7Impl={
NULL,
_UTF7GetName,
NULL /* we don't need writeSub() because we never call a callback at fromUnicode() */
NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
NULL,
ucnv_getCompleteUnicodeSet
};
static const UConverterStaticData _UTF7StaticData={
@ -1527,7 +1529,9 @@ static const UConverterImpl _IMAPImpl={
NULL,
NULL,
NULL /* we don't need writeSub() because we never call a callback at fromUnicode() */
NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
NULL,
ucnv_getCompleteUnicodeSet
};
static const UConverterStaticData _IMAPStaticData={

View file

@ -975,7 +975,8 @@ static const UConverterImpl _UTF8Impl={
NULL,
NULL,
NULL,
NULL
NULL,
ucnv_getNonSurrogateUnicodeSet
};
/* The 1208 CCSID refers to any version of Unicode of UTF-8 */

View file

@ -1597,7 +1597,8 @@ static const UConverterImpl _Bocu1Impl={
NULL,
NULL,
NULL,
NULL
NULL,
ucnv_getCompleteUnicodeSet
};
static const UConverterStaticData _Bocu1StaticData={

View file

@ -20,10 +20,11 @@
#include "cmemory.h"
#include "unicode/ucnv_err.h"
#include "ucnv_bld.h"
#include "unicode/ucnv.h"
#include "ucnv_cnv.h"
#include "unicode/ucnv_cb.h"
#include "unicode/uset.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#define UCNV_TILDE 0x7E /* ~ */
#define UCNV_OPEN_BRACE 0x7B /* { */
@ -635,7 +636,20 @@ _HZ_SafeClone(const UConverter *cnv,
return &localClone->cnv;
}
/*
 * getUnicodeSet implementation for HZ: adds the tilde, which the HZ
 * converter handles itself, plus everything that the underlying
 * sub-converter (gbConverter in the HZ extraInfo) adds for itself.
 */
static void
_HZ_GetUnicodeSet(const UConverter *cnv,
                  USet *set,
                  UConverterUnicodeSet which,
                  UErrorCode *pErrorCode) {
    /* the sub-converter that does the actual table lookups */
    const UConverter *gbCnv=((UConverterDataHZ*)cnv->extraInfo)->gbConverter;

    /* the tilde '~' is hardcoded in the converter */
    uset_add(set, 0x7e);

    /* add all of the code points that the sub-converter handles */
    gbCnv->sharedData->impl->getUnicodeSet(gbCnv, set, which, pErrorCode);
}
static const UConverterImpl _HZImpl={
@ -657,7 +671,8 @@ static const UConverterImpl _HZImpl={
NULL,
NULL,
_HZ_WriteSub,
_HZ_SafeClone
_HZ_SafeClone,
_HZ_GetUnicodeSet
};
static const UConverterStaticData _HZStaticData={

View file

@ -15,6 +15,7 @@
#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
#include "unicode/uset.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
@ -425,6 +426,14 @@ getTrail:
pArgs->offsets=offsets;
}
/*
 * getUnicodeSet implementation for Latin-1: adds U+0000..U+00FF to the set.
 * cnv, which and pErrorCode are not used.
 */
static void
_Latin1GetUnicodeSet(const UConverter *cnv,
                     USet *set,
                     UConverterUnicodeSet which,
                     UErrorCode *pErrorCode) {
    (void)cnv;
    (void)which;
    (void)pErrorCode;

    uset_addRange(set, 0x00, 0xff);
}
static const UConverterImpl _Latin1Impl={
UCNV_LATIN_1,
@ -442,7 +451,10 @@ static const UConverterImpl _Latin1Impl={
_Latin1GetNextUChar,
NULL,
NULL
NULL,
NULL,
NULL,
_Latin1GetUnicodeSet
};
static const UConverterStaticData _Latin1StaticData={
@ -713,6 +725,14 @@ _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
return 0xffff;
}
/*
 * getUnicodeSet implementation for US-ASCII: adds U+0000..U+007F to the set.
 * cnv, which and pErrorCode are not used.
 */
static void
_ASCIIGetUnicodeSet(const UConverter *cnv,
                    USet *set,
                    UConverterUnicodeSet which,
                    UErrorCode *pErrorCode) {
    (void)cnv;
    (void)which;
    (void)pErrorCode;

    uset_addRange(set, 0x00, 0x7f);
}
static const UConverterImpl _ASCIIImpl={
UCNV_US_ASCII,
@ -730,7 +750,10 @@ static const UConverterImpl _ASCIIImpl={
_ASCIIGetNextUChar,
NULL,
NULL
NULL,
NULL,
NULL,
_ASCIIGetUnicodeSet
};
static const UConverterStaticData _ASCIIStaticData={

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2000-2001, International Business Machines
* Copyright (C) 2000-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -43,6 +43,7 @@
#include "unicode/ucnv.h"
#include "unicode/ucnv_cb.h"
#include "unicode/udata.h"
#include "unicode/uset.h"
#include "ucnv_bld.h"
#include "ucnvmbcs.h"
#include "ucnv_cnv.h"
@ -398,7 +399,7 @@ _MBCSSizeofFromUBytes(UConverterMBCSTable *mbcsTable) {
for(st1=0; st1<maxStage1; ++st1) {
st2=table[st1];
if(st2!=0) {
if(st2>maxStage1) {
stage2=table+st2;
for(st2=0; st2<64; ++st2) {
st3=stage2[st2];
@ -419,7 +420,7 @@ _MBCSSizeofFromUBytes(UConverterMBCSTable *mbcsTable) {
for(st1=0; st1<maxStage1; ++st1) {
st2=table[st1];
if(st2!=0) {
if(st2>(maxStage1>>1)) {
stage2=(const uint32_t *)table+st2;
for(st2=0; st2<64; ++st2) {
st3=stage2[st2]&0xffff;
@ -452,6 +453,105 @@ _MBCSSizeofFromUBytes(UConverterMBCSTable *mbcsTable) {
}
}
/*
 * getUnicodeSet implementation for MBCS converters.
 *
 * Adds to set the code points that are marked as roundtrip mappings in the
 * converter's from-Unicode trie. Converters opened with the GB 18030 option
 * get all non-surrogate code points instead.
 *
 * The which and pErrorCode parameters are not used in this function body;
 * the caller (ucnv_getUnicodeSet()) has already validated which.
 *
 * The trie is walked in code point order, with c counting along:
 * each stage 1 entry accounts for 64 stage 2 entries, and each stage 2
 * entry for a stage 3 block of 16 code points (64*16=1024 per stage 1 entry).
 */
static void
_MBCSGetUnicodeSet(const UConverter *cnv,
USet *set,
UConverterUnicodeSet which,
UErrorCode *pErrorCode) {
UConverterMBCSTable *mbcsTable;
const uint16_t *table;
uint32_t st3;
uint16_t st1, maxStage1, st2;
UChar32 c;
/* GB 18030: all code points except surrogates, without looking at the table */
if(cnv->options&_MBCS_OPTION_GB18030) {
uset_addRange(set, 0, 0xd7ff);
uset_addRange(set, 0xe000, 0x10ffff);
return;
}
/* enumerate the from-Unicode trie table */
mbcsTable=&cnv->sharedData->table->mbcs;
table=mbcsTable->fromUnicodeTable;
/*
 * Number of stage 1 entries to visit:
 * 0x440*1024 code points reach up to U+10FFFF, 0x40*1024 up to U+FFFF.
 */
if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
maxStage1=0x440;
} else {
maxStage1=0x40;
}
c=0; /* keep track of the current code point while enumerating */
if(mbcsTable->outputType==MBCS_OUTPUT_1) {
/* single-byte output: 16-bit stage 2 entries, 16-bit stage 3 results in fromUnicodeBytes */
const uint16_t *stage2, *stage3, *results;
results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
for(st1=0; st1<maxStage1; ++st1) {
st2=table[st1];
/*
 * Values <=maxStage1 are treated as "no data for this stage 1 entry".
 * NOTE(review): presumably the first stage 2 block, directly after
 * stage 1, is the shared all-unassigned block - confirm against the
 * table builder.
 */
if(st2>maxStage1) {
stage2=table+st2;
/* st2 is reused as the index into this stage 2 block */
for(st2=0; st2<64; ++st2) {
if((st3=stage2[st2])!=0) {
/* read the stage 3 block */
stage3=results+st3;
/*
* Add code points for which the roundtrip flag is set.
* Once we get a set for fallback mappings, we have to use
* a threshold variable with a value of 0x800.
* See _MBCSSingleFromBMPWithOffsets() and
* MBCS_SINGLE_RESULT_FROM_U() for details.
*/
/* visit the 16 results of this block; the loop ends when c reaches the next multiple of 16 */
do {
if(*stage3++>=0xf00) {
uset_add(set, c);
}
} while((++c&0xf)!=0);
} else {
c+=16; /* empty stage 3 block */
}
}
} else {
c+=1024; /* empty stage 2 block */
}
}
} else {
/* all other output types: stage 2 is read as 32-bit entries */
const uint32_t *stage2;
for(st1=0; st1<maxStage1; ++st1) {
st2=table[st1];
/*
 * Here st2 indexes 32-bit units (see the cast below), so the
 * "no data" threshold is half of the 16-bit stage 1 length.
 */
if(st2>(maxStage1>>1)) {
stage2=(const uint32_t *)table+st2;
/* st2 is reused as the index into this stage 2 block */
for(st2=0; st2<64; ++st2) {
if((st3=stage2[st2])!=0) {
/* get the roundtrip flags for the stage 3 block */
st3>>=16;
/*
* Add code points for which the roundtrip flag is set.
* Once we get a set for fallback mappings, we have to check
* non-roundtrip stage 3 results for whether they are 0.
* See _MBCSFromUnicodeWithOffsets() for details.
*/
/* one flag bit per code point, lowest bit first, for the 16 code points of this block */
do {
if(st3&1) {
uset_add(set, c);
}
st3>>=1;
} while((++c&0xf)!=0);
} else {
c+=16; /* empty stage 3 block */
}
}
} else {
c+=1024; /* empty stage 2 block */
}
}
}
}
/* EBCDIC swap LF<->NL ------------------------------------------------------ */
/*
@ -3561,7 +3661,9 @@ static const UConverterImpl _MBCSImpl={
_MBCSGetStarters,
_MBCSGetName,
_MBCSWriteSub
_MBCSWriteSub,
NULL,
_MBCSGetUnicodeSet
};

View file

@ -2198,7 +2198,8 @@ static const UConverterImpl _SCSUImpl={
NULL,
_SCSUGetName,
_SCSUWriteSub,
_SCSUSafeClone
_SCSUSafeClone,
ucnv_getCompleteUnicodeSet
};
static const UConverterStaticData _SCSUStaticData={

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2002, International Business Machines
* Copyright (C) 1999-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* ucnv.h:
@ -51,6 +51,14 @@ typedef struct UConverter UConverter;
#include "unicode/ucnv_err.h"
#include "unicode/uenum.h"
#ifndef __USET_H__
/* see unicode/uset.h */
struct USet;
typedef struct USet USet;
#endif
U_CDECL_BEGIN
/** Maximum length of a converter name including the terminating NULL @stable ICU 2.0 */
@ -679,6 +687,19 @@ ucnv_getStarters(const UConverter* converter,
UBool starters[256],
UErrorCode* err);
/**
 * Selectors for the kind of Unicode set that ucnv_getUnicodeSet() returns.
 * ### TODO @draft ICU 2.6
 */
typedef enum UConverterUnicodeSet {
/** Select the set of roundtrippable code points. */
UCNV_ROUNDTRIP_SET,
/** Number of selector values; not a valid selector itself (ucnv_getUnicodeSet() rejects it). */
UCNV_SET_COUNT
} UConverterUnicodeSet;
/**
 * Fills a USet with the Unicode code points that the converter supports
 * for the selected kind of set (currently only roundtrippable code points).
 * The set is emptied before code points are added.
 *
 * @param cnv        the converter to query; must not be NULL
 * @param set        the set to fill; must not be NULL (see unicode/uset.h)
 * @param which      UCNV_ROUNDTRIP_SET; other values are rejected
 * @param pErrorCode in/out error code. Nothing is done if it is NULL or
 *                   already indicates a failure. Set to
 *                   U_ILLEGAL_ARGUMENT_ERROR for a bad argument, or to
 *                   U_UNSUPPORTED_ERROR if the converter type does not
 *                   implement this function.
 * ### TODO @draft ICU 2.6
 */
U_CAPI void U_EXPORT2
ucnv_getUnicodeSet(const UConverter *cnv,
USet *set,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);
/**
* Gets the current callback function used by the converter when an illegal
* or invalid codepage sequence is found.

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2001, International Business Machines Corporation and
* Copyright (c) 1997-2003, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@ -16,9 +16,10 @@
#include "cmemory.h"
#include "unicode/uloc.h"
#include "unicode/ucnv.h"
#include "cintltst.h"
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/uset.h"
#include "cintltst.h"
#define MAX_LENGTH 999
@ -31,7 +32,7 @@ static int32_t gOutBufferSize = 0;
static char gNuConvTestName[1024];
#define nct_min(x,y) ((x<y) ? x : y)
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
static void printSeq(const unsigned char* a, int len);
static void printSeqErr(const unsigned char* a, int len);
@ -67,6 +68,7 @@ static void TestAvailableConverters(void);
static void TestFlushInternalBuffer(void); /*for improved code coverage in ucnv_cnv.c*/
static void TestResetBehaviour(void);
static void TestTruncated(void);
static void TestUnicodeSet(void);
static void TestWithBufferSize(int32_t osize, int32_t isize);
@ -120,6 +122,7 @@ void addExtraTests(TestNode** root)
addTest(root, &TestRegressionUTF8, "tsconv/ncnvtst/TestRegressionUTF8");
addTest(root, &TestRegressionUTF32, "tsconv/ncnvtst/TestRegressionUTF32");
addTest(root, &TestTruncated, "tsconv/ncnvtst/TestTruncated");
addTest(root, &TestUnicodeSet, "tsconv/ncnvtst/TestUnicodeSet");
}
/*test surrogate behaviour*/
@ -1810,3 +1813,136 @@ TestTruncated() {
doTestTruncated(testCases[i].cnvName, testCases[i].bytes, testCases[i].length);
}
}
/*
 * One expectation for TestUnicodeSet(): a converter name plus code point
 * ranges that its roundtrip set must or must not contain.
 * The field order matters because the test table uses positional initializers.
 */
typedef struct NameRange {
    const char *name;   /* converter name passed to ucnv_open() */
    UChar32 start;      /* start..end must be contained in the set */
    UChar32 end;
    UChar32 start2;     /* start2..end2 must also be contained; skipped if start2<0 */
    UChar32 end2;
    UChar32 notStart;   /* no code point of notStart..notEnd may be in the set; skipped if notStart<0 */
    UChar32 notEnd;
} NameRange;
/*
 * Tests ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) in three passes:
 * 1. every available converter must succeed and return a non-empty set;
 * 2. converters known to cover all of Unicode must return a set that
 *    contains all non-surrogate code points;
 * 3. selected converters must contain/not contain specific ranges.
 *
 * One USet is opened up front, reused for every converter, and closed
 * at the end.
 */
static void
TestUnicodeSet() {
    UErrorCode errorCode;
    UConverter *cnv;
    USet *set;

    const char *name;
    int32_t i, count;

    static const char *const completeSetNames[]={
        "UTF-7",
        "UTF-8",
        "UTF-16",
        "UTF-16BE",
        "UTF-16LE",
        "UTF-32",
        "UTF-32BE",
        "UTF-32LE",
        "SCSU",
        "BOCU-1",
        "CESU-8",
        "gb18030",
        "IMAP-mailbox-name",
        "LMBCS-1",
        "LMBCS-2",
        "LMBCS-3",
        "LMBCS-4",
        "LMBCS-5",
        "LMBCS-6",
        "LMBCS-8",
        "LMBCS-11",
        "LMBCS-16",
        "LMBCS-17",
        "LMBCS-18",
        "LMBCS-19"
    };

    /* name, must-contain range, second must-contain range (or -1), must-not-contain range */
    static const NameRange nameRanges[]={
        { "US-ASCII", 0, 0x7f, -1, -1, 0x80, 0x10ffff },
        { "ibm-367", 0, 0x7f, -1, -1, 0x80, 0x10ffff },
        { "ISO-8859-1", 0, 0x7f, -1, -1, 0x100, 0x10ffff },
        { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff },
        { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff },
        { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff },
        { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff }
    };

    /* open an empty set */
    set=uset_open(1, 0);

    /* pass 1: every available converter returns a non-empty set */
    count=ucnv_countAvailable();
    for(i=0; i<count; ++i) {
        errorCode=U_ZERO_ERROR;
        name=ucnv_getAvailableName(i);
        cnv=ucnv_open(name, &errorCode);
        if(U_FAILURE(errorCode)) {
            log_err("error: unable to open converter %s - %s\n",
                    name, u_errorName(errorCode));
            continue;
        }

        uset_clear(set);
        ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &errorCode);
        if(U_FAILURE(errorCode)) {
            log_err("error: ucnv_getUnicodeSet(%s) failed - %s\n",
                    name, u_errorName(errorCode));
        } else if(uset_size(set)==0) {
            log_err("error: ucnv_getUnicodeSet(%s) returns an empty set\n", name);
        }

        ucnv_close(cnv);
    }

    /* test converters that are known to convert all of Unicode (except maybe for surrogates) */
    for(i=0; i<LENGTHOF(completeSetNames); ++i) {
        errorCode=U_ZERO_ERROR;
        name=completeSetNames[i];
        cnv=ucnv_open(name, &errorCode);
        if(U_FAILURE(errorCode)) {
            log_err("error: unable to open converter %s - %s\n",
                    name, u_errorName(errorCode));
            continue;
        }

        uset_clear(set);
        ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &errorCode);
        if(U_FAILURE(errorCode)) {
            log_err("error: ucnv_getUnicodeSet(%s) failed - %s\n",
                    name, u_errorName(errorCode));
        } else if(!uset_containsRange(set, 0, 0xd7ff) || !uset_containsRange(set, 0xe000, 0x10ffff)) {
            log_err("error: ucnv_getUnicodeSet(%s) does not return an all-Unicode set\n", name);
        }

        ucnv_close(cnv);
    }

    /* test specific sets */
    for(i=0; i<LENGTHOF(nameRanges); ++i) {
        errorCode=U_ZERO_ERROR;
        name=nameRanges[i].name;
        cnv=ucnv_open(name, &errorCode);
        if(U_FAILURE(errorCode)) {
            log_data_err("error: unable to open converter %s - %s\n",
                         name, u_errorName(errorCode));
            continue;
        }

        uset_clear(set);
        ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &errorCode);
        if(U_FAILURE(errorCode)) {
            log_err("error: ucnv_getUnicodeSet(%s) failed - %s\n",
                    name, u_errorName(errorCode));
        } else if(
            !uset_containsRange(set, nameRanges[i].start, nameRanges[i].end) ||
            /* the second range is optional: start2<0 means "not specified" */
            (nameRanges[i].start2>=0 && !uset_containsRange(set, nameRanges[i].start2, nameRanges[i].end2))
        ) {
            log_err("error: ucnv_getUnicodeSet(%s) does not contain the expected ranges\n", name);
        } else if(nameRanges[i].notStart>=0) {
            /*
             * simulate containsAny() with the C API:
             * the set contains none of the range if and only if
             * its complement contains all of the range
             */
            uset_complement(set);
            if(!uset_containsRange(set, nameRanges[i].notStart, nameRanges[i].notEnd)) {
                log_err("error: ucnv_getUnicodeSet(%s) contains part of the unexpected range\n", name);
            }
        }

        ucnv_close(cnv);
    }

    /* release the set opened at the top; it was previously leaked */
    uset_close(set);
}