diff --git a/icu4c/source/common/ucnv_bld.c b/icu4c/source/common/ucnv_bld.c index a6030b92642..c12827e7b0e 100644 --- a/icu4c/source/common/ucnv_bld.c +++ b/icu4c/source/common/ucnv_bld.c @@ -78,7 +78,7 @@ converterData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]={ #endif &_ASCIIData, - &_UTF7Data, &_Bocu1Data, &_UTF16Data, &_UTF32Data, &_CESU8Data + &_UTF7Data, &_Bocu1Data, &_UTF16Data, &_UTF32Data, &_CESU8Data, &_IMAPData }; /* Please keep this in binary sorted order for getAlgorithmicTypeFromName. @@ -93,6 +93,7 @@ static struct { { "cesu8", UCNV_CESU8 }, #if !UCONFIG_NO_LEGACY_CONVERSION { "hz",UCNV_HZ }, + { "imapmailboxname", UCNV_IMAP_MAILBOX }, { "iscii", UCNV_ISCII }, { "iso2022", UCNV_ISO_2022 }, #endif diff --git a/icu4c/source/common/ucnv_cnv.h b/icu4c/source/common/ucnv_cnv.h index 6d98100771c..d21e9468977 100644 --- a/icu4c/source/common/ucnv_cnv.h +++ b/icu4c/source/common/ucnv_cnv.h @@ -158,7 +158,7 @@ extern const UConverterSharedData _LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6, _LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19, _HZData,_ISCIIData, _SCSUData, _ASCIIData, - _UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data; + _UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data, _IMAPData; U_CDECL_END diff --git a/icu4c/source/common/ucnv_u7.c b/icu4c/source/common/ucnv_u7.c index fc59efefe50..d6701edd7d5 100644 --- a/icu4c/source/common/ucnv_u7.c +++ b/icu4c/source/common/ucnv_u7.c @@ -498,7 +498,7 @@ callback: static UChar32 _UTF7GetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) { - return ucnv_getNextUCharFromToUImpl(pArgs, _UTF7ToUnicodeWithOffsets, TRUE, pErrorCode); + return ucnv_getNextUCharFromToUImpl(pArgs, pArgs->converter->sharedData->impl->toUnicode, TRUE, pErrorCode); } static void @@ -618,7 +618,7 @@ unicodeMode: if(targetcharErrorBuffer[0]=MINUS; @@ -744,8 +744,7 @@ unicodeMode: *offsets++=sourceIndex-1; } } else { - cnv->charErrorBuffer[0]=toBase64[bits]; - cnv->charErrorBufferLength=1; + cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits]; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } } @@ -814,3 +813,738 @@ const UConverterSharedData _UTF7Data={ NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl, 0 }; + +/* IMAP mailbox name encoding ----------------------------------------------- */ + +/* + * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1 + * http://www.ietf.org/rfc/rfc2060.txt + * + * 5.1.3. Mailbox International Naming Convention + * + * By convention, international mailbox names are specified using a + * modified version of the UTF-7 encoding described in [UTF-7]. The + * purpose of these modifications is to correct the following problems + * with UTF-7: + * + * 1) UTF-7 uses the "+" character for shifting; this conflicts with + * the common use of "+" in mailbox names, in particular USENET + * newsgroup names. + * + * 2) UTF-7's encoding is BASE64 which uses the "/" character; this + * conflicts with the use of "/" as a popular hierarchy delimiter. + * + * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with + * the use of "\" as a popular hierarchy delimiter. + * + * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with + * the use of "~" in some servers as a home directory indicator. + * + * 5) UTF-7 permits multiple alternate forms to represent the same + * string; in particular, printable US-ASCII chararacters can be + * represented in encoded form. + * + * In modified UTF-7, printable US-ASCII characters except for "&" + * represent themselves; that is, characters with octet values 0x20-0x25 + * and 0x27-0x7e. The character "&" (0x26) is represented by the two- + * octet sequence "&-". + * + * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all + * Unicode 16-bit octets) are represented in modified BASE64, with a + * further modification from [UTF-7] that "," is used instead of "/". + * Modified BASE64 MUST NOT be used to represent any printing US-ASCII + * character which can represent itself. + * + * "&" is used to shift to modified BASE64 and "-" to shift back to US- + * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that + * is, a name that ends with a Unicode 16-bit octet MUST end with a "- + * "). + * + * For example, here is a mailbox name which mixes English, Japanese, + * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw- + */ + +/* + * Tests for US-ASCII characters belonging to character classes + * defined in UTF-7. + * + * Set D (directly encoded characters) consists of the following + * characters: the upper and lower case letters A through Z + * and a through z, the 10 digits 0-9, and the following nine special + * characters (note that "+" and "=" are omitted): + * '(),-./:? + * + * Set O (optional direct characters) consists of the following + * characters (note that "\" and "~" are omitted): + * !"#$%&*;<=>@[]^_`{|} + * + * According to the rules in RFC 2152, the byte values for the following + * US-ASCII characters are not used in UTF-7 and are therefore illegal: + * - all C0 control codes except for CR LF TAB + * - BACKSLASH + * - TILDE + * - DEL + * - all codes beyond US-ASCII, i.e. all >127 + */ + +/* uses '&' not '+' to start a base64 sequence */ +#define AMPERSAND 0x26 +#define COMMA 0x2c +#define SLASH 0x2f + +/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */ +#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e) + +/* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */ +#define inSetDIMAP(c) (isLegalIMAP(c) && c!=AMPERSAND) + +#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA) +#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c]) + +/* + * converter status values: + * + * toUnicodeStatus: + * 24 inDirectMode (boolean) + * 23..16 base64Counter (-1..7) + * 15..0 bits (up to 14 bits incoming base64) + * + * fromUnicodeStatus: + * 24 inDirectMode (boolean) + * 23..16 base64Counter (0..2) + * 7..0 bits (6 bits outgoing base64) + * + * ignore bits 31..25 + */ + +static void +_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv; + const uint8_t *source, *sourceLimit; + UChar *target; + const UChar *targetLimit; + int32_t *offsets; + + uint8_t *bytes; + uint8_t byteIndex; + + int32_t length, targetCapacity; + + /* UTF-7 state */ + uint16_t bits; + int8_t base64Counter; + UBool inDirectMode; + + int8_t base64Value; + + int32_t sourceIndex, nextSourceIndex; + + UChar c; + uint8_t b; + + /* set up the local pointers */ + cnv=pArgs->converter; + + source=(const uint8_t *)pArgs->source; + sourceLimit=(const uint8_t *)pArgs->sourceLimit; + target=pArgs->target; + targetLimit=pArgs->targetLimit; + offsets=pArgs->offsets; + /* get the state machine state */ + { + uint32_t status=cnv->toUnicodeStatus; + inDirectMode=(UBool)((status>>24)&1); + base64Counter=(int8_t)(status>>16); + bits=(uint16_t)status; + } + bytes=cnv->toUBytes; + byteIndex=cnv->toULength; + + /* sourceIndex=-1 if the current character began in the previous buffer */ + sourceIndex=byteIndex==0 ? 0 : -1; + nextSourceIndex=0; + +loop: + if(inDirectMode) { +directMode: + /* + * In Direct Mode, US-ASCII characters are encoded directly, i.e., + * with their US-ASCII byte values. + * An ampersand starts Unicode (or "escape") Mode. + * + * In Direct Mode, only the sourceIndex is used. + */ + byteIndex=0; + length=sourceLimit-source; + targetCapacity=targetLimit-target; + if(length>targetCapacity) { + length=targetCapacity; + } + while(length>0) { + b=*source++; + if(!isLegalIMAP(b)) { + /* illegal */ + bytes[0]=b; + byteIndex=1; + nextSourceIndex=sourceIndex+1; + goto callback; + } else if(b!=AMPERSAND) { + /* write directly encoded character */ + *target++=b; + if(offsets!=NULL) { + *offsets++=sourceIndex++; + } + } else /* AMPERSAND */ { + /* switch to Unicode mode */ + nextSourceIndex=++sourceIndex; + inDirectMode=FALSE; + byteIndex=0; + bits=0; + base64Counter=-1; + goto unicodeMode; + } + --length; + } + if(source=targetLimit) { + /* target is full */ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + } else { +unicodeMode: + /* + * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded. + * The base64 sequence ends with any character that is not in the base64 alphabet. + * A terminating minus sign is consumed. + * US-ASCII must not be base64-ed. + * + * In Unicode Mode, the sourceIndex has the index to the start of the current + * base64 bytes, while nextSourceIndex is precisely parallel to source, + * keeping the index to the following byte. + * Note that in 2 out of 3 cases, UChars overlap within a base64 byte. + */ + while(source0x7e) { + /* illegal - test other illegal US-ASCII values by base64Value==-3 */ + inDirectMode=TRUE; + goto callback; + } else if((base64Value=FROM_BASE64_IMAP(b))>=0) { + /* collect base64 bytes into UChars */ + switch(base64Counter) { + case -1: /* -1 is immediately after the & */ + case 0: + bits=base64Value; + base64Counter=1; + break; + case 1: + case 3: + case 4: + case 6: + bits=(uint16_t)((bits<<6)|base64Value); + ++base64Counter; + break; + case 2: + c=(UChar)((bits<<4)|(base64Value>>2)); + if(isLegalIMAP(c)) { + /* illegal */ + inDirectMode=TRUE; + goto callback; + } + *target++=c; + if(offsets!=NULL) { + *offsets++=sourceIndex; + sourceIndex=nextSourceIndex-1; + } + bytes[0]=b; /* keep this byte in case an error occurs */ + byteIndex=1; + bits=(uint16_t)(base64Value&3); + base64Counter=3; + break; + case 5: + c=(UChar)((bits<<2)|(base64Value>>4)); + if(isLegalIMAP(c)) { + /* illegal */ + inDirectMode=TRUE; + goto callback; + } + *target++=c; + if(offsets!=NULL) { + *offsets++=sourceIndex; + sourceIndex=nextSourceIndex-1; + } + bytes[0]=b; /* keep this byte in case an error occurs */ + byteIndex=1; + bits=(uint16_t)(base64Value&15); + base64Counter=6; + break; + case 7: + c=(UChar)((bits<<6)|base64Value); + if(isLegalIMAP(c)) { + /* illegal */ + inDirectMode=TRUE; + goto callback; + } + *target++=c; + if(offsets!=NULL) { + *offsets++=sourceIndex; + sourceIndex=nextSourceIndex; + } + byteIndex=0; + bits=0; + base64Counter=0; + break; + default: + /* will never occur */ + break; + } + } else if(base64Value==-2) { + /* minus sign terminates the base64 sequence */ + inDirectMode=TRUE; + if(base64Counter==-1) { + /* &- i.e. a minus immediately following an ampersand */ + *target++=AMPERSAND; + if(offsets!=NULL) { + *offsets++=sourceIndex-1; + } + } else { + /* absorb the minus and leave the Unicode Mode */ + if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) { + /* bits are illegally left over, a UChar is incomplete */ + /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */ + goto callback; + } + } + sourceIndex=nextSourceIndex; + goto directMode; + } else { + if(base64Counter==-1) { + /* illegal: & immediately followed by something other than base64 or minus sign */ + /* include the ampersand in the reported sequence */ + --sourceIndex; + bytes[0]=AMPERSAND; + bytes[1]=b; + byteIndex=2; + } + /* base64Value==-1 for characters that are illegal only in Unicode mode */ + /* base64Value==-3 for illegal characters */ + /* illegal */ + inDirectMode=TRUE; + goto callback; + } + } else { + /* target is full */ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + } + } +endloop: + + if(pArgs->flush && source>=sourceLimit) { + /* reset the state for the next conversion */ + if(!inDirectMode && U_SUCCESS(*pErrorCode)) { + /* a character byte sequence remains incomplete - IMAP must end in ASCII/direct mode */ + *pErrorCode=U_TRUNCATED_CHAR_FOUND; + } + cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */ + cnv->toULength=0; + } else { + /* set the converter state back into UConverter */ + cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits; + cnv->toULength=byteIndex; + } + +finish: + /* write back the updated pointers */ + pArgs->source=(const char *)source; + pArgs->target=target; + pArgs->offsets=offsets; + return; + +callback: + /* call the callback function with all the preparations and post-processing */ + /* update the arguments structure */ + pArgs->source=(const char *)source; + pArgs->target=target; + pArgs->offsets=offsets; + + /* copy the current bytes to invalidCharBuffer */ + for(b=0; b<(uint8_t)byteIndex; ++b) { + cnv->invalidCharBuffer[b]=(char)bytes[b]; + } + cnv->invalidCharLength=byteIndex; + + /* set the converter state in UConverter to deal with the next character */ + cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24; + cnv->toULength=0; + + /* call the callback function */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode); + + /* get the converter state from UConverter */ + { + uint32_t status=cnv->toUnicodeStatus; + inDirectMode=(UBool)((status>>24)&1); + base64Counter=(int8_t)(status>>16); + bits=(uint16_t)status; + } + byteIndex=cnv->toULength; + + /* update target and deal with offsets if necessary */ + offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex); + target=pArgs->target; + + /* update the source pointer and index */ + sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source); + source=(const uint8_t *)pArgs->source; + + /* + * If the callback overflowed the target, then we need to + * stop here with an overflow indication. + */ + if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { + goto endloop; + } else if(cnv->UCharErrorBufferLength>0) { + /* target is full */ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + goto endloop; + } else if(U_FAILURE(*pErrorCode)) { + /* break on error */ + cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */ + cnv->toULength=0; + goto finish; + } else { + goto loop; + } +} + +static void +_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv; + const UChar *source, *sourceLimit; + uint8_t *target, *targetLimit; + int32_t *offsets; + + int32_t length, targetCapacity, sourceIndex; + UChar c; + uint8_t b; + + /* UTF-7 state */ + uint8_t bits; + int8_t base64Counter; + UBool inDirectMode; + + /* set up the local pointers */ + cnv=pArgs->converter; + + /* set up the local pointers */ + source=pArgs->source; + sourceLimit=pArgs->sourceLimit; + target=(uint8_t *)pArgs->target; + targetLimit=(uint8_t *)pArgs->targetLimit; + offsets=pArgs->offsets; + + /* get the state machine state */ + { + uint32_t status=cnv->fromUnicodeStatus; + inDirectMode=(UBool)((status>>24)&1); + base64Counter=(int8_t)(status>>16); + bits=(uint8_t)status; + } + + /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */ + sourceIndex=0; + + if(inDirectMode) { +directMode: + length=sourceLimit-source; + targetCapacity=targetLimit-target; + if(length>targetCapacity) { + length=targetCapacity; + } + while(length>0) { + c=*source++; + /* encode 0x20..0x7e except '&' directly */ + if(inSetDIMAP(c)) { + /* encode directly */ + *target++=(uint8_t)c; + if(offsets!=NULL) { + *offsets++=sourceIndex++; + } + } else if(c==AMPERSAND) { + /* output &- for & */ + *target++=AMPERSAND; + if(targetcharErrorBuffer[0]=MINUS; + cnv->charErrorBufferLength=1; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + } else { + /* un-read this character and switch to Unicode Mode */ + --source; + *target++=AMPERSAND; + if(offsets!=NULL) { + *offsets++=sourceIndex; + } + inDirectMode=FALSE; + base64Counter=0; + goto unicodeMode; + } + --length; + } + if(source=targetLimit) { + /* target is full */ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + } else { +unicodeMode: + while(sourcecharErrorBuffer[0]=MINUS; + cnv->charErrorBufferLength=1; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + goto directMode; + } else { + /* + * base64 this character: + * Output 2 or 3 base64 bytes for the remaining bits of the previous character + * and the bits of this character, each implicitly in UTF-16BE. + * + * Here, bits is an 8-bit variable because only 6 bits need to be kept from one + * character to the next. The actual 2 or 4 bits are shifted to the left edge + * of the 6-bits field 5..0 to make the termination of the base64 sequence easier. + */ + switch(base64Counter) { + case 0: + b=(uint8_t)(c>>10); + *target++=TO_BASE64_IMAP(b); + if(target>4)&0x3f); + *target++=TO_BASE64_IMAP(b); + if(offsets!=NULL) { + *offsets++=sourceIndex; + *offsets++=sourceIndex++; + } + } else { + if(offsets!=NULL) { + *offsets++=sourceIndex++; + } + b=(uint8_t)((c>>4)&0x3f); + cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); + cnv->charErrorBufferLength=1; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + bits=(uint8_t)((c&15)<<2); + base64Counter=1; + break; + case 1: + b=(uint8_t)(bits|(c>>14)); + *target++=TO_BASE64_IMAP(b); + if(target>8)&0x3f); + *target++=TO_BASE64_IMAP(b); + if(target>2)&0x3f); + *target++=TO_BASE64_IMAP(b); + if(offsets!=NULL) { + *offsets++=sourceIndex; + *offsets++=sourceIndex; + *offsets++=sourceIndex++; + } + } else { + if(offsets!=NULL) { + *offsets++=sourceIndex; + *offsets++=sourceIndex++; + } + b=(uint8_t)((c>>2)&0x3f); + cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); + cnv->charErrorBufferLength=1; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + } else { + if(offsets!=NULL) { + *offsets++=sourceIndex++; + } + b=(uint8_t)((c>>8)&0x3f); + cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); + b=(uint8_t)((c>>2)&0x3f); + cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b); + cnv->charErrorBufferLength=2; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + bits=(uint8_t)((c&3)<<4); + base64Counter=2; + break; + case 2: + b=(uint8_t)(bits|(c>>12)); + *target++=TO_BASE64_IMAP(b); + if(target>6)&0x3f); + *target++=TO_BASE64_IMAP(b); + if(targetcharErrorBuffer[0]=TO_BASE64_IMAP(b); + cnv->charErrorBufferLength=1; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + } else { + if(offsets!=NULL) { + *offsets++=sourceIndex++; + } + b=(uint8_t)((c>>6)&0x3f); + cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); + b=(uint8_t)(c&0x3f); + cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b); + cnv->charErrorBufferLength=2; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + bits=0; + base64Counter=0; + break; + default: + /* will never occur */ + break; + } + } + } else { + /* target is full */ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + } + } + + if(pArgs->flush && source>=sourceLimit) { + /* flush remaining bits to the target */ + if(!inDirectMode) { + if(base64Counter!=0) { + if(targetcharErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits); + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + } + /* need to terminate with a minus */ + if(targetcharErrorBuffer[cnv->charErrorBufferLength++]=MINUS; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + } + /* reset the state for the next conversion */ + cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */ + } else { + /* set the converter state back into UConverter */ + cnv->fromUnicodeStatus= + (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/ + ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits; + } + + /* write back the updated pointers */ + pArgs->source=source; + pArgs->target=(char *)target; + pArgs->offsets=offsets; + return; +} + +static const UConverterImpl _IMAPImpl={ + UCNV_IMAP_MAILBOX, + + NULL, + NULL, + + _UTF7Open, + NULL, + _UTF7Reset, + + _IMAPToUnicodeWithOffsets, + _IMAPToUnicodeWithOffsets, + _IMAPFromUnicodeWithOffsets, + _IMAPFromUnicodeWithOffsets, + _UTF7GetNextUChar, + + NULL, + NULL, + NULL /* we don't need writeSub() because we never call a callback at fromUnicode() */ +}; + +static const UConverterStaticData _IMAPStaticData={ + sizeof(UConverterStaticData), + "IMAP-mailbox-name", + 0, /* TODO CCSID for UTF-7 */ + UCNV_IBM, UCNV_IMAP_MAILBOX, + 1, 4, + { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */ + FALSE, FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + +const UConverterSharedData _IMAPData={ + sizeof(UConverterSharedData), ~((uint32_t)0), + NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl, + 0 +}; diff --git a/icu4c/source/common/unicode/ucnv.h b/icu4c/source/common/unicode/ucnv.h index 795ff28bffb..66a593561bd 100644 --- a/icu4c/source/common/unicode/ucnv.h +++ b/icu4c/source/common/unicode/ucnv.h @@ -104,6 +104,7 @@ typedef enum { UCNV_UTF16, UCNV_UTF32, UCNV_CESU8, + UCNV_IMAP_MAILBOX, /* Number of converter types for which we have conversion routines. */ UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES diff --git a/icu4c/source/data/mappings/convrtrs.txt b/icu4c/source/data/mappings/convrtrs.txt index 84918cef21e..4e13081b937 100644 --- a/icu4c/source/data/mappings/convrtrs.txt +++ b/icu4c/source/data/mappings/convrtrs.txt @@ -148,6 +148,14 @@ UTF32_OppositeEndian # For details about email headers see RFC 2047. UTF-7 { IANA* MIME* } cp65000 +# IMAP-mailbox-name is an ICU-specific name for the encoding of IMAP mailbox names. +# It is a substantially modified UTF-7 encoding. See the specification in: +# +# RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1 +# (http://www.ietf.org/rfc/rfc2060.txt) +# Section 5.1.3. Mailbox International Naming Convention +IMAP-mailbox-name + SCSU { IANA* } BOCU-1 { IANA* } csBOCU-1 { IANA } diff --git a/icu4c/source/test/cintltst/nccbtst.c b/icu4c/source/test/cintltst/nccbtst.c index 2332e234e1e..fe23d310c4f 100644 --- a/icu4c/source/test/cintltst/nccbtst.c +++ b/icu4c/source/test/cintltst/nccbtst.c @@ -1707,6 +1707,26 @@ static void TestSub(int32_t inputsize, int32_t outputsize) } } + log_verbose("Testing IMAP-mailbox-name toUnicode with substitute callbacks\n"); + { + static const uint8_t bytes[]={ + /* aDEL a&AB~ a&AB\x0c a&AB- a&AB. a&. */ + 0x61, 0x7f, 0x61, 0x26, 0x41, 0x42, 0x7e, 0x61, 0x26, 0x41, 0x42, 0x0c, 0x61, 0x26, 0x41, 0x42, 0x2d, 0x61, 0x26, 0x41, 0x42, 0x2e, 0x61, 0x26, 0x2e + }; + static const UChar unicode[]={ + 0x61, 0xfffd, 0x61, 0xfffd, 0x61, 0xfffd, 0x61, 0xfffd, 0x61, 0xfffd, 0x61, 0xfffd + }; + static const int32_t offsets[]={ + 0, 1, 2, 4, 7, 9, 12, 14, 17, 19, 22, 23 + }; + + if(!testConvertToUnicode(bytes, ARRAY_LENGTH(bytes), unicode, ARRAY_LENGTH(unicode), "IMAP-mailbox-name", + UCNV_TO_U_CALLBACK_SUBSTITUTE, offsets, NULL, 0) + ) { + log_err("IMAP-mailbox-name->u with substitute did not match.\n"); + } + } + log_verbose("Testing UTF-16 toUnicode with substitute callbacks\n"); { static const uint8_t diff --git a/icu4c/source/test/cintltst/ncnvtst.c b/icu4c/source/test/cintltst/ncnvtst.c index 570ffdee548..c4271c3d557 100644 --- a/icu4c/source/test/cintltst/ncnvtst.c +++ b/icu4c/source/test/cintltst/ncnvtst.c @@ -1772,11 +1772,16 @@ doTestTruncated(const char *cnvName, const uint8_t *bytes, int32_t length) { static void TestTruncated() { - struct { + static const struct { const char *cnvName; uint8_t bytes[8]; /* partial input bytes resulting in no output */ int32_t length; } testCases[]={ + { "IMAP-mailbox-name", { 0x26 }, 1 }, /* & */ + { "IMAP-mailbox-name", { 0x26, 0x42 }, 2 }, /* &B */ + { "IMAP-mailbox-name", { 0x26, 0x42, 0x42 }, 3 }, /* &BB */ + { "IMAP-mailbox-name", { 0x26, 0x41, 0x41 }, 3 }, /* &AA */ + { "UTF-7", { 0x2b, 0x42 }, 2 }, /* +B */ { "UTF-8", { 0xd1 }, 1 }, diff --git a/icu4c/source/test/cintltst/nucnvtst.c b/icu4c/source/test/cintltst/nucnvtst.c index 78ce75be1c8..46e832d7d23 100644 --- a/icu4c/source/test/cintltst/nucnvtst.c +++ b/icu4c/source/test/cintltst/nucnvtst.c @@ -34,6 +34,7 @@ static void TestConverterTypesAndStarters(void); static void TestAmbiguous(void); static void TestSignatureDetection(void); static void TestUTF7(void); +static void TestIMAP(void); static void TestUTF8(void); static void TestCESU8(void); static void TestUTF16(void); @@ -145,7 +146,9 @@ TestNextUChar(UConverter* cnv, const char* source, const char* limit, const uint while(s-! + A. + & + [Japanese word "nihongo"] + \ + ~peter + /mail + /<65e5, 672c, 8a9e> + /<53f0, 5317> + */ + 0x48, 0x69, 0x20, 0x4d, 0x6f, 0x6d, 0x20, 0x2d, 0x263a, 0x2d, 0x21, + 0x41, 0x2262, 0x0391, 0x2e, + 0x26, + 0x65e5, 0x672c, 0x8a9e, + 0x5c, + 0x7e, 0x70, 0x65, 0x74, 0x65, 0x72, + 0x2f, 0x6d, 0x61, 0x69, 0x6c, + 0x2f, 0x65e5, 0x672c, 0x8a9e, + 0x2f, 0x53f0, 0x5317 + }; + static const int32_t toUnicodeOffsets[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 9, 13, 14, + 15, 17, 19, 24, + 25, + 28, 30, 33, + 37, + 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, + 49, 51, 53, 56, + 60, 62, 64 + }; + static const int32_t fromUnicodeOffsets[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 9, 10, + 11, 12, 12, 12, 13, 13, 13, 13, 13, 14, + 15, 15, + 16, 16, 16, 17, 17, 17, 18, 18, 18, 18, + 19, + 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, + 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, + 35, 36, 36, 36, 37, 37, 37, 37, 37 + }; + + testConvertFromU(unicode, sizeof(unicode)/U_SIZEOF_UCHAR, imap, sizeof(imap), "IMAP-mailbox-name", fromUnicodeOffsets,FALSE); + + testConvertToU(imap, sizeof(imap), unicode, sizeof(unicode)/U_SIZEOF_UCHAR, "IMAP-mailbox-name", toUnicodeOffsets,FALSE); } /* Test UTF-8 bad data handling*/ @@ -1224,6 +1304,7 @@ static void TestConverterTypesAndStarters() TestConverterType("x-iscii-de", UCNV_ISCII); TestConverterType("ascii", UCNV_US_ASCII); TestConverterType("utf-7", UCNV_UTF7); + TestConverterType("IMAP-mailbox-name", UCNV_IMAP_MAILBOX); TestConverterType("bocu-1", UCNV_BOCU1); } @@ -1595,6 +1676,50 @@ static TestUTF7() { ucnv_close(cnv); } +void +static TestIMAP() { + /* test input */ + static const uint8_t in[]={ + /* H - &Jjo- - ! &- &2AHcAQ- \ */ + 0x48, + 0x2d, + 0x26, 0x4a, 0x6a, 0x6f, + 0x2d, 0x2d, + 0x21, + 0x26, 0x2d, + 0x26, 0x32, 0x41, 0x48, 0x63, 0x41, 0x51, 0x2d + }; + + /* expected test results */ + static const uint32_t results[]={ + /* number of bytes read, code point */ + 1, 0x48, + 1, 0x2d, + 4, 0x263a, /* */ + 2, 0x2d, + 1, 0x21, + 2, 0x26, + 7, 0x10401 + }; + + const char *cnvName; + const char *source=(const char *)in, *limit=(const char *)in+sizeof(in); + UErrorCode errorCode=U_ZERO_ERROR; + UConverter *cnv=ucnv_open("IMAP-mailbox-name", &errorCode); + if(U_FAILURE(errorCode)) { + log_err("Unable to open a IMAP-mailbox-name converter: %s\n", u_errorName(errorCode)); /* sholdn't be a data err */ + return; + } + TestNextUChar(cnv, source, limit, results, "IMAP-mailbox-name"); + /* Test the condition when source >= sourceLimit */ + TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source"); + cnvName = ucnv_getName(cnv, &errorCode); + if (U_FAILURE(errorCode) || uprv_strcmp(cnvName, "IMAP-mailbox-name") != 0) { + log_err("IMAP-mailbox-name converter is called %s: %s\n", cnvName, u_errorName(errorCode)); + } + ucnv_close(cnv); +} + void static TestUTF8() { /* test input */ @@ -2596,7 +2721,9 @@ TestGetNextUChar2022(UConverter* cnv, const char* source, const char* limit, while(s