diff --git a/icu4c/source/common/ucnv_ext.cpp b/icu4c/source/common/ucnv_ext.cpp index 51d1ba03375..5cd1ab61747 100644 --- a/icu4c/source/common/ucnv_ext.cpp +++ b/icu4c/source/common/ucnv_ext.cpp @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 2003-2011, International Business Machines +* Copyright (C) 2003-2013, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** @@ -446,6 +446,15 @@ ucnv_extContinueMatchToU(UConverter *cnv, /* from Unicode ------------------------------------------------------------- */ +// Use roundtrips, "good one-way" mappings, and some normal fallbacks. +static inline UBool +extFromUUseMapping(UBool useFallback, uint32_t value, UChar32 firstCP) { + return + ((value&UCNV_EXT_FROM_U_STATUS_MASK)!=0 || + FROM_U_USE_FALLBACK(useFallback, firstCP)) && + (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0; +} + /* * @return index of the UChar, if found; else <0 */ @@ -580,11 +589,7 @@ ucnv_extMatchFromU(const int32_t *cx, /* read first pair of the section */ length=*fromUSectionUChars++; value=*fromUSectionValues++; - if( value!=0 && - (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || - FROM_U_USE_FALLBACK(useFallback, firstCP)) && - (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 - ) { + if(value!=0 && extFromUUseMapping(useFallback, value, firstCP)) { /* remember longest match so far */ matchValue=value; matchLength=2+i+j; @@ -621,10 +626,7 @@ ucnv_extMatchFromU(const int32_t *cx, /* partial match, continue */ idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); } else { - if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || - FROM_U_USE_FALLBACK(useFallback, firstCP)) && - (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 - ) { + if(extFromUUseMapping(useFallback, value, firstCP)) { /* full match, stop with result */ matchValue=value; matchLength=2+i+j; @@ -641,10 +643,7 @@ ucnv_extMatchFromU(const int32_t *cx, 
return 0; } } else /* result from firstCP trie lookup */ { - if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || - FROM_U_USE_FALLBACK(useFallback, firstCP)) && - (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 - ) { + if(extFromUUseMapping(useFallback, value, firstCP)) { /* full match, stop with result */ matchValue=value; matchLength=2; @@ -944,13 +943,38 @@ ucnv_extContinueMatchFromU(UConverter *cnv, } } +static UBool +extSetUseMapping(UConverterUnicodeSet which, int32_t minLength, uint32_t value) { + if(which==UCNV_ROUNDTRIP_SET) { + // Add only code points for which the roundtrip flag is set. + // Do not add any fallbacks, even if ucnv_fromUnicode() would use them + // (fallbacks from PUA). See the API docs for ucnv_getUnicodeSet(). + // + // By analogy, also do not add "good one-way" mappings. + // + // Do not add entries with reserved bits set. + if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))!= + UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) { + return FALSE; + } + } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { + // Do not add entries with reserved bits set. + if((value&UCNV_EXT_FROM_U_RESERVED_MASK)!=0) { + return FALSE; + } + } + // Do not add entries or other (future?) pseudo-entries + // with an output length of 0. 
+ return UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength; +} + static void ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, const int32_t *cx, const USetAdder *sa, - UBool useFallback, + UConverterUnicodeSet which, int32_t minLength, - UChar32 c, + UChar32 firstCP, UChar s[UCNV_EXT_MAX_UCHARS], int32_t length, int32_t sectionIndex, UErrorCode *pErrorCode) { @@ -967,13 +991,10 @@ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, count=*fromUSectionUChars++; value=*fromUSectionValues++; - if( value!=0 && - (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) && - UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength - ) { - if(c>=0) { + if(extSetUseMapping(which, minLength, value)) { + if(length==U16_LENGTH(firstCP)) { /* add the initial code point */ - sa->add(sa->set, c); + sa->add(sa->set, firstCP); } else { /* add the string so far */ sa->addString(sa->set, s, length); @@ -989,16 +1010,11 @@ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, /* no mapping, do nothing */ } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { ucnv_extGetUnicodeSetString( - sharedData, cx, sa, useFallback, minLength, - U_SENTINEL, s, length+1, + sharedData, cx, sa, which, minLength, + firstCP, s, length+1, (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), pErrorCode); - } else if((useFallback ? 
- (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : - ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== - UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && - UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength - ) { + } else if(extSetUseMapping(which, minLength, value)) { sa->addString(sa->set, s, length+1); } } @@ -1016,7 +1032,6 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, uint32_t value; int32_t st1, stage1Length, st2, st3, minLength; - UBool useFallback; UChar s[UCNV_EXT_MAX_UCHARS]; UChar32 c; @@ -1033,8 +1048,6 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; - useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); - /* enumerate the from-Unicode trie table */ c=0; /* keep track of the current code point while enumerating */ @@ -1062,30 +1075,20 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, /* read the stage 3 block */ ps3=stage3+st3; - /* - * Add code points for which the roundtrip flag is set. - * Do not add entries or other (future?) pseudo-entries - * with an output length of 0, or entries with reserved bits set. - * Recurse for partial results. - */ do { value=stage3b[*ps3++]; if(value==0) { /* no mapping, do nothing */ } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { + // Recurse for partial results. length=0; U16_APPEND_UNSAFE(s, length, c); ucnv_extGetUnicodeSetString( - sharedData, cx, sa, useFallback, minLength, + sharedData, cx, sa, which, minLength, c, s, length, (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), pErrorCode); - } else if((useFallback ? 
- (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : - ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== - UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && - UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength - ) { + } else if(extSetUseMapping(which, minLength, value)) { switch(filter) { case UCNV_SET_FILTER_2022_CN: if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) { diff --git a/icu4c/source/common/ucnv_ext.h b/icu4c/source/common/ucnv_ext.h index e3e46f4b305..82ea0f6bc45 100644 --- a/icu4c/source/common/ucnv_ext.h +++ b/icu4c/source/common/ucnv_ext.h @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 2003-2007, International Business Machines +* Copyright (C) 2003-2013, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** @@ -29,10 +29,12 @@ /* * See icuhtml/design/conversion/conversion_extensions.html * - * Conversion extensions serve two purposes: + * Conversion extensions serve three purposes: * 1. They support m:n mappings. * 2. They support extension-only conversion files that are used together * with the regular conversion data in base files. + * 3. They support mappings with more complicated meta data, + * for example "good one-way" mappings (|4). 
* * A base file may contain an extension table (explicitly requested or * implicitly generated for m:n mappings), but its extension table is not @@ -229,11 +231,13 @@ * return no mapping, but request for <subchar1>; * } * if(bit 31 set) { - * roundtrip; + * roundtrip (|0); + * } else if(bit 30 set) { + * "good one-way" mapping (|4); * } else { - * fallback; + * normal fallback (|1); * } - * // bits 30..29 reserved, 0 + * // bit 29 reserved, 0 * length=(value>>24)&0x1f; (bits 28..24) * if(length==1..3) { * bits 23..0 contain 1..3 bytes, padded with 00s on the left; @@ -444,7 +448,9 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, #define UCNV_EXT_FROM_U_LENGTH_SHIFT 24 #define UCNV_EXT_FROM_U_ROUNDTRIP_FLAG ((uint32_t)1<<31) -#define UCNV_EXT_FROM_U_RESERVED_MASK 0x60000000 +#define UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG 0x40000000 +#define UCNV_EXT_FROM_U_STATUS_MASK 0xc0000000 +#define UCNV_EXT_FROM_U_RESERVED_MASK 0x20000000 #define UCNV_EXT_FROM_U_DATA_MASK 0xffffff /* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */ diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt index 3e3d9456d73..028457f9e96 100644 --- a/icu4c/source/test/testdata/conversion.txt +++ b/icu4c/source/test/testdata/conversion.txt @@ -1004,6 +1004,20 @@ conversion:table(nofallback) { fromUnicode { Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" } Cases { + // Test ticket 9602: Add "good one-way" mapping type (|4). + // Such mappings are used regardless of the fallback flag. 
+ { + "*test3", "##\uFE0E#\uFE0F", + :bin{ 010204010204010204 }, :intvector{ 0,0,0,1,1,1,3,3,3 }, + :int{1}, :int{0}, // no fallbacks + "", "?", "" + } + { + "*test3", "##\uFE0E#\uFE0F", + :bin{ 010204010204010204 }, :intvector{ 0,0,0,1,1,1,3,3,3 }, + :int{1}, :int{1}, // with fallbacks + "", "?", "" + } // Test ticket 6789: implement Java-compatible Unicode, UnicodeBig and UnicodeLittle converters // For details about these encodings see convrtrs.txt. // Standard UTF-16BE @@ -1833,6 +1847,20 @@ conversion:table(nofallback) { // which - numeric UConverterUnicodeSet value Headers { "charset", "map", "mapnot", "which" } Cases { + // Test ticket 9602: Add "good one-way" mapping type (|4). + // Excluded from roundtrip set, included in the set with fallbacks. + { + "*test3", + "[{#\uFE0F}]", + "[#{#\uFE0E}]", + :int{0} + } + { + "*test3", + "[#{#\uFE0E}{#\uFE0F}]", + "[]", + :int{1} + } // Unicode charsets that do not map surrogate code points { "UTF-8", diff --git a/icu4c/source/test/testdata/test3.ucm b/icu4c/source/test/testdata/test3.ucm index aee69c28982..57a798467dd 100644 --- a/icu4c/source/test/testdata/test3.ucm +++ b/icu4c/source/test/testdata/test3.ucm @@ -1,5 +1,5 @@ # ******************************************************************************* -# * Copyright (C) 2001-2003, International Business Machines +# * Copyright (C) 2001-2013, International Business Machines # * Corporation and others. All Rights Reserved. 
# ******************************************************************************* # @@ -15,7 +15,7 @@ \xff <icu:state> 0, 1:1, 5-9, ff <icu:state> 2:2 -<icu:state> a-f.p +<icu:state> 4, a-f.p CHARMAP @@ -57,4 +57,9 @@ CHARMAP \x01\x02\x0e |3 #unassigned \x01\x02\x0f +# "good one-way" mappings +<U0023> \x01\x02\x04 |4 +<U0023>+<UFE0E> \x01\x02\x04 |4 +<U0023>+<UFE0F> \x01\x02\x04 |0 + END CHARMAP diff --git a/icu4c/source/tools/makeconv/gencnvex.c b/icu4c/source/tools/makeconv/gencnvex.c index 3b97e0c9b51..10a0571c6b9 100644 --- a/icu4c/source/tools/makeconv/gencnvex.c +++ b/icu4c/source/tools/makeconv/gencnvex.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2003-2012, International Business Machines +* Copyright (C) 2003-2013, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -606,7 +606,7 @@ prepareFromUMappings(UCMTable *table) { flag&=MBCS_FROM_U_EXT_MASK; m->f=flag; } - if(flag==0 || flag==1 || (flag==2 && m->bLen==1)) { + if(flag==0 || flag==1 || (flag==2 && m->bLen==1) || flag==4) { map[j++]=i; if(m->uLen>1) { @@ -672,6 +672,8 @@ getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) { value|=(uint32_t)m->bLen<<UCNV_EXT_FROM_U_LENGTH_SHIFT; if(m->f==0) { value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG; + } else if(m->f==4) { + value|=UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG; } /* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */ diff --git a/icu4c/source/tools/makeconv/genmbcs.cpp b/icu4c/source/tools/makeconv/genmbcs.cpp index 556d8139419..4511765da65 100644 --- a/icu4c/source/tools/makeconv/genmbcs.cpp +++ b/icu4c/source/tools/makeconv/genmbcs.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2000-2012, International Business Machines +* Copyright (C) 2000-2013, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -1049,6 +1049,11 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati staticData->hasToUnicodeFallback=TRUE; isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f); break; + case 4: + /* move "good one-way" mappings to the extension table */ + m->f|=MBCS_FROM_U_EXT_FLAG; + m->moveFlag=UCM_MOVE_TO_EXT; + break; default: /* will not occur because the parser checked it already */ fprintf(stderr, "error: illegal fallback indicator %d\n", f); diff --git a/icu4c/source/tools/toolutil/ucm.c b/icu4c/source/tools/toolutil/ucm.c index 8942a3aaa0f..adb3ebd2d0a 100644 --- a/icu4c/source/tools/toolutil/ucm.c +++ b/icu4c/source/tools/toolutil/ucm.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2003-2012, International Business Machines +* Copyright (C) 2003-2013, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -327,7 +327,7 @@ checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, return result; } - if(0<=mb->f && mb->f<=2) { + if((0<=mb->f && mb->f<=2) || mb->f==4) { break; } @@ -339,7 +339,7 @@ checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, return result; } - if(0<=me->f && me->f<=2) { + if((0<=me->f && me->f<=2) || me->f==4) { break; } @@ -857,8 +857,8 @@ ucm_parseMappingLine(UCMapping *m, break; } else if(*s=='|') { f=(int8_t)(s[1]-'0'); - if((uint8_t)f>3) { - fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line); + if((uint8_t)f>4) { + fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line); return FALSE; } break; @@ -1051,6 +1051,7 @@ ucm_mappingType(UCMStates *baseStates, /* * Suitable for an ICU conversion base table means: * - a 1:1 mapping (1 Unicode code point : 1 byte sequence) + * - precision flag 0..3 * - SBCS: any 1:1 mapping * (the table stores additional bits to distinguish mapping types) * - MBCS: not a |2 SUB mapping for <subchar1> @@ -1070,7 +1071,7 @@ ucm_mappingType(UCMStates *baseStates, * makeconv uses a hack for moving mappings only for the fromUnicode table * that only works with non-negative values of f. 
*/ - if( m->uLen==1 && count==1 && + if( m->uLen==1 && count==1 && m->f<=3 && (baseStates->maxCharLength==1 || !((m->f==2 && m->bLen==1) || (m->f==1 && bytes[0]==0) || @@ -1146,7 +1147,7 @@ ucm_readTable(UCMFile *ucm, FileStream* convFile, char line[500]; char *end; UBool isOK; - + if(U_FAILURE(*pErrorCode)) { return; } diff --git a/icu4c/source/tools/toolutil/ucm.h b/icu4c/source/tools/toolutil/ucm.h index 20324f54c70..cda67d31342 100644 --- a/icu4c/source/tools/toolutil/ucm.h +++ b/icu4c/source/tools/toolutil/ucm.h @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 2003-2010, International Business Machines + * Copyright (C) 2003-2013, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: ucm.h @@ -45,7 +45,8 @@ enum { * bIsMultipleChars indicates that the bytes contain more than one sequence * according to the state table * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3) - * same values as in the source file after | + * or "good one-way" mapping (4). + * Same values as in the source file after | */ typedef struct UCMapping { UChar32 u;