diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index b3fb587ee92..5cf034f8e9b 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -82,7 +82,7 @@ udata.o ucmndata.o udatamem.o umapfile.o udataswp.o ucol_swp.o utrace.o \ uhash.o uhash_us.o uenum.o ustrenum.o uvector.o ustack.o uvectr32.o uvectr64.o \ ucnv.o ucnv_bld.o ucnv_cnv.o ucnv_io.o ucnv_cb.o ucnv_err.o ucnvlat1.o \ ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \ -ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_set.o \ +ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_set.o ucnv_ct.o \ uresbund.o ures_cnv.o uresdata.o resbund.o resbund_cnv.o \ ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o loclikely.o locresdata.o \ bytestream.o stringpiece.o \ diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj index 86999e20a1a..308ae3f44a2 100644 --- a/icu4c/source/common/common.vcxproj +++ b/icu4c/source/common/common.vcxproj @@ -305,6 +305,7 @@ + diff --git a/icu4c/source/common/ucnv_bld.c b/icu4c/source/common/ucnv_bld.c index a95b7b8172d..657e5c811f1 100644 --- a/icu4c/source/common/ucnv_bld.c +++ b/icu4c/source/common/ucnv_bld.c @@ -86,7 +86,8 @@ converterData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]={ #endif &_ASCIIData, - &_UTF7Data, &_Bocu1Data, &_UTF16Data, &_UTF32Data, &_CESU8Data, &_IMAPData + &_UTF7Data, &_Bocu1Data, &_UTF16Data, &_UTF32Data, &_CESU8Data, &_IMAPData, + &_CompoundTextData }; /* Please keep this in binary sorted order for getAlgorithmicTypeFromName. @@ -145,7 +146,8 @@ static struct { { "utf32platformendian", UCNV_UTF32_LittleEndian }, #endif { "utf7", UCNV_UTF7 }, - { "utf8", UCNV_UTF8 } + { "utf8", UCNV_UTF8 }, + { "x11compoundtext", UCNV_COMPOUND_TEXT} }; diff --git a/icu4c/source/common/ucnv_cnv.h b/icu4c/source/common/ucnv_cnv.h index cba24d06c51..6334b2417f8 100644 --- a/icu4c/source/common/ucnv_cnv.h +++ b/icu4c/source/common/ucnv_cnv.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1999-2009, International Business Machines +* Copyright (C) 1999-2010, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * @@ -257,7 +257,7 @@ extern const UConverterSharedData _LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6, _LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19, _HZData,_ISCIIData, _SCSUData, _ASCIIData, - _UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data, _IMAPData; + _UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data, _IMAPData, _CompoundTextData; U_CDECL_END diff --git a/icu4c/source/common/ucnv_ct.c b/icu4c/source/common/ucnv_ct.c new file mode 100644 index 00000000000..a74b0e50585 --- /dev/null +++ b/icu4c/source/common/ucnv_ct.c @@ -0,0 +1,633 @@ +/* +********************************************************************** +* Copyright (C) 2010, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* file name: ucnv_ct.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2010Dec09 +* created by: Michael Ow +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION + +#include "unicode/ucnv.h" +#include "unicode/uset.h" +#include "unicode/ucnv_err.h" +#include "unicode/ucnv_cb.h" +#include "ucnv_imp.h" +#include "ucnv_bld.h" +#include "ucnv_cnv.h" +#include "ucnvmbcs.h" +#include "cstring.h" +#include "cmemory.h" + +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) + +typedef enum { + INVALID = -2, + DO_SEARCH = -1, + + COMPOUND_TEXT_SINGLE_0 = 0, + COMPOUND_TEXT_SINGLE_1 = 1, + COMPOUND_TEXT_SINGLE_2 = 2, + COMPOUND_TEXT_SINGLE_3 = 3, + + COMPOUND_TEXT_DOUBLE_1 = 4, + COMPOUND_TEXT_DOUBLE_2 = 5, + COMPOUND_TEXT_DOUBLE_3 = 6, + COMPOUND_TEXT_DOUBLE_4 = 7, + COMPOUND_TEXT_DOUBLE_5 = 8, + COMPOUND_TEXT_DOUBLE_6 = 9, + COMPOUND_TEXT_DOUBLE_7 = 10, + + COMPOUND_TEXT_TRIPLE_DOUBLE = 11, + + IBM_915 = 12, + IBM_916 = 13, + IBM_914 = 14, + IBM_874 = 15, + IBM_912 = 16, + IBM_913 = 17, + ISO_8859_14 = 18, + IBM_923 = 19, + NUM_OF_CONVERTERS = 20 +} COMPOUND_TEXT_CONVERTERS; + +#define SEARCH_LENGTH 12 + +static const uint8_t escSeqCompoundText[NUM_OF_CONVERTERS][5] = { + /* Single */ + { 0x1B, 0x2D, 0x41, 0, 0 }, + { 0x1B, 0x2D, 0x4D, 0, 0 }, + { 0x1B, 0x2D, 0x46, 0, 0 }, + { 0x1B, 0x2D, 0x47, 0, 0 }, + + /* Double */ + { 0x1B, 0x24, 0x29, 0x41, 0 }, + { 0x1B, 0x24, 0x29, 0x42, 0 }, + { 0x1B, 0x24, 0x29, 0x43, 0 }, + { 0x1B, 0x24, 0x29, 0x44, 0 }, + { 0x1B, 0x24, 0x29, 0x47, 0 }, + { 0x1B, 0x24, 0x29, 0x48, 0 }, + { 0x1B, 0x24, 0x29, 0x49, 0 }, + + /* Triple/Double */ + { 0x1B, 0x25, 0x47, 0, 0 }, + + /*IBM-915*/ + { 0x1B, 0x2D, 0x4C, 0, 0 }, + /*IBM-916*/ + { 0x1B, 0x2D, 0x48, 0, 0 }, + /*IBM-914*/ + { 0x1B, 0x2D, 0x44, 0, 0 }, + /*IBM-874*/ + { 0x1B, 0x2D, 0x54, 0, 0 }, + /*IBM-912*/ + { 0x1B, 0x2D, 0x42, 0, 0 }, + /* IBM-913 */ + { 0x1B, 0x2D, 0x43, 0, 0 }, + /* ISO-8859_14 */ + { 0x1B, 0x2D, 0x5F, 0, 0 }, + /* IBM-923 */ + { 0x1B, 0x2D, 0x62, 0, 0 }, +}; + +#define ESC_START 0x1B + +#define isASCIIRange(codepoint) \ + ((codepoint == 0x0000) || (codepoint == 0x0009) || (codepoint == 0x000A) || \ + (codepoint >= 0x0020 && codepoint <= 0x007f) || (codepoint >= 0x00A0 && codepoint <= 0x00FF)) + +#define isIBM915(codepoint) \ + ((codepoint >= 0x0401 && codepoint <= 0x045F) || (codepoint == 0x2116)) + +#define isIBM916(codepoint) \ + ((codepoint >= 0x05D0 && codepoint <= 0x05EA) || (codepoint == 0x2017) || (codepoint == 0x203E)) + +#define isCompoundS3(codepoint) \ + ((codepoint == 0x060C) || (codepoint == 0x061B) || (codepoint == 0x061F) || (codepoint >= 0x0621 && codepoint <= 0x063A) || \ + (codepoint >= 0x0640 && codepoint <= 0x0652) || (codepoint >= 0x0660 && codepoint <= 0x066D) || (codepoint == 0x200B) || \ + (codepoint >= 0x0FE70 && codepoint <= 0x0FE72) || (codepoint == 0x0FE74) || (codepoint >= 0x0FE76 && codepoint <= 0x0FEBE)) + +#define isCompoundS2(codepoint) \ + ((codepoint == 0x02BC) || (codepoint == 0x02BD) || (codepoint >= 0x0384 && codepoint <= 0x03CE) || (codepoint == 0x2015)) + +#define isIBM914(codepoint) \ + ((codepoint == 0x0100) || (codepoint == 0x0101) || (codepoint == 0x0112) || (codepoint == 0x0113) || (codepoint == 0x0116) || (codepoint == 0x0117) || \ + (codepoint == 0x0122) || (codepoint == 0x0123) || (codepoint >= 0x0128 && codepoint <= 0x012B) || (codepoint == 0x012E) || (codepoint == 0x012F) || \ + (codepoint >= 0x0136 && codepoint <= 0x0138) || (codepoint == 0x013B) || (codepoint == 0x013C) || (codepoint == 0x0145) || (codepoint == 0x0146) || \ + (codepoint >= 0x014A && codepoint <= 0x014D) || (codepoint == 0x0156) || (codepoint == 0x0157) || (codepoint >= 0x0166 && codepoint <= 0x016B) || \ + (codepoint == 0x0172) || (codepoint == 0x0173)) + +#define isIBM874(codepoint) \ + ((codepoint >= 0x0E01 && codepoint <= 0x0E3A) || (codepoint >= 0x0E3F && codepoint <= 0x0E5B)) + +#define isIBM912(codepoint) \ + ((codepoint >= 0x0102 && codepoint <= 0x0107) || (codepoint >= 0x010C && codepoint <= 0x0111) || (codepoint >= 0x0118 && codepoint <= 0x011B) || \ + (codepoint == 0x0139) || (codepoint == 0x013A) || (codepoint == 0x013D) || (codepoint == 0x013E) || (codepoint >= 0x0141 && codepoint <= 0x0144) || \ + (codepoint == 0x0147) || (codepoint == 0x0147) || (codepoint == 0x0150) || (codepoint == 0x0151) || (codepoint == 0x0154) || (codepoint == 0x0155) || \ + (codepoint >= 0x0158 && codepoint <= 0x015B) || (codepoint == 0x015E) || (codepoint == 0x015F) || (codepoint >= 0x0160 && codepoint <= 0x0165) || \ + (codepoint == 0x016E) || (codepoint == 0x016F) || (codepoint == 0x0170) || (codepoint == 0x0171) || (codepoint >= 0x0179 && codepoint <= 0x017E) || \ + (codepoint == 0x02C7) || (codepoint == 0x02D8) || (codepoint == 0x02D9) || (codepoint == 0x02DB) || (codepoint == 0x02DD)) + +#define isIBM913(codepoint) \ + ((codepoint >= 0x0108 && codepoint <= 0x010B) || (codepoint == 0x011C) || \ + (codepoint == 0x011D) || (codepoint == 0x0120) || (codepoint == 0x0121) || \ + (codepoint >= 0x0124 && codepoint <= 0x0127) || (codepoint == 0x0134) || (codepoint == 0x0135) || \ + (codepoint == 0x015C) || (codepoint == 0x015D) || (codepoint == 0x016C) || (codepoint == 0x016D)) + +#define isCompoundS1(codepoint) \ + ((codepoint == 0x011E) || (codepoint == 0x011F) || (codepoint == 0x0130) || \ + (codepoint == 0x0131) || (codepoint >= 0x0218 && codepoint <= 0x021B)) + +#define isISO8859_14(codepoint) \ + ((codepoint >= 0x0174 && codepoint <= 0x0177) || (codepoint == 0x1E0A) || \ + (codepoint == 0x1E0B) || (codepoint == 0x1E1E) || (codepoint == 0x1E1F) || \ + (codepoint == 0x1E40) || (codepoint == 0x1E41) || (codepoint == 0x1E56) || \ + (codepoint == 0x1E57) || (codepoint == 0x1E60) || (codepoint == 0x1E61) || \ + (codepoint == 0x1E6A) || (codepoint == 0x1E6B) || (codepoint == 0x1EF2) || \ + (codepoint == 0x1EF3) || (codepoint >= 0x1E80 && codepoint <= 0x1E85)) + +#define isIBM923(codepoint) \ + ((codepoint == 0x0152) || (codepoint == 0x0153) || (codepoint == 0x0178) || (codepoint == 0x20AC)) + + +typedef struct{ + UConverterSharedData *myConverterArray[NUM_OF_CONVERTERS]; + COMPOUND_TEXT_CONVERTERS state; +} UConverterDataCompoundText; + +static int32_t findNextEsc(const char *source, const char *sourceLimit) { + int32_t length = sourceLimit - source; + int32_t i; + for (i = 0; i < length; i++) { + if (*(source + i) == 0x1B) { + return i; + } + } + + return length; +} + +static COMPOUND_TEXT_CONVERTERS getState(int codepoint) { + COMPOUND_TEXT_CONVERTERS state = DO_SEARCH; + + if (isASCIIRange(codepoint)) { + state = COMPOUND_TEXT_SINGLE_0; + } else if (isIBM912(codepoint)) { + state = IBM_912; + }else if (isIBM913(codepoint)) { + state = IBM_913; + } else if (isISO8859_14(codepoint)) { + state = ISO_8859_14; + } else if (isIBM923(codepoint)) { + state = IBM_923; + } else if (isIBM874(codepoint)) { + state = IBM_874; + } else if (isIBM914(codepoint)) { + state = IBM_914; + } else if (isCompoundS2(codepoint)) { + state = COMPOUND_TEXT_SINGLE_2; + } else if (isCompoundS3(codepoint)) { + state = COMPOUND_TEXT_SINGLE_3; + } else if (isIBM916(codepoint)) { + state = IBM_916; + } else if (isIBM915(codepoint)) { + state = IBM_915; + } else if (isCompoundS1(codepoint)) { + state = COMPOUND_TEXT_SINGLE_1; + } + + return state; +} + +static COMPOUND_TEXT_CONVERTERS findStateFromEscSeq(const char* source, const char* sourceLimit, const uint8_t* toUBytesBuffer, int32_t toUBytesBufferLength, UErrorCode *err) { + COMPOUND_TEXT_CONVERTERS state = INVALID; + UBool matchFound = FALSE; + int32_t i, n, offset = toUBytesBufferLength; + + for (i = 0; i < NUM_OF_CONVERTERS; i++) { + for (n = 0; escSeqCompoundText[i][n] != 0; n++) { + matchFound = TRUE; + if (n < toUBytesBufferLength) { + if (toUBytesBuffer[n] != escSeqCompoundText[i][n]) { + matchFound = FALSE; + break; + } + } else if ((source + (n - offset)) >= sourceLimit) { + *err = U_TRUNCATED_CHAR_FOUND; + matchFound = FALSE; + break; + } else if (*(source + (n - offset)) != escSeqCompoundText[i][n]) { + matchFound = FALSE; + break; + } + } + + if (matchFound) { + break; + } + } + + if (matchFound) { + state = (COMPOUND_TEXT_CONVERTERS)i; + } + + return state; +} + +static void +_CompoundTextOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ + cnv->extraInfo = uprv_malloc (sizeof (UConverterDataCompoundText)); + if (cnv->extraInfo != NULL) { + UConverterDataCompoundText *myConverterData = (UConverterDataCompoundText *) cnv->extraInfo; + + UConverterNamePieces stackPieces; + UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) }; + + myConverterData->myConverterArray[COMPOUND_TEXT_SINGLE_0] = NULL; + myConverterData->myConverterArray[COMPOUND_TEXT_SINGLE_1] = ucnv_loadSharedData("icu-internal-compound-s1", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[COMPOUND_TEXT_SINGLE_2] = ucnv_loadSharedData("icu-internal-compound-s2", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[COMPOUND_TEXT_SINGLE_3] = ucnv_loadSharedData("icu-internal-compound-s3", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[COMPOUND_TEXT_DOUBLE_1] = ucnv_loadSharedData("icu-internal-compound-d1", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[COMPOUND_TEXT_DOUBLE_2] = ucnv_loadSharedData("icu-internal-compound-d2", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[COMPOUND_TEXT_DOUBLE_3] = ucnv_loadSharedData("icu-internal-compound-d3", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[COMPOUND_TEXT_DOUBLE_4] = ucnv_loadSharedData("icu-internal-compound-d4", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[COMPOUND_TEXT_DOUBLE_5] = ucnv_loadSharedData("icu-internal-compound-d5", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[COMPOUND_TEXT_DOUBLE_6] = ucnv_loadSharedData("icu-internal-compound-d6", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[COMPOUND_TEXT_DOUBLE_7] = ucnv_loadSharedData("icu-internal-compound-d7", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[COMPOUND_TEXT_TRIPLE_DOUBLE] = ucnv_loadSharedData("icu-internal-compound-t", &stackPieces, &stackArgs, errorCode); + + myConverterData->myConverterArray[IBM_915] = ucnv_loadSharedData("ibm-915_P100-1995", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[IBM_916] = ucnv_loadSharedData("ibm-916_P100-1995", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[IBM_914] = ucnv_loadSharedData("ibm-914_P100-1995", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[IBM_874] = ucnv_loadSharedData("ibm-874_P100-1995", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[IBM_912] = ucnv_loadSharedData("ibm-912_P100-1995", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[IBM_913] = ucnv_loadSharedData("ibm-913_P100-2000", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[ISO_8859_14] = ucnv_loadSharedData("iso-8859_14-1998", &stackPieces, &stackArgs, errorCode); + myConverterData->myConverterArray[IBM_923] = ucnv_loadSharedData("ibm-923_P100-1998", &stackPieces, &stackArgs, errorCode); + + if (U_FAILURE(*errorCode)) { + return; + } + + myConverterData->state = 0; + } else { + *errorCode = U_MEMORY_ALLOCATION_ERROR; + } +} + + +static void +_CompoundTextClose(UConverter *converter) { + UConverterSharedData** gbCharset = (UConverterSharedData**)(converter->extraInfo); + int32_t i; + + if (converter->extraInfo != NULL) { + /*close the array of converter pointers and free the memory*/ + for (i = 0; i < NUM_OF_CONVERTERS; i++) { + if(gbCharset[i] != NULL) { + ucnv_unloadSharedDataIfReady(gbCharset[i]); + } + } + + uprv_free(converter->extraInfo); + } +} + +static void +_CompoundTextReset(UConverter *converter, UConverterResetChoice choice) { +} + +static const char* +_CompoundTextgetName(const UConverter* cnv){ + return "x11-compound-text"; +} + +static void +UConverter_fromUnicode_CompoundText_OFFSETS(UConverterFromUnicodeArgs* args, UErrorCode* err){ + UConverter *cnv = args->converter; + uint8_t *target = (uint8_t *) args->target; + const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; + const UChar* source = args->source; + const UChar* sourceLimit = args->sourceLimit; + /* int32_t* offsets = args->offsets; */ + UChar32 sourceChar; + UBool useFallback = cnv->useFallback; + uint8_t tmpTargetBuffer[7]; + int32_t tmpTargetBufferLength = 0; + COMPOUND_TEXT_CONVERTERS currentState, tmpState; + uint8_t buffer[4]; + int32_t bufferLength = 0; + int32_t i, n; + + UConverterDataCompoundText *myConverterData = (UConverterDataCompoundText *) cnv->extraInfo; + + currentState = myConverterData->state; + + /* check if the last codepoint of previous buffer was a lead surrogate*/ + if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { + goto getTrail; + } + + while( source < sourceLimit){ + if(target < targetLimit){ + + sourceChar = *(source++); + /*check if the char is a First surrogate*/ + if(UTF_IS_SURROGATE(sourceChar)) { + if(UTF_IS_SURROGATE_FIRST(sourceChar)) { +getTrail: + /*look ahead to find the trail surrogate*/ + if(source < sourceLimit) { + /* test the following code unit */ + UChar trail=(UChar) *source; + if(UTF_IS_SECOND_SURROGATE(trail)) { + source++; + sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); + cnv->fromUChar32=0x00; + /* convert this supplementary code point */ + /* exit this condition tree */ + } else { + /* this is an unmatched lead code unit (1st surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; + cnv->fromUChar32=sourceChar; + break; + } + } else { + /* no more input */ + cnv->fromUChar32=sourceChar; + break; + } + } else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; + cnv->fromUChar32=sourceChar; + break; + } + } + + tmpTargetBufferLength = 0; + tmpState = getState(sourceChar); + + if (tmpState != DO_SEARCH && currentState != tmpState) { + /* Get escape sequence if necessary */ + currentState = tmpState; + for (i = 0; escSeqCompoundText[currentState][i] != 0; i++) { + tmpTargetBuffer[tmpTargetBufferLength++] = escSeqCompoundText[currentState][i]; + } + } + + if (tmpState == DO_SEARCH) { + /* Test all available converters */ + for (i = 1; i < SEARCH_LENGTH; i++) { + bufferLength = ucnv_MBCSFromUChar32(myConverterData->myConverterArray[i], sourceChar, (uint32_t*)buffer, useFallback); + if (bufferLength > 0) { + tmpState = (COMPOUND_TEXT_CONVERTERS)i; + if (currentState != tmpState) { + currentState = tmpState; + for (i = 0; escSeqCompoundText[currentState][i] != 0; i++) { + tmpTargetBuffer[tmpTargetBufferLength++] = escSeqCompoundText[currentState][i]; + } + } + for (n = 0; n < bufferLength; n++) { + tmpTargetBuffer[tmpTargetBufferLength++] = buffer[(bufferLength -1) - n]; + } + /* TODO: Reset? */ + break; + } + } + } else if (tmpState == COMPOUND_TEXT_SINGLE_0) { + tmpTargetBuffer[tmpTargetBufferLength++] = (uint8_t)sourceChar; + } else { + bufferLength = ucnv_MBCSFromUChar32(myConverterData->myConverterArray[currentState], sourceChar, (uint32_t*)buffer, useFallback); + if (bufferLength > 0) { + for (n = 0; n < bufferLength; n++) { + tmpTargetBuffer[tmpTargetBufferLength++] = buffer[(bufferLength -1) - n]; + } + } + } + + for (i = 0; i < tmpTargetBufferLength; i++) { + if (target < targetLimit) { + *target++ = tmpTargetBuffer[i]; + } else { + *err = U_BUFFER_OVERFLOW_ERROR; + break; + } + } + + if (*err == U_BUFFER_OVERFLOW_ERROR) { + for (; i < tmpTargetBufferLength; i++) { + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = tmpTargetBuffer[i]; + } + } + } else { + *err = U_BUFFER_OVERFLOW_ERROR; + break; + } + } + + /*save the state and return */ + myConverterData->state = currentState; + args->source = source; + args->target = (char*)target; +} + + +static void +UConverter_toUnicode_CompoundText_OFFSETS(UConverterToUnicodeArgs *args, + UErrorCode* err){ + const char *mySource = (char *) args->source; + UChar *myTarget = args->target; + const char *mySourceLimit = args->sourceLimit; + const char *tmpSourceLimit = mySourceLimit; + uint32_t mySourceChar = 0x0000; + COMPOUND_TEXT_CONVERTERS currentState, tmpState; + int32_t sourceOffset = 0; + UConverterDataCompoundText *myConverterData = (UConverterDataCompoundText *) args->converter->extraInfo; + UConverterSharedData* savedSharedData = NULL; + + UConverterToUnicodeArgs subArgs; + int32_t minArgsSize; + + /* set up the subconverter arguments */ + if(args->sizesize; + } else { + minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); + } + + uprv_memcpy(&subArgs, args, minArgsSize); + subArgs.size = (uint16_t)minArgsSize; + + currentState = tmpState = myConverterData->state; + + while(mySource < mySourceLimit){ + if(myTarget < args->targetLimit){ + if (args->converter->toULength > 0) { + mySourceChar = args->converter->toUBytes[0]; + } else { + mySourceChar = (uint8_t)*mySource; + } + + if (mySourceChar == ESC_START) { + tmpState = findStateFromEscSeq(mySource, mySourceLimit, args->converter->toUBytes, args->converter->toULength, err); + + if (*err == U_TRUNCATED_CHAR_FOUND) { + for (; mySource < mySourceLimit;) { + args->converter->toUBytes[args->converter->toULength++] = *mySource++; + } + *err = U_ZERO_ERROR; + break; + } else if (tmpState == INVALID) { + *err = U_ILLEGAL_CHAR_FOUND; + break; + } + + if (tmpState != currentState) { + currentState = tmpState; + } + + sourceOffset = uprv_strlen((char*)escSeqCompoundText[currentState]) - args->converter->toULength; + + mySource += sourceOffset; + + args->converter->toULength = 0; + } + + if (currentState == COMPOUND_TEXT_SINGLE_0) { + while (mySource < mySourceLimit) { + if (*mySource == ESC_START) { + break; + } + if (myTarget < args->targetLimit) { + *myTarget++ = 0x00ff&(*mySource++); + } else { + *err = U_BUFFER_OVERFLOW_ERROR; + break; + } + } + } else if (mySource < mySourceLimit){ + sourceOffset = findNextEsc(mySource, mySourceLimit); + + tmpSourceLimit = mySource + sourceOffset; + + subArgs.source = mySource; + subArgs.sourceLimit = tmpSourceLimit; + subArgs.target = myTarget; + savedSharedData = subArgs.converter->sharedData; + subArgs.converter->sharedData = myConverterData->myConverterArray[currentState]; + + ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); + + subArgs.converter->sharedData = savedSharedData; + + mySource = subArgs.source; + myTarget = subArgs.target; + + /* copy input/error/overflow buffers */ + if(subArgs.converter->toULength > 0) { + uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); + } + args->converter->toULength = subArgs.converter->toULength; + + if (U_FAILURE(*err)) { + if(*err == U_BUFFER_OVERFLOW_ERROR) { + if(subArgs.converter->UCharErrorBufferLength > 0) { + uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, + subArgs.converter->UCharErrorBufferLength); + } + args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; + subArgs.converter->UCharErrorBufferLength = 0; + } + break; + } + } + } else { + *err = U_BUFFER_OVERFLOW_ERROR; + break; + } + } + myConverterData->state = currentState; + args->target = myTarget; + args->source = mySource; +} + +static void +_CompoundText_GetUnicodeSet(const UConverter *cnv, + const USetAdder *sa, + UConverterUnicodeSet which, + UErrorCode *pErrorCode) { + UConverterDataCompoundText *myConverterData = (UConverterDataCompoundText *)cnv->extraInfo; + int32_t i; + + for (i = 1; i < NUM_OF_CONVERTERS; i++) { + ucnv_MBCSGetUnicodeSetForUnicode(myConverterData->myConverterArray[i], sa, which, pErrorCode); + } + sa->add(sa->set, 0x0000); + sa->add(sa->set, 0x0009); + sa->add(sa->set, 0x000A); + sa->addRange(sa->set, 0x0020, 0x007F); + sa->addRange(sa->set, 0x00A0, 0x00FF); +} + +static const UConverterImpl _CompoundTextImpl = { + + UCNV_COMPOUND_TEXT, + + NULL, + NULL, + + _CompoundTextOpen, + _CompoundTextClose, + _CompoundTextReset, + + UConverter_toUnicode_CompoundText_OFFSETS, + UConverter_toUnicode_CompoundText_OFFSETS, + UConverter_fromUnicode_CompoundText_OFFSETS, + UConverter_fromUnicode_CompoundText_OFFSETS, + NULL, + + NULL, + _CompoundTextgetName, + NULL, + NULL, + _CompoundText_GetUnicodeSet +}; +static const UConverterStaticData _CompoundTextStaticData = { + sizeof(UConverterStaticData), + "COMPOUND_TEXT", + 0, + UCNV_IBM, + UCNV_COMPOUND_TEXT, + 1, + 6, + { 0xef, 0, 0, 0 }, + 1, + FALSE, + FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; +const UConverterSharedData _CompoundTextData = { + sizeof(UConverterSharedData), + ~((uint32_t) 0), + NULL, + NULL, + &_CompoundTextStaticData, + FALSE, + &_CompoundTextImpl, + 0 +}; + +#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ diff --git a/icu4c/source/common/unicode/ucnv.h b/icu4c/source/common/unicode/ucnv.h index 98da8ff6947..261e830f13d 100644 --- a/icu4c/source/common/unicode/ucnv.h +++ b/icu4c/source/common/unicode/ucnv.h @@ -124,6 +124,7 @@ typedef enum { UCNV_UTF32, UCNV_CESU8, UCNV_IMAP_MAILBOX, + UCNV_COMPOUND_TEXT, /* Number of converter types for which we have conversion routines. */ UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES diff --git a/icu4c/source/data/mappings/icu-internal-compound-d1.ucm b/icu4c/source/data/mappings/icu-internal-compound-d1.ucm index d1ecafa16f3..d6b1d23a1d0 100644 --- a/icu4c/source/data/mappings/icu-internal-compound-d1.ucm +++ b/icu4c/source/data/mappings/icu-internal-compound-d1.ucm @@ -18,7 +18,7 @@ 80-fe CHARMAP - \x3F |0 + \x3F |2 \xA2\xF1 |0 \xA2\xF2 |0 \xA2\xF3 |0 diff --git a/icu4c/source/data/mappings/icu-internal-compound-d2.ucm b/icu4c/source/data/mappings/icu-internal-compound-d2.ucm index 724eb98abe3..f3b5a7730ac 100644 --- a/icu4c/source/data/mappings/icu-internal-compound-d2.ucm +++ b/icu4c/source/data/mappings/icu-internal-compound-d2.ucm @@ -18,7 +18,7 @@ 80-fe CHARMAP - \x3F |0 + \x3F |2 \xA1\xBE |0 \xA1\xBD |0 \xA1\xC2 |0 diff --git a/icu4c/source/data/mappings/icu-internal-compound-d3.ucm b/icu4c/source/data/mappings/icu-internal-compound-d3.ucm index 7b6f3822f0e..66b890b65aa 100644 --- a/icu4c/source/data/mappings/icu-internal-compound-d3.ucm +++ b/icu4c/source/data/mappings/icu-internal-compound-d3.ucm @@ -18,7 +18,7 @@ 80-fe CHARMAP - \x3F |0 + \x3F |2 \xA9\xF9 |0 \xA9\xFA |0 \xA9\xFB |0 diff --git a/icu4c/source/data/mappings/icu-internal-compound-d4.ucm b/icu4c/source/data/mappings/icu-internal-compound-d4.ucm index fccf73d06f8..27dcec7ddfc 100644 --- a/icu4c/source/data/mappings/icu-internal-compound-d4.ucm +++ b/icu4c/source/data/mappings/icu-internal-compound-d4.ucm @@ -18,7 +18,7 @@ 80-fe CHARMAP - \x3F |0 + \x3F |2 \xB0\xA1 |0 \xB0\xA2 |0 \xB0\xA3 |0 diff --git a/icu4c/source/data/mappings/icu-internal-compound-d5.ucm b/icu4c/source/data/mappings/icu-internal-compound-d5.ucm index 2c795d8f49d..54af67644c2 100644 --- a/icu4c/source/data/mappings/icu-internal-compound-d5.ucm +++ b/icu4c/source/data/mappings/icu-internal-compound-d5.ucm @@ -18,7 +18,7 @@ 80-fe CHARMAP - \x3F |0 + \x3F |2 \xA1\xB9 |0 \xA1\xEB |0 \xA2\xA2 |0 diff --git a/icu4c/source/data/mappings/icu-internal-compound-d6.ucm b/icu4c/source/data/mappings/icu-internal-compound-d6.ucm index 7d21c3feb8e..83da53c1ba4 100644 --- a/icu4c/source/data/mappings/icu-internal-compound-d6.ucm +++ b/icu4c/source/data/mappings/icu-internal-compound-d6.ucm @@ -18,7 +18,7 @@ 80-fe CHARMAP - \x3F |0 + \x3F |2 \xAD\xA1 |1 \xB1\xA5 |1 \xD6\xC1 |1 diff --git a/icu4c/source/data/mappings/icu-internal-compound-d7.ucm b/icu4c/source/data/mappings/icu-internal-compound-d7.ucm index 5687f94a210..be1ac511c08 100644 --- a/icu4c/source/data/mappings/icu-internal-compound-d7.ucm +++ b/icu4c/source/data/mappings/icu-internal-compound-d7.ucm @@ -18,7 +18,7 @@ 80-fe CHARMAP - \x3F |0 + \x3F |2 \xE2\xA2 |1 \xB0\xA1 |1 \xEE\xB9 |0 diff --git a/icu4c/source/data/mappings/icu-internal-compound-s1.ucm b/icu4c/source/data/mappings/icu-internal-compound-s1.ucm index 2e8c27be13b..6570ec744c8 100644 --- a/icu4c/source/data/mappings/icu-internal-compound-s1.ucm +++ b/icu4c/source/data/mappings/icu-internal-compound-s1.ucm @@ -13,7 +13,7 @@ "SBCS" CHARMAP - \x3F |0 + \x3F |2 \xD0 |0 \xF0 |0 \xDD |0 diff --git a/icu4c/source/data/mappings/icu-internal-compound-s2.ucm b/icu4c/source/data/mappings/icu-internal-compound-s2.ucm index c8e154db6db..7c636b94b78 100644 --- a/icu4c/source/data/mappings/icu-internal-compound-s2.ucm +++ b/icu4c/source/data/mappings/icu-internal-compound-s2.ucm @@ -13,7 +13,7 @@ "SBCS" CHARMAP - \x3F |0 + \x3F |2 \xA2 |1 \xA1 |1 \xB4 |0 diff --git a/icu4c/source/data/mappings/icu-internal-compound-s3.ucm b/icu4c/source/data/mappings/icu-internal-compound-s3.ucm index 0e626cdd2f2..da63d6e009f 100644 --- a/icu4c/source/data/mappings/icu-internal-compound-s3.ucm +++ b/icu4c/source/data/mappings/icu-internal-compound-s3.ucm @@ -13,7 +13,7 @@ "SBCS" CHARMAP - \x3F |0 + \x3F |2 \xAC |0 \xBB |0 \xBF |0 diff --git a/icu4c/source/data/mappings/icu-internal-compound-t.ucm b/icu4c/source/data/mappings/icu-internal-compound-t.ucm index ec060457c92..09bf2142988 100644 --- a/icu4c/source/data/mappings/icu-internal-compound-t.ucm +++ b/icu4c/source/data/mappings/icu-internal-compound-t.ucm @@ -19,6 +19,7 @@ 80-bf CHARMAP + \x3F |2 \xC2\x80 |0 \xC2\x81 |0 \xC2\x82 |0 diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt index ed4656ef9e0..05924ac0e3f 100644 --- a/icu4c/source/test/testdata/conversion.txt +++ b/icu4c/source/test/testdata/conversion.txt @@ -815,6 +815,13 @@ conversion:table(nofallback) { :intvector{ 0, 4, 8, 12 }, :int{1}, :int{0}, "", "?", :bin{""} } + { + "x11-compound-text", + :bin{ 1b242944b5ac1b2d41a5e31b2d43d5f51b2d4dd01b2d41411b2d43bc1b2d42ff1b2d54df1b2d44c0b31b2d46b41b2d47b01b2d48e01b2d4ca1 }, + "\u54A1\u00A5\u00E3\u0120\u0121\u011E\u0041\u0135\u02D9\u0E3F\u0100\u0157\u0384\u0660\u05D0\u0401", + :intvector{ }, + :int{1}, :int{0}, "", "?", :bin{""} + } // Improve coverage of ISCII { "iscii-bng", @@ -1651,6 +1658,13 @@ conversion:table(nofallback) { :intvector{ 0, 1, 1, 1, 1, 3, 3, 4, 5, 5, 5, 6, 7, 7, 7, 8, 9, 9, 9 }, :int{1}, :int{0}, "", "?", "" } + { + "x11-compound-text", + "\u54A1\u00A5\u00E3\u0120\u0121\u011E\u0041\u0135\u02D9\u0E3F\u0100\u0157\u0384\u0660\u05D0\u0401", + :bin{ 1b242944b5ac1b2d41a5e31b2d43d5f51b2d4dd01b2d41411b2d43bc1b2d42ff1b2d54df1b2d44c0b31b2d46b41b2d47b01b2d48e01b2d4ca1 }, + :intvector{ }, + :int{1}, :int{0}, "", "?", "" + } // Test Gurmukhi (Bindi Tippi and Consonant clusters) { "iscii-gur",