ICU-11296 based on patch from Jungshik, approved option name UCONFIG_ONLY_HTML_CONVERSION, turn off UTF-32, simplify changes, fix warnings

X-SVN-Rev: 37045
This commit is contained in:
Markus Scherer 2015-02-20 19:31:33 +00:00
parent 71035aa827
commit 8e6898ae3d
17 changed files with 155 additions and 51 deletions

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2000-2014, International Business Machines
* Copyright (C) 2000-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv2022.cpp
@ -75,8 +75,10 @@
*/
#endif
#if !UCONFIG_ONLY_HTML_CONVERSION
static const char SHIFT_IN_STR[] = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";
#endif
#define CR 0x0D
#define LF 0x0A
@ -152,7 +154,11 @@ typedef enum {
} StateEnum;
/*
 * Is the StateEnum charset value (cs) a DBCS charset?
 *
 * With UCONFIG_ONLY_HTML_CONVERSION only JISX208 is treated as DBCS;
 * otherwise every value in the range JISX208..KSC5601 is.
 *
 * The range form evaluates (cs) twice, so do not pass an expression
 * with side effects.
 * NOTE(review): the range form relies on the DBCS values being contiguous
 * in StateEnum, which is declared outside this view -- confirm before
 * reordering that enum.
 */
#if UCONFIG_ONLY_HTML_CONVERSION
#define IS_JP_DBCS(cs) (JISX208==(cs))
#else
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
#endif
#define CSM(cs) ((uint16_t)1<<(cs))
@ -165,13 +171,19 @@ typedef enum {
* all versions, not just JIS7 and JIS8.
* - ICU does not distinguish between different versions of JIS X 0208.
*/
/*
 * MAX_JA_VERSION is the last valid index into jpCharsetMasks[]:
 * 0 with UCONFIG_ONLY_HTML_CONVERSION (a single entry), 4 otherwise.
 */
#if UCONFIG_ONLY_HTML_CONVERSION
enum { MAX_JA_VERSION=0 };
#else
enum { MAX_JA_VERSION=4 };
#endif
/*
 * One mask of CSM() charset bits per entry, MAX_JA_VERSION+1 entries in total.
 * The number of initializers below must match the array size in both
 * configurations.
 * NOTE(review): presumably indexed by the converter's version number --
 * confirm at the use sites, which are outside this view.
 */
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
/* entry 0: the only entry in an HTML-only build */
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
#if !UCONFIG_ONLY_HTML_CONVERSION
/* entry 1: adds JISX212 */
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
/* entries 2..4: identical masks that also add GB2312, KSC5601, ISO8859_1 and ISO8859_7 */
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
#endif
};
typedef enum {
@ -358,15 +370,16 @@ static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
};
/*
 * Type def for refactoring changeState_2022 code:
 * names the ISO-2022 variant being handled.
 *
 * ISO_2022 exists only when U_ENABLE_GENERIC_ISO_2022 is defined;
 * ISO_2022_KR and ISO_2022_CN are omitted when UCONFIG_ONLY_HTML_CONVERSION
 * is set. Every enumerator has an explicit value, so its number is the same
 * in every configuration.
 *
 * The conditional enumerators carry a leading comma so that the list never
 * ends in a trailing comma (ill-formed in C89/C++03) when they are compiled out.
 */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1
#if !UCONFIG_ONLY_HTML_CONVERSION
    ,ISO_2022_KR=2
    ,ISO_2022_CN=3
#endif
} Variant2022;
/*********** ISO 2022 Converter Protos ***********/
@ -397,8 +410,11 @@ namespace {
/*const UConverterSharedData _ISO2022Data;*/
extern const UConverterSharedData _ISO2022JPData;
#if !UCONFIG_ONLY_HTML_CONVERSION
extern const UConverterSharedData _ISO2022KRData;
extern const UConverterSharedData _ISO2022CNData;
#endif
} // namespace
@ -511,6 +527,7 @@ _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
myConverterData->name[len+1]='\0';
}
#if !UCONFIG_ONLY_HTML_CONVERSION
else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
(myLocale[2]=='_' || myLocale[2]=='\0'))
{
@ -580,6 +597,7 @@ _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
}
}
#endif // !UCONFIG_ONLY_HTML_CONVERSION
else{
#ifdef U_ENABLE_GENERIC_ISO_2022
myConverterData->isFirstBuffer = TRUE;
@ -714,6 +732,7 @@ static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
};
#if !UCONFIG_ONLY_HTML_CONVERSION
/*************** to unicode *******************/
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/* 0 1 2 3 4 5 6 7 8 9 */
@ -726,6 +745,7 @@ static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
};
#endif
static UCNV_TableStates_2022
@ -898,6 +918,7 @@ DONE:
}
}
break;
#if !UCONFIG_ONLY_HTML_CONVERSION
case ISO_2022_CN:
{
StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
@ -959,6 +980,7 @@ DONE:
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
}
break;
#endif // !UCONFIG_ONLY_HTML_CONVERSION
default:
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
@ -1001,6 +1023,7 @@ DONE:
}
}
#if !UCONFIG_ONLY_HTML_CONVERSION
/*Checks the characters of the buffer against valid 2022 escape sequences
*if the match we return a pointer to the initial start of the sequence otherwise
*we return sourceLimit
@ -1055,7 +1078,7 @@ getEndOfBuffer_2022(const char** source,
return mySource;
#endif
}
#endif
/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
* any future change in _MBCSFromUChar32() function should be reflected here.
@ -2269,6 +2292,7 @@ endloop:
}
#if !UCONFIG_ONLY_HTML_CONVERSION
/***************************************************************
* Rules for ISO-2022-KR encoding
* i) The KSC5601 designator sequence should appear only once in a file,
@ -3412,6 +3436,7 @@ endloop:
args->target = myTarget;
args->source = mySource;
}
#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
@ -3638,6 +3663,7 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
sa->addRange(sa->set, HWKANA_START, HWKANA_END);
}
break;
#if !UCONFIG_ONLY_HTML_CONVERSION
case 'c':
case 'z':
/* include ASCII for CN */
@ -3649,6 +3675,7 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
cnvData->currentConverter, sa, which, pErrorCode);
/* the loop over myConverterArray[] will simply not find another converter */
break;
#endif
default:
break;
}
@ -3669,9 +3696,15 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
UConverterSetFilter filter;
if(cnvData->myConverterArray[i]!=NULL) {
if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
cnvData->version==0 && i==CNS_11643
) {
if(cnvData->locale[0]=='j' && i==JISX208) {
/*
* Only add code points that map to Shift-JIS codes
* corresponding to JIS X 0208.
*/
filter=UCNV_SET_FILTER_SJIS;
#if !UCONFIG_ONLY_HTML_CONVERSION
} else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
cnvData->version==0 && i==CNS_11643) {
/*
* Version-specific for CN:
* CN version 0 does not map CNS planes 3..7 although
@ -3680,18 +3713,13 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
* The two versions create different Unicode sets.
*/
filter=UCNV_SET_FILTER_2022_CN;
} else if(cnvData->locale[0]=='j' && i==JISX208) {
/*
* Only add code points that map to Shift-JIS codes
* corresponding to JIS X 0208.
*/
filter=UCNV_SET_FILTER_SJIS;
} else if(i==KSC5601) {
/*
* Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
* are broader than GR94.
*/
filter=UCNV_SET_FILTER_GR94DBCS;
#endif
} else {
filter=UCNV_SET_FILTER_NONE;
}
@ -3829,6 +3857,7 @@ const UConverterSharedData _ISO2022JPData={
} // namespace
#if !UCONFIG_ONLY_HTML_CONVERSION
/************* KR ***************/
static const UConverterImpl _ISO2022KRImpl={
UCNV_ISO_2022,
@ -3945,5 +3974,6 @@ const UConverterSharedData _ISO2022CNData={
};
} // namespace
#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

View file

@ -1,11 +1,11 @@
/*
********************************************************************
* COPYRIGHT:
* Copyright (c) 1996-2014, International Business Machines Corporation and
* Copyright (c) 1996-2015, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************
*
* uconv_bld.cpp:
* ucnv_bld.cpp:
*
* Defines functions that are used in the creation/initialization/deletion
* of converters and related structures.
@ -64,33 +64,51 @@ converterData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]={
#endif
&_Latin1Data,
&_UTF8Data, &_UTF16BEData, &_UTF16LEData, &_UTF32BEData, &_UTF32LEData,
&_UTF8Data, &_UTF16BEData, &_UTF16LEData,
#if UCONFIG_ONLY_HTML_CONVERSION
NULL, NULL,
#else
&_UTF32BEData, &_UTF32LEData,
#endif
NULL,
#if UCONFIG_NO_LEGACY_CONVERSION
NULL,
#else
&_ISO2022Data,
#endif
#if UCONFIG_NO_LEGACY_CONVERSION || UCONFIG_ONLY_HTML_CONVERSION
NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL,
NULL,
#else
&_ISO2022Data,
&_LMBCSData1,&_LMBCSData2, &_LMBCSData3, &_LMBCSData4, &_LMBCSData5, &_LMBCSData6,
&_LMBCSData8,&_LMBCSData11,&_LMBCSData16,&_LMBCSData17,&_LMBCSData18,&_LMBCSData19,
&_HZData,
#endif
#if UCONFIG_ONLY_HTML_CONVERSION
NULL,
#else
&_SCSUData,
#endif
#if UCONFIG_NO_LEGACY_CONVERSION
#if UCONFIG_NO_LEGACY_CONVERSION || UCONFIG_ONLY_HTML_CONVERSION
NULL,
#else
&_ISCIIData,
#endif
&_ASCIIData,
#if UCONFIG_ONLY_HTML_CONVERSION
NULL, NULL, &_UTF16Data, NULL, NULL, NULL,
#else
&_UTF7Data, &_Bocu1Data, &_UTF16Data, &_UTF32Data, &_CESU8Data, &_IMAPData,
#endif
#if UCONFIG_NO_LEGACY_CONVERSION
#if UCONFIG_NO_LEGACY_CONVERSION || UCONFIG_ONLY_HTML_CONVERSION
NULL,
#else
&_CompoundTextData
@ -105,18 +123,24 @@ static struct {
const char *name;
const UConverterType type;
} const cnvNameType[] = {
#if !UCONFIG_ONLY_HTML_CONVERSION
{ "bocu1", UCNV_BOCU1 },
{ "cesu8", UCNV_CESU8 },
#if !UCONFIG_NO_LEGACY_CONVERSION
#endif
#if !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
{ "hz",UCNV_HZ },
#endif
#if !UCONFIG_ONLY_HTML_CONVERSION
{ "imapmailboxname", UCNV_IMAP_MAILBOX },
#if !UCONFIG_NO_LEGACY_CONVERSION
#endif
#if !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
{ "iscii", UCNV_ISCII },
#endif
#if !UCONFIG_NO_LEGACY_CONVERSION
{ "iso2022", UCNV_ISO_2022 },
#endif
{ "iso88591", UCNV_LATIN_1 },
#if !UCONFIG_NO_LEGACY_CONVERSION
#if !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
{ "lmbcs1", UCNV_LMBCS_1 },
{ "lmbcs11",UCNV_LMBCS_11 },
{ "lmbcs16",UCNV_LMBCS_16 },
@ -130,7 +154,9 @@ static struct {
{ "lmbcs6", UCNV_LMBCS_6 },
{ "lmbcs8", UCNV_LMBCS_8 },
#endif
#if !UCONFIG_ONLY_HTML_CONVERSION
{ "scsu", UCNV_SCSU },
#endif
{ "usascii", UCNV_US_ASCII },
{ "utf16", UCNV_UTF16 },
{ "utf16be", UCNV_UTF16_BigEndian },
@ -142,6 +168,7 @@ static struct {
{ "utf16oppositeendian", UCNV_UTF16_BigEndian},
{ "utf16platformendian", UCNV_UTF16_LittleEndian },
#endif
#if !UCONFIG_ONLY_HTML_CONVERSION
{ "utf32", UCNV_UTF32 },
{ "utf32be", UCNV_UTF32_BigEndian },
{ "utf32le", UCNV_UTF32_LittleEndian },
@ -152,9 +179,14 @@ static struct {
{ "utf32oppositeendian", UCNV_UTF32_BigEndian },
{ "utf32platformendian", UCNV_UTF32_LittleEndian },
#endif
#endif
#if !UCONFIG_ONLY_HTML_CONVERSION
{ "utf7", UCNV_UTF7 },
#endif
{ "utf8", UCNV_UTF8 },
#if !UCONFIG_ONLY_HTML_CONVERSION
{ "x11compoundtext", UCNV_COMPOUND_TEXT}
#endif
};

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_ct.c
@ -14,7 +14,7 @@
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
#include "unicode/ucnv.h"
#include "unicode/uset.h"

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2000-2014, International Business Machines
* Copyright (C) 2000-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_lmb.cpp
@ -25,7 +25,7 @@
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
#include "unicode/ucnv_err.h"
#include "unicode/ucnv.h"

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2002-2011, International Business Machines
* Copyright (C) 2002-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u32.c
@ -16,7 +16,7 @@
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
#include "unicode/ucnv.h"
#include "unicode/utf.h"

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2002-2011, International Business Machines
* Copyright (C) 2002-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u7.c
@ -16,7 +16,7 @@
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
#include "unicode/ucnv.h"
#include "ucnv_bld.h"

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2002-2012, International Business Machines
* Copyright (C) 2002-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u8.c
@ -87,6 +87,15 @@ static const int8_t bytesFromUTF8[256] = {
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
/*
 * Tells whether the converter uses the CESU-8 shared data.
 *
 * @param cnv the converter to inspect
 * @return TRUE if cnv->sharedData is &_CESU8Data; always FALSE when
 *         UCONFIG_ONLY_HTML_CONVERSION is set, where _CESU8Data is not
 *         referenced at all.
 */
static UBool hasCESU8Data(const UConverter *cnv)
{
#if UCONFIG_ONLY_HTML_CONVERSION
    (void)cnv;  /* not needed in this configuration; avoids an unused-parameter warning */
    return FALSE;
#else
    return (UBool)(cnv->sharedData == &_CESU8Data);
#endif
}
static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
UErrorCode * err)
{
@ -96,10 +105,10 @@ static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
const UChar *targetLimit = args->targetLimit;
unsigned char *toUBytes = cnv->toUBytes;
UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
UBool isCESU8 = hasCESU8Data(cnv);
uint32_t ch, ch2 = 0;
int32_t i, inBytes;
/* Restore size of current sequence */
if (cnv->toUnicodeStatus && myTarget < targetLimit)
{
@ -226,7 +235,7 @@ static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
const UChar *targetLimit = args->targetLimit;
unsigned char *toUBytes = cnv->toUBytes;
UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
UBool isCESU8 = hasCESU8Data(cnv);
uint32_t ch, ch2 = 0;
int32_t i, inBytes;
@ -357,7 +366,7 @@ U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
UChar32 ch;
uint8_t tempBuf[4];
int32_t indexToWrite;
UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
UBool isNotCESU8 = !hasCESU8Data(cnv);
if (cnv->fromUChar32 && myTarget < targetLimit)
{
@ -473,7 +482,7 @@ U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * ar
int32_t offsetNum, nextSourceIndex;
int32_t indexToWrite;
uint8_t tempBuf[4];
UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
UBool isNotCESU8 = !hasCESU8Data(cnv);
if (cnv->fromUChar32 && myTarget < targetLimit)
{

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2002-2011, International Business Machines
* Copyright (C) 2002-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -19,7 +19,7 @@
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
#include "unicode/ucnv.h"
#include "unicode/ucnv_cb.h"

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2000-2014, International Business Machines
* Copyright (C) 2000-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnvhz.c
@ -16,7 +16,7 @@
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
#include "cmemory.h"
#include "unicode/ucnv.h"
@ -635,4 +635,4 @@ const UConverterSharedData _HZData={
0
};
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
#endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION */

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2000-2012, International Business Machines
* Copyright (C) 2000-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnvisci.c
@ -17,7 +17,7 @@
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
#include "unicode/ucnv.h"
#include "unicode/ucnv_cb.h"

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2000-2011, International Business Machines
* Copyright (C) 2000-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -21,7 +21,7 @@
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
#include "unicode/ucnv.h"
#include "unicode/ucnv_cb.h"

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2002-2014, International Business Machines
* Copyright (C) 2002-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: uconfig.h
@ -200,7 +200,7 @@
* It does not turn off legacy conversion because that is necessary
* for ICU to work on EBCDIC platforms (for the default converter).
* If you want "only collation" and do not build for EBCDIC,
* then you can define UCONFIG_NO_LEGACY_CONVERSION 1 as well.
* then you can define UCONFIG_NO_CONVERSION or UCONFIG_NO_LEGACY_CONVERSION to 1 as well.
*
* @stable ICU 2.4
*/
@ -269,6 +269,21 @@
# define UCONFIG_NO_LEGACY_CONVERSION 1
#endif
/**
 * \def UCONFIG_ONLY_HTML_CONVERSION
 * This switch turns off all of the converters NOT listed in
 * the HTML encoding standard:
 * http://www.w3.org/TR/encoding/#names-and-labels
 *
 * This is not possible on EBCDIC platforms
 * because they need ibm-37 or ibm-1047 default converters.
 *
 * Defaults to 0 (switch off) unless it is already defined
 * when this header is processed.
 *
 * @draft ICU 55
 */
#ifndef UCONFIG_ONLY_HTML_CONVERSION
# define UCONFIG_ONLY_HTML_CONVERSION 0
#endif
/**
* \def UCONFIG_NO_LEGACY_CONVERSION
* This switch turns off all converters except for

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2005-2013, International Business Machines
* Copyright (C) 2005-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -110,6 +110,7 @@ static void U_CALLCONV initRecognizers(UErrorCode &status) {
new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
#if !UCONFIG_ONLY_HTML_CONVERSION
new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
@ -117,6 +118,7 @@ static void U_CALLCONV initRecognizers(UErrorCode &status) {
new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
#endif
};
int32_t rCount = ARRAY_SIZE(tempArray);

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2005-2012, International Business Machines
* Copyright (C) 2005-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -119,6 +119,7 @@ static const uint8_t escapeSequences_2022JP[][5] = {
{0x1b, 0x2e, 0x46, 0x00, 0x00} // ISO 8859-7
};
#if !UCONFIG_ONLY_HTML_CONVERSION
static const uint8_t escapeSequences_2022KR[][5] = {
{0x1b, 0x24, 0x29, 0x43, 0x00}
};
@ -136,6 +137,7 @@ static const uint8_t escapeSequences_2022CN[][5] = {
{0x1b, 0x4e, 0x00, 0x00, 0x00}, // SS2
{0x1b, 0x4f, 0x00, 0x00, 0x00}, // SS3
};
#endif
CharsetRecog_2022JP::~CharsetRecog_2022JP() {}
@ -152,6 +154,7 @@ UBool CharsetRecog_2022JP::match(InputText *textIn, CharsetMatch *results) const
return (confidence > 0);
}
#if !UCONFIG_ONLY_HTML_CONVERSION
CharsetRecog_2022KR::~CharsetRecog_2022KR() {}
const char *CharsetRecog_2022KR::getName() const {
@ -181,6 +184,7 @@ UBool CharsetRecog_2022CN::match(InputText *textIn, CharsetMatch *results) const
results->set(textIn, this, confidence);
return (confidence > 0);
}
#endif
CharsetRecog_2022::~CharsetRecog_2022() {
// nothing to do

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2005-2012, International Business Machines
* Copyright (C) 2005-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -65,6 +65,7 @@ public:
UBool match(InputText *textIn, CharsetMatch *results) const;
};
#if !UCONFIG_ONLY_HTML_CONVERSION
class CharsetRecog_2022KR :public CharsetRecog_2022 {
public:
virtual ~CharsetRecog_2022KR();
@ -84,6 +85,7 @@ public:
UBool match(InputText *textIn, CharsetMatch *results) const;
};
#endif
U_NAMESPACE_END

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2005-2013, International Business Machines
* Copyright (C) 2005-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -137,6 +137,7 @@ int32_t NGramParser::parse(InputText *det)
return (int32_t) (rawPercent * 300.0);
}
#if !UCONFIG_ONLY_HTML_CONVERSION
static const uint8_t unshapeMap_IBM420[] = {
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
@ -232,6 +233,7 @@ void NGramParser_IBM420::parseCharacters(InputText *det)
}
}
}
#endif
CharsetRecog_sbcs::CharsetRecog_sbcs()
{
@ -624,6 +626,7 @@ static const uint8_t charMap_KOI8_R[] = {
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
};
#if !UCONFIG_ONLY_HTML_CONVERSION
static const int32_t ngrams_IBM424_he_rtl[] = {
0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
@ -691,6 +694,7 @@ static const uint8_t charMap_IBM420_ar[]= {
/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
};
#endif
//ISO-8859-1,2,5,6,7,8,9 Ngrams
@ -1155,6 +1159,7 @@ UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
return (confidence > 0);
}
#if !UCONFIG_ONLY_HTML_CONVERSION
CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
{
// nothing to do
@ -1253,6 +1258,7 @@ UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results
results->set(textIn, this, confidence);
return (confidence > 0);
}
#endif
U_NAMESPACE_END
#endif

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2005-2013, International Business Machines
* Copyright (C) 2005-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -50,6 +50,7 @@ public:
};
#if !UCONFIG_ONLY_HTML_CONVERSION
class NGramParser_IBM420 : public NGramParser
{
private:
@ -61,6 +62,7 @@ private:
public:
NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);
};
#endif
class CharsetRecog_sbcs : public CharsetRecognizer
@ -229,6 +231,7 @@ public:
virtual UBool match(InputText *det, CharsetMatch *results) const;
};
#if !UCONFIG_ONLY_HTML_CONVERSION
class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
{
public:
@ -280,6 +283,7 @@ class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
virtual UBool match(InputText *det, CharsetMatch *results) const;
};
#endif
U_NAMESPACE_END