mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-10012 Disable EBCDIC Arabic/Hebrew detectors by default. Added APIs to enable/disable each charset and get currently active charsets.
X-SVN-Rev: 34350
This commit is contained in:
parent
7b6fa12efe
commit
920dadff8d
5 changed files with 326 additions and 59 deletions
|
@ -32,7 +32,21 @@
|
|||
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
|
||||
#define DELETE_ARRAY(array) uprv_free((void *) (array))
|
||||
|
||||
static icu::CharsetRecognizer **fCSRecognizers = NULL;
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
struct CSRecognizerInfo : public UMemory {
|
||||
CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
|
||||
: recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
|
||||
|
||||
~CSRecognizerInfo() {delete recognizer;};
|
||||
|
||||
CharsetRecognizer *recognizer;
|
||||
UBool isDefaultEnabled;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
static icu::CSRecognizerInfo **fCSRecognizers = NULL;
|
||||
static icu::UInitOnce gCSRecognizersInitOnce;
|
||||
static int32_t fCSRecognizers_size = 0;
|
||||
|
||||
|
@ -70,47 +84,48 @@ charsetMatchComparator(const void * /*context*/, const void *left, const void *r
|
|||
static void U_CALLCONV initRecognizers(UErrorCode &status) {
|
||||
U_NAMESPACE_USE
|
||||
ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
|
||||
CharsetRecognizer *tempArray[] = {
|
||||
new CharsetRecog_UTF8(),
|
||||
CSRecognizerInfo *tempArray[] = {
|
||||
new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
|
||||
|
||||
new CharsetRecog_UTF_16_BE(),
|
||||
new CharsetRecog_UTF_16_LE(),
|
||||
new CharsetRecog_UTF_32_BE(),
|
||||
new CharsetRecog_UTF_32_LE(),
|
||||
new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
|
||||
|
||||
new CharsetRecog_8859_1(),
|
||||
new CharsetRecog_8859_2(),
|
||||
new CharsetRecog_8859_5_ru(),
|
||||
new CharsetRecog_8859_6_ar(),
|
||||
new CharsetRecog_8859_7_el(),
|
||||
new CharsetRecog_8859_8_I_he(),
|
||||
new CharsetRecog_8859_8_he(),
|
||||
new CharsetRecog_windows_1251(),
|
||||
new CharsetRecog_windows_1256(),
|
||||
new CharsetRecog_KOI8_R(),
|
||||
new CharsetRecog_8859_9_tr(),
|
||||
new CharsetRecog_sjis(),
|
||||
new CharsetRecog_gb_18030(),
|
||||
new CharsetRecog_euc_jp(),
|
||||
new CharsetRecog_euc_kr(),
|
||||
new CharsetRecog_big5(),
|
||||
new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
|
||||
|
||||
new CharsetRecog_2022JP(),
|
||||
new CharsetRecog_2022KR(),
|
||||
new CharsetRecog_2022CN(),
|
||||
new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
|
||||
new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
|
||||
|
||||
new CharsetRecog_IBM424_he_rtl(),
|
||||
new CharsetRecog_IBM424_he_ltr(),
|
||||
new CharsetRecog_IBM420_ar_rtl(),
|
||||
new CharsetRecog_IBM420_ar_ltr()
|
||||
new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
|
||||
new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
|
||||
new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
|
||||
new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
|
||||
};
|
||||
int32_t rCount = ARRAY_SIZE(tempArray);
|
||||
|
||||
fCSRecognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
|
||||
fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
|
||||
|
||||
if (fCSRecognizers == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
fCSRecognizers_size = rCount;
|
||||
for (int32_t r = 0; r < rCount; r += 1) {
|
||||
fCSRecognizers[r] = tempArray[r];
|
||||
|
@ -132,7 +147,8 @@ void CharsetDetector::setRecognizers(UErrorCode &status)
|
|||
|
||||
CharsetDetector::CharsetDetector(UErrorCode &status)
|
||||
: textIn(new InputText(status)), resultArray(NULL),
|
||||
resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
|
||||
resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
|
||||
fEnabledRecognizers(NULL)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
|
@ -170,6 +186,10 @@ CharsetDetector::~CharsetDetector()
|
|||
}
|
||||
|
||||
uprv_free(resultArray);
|
||||
|
||||
if (fEnabledRecognizers) {
|
||||
uprv_free(fEnabledRecognizers);
|
||||
}
|
||||
}
|
||||
|
||||
void CharsetDetector::setText(const char *in, int32_t len)
|
||||
|
@ -234,7 +254,7 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
|
|||
// give a match quality > 0.
|
||||
resultCount = 0;
|
||||
for (i = 0; i < fCSRecognizers_size; i += 1) {
|
||||
csr = fCSRecognizers[i];
|
||||
csr = fCSRecognizers[i]->recognizer;
|
||||
if (csr->match(textIn, resultArray[resultCount])) {
|
||||
resultCount++;
|
||||
}
|
||||
|
@ -251,6 +271,46 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
|
|||
return resultArray;
|
||||
}
|
||||
|
||||
void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t modIdx = -1;
|
||||
UBool isDefaultVal = FALSE;
|
||||
for (int32_t i = 0; i < fCSRecognizers_size; i++) {
|
||||
CSRecognizerInfo *csrinfo = fCSRecognizers[i];
|
||||
if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
|
||||
modIdx = i;
|
||||
isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (modIdx < 0) {
|
||||
// No matching encoding found
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
if (fEnabledRecognizers == NULL && !isDefaultVal) {
|
||||
// Create an array storing the non default setting
|
||||
fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
|
||||
if (fEnabledRecognizers == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
// Initialize the array with default info
|
||||
for (int32_t i = 0; i < fCSRecognizers_size; i++) {
|
||||
fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
|
||||
}
|
||||
}
|
||||
|
||||
if (fEnabledRecognizers != NULL) {
|
||||
fEnabledRecognizers[modIdx] = enabled;
|
||||
}
|
||||
}
|
||||
|
||||
/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
|
||||
{
|
||||
if( index > fCSRecognizers_size-1 || index < 0) {
|
||||
|
@ -267,6 +327,8 @@ U_NAMESPACE_END
|
|||
U_CDECL_BEGIN
|
||||
typedef struct {
|
||||
int32_t currIndex;
|
||||
UBool all;
|
||||
UBool *enabledRecognizers;
|
||||
} Context;
|
||||
|
||||
|
||||
|
@ -281,27 +343,73 @@ enumClose(UEnumeration *en) {
|
|||
}
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
enumCount(UEnumeration *, UErrorCode *) {
|
||||
return fCSRecognizers_size;
|
||||
enumCount(UEnumeration *en, UErrorCode *) {
|
||||
if (((Context *)en->context)->all) {
|
||||
// ucsdet_getAllDetectableCharsets, all charset detector names
|
||||
return fCSRecognizers_size;
|
||||
}
|
||||
|
||||
// Otherwise, ucsdet_getDetectableCharsets - only enabled ones
|
||||
int32_t count = 0;
|
||||
UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
|
||||
if (enabledArray != NULL) {
|
||||
// custom set
|
||||
for (int32_t i = 0; i < fCSRecognizers_size; i++) {
|
||||
if (enabledArray[i]) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// default set
|
||||
for (int32_t i = 0; i < fCSRecognizers_size; i++) {
|
||||
if (fCSRecognizers[i]->isDefaultEnabled) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
static const char* U_CALLCONV
|
||||
enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
|
||||
if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
|
||||
if(resultLength != NULL) {
|
||||
*resultLength = 0;
|
||||
const char *currName = NULL;
|
||||
|
||||
if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
|
||||
if (((Context *)en->context)->all) {
|
||||
// ucsdet_getAllDetectableCharsets, all charset detector names
|
||||
currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
|
||||
((Context *)en->context)->currIndex++;
|
||||
} else {
|
||||
// ucsdet_getDetectableCharsets
|
||||
UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
|
||||
if (enabledArray != NULL) {
|
||||
// custom set
|
||||
while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
|
||||
if (enabledArray[((Context *)en->context)->currIndex]) {
|
||||
currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
|
||||
}
|
||||
((Context *)en->context)->currIndex++;
|
||||
}
|
||||
} else {
|
||||
// default set
|
||||
while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
|
||||
if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
|
||||
currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
|
||||
}
|
||||
((Context *)en->context)->currIndex++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
|
||||
|
||||
if(resultLength != NULL) {
|
||||
*resultLength = (int32_t)uprv_strlen(currName);
|
||||
*resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
|
||||
}
|
||||
((Context *)en->context)->currIndex++;
|
||||
|
||||
return currName;
|
||||
}
|
||||
|
||||
|
||||
static void U_CALLCONV
|
||||
enumReset(UEnumeration *en, UErrorCode *) {
|
||||
((Context *)en->context)->currIndex = 0;
|
||||
|
@ -317,25 +425,61 @@ static const UEnumeration gCSDetEnumeration = {
|
|||
enumReset
|
||||
};
|
||||
|
||||
U_CAPI UEnumeration * U_EXPORT2
|
||||
ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
|
||||
{
|
||||
U_NAMESPACE_USE
|
||||
U_CDECL_END
|
||||
|
||||
if(U_FAILURE(*status)) {
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
|
||||
{
|
||||
|
||||
/* Initialize recognized charsets. */
|
||||
setRecognizers(status);
|
||||
|
||||
if(U_FAILURE(status)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Initialize recognized charsets. */
|
||||
CharsetDetector::getDetectableCount();
|
||||
|
||||
UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
|
||||
if (en == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
|
||||
en->context = (void*)NEW_ARRAY(Context, 1);
|
||||
if (en->context == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
DELETE_ARRAY(en);
|
||||
return 0;
|
||||
}
|
||||
uprv_memset(en->context, 0, sizeof(Context));
|
||||
((Context*)en->context)->all = TRUE;
|
||||
return en;
|
||||
}
|
||||
U_CDECL_END
|
||||
|
||||
#endif
|
||||
/**
 * Build an enumeration over the names of the charsets that are enabled for
 * this detector.
 *
 * The enumeration is a copy of gCSDetEnumeration whose context is marked as
 * "not all" and points at this detector's fEnabledRecognizers array (which
 * may be NULL, meaning the default set applies).
 *
 * NOTE(review): the context stores the detector's array pointer rather than
 * a copy, and the detector's destructor frees that array, so the enumeration
 * appears to rely on the detector outliving it -- confirm with callers.
 *
 * @param status  in/out error code; set to U_MEMORY_ALLOCATION_ERROR when
 *                either allocation fails.
 * @return the new enumeration, or 0 on failure.
 */
UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
{
    if (U_FAILURE(status)) {
        return 0;
    }

    UEnumeration *result = NEW_ARRAY(UEnumeration, 1);
    if (result == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    // Start from the shared template that carries the callback table.
    memcpy(result, &gCSDetEnumeration, sizeof(UEnumeration));

    Context *state = NEW_ARRAY(Context, 1);
    result->context = (void*)state;
    if (state == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        DELETE_ARRAY(result);
        return 0;
    }

    // Zero everything first so that iteration starts at index 0.
    uprv_memset(state, 0, sizeof(Context));
    state->all = FALSE;
    state->enabledRecognizers = fEnabledRecognizers;
    return result;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2006, International Business Machines
|
||||
* Copyright (C) 2005-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -28,6 +28,10 @@ private:
|
|||
UBool fFreshTextSet;
|
||||
static void setRecognizers(UErrorCode &status);
|
||||
|
||||
UBool *fEnabledRecognizers; // If not null, active set of charset recognizers had
|
||||
// been changed from the default. The array index is
|
||||
// corresponding to fCSRecognizers. See setDetectableCharset().
|
||||
|
||||
public:
|
||||
CharsetDetector(UErrorCode &status);
|
||||
|
||||
|
@ -47,7 +51,12 @@ public:
|
|||
|
||||
// const char *getCharsetName(int32_t index, UErrorCode& status) const;
|
||||
|
||||
static int32_t getDetectableCount();
|
||||
static int32_t getDetectableCount();
|
||||
|
||||
|
||||
static UEnumeration * getAllDetectableCharsets(UErrorCode &status);
|
||||
UEnumeration * getDetectableCharsets(UErrorCode &status) const;
|
||||
void setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
********************************************************************************
|
||||
* Copyright (C) 2005-2007, International Business Machines
|
||||
* Copyright (C) 2005-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
********************************************************************************
|
||||
*/
|
||||
|
@ -11,6 +11,11 @@
|
|||
#include "unicode/ucsdet.h"
|
||||
#include "csdetect.h"
|
||||
#include "csmatch.h"
|
||||
#include "csrsbcs.h"
|
||||
#include "csrmbcs.h"
|
||||
#include "csrutf8.h"
|
||||
#include "csrucode.h"
|
||||
#include "csr2022.h"
|
||||
|
||||
#include "cmemory.h"
|
||||
|
||||
|
@ -175,6 +180,26 @@ ucsdet_getUChars(const UCharsetMatch *ucsm,
|
|||
|
||||
return ((CharsetMatch *) ucsm)->getUChars(buf, cap, status);
|
||||
}
|
||||
|
||||
/**
 * C API entry point: enable or disable one charset on the given detector.
 *
 * Forwards to CharsetDetector::setDetectableCharset() after validating the
 * arguments that are dereferenced here.
 *
 * @param ucsd      the charset detector; NULL yields U_ILLEGAL_ARGUMENT_ERROR.
 * @param encoding  name of the charset to change.
 * @param enabled   TRUE to enable, FALSE to disable.
 * @param status    in/out error code; nothing is done if it is NULL or
 *                  already holds a failure.
 */
U_CAPI void U_EXPORT2
ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status)
{
    if (status == NULL || U_FAILURE(*status)) {
        return;
    }

    // The detector pointer is dereferenced below; reject NULL instead of crashing.
    if (ucsd == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    ((CharsetDetector *)ucsd)->setDetectableCharset(encoding, enabled, *status);
}
|
||||
|
||||
/**
 * C API entry point: enumerate the names of every charset recognizer.
 *
 * The detector argument is unused; the work is done by the static
 * CharsetDetector::getAllDetectableCharsets().
 *
 * @param status  in/out error code, passed through to the C++ implementation.
 * @return the enumeration produced by the C++ implementation.
 */
U_CAPI UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
{
    UErrorCode &errorCode = *status;
    UEnumeration *allCharsets = CharsetDetector::getAllDetectableCharsets(errorCode);
    return allCharsets;
}
|
||||
|
||||
/**
 * C API entry point: enumerate the names of the charsets enabled on the
 * given detector.
 *
 * Forwards to CharsetDetector::getDetectableCharsets(), which is a const
 * member function, so the detector is accessed through a const pointer.
 *
 * @param ucsd    the charset detector; NULL yields U_ILLEGAL_ARGUMENT_ERROR.
 * @param status  in/out error code; NULL is returned if it is NULL or
 *                already holds a failure.
 * @return the enumeration, or NULL on error.
 */
U_DRAFT UEnumeration * U_EXPORT2
ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status)
{
    if (status == NULL || U_FAILURE(*status)) {
        return NULL;
    }

    // The detector pointer is dereferenced below; reject NULL instead of crashing.
    if (ucsd == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    // Keep the const qualifier: the callee does not modify the detector.
    return reinterpret_cast<const CharsetDetector *>(ucsd)->getDetectableCharsets(*status);
}
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2010, International Business Machines
|
||||
* Copyright (C) 2005-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: ucsdet.h
|
||||
|
@ -321,12 +321,21 @@ ucsdet_getUChars(const UCharsetMatch *ucsm,
|
|||
* The returned UEnumeration provides access to the names of
|
||||
* the charsets.
|
||||
*
|
||||
* <p>
|
||||
* The state of the Charset detector that is passed in does not
|
||||
* affect the result of this function, but requiring a valid, open
|
||||
* charset detector as a parameter ensures that the charset detection
|
||||
* service has been safely initialized and that the required detection
|
||||
* data is available.
|
||||
*
|
||||
* <p>
|
||||
* <b>Note:</b> Multiple different charset encodings in the same family may use
|
||||
* a single shared name in this implementation. For example, this method returns
|
||||
* an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
|
||||
* (Windows Latin 1). However, actual detection result could be "windows-1252"
|
||||
* when the input data matches Latin 1 code points with any points only available
|
||||
* in "windows-1252".
|
||||
*
|
||||
* @param ucsd a Charset detector.
|
||||
* @param status Any error conditions are reported back in this variable.
|
||||
* @return an iterator providing access to the detectable charset names.
|
||||
|
@ -335,7 +344,6 @@ ucsdet_getUChars(const UCharsetMatch *ucsm,
|
|||
U_STABLE UEnumeration * U_EXPORT2
|
||||
ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* Test whether input filtering is enabled for this charset detector.
|
||||
* Input filtering removes text that appears to be HTML or xml
|
||||
|
@ -346,6 +354,7 @@ ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *statu
|
|||
* @return TRUE if filtering is enabled.
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
|
||||
U_STABLE UBool U_EXPORT2
|
||||
ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
|
||||
|
||||
|
@ -364,6 +373,39 @@ ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
|
|||
U_STABLE UBool U_EXPORT2
|
||||
ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
|
||||
|
||||
|
||||
/**
|
||||
* Get an iterator over the set of detectable charsets -
|
||||
* over the charsets that are enabled by the specified charset detector.
|
||||
*
|
||||
* The returned UEnumeration provides access to the names of
|
||||
* the charsets.
|
||||
*
|
||||
* @param ucsd a Charset detector.
|
||||
* @param status Any error conditions are reported back in this variable.
|
||||
* @return an iterator providing access to the detectable charset names by
|
||||
* the specified charset detector.
|
||||
* @internal
|
||||
*/
|
||||
U_DRAFT UEnumeration * U_EXPORT2
|
||||
ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Enable or disable individual charset encoding.
|
||||
* A name of charset encoding must be included in the names returned by
|
||||
* ucsdet_getAllDetectableCharsets().
|
||||
*
|
||||
* @param ucsd a Charset detector.
|
||||
* @param encoding the name of charset encoding.
|
||||
* @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the
|
||||
* charset encoding.
|
||||
* @param status receives the return status. When the name of charset encoding
|
||||
* is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
|
||||
* @internal
|
||||
*/
|
||||
U_DRAFT void U_EXPORT2
|
||||
ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
|
||||
|
||||
#endif
|
||||
#endif /* __UCSDET_H */
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2012, International Business Machines
|
||||
* Copyright (C) 2005-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -275,6 +275,45 @@ void CharsetDetectionTest::ConstructionTest()
|
|||
printf("%s\n", name);
|
||||
#endif
|
||||
}
|
||||
|
||||
const char* defDisabled[] = {
|
||||
"IBM420_rtl", "IBM420_ltr",
|
||||
"IBM424_rtl", "IBM424_ltr",
|
||||
0
|
||||
};
|
||||
|
||||
LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
|
||||
const char *activeName = NULL;
|
||||
|
||||
while (activeName = uenum_next(eActive.getAlias(), NULL, status)) {
|
||||
// the charset must be included in all list
|
||||
UBool found = FALSE;
|
||||
|
||||
const char *name = NULL;
|
||||
uenum_reset(e.getAlias(), status);
|
||||
while (name = uenum_next(e.getAlias(), NULL, status)) {
|
||||
if (strcmp(activeName, name) == 0) {
|
||||
found = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
errln(UnicodeString(activeName) + " is not included in the all charset list.");
|
||||
}
|
||||
|
||||
// some charsets are disabled by default
|
||||
found = FALSE;
|
||||
for (int32_t i = 0; defDisabled[i] != 0; i++) {
|
||||
if (strcmp(activeName, defDisabled[i]) == 0) {
|
||||
found = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (found) {
|
||||
errln(UnicodeString(activeName) + " should not be included in the default charset list.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CharsetDetectionTest::UTF8Test()
|
||||
|
@ -597,6 +636,10 @@ void CharsetDetectionTest::IBM424Test()
|
|||
char *bytes_r = extractBytes(s2, "IBM424", brLength);
|
||||
|
||||
UCharsetDetector *csd = ucsdet_open(&status);
|
||||
ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
|
||||
ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
|
||||
ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
|
||||
ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Error opening charset detector. - %s", u_errorName(status));
|
||||
}
|
||||
|
@ -684,6 +727,10 @@ void CharsetDetectionTest::IBM420Test()
|
|||
if (U_FAILURE(status)) {
|
||||
errln("Error opening charset detector. - %s", u_errorName(status));
|
||||
}
|
||||
ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
|
||||
ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
|
||||
ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
|
||||
ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
|
||||
const UCharsetMatch *match;
|
||||
const char *name;
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue