ICU-10012 Disable EBCDIC Arabic/Hebrew detectors by default. Added APIs to enable/disable each charset and get currently active charsets.

X-SVN-Rev: 34350
This commit is contained in:
Yoshito Umaoka 2013-09-17 06:57:53 +00:00
parent 7b6fa12efe
commit 920dadff8d
5 changed files with 326 additions and 59 deletions

View file

@ -32,7 +32,21 @@
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
static icu::CharsetRecognizer **fCSRecognizers = NULL;
U_NAMESPACE_BEGIN
/**
 * Registry entry pairing a charset recognizer with the flag that says
 * whether the recognizer is part of the default (enabled) set.
 *
 * The entry owns the recognizer: it is deleted when the entry is destroyed.
 * Because of that ownership the entry must never be copied (a copy would
 * delete the same recognizer twice), so the copy operations are declared
 * private and intentionally left unimplemented.
 */
struct CSRecognizerInfo : public UMemory {
    /**
     * @param recognizer       recognizer to adopt; deleted by the destructor.
     * @param isDefaultEnabled TRUE if the recognizer is enabled by default.
     */
    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
        : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}

    ~CSRecognizerInfo() { delete recognizer; }

    CharsetRecognizer *recognizer;   // owned
    UBool isDefaultEnabled;

private:
    // Not copyable: the destructor deletes `recognizer`.
    CSRecognizerInfo(const CSRecognizerInfo &other);
    CSRecognizerInfo &operator=(const CSRecognizerInfo &other);
};
U_NAMESPACE_END
static icu::CSRecognizerInfo **fCSRecognizers = NULL;
static icu::UInitOnce gCSRecognizersInitOnce;
static int32_t fCSRecognizers_size = 0;
@ -70,47 +84,48 @@ charsetMatchComparator(const void * /*context*/, const void *left, const void *r
static void U_CALLCONV initRecognizers(UErrorCode &status) {
U_NAMESPACE_USE
ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
CharsetRecognizer *tempArray[] = {
new CharsetRecog_UTF8(),
CSRecognizerInfo *tempArray[] = {
new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
new CharsetRecog_UTF_16_BE(),
new CharsetRecog_UTF_16_LE(),
new CharsetRecog_UTF_32_BE(),
new CharsetRecog_UTF_32_LE(),
new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
new CharsetRecog_8859_1(),
new CharsetRecog_8859_2(),
new CharsetRecog_8859_5_ru(),
new CharsetRecog_8859_6_ar(),
new CharsetRecog_8859_7_el(),
new CharsetRecog_8859_8_I_he(),
new CharsetRecog_8859_8_he(),
new CharsetRecog_windows_1251(),
new CharsetRecog_windows_1256(),
new CharsetRecog_KOI8_R(),
new CharsetRecog_8859_9_tr(),
new CharsetRecog_sjis(),
new CharsetRecog_gb_18030(),
new CharsetRecog_euc_jp(),
new CharsetRecog_euc_kr(),
new CharsetRecog_big5(),
new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
new CharsetRecog_2022JP(),
new CharsetRecog_2022KR(),
new CharsetRecog_2022CN(),
new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
new CharsetRecog_IBM424_he_rtl(),
new CharsetRecog_IBM424_he_ltr(),
new CharsetRecog_IBM420_ar_rtl(),
new CharsetRecog_IBM420_ar_ltr()
new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
};
int32_t rCount = ARRAY_SIZE(tempArray);
fCSRecognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
if (fCSRecognizers == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
} else {
}
else {
fCSRecognizers_size = rCount;
for (int32_t r = 0; r < rCount; r += 1) {
fCSRecognizers[r] = tempArray[r];
@ -132,7 +147,8 @@ void CharsetDetector::setRecognizers(UErrorCode &status)
CharsetDetector::CharsetDetector(UErrorCode &status)
: textIn(new InputText(status)), resultArray(NULL),
resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
fEnabledRecognizers(NULL)
{
if (U_FAILURE(status)) {
return;
@ -170,6 +186,10 @@ CharsetDetector::~CharsetDetector()
}
uprv_free(resultArray);
if (fEnabledRecognizers) {
uprv_free(fEnabledRecognizers);
}
}
void CharsetDetector::setText(const char *in, int32_t len)
@ -234,7 +254,7 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
// give a match quality > 0.
resultCount = 0;
for (i = 0; i < fCSRecognizers_size; i += 1) {
csr = fCSRecognizers[i];
csr = fCSRecognizers[i]->recognizer;
if (csr->match(textIn, resultArray[resultCount])) {
resultCount++;
}
@ -251,6 +271,46 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
return resultArray;
}
/**
 * Enable or disable a single charset recognizer for this detector instance.
 *
 * The per-detector override array (fEnabledRecognizers) is created lazily:
 * as long as every request matches the recognizer's default state, no array
 * is allocated and the defaults recorded in fCSRecognizers stay in effect.
 *
 * @param encoding charset name, compared against each recognizer's getName().
 * @param enabled  TRUE to enable the charset, FALSE to disable it.
 * @param status   in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR when
 *                 encoding is NULL or names no known recognizer, and to
 *                 U_MEMORY_ALLOCATION_ERROR when the override array cannot
 *                 be allocated. Nothing is done if it already holds a failure.
 */
void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return;
    }

    if (encoding == NULL) {
        // uprv_strcmp() below must never be handed a NULL name.
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    int32_t modIdx = -1;
    UBool isDefaultVal = FALSE;   // TRUE when the requested state equals the default state
    for (int32_t i = 0; i < fCSRecognizers_size; i++) {
        CSRecognizerInfo *csrinfo = fCSRecognizers[i];
        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
            modIdx = i;
            isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
            break;
        }
    }
    if (modIdx < 0) {
        // No matching encoding found
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if (fEnabledRecognizers == NULL && !isDefaultVal) {
        // Create an array storing the non default setting
        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
        if (fEnabledRecognizers == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        // Initialize the array with default info
        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
        }
    }

    // Once an override array exists, every request is recorded in it,
    // including requests that restore a charset to its default state.
    if (fEnabledRecognizers != NULL) {
        fEnabledRecognizers[modIdx] = enabled;
    }
}
/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
{
if( index > fCSRecognizers_size-1 || index < 0) {
@ -267,6 +327,8 @@ U_NAMESPACE_END
U_CDECL_BEGIN
typedef struct {
int32_t currIndex;
UBool all;
UBool *enabledRecognizers;
} Context;
@ -281,27 +343,73 @@ enumClose(UEnumeration *en) {
}
static int32_t U_CALLCONV
enumCount(UEnumeration *, UErrorCode *) {
return fCSRecognizers_size;
enumCount(UEnumeration *en, UErrorCode *) {
if (((Context *)en->context)->all) {
// ucsdet_getAllDetectableCharsets, all charset detector names
return fCSRecognizers_size;
}
// Otherwise, ucsdet_getDetectableCharsets - only enabled ones
int32_t count = 0;
UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
if (enabledArray != NULL) {
// custom set
for (int32_t i = 0; i < fCSRecognizers_size; i++) {
if (enabledArray[i]) {
count++;
}
}
} else {
// default set
for (int32_t i = 0; i < fCSRecognizers_size; i++) {
if (fCSRecognizers[i]->isDefaultEnabled) {
count++;
}
}
}
return count;
}
static const char* U_CALLCONV
enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
if(resultLength != NULL) {
*resultLength = 0;
const char *currName = NULL;
if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
if (((Context *)en->context)->all) {
// ucsdet_getAllDetectableCharsets, all charset detector names
currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
((Context *)en->context)->currIndex++;
} else {
// ucsdet_getDetectableCharsets
UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
if (enabledArray != NULL) {
// custom set
while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
if (enabledArray[((Context *)en->context)->currIndex]) {
currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
}
((Context *)en->context)->currIndex++;
}
} else {
// default set
while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
}
((Context *)en->context)->currIndex++;
}
}
}
return NULL;
}
const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
if(resultLength != NULL) {
*resultLength = (int32_t)uprv_strlen(currName);
*resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
}
((Context *)en->context)->currIndex++;
return currName;
}
static void U_CALLCONV
enumReset(UEnumeration *en, UErrorCode *) {
((Context *)en->context)->currIndex = 0;
@ -317,25 +425,61 @@ static const UEnumeration gCSDetEnumeration = {
enumReset
};
U_CAPI UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
{
U_NAMESPACE_USE
U_CDECL_END
if(U_FAILURE(*status)) {
U_NAMESPACE_BEGIN
UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
{
/* Initialize recognized charsets. */
setRecognizers(status);
if(U_FAILURE(status)) {
return 0;
}
/* Initialize recognized charsets. */
CharsetDetector::getDetectableCount();
UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
if (en == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
en->context = (void*)NEW_ARRAY(Context, 1);
if (en->context == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
DELETE_ARRAY(en);
return 0;
}
uprv_memset(en->context, 0, sizeof(Context));
((Context*)en->context)->all = TRUE;
return en;
}
U_CDECL_END
#endif
/**
 * Create an enumeration over the names of the charsets that are enabled
 * for this detector.
 *
 * The enumeration's context is marked as "not all" and records this
 * detector's fEnabledRecognizers pointer (NULL means the defaults apply).
 *
 * NOTE(review): the context stores the pointer itself, not a copy of the
 * array, and the destructor frees fEnabledRecognizers - so the enumeration
 * presumably must not be used after the detector is destroyed; confirm
 * against callers.
 *
 * @param status in/out error code; U_MEMORY_ALLOCATION_ERROR if either
 *               allocation fails.
 * @return the new enumeration, or 0 on failure.
 */
UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
{
    if (U_FAILURE(status)) {
        return 0;
    }

    UEnumeration *result = NEW_ARRAY(UEnumeration, 1);
    if (result == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    Context *ctx = NEW_ARRAY(Context, 1);
    if (ctx == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        DELETE_ARRAY(result);
        return 0;
    }

    // Start from a zeroed context (currIndex == 0), then fill in the filter.
    uprv_memset(ctx, 0, sizeof(Context));
    ctx->all = FALSE;
    ctx->enabledRecognizers = fEnabledRecognizers;

    // Clone the enumeration template and attach the context to it.
    memcpy(result, &gCSDetEnumeration, sizeof(UEnumeration));
    result->context = (void *)ctx;
    return result;
}
U_NAMESPACE_END
#endif

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2005-2006, International Business Machines
* Copyright (C) 2005-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -28,6 +28,10 @@ private:
UBool fFreshTextSet;
static void setRecognizers(UErrorCode &status);
UBool *fEnabledRecognizers; // If not null, the active set of charset recognizers has
// been changed from the default. Each array index
// corresponds to the same index in fCSRecognizers. See setDetectableCharset().
public:
CharsetDetector(UErrorCode &status);
@ -47,7 +51,12 @@ public:
// const char *getCharsetName(int32_t index, UErrorCode& status) const;
static int32_t getDetectableCount();
static int32_t getDetectableCount();
static UEnumeration * getAllDetectableCharsets(UErrorCode &status);
UEnumeration * getDetectableCharsets(UErrorCode &status) const;
void setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status);
};
U_NAMESPACE_END

View file

@ -1,6 +1,6 @@
/*
********************************************************************************
* Copyright (C) 2005-2007, International Business Machines
* Copyright (C) 2005-2013, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*/
@ -11,6 +11,11 @@
#include "unicode/ucsdet.h"
#include "csdetect.h"
#include "csmatch.h"
#include "csrsbcs.h"
#include "csrmbcs.h"
#include "csrutf8.h"
#include "csrucode.h"
#include "csr2022.h"
#include "cmemory.h"
@ -175,6 +180,26 @@ ucsdet_getUChars(const UCharsetMatch *ucsm,
return ((CharsetMatch *) ucsm)->getUChars(buf, cap, status);
}
/**
 * C API wrapper: enable or disable one charset on the given detector.
 *
 * Does nothing when status is NULL or already holds a failure. A NULL
 * detector is reported as U_ILLEGAL_ARGUMENT_ERROR instead of being
 * dereferenced.
 */
U_CAPI void U_EXPORT2
ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status)
{
    if (status == NULL || U_FAILURE(*status)) {
        return;
    }

    if (ucsd == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    ((CharsetDetector *)ucsd)->setDetectableCharset(encoding, enabled, *status);
}
/**
 * C API wrapper: enumerate the names of every charset recognizer.
 * The detector argument is not used; the work is delegated to the static
 * CharsetDetector::getAllDetectableCharsets().
 */
U_CAPI UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
{
    UEnumeration *allNames = CharsetDetector::getAllDetectableCharsets(*status);
    return allNames;
}
/**
 * C API wrapper: enumerate the names of the charsets enabled on the given
 * detector.
 *
 * Returns NULL when status is NULL or already holds a failure. A NULL
 * detector is reported as U_ILLEGAL_ARGUMENT_ERROR instead of being
 * dereferenced.
 */
U_DRAFT UEnumeration * U_EXPORT2
ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status)
{
    if (status == NULL || U_FAILURE(*status)) {
        return NULL;
    }

    if (ucsd == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    // getDetectableCharsets() is a const member, so constness is preserved.
    return ((const CharsetDetector *)ucsd)->getDetectableCharsets(*status);
}
U_CDECL_END
#endif

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2005-2010, International Business Machines
* Copyright (C) 2005-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucsdet.h
@ -321,12 +321,21 @@ ucsdet_getUChars(const UCharsetMatch *ucsm,
* The returned UEnumeration provides access to the names of
* the charsets.
*
* <p>
* The state of the Charset detector that is passed in does not
* affect the result of this function, but requiring a valid, open
* charset detector as a parameter ensures that the charset detection
* service has been safely initialized and that the required detection
* data is available.
*
* <p>
* <b>Note:</b> Multiple different charset encodings in the same family may use
* a single shared name in this implementation. For example, this function returns
* an enumeration including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
* (Windows Latin 1). However, the actual detection result could be "windows-1252"
* when the input data matches Latin 1 code points and also contains code points
* that are only available in "windows-1252".
*
* @param ucsd a Charset detector.
* @param status Any error conditions are reported back in this variable.
* @return an iterator providing access to the detectable charset names.
@ -335,7 +344,6 @@ ucsdet_getUChars(const UCharsetMatch *ucsm,
U_STABLE UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
/**
* Test whether input filtering is enabled for this charset detector.
* Input filtering removes text that appears to be HTML or xml
@ -346,6 +354,7 @@ ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *statu
* @return TRUE if filtering is enabled.
* @stable ICU 3.6
*/
U_STABLE UBool U_EXPORT2
ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
@ -364,6 +373,39 @@ ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
U_STABLE UBool U_EXPORT2
ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
/**
* Get an iterator over the set of detectable charsets -
* over the charsets that are enabled by the specified charset detector.
*
* The returned UEnumeration provides access to the names of
* the charsets.
*
* @param ucsd a Charset detector.
* @param status Any error conditions are reported back in this variable.
* @return an iterator providing access to the detectable charset names by
* the specified charset detector.
* @internal
*/
U_DRAFT UEnumeration * U_EXPORT2
ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
/**
* Enable or disable individual charset encoding.
* The name of the charset encoding must be one of the names returned by
* ucsdet_getAllDetectableCharsets().
*
* @param ucsd a Charset detector.
* @param encoding the name of the charset encoding.
* @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the
* charset encoding.
* @param status receives the return status. When the name of charset encoding
* is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
* @internal
*/
U_DRAFT void U_EXPORT2
ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
#endif
#endif /* __UCSDET_H */

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2005-2012, International Business Machines
* Copyright (C) 2005-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -275,6 +275,45 @@ void CharsetDetectionTest::ConstructionTest()
printf("%s\n", name);
#endif
}
const char* defDisabled[] = {
"IBM420_rtl", "IBM420_ltr",
"IBM424_rtl", "IBM424_ltr",
0
};
LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
const char *activeName = NULL;
while (activeName = uenum_next(eActive.getAlias(), NULL, status)) {
// the charset must be included in all list
UBool found = FALSE;
const char *name = NULL;
uenum_reset(e.getAlias(), status);
while (name = uenum_next(e.getAlias(), NULL, status)) {
if (strcmp(activeName, name) == 0) {
found = TRUE;
break;
}
}
if (!found) {
errln(UnicodeString(activeName) + " is not included in the all charset list.");
}
// some charsets are disabled by default
found = FALSE;
for (int32_t i = 0; defDisabled[i] != 0; i++) {
if (strcmp(activeName, defDisabled[i]) == 0) {
found = TRUE;
break;
}
}
if (found) {
errln(UnicodeString(activeName) + " should not be included in the default charset list.");
}
}
}
void CharsetDetectionTest::UTF8Test()
@ -597,6 +636,10 @@ void CharsetDetectionTest::IBM424Test()
char *bytes_r = extractBytes(s2, "IBM424", brLength);
UCharsetDetector *csd = ucsdet_open(&status);
ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
if (U_FAILURE(status)) {
errln("Error opening charset detector. - %s", u_errorName(status));
}
@ -684,6 +727,10 @@ void CharsetDetectionTest::IBM420Test()
if (U_FAILURE(status)) {
errln("Error opening charset detector. - %s", u_errorName(status));
}
ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
const UCharsetMatch *match;
const char *name;