ICU-6394 charset detection returns duplicate charsets

X-SVN-Rev: 25909
This commit is contained in:
Andy Heninger 2009-04-24 22:24:27 +00:00
parent fe7d64b5b5
commit b215e528d6
3 changed files with 76 additions and 10 deletions

View file

@ -289,16 +289,32 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
}
uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
////Bubble sort
//for(int32_t i = resultCount; i > 1; i -= 1) {
// for(int32_t j = 0; j < i-1; j += 1) {
// if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
// CharsetMatch *temp = resultArray[j];
// resultArray[j] = resultArray[j+1];
// resultArray[j+1] = temp;
// }
// }
//}
// Remove duplicate charsets from the results.
// Simple-minded, brute-force approach - check each entry against all that follow.
// The first entry of any duplicated set is the one that should be kept because it will
// be the one with the highest confidence rating (the array was sorted just above).
// (Duplicate matches have different languages, only the charset is the same)
// Because the resultArray contains preallocated CharsetMatch objects, they aren't actually
// deleted, just reordered, with the unwanted duplicates placed after the good results.
int32_t j, k;
for (i=0; i<resultCount; i++) {
const char *charSetName = resultArray[i]->getName();
// j is advanced only when entry j is kept; after a removal the same index
// already holds the next candidate, so it must be re-examined.
for (j=i+1; j<resultCount; ) {
if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {
// Not a duplicate.
j++;
} else {
// Duplicate entry at index j.
CharsetMatch *duplicate = resultArray[j];
// Shift the remaining entries down one slot to close the gap.
for (k=j; k<resultCount-1; k++) {
resultArray[k] = resultArray[k+1];
}
// Shrink the count of live results and park the duplicate in the
// slot just past them so the pointer is not lost.
resultCount--;
resultArray[resultCount] = duplicate;
}
}
}
fFreshTextSet = FALSE;
}

View file

@ -11,6 +11,7 @@
#include "unicode/ucnv.h"
#include "unicode/unistr.h"
#include "unicode/putil.h"
#include "unicode/uniset.h"
#include "intltest.h"
#include "csdetest.h"
@ -84,6 +85,10 @@ void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char
if (exec) IBM420Test();
break;
case 8: name = "Ticket6394Test";
if (exec) Ticket6394Test();
break;
default: name = "";
break; //needed to end loop
}
@ -692,3 +697,47 @@ bail:
freeBytes(bytes_r);
ucsdet_close(csd);
}
/**
 * Regression test for ICU ticket 6394: ucsdet_detectAll() must not return the
 * same charset name more than once.
 *
 * Runs detection over a block of Latin-1 English text and reports an error for
 * every charset name that shows up a second time in the returned matches.
 * Compiles to an empty body when UCONFIG_NO_CONVERSION is set.
 */
void CharsetDetectionTest::Ticket6394Test() {
#if !UCONFIG_NO_CONVERSION
    const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
                             "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
                             "encodings more than once.  The hop through UnicodeString is for platforms "
                             "where this char * string is be EBCDIC and needs conversion to Latin1.";
    char latin1Text[sizeof(charText)];
    // NOTE(review): sizeof(charText)-1 is the string length, so a length of
    // sizeof(charText)-2 leaves out the final '.' as well as the NUL - confirm
    // this is intentional.
    UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");

    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector *csd = ucsdet_open(&status);
    ucsdet_setText(csd, latin1Text, -1, &status);
    if (U_FAILURE(status)) {
        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
        // Release the detector on the error path too; it was leaked before.
        ucsdet_close(csd);
        return;
    }

    int32_t matchCount = 0;
    const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
    if (U_FAILURE(status)) {
        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
        // Release the detector on the error path too; it was leaked before.
        ucsdet_close(csd);
        return;
    }

    UnicodeSet  setOfCharsetNames;    // UnicodeSets can hold strings.
    int32_t i;
    for (i=0; i<matchCount; i++) {
        UnicodeString charSetName(ucsdet_getName(matches[i], &status));
        if (U_FAILURE(status)) {
            errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
            // Reset so one bad entry does not mask the checks on later ones.
            status = U_ZERO_ERROR;
        }
        // A name already in the set means detectAll() returned it twice.
        if (setOfCharsetNames.contains(charSetName)) {
            errln("Fail at file %s, line %d ", __FILE__, __LINE__);
            errln(UnicodeString("   Duplicate charset name = ") + charSetName);
        }
        setOfCharsetNames.add(charSetName);
    }
    ucsdet_close(csd);
#endif
}

View file

@ -29,6 +29,7 @@ public:
virtual void DetectionTest();
virtual void IBM424Test();
virtual void IBM420Test();
virtual void Ticket6394Test();
private:
void checkEncoding(const UnicodeString &testString,