mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-6394 charset detection returns duplicate charsets
X-SVN-Rev: 25909
This commit is contained in:
parent
fe7d64b5b5
commit
b215e528d6
3 changed files with 76 additions and 10 deletions
|
@ -289,16 +289,32 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
|
|||
}
|
||||
|
||||
uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
|
||||
////Bubble sort
|
||||
//for(int32_t i = resultCount; i > 1; i -= 1) {
|
||||
// for(int32_t j = 0; j < i-1; j += 1) {
|
||||
// if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
|
||||
// CharsetMatch *temp = resultArray[j];
|
||||
// resultArray[j] = resultArray[j+1];
|
||||
// resultArray[j+1] = temp;
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
|
||||
// Remove duplicate charsets from the results.
|
||||
// Simple minded, brute force approach - check each entry against all that follow.
|
||||
// The first entry of any duplicated set is the one that should be kept because it will
|
||||
// be the one with the highest confidence rating.
|
||||
// (Duplicate matches have different languages, only the charset is the same)
|
||||
// Because the resultArray contains preallocated CharsetMatch objects, they aren't actually
|
||||
// deleted, just reordered, with the unwanted duplicates placed after the good results.
|
||||
int32_t j, k;
|
||||
for (i=0; i<resultCount; i++) {
|
||||
const char *charSetName = resultArray[i]->getName();
|
||||
for (j=i+1; j<resultCount; ) {
|
||||
if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {
|
||||
// Not a duplicate.
|
||||
j++;
|
||||
} else {
|
||||
// Duplicate entry at index j.
|
||||
CharsetMatch *duplicate = resultArray[j];
|
||||
for (k=j; k<resultCount-1; k++) {
|
||||
resultArray[k] = resultArray[k+1];
|
||||
}
|
||||
resultCount--;
|
||||
resultArray[resultCount] = duplicate;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fFreshTextSet = FALSE;
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include "unicode/ucnv.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
||||
#include "intltest.h"
|
||||
#include "csdetest.h"
|
||||
|
@ -84,6 +85,10 @@ void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char
|
|||
if (exec) IBM420Test();
|
||||
break;
|
||||
|
||||
case 8: name = "Ticket6394Test";
|
||||
if (exec) Ticket6394Test();
|
||||
break;
|
||||
|
||||
default: name = "";
|
||||
break; //needed to end loop
|
||||
}
|
||||
|
@ -692,3 +697,47 @@ bail:
|
|||
freeBytes(bytes_r);
|
||||
ucsdet_close(csd);
|
||||
}
|
||||
|
||||
|
||||
void CharsetDetectionTest::Ticket6394Test() {
|
||||
#if !UCONFIG_NO_CONVERSION
|
||||
const char charText[] = "Here is some random English text that should be detected as ISO-8859-1."
|
||||
"Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
|
||||
"encodings more than once. The hop through UnicodeString is for platforms "
|
||||
"where this char * string is be EBCDIC and needs conversion to Latin1.";
|
||||
char latin1Text[sizeof(charText)];
|
||||
UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UCharsetDetector *csd = ucsdet_open(&status);
|
||||
ucsdet_setText(csd, latin1Text, -1, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t matchCount = 0;
|
||||
const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
|
||||
return;
|
||||
}
|
||||
|
||||
UnicodeSet setOfCharsetNames; // UnicodSets can hold strings.
|
||||
int32_t i;
|
||||
for (i=0; i<matchCount; i++) {
|
||||
UnicodeString charSetName(ucsdet_getName(matches[i], &status));
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i);
|
||||
status = U_ZERO_ERROR;
|
||||
}
|
||||
if (setOfCharsetNames.contains(charSetName)) {
|
||||
errln("Fail at file %s, line %d ", __FILE__, __LINE__);
|
||||
errln(UnicodeString(" Duplicate charset name = ") + charSetName);
|
||||
}
|
||||
setOfCharsetNames.add(charSetName);
|
||||
}
|
||||
ucsdet_close(csd);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -29,6 +29,7 @@ public:
|
|||
virtual void DetectionTest();
|
||||
virtual void IBM424Test();
|
||||
virtual void IBM420Test();
|
||||
virtual void Ticket6394Test();
|
||||
|
||||
private:
|
||||
void checkEncoding(const UnicodeString &testString,
|
||||
|
|
Loading…
Add table
Reference in a new issue