ICU-6394 charset detection returns duplicate charsets

X-SVN-Rev: 25909
2025-04-14 17:24:01 +00:00 · 2009-04-24 22:24:27 +00:00 · 2009-04-24 22:24:27 +00:00 · b215e528d6
commit b215e528d6
parent fe7d64b5b5
3 changed files with 76 additions and 10 deletions
--- a/icu4c/source/i18n/csdetect.cpp
+++ b/icu4c/source/i18n/csdetect.cpp
@ -289,16 +289,32 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
        }

        uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
-        ////Bubble sort
-        //for(int32_t i = resultCount; i > 1; i -= 1) {
-        //    for(int32_t j = 0; j < i-1; j += 1) {
-        //        if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
-        //            CharsetMatch *temp = resultArray[j];
-        //            resultArray[j] = resultArray[j+1];
-        //            resultArray[j+1] = temp;
-        //        }
-        //    }
-        //}
+
+        // Remove duplicate charsets from the results.
+        // Simple minded, brute force approach - check each entry against all that follow.
+        // The first entry of any duplicated set is the one that should be kept because it will
+        // be the one with the highest confidence rating.
+        //   (Duplicate matches have different languages, only the charset is the same)
+        // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually
+        // deleted, just reordered, with the unwanted duplicates placed after the good results.
+        int32_t j, k;
+        for (i=0; i<resultCount; i++) {
+            const char *charSetName = resultArray[i]->getName();
+            for (j=i+1; j<resultCount; ) {
+                if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {
+                    // Not a duplicate.
+                    j++;
+                } else {
+                    // Duplicate entry at index j.  
+                    CharsetMatch *duplicate = resultArray[j];
+                    for (k=j; k<resultCount-1; k++) {
+                        resultArray[k] = resultArray[k+1];
+                    }
+                    resultCount--;
+                    resultArray[resultCount] = duplicate;
+                }
+            }
+        }

        fFreshTextSet = FALSE;
    }
--- a/icu4c/source/test/intltest/csdetest.cpp
+++ b/icu4c/source/test/intltest/csdetest.cpp
@ -11,6 +11,7 @@
 #include "unicode/ucnv.h"
 #include "unicode/unistr.h"
 #include "unicode/putil.h"
+#include "unicode/uniset.h"

 #include "intltest.h"
 #include "csdetest.h"
@ -84,6 +85,10 @@ void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char
            if (exec) IBM420Test();
            break;

+       case 8: name = "Ticket6394Test";
+            if (exec) Ticket6394Test();
+            break;
+
        default: name = "";
            break; //needed to end loop
    }
@ -692,3 +697,47 @@ bail:
    freeBytes(bytes_r);
    ucsdet_close(csd);
 }
+
+
+// Regression test for ICU ticket 6394: ucsdet_detectAll() must not report the
+// same charset name more than once in its array of matches.
+void CharsetDetectionTest::Ticket6394Test() {
+#if !UCONFIG_NO_CONVERSION
+    const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
+                             "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
+                             "encodings more than once.  The hop through UnicodeString is for platforms "
+                             "where this char * string is be EBCDIC and needs conversion to Latin1.";
+    char latin1Text[sizeof(charText)];
+    // NOTE(review): a length of sizeof(charText)-2 omits the final '.' of charText - confirm intended.
+    UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
+
+    // A failed status makes later ICU calls return at once, so one check covers open, setText and detectAll.
+    UErrorCode status = U_ZERO_ERROR;
+    UCharsetDetector *csd = ucsdet_open(&status);
+    ucsdet_setText(csd, latin1Text, -1, &status);
+    int32_t matchCount = 0;
+    const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
+    if (U_FAILURE(status)) {
+        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
+        ucsdet_close(csd);    // Release the detector on the failure path too.
+        return;
+    }
+
+    // Check each reported name against the names already seen, then record it.
+    UnicodeSet  setOfCharsetNames;    // UnicodeSets can hold strings.
+    for (int32_t i=0; i<matchCount; i++) {
+        UnicodeString charSetName(ucsdet_getName(matches[i], &status));
+        if (U_FAILURE(status)) {
+            errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
+            status = U_ZERO_ERROR;
+        }
+        if (setOfCharsetNames.contains(charSetName)) {
+            errln("Fail at file %s, line %d ", __FILE__, __LINE__);
+            errln(UnicodeString("   Duplicate charset name = ") + charSetName);
+        }
+        setOfCharsetNames.add(charSetName);
+    }
+    ucsdet_close(csd);
+#endif
+}
+
--- a/icu4c/source/test/intltest/csdetest.h
+++ b/icu4c/source/test/intltest/csdetest.h
@ -29,6 +29,7 @@ public:
    virtual void DetectionTest();
    virtual void IBM424Test();
    virtual void IBM420Test();
+    virtual void Ticket6394Test();

 private:
    void checkEncoding(const UnicodeString &testString,