ICU-4639 Cleanup, more error checking.

X-SVN-Rev: 19086
2025-04-14 17:24:01 +00:00 · 2006-02-07 21:59:16 +00:00 · 2006-02-07 21:59:16 +00:00 · cb69e53d8d
commit cb69e53d8d
parent 5cb6459f4b
4 changed files with 135 additions and 11 deletions
--- a/icu4c/source/i18n/csdetect.cpp
+++ b/icu4c/source/i18n/csdetect.cpp
@@ -194,7 +194,7 @@ const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
    if(maxMatchesFound > 0) {
        return resultArray[0];
    } else {
-        return 0;
+        return NULL;
    }
 }

--- a/icu4c/source/i18n/csrmbcs.cpp
+++ b/icu4c/source/i18n/csrmbcs.cpp
@@ -158,7 +158,7 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
        totalCharCount += 1;

        if (iter->error) {
-            badCharCount++; 
+            badCharCount += 1; 
        } else {
            if (iter->charValue <= 0xFF) {
                singleByteCharCount += 1;
@@ -206,7 +206,7 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
    if (commonChars == 0) {
        // We have no statistics on frequently occuring characters.
        //  Assess confidence purely on having a reasonable number of
-        //  multi-byte characters (the more the better
+        //  multi-byte characters (the more the better)
        confidence = 30 + doubleByteCharCount - 20*badCharCount;
        if (confidence > 100) {
            confidence = 100;
@@ -215,11 +215,14 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
        //
        // Frequency of occurence statistics exist.
        //
-        double maxVal = log10((float)doubleByteCharCount / 4);
+        double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
        double scaleFactor = 90.0 / maxVal;
        confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
        confidence = min(confidence, 100);
-        if(confidence<0)confidence=0;
+    }
+
+    if (confidence < 0) {
+        confidence = 0;
    }

    return confidence;
--- a/icu4c/source/i18n/ucsdet.cpp
+++ b/icu4c/source/i18n/ucsdet.cpp
@@ -26,7 +26,9 @@ ucsdet_open(UErrorCode   *status)
    if(U_FAILURE(*status)) {
        return 0;
    }
+
    CharsetDetector* csd = new CharsetDetector();
+
    return (UCharsetDetector *) csd;
 }

@@ -44,6 +46,11 @@ ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCo
        return;
    }

+    if (ucsd == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
    CharsetDetector *csd = (CharsetDetector *) ucsd;

    csd->setText(textIn, len);
@@ -53,7 +60,12 @@ U_DRAFT const char * U_EXPORT2
 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status)
 {
    if(U_FAILURE(*status)) {
-        return 0;
+        return NULL;
+    }
+
+    if (ucsm == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
    }

    CharsetMatch *csm = (CharsetMatch *) ucsm;
@@ -65,7 +77,12 @@ U_DRAFT int32_t U_EXPORT2
 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status)
 {
    if(U_FAILURE(*status)) {
-        return -1;
+        return 0;
+    }
+
+    if (ucsm == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
    }

    CharsetMatch *csm = (CharsetMatch *) ucsm;
@@ -77,7 +94,12 @@ U_DRAFT const char * U_EXPORT2
 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status)
 {
    if(U_FAILURE(*status)) {
-        return 0;
+        return NULL;
+    }
+
+    if (ucsm == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
    }

    CharsetMatch *csm = (CharsetMatch *) ucsm;
@@ -89,7 +111,12 @@ U_DRAFT const UCharsetMatch * U_EXPORT2
 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status)
 {
    if(U_FAILURE(*status)) {
-        return 0;
+        return NULL;
+    }
+
+    if (ucsd == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
    }

    CharsetDetector *csd = (CharsetDetector *) ucsd;
@@ -104,6 +131,11 @@ ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t
        return;
    }

+    if (ucsd == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
    CharsetDetector *csd = (CharsetDetector *) ucsd;

    csd->setDeclaredEncoding(encoding,length);
@@ -114,7 +146,12 @@ ucsdet_detectAll(UCharsetDetector *ucsd,
                 int32_t *maxMatchesFound, UErrorCode *status)
 {
    if(U_FAILURE(*status)) {
-        return 0;
+        return NULL;
+    }
+
+    if (ucsd == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
    }

    CharsetDetector *csd = (CharsetDetector *) ucsd;
@@ -143,6 +180,11 @@ ucsdet_detectAll(UCharsetDetector *ucsd,
 U_DRAFT  UBool U_EXPORT2
 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd)
 {
+    // todo: could use an error return...
+    if (ucsd == NULL) {
+        return FALSE;
+    }
+
    CharsetDetector *csd = (CharsetDetector *) ucsd;

    return csd->getStripTagsFlag();
@@ -151,8 +193,14 @@ ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd)
 U_DRAFT  UBool U_EXPORT2
 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter)
 {
+    // todo: could use an error return...
+    if (ucsd == NULL) {
+        return FALSE;
+    }
+
    CharsetDetector *csd = (CharsetDetector *) ucsd;
    UBool prev = csd->getStripTagsFlag();
+
    csd->setStripTagsFlag(filter);

    return prev;
@@ -163,7 +211,12 @@ ucsdet_getUChars(const UCharsetMatch *ucsm,
                 UChar *buf, int32_t cap, UErrorCode *status)
 {
    if(U_FAILURE(*status)) {
-        return -1;
+        return 0;
+    }
+
+    if (ucsm == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
    }

    CharsetMatch *csm = (CharsetMatch *) ucsm;
--- a/icu4c/source/test/intltest/csdetest.cpp
+++ b/icu4c/source/test/intltest/csdetest.cpp
@@ -140,11 +140,18 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
    ucsdet_setText(csd, bytes, byteLength, &status);

    const UCharsetMatch *csm = ucsdet_detect(csd, &status);
+
+
    UnicodeString name(ucsdet_getName(csm, &status));
    UnicodeString lang(ucsdet_getLanguage(csm, &status));
    UChar *decoded = NULL;
    int32_t dLength = 0;

+    if (csm == NULL) {
+        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
+        goto bail;
+    }
+
    if (name.compare(eSplit[0]) != 0) {
        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);

@@ -240,12 +247,20 @@ void CharsetDetectionTest::UTF8Test()
    ucsdet_setText(csd, bytes, byteLength, &status);
    match = ucsdet_detect(csd, &status);

+    if (match == NULL) {
+        errln("Detection failure for UTF-8: got no matches.");
+        goto bail;
+    }
+
    ucsdet_getUChars(match, detected, sLength, &status);

    if (s.compare(detected, sLength) != 0) {
        errln("Round-trip test failed!");
    }

+    ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
+
+bail:
    DELETE_ARRAY(detected);
    freeBytes(bytes);
    ucsdet_close(csd);
@@ -268,23 +283,49 @@ void CharsetDetectionTest::UTF16Test()
    UCharsetDetector *csd = ucsdet_open(&status);
    const UCharsetMatch *match;
    const char *name;
+    int32_t conf;

    ucsdet_setText(csd, beBytes, beLength, &status);
    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        errln("Encoding detection failure for UTF-16BE: got no matches.");
+        goto try_le;
+    }
+
    name  = ucsdet_getName(match, &status);
+    conf  = ucsdet_getConfidence(match, &status);

    if (strcmp(name, "UTF-16BE") != 0) {
        errln("Encoding detection failure for UTF-16BE: got %s", name);
    }

+    if (conf != 100) {
+        errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
+    }
+
+try_le:
    ucsdet_setText(csd, leBytes, leLength, &status);
    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        errln("Encoding detection failure for UTF-16LE: got no matches.");
+        goto bail;
+    }
+
    name  = ucsdet_getName(match, &status);
+    conf = ucsdet_getConfidence(match, &status);
+

    if (strcmp(name, "UTF-16LE") != 0) {
        errln("Enconding detection failure for UTF-16LE: got %s", name);
    }

+    if (conf != 100) {
+        errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
+    }
+
+bail:
    freeBytes(leBytes);
    freeBytes(beBytes);
    ucsdet_close(csd);
@@ -309,21 +350,35 @@ void CharsetDetectionTest::InputFilterTest()

    ucsdet_setText(csd, bytes, byteLength, &status);
    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        errln("Turning on the input filter resulted in no matches.");
+        goto turn_off;
+    }
+
    lang = ucsdet_getLanguage(match, &status);

    if (strcmp(lang, "fr") != 0) {
        errln("Input filter did not strip markup!");
    }

+turn_off:
    ucsdet_enableInputFilter(csd, FALSE);
    ucsdet_setText(csd, bytes, byteLength, &status);
    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        errln("Turning off the input filter resulted in no matches.");
+        goto bail;
+    }
+
    lang = ucsdet_getLanguage(match, &status);

    if (strcmp(lang, "en") != 0) {
        errln("Unfiltered input did not detect as English!");
    }

+bail:
    freeBytes(bytes);
    ucsdet_close(csd);
 }
@@ -343,6 +398,12 @@ void CharsetDetectionTest::C1BytesTest()

    ucsdet_setText(csd, bWindows, lWindows, &status);
    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        errln("English test with C1 bytes got no matches.");
+        goto bail;
+    }
+
    name  = ucsdet_getName(match, &status);

    if (strcmp(name, "windows-1252") != 0) {
@@ -351,12 +412,19 @@ void CharsetDetectionTest::C1BytesTest()

    ucsdet_setText(csd, bISO, lISO, &status);
    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        errln("English text without C1 bytes got no matches.");
+        goto bail;
+    }
+
    name  = ucsdet_getName(match, &status);

    if (strcmp(name, "ISO-8859-1") != 0) {
        errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
    }

+bail:
    freeBytes(bWindows);
    freeBytes(bISO);