ICU-4639 Remove static constants from headers, fix overflow in confidence calculation in match_mbcs.

X-SVN-Rev: 19122
2025-04-07 06:25:30 +00:00 · 2006-02-09 21:13:01 +00:00 · 2006-02-09 21:13:01 +00:00 · 959633d24c
commit 959633d24c
parent e1b12af644
7 changed files with 70 additions and 34 deletions
--- a/icu4c/source/i18n/csdetect.cpp
+++ b/icu4c/source/i18n/csdetect.cpp
@ -240,7 +240,7 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
        }

        for(int32_t i = resultCount; i < fCSRecognizers_size; i += 1) {
-            resultArray[i]->set(textIn,0,0);
+            resultArray[i]->set(textIn, 0, 0);
        }

        //Bubble sort
@ -248,8 +248,8 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
            for(int32_t j = 0; j < i-1; j += 1) {
                if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
                    CharsetMatch *temp = resultArray[j];
-                    resultArray[j]= resultArray[j+1];
-                    resultArray[j+1]=temp;
+                    resultArray[j] = resultArray[j+1];
+                    resultArray[j+1] = temp;
                }
            }
        }
--- a/icu4c/source/i18n/csrmbcs.cpp
+++ b/icu4c/source/i18n/csrmbcs.cpp
@ -193,9 +193,9 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[

    delete iter;

-    if (doubleByteCharCount == 0 && badCharCount == 0) {
-        // No multi-byte chars.
-        //   ASCII file?  It's probably not our encoding,
+    if (doubleByteCharCount <= 10 && badCharCount == 0) {
+        // Not many multi-byte chars.
+        //   ASCII or ISO file?  It's probably not our encoding,
        //   but is not incompatible with our encoding, so don't give it a zero.
        confidence = 10;

@ -217,6 +217,7 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
        //  Assess confidence purely on having a reasonable number of
        //  multi-byte characters (the more the better)
        confidence = 30 + doubleByteCharCount - 20*badCharCount;
+
        if (confidence > 100) {
            confidence = 100;
        }
@ -224,9 +225,11 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
        //
        // Frequency of occurence statistics exist.
        //
+
        double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
        double scaleFactor = 90.0 / maxVal;
        confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
+
        confidence = min(confidence, 100);
    }

--- a/icu4c/source/i18n/csrsbcs.cpp
+++ b/icu4c/source/i18n/csrsbcs.cpp
@ -9,6 +9,11 @@

 #include "csrsbcs.h"

+#include <stdio.h>
+
+#define N_GRAM_SIZE 3
+#define N_GRAM_MASK 0xFFFFFF
+
 U_NAMESPACE_BEGIN

 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
@ -75,7 +80,7 @@ void NGramParser::lookup(int32_t thisNgram)

 void NGramParser::addByte(int32_t b)
 {
-    ngram = ((ngram << 8) + (b & 0xFF)) & N_GRAM_MASK;
+    ngram = ((ngram << 8) + b) & N_GRAM_MASK;
    lookup(ngram);
 }

@ -681,7 +686,10 @@ CharsetRecog_8859_1_en::~CharsetRecog_8859_1_en()

 int32_t CharsetRecog_8859_1_en::match(InputText *textIn)
 {
-	return match_sbcs(textIn, ngrams, charMap);
+    int32_t result = match_sbcs(textIn, ngrams, charMap);
+
+   // printf("8859_1_en: result = %d\n", result);
+	return result; //match_sbcs(textIn, ngrams, charMap);
 }

 CharsetRecog_8859_1_da::~CharsetRecog_8859_1_da()
--- a/icu4c/source/i18n/csrsbcs.h
+++ b/icu4c/source/i18n/csrsbcs.h
@ -18,9 +18,6 @@ U_NAMESPACE_BEGIN
 class NGramParser : public UMemory
 {
 private:
-    static const int32_t N_GRAM_SIZE = 3;
-    static const int32_t N_GRAM_MASK = 0xFFFFFF;
-
    int32_t byteIndex;
    int32_t ngram;
        
--- a/icu4c/source/i18n/inputext.cpp
+++ b/icu4c/source/i18n/inputext.cpp
@ -16,13 +16,15 @@

 U_NAMESPACE_BEGIN

+#define BUFFER_SIZE 8192
+
 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
 #define DELETE_ARRAY(array) uprv_free((void *) (array))

 InputText::InputText()
-    : fInputBytes(NEW_ARRAY(uint8_t, kBufSize)), // The text to be checked.  Markup will have been
+    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
                                                 //   removed if appropriate.
      fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.
                                                 //   Value is percent, not absolute.
@ -84,7 +86,7 @@ void InputText::MungeInput(UBool fStripTags) {
    //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
    //     guess as to whether the input was actually marked up at all.
    if (fStripTags) {
-        for (srci = 0; srci < fRawLength; srci += 1) {
+        for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
            b = fRawInput[srci];

            if (b == (uint8_t)'<') {
@ -117,8 +119,8 @@ void InputText::MungeInput(UBool fStripTags) {
        (fInputLen < 100 && fRawLength>600)) {
            int32_t limit = fRawLength;

-            if (limit > kBufSize) {
-                limit = kBufSize;
+            if (limit > BUFFER_SIZE) {
+                limit = BUFFER_SIZE;
            }

            for (srci=0; srci<limit; srci++) {
--- a/icu4c/source/i18n/inputext.h
+++ b/icu4c/source/i18n/inputext.h
@ -24,8 +24,6 @@ public:
    UBool isSet() const; 
    void MungeInput(UBool fStripTags);

-
-    static const int32_t kBufSize = 8192;
    // The text to be checked.  Markup will have been
    //   removed if appropriate.
    uint8_t    *fInputBytes;
--- a/icu4c/source/test/intltest/csdetest.cpp
+++ b/icu4c/source/test/intltest/csdetest.cpp
@ -15,12 +15,15 @@
 #include "intltest.h"
 #include "csdetest.h"

-//#include "cmemory.h"
 #include "xmlparser.h"

 #include <stdlib.h>
 #include <string.h>

+#ifdef DEBUG_DETECT
+#include <stdio.h>
+#endif
+
 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
@ -155,6 +158,7 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
    if (name.compare(eSplit[0]) != 0) {
        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);

+#ifdef DEBUG_DETECT
        int32_t matchCount;
        const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);

@ -163,9 +167,9 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
            const char *lang = ucsdet_getLanguage(matches[m], &status);
            int32_t confidence = ucsdet_getConfidence(matches[m], &status);

-            errln("%s (%s) %d\n", name, lang, confidence);
+            printf("%s (%s) %d\n", name, lang, confidence);
        }
-
+#endif
        goto bail;
    }

@ -180,12 +184,15 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
    if (testString.compare(decoded, dLength) != 0) {
        errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");

-        //for(int32_t i = 0; i < testLength; i += 1) {
-        //    if(testString[i] != decoded[i]) {
-        //        printf("Strings differ at byte %d\n", i);
-        //        break;
-        //    }
-        //}
+#ifdef DEBUG_DETECT
+        for(int32_t i = 0; i < testLength; i += 1) {
+            if(testString[i] != decoded[i]) {
+                printf("Strings differ at byte %d\n", i);
+                break;
+            }
+        }
+#endif
+
    }

    DELETE_ARRAY(decoded);
@ -217,13 +224,21 @@ void CharsetDetectionTest::ConstructionTest()
    UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
    int32_t count = uenum_count(e, &status);

+#ifdef DEBUG_DETECT
+    printf("There are %d recognizers.\n", count);
+#endif
+
    for(int32_t i = 0; i < count; i += 1) {
        int32_t length;
        const char *name = uenum_next(e, &length, &status);

        if(name == NULL || length <= 0) {
-            errln("ucsdet_getAllDetectableCharsets() returned an null or empty name!");
+            errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
        }
+
+#ifdef DEBUG_DETECT
+        printf("%s\n", name);
+#endif
    }

    uenum_close(e);
@ -340,7 +355,7 @@ void CharsetDetectionTest::InputFilterTest()
    char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
    UCharsetDetector *csd = ucsdet_open(&status);
    const UCharsetMatch *match;
-    const char *lang;
+    const char *lang, *name;

    ucsdet_enableInputFilter(csd, TRUE);

@ -348,6 +363,7 @@ void CharsetDetectionTest::InputFilterTest()
        errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
    }

+
    ucsdet_setText(csd, bytes, byteLength, &status);
    match = ucsdet_detect(csd, &status);

@ -356,10 +372,16 @@ void CharsetDetectionTest::InputFilterTest()
        goto turn_off;
    }

-    lang = ucsdet_getLanguage(match, &status);
+    name = ucsdet_getName(match, &status);

-    if (strcmp(lang, "fr") != 0) {
-        errln("Input filter did not strip markup!");
+    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
+        errln("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
+    } else {
+        lang = ucsdet_getLanguage(match, &status);
+
+        if (lang == NULL || strcmp(lang, "fr") != 0) {
+            errln("Input filter did not strip markup!");
+        }
    }

 turn_off:
@ -372,10 +394,16 @@ turn_off:
        goto bail;
    }

-    lang = ucsdet_getLanguage(match, &status);
+    name = ucsdet_getName(match, &status);

-    if (strcmp(lang, "en") != 0) {
-        errln("Unfiltered input did not detect as English!");
+    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
+        errln("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
+    } else {
+        lang = ucsdet_getLanguage(match, &status);
+
+        if (lang == NULL || strcmp(lang, "en") != 0) {
+            errln("Unfiltered input did not detect as English!");
+        }
    }

 bail: