From 959633d24c89af93c07a69f90415657588296066 Mon Sep 17 00:00:00 2001
From: Eric Mader <emader@svn.icu-project.org>
Date: Thu, 9 Feb 2006 21:13:01 +0000
Subject: [PATCH] ICU-4639 Remove static constants from headers, fix overflow
 in confidence calculation in match_mbcs.

X-SVN-Rev: 19122
---
 icu4c/source/i18n/csdetect.cpp          |  6 +--
 icu4c/source/i18n/csrmbcs.cpp           |  9 ++--
 icu4c/source/i18n/csrsbcs.cpp           | 12 ++++-
 icu4c/source/i18n/csrsbcs.h             |  3 --
 icu4c/source/i18n/inputext.cpp          | 10 ++--
 icu4c/source/i18n/inputext.h            |  2 -
 icu4c/source/test/intltest/csdetest.cpp | 62 ++++++++++++++++++-------
 7 files changed, 70 insertions(+), 34 deletions(-)

diff --git a/icu4c/source/i18n/csdetect.cpp b/icu4c/source/i18n/csdetect.cpp
index 68c9e7adabd..c31ac81f4f5 100644
--- a/icu4c/source/i18n/csdetect.cpp
+++ b/icu4c/source/i18n/csdetect.cpp
@@ -240,7 +240,7 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
         }
 
         for(int32_t i = resultCount; i < fCSRecognizers_size; i += 1) {
-            resultArray[i]->set(textIn,0,0);
+            resultArray[i]->set(textIn, 0, 0);
         }
 
         //Bubble sort
@@ -248,8 +248,8 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
             for(int32_t j = 0; j < i-1; j += 1) {
                 if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
                     CharsetMatch *temp = resultArray[j];
-                    resultArray[j]= resultArray[j+1];
-                    resultArray[j+1]=temp;
+                    resultArray[j] = resultArray[j+1];
+                    resultArray[j+1] = temp;
                 }
             }
         }
diff --git a/icu4c/source/i18n/csrmbcs.cpp b/icu4c/source/i18n/csrmbcs.cpp
index 3d3587d2bfa..410e774dafa 100644
--- a/icu4c/source/i18n/csrmbcs.cpp
+++ b/icu4c/source/i18n/csrmbcs.cpp
@@ -193,9 +193,9 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
 
     delete iter;
 
-    if (doubleByteCharCount == 0 && badCharCount == 0) {
-        // No multi-byte chars.
-        //   ASCII file?  It's probably not our encoding,
+    if (doubleByteCharCount <= 10 && badCharCount == 0) {
+        // Few multi-byte chars (10 or fewer) and no bad ones.
+        //   ASCII or ISO file?  It's probably not our encoding,
         //   but is not incompatible with our encoding, so don't give it a zero.
         confidence = 10;
 
@@ -217,6 +217,7 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
         //  Assess confidence purely on having a reasonable number of
         //  multi-byte characters (the more the better)
         confidence = 30 + doubleByteCharCount - 20*badCharCount;
+
         if (confidence > 100) {
             confidence = 100;
         }
@@ -224,9 +225,11 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
         //
         // Frequency of occurence statistics exist.
         //
+
         double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
         double scaleFactor = 90.0 / maxVal;
         confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
+
         confidence = min(confidence, 100);
     }
 
diff --git a/icu4c/source/i18n/csrsbcs.cpp b/icu4c/source/i18n/csrsbcs.cpp
index 35eb991587d..b8319993bdd 100644
--- a/icu4c/source/i18n/csrsbcs.cpp
+++ b/icu4c/source/i18n/csrsbcs.cpp
@@ -9,6 +9,11 @@
 
 #include "csrsbcs.h"
 
+#include <stdio.h>
+
+#define N_GRAM_SIZE 3
+#define N_GRAM_MASK 0xFFFFFF
+
 U_NAMESPACE_BEGIN
 
 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
@@ -75,7 +80,7 @@ void NGramParser::lookup(int32_t thisNgram)
 
 void NGramParser::addByte(int32_t b)
 {
-    ngram = ((ngram << 8) + (b & 0xFF)) & N_GRAM_MASK;
+    ngram = ((ngram << 8) + b) & N_GRAM_MASK;
     lookup(ngram);
 }
 
@@ -681,7 +686,10 @@ CharsetRecog_8859_1_en::~CharsetRecog_8859_1_en()
 
 int32_t CharsetRecog_8859_1_en::match(InputText *textIn)
 {
-	return match_sbcs(textIn, ngrams, charMap);
+    int32_t result = match_sbcs(textIn, ngrams, charMap);
+
+   // printf("8859_1_en: result = %d\n", result);
+	return result; //match_sbcs(textIn, ngrams, charMap);
 }
 
 CharsetRecog_8859_1_da::~CharsetRecog_8859_1_da()
diff --git a/icu4c/source/i18n/csrsbcs.h b/icu4c/source/i18n/csrsbcs.h
index 46b99719445..16edb88e495 100644
--- a/icu4c/source/i18n/csrsbcs.h
+++ b/icu4c/source/i18n/csrsbcs.h
@@ -18,9 +18,6 @@ U_NAMESPACE_BEGIN
 class NGramParser : public UMemory
 {
  private:
-    static const int32_t N_GRAM_SIZE = 3;
-    static const int32_t N_GRAM_MASK = 0xFFFFFF;
-
     int32_t byteIndex;
     int32_t ngram;
         
diff --git a/icu4c/source/i18n/inputext.cpp b/icu4c/source/i18n/inputext.cpp
index 30605d85d4b..f38d12df41f 100644
--- a/icu4c/source/i18n/inputext.cpp
+++ b/icu4c/source/i18n/inputext.cpp
@@ -16,13 +16,15 @@
 
 U_NAMESPACE_BEGIN
 
+#define BUFFER_SIZE 8192
+
 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
 
 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
 #define DELETE_ARRAY(array) uprv_free((void *) (array))
 
 InputText::InputText()
-    : fInputBytes(NEW_ARRAY(uint8_t, kBufSize)), // The text to be checked.  Markup will have been
+    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
                                                  //   removed if appropriate.
       fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.
                                                  //   Value is percent, not absolute.
@@ -84,7 +86,7 @@ void InputText::MungeInput(UBool fStripTags) {
     //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
     //     guess as to whether the input was actually marked up at all.
     if (fStripTags) {
-        for (srci = 0; srci < fRawLength; srci += 1) {
+        for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
             b = fRawInput[srci];
 
             if (b == (uint8_t)'<') {
@@ -117,8 +119,8 @@ void InputText::MungeInput(UBool fStripTags) {
         (fInputLen < 100 && fRawLength>600)) {
             int32_t limit = fRawLength;
 
-            if (limit > kBufSize) {
-                limit = kBufSize;
+            if (limit > BUFFER_SIZE) {
+                limit = BUFFER_SIZE;
             }
 
             for (srci=0; srci<limit; srci++) {
diff --git a/icu4c/source/i18n/inputext.h b/icu4c/source/i18n/inputext.h
index c6156b9d8a4..9e01115eb3c 100644
--- a/icu4c/source/i18n/inputext.h
+++ b/icu4c/source/i18n/inputext.h
@@ -24,8 +24,6 @@ public:
     UBool isSet() const; 
     void MungeInput(UBool fStripTags);
 
-
-    static const int32_t kBufSize = 8192;
     // The text to be checked.  Markup will have been
     //   removed if appropriate.
     uint8_t    *fInputBytes;
diff --git a/icu4c/source/test/intltest/csdetest.cpp b/icu4c/source/test/intltest/csdetest.cpp
index daec425b82e..aeb93f6d88a 100644
--- a/icu4c/source/test/intltest/csdetest.cpp
+++ b/icu4c/source/test/intltest/csdetest.cpp
@@ -15,12 +15,15 @@
 #include "intltest.h"
 #include "csdetest.h"
 
-//#include "cmemory.h"
 #include "xmlparser.h"
 
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef DEBUG_DETECT
+#include <stdio.h>
+#endif
+
 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
 
 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
@@ -155,6 +158,7 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
     if (name.compare(eSplit[0]) != 0) {
         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
 
+#ifdef DEBUG_DETECT
         int32_t matchCount;
         const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
 
@@ -163,9 +167,9 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
             const char *lang = ucsdet_getLanguage(matches[m], &status);
             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
 
-            errln("%s (%s) %d\n", name, lang, confidence);
+            printf("%s (%s) %d\n", name, lang, confidence);
         }
-
+#endif
         goto bail;
     }
 
@@ -180,12 +184,15 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
     if (testString.compare(decoded, dLength) != 0) {
         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
 
-        //for(int32_t i = 0; i < testLength; i += 1) {
-        //    if(testString[i] != decoded[i]) {
-        //        printf("Strings differ at byte %d\n", i);
-        //        break;
-        //    }
-        //}
+#ifdef DEBUG_DETECT
+        for(int32_t i = 0; i < testLength; i += 1) {
+            if(testString[i] != decoded[i]) {
+                printf("Strings differ at byte %d\n", i);
+                break;
+            }
+        }
+#endif
+
     }
 
     DELETE_ARRAY(decoded);
@@ -217,13 +224,21 @@ void CharsetDetectionTest::ConstructionTest()
     UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
     int32_t count = uenum_count(e, &status);
 
+#ifdef DEBUG_DETECT
+    printf("There are %d recognizers.\n", count);
+#endif
+
     for(int32_t i = 0; i < count; i += 1) {
         int32_t length;
         const char *name = uenum_next(e, &length, &status);
 
         if(name == NULL || length <= 0) {
-            errln("ucsdet_getAllDetectableCharsets() returned an null or empty name!");
+            errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
         }
+
+#ifdef DEBUG_DETECT
+        printf("%s\n", name);
+#endif
     }
 
     uenum_close(e);
@@ -340,7 +355,7 @@ void CharsetDetectionTest::InputFilterTest()
     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
     UCharsetDetector *csd = ucsdet_open(&status);
     const UCharsetMatch *match;
-    const char *lang;
+    const char *lang, *name;
 
     ucsdet_enableInputFilter(csd, TRUE);
 
@@ -348,6 +363,7 @@ void CharsetDetectionTest::InputFilterTest()
         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
     }
 
+
     ucsdet_setText(csd, bytes, byteLength, &status);
     match = ucsdet_detect(csd, &status);
 
@@ -356,10 +372,16 @@ void CharsetDetectionTest::InputFilterTest()
         goto turn_off;
     }
 
-    lang = ucsdet_getLanguage(match, &status);
+    name = ucsdet_getName(match, &status);
 
-    if (strcmp(lang, "fr") != 0) {
-        errln("Input filter did not strip markup!");
+    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
+        errln("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
+    } else {
+        lang = ucsdet_getLanguage(match, &status);
+
+        if (lang == NULL || strcmp(lang, "fr") != 0) {
+            errln("Input filter did not strip markup!");
+        }
     }
 
 turn_off:
@@ -372,10 +394,16 @@ turn_off:
         goto bail;
     }
 
-    lang = ucsdet_getLanguage(match, &status);
+    name = ucsdet_getName(match, &status);
 
-    if (strcmp(lang, "en") != 0) {
-        errln("Unfiltered input did not detect as English!");
+    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
+        errln("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
+    } else {
+        lang = ucsdet_getLanguage(match, &status);
+
+        if (lang == NULL || strcmp(lang, "en") != 0) {
+            errln("Unfiltered input did not detect as English!");
+        }
     }
 
 bail: