From 959633d24c89af93c07a69f90415657588296066 Mon Sep 17 00:00:00 2001 From: Eric Mader Date: Thu, 9 Feb 2006 21:13:01 +0000 Subject: [PATCH] ICU-4639 Remove static constants from headers, fix overflow in confidence calculation in match_mbcs. X-SVN-Rev: 19122 --- icu4c/source/i18n/csdetect.cpp | 6 +-- icu4c/source/i18n/csrmbcs.cpp | 9 ++-- icu4c/source/i18n/csrsbcs.cpp | 12 ++++- icu4c/source/i18n/csrsbcs.h | 3 -- icu4c/source/i18n/inputext.cpp | 10 ++-- icu4c/source/i18n/inputext.h | 2 - icu4c/source/test/intltest/csdetest.cpp | 62 ++++++++++++++++++------- 7 files changed, 70 insertions(+), 34 deletions(-) diff --git a/icu4c/source/i18n/csdetect.cpp b/icu4c/source/i18n/csdetect.cpp index 68c9e7adabd..c31ac81f4f5 100644 --- a/icu4c/source/i18n/csdetect.cpp +++ b/icu4c/source/i18n/csdetect.cpp @@ -240,7 +240,7 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, } for(int32_t i = resultCount; i < fCSRecognizers_size; i += 1) { - resultArray[i]->set(textIn,0,0); + resultArray[i]->set(textIn, 0, 0); } //Bubble sort @@ -248,8 +248,8 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, for(int32_t j = 0; j < i-1; j += 1) { if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) { CharsetMatch *temp = resultArray[j]; - resultArray[j]= resultArray[j+1]; - resultArray[j+1]=temp; + resultArray[j] = resultArray[j+1]; + resultArray[j+1] = temp; } } } diff --git a/icu4c/source/i18n/csrmbcs.cpp b/icu4c/source/i18n/csrmbcs.cpp index 3d3587d2bfa..410e774dafa 100644 --- a/icu4c/source/i18n/csrmbcs.cpp +++ b/icu4c/source/i18n/csrmbcs.cpp @@ -193,9 +193,9 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[ delete iter; - if (doubleByteCharCount == 0 && badCharCount == 0) { - // No multi-byte chars. - // ASCII file? It's probably not our encoding, + if (doubleByteCharCount <= 10 && badCharCount == 0) { + // Not many multi-byte chars. + // ASCII or ISO file? It's probably not our encoding, // but is not incompatible with our encoding, so don't give it a zero. confidence = 10; @@ -217,6 +217,7 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[ // Assess confidence purely on having a reasonable number of // multi-byte characters (the more the better) confidence = 30 + doubleByteCharCount - 20*badCharCount; + if (confidence > 100) { confidence = 100; } @@ -224,9 +225,11 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[ // // Frequency of occurence statistics exist. // + double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/ double scaleFactor = 90.0 / maxVal; confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0); + confidence = min(confidence, 100); } diff --git a/icu4c/source/i18n/csrsbcs.cpp b/icu4c/source/i18n/csrsbcs.cpp index 35eb991587d..b8319993bdd 100644 --- a/icu4c/source/i18n/csrsbcs.cpp +++ b/icu4c/source/i18n/csrsbcs.cpp @@ -9,6 +9,11 @@ #include "csrsbcs.h" +#include + +#define N_GRAM_SIZE 3 +#define N_GRAM_MASK 0xFFFFFF + U_NAMESPACE_BEGIN NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap) @@ -75,7 +80,7 @@ void NGramParser::lookup(int32_t thisNgram) void NGramParser::addByte(int32_t b) { - ngram = ((ngram << 8) + (b & 0xFF)) & N_GRAM_MASK; + ngram = ((ngram << 8) + b) & N_GRAM_MASK; lookup(ngram); } @@ -681,7 +686,10 @@ CharsetRecog_8859_1_en::~CharsetRecog_8859_1_en() int32_t CharsetRecog_8859_1_en::match(InputText *textIn) { - return match_sbcs(textIn, ngrams, charMap); + int32_t result = match_sbcs(textIn, ngrams, charMap); + + // printf("8859_1_en: result = %d\n", result); + return result; //match_sbcs(textIn, ngrams, charMap); } CharsetRecog_8859_1_da::~CharsetRecog_8859_1_da() diff --git a/icu4c/source/i18n/csrsbcs.h b/icu4c/source/i18n/csrsbcs.h index 46b99719445..16edb88e495 100644 --- a/icu4c/source/i18n/csrsbcs.h +++ b/icu4c/source/i18n/csrsbcs.h @@ -18,9 +18,6 @@ U_NAMESPACE_BEGIN class NGramParser : public UMemory { private: - static const int32_t N_GRAM_SIZE = 3; - static const int32_t N_GRAM_MASK = 0xFFFFFF; - int32_t byteIndex; int32_t ngram; diff --git a/icu4c/source/i18n/inputext.cpp b/icu4c/source/i18n/inputext.cpp index 30605d85d4b..f38d12df41f 100644 --- a/icu4c/source/i18n/inputext.cpp +++ b/icu4c/source/i18n/inputext.cpp @@ -16,13 +16,15 @@ U_NAMESPACE_BEGIN +#define BUFFER_SIZE 8192 + #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) #define DELETE_ARRAY(array) uprv_free((void *) (array)) InputText::InputText() - : fInputBytes(NEW_ARRAY(uint8_t, kBufSize)), // The text to be checked. Markup will have been + : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been // removed if appropriate. fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. // Value is percent, not absolute. @@ -84,7 +86,7 @@ void InputText::MungeInput(UBool fStripTags) { // Count how many total '<' and illegal (nested) '<' occur, so we can make some // guess as to whether the input was actually marked up at all. if (fStripTags) { - for (srci = 0; srci < fRawLength; srci += 1) { + for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { b = fRawInput[srci]; if (b == (uint8_t)'<') { @@ -117,8 +119,8 @@ void InputText::MungeInput(UBool fStripTags) { (fInputLen < 100 && fRawLength>600)) { int32_t limit = fRawLength; - if (limit > kBufSize) { - limit = kBufSize; + if (limit > BUFFER_SIZE) { + limit = BUFFER_SIZE; } for (srci=0; srci #include +#ifdef DEBUG_DETECT +#include +#endif + #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type)) @@ -155,6 +158,7 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const if (name.compare(eSplit[0]) != 0) { errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name); +#ifdef DEBUG_DETECT int32_t matchCount; const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status); @@ -163,9 +167,9 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const const char *lang = ucsdet_getLanguage(matches[m], &status); int32_t confidence = ucsdet_getConfidence(matches[m], &status); - errln("%s (%s) %d\n", name, lang, confidence); + printf("%s (%s) %d\n", name, lang, confidence); } - +#endif goto bail; } @@ -180,12 +184,15 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const if (testString.compare(decoded, dLength) != 0) { errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string."); - //for(int32_t i = 0; i < testLength; i += 1) { - // if(testString[i] != decoded[i]) { - // printf("Strings differ at byte %d\n", i); - // break; - // } - //} +#ifdef DEBUG_DETECT + for(int32_t i = 0; i < testLength; i += 1) { + if(testString[i] != decoded[i]) { + printf("Strings differ at byte %d\n", i); + break; + } + } +#endif + } DELETE_ARRAY(decoded); @@ -217,13 +224,21 @@ void CharsetDetectionTest::ConstructionTest() UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status); int32_t count = uenum_count(e, &status); +#ifdef DEBUG_DETECT + printf("There are %d recognizers.\n", count); +#endif + for(int32_t i = 0; i < count; i += 1) { int32_t length; const char *name = uenum_next(e, &length, &status); if(name == NULL || length <= 0) { - errln("ucsdet_getAllDetectableCharsets() returned an null or empty name!"); + errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!"); } + +#ifdef DEBUG_DETECT + printf("%s\n", name); +#endif } uenum_close(e); @@ -340,7 +355,7 @@ void CharsetDetectionTest::InputFilterTest() char *bytes = extractBytes(s, "ISO-8859-1", byteLength); UCharsetDetector *csd = ucsdet_open(&status); const UCharsetMatch *match; - const char *lang; + const char *lang, *name; ucsdet_enableInputFilter(csd, TRUE); @@ -348,6 +363,7 @@ void CharsetDetectionTest::InputFilterTest() errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!"); } + ucsdet_setText(csd, bytes, byteLength, &status); match = ucsdet_detect(csd, &status); @@ -356,10 +372,16 @@ void CharsetDetectionTest::InputFilterTest() goto turn_off; } - lang = ucsdet_getLanguage(match, &status); + name = ucsdet_getName(match, &status); - if (strcmp(lang, "fr") != 0) { - errln("Input filter did not strip markup!"); + if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { + errln("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name); + } else { + lang = ucsdet_getLanguage(match, &status); + + if (lang == NULL || strcmp(lang, "fr") != 0) { + errln("Input filter did not strip markup!"); + } } turn_off: @@ -372,10 +394,16 @@ turn_off: goto bail; } - lang = ucsdet_getLanguage(match, &status); + name = ucsdet_getName(match, &status); - if (strcmp(lang, "en") != 0) { - errln("Unfiltered input did not detect as English!"); + if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { + errln("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name); + } else { + lang = ucsdet_getLanguage(match, &status); + + if (lang == NULL || strcmp(lang, "en") != 0) { + errln("Unfiltered input did not detect as English!"); + } } bail: