ICU-4639 Remove static constants from headers, fix overflow in confidence calculation in match_mbcs.

X-SVN-Rev: 19122
This commit is contained in:
Eric Mader 2006-02-09 21:13:01 +00:00
parent e1b12af644
commit 959633d24c
7 changed files with 70 additions and 34 deletions

View file

@ -240,7 +240,7 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
}
for(int32_t i = resultCount; i < fCSRecognizers_size; i += 1) {
resultArray[i]->set(textIn,0,0);
resultArray[i]->set(textIn, 0, 0);
}
//Bubble sort
@ -248,8 +248,8 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
for(int32_t j = 0; j < i-1; j += 1) {
if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
CharsetMatch *temp = resultArray[j];
resultArray[j]= resultArray[j+1];
resultArray[j+1]=temp;
resultArray[j] = resultArray[j+1];
resultArray[j+1] = temp;
}
}
}

View file

@ -193,9 +193,9 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
delete iter;
if (doubleByteCharCount == 0 && badCharCount == 0) {
// No multi-byte chars.
// ASCII file? It's probably not our encoding,
if (doubleByteCharCount <= 10 && badCharCount == 0) {
// Not many multi-byte chars.
// ASCII or ISO file? It's probably not our encoding,
// but is not incompatible with our encoding, so don't give it a zero.
confidence = 10;
@ -217,6 +217,7 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
// Assess confidence purely on having a reasonable number of
// multi-byte characters (the more the better)
confidence = 30 + doubleByteCharCount - 20*badCharCount;
if (confidence > 100) {
confidence = 100;
}
@ -224,9 +225,11 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
//
// Frequency of occurrence statistics exist.
//
double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
double scaleFactor = 90.0 / maxVal;
confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
confidence = min(confidence, 100);
}

View file

@ -9,6 +9,11 @@
#include "csrsbcs.h"
#include <stdio.h>
#define N_GRAM_SIZE 3
#define N_GRAM_MASK 0xFFFFFF
U_NAMESPACE_BEGIN
NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
@ -75,7 +80,7 @@ void NGramParser::lookup(int32_t thisNgram)
void NGramParser::addByte(int32_t b)
{
ngram = ((ngram << 8) + (b & 0xFF)) & N_GRAM_MASK;
ngram = ((ngram << 8) + b) & N_GRAM_MASK;
lookup(ngram);
}
@ -681,7 +686,10 @@ CharsetRecog_8859_1_en::~CharsetRecog_8859_1_en()
int32_t CharsetRecog_8859_1_en::match(InputText *textIn)
{
return match_sbcs(textIn, ngrams, charMap);
int32_t result = match_sbcs(textIn, ngrams, charMap);
// printf("8859_1_en: result = %d\n", result);
return result; //match_sbcs(textIn, ngrams, charMap);
}
CharsetRecog_8859_1_da::~CharsetRecog_8859_1_da()

View file

@ -18,9 +18,6 @@ U_NAMESPACE_BEGIN
class NGramParser : public UMemory
{
private:
static const int32_t N_GRAM_SIZE = 3;
static const int32_t N_GRAM_MASK = 0xFFFFFF;
int32_t byteIndex;
int32_t ngram;

View file

@ -16,13 +16,15 @@
U_NAMESPACE_BEGIN
#define BUFFER_SIZE 8192
#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
InputText::InputText()
: fInputBytes(NEW_ARRAY(uint8_t, kBufSize)), // The text to be checked. Markup will have been
: fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
// removed if appropriate.
fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
// Value is percent, not absolute.
@ -84,7 +86,7 @@ void InputText::MungeInput(UBool fStripTags) {
// Count how many total '<' and illegal (nested) '<' occur, so we can make some
// guess as to whether the input was actually marked up at all.
if (fStripTags) {
for (srci = 0; srci < fRawLength; srci += 1) {
for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
b = fRawInput[srci];
if (b == (uint8_t)'<') {
@ -117,8 +119,8 @@ void InputText::MungeInput(UBool fStripTags) {
(fInputLen < 100 && fRawLength>600)) {
int32_t limit = fRawLength;
if (limit > kBufSize) {
limit = kBufSize;
if (limit > BUFFER_SIZE) {
limit = BUFFER_SIZE;
}
for (srci=0; srci<limit; srci++) {

View file

@ -24,8 +24,6 @@ public:
UBool isSet() const;
void MungeInput(UBool fStripTags);
static const int32_t kBufSize = 8192;
// The text to be checked. Markup will have been
// removed if appropriate.
uint8_t *fInputBytes;

View file

@ -15,12 +15,15 @@
#include "intltest.h"
#include "csdetest.h"
//#include "cmemory.h"
#include "xmlparser.h"
#include <stdlib.h>
#include <string.h>
#ifdef DEBUG_DETECT
#include <stdio.h>
#endif
#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
#define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
@ -155,6 +158,7 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
if (name.compare(eSplit[0]) != 0) {
errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
#ifdef DEBUG_DETECT
int32_t matchCount;
const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
@ -163,9 +167,9 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
const char *lang = ucsdet_getLanguage(matches[m], &status);
int32_t confidence = ucsdet_getConfidence(matches[m], &status);
errln("%s (%s) %d\n", name, lang, confidence);
printf("%s (%s) %d\n", name, lang, confidence);
}
#endif
goto bail;
}
@ -180,12 +184,15 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
if (testString.compare(decoded, dLength) != 0) {
errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
//for(int32_t i = 0; i < testLength; i += 1) {
// if(testString[i] != decoded[i]) {
// printf("Strings differ at byte %d\n", i);
// break;
// }
//}
#ifdef DEBUG_DETECT
for(int32_t i = 0; i < testLength; i += 1) {
if(testString[i] != decoded[i]) {
printf("Strings differ at byte %d\n", i);
break;
}
}
#endif
}
DELETE_ARRAY(decoded);
@ -217,13 +224,21 @@ void CharsetDetectionTest::ConstructionTest()
UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
int32_t count = uenum_count(e, &status);
#ifdef DEBUG_DETECT
printf("There are %d recognizers.\n", count);
#endif
for(int32_t i = 0; i < count; i += 1) {
int32_t length;
const char *name = uenum_next(e, &length, &status);
if(name == NULL || length <= 0) {
errln("ucsdet_getAllDetectableCharsets() returned an null or empty name!");
errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
}
#ifdef DEBUG_DETECT
printf("%s\n", name);
#endif
}
uenum_close(e);
@ -340,7 +355,7 @@ void CharsetDetectionTest::InputFilterTest()
char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
UCharsetDetector *csd = ucsdet_open(&status);
const UCharsetMatch *match;
const char *lang;
const char *lang, *name;
ucsdet_enableInputFilter(csd, TRUE);
@ -348,6 +363,7 @@ void CharsetDetectionTest::InputFilterTest()
errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
}
ucsdet_setText(csd, bytes, byteLength, &status);
match = ucsdet_detect(csd, &status);
@ -356,10 +372,16 @@ void CharsetDetectionTest::InputFilterTest()
goto turn_off;
}
lang = ucsdet_getLanguage(match, &status);
name = ucsdet_getName(match, &status);
if (strcmp(lang, "fr") != 0) {
errln("Input filter did not strip markup!");
if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
errln("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
} else {
lang = ucsdet_getLanguage(match, &status);
if (lang == NULL || strcmp(lang, "fr") != 0) {
errln("Input filter did not strip markup!");
}
}
turn_off:
@ -372,10 +394,16 @@ turn_off:
goto bail;
}
lang = ucsdet_getLanguage(match, &status);
name = ucsdet_getName(match, &status);
if (strcmp(lang, "en") != 0) {
errln("Unfiltered input did not detect as English!");
if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
errln("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
} else {
lang = ucsdet_getLanguage(match, &status);
if (lang == NULL || strcmp(lang, "en") != 0) {
errln("Unfiltered input did not detect as English!");
}
}
bail: