mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 06:25:30 +00:00
ICU-4639 Remove static constants from headers, fix overflow in confidence calculation in match_mbcs.
X-SVN-Rev: 19122
This commit is contained in:
parent
e1b12af644
commit
959633d24c
7 changed files with 70 additions and 34 deletions
|
@ -240,7 +240,7 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
|
|||
}
|
||||
|
||||
for(int32_t i = resultCount; i < fCSRecognizers_size; i += 1) {
|
||||
resultArray[i]->set(textIn,0,0);
|
||||
resultArray[i]->set(textIn, 0, 0);
|
||||
}
|
||||
|
||||
//Bubble sort
|
||||
|
@ -248,8 +248,8 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
|
|||
for(int32_t j = 0; j < i-1; j += 1) {
|
||||
if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
|
||||
CharsetMatch *temp = resultArray[j];
|
||||
resultArray[j]= resultArray[j+1];
|
||||
resultArray[j+1]=temp;
|
||||
resultArray[j] = resultArray[j+1];
|
||||
resultArray[j+1] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -193,9 +193,9 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
|
|||
|
||||
delete iter;
|
||||
|
||||
if (doubleByteCharCount == 0 && badCharCount == 0) {
|
||||
// No multi-byte chars.
|
||||
// ASCII file? It's probably not our encoding,
|
||||
if (doubleByteCharCount <= 10 && badCharCount == 0) {
|
||||
// Not many multi-byte chars.
|
||||
// ASCII or ISO file? It's probably not our encoding,
|
||||
// but is not incompatible with our encoding, so don't give it a zero.
|
||||
confidence = 10;
|
||||
|
||||
|
@ -217,6 +217,7 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
|
|||
// Assess confidence purely on having a reasonable number of
|
||||
// multi-byte characters (the more the better)
|
||||
confidence = 30 + doubleByteCharCount - 20*badCharCount;
|
||||
|
||||
if (confidence > 100) {
|
||||
confidence = 100;
|
||||
}
|
||||
|
@ -224,9 +225,11 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
|
|||
//
|
||||
// Frequency of occurrence statistics exist.
|
||||
//
|
||||
|
||||
double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
|
||||
double scaleFactor = 90.0 / maxVal;
|
||||
confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
|
||||
|
||||
confidence = min(confidence, 100);
|
||||
}
|
||||
|
||||
|
|
|
@ -9,6 +9,11 @@
|
|||
|
||||
#include "csrsbcs.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#define N_GRAM_SIZE 3
|
||||
#define N_GRAM_MASK 0xFFFFFF
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
|
||||
|
@ -75,7 +80,7 @@ void NGramParser::lookup(int32_t thisNgram)
|
|||
|
||||
void NGramParser::addByte(int32_t b)
|
||||
{
|
||||
ngram = ((ngram << 8) + (b & 0xFF)) & N_GRAM_MASK;
|
||||
ngram = ((ngram << 8) + b) & N_GRAM_MASK;
|
||||
lookup(ngram);
|
||||
}
|
||||
|
||||
|
@ -681,7 +686,10 @@ CharsetRecog_8859_1_en::~CharsetRecog_8859_1_en()
|
|||
|
||||
int32_t CharsetRecog_8859_1_en::match(InputText *textIn)
|
||||
{
|
||||
return match_sbcs(textIn, ngrams, charMap);
|
||||
int32_t result = match_sbcs(textIn, ngrams, charMap);
|
||||
|
||||
// printf("8859_1_en: result = %d\n", result);
|
||||
return result; //match_sbcs(textIn, ngrams, charMap);
|
||||
}
|
||||
|
||||
CharsetRecog_8859_1_da::~CharsetRecog_8859_1_da()
|
||||
|
|
|
@ -18,9 +18,6 @@ U_NAMESPACE_BEGIN
|
|||
class NGramParser : public UMemory
|
||||
{
|
||||
private:
|
||||
static const int32_t N_GRAM_SIZE = 3;
|
||||
static const int32_t N_GRAM_MASK = 0xFFFFFF;
|
||||
|
||||
int32_t byteIndex;
|
||||
int32_t ngram;
|
||||
|
||||
|
|
|
@ -16,13 +16,15 @@
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
#define BUFFER_SIZE 8192
|
||||
|
||||
#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
|
||||
|
||||
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
|
||||
#define DELETE_ARRAY(array) uprv_free((void *) (array))
|
||||
|
||||
InputText::InputText()
|
||||
: fInputBytes(NEW_ARRAY(uint8_t, kBufSize)), // The text to be checked. Markup will have been
|
||||
: fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
|
||||
// removed if appropriate.
|
||||
fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
|
||||
// Value is percent, not absolute.
|
||||
|
@ -84,7 +86,7 @@ void InputText::MungeInput(UBool fStripTags) {
|
|||
// Count how many total '<' and illegal (nested) '<' occur, so we can make some
|
||||
// guess as to whether the input was actually marked up at all.
|
||||
if (fStripTags) {
|
||||
for (srci = 0; srci < fRawLength; srci += 1) {
|
||||
for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
|
||||
b = fRawInput[srci];
|
||||
|
||||
if (b == (uint8_t)'<') {
|
||||
|
@ -117,8 +119,8 @@ void InputText::MungeInput(UBool fStripTags) {
|
|||
(fInputLen < 100 && fRawLength>600)) {
|
||||
int32_t limit = fRawLength;
|
||||
|
||||
if (limit > kBufSize) {
|
||||
limit = kBufSize;
|
||||
if (limit > BUFFER_SIZE) {
|
||||
limit = BUFFER_SIZE;
|
||||
}
|
||||
|
||||
for (srci=0; srci<limit; srci++) {
|
||||
|
|
|
@ -24,8 +24,6 @@ public:
|
|||
UBool isSet() const;
|
||||
void MungeInput(UBool fStripTags);
|
||||
|
||||
|
||||
static const int32_t kBufSize = 8192;
|
||||
// The text to be checked. Markup will have been
|
||||
// removed if appropriate.
|
||||
uint8_t *fInputBytes;
|
||||
|
|
|
@ -15,12 +15,15 @@
|
|||
#include "intltest.h"
|
||||
#include "csdetest.h"
|
||||
|
||||
//#include "cmemory.h"
|
||||
#include "xmlparser.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef DEBUG_DETECT
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
|
||||
|
||||
#define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
|
||||
|
@ -155,6 +158,7 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
|
|||
if (name.compare(eSplit[0]) != 0) {
|
||||
errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
|
||||
|
||||
#ifdef DEBUG_DETECT
|
||||
int32_t matchCount;
|
||||
const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
|
||||
|
||||
|
@ -163,9 +167,9 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
|
|||
const char *lang = ucsdet_getLanguage(matches[m], &status);
|
||||
int32_t confidence = ucsdet_getConfidence(matches[m], &status);
|
||||
|
||||
errln("%s (%s) %d\n", name, lang, confidence);
|
||||
printf("%s (%s) %d\n", name, lang, confidence);
|
||||
}
|
||||
|
||||
#endif
|
||||
goto bail;
|
||||
}
|
||||
|
||||
|
@ -180,12 +184,15 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
|
|||
if (testString.compare(decoded, dLength) != 0) {
|
||||
errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
|
||||
|
||||
//for(int32_t i = 0; i < testLength; i += 1) {
|
||||
// if(testString[i] != decoded[i]) {
|
||||
// printf("Strings differ at byte %d\n", i);
|
||||
// break;
|
||||
// }
|
||||
//}
|
||||
#ifdef DEBUG_DETECT
|
||||
for(int32_t i = 0; i < testLength; i += 1) {
|
||||
if(testString[i] != decoded[i]) {
|
||||
printf("Strings differ at byte %d\n", i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
DELETE_ARRAY(decoded);
|
||||
|
@ -217,13 +224,21 @@ void CharsetDetectionTest::ConstructionTest()
|
|||
UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
|
||||
int32_t count = uenum_count(e, &status);
|
||||
|
||||
#ifdef DEBUG_DETECT
|
||||
printf("There are %d recognizers.\n", count);
|
||||
#endif
|
||||
|
||||
for(int32_t i = 0; i < count; i += 1) {
|
||||
int32_t length;
|
||||
const char *name = uenum_next(e, &length, &status);
|
||||
|
||||
if(name == NULL || length <= 0) {
|
||||
errln("ucsdet_getAllDetectableCharsets() returned an null or empty name!");
|
||||
errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
|
||||
}
|
||||
|
||||
#ifdef DEBUG_DETECT
|
||||
printf("%s\n", name);
|
||||
#endif
|
||||
}
|
||||
|
||||
uenum_close(e);
|
||||
|
@ -340,7 +355,7 @@ void CharsetDetectionTest::InputFilterTest()
|
|||
char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
|
||||
UCharsetDetector *csd = ucsdet_open(&status);
|
||||
const UCharsetMatch *match;
|
||||
const char *lang;
|
||||
const char *lang, *name;
|
||||
|
||||
ucsdet_enableInputFilter(csd, TRUE);
|
||||
|
||||
|
@ -348,6 +363,7 @@ void CharsetDetectionTest::InputFilterTest()
|
|||
errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
|
||||
}
|
||||
|
||||
|
||||
ucsdet_setText(csd, bytes, byteLength, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
|
@ -356,10 +372,16 @@ void CharsetDetectionTest::InputFilterTest()
|
|||
goto turn_off;
|
||||
}
|
||||
|
||||
lang = ucsdet_getLanguage(match, &status);
|
||||
name = ucsdet_getName(match, &status);
|
||||
|
||||
if (strcmp(lang, "fr") != 0) {
|
||||
errln("Input filter did not strip markup!");
|
||||
if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
|
||||
errln("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
|
||||
} else {
|
||||
lang = ucsdet_getLanguage(match, &status);
|
||||
|
||||
if (lang == NULL || strcmp(lang, "fr") != 0) {
|
||||
errln("Input filter did not strip markup!");
|
||||
}
|
||||
}
|
||||
|
||||
turn_off:
|
||||
|
@ -372,10 +394,16 @@ turn_off:
|
|||
goto bail;
|
||||
}
|
||||
|
||||
lang = ucsdet_getLanguage(match, &status);
|
||||
name = ucsdet_getName(match, &status);
|
||||
|
||||
if (strcmp(lang, "en") != 0) {
|
||||
errln("Unfiltered input did not detect as English!");
|
||||
if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
|
||||
errln("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
|
||||
} else {
|
||||
lang = ucsdet_getLanguage(match, &status);
|
||||
|
||||
if (lang == NULL || strcmp(lang, "en") != 0) {
|
||||
errln("Unfiltered input did not detect as English!");
|
||||
}
|
||||
}
|
||||
|
||||
bail:
|
||||
|
|
Loading…
Add table
Reference in a new issue