ICU-4639 Cleanup, more error checking.

X-SVN-Rev: 19086
This commit is contained in:
Eric Mader 2006-02-07 21:59:16 +00:00
parent 5cb6459f4b
commit cb69e53d8d
4 changed files with 135 additions and 11 deletions

View file

@ -194,7 +194,7 @@ const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
if(maxMatchesFound > 0) {
return resultArray[0];
} else {
return 0;
return NULL;
}
}

View file

@ -158,7 +158,7 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
totalCharCount += 1;
if (iter->error) {
badCharCount++;
badCharCount += 1;
} else {
if (iter->charValue <= 0xFF) {
singleByteCharCount += 1;
@ -206,7 +206,7 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
if (commonChars == 0) {
// We have no statistics on frequently occurring characters.
// Assess confidence purely on having a reasonable number of
// multi-byte characters (the more the better
// multi-byte characters (the more the better)
confidence = 30 + doubleByteCharCount - 20*badCharCount;
if (confidence > 100) {
confidence = 100;
@ -215,11 +215,14 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
//
// Frequency of occurrence statistics exist.
//
double maxVal = log10((float)doubleByteCharCount / 4);
double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
double scaleFactor = 90.0 / maxVal;
confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
confidence = min(confidence, 100);
if(confidence<0)confidence=0;
}
if (confidence < 0) {
confidence = 0;
}
return confidence;

View file

@ -26,7 +26,9 @@ ucsdet_open(UErrorCode *status)
if(U_FAILURE(*status)) {
return 0;
}
CharsetDetector* csd = new CharsetDetector();
return (UCharsetDetector *) csd;
}
@ -44,6 +46,11 @@ ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCo
return;
}
if (ucsd == NULL) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
CharsetDetector *csd = (CharsetDetector *) ucsd;
csd->setText(textIn, len);
@ -53,7 +60,12 @@ U_DRAFT const char * U_EXPORT2
ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status)
{
if(U_FAILURE(*status)) {
return 0;
return NULL;
}
if (ucsm == NULL) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return NULL;
}
CharsetMatch *csm = (CharsetMatch *) ucsm;
@ -65,7 +77,12 @@ U_DRAFT int32_t U_EXPORT2
ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status)
{
if(U_FAILURE(*status)) {
return -1;
return 0;
}
if (ucsm == NULL) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
CharsetMatch *csm = (CharsetMatch *) ucsm;
@ -77,7 +94,12 @@ U_DRAFT const char * U_EXPORT2
ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status)
{
if(U_FAILURE(*status)) {
return 0;
return NULL;
}
if (ucsm == NULL) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return NULL;
}
CharsetMatch *csm = (CharsetMatch *) ucsm;
@ -89,7 +111,12 @@ U_DRAFT const UCharsetMatch * U_EXPORT2
ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status)
{
if(U_FAILURE(*status)) {
return 0;
return NULL;
}
if (ucsd == NULL) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return NULL;
}
CharsetDetector *csd = (CharsetDetector *) ucsd;
@ -104,6 +131,11 @@ ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t
return;
}
if (ucsd == NULL) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
CharsetDetector *csd = (CharsetDetector *) ucsd;
csd->setDeclaredEncoding(encoding,length);
@ -114,7 +146,12 @@ ucsdet_detectAll(UCharsetDetector *ucsd,
int32_t *maxMatchesFound, UErrorCode *status)
{
if(U_FAILURE(*status)) {
return 0;
return NULL;
}
if (ucsd == NULL) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return NULL;
}
CharsetDetector *csd = (CharsetDetector *) ucsd;
@ -143,6 +180,11 @@ ucsdet_detectAll(UCharsetDetector *ucsd,
U_DRAFT UBool U_EXPORT2
ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd)
{
// todo: could use an error return...
if (ucsd == NULL) {
return FALSE;
}
CharsetDetector *csd = (CharsetDetector *) ucsd;
return csd->getStripTagsFlag();
@ -151,8 +193,14 @@ ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd)
U_DRAFT UBool U_EXPORT2
ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter)
{
// todo: could use an error return...
if (ucsd == NULL) {
return FALSE;
}
CharsetDetector *csd = (CharsetDetector *) ucsd;
UBool prev = csd->getStripTagsFlag();
csd->setStripTagsFlag(filter);
return prev;
@ -163,7 +211,12 @@ ucsdet_getUChars(const UCharsetMatch *ucsm,
UChar *buf, int32_t cap, UErrorCode *status)
{
if(U_FAILURE(*status)) {
return -1;
return 0;
}
if (ucsm == NULL) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
CharsetMatch *csm = (CharsetMatch *) ucsm;

View file

@ -140,11 +140,18 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
ucsdet_setText(csd, bytes, byteLength, &status);
const UCharsetMatch *csm = ucsdet_detect(csd, &status);
UnicodeString name(ucsdet_getName(csm, &status));
UnicodeString lang(ucsdet_getLanguage(csm, &status));
UChar *decoded = NULL;
int32_t dLength = 0;
if (csm == NULL) {
errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
goto bail;
}
if (name.compare(eSplit[0]) != 0) {
errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
@ -240,12 +247,20 @@ void CharsetDetectionTest::UTF8Test()
ucsdet_setText(csd, bytes, byteLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
errln("Detection failure for UTF-8: got no matches.");
goto bail;
}
ucsdet_getUChars(match, detected, sLength, &status);
if (s.compare(detected, sLength) != 0) {
errln("Round-trip test failed!");
}
ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
bail:
DELETE_ARRAY(detected);
freeBytes(bytes);
ucsdet_close(csd);
@ -268,23 +283,49 @@ void CharsetDetectionTest::UTF16Test()
UCharsetDetector *csd = ucsdet_open(&status);
const UCharsetMatch *match;
const char *name;
int32_t conf;
ucsdet_setText(csd, beBytes, beLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
errln("Encoding detection failure for UTF-16BE: got no matches.");
goto try_le;
}
name = ucsdet_getName(match, &status);
conf = ucsdet_getConfidence(match, &status);
if (strcmp(name, "UTF-16BE") != 0) {
errln("Encoding detection failure for UTF-16BE: got %s", name);
}
if (conf != 100) {
errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
}
try_le:
ucsdet_setText(csd, leBytes, leLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
errln("Encoding detection failure for UTF-16LE: got no matches.");
goto bail;
}
name = ucsdet_getName(match, &status);
conf = ucsdet_getConfidence(match, &status);
if (strcmp(name, "UTF-16LE") != 0) {
errln("Enconding detection failure for UTF-16LE: got %s", name);
}
if (conf != 100) {
errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
}
bail:
freeBytes(leBytes);
freeBytes(beBytes);
ucsdet_close(csd);
@ -309,21 +350,35 @@ void CharsetDetectionTest::InputFilterTest()
ucsdet_setText(csd, bytes, byteLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
errln("Turning on the input filter resulted in no matches.");
goto turn_off;
}
lang = ucsdet_getLanguage(match, &status);
if (strcmp(lang, "fr") != 0) {
errln("Input filter did not strip markup!");
}
turn_off:
ucsdet_enableInputFilter(csd, FALSE);
ucsdet_setText(csd, bytes, byteLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
errln("Turning off the input filter resulted in no matches.");
goto bail;
}
lang = ucsdet_getLanguage(match, &status);
if (strcmp(lang, "en") != 0) {
errln("Unfiltered input did not detect as English!");
}
bail:
freeBytes(bytes);
ucsdet_close(csd);
}
@ -343,6 +398,12 @@ void CharsetDetectionTest::C1BytesTest()
ucsdet_setText(csd, bWindows, lWindows, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
errln("English test with C1 bytes got no matches.");
goto bail;
}
name = ucsdet_getName(match, &status);
if (strcmp(name, "windows-1252") != 0) {
@ -351,12 +412,19 @@ void CharsetDetectionTest::C1BytesTest()
ucsdet_setText(csd, bISO, lISO, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
errln("English text without C1 bytes got no matches.");
goto bail;
}
name = ucsdet_getName(match, &status);
if (strcmp(name, "ISO-8859-1") != 0) {
errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
}
bail:
freeBytes(bWindows);
freeBytes(bISO);