mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-4639 Cleanup, more error checking.
X-SVN-Rev: 19086
This commit is contained in:
parent
5cb6459f4b
commit
cb69e53d8d
4 changed files with 135 additions and 11 deletions
|
@ -194,7 +194,7 @@ const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
|
|||
if(maxMatchesFound > 0) {
|
||||
return resultArray[0];
|
||||
} else {
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -158,7 +158,7 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
|
|||
totalCharCount += 1;
|
||||
|
||||
if (iter->error) {
|
||||
badCharCount++;
|
||||
badCharCount += 1;
|
||||
} else {
|
||||
if (iter->charValue <= 0xFF) {
|
||||
singleByteCharCount += 1;
|
||||
|
@ -206,7 +206,7 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
|
|||
if (commonChars == 0) {
|
||||
// We have no statistics on frequently occurring characters.
|
||||
// Assess confidence purely on having a reasonable number of
|
||||
// multi-byte characters (the more the better
|
||||
// multi-byte characters (the more the better)
|
||||
confidence = 30 + doubleByteCharCount - 20*badCharCount;
|
||||
if (confidence > 100) {
|
||||
confidence = 100;
|
||||
|
@ -215,11 +215,14 @@ int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[
|
|||
//
|
||||
// Frequency of occurrence statistics exist.
|
||||
//
|
||||
double maxVal = log10((float)doubleByteCharCount / 4);
|
||||
double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
|
||||
double scaleFactor = 90.0 / maxVal;
|
||||
confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
|
||||
confidence = min(confidence, 100);
|
||||
if(confidence<0)confidence=0;
|
||||
}
|
||||
|
||||
if (confidence < 0) {
|
||||
confidence = 0;
|
||||
}
|
||||
|
||||
return confidence;
|
||||
|
|
|
@ -26,7 +26,9 @@ ucsdet_open(UErrorCode *status)
|
|||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
CharsetDetector* csd = new CharsetDetector();
|
||||
|
||||
return (UCharsetDetector *) csd;
|
||||
}
|
||||
|
||||
|
@ -44,6 +46,11 @@ ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCo
|
|||
return;
|
||||
}
|
||||
|
||||
if (ucsd == NULL) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
CharsetDetector *csd = (CharsetDetector *) ucsd;
|
||||
|
||||
csd->setText(textIn, len);
|
||||
|
@ -53,7 +60,12 @@ U_DRAFT const char * U_EXPORT2
|
|||
ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status)
|
||||
{
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (ucsm == NULL) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
CharsetMatch *csm = (CharsetMatch *) ucsm;
|
||||
|
@ -65,7 +77,12 @@ U_DRAFT int32_t U_EXPORT2
|
|||
ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status)
|
||||
{
|
||||
if(U_FAILURE(*status)) {
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (ucsm == NULL) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
CharsetMatch *csm = (CharsetMatch *) ucsm;
|
||||
|
@ -77,7 +94,12 @@ U_DRAFT const char * U_EXPORT2
|
|||
ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status)
|
||||
{
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (ucsm == NULL) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
CharsetMatch *csm = (CharsetMatch *) ucsm;
|
||||
|
@ -89,7 +111,12 @@ U_DRAFT const UCharsetMatch * U_EXPORT2
|
|||
ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status)
|
||||
{
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (ucsd == NULL) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
CharsetDetector *csd = (CharsetDetector *) ucsd;
|
||||
|
@ -104,6 +131,11 @@ ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t
|
|||
return;
|
||||
}
|
||||
|
||||
if (ucsd == NULL) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
CharsetDetector *csd = (CharsetDetector *) ucsd;
|
||||
|
||||
csd->setDeclaredEncoding(encoding,length);
|
||||
|
@ -114,7 +146,12 @@ ucsdet_detectAll(UCharsetDetector *ucsd,
|
|||
int32_t *maxMatchesFound, UErrorCode *status)
|
||||
{
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (ucsd == NULL) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
CharsetDetector *csd = (CharsetDetector *) ucsd;
|
||||
|
@ -143,6 +180,11 @@ ucsdet_detectAll(UCharsetDetector *ucsd,
|
|||
U_DRAFT UBool U_EXPORT2
|
||||
ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd)
|
||||
{
|
||||
// todo: could use an error return...
|
||||
if (ucsd == NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
CharsetDetector *csd = (CharsetDetector *) ucsd;
|
||||
|
||||
return csd->getStripTagsFlag();
|
||||
|
@ -151,8 +193,14 @@ ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd)
|
|||
U_DRAFT UBool U_EXPORT2
|
||||
ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter)
|
||||
{
|
||||
// todo: could use an error return...
|
||||
if (ucsd == NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
CharsetDetector *csd = (CharsetDetector *) ucsd;
|
||||
UBool prev = csd->getStripTagsFlag();
|
||||
|
||||
csd->setStripTagsFlag(filter);
|
||||
|
||||
return prev;
|
||||
|
@ -163,7 +211,12 @@ ucsdet_getUChars(const UCharsetMatch *ucsm,
|
|||
UChar *buf, int32_t cap, UErrorCode *status)
|
||||
{
|
||||
if(U_FAILURE(*status)) {
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (ucsm == NULL) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
CharsetMatch *csm = (CharsetMatch *) ucsm;
|
||||
|
|
|
@ -140,11 +140,18 @@ void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
|
|||
ucsdet_setText(csd, bytes, byteLength, &status);
|
||||
|
||||
const UCharsetMatch *csm = ucsdet_detect(csd, &status);
|
||||
|
||||
|
||||
UnicodeString name(ucsdet_getName(csm, &status));
|
||||
UnicodeString lang(ucsdet_getLanguage(csm, &status));
|
||||
UChar *decoded = NULL;
|
||||
int32_t dLength = 0;
|
||||
|
||||
if (csm == NULL) {
|
||||
errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
if (name.compare(eSplit[0]) != 0) {
|
||||
errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
|
||||
|
||||
|
@ -240,12 +247,20 @@ void CharsetDetectionTest::UTF8Test()
|
|||
ucsdet_setText(csd, bytes, byteLength, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
errln("Detection failure for UTF-8: got no matches.");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
ucsdet_getUChars(match, detected, sLength, &status);
|
||||
|
||||
if (s.compare(detected, sLength) != 0) {
|
||||
errln("Round-trip test failed!");
|
||||
}
|
||||
|
||||
ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
|
||||
|
||||
bail:
|
||||
DELETE_ARRAY(detected);
|
||||
freeBytes(bytes);
|
||||
ucsdet_close(csd);
|
||||
|
@ -268,23 +283,49 @@ void CharsetDetectionTest::UTF16Test()
|
|||
UCharsetDetector *csd = ucsdet_open(&status);
|
||||
const UCharsetMatch *match;
|
||||
const char *name;
|
||||
int32_t conf;
|
||||
|
||||
ucsdet_setText(csd, beBytes, beLength, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
errln("Encoding detection failure for UTF-16BE: got no matches.");
|
||||
goto try_le;
|
||||
}
|
||||
|
||||
name = ucsdet_getName(match, &status);
|
||||
conf = ucsdet_getConfidence(match, &status);
|
||||
|
||||
if (strcmp(name, "UTF-16BE") != 0) {
|
||||
errln("Encoding detection failure for UTF-16BE: got %s", name);
|
||||
}
|
||||
|
||||
if (conf != 100) {
|
||||
errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
|
||||
}
|
||||
|
||||
try_le:
|
||||
ucsdet_setText(csd, leBytes, leLength, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
errln("Encoding detection failure for UTF-16LE: got no matches.");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
name = ucsdet_getName(match, &status);
|
||||
conf = ucsdet_getConfidence(match, &status);
|
||||
|
||||
|
||||
if (strcmp(name, "UTF-16LE") != 0) {
|
||||
errln("Enconding detection failure for UTF-16LE: got %s", name);
|
||||
}
|
||||
|
||||
if (conf != 100) {
|
||||
errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
|
||||
}
|
||||
|
||||
bail:
|
||||
freeBytes(leBytes);
|
||||
freeBytes(beBytes);
|
||||
ucsdet_close(csd);
|
||||
|
@ -309,21 +350,35 @@ void CharsetDetectionTest::InputFilterTest()
|
|||
|
||||
ucsdet_setText(csd, bytes, byteLength, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
errln("Turning on the input filter resulted in no matches.");
|
||||
goto turn_off;
|
||||
}
|
||||
|
||||
lang = ucsdet_getLanguage(match, &status);
|
||||
|
||||
if (strcmp(lang, "fr") != 0) {
|
||||
errln("Input filter did not strip markup!");
|
||||
}
|
||||
|
||||
turn_off:
|
||||
ucsdet_enableInputFilter(csd, FALSE);
|
||||
ucsdet_setText(csd, bytes, byteLength, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
errln("Turning off the input filter resulted in no matches.");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
lang = ucsdet_getLanguage(match, &status);
|
||||
|
||||
if (strcmp(lang, "en") != 0) {
|
||||
errln("Unfiltered input did not detect as English!");
|
||||
}
|
||||
|
||||
bail:
|
||||
freeBytes(bytes);
|
||||
ucsdet_close(csd);
|
||||
}
|
||||
|
@ -343,6 +398,12 @@ void CharsetDetectionTest::C1BytesTest()
|
|||
|
||||
ucsdet_setText(csd, bWindows, lWindows, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
errln("English test with C1 bytes got no matches.");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
name = ucsdet_getName(match, &status);
|
||||
|
||||
if (strcmp(name, "windows-1252") != 0) {
|
||||
|
@ -351,12 +412,19 @@ void CharsetDetectionTest::C1BytesTest()
|
|||
|
||||
ucsdet_setText(csd, bISO, lISO, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
errln("English text without C1 bytes got no matches.");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
name = ucsdet_getName(match, &status);
|
||||
|
||||
if (strcmp(name, "ISO-8859-1") != 0) {
|
||||
errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
|
||||
}
|
||||
|
||||
bail:
|
||||
freeBytes(bWindows);
|
||||
freeBytes(bISO);
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue