ICU-21823 Adding changes to fix charset detection in case of no match

This commit is contained in:
shaobero 2021-11-04 17:57:12 +00:00 committed by Markus Scherer
parent dbf7c20be6
commit b1269c9121
4 changed files with 30 additions and 0 deletions

View file

@ -270,6 +270,11 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound,
maxMatchesFound = resultCount;
if (maxMatchesFound == 0) {
status = U_INVALID_CHAR_FOUND;
return NULL;
}
return resultArray;
}

View file

@ -402,6 +402,7 @@ static void TestBufferOverflow(void) {
}
for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) {
status = U_ZERO_ERROR;
ucsdet_setText(csd, testStrings[idx], -1, &status);
match = ucsdet_detect(csd, &status);

View file

@ -109,6 +109,10 @@ void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char
if (exec) Ticket6954Test();
break;
case 10: name = "Ticket21823Test";
if (exec) Ticket21823Test();
break;
default: name = "";
break; //needed to end loop
}
@ -839,3 +843,22 @@ void CharsetDetectionTest::Ticket6954Test() {
TEST_ASSERT(strcmp(name1, "windows-1252")==0);
#endif
}
// Ticket 21823 - Issue with Charset Detector for ill-formed input strings.
// The fix returns a failure error code (U_INVALID_CHAR_FOUND) in case
// no charsets appear to match the input data.
void CharsetDetectionTest::Ticket21823Test() {
    UErrorCode status = U_ZERO_ERROR;
    // Single 0x80 byte used as the ill-formed input for this regression test.
    std::string str = "\x80";
    UCharsetDetector* csd = ucsdet_open(&status);

    // Convert the length explicitly: std::string::length() yields size_t,
    // while the detector API takes a signed 32-bit length. The cast avoids
    // an implicit narrowing conversion (and the associated compiler warning).
    ucsdet_setText(csd, str.data(), static_cast<int32_t>(str.length()), &status);
    const UCharsetMatch* match = ucsdet_detect(csd, &status);

    // A NULL result must be accompanied by a failure status; a non-NULL
    // match is accepted as-is and not examined further by this test.
    if (match == NULL) {
        TEST_ASSERT(U_FAILURE(status));
    }

    ucsdet_close(csd);
}

View file

@ -33,6 +33,7 @@ public:
virtual void IBM420Test();
virtual void Ticket6394Test();
virtual void Ticket6954Test();
virtual void Ticket21823Test();
private:
void checkEncoding(const UnicodeString &testString,