icu/icu4c/source/test/cintltst/ucsdetst.c
Fredrik Roubert 030fa1a479 ICU-21148 Consistently use standard lowercase true/false everywhere.
This is the normal standard way in C, C++ as well as Java and there's no
longer any reason for ICU to be different. The various internal macros
providing custom boolean constants can all be deleted and code as well
as documentation can be updated to use lowercase true/false everywhere.
2022-09-07 20:56:33 +02:00

593 lines
25 KiB
C

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
****************************************************************************
* Copyright (c) 2005-2016, International Business Machines Corporation and *
* others. All Rights Reserved. *
****************************************************************************
*/
#include "unicode/utypes.h"
#include "unicode/ucsdet.h"
#include "unicode/ucnv.h"
#include "unicode/ustring.h"
#include "cintltst.h"
#include "cmemory.h"
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#define NEW_ARRAY(type,count) (type *) malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) free(array)
static void TestConstruction(void);
static void TestUTF8(void);
static void TestUTF16(void);
static void TestC1Bytes(void);
static void TestInputFilter(void);
static void TestChaining(void);
static void TestBufferOverflow(void);
static void TestIBM424(void);
static void TestIBM420(void);
void addUCsdetTest(TestNode** root);
void addUCsdetTest(TestNode** root)
{
addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
#if !UCONFIG_NO_LEGACY_CONVERSION
addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
#endif
}
static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
{
UErrorCode status;
char buffer[1024];
char *dest, *destLimit = buffer + sizeof(buffer);
const UChar *srcLimit = src + length;
int32_t result = 0;
do {
dest = buffer;
status = U_ZERO_ERROR;
ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, true, &status);
result += (int32_t) (dest - buffer);
} while (status == U_BUFFER_OVERFLOW_ERROR);
return result;
}
static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
{
UErrorCode status = U_ZERO_ERROR;
UConverter *cnv = ucnv_open(codepage, &status);
int32_t byteCount = preflight(src, length, cnv);
const UChar *srcLimit = src + length;
char *bytes = NEW_ARRAY(char, byteCount + 1);
char *dest = bytes, *destLimit = bytes + byteCount + 1;
ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, true, &status);
ucnv_close(cnv);
*byteLength = byteCount;
return bytes;
}
static void freeBytes(char *bytes)
{
DELETE_ARRAY(bytes);
}
static void TestConstruction(void)
{
UErrorCode status = U_ZERO_ERROR;
UCharsetDetector *csd = ucsdet_open(&status);
UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
const char *name;
int32_t count = uenum_count(e, &status);
int32_t i, length;
for(i = 0; i < count; i += 1) {
name = uenum_next(e, &length, &status);
if(name == NULL || length <= 0) {
log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
}
}
/* one past the list of all names must return NULL */
name = uenum_next(e, &length, &status);
if(name != NULL || length != 0 || U_FAILURE(status)) {
log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
}
uenum_close(e);
ucsdet_close(csd);
}
static void TestUTF8(void)
{
UErrorCode status = U_ZERO_ERROR;
static const char ss[] = "This is a string with some non-ascii characters that will "
"be converted to UTF-8, then shoved through the detection process. "
"\\u0391\\u0392\\u0393\\u0394\\u0395"
"Sure would be nice if our source could contain Unicode directly!";
int32_t byteLength = 0, sLength = 0, dLength = 0;
UChar s[sizeof(ss)];
char *bytes;
UCharsetDetector *csd = ucsdet_open(&status);
const UCharsetMatch *match;
UChar detected[sizeof(ss)];
sLength = u_unescape(ss, s, sizeof(ss));
bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
ucsdet_setText(csd, bytes, byteLength, &status);
if (U_FAILURE(status)) {
log_err("status is %s\n", u_errorName(status));
goto bail;
}
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Detection failure for UTF-8: got no matches.\n");
goto bail;
}
dLength = ucsdet_getUChars(match, detected, sLength, &status);
if (u_strCompare(detected, dLength, s, sLength, false) != 0) {
log_err("Round-trip test failed!\n");
}
ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
bail:
freeBytes(bytes);
ucsdet_close(csd);
}
static void TestUTF16(void)
{
UErrorCode status = U_ZERO_ERROR;
/* Notice the BOM on the start of this string */
static const UChar chars[] = {
0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
0x064a, 0x062a, 0x0000};
int32_t beLength = 0, leLength = 0, cLength = UPRV_LENGTHOF(chars);
char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
UCharsetDetector *csd = ucsdet_open(&status);
const UCharsetMatch *match;
const char *name;
int32_t conf;
ucsdet_setText(csd, beBytes, beLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
goto try_le;
}
name = ucsdet_getName(match, &status);
conf = ucsdet_getConfidence(match, &status);
if (strcmp(name, "UTF-16BE") != 0) {
log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
}
if (conf != 100) {
log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
}
try_le:
ucsdet_setText(csd, leBytes, leLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
goto bail;
}
name = ucsdet_getName(match, &status);
conf = ucsdet_getConfidence(match, &status);
if (strcmp(name, "UTF-16LE") != 0) {
log_err("Encoding detection failure for UTF-16LE: got %s\n", name);
}
if (conf != 100) {
log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
}
bail:
freeBytes(leBytes);
freeBytes(beBytes);
ucsdet_close(csd);
}
static void TestC1Bytes(void)
{
#if !UCONFIG_NO_LEGACY_CONVERSION
UErrorCode status = U_ZERO_ERROR;
static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
int32_t sISOLength = 0, sWindowsLength = 0;
UChar sISO[sizeof(ssISO)];
UChar sWindows[sizeof(ssWindows)];
int32_t lISO = 0, lWindows = 0;
char *bISO;
char *bWindows;
UCharsetDetector *csd = ucsdet_open(&status);
const UCharsetMatch *match;
const char *name;
sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
ucsdet_setText(csd, bWindows, lWindows, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("English test with C1 bytes got no matches.\n");
goto bail;
}
name = ucsdet_getName(match, &status);
if (strcmp(name, "windows-1252") != 0) {
log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
}
ucsdet_setText(csd, bISO, lISO, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("English text without C1 bytes got no matches.\n");
goto bail;
}
name = ucsdet_getName(match, &status);
if (strcmp(name, "ISO-8859-1") != 0) {
log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
}
bail:
freeBytes(bWindows);
freeBytes(bISO);
ucsdet_close(csd);
#endif
}
static void TestInputFilter(void)
{
UErrorCode status = U_ZERO_ERROR;
static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
int32_t sLength = 0;
UChar s[sizeof(ss)];
int32_t byteLength = 0;
char *bytes;
UCharsetDetector *csd = ucsdet_open(&status);
const UCharsetMatch *match;
const char *lang, *name;
sLength = u_unescape(ss, s, sizeof(ss));
bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
ucsdet_enableInputFilter(csd, true);
if (!ucsdet_isInputFilterEnabled(csd)) {
log_err("ucsdet_enableInputFilter(csd, true) did not enable input filter!\n");
}
ucsdet_setText(csd, bytes, byteLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Turning on the input filter resulted in no matches.\n");
goto turn_off;
}
name = ucsdet_getName(match, &status);
if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
} else {
lang = ucsdet_getLanguage(match, &status);
if (lang == NULL || strcmp(lang, "fr") != 0) {
log_err("Input filter did not strip markup!\n");
}
}
turn_off:
ucsdet_enableInputFilter(csd, false);
ucsdet_setText(csd, bytes, byteLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Turning off the input filter resulted in no matches.\n");
goto bail;
}
name = ucsdet_getName(match, &status);
if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
} else {
lang = ucsdet_getLanguage(match, &status);
if (lang == NULL || strcmp(lang, "en") != 0) {
log_err("Unfiltered input did not detect as English!\n");
}
}
bail:
freeBytes(bytes);
ucsdet_close(csd);
}
static void TestChaining(void) {
UErrorCode status = U_USELESS_COLLATOR_ERROR;
ucsdet_open(&status);
ucsdet_setText(NULL, NULL, 0, &status);
ucsdet_getName(NULL, &status);
ucsdet_getConfidence(NULL, &status);
ucsdet_getLanguage(NULL, &status);
ucsdet_detect(NULL, &status);
ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
ucsdet_detectAll(NULL, NULL, &status);
ucsdet_getUChars(NULL, NULL, 0, &status);
ucsdet_getUChars(NULL, NULL, 0, &status);
ucsdet_close(NULL);
/* All of this code should have done nothing. */
if (status != U_USELESS_COLLATOR_ERROR) {
log_err("Status got changed to %s\n", u_errorName(status));
}
}
static void TestBufferOverflow(void) {
UErrorCode status = U_ZERO_ERROR;
static const char *testStrings[] = {
"\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
"\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
"\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
"\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
"\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
"\xa1", /* Could be a single byte shift-jis at the end */
"\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
"\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
};
static const char *testResults[] = {
"windows-1252",
"windows-1252",
"windows-1252",
"windows-1252",
"ISO-2022-JP",
NULL,
NULL,
"ISO-8859-1"
};
int32_t idx = 0;
UCharsetDetector *csd = ucsdet_open(&status);
const UCharsetMatch *match;
ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
if (U_FAILURE(status)) {
log_err("Couldn't open detector. %s\n", u_errorName(status));
goto bail;
}
for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) {
status = U_ZERO_ERROR;
ucsdet_setText(csd, testStrings[idx], -1, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
if (testResults[idx] != NULL) {
log_err("Unexpectedly got no results at index %d.\n", idx);
}
else {
log_verbose("Got no result as expected at index %d.\n", idx);
}
continue;
}
if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
goto bail;
}
}
bail:
ucsdet_close(csd);
}
static void TestIBM424(void)
{
UErrorCode status = U_ZERO_ERROR;
static const UChar chars[] = {
0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
};
static const UChar chars_reverse[] = {
0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
0x0000
};
int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
UCharsetDetector *csd = ucsdet_open(&status);
const UCharsetMatch *match;
const char *name;
ucsdet_setText(csd, bytes, bLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
goto bail;
}
name = ucsdet_getName(match, &status);
if (strcmp(name, "IBM424_rtl") != 0) {
log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
}
ucsdet_setText(csd, bytes_r, brLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
goto bail;
}
name = ucsdet_getName(match, &status);
if (strcmp(name, "IBM424_ltr") != 0) {
log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
}
bail:
freeBytes(bytes);
freeBytes(bytes_r);
ucsdet_close(csd);
}
static void TestIBM420(void)
{
UErrorCode status = U_ZERO_ERROR;
static const UChar chars[] = {
0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
0x0000
};
static const UChar chars_reverse[] = {
0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
0x0000,
};
int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
UCharsetDetector *csd = ucsdet_open(&status);
const UCharsetMatch *match;
const char *name;
ucsdet_setText(csd, bytes, bLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
goto bail;
}
name = ucsdet_getName(match, &status);
if (strcmp(name, "IBM420_rtl") != 0) {
log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
}
ucsdet_setText(csd, bytes_r, brLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
goto bail;
}
name = ucsdet_getName(match, &status);
if (strcmp(name, "IBM420_ltr") != 0) {
log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
}
bail:
freeBytes(bytes);
freeBytes(bytes_r);
ucsdet_close(csd);
}