ICU-6778 Update ICU4C IBM420 and IBM424 CharsetDetector with changes made to ICU4J.

X-SVN-Rev: 25763
This commit is contained in:
Michael Ow 2009-04-13 21:32:21 +00:00
parent b31d86a522
commit 97f2fecc68
4 changed files with 309 additions and 36 deletions

View file

@ -122,8 +122,10 @@ void CharsetDetector::setRecognizers(UErrorCode &status)
new CharsetRecog_2022KR(),
new CharsetRecog_2022CN(),
new CharsetRecog_IBM424_he(),
new CharsetRecog_IBM420_ar()
new CharsetRecog_IBM424_he_rtl(),
new CharsetRecog_IBM424_he_ltr(),
new CharsetRecog_IBM420_ar_rtl(),
new CharsetRecog_IBM420_ar_ltr()
};
int32_t rCount = ARRAY_SIZE(tempArray);
int32_t r;

View file

@ -7,6 +7,8 @@
#include "unicode/utypes.h"
#include "cmemory.h"
#if !UCONFIG_NO_CONVERSION
#include "csrsbcs.h"
@ -521,13 +523,20 @@ static const uint8_t charMap_KOI8_R[] = {
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
};
static const int32_t ngrams_IBM424_he[] = {
// N-gram table used by CharsetRecog_IBM424_he_rtl::match(), which passes it to
// match_sbcs() together with charMap_IBM424_he.
// Holds 64 entries; each entry packs three byte values into one 24-bit constant.
// NOTE(review): the entries are listed in ascending numeric order; presumably the
// matcher depends on that ordering -- confirm before inserting or reordering values.
static const int32_t ngrams_IBM424_he_rtl[] = {
0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
};
// N-gram table used by CharsetRecog_IBM424_he_ltr::match(), which passes it to
// match_sbcs() together with charMap_IBM424_he.
// Holds 64 entries; each entry packs three byte values into one 24-bit constant.
// NOTE(review): the entries are listed in ascending numeric order; presumably the
// matcher depends on that ordering -- confirm before inserting or reordering values.
static const int32_t ngrams_IBM424_he_ltr[] = {
0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
};
static const uint8_t charMap_IBM424_he[] = {
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
@ -548,11 +557,18 @@ static const uint8_t charMap_IBM424_he[] = {
/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
};
static const int32_t ngrams_IBM420_ar[] = {
0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158, 0x56B167,
0x56B169, 0x56B173, 0x56B173, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x624073, 0x6240AB, 0x6240BB, 0x634056,
0x734056, 0x7356B1, 0x736240, 0x73BD40, 0x754056, 0x756240, 0x774540, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB17745, 0xB1DA40, 0xB1DC40, 0xBB5640,
0xBB6240, 0xBBBD40, 0xBD4056, 0xCB4056, 0xCB5640, 0xD67940, 0xDA4056, 0xDC4056, 0xDC4073, 0xDC40BB, 0xDCBD40, 0xEAB140, 0x4056B1, 0x56B163, 0x6240B1, 0xBB4056,
// N-gram table used by CharsetRecog_IBM420_ar_rtl::match(), which passes it to
// match_sbcs() together with charMap_IBM420_ar.
// Holds 64 entries; each entry packs three byte values into one 24-bit constant.
// NOTE(review): the entries are listed in ascending numeric order; presumably the
// matcher depends on that ordering -- confirm before inserting or reordering values.
static const int32_t ngrams_IBM420_ar_rtl[] = {
0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
};
// N-gram table used by CharsetRecog_IBM420_ar_ltr::match(), which passes it to
// match_sbcs() together with charMap_IBM420_ar.
// Holds 64 entries; each entry packs three byte values into one 24-bit constant.
// NOTE(review): the entries are listed in ascending numeric order; presumably the
// matcher depends on that ordering -- confirm before inserting or reordering values.
static const int32_t ngrams_IBM420_ar_ltr[] = {
0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
};
static const uint8_t charMap_IBM420_ar[]= {
@ -1150,29 +1166,64 @@ CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
// nothing to do
}
// Returns the charset name reported by the base IBM424 Hebrew recognizer.
const char *CharsetRecog_IBM424_he::getName() const {
    const char *const name = "IBM424";
    return name;
}
// Returns the language code reported by the IBM424 Hebrew recognizers.
const char *CharsetRecog_IBM424_he::getLanguage() const {
    const char *const language = "he";
    return language;
}
// Hands the input to match_sbcs() with the IBM424 Hebrew n-gram table and byte
// map, and returns whatever match_sbcs() reports.
int32_t CharsetRecog_IBM424_he::match(InputText *textIn) {
    const int32_t result = match_sbcs(textIn, ngrams_IBM424_he, charMap_IBM424_he);
    return result;
}
CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
// Destructor: intentionally empty, no cleanup is performed here.
CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl() {
    // nothing to do
}
const char *CharsetRecog_IBM420_ar::getName() const
// Returns the charset name reported by this recognizer.
// A leftover `return "IBM420";` used to precede this statement, which made the
// function report the wrong charset and left the intended value unreachable.
const char *CharsetRecog_IBM424_he_rtl::getName() const
{
    return "IBM424_rtl";
}
// Hands the input to match_sbcs() with the IBM424 "_rtl" n-gram table and the
// shared IBM424 Hebrew byte map, and returns whatever match_sbcs() reports.
int32_t CharsetRecog_IBM424_he_rtl::match(InputText *textIn) {
    const int32_t result = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
    return result;
}
// Destructor: intentionally empty, no cleanup is performed here.
CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr() {
    // nothing to do
}
// Returns the charset name reported by this recognizer.
const char *CharsetRecog_IBM424_he_ltr::getName() const {
    const char *const name = "IBM424_ltr";
    return name;
}
// Hands the input to match_sbcs() with the IBM424 "_ltr" n-gram table and the
// shared IBM424 Hebrew byte map, and returns whatever match_sbcs() reports.
int32_t CharsetRecog_IBM424_he_ltr::match(InputText *textIn) {
    const int32_t result = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
    return result;
}
// 256-entry byte-to-byte table, indexed by a byte value (row = high nibble,
// column = low nibble). CharsetRecog_IBM420_ar::unshape() replaces every byte of
// its working buffer with the corresponding entry.
// Rows 0- through 3- map everything to 0x40. In the remaining rows most bytes map
// to themselves, while a number of bytes map to a different value.
// NOTE(review): the non-identity entries presumably fold the shaped forms of an
// Arabic letter onto one representative code -- confirm against the IBM420 chart.
static const uint8_t unshapeMap_IBM420[] = {
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
// Destructor: intentionally empty, no cleanup is performed here.
CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar() {
    // nothing to do
}
const char *CharsetRecog_IBM420_ar::getLanguage() const
@ -1180,10 +1231,118 @@ const char *CharsetRecog_IBM420_ar::getLanguage() const
return "ar";
}
int32_t CharsetRecog_IBM420_ar::match(InputText *textIn)
// Swaps an unshaped copy of the input into textIn before matching.
//
// The original buffer pointer and length are always saved in prev_fInputBytes /
// prev_fInputBytesLength. If unshape() yields a buffer, textIn is pointed at it
// and deleteBuffer is set so that matchFinish() can free it and restore the
// saved values; otherwise textIn is left untouched and deleteBuffer is cleared.
void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) {
    // Save the caller's view of the input first; unshape() reads from it.
    prev_fInputBytes = textIn->fInputBytes;
    prev_fInputBytesLength = textIn->fInputLen;

    int32_t unshapedLength = 0;
    uint8_t *unshaped = unshape(prev_fInputBytes, prev_fInputBytesLength, unshapedLength);

    if (unshaped == NULL) {
        // No replacement buffer was produced: keep matching on the original bytes.
        deleteBuffer = FALSE;
        return;
    }

    textIn->fInputBytes = unshaped;
    textIn->fInputLen = unshapedLength;
    deleteBuffer = TRUE;
}
// Produces an unshaped copy of the input bytes.
//
// First expands Lam-Alef bytes via unshapeLamAlef(), then replaces every byte of
// the expanded buffer with its unshapeMap_IBM420 entry.
//
// @param inputBytes        bytes to convert (not modified)
// @param inputBytesLength  number of bytes in inputBytes
// @param length            receives the number of bytes in the returned buffer
// @return buffer obtained from unshapeLamAlef() (to be released with delete []),
//         or NULL if no buffer could be produced
uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
    uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length);

    if (resultArray != NULL) {
        // The buffer holds `length` bytes, which exceeds inputBytesLength whenever a
        // Lam-Alef byte was expanded. Iterate over the full expanded length so the
        // tail of the buffer is mapped as well.
        for (int32_t i = 0; i < length; i++) {
            resultArray[i] = unshapeMap_IBM420[resultArray[i]];
        }
    }

    return resultArray;
}
// Copies the input, replacing every byte for which isLamAlef() is true with the
// two-byte sequence 0xb1 0x56; all other bytes are copied unchanged.
//
// The bytes to expand are counted first so that the result can be allocated once
// at its exact size, with no oversized scratch buffer and second copy.
//
// @param inputBytes        bytes to convert (not modified)
// @param inputBytesLength  number of bytes in inputBytes
// @param length            receives the size of the returned buffer; set to 0
//                          when NULL is returned
// @return buffer allocated with new[] (release with delete []), or NULL when the
//         arguments are unusable or the allocation yields no buffer
uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
    static const uint8_t unshapedLamAlef[] = { 0xb1, 0x56 };

    length = 0;
    if (inputBytesLength < 0 || (inputBytes == NULL && inputBytesLength > 0)) {
        return NULL;
    }

    // Pass 1: each Lam-Alef byte grows the output by exactly one byte.
    int32_t lamAlefCount = 0;
    for (int32_t i = 0; i < inputBytesLength; i++) {
        if (isLamAlef(inputBytes[i])) {
            lamAlefCount++;
        }
    }

    const int32_t expandedLength = inputBytesLength + lamAlefCount;
    uint8_t *resultBuffer = new uint8_t[expandedLength];
    // Kept from the original code: only meaningful in builds where new[] can
    // return NULL instead of throwing.
    if (resultBuffer == NULL) {
        return NULL;
    }

    // Pass 2: fill the exactly-sized buffer.
    int32_t bufferIndex = 0;
    for (int32_t i = 0; i < inputBytesLength; i++) {
        if (isLamAlef(inputBytes[i])) {
            resultBuffer[bufferIndex++] = unshapedLamAlef[0];
            resultBuffer[bufferIndex++] = unshapedLamAlef[1];
        } else {
            resultBuffer[bufferIndex++] = inputBytes[i];
        }
    }

    length = bufferIndex;
    return resultBuffer;
}
// Undoes matchInit(): when matchInit() installed a replacement buffer
// (deleteBuffer is set), frees that buffer and restores the saved input pointer
// and length on textIn. Does nothing otherwise.
void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) {
    if (deleteBuffer) {
        delete [] textIn->fInputBytes;

        textIn->fInputBytes = prev_fInputBytes;
        textIn->fInputLen = prev_fInputBytesLength;

        // Clear the flag so a repeated matchFinish() without an intervening
        // matchInit() cannot delete the restored caller-owned buffer.
        deleteBuffer = FALSE;
    }
}
// Returns TRUE when b is one of the byte values that unshapeLamAlef() expands
// (0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8), and FALSE for every other value.
UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) {
    switch (b) {
    case 0xb2:
    case 0xb3:
    case 0xb4:
    case 0xb5:
    case 0xb7:
    case 0xb8:
        return TRUE;
    default:
        return FALSE;
    }
}
// Destructor: intentionally empty, no cleanup is performed here.
// A leftover `return match_sbcs(...)` statement (it referenced an undeclared
// `textIn` and returned a value from a destructor) has been dropped from the body.
CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
{
    // nothing to do
}
// Returns the charset name reported by this recognizer.
const char *CharsetRecog_IBM420_ar_rtl::getName() const {
    const char *const name = "IBM420_rtl";
    return name;
}
// Hands the input to match_sbcs() with the IBM420 "_rtl" n-gram table and the
// shared IBM420 Arabic byte map, and returns whatever match_sbcs() reports.
// NOTE(review): matchInit()/matchFinish() are not called around this lookup, so
// the unshaping helpers of CharsetRecog_IBM420_ar are not applied on this path --
// confirm whether that is intended.
int32_t CharsetRecog_IBM420_ar_rtl::match(InputText *textIn) {
    const int32_t result = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
    return result;
}
// Destructor: intentionally empty, no cleanup is performed here.
CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr() {
    // nothing to do
}
// Returns the charset name reported by this recognizer.
const char *CharsetRecog_IBM420_ar_ltr::getName() const {
    const char *const name = "IBM420_ltr";
    return name;
}
// Hands the input to match_sbcs() with the IBM420 "_ltr" n-gram table and the
// shared IBM420 Arabic byte map, and returns whatever match_sbcs() reports.
// NOTE(review): matchInit()/matchFinish() are not called around this lookup, so
// the unshaping helpers of CharsetRecog_IBM420_ar are not applied on this path --
// confirm whether that is intended.
int32_t CharsetRecog_IBM420_ar_ltr::match(InputText *textIn) {
    const int32_t result = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
    return result;
}
U_NAMESPACE_END

View file

@ -363,10 +363,23 @@ class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
public:
virtual ~CharsetRecog_IBM424_he();
const char *getName() const;
const char *getLanguage() const;
};
// IBM424 Hebrew recognizer variant that matches with the "_rtl" n-gram table.
// Derives from CharsetRecog_IBM424_he and supplies its own name and match().
class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
public:
virtual ~CharsetRecog_IBM424_he_rtl();
// Returns the charset name for this recognizer.
const char *getName() const;
// Returns the value match_sbcs() produces for textIn with this variant's tables.
int32_t match(InputText *textIn);
};
// IBM424 Hebrew recognizer variant that matches with the "_ltr" n-gram table.
// Derives from CharsetRecog_IBM424_he and supplies its own name and match().
class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
public:  // was missing: members defaulted to private, unlike the _rtl sibling
    virtual ~CharsetRecog_IBM424_he_ltr();

    // Returns the charset name for this recognizer.
    const char *getName() const;

    // Returns the value match_sbcs() produces for textIn with this variant's tables.
    int32_t match(InputText *textIn);
};
@ -375,10 +388,36 @@ class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
public:
virtual ~CharsetRecog_IBM420_ar();
const char *getName() const;
const char *getLanguage() const;
protected:
void matchInit(InputText *textIn);
void matchFinish(InputText *textIn);
private:
uint8_t *prev_fInputBytes;
int32_t prev_fInputBytesLength;
UBool deleteBuffer;
UBool isLamAlef(uint8_t b);
uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
};
// IBM420 Arabic recognizer variant that matches with the "_rtl" n-gram table.
// Derives from CharsetRecog_IBM420_ar and supplies its own name and match().
class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
public:
virtual ~CharsetRecog_IBM420_ar_rtl();
// Returns the charset name for this recognizer.
const char *getName() const;
// Returns the value match_sbcs() produces for textIn with this variant's tables.
int32_t match(InputText *textIn);
};
// IBM420 Arabic recognizer variant that matches with the "_ltr" n-gram table.
// Derives from CharsetRecog_IBM420_ar and supplies its own name and match().
class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
public:  // was missing: members defaulted to private, unlike the _rtl sibling
    virtual ~CharsetRecog_IBM420_ar_ltr();

    // Returns the charset name for this recognizer.
    const char *getName() const;

    // Returns the value match_sbcs() produces for textIn with this variant's tables.
    int32_t match(InputText *textIn);
};

View file

@ -446,8 +446,33 @@ static void TestIBM424(void)
0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
};
int32_t bLength = 0, cLength = ARRAY_SIZE(chars);
static const UChar chars_reverse[] = {
0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
0x0000
};
int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
UCharsetDetector *csd = ucsdet_open(&status);
const UCharsetMatch *match;
const char *name;
@ -456,17 +481,31 @@ static void TestIBM424(void)
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Encoding detection failure for IBM424: got no matches.\n");
log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
goto bail;
}
name = ucsdet_getName(match, &status);
if (strcmp(name, "IBM424") != 0) {
log_err("Encoding detection failure for IBM424: got %s\n", name);
if (strcmp(name, "IBM424_rtl") != 0) {
log_err("Encoding detection failure for IBM424_rtl: got %s\n", name);
}
ucsdet_setText(csd, bytes_r, brLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
goto bail;
}
name = ucsdet_getName(match, &status);
if (strcmp(name, "IBM424_ltr") != 0) {
log_err("Encoding detection failure for IBM424_ltr: got %s\n", name);
}
bail:
freeBytes(bytes);
freeBytes(bytes_r);
ucsdet_close(csd);
}
@ -490,8 +529,28 @@ static void TestIBM420(void)
0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
0x0000
};
int32_t bLength = 0, cLength = ARRAY_SIZE(chars);
static const UChar chars_reverse[] = {
0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
0x0000,
};
int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
UCharsetDetector *csd = ucsdet_open(&status);
const UCharsetMatch *match;
const char *name;
@ -500,16 +559,30 @@ static void TestIBM420(void)
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Encoding detection failure for IBM420: got no matches.\n");
log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
goto bail;
}
name = ucsdet_getName(match, &status);
if (strcmp(name, "IBM420") != 0) {
log_err("Encoding detection failure for IBM420: got %s\n", name);
if (strcmp(name, "IBM420_rtl") != 0) {
log_err("Encoding detection failure for IBM420_rtl: got %s\n", name);
}
ucsdet_setText(csd, bytes_r, brLength, &status);
match = ucsdet_detect(csd, &status);
if (match == NULL) {
log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
goto bail;
}
name = ucsdet_getName(match, &status);
if (strcmp(name, "IBM420_ltr") != 0) {
log_err("Encoding detection failure for IBM420_ltr: got %s\n", name);
}
bail:
freeBytes(bytes);
freeBytes(bytes_r);
ucsdet_close(csd);
}