mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-6778 Update ICU4C IBM420 and IBM424 CharsetDetector with changes made to ICU4J.
X-SVN-Rev: 25763
This commit is contained in:
parent
b31d86a522
commit
97f2fecc68
4 changed files with 309 additions and 36 deletions
|
@ -122,8 +122,10 @@ void CharsetDetector::setRecognizers(UErrorCode &status)
|
|||
new CharsetRecog_2022KR(),
|
||||
new CharsetRecog_2022CN(),
|
||||
|
||||
new CharsetRecog_IBM424_he(),
|
||||
new CharsetRecog_IBM420_ar()
|
||||
new CharsetRecog_IBM424_he_rtl(),
|
||||
new CharsetRecog_IBM424_he_ltr(),
|
||||
new CharsetRecog_IBM420_ar_rtl(),
|
||||
new CharsetRecog_IBM420_ar_ltr()
|
||||
};
|
||||
int32_t rCount = ARRAY_SIZE(tempArray);
|
||||
int32_t r;
|
||||
|
|
|
@ -7,6 +7,8 @@
|
|||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "cmemory.h"
|
||||
|
||||
#if !UCONFIG_NO_CONVERSION
|
||||
#include "csrsbcs.h"
|
||||
|
||||
|
@ -521,13 +523,20 @@ static const uint8_t charMap_KOI8_R[] = {
|
|||
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
|
||||
};
|
||||
|
||||
static const int32_t ngrams_IBM424_he[] = {
|
||||
/*
 * N-gram table used by the "IBM424_rtl" recognizer: it is the table that
 * CharsetRecog_IBM424_he_rtl::match() passes to match_sbcs() together with
 * charMap_IBM424_he. Each of the 64 entries is a 24-bit value (three bytes),
 * and the entries are listed in ascending order.
 */
static const int32_t ngrams_IBM424_he_rtl[] = {
    0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546,
    0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
    0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056,
    0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
    0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041,
    0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
    0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045,
    0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
};
|
||||
|
||||
/*
 * N-gram table used by the "IBM424_ltr" recognizer: it is the table that
 * CharsetRecog_IBM424_he_ltr::match() passes to match_sbcs() together with
 * charMap_IBM424_he. Each of the 64 entries is a 24-bit value (three bytes),
 * and the entries are listed in ascending order.
 */
static const int32_t ngrams_IBM424_he_ltr[] = {
    0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462,
    0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
    0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645,
    0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
    0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140,
    0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
    0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540,
    0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
};
|
||||
|
||||
static const uint8_t charMap_IBM424_he[] = {
|
||||
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
|
||||
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
|
@ -548,11 +557,18 @@ static const uint8_t charMap_IBM424_he[] = {
|
|||
/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
};
|
||||
|
||||
static const int32_t ngrams_IBM420_ar[] = {
|
||||
0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158, 0x56B167,
|
||||
0x56B169, 0x56B173, 0x56B173, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x624073, 0x6240AB, 0x6240BB, 0x634056,
|
||||
0x734056, 0x7356B1, 0x736240, 0x73BD40, 0x754056, 0x756240, 0x774540, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB17745, 0xB1DA40, 0xB1DC40, 0xBB5640,
|
||||
0xBB6240, 0xBBBD40, 0xBD4056, 0xCB4056, 0xCB5640, 0xD67940, 0xDA4056, 0xDC4056, 0xDC4073, 0xDC40BB, 0xDCBD40, 0xEAB140, 0x4056B1, 0x56B163, 0x6240B1, 0xBB4056,
|
||||
/*
 * N-gram table used by the "IBM420_rtl" recognizer: it is the table that
 * CharsetRecog_IBM420_ar_rtl::match() passes to match_sbcs() together with
 * charMap_IBM420_ar. Each of the 64 entries is a 24-bit value (three bytes),
 * and the entries are listed in ascending order.
 */
static const int32_t ngrams_IBM420_ar_rtl[] = {
    0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56,
    0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
    0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB,
    0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
    0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240,
    0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
    0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1,
    0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
};
|
||||
|
||||
/*
 * N-gram table used by the "IBM420_ltr" recognizer: it is the table that
 * CharsetRecog_IBM420_ar_ltr::match() passes to match_sbcs() together with
 * charMap_IBM420_ar. Each of the 64 entries is a 24-bit value (three bytes),
 * and the entries are listed in ascending order.
 */
static const int32_t ngrams_IBM420_ar_ltr[] = {
    0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC,
    0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
    0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062,
    0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
    0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156,
    0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
    0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156,
    0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
};
|
||||
|
||||
static const uint8_t charMap_IBM420_ar[]= {
|
||||
|
@ -1150,29 +1166,64 @@ CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
|
|||
// nothing to do
|
||||
}
|
||||
|
||||
const char *CharsetRecog_IBM424_he::getName() const
|
||||
{
|
||||
return "IBM424";
|
||||
}
|
||||
|
||||
const char *CharsetRecog_IBM424_he::getLanguage() const
|
||||
{
|
||||
return "he";
|
||||
}
|
||||
|
||||
int32_t CharsetRecog_IBM424_he::match(InputText *textIn)
|
||||
{
|
||||
return match_sbcs(textIn, ngrams_IBM424_he, charMap_IBM424_he);
|
||||
}
|
||||
|
||||
CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
|
||||
CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
|
||||
{
|
||||
// nothing to do
|
||||
}
|
||||
|
||||
const char *CharsetRecog_IBM420_ar::getName() const
|
||||
const char *CharsetRecog_IBM424_he_rtl::getName() const
|
||||
{
|
||||
return "IBM420";
|
||||
return "IBM424_rtl";
|
||||
}
|
||||
|
||||
/**
 * Score the input as IBM424_rtl by delegating to match_sbcs() with the
 * ngrams_IBM424_he_rtl table and the charMap_IBM424_he byte map.
 *
 * @param textIn the input text handed through to match_sbcs().
 * @return whatever match_sbcs() returns for this table/map pair.
 */
int32_t CharsetRecog_IBM424_he_rtl::match(InputText *textIn) {
    const int32_t result = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);

    return result;
}
|
||||
|
||||
/** Destructor. Intentionally empty: there is nothing to do at this level. */
CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr() {
}
|
||||
|
||||
const char *CharsetRecog_IBM424_he_ltr::getName() const
|
||||
{
|
||||
return "IBM424_ltr";
|
||||
}
|
||||
|
||||
/**
 * Score the input as IBM424_ltr by delegating to match_sbcs() with the
 * ngrams_IBM424_he_ltr table and the charMap_IBM424_he byte map.
 *
 * @param textIn the input text handed through to match_sbcs().
 * @return whatever match_sbcs() returns for this table/map pair.
 */
int32_t CharsetRecog_IBM424_he_ltr::match(InputText *textIn) {
    const int32_t result = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);

    return result;
}
|
||||
|
||||
/*
 * 256-entry byte-to-byte map that CharsetRecog_IBM420_ar::unshape() applies to
 * every byte of its working buffer. Index = input byte, value = replacement.
 * Bytes 0x00-0x3F all map to 0x40; from 0x40 upward most bytes map to
 * themselves and the remainder map to a nearby code.
 * NOTE(review): presumably the non-identity entries fold shaped Arabic forms
 * onto one representative code each - confirm against the IBM420 code page.
 */
static const uint8_t unshapeMap_IBM420[] = {
/*            -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 4- */    0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
/* 5- */    0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
/* 6- */    0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */    0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
/* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
/* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
/* A- */    0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
/* B- */    0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
/* C- */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
/* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
|
||||
|
||||
/** Destructor. Intentionally empty: there is nothing to do at this level. */
CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar() {
}
|
||||
|
||||
const char *CharsetRecog_IBM420_ar::getLanguage() const
|
||||
|
@ -1180,10 +1231,118 @@ const char *CharsetRecog_IBM420_ar::getLanguage() const
|
|||
return "ar";
|
||||
}
|
||||
|
||||
int32_t CharsetRecog_IBM420_ar::match(InputText *textIn)
|
||||
/**
 * Prepare the input for matching. The caller's buffer pointer and length are
 * saved in prev_fInputBytes / prev_fInputBytesLength, then unshape() is asked
 * for an unshaped copy. When a copy is returned it is installed in textIn and
 * deleteBuffer is set to TRUE; otherwise textIn is left untouched and
 * deleteBuffer is set to FALSE. matchFinish() uses deleteBuffer to decide
 * whether the substitution has to be undone.
 *
 * @param textIn input whose fInputBytes / fInputLen may be replaced.
 */
void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) {
    prev_fInputBytes       = textIn->fInputBytes;
    prev_fInputBytesLength = textIn->fInputLen;

    int32_t unshapedLength = 0;
    uint8_t *unshaped = unshape(prev_fInputBytes, prev_fInputBytesLength, unshapedLength);

    if (unshaped == NULL) {
        // No unshaped copy available: keep matching against the original bytes.
        deleteBuffer = FALSE;
        return;
    }

    textIn->fInputBytes = unshaped;
    textIn->fInputLen   = unshapedLength;
    deleteBuffer        = TRUE;
}
|
||||
|
||||
/**
 * Produce an unshaped copy of the input bytes.
 *
 * The input is first expanded by unshapeLamAlef(), then every byte of the
 * expanded buffer is replaced through unshapeMap_IBM420.
 *
 * @param inputBytes       the bytes to unshape; not modified.
 * @param inputBytesLength number of bytes in inputBytes.
 * @param length           receives the number of bytes in the returned
 *                         buffer, as reported by unshapeLamAlef().
 * @return the buffer returned by unshapeLamAlef() (allocated with new[]),
 *         or NULL if that call returned NULL.
 */
uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
    uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length);

    if (resultArray != NULL) {
        // Iterate over the expanded length, not inputBytesLength: each
        // lam-alef byte became two bytes, so the result can be longer than
        // the input and stopping early would leave its tail unmapped.
        for (int32_t i = 0; i < length; i++) {
            resultArray[i] = unshapeMap_IBM420[resultArray[i]];
        }
    }

    return resultArray;
}
|
||||
|
||||
/**
 * Copy the input, replacing every byte for which isLamAlef() is true with the
 * two-byte sequence 0xb1 0x56. All other bytes are copied unchanged.
 *
 * The result is sized exactly by counting the bytes to expand first, so no
 * double-size scratch buffer or second copy is needed.
 *
 * @param inputBytes       the bytes to copy; not modified.
 * @param inputBytesLength number of bytes in inputBytes; a negative value is
 *                         rejected.
 * @param length           on success, receives the number of bytes in the
 *                         returned buffer; left untouched on failure.
 * @return a buffer allocated with new[] that the caller must release with
 *         delete[], or NULL on failure.
 */
uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
    static const uint8_t unshapedLamAlef[] = { 0xb1, 0x56 };

    if (inputBytesLength < 0) {
        // A negative size must never reach new[].
        return NULL;
    }

    // Pass 1: each lam-alef byte grows the output by exactly one byte.
    int32_t lamAlefCount = 0;
    for (int32_t i = 0; i < inputBytesLength; i++) {
        if (isLamAlef(inputBytes[i])) {
            lamAlefCount++;
        }
    }

    const int32_t resultLength = inputBytesLength + lamAlefCount;
    uint8_t *resultBuffer = new uint8_t[resultLength];

    // Same NULL check the original performed on its allocations.
    if (resultBuffer == NULL) {
        return NULL;
    }

    // Pass 2: copy, expanding lam-alef bytes in place.
    int32_t bufferIndex = 0;
    for (int32_t i = 0; i < inputBytesLength; i++) {
        if (isLamAlef(inputBytes[i])) {
            resultBuffer[bufferIndex++] = unshapedLamAlef[0];
            resultBuffer[bufferIndex++] = unshapedLamAlef[1];
        } else {
            resultBuffer[bufferIndex++] = inputBytes[i];
        }
    }

    length = bufferIndex;

    return resultBuffer;
}
|
||||
|
||||
/**
 * Undo the buffer substitution made by matchInit(). When deleteBuffer is set,
 * the buffer currently installed in textIn is released with delete[] and the
 * saved prev_fInputBytes / prev_fInputBytesLength are put back. When it is not
 * set, nothing happens.
 *
 * @param textIn the input whose fInputBytes / fInputLen are restored.
 */
void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) {
    if (!deleteBuffer) {
        return;
    }

    delete [] textIn->fInputBytes;

    textIn->fInputBytes = prev_fInputBytes;
    textIn->fInputLen = prev_fInputBytesLength;

    // Clear the flag so a second matchFinish() without an intervening
    // matchInit() cannot delete[] the restored original buffer.
    deleteBuffer = FALSE;
}
|
||||
|
||||
/**
 * Test whether a byte is one of the shaped lam-alef codes.
 *
 * @param b the byte to test.
 * @return TRUE if b is 0xb2, 0xb3, 0xb4, 0xb5, 0xb7 or 0xb8; FALSE otherwise.
 */
UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) {
    switch (b) {
    case 0xb2:
    case 0xb3:
    case 0xb4:
    case 0xb5:
    case 0xb7:
    case 0xb8:
        return TRUE;

    default:
        return FALSE;
    }
}
|
||||
|
||||
CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
|
||||
{
|
||||
// TODO: May need to add shaping
|
||||
return match_sbcs(textIn, ngrams_IBM420_ar, charMap_IBM420_ar);
|
||||
// nothing to do
|
||||
}
|
||||
|
||||
const char *CharsetRecog_IBM420_ar_rtl::getName() const
|
||||
{
|
||||
return "IBM420_rtl";
|
||||
}
|
||||
|
||||
/**
 * Score the input as IBM420_rtl by delegating to match_sbcs() with the
 * ngrams_IBM420_ar_rtl table and the charMap_IBM420_ar byte map.
 *
 * @param textIn the input text handed through to match_sbcs().
 * @return whatever match_sbcs() returns for this table/map pair.
 */
int32_t CharsetRecog_IBM420_ar_rtl::match(InputText *textIn) {
    const int32_t result = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);

    return result;
}
|
||||
|
||||
/** Destructor. Intentionally empty: there is nothing to do at this level. */
CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr() {
}
|
||||
|
||||
const char *CharsetRecog_IBM420_ar_ltr::getName() const
|
||||
{
|
||||
return "IBM420_ltr";
|
||||
}
|
||||
|
||||
/**
 * Score the input as IBM420_ltr by delegating to match_sbcs() with the
 * ngrams_IBM420_ar_ltr table and the charMap_IBM420_ar byte map.
 *
 * @param textIn the input text handed through to match_sbcs().
 * @return whatever match_sbcs() returns for this table/map pair.
 */
int32_t CharsetRecog_IBM420_ar_ltr::match(InputText *textIn) {
    const int32_t result = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);

    return result;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -363,10 +363,23 @@ class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
|
|||
public:
|
||||
virtual ~CharsetRecog_IBM424_he();
|
||||
|
||||
const char *getName() const;
|
||||
|
||||
const char *getLanguage() const;
|
||||
};
|
||||
|
||||
class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
|
||||
public:
|
||||
virtual ~CharsetRecog_IBM424_he_rtl();
|
||||
|
||||
const char *getName() const;
|
||||
|
||||
int32_t match(InputText *textIn);
|
||||
};
|
||||
|
||||
class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
|
||||
virtual ~CharsetRecog_IBM424_he_ltr();
|
||||
|
||||
const char *getName() const;
|
||||
|
||||
int32_t match(InputText *textIn);
|
||||
};
|
||||
|
||||
|
@ -375,10 +388,36 @@ class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
|
|||
public:
|
||||
virtual ~CharsetRecog_IBM420_ar();
|
||||
|
||||
const char *getName() const;
|
||||
|
||||
const char *getLanguage() const;
|
||||
|
||||
protected:
|
||||
void matchInit(InputText *textIn);
|
||||
void matchFinish(InputText *textIn);
|
||||
|
||||
private:
|
||||
uint8_t *prev_fInputBytes;
|
||||
int32_t prev_fInputBytesLength;
|
||||
UBool deleteBuffer;
|
||||
|
||||
UBool isLamAlef(uint8_t b);
|
||||
uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
|
||||
uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
|
||||
};
|
||||
|
||||
class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
|
||||
public:
|
||||
virtual ~CharsetRecog_IBM420_ar_rtl();
|
||||
|
||||
const char *getName() const;
|
||||
|
||||
int32_t match(InputText *textIn);
|
||||
};
|
||||
|
||||
class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
|
||||
virtual ~CharsetRecog_IBM420_ar_ltr();
|
||||
|
||||
const char *getName() const;
|
||||
|
||||
int32_t match(InputText *textIn);
|
||||
};
|
||||
|
||||
|
|
|
@ -446,8 +446,33 @@ static void TestIBM424(void)
|
|||
0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
|
||||
0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
|
||||
};
|
||||
int32_t bLength = 0, cLength = ARRAY_SIZE(chars);
|
||||
|
||||
static const UChar chars_reverse[] = {
|
||||
0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
|
||||
0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
|
||||
0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
|
||||
0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
|
||||
0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
|
||||
0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
|
||||
0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
|
||||
0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
|
||||
0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
|
||||
0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
|
||||
0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
|
||||
0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
|
||||
0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
|
||||
0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
|
||||
0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
|
||||
0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
|
||||
0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
|
||||
0x0000
|
||||
};
|
||||
|
||||
int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
|
||||
|
||||
char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
|
||||
char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
|
||||
|
||||
UCharsetDetector *csd = ucsdet_open(&status);
|
||||
const UCharsetMatch *match;
|
||||
const char *name;
|
||||
|
@ -456,17 +481,31 @@ static void TestIBM424(void)
|
|||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
log_err("Encoding detection failure for IBM424: got no matches.\n");
|
||||
log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
name = ucsdet_getName(match, &status);
|
||||
if (strcmp(name, "IBM424") != 0) {
|
||||
log_err("Encoding detection failure for IBM424: got %s\n", name);
|
||||
if (strcmp(name, "IBM424_rtl") != 0) {
|
||||
log_err("Encoding detection failure for IBM424_rtl: got %s\n", name);
|
||||
}
|
||||
|
||||
ucsdet_setText(csd, bytes_r, brLength, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
name = ucsdet_getName(match, &status);
|
||||
if (strcmp(name, "IBM424_ltr") != 0) {
|
||||
log_err("Encoding detection failure for IBM424_ltr: got %s\n", name);
|
||||
}
|
||||
|
||||
bail:
|
||||
freeBytes(bytes);
|
||||
freeBytes(bytes_r);
|
||||
ucsdet_close(csd);
|
||||
}
|
||||
|
||||
|
@ -490,8 +529,28 @@ static void TestIBM420(void)
|
|||
0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
|
||||
0x0000
|
||||
};
|
||||
int32_t bLength = 0, cLength = ARRAY_SIZE(chars);
|
||||
static const UChar chars_reverse[] = {
|
||||
0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
|
||||
0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
|
||||
0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
|
||||
0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
|
||||
0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
|
||||
0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
|
||||
0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
|
||||
0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
|
||||
0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
|
||||
0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
|
||||
0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
|
||||
0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
|
||||
0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
|
||||
0x0000,
|
||||
};
|
||||
|
||||
int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
|
||||
|
||||
char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
|
||||
char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
|
||||
|
||||
UCharsetDetector *csd = ucsdet_open(&status);
|
||||
const UCharsetMatch *match;
|
||||
const char *name;
|
||||
|
@ -500,16 +559,30 @@ static void TestIBM420(void)
|
|||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
log_err("Encoding detection failure for IBM420: got no matches.\n");
|
||||
log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
name = ucsdet_getName(match, &status);
|
||||
if (strcmp(name, "IBM420") != 0) {
|
||||
log_err("Encoding detection failure for IBM420: got %s\n", name);
|
||||
if (strcmp(name, "IBM420_rtl") != 0) {
|
||||
log_err("Encoding detection failure for IBM420_rtl: got %s\n", name);
|
||||
}
|
||||
|
||||
ucsdet_setText(csd, bytes_r, brLength, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
name = ucsdet_getName(match, &status);
|
||||
if (strcmp(name, "IBM420_ltr") != 0) {
|
||||
log_err("Encoding detection failure for IBM420_ltr: got %s\n", name);
|
||||
}
|
||||
|
||||
bail:
|
||||
freeBytes(bytes);
|
||||
freeBytes(bytes_r);
|
||||
ucsdet_close(csd);
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue