From 97f2fecc681d48877f8dcd878fe26551348c719b Mon Sep 17 00:00:00 2001 From: Michael Ow Date: Mon, 13 Apr 2009 21:32:21 +0000 Subject: [PATCH] ICU-6778 Update ICU4C IBM420 and IBM424 CharsetDetector with changes made to ICU4J. X-SVN-Rev: 25763 --- icu4c/source/i18n/csdetect.cpp | 6 +- icu4c/source/i18n/csrsbcs.cpp | 203 +++++++++++++++++++++++--- icu4c/source/i18n/csrsbcs.h | 47 +++++- icu4c/source/test/cintltst/ucsdetst.c | 89 ++++++++++- 4 files changed, 309 insertions(+), 36 deletions(-) diff --git a/icu4c/source/i18n/csdetect.cpp b/icu4c/source/i18n/csdetect.cpp index f306e09067a..2deefdd9963 100644 --- a/icu4c/source/i18n/csdetect.cpp +++ b/icu4c/source/i18n/csdetect.cpp @@ -122,8 +122,10 @@ void CharsetDetector::setRecognizers(UErrorCode &status) new CharsetRecog_2022KR(), new CharsetRecog_2022CN(), - new CharsetRecog_IBM424_he(), - new CharsetRecog_IBM420_ar() + new CharsetRecog_IBM424_he_rtl(), + new CharsetRecog_IBM424_he_ltr(), + new CharsetRecog_IBM420_ar_rtl(), + new CharsetRecog_IBM420_ar_ltr() }; int32_t rCount = ARRAY_SIZE(tempArray); int32_t r; diff --git a/icu4c/source/i18n/csrsbcs.cpp b/icu4c/source/i18n/csrsbcs.cpp index 5dbe934eaa7..19634037a20 100644 --- a/icu4c/source/i18n/csrsbcs.cpp +++ b/icu4c/source/i18n/csrsbcs.cpp @@ -7,6 +7,8 @@ #include "unicode/utypes.h" +#include "cmemory.h" + #if !UCONFIG_NO_CONVERSION #include "csrsbcs.h" @@ -521,13 +523,20 @@ static const uint8_t charMap_KOI8_R[] = { 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, }; -static const int32_t ngrams_IBM424_he[] = { +static const int32_t ngrams_IBM424_he_rtl[] = { 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641, 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045, 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056, 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069, }; +static const int32_t ngrams_IBM424_he_ltr[] = { + 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141, + 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054, + 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940, + 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651, +}; + static const uint8_t charMap_IBM424_he[] = { /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, @@ -548,11 +557,18 @@ static const uint8_t charMap_IBM424_he[] = { /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, }; -static const int32_t ngrams_IBM420_ar[] = { - 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158, 0x56B167, - 0x56B169, 0x56B173, 0x56B173, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x624073, 0x6240AB, 0x6240BB, 0x634056, - 0x734056, 0x7356B1, 0x736240, 0x73BD40, 0x754056, 0x756240, 0x774540, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB17745, 0xB1DA40, 0xB1DC40, 0xBB5640, - 0xBB6240, 0xBBBD40, 0xBD4056, 0xCB4056, 0xCB5640, 0xD67940, 0xDA4056, 0xDC4056, 0xDC4073, 0xDC40BB, 0xDCBD40, 0xEAB140, 0x4056B1, 0x56B163, 0x6240B1, 0xBB4056, +static const int32_t ngrams_IBM420_ar_rtl[] = { + 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158, + 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB, + 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40, + 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40, +}; + +static const int32_t ngrams_IBM420_ar_ltr[] = { + 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF, + 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD, + 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156, + 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156 }; static const uint8_t charMap_IBM420_ar[]= { @@ -1150,29 +1166,64 @@ CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() // nothing to do } -const char *CharsetRecog_IBM424_he::getName() const -{ - return "IBM424"; -} - const char *CharsetRecog_IBM424_he::getLanguage() const { return "he"; } -int32_t CharsetRecog_IBM424_he::match(InputText *textIn) -{ - return match_sbcs(textIn, ngrams_IBM424_he, charMap_IBM424_he); -} - -CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar() +CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl() { // nothing to do } -const char *CharsetRecog_IBM420_ar::getName() const +const char *CharsetRecog_IBM424_he_rtl::getName() const { - return "IBM420"; + return "IBM424_rtl"; +} + +int32_t CharsetRecog_IBM424_he_rtl::match(InputText *textIn) +{ + return match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he); +} + +CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr() +{ + // nothing to do +} + +const char *CharsetRecog_IBM424_he_ltr::getName() const +{ + return "IBM424_ltr"; +} + +int32_t CharsetRecog_IBM424_he_ltr::match(InputText *textIn) +{ + return match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he); +} + +static const uint8_t unshapeMap_IBM420[] = { +/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ +/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, +/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, +/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, +/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, +/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, +/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, +/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, +/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, +/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, +/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, +/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, +/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, +}; + +CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar() +{ + // nothing to do } const char *CharsetRecog_IBM420_ar::getLanguage() const @@ -1180,10 +1231,118 @@ const char *CharsetRecog_IBM420_ar::getLanguage() const return "ar"; } -int32_t CharsetRecog_IBM420_ar::match(InputText *textIn) +void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) { + prev_fInputBytesLength = textIn->fInputLen; + prev_fInputBytes = textIn->fInputBytes; + + int32_t length = 0; + uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length); + + if (bb != NULL) { + textIn->fInputBytes = bb; + textIn->fInputLen = length; + + deleteBuffer = TRUE; + } else { + deleteBuffer = FALSE; + } +} + +uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) { + uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length); + + if (resultArray != NULL) { + for (int32_t i = 0; i < inputBytesLength; i++) { + resultArray[i] = unshapeMap_IBM420[resultArray[i]]; + } + } + + return resultArray; +} + +uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) { + int32_t bigBufferLength = inputBytesLength * 2; + uint8_t *bigBuffer = new uint8_t[bigBufferLength]; + uint8_t *resultBuffer = NULL; + + if (bigBuffer != NULL) { + int32_t bufferIndex; + uint8_t unshapedLamAlef[] = { 0xb1, 0x56 }; + + for (int32_t i = bufferIndex = 0; i < inputBytesLength; i++) { + if (isLamAlef(inputBytes[i])) { + bigBuffer[bufferIndex++] = unshapedLamAlef[0]; + bigBuffer[bufferIndex++] = unshapedLamAlef[1]; + } else { + bigBuffer[bufferIndex++] = inputBytes[i]; + } + } + + length = bufferIndex; + resultBuffer = new uint8_t[length]; + if (resultBuffer != NULL) { + uprv_memcpy(resultBuffer, bigBuffer, length); + } + } + + if (bigBuffer != NULL) { + delete [] bigBuffer; + } + + return resultBuffer; +} + +void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) { + if (deleteBuffer) { + delete [] textIn->fInputBytes; + + textIn->fInputBytes = prev_fInputBytes; + textIn->fInputLen = prev_fInputBytesLength; + } +} + +UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) { + uint8_t shapedLamAlef[] = { + 0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8 + }; + + for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) { + if (b == shapedLamAlef[i]) { + return TRUE; + } + } + + return FALSE; +} + +CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl() { - // TODO: May need to add shaping - return match_sbcs(textIn, ngrams_IBM420_ar, charMap_IBM420_ar); + // nothing to do +} + +const char *CharsetRecog_IBM420_ar_rtl::getName() const +{ + return "IBM420_rtl"; +} + +int32_t CharsetRecog_IBM420_ar_rtl::match(InputText *textIn) +{ + return match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar); +} + +CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr() +{ + // nothing to do +} + +const char *CharsetRecog_IBM420_ar_ltr::getName() const +{ + return "IBM420_ltr"; +} + +int32_t CharsetRecog_IBM420_ar_ltr::match(InputText *textIn) +{ + return match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar); } U_NAMESPACE_END diff --git a/icu4c/source/i18n/csrsbcs.h b/icu4c/source/i18n/csrsbcs.h index 1f0572b03a1..21cbabe5663 100644 --- a/icu4c/source/i18n/csrsbcs.h +++ b/icu4c/source/i18n/csrsbcs.h @@ -363,10 +363,23 @@ class CharsetRecog_IBM424_he : public CharsetRecog_sbcs public: virtual ~CharsetRecog_IBM424_he(); - const char *getName() const; - const char *getLanguage() const; +}; +class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { +public: + virtual ~CharsetRecog_IBM424_he_rtl(); + + const char *getName() const; + + int32_t match(InputText *textIn); +}; + +class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { + virtual ~CharsetRecog_IBM424_he_ltr(); + + const char *getName() const; + int32_t match(InputText *textIn); }; @@ -375,10 +388,36 @@ class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs public: virtual ~CharsetRecog_IBM420_ar(); - const char *getName() const; - const char *getLanguage() const; + +protected: + void matchInit(InputText *textIn); + void matchFinish(InputText *textIn); + +private: + uint8_t *prev_fInputBytes; + int32_t prev_fInputBytesLength; + UBool deleteBuffer; + + UBool isLamAlef(uint8_t b); + uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); + uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); +}; +class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { +public: + virtual ~CharsetRecog_IBM420_ar_rtl(); + + const char *getName() const; + + int32_t match(InputText *textIn); +}; + +class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { + virtual ~CharsetRecog_IBM420_ar_ltr(); + + const char *getName() const; + int32_t match(InputText *textIn); }; diff --git a/icu4c/source/test/cintltst/ucsdetst.c b/icu4c/source/test/cintltst/ucsdetst.c index c27848bb75c..c571b37019a 100644 --- a/icu4c/source/test/cintltst/ucsdetst.c +++ b/icu4c/source/test/cintltst/ucsdetst.c @@ -446,8 +446,33 @@ static void TestIBM424(void) 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 }; - int32_t bLength = 0, cLength = ARRAY_SIZE(chars); + + static const UChar chars_reverse[] = { + 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, + 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, + 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, + 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, + 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, + 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, + 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, + 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, + 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, + 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, + 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, + 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, + 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, + 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, + 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, + 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, + 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, + 0x0000 + }; + + int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse); + char *bytes = extractBytes(chars, cLength, "IBM424", &bLength); + char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength); + UCharsetDetector *csd = ucsdet_open(&status); const UCharsetMatch *match; const char *name; @@ -456,17 +481,31 @@ static void TestIBM424(void) match = ucsdet_detect(csd, &status); if (match == NULL) { - log_err("Encoding detection failure for IBM424: got no matches.\n"); + log_err("Encoding detection failure for IBM424_rtl: got no matches.\n"); goto bail; } name = ucsdet_getName(match, &status); - if (strcmp(name, "IBM424") != 0) { - log_err("Encoding detection failure for IBM424: got %s\n", name); + if (strcmp(name, "IBM424_rtl") != 0) { + log_err("Encoding detection failure for IBM424_rtl: got %s\n", name); + } + + ucsdet_setText(csd, bytes_r, brLength, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + log_err("Encoding detection failure for IBM424_ltr: got no matches.\n"); + goto bail; + } + + name = ucsdet_getName(match, &status); + if (strcmp(name, "IBM424_ltr") != 0) { + log_err("Encoding detection failure for IBM424_ltr: got %s\n", name); } bail: freeBytes(bytes); + freeBytes(bytes_r); ucsdet_close(csd); } @@ -490,8 +529,28 @@ static void TestIBM420(void) 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, 0x0000 }; - int32_t bLength = 0, cLength = ARRAY_SIZE(chars); + static const UChar chars_reverse[] = { + 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, + 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, + 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, + 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, + 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, + 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, + 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, + 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, + 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, + 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, + 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, + 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, + 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, + 0x0000, + }; + + int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse); + char *bytes = extractBytes(chars, cLength, "IBM420", &bLength); + char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength); + UCharsetDetector *csd = ucsdet_open(&status); const UCharsetMatch *match; const char *name; @@ -500,16 +559,30 @@ static void TestIBM420(void) match = ucsdet_detect(csd, &status); if (match == NULL) { - log_err("Encoding detection failure for IBM420: got no matches.\n"); + log_err("Encoding detection failure for IBM420_rtl: got no matches.\n"); goto bail; } name = ucsdet_getName(match, &status); - if (strcmp(name, "IBM420") != 0) { - log_err("Encoding detection failure for IBM420: got %s\n", name); + if (strcmp(name, "IBM420_rtl") != 0) { + log_err("Encoding detection failure for IBM420_rtl: got %s\n", name); + } + + ucsdet_setText(csd, bytes_r, brLength, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + log_err("Encoding detection failure for IBM420_ltr: got no matches.\n"); + goto bail; + } + + name = ucsdet_getName(match, &status); + if (strcmp(name, "IBM420_ltr") != 0) { + log_err("Encoding detection failure for IBM420_ltr: got %s\n", name); } bail: freeBytes(bytes); + freeBytes(bytes_r); ucsdet_close(csd); }