From 26f9de0065af76cac0421c18f24962267cfce87c Mon Sep 17 00:00:00 2001 From: Michael Ow Date: Tue, 31 Mar 2009 15:39:00 +0000 Subject: [PATCH] ICU-6778 Port IBM420 and IBM424 CharsetDetector to ICU4C. X-SVN-Rev: 25696 --- icu4c/source/i18n/csdetect.cpp | 5 +- icu4c/source/i18n/csrsbcs.cpp | 97 ++++++++++++++++++++++- icu4c/source/i18n/csrsbcs.h | 26 +++++- icu4c/source/test/cintltst/ucsdetst.c | 96 +++++++++++++++++++++- icu4c/source/test/intltest/csdetest.cpp | 101 +++++++++++++++++++++++- icu4c/source/test/intltest/csdetest.h | 4 +- 6 files changed, 323 insertions(+), 6 deletions(-) diff --git a/icu4c/source/i18n/csdetect.cpp b/icu4c/source/i18n/csdetect.cpp index 034bcdf8b74..f306e09067a 100644 --- a/icu4c/source/i18n/csdetect.cpp +++ b/icu4c/source/i18n/csdetect.cpp @@ -120,7 +120,10 @@ void CharsetDetector::setRecognizers(UErrorCode &status) new CharsetRecog_2022JP(), new CharsetRecog_2022KR(), - new CharsetRecog_2022CN() + new CharsetRecog_2022CN(), + + new CharsetRecog_IBM424_he(), + new CharsetRecog_IBM420_ar() }; int32_t rCount = ARRAY_SIZE(tempArray); int32_t r; diff --git a/icu4c/source/i18n/csrsbcs.cpp b/icu4c/source/i18n/csrsbcs.cpp index feae39d8a12..5dbe934eaa7 100644 --- a/icu4c/source/i18n/csrsbcs.cpp +++ b/icu4c/source/i18n/csrsbcs.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2008, International Business Machines + * Copyright (C) 2005-2009, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -521,6 +521,60 @@ static const uint8_t charMap_KOI8_R[] = { 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, }; +static const int32_t ngrams_IBM424_he[] = { + 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641, + 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045, + 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056, + 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069, +}; + +static const uint8_t charMap_IBM424_he[] = { +/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ +/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40, +/* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +}; + +static const int32_t ngrams_IBM420_ar[] = { + 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158, 0x56B167, + 0x56B169, 0x56B173, 0x56B173, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x624073, 0x6240AB, 0x6240BB, 0x634056, + 0x734056, 0x7356B1, 0x736240, 0x73BD40, 0x754056, 0x756240, 0x774540, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB17745, 0xB1DA40, 0xB1DC40, 0xBB5640, + 0xBB6240, 0xBBBD40, 0xBD4056, 0xCB4056, 0xCB5640, 0xD67940, 0xDA4056, 0xDC4056, 0xDC4073, 0xDC40BB, 0xDCBD40, 0xEAB140, 0x4056B1, 0x56B163, 0x6240B1, 0xBB4056, +}; + +static const uint8_t charMap_IBM420_ar[]= { +/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ +/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, +/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, +/* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, +/* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, +/* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF, +/* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, +/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF, +/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40, +}; + //ISO-8859-1,2,5,6,7,8,9 Ngrams static const int32_t ngrams_8859_1_en[] = { 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F, @@ -1091,6 +1145,47 @@ int32_t CharsetRecog_KOI8_R::match(InputText *textIn) return match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); } +CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() +{ + // nothing to do +} + +const char *CharsetRecog_IBM424_he::getName() const +{ + return "IBM424"; +} + +const char *CharsetRecog_IBM424_he::getLanguage() const +{ + return "he"; +} + +int32_t CharsetRecog_IBM424_he::match(InputText *textIn) +{ + return match_sbcs(textIn, ngrams_IBM424_he, charMap_IBM424_he); +} + +CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar() +{ + // nothing to do +} + +const char *CharsetRecog_IBM420_ar::getName() const +{ + return "IBM420"; +} + +const char *CharsetRecog_IBM420_ar::getLanguage() const +{ + return "ar"; +} + +int32_t CharsetRecog_IBM420_ar::match(InputText *textIn) +{ + // TODO: May need to add shaping + return match_sbcs(textIn, ngrams_IBM420_ar, charMap_IBM420_ar); +} + U_NAMESPACE_END #endif diff --git a/icu4c/source/i18n/csrsbcs.h b/icu4c/source/i18n/csrsbcs.h index 456fc4dff43..1f0572b03a1 100644 --- a/icu4c/source/i18n/csrsbcs.h +++ b/icu4c/source/i18n/csrsbcs.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2006, International Business Machines + * Copyright (C) 2005-2009, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -358,6 +358,30 @@ public: int32_t match(InputText *textIn); }; +class CharsetRecog_IBM424_he : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_IBM424_he(); + + const char *getName() const; + + const char *getLanguage() const; + + int32_t match(InputText *textIn); +}; + +class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs +{ +public: + virtual ~CharsetRecog_IBM420_ar(); + + const char *getName() const; + + const char *getLanguage() const; + + int32_t match(InputText *textIn); +}; + U_NAMESPACE_END #endif diff --git a/icu4c/source/test/cintltst/ucsdetst.c b/icu4c/source/test/cintltst/ucsdetst.c index 7eb3f4cbdbd..c27848bb75c 100644 --- a/icu4c/source/test/cintltst/ucsdetst.c +++ b/icu4c/source/test/cintltst/ucsdetst.c @@ -1,6 +1,6 @@ /* **************************************************************************** - * Copyright (c) 2005-2008, International Business Machines Corporation and * + * Copyright (c) 2005-2009, International Business Machines Corporation and * * others. All Rights Reserved. * **************************************************************************** */ @@ -28,6 +28,8 @@ static void TestC1Bytes(void); static void TestInputFilter(void); static void TestChaining(void); static void TestBufferOverflow(void); +static void TestIBM424(void); +static void TestIBM420(void); void addUCsdetTest(TestNode** root); @@ -40,6 +42,8 @@ void addUCsdetTest(TestNode** root) addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter"); addTest(root, &TestChaining, "ucsdetst/TestErrorChaining"); addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow"); + addTest(root, &TestIBM424, "ucsdetst/TestIBM424"); + addTest(root, &TestIBM420, "ucsdetst/TestIBM420"); } static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv) @@ -419,3 +423,93 @@ bail: ucsdet_close(csd); } +static void TestIBM424(void) +{ + UErrorCode status = U_ZERO_ERROR; + + static const UChar chars[] = { + 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, + 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, + 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, + 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, + 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, + 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, + 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, + 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, + 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, + 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, + 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, + 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, + 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, + 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, + 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, + 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, + 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 + }; + int32_t bLength = 0, cLength = ARRAY_SIZE(chars); + char *bytes = extractBytes(chars, cLength, "IBM424", &bLength); + UCharsetDetector *csd = ucsdet_open(&status); + const UCharsetMatch *match; + const char *name; + + ucsdet_setText(csd, bytes, bLength, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + log_err("Encoding detection failure for IBM424: got no matches.\n"); + goto bail; + } + + name = ucsdet_getName(match, &status); + if (strcmp(name, "IBM424") != 0) { + log_err("Encoding detection failure for IBM424: got %s\n", name); + } + +bail: + freeBytes(bytes); + ucsdet_close(csd); +} + +static void TestIBM420(void) +{ + UErrorCode status = U_ZERO_ERROR; + + static const UChar chars[] = { + 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, + 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, + 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, + 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, + 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, + 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, + 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, + 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, + 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, + 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, + 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, + 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, + 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, + 0x0000 + }; + int32_t bLength = 0, cLength = ARRAY_SIZE(chars); + char *bytes = extractBytes(chars, cLength, "IBM420", &bLength); + UCharsetDetector *csd = ucsdet_open(&status); + const UCharsetMatch *match; + const char *name; + + ucsdet_setText(csd, bytes, bLength, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + log_err("Encoding detection failure for IBM420: got no matches.\n"); + goto bail; + } + + name = ucsdet_getName(match, &status); + if (strcmp(name, "IBM420") != 0) { + log_err("Encoding detection failure for IBM420: got %s\n", name); + } + +bail: + freeBytes(bytes); + ucsdet_close(csd); +} diff --git a/icu4c/source/test/intltest/csdetest.cpp b/icu4c/source/test/intltest/csdetest.cpp index 1e3b64ddc32..e0565f57069 100644 --- a/icu4c/source/test/intltest/csdetest.cpp +++ b/icu4c/source/test/intltest/csdetest.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2008, International Business Machines + * Copyright (C) 2005-2009, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -76,6 +76,14 @@ void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char if (exec) DetectionTest(); break; + case 6: name = "IBM424Test"; + if (exec) IBM424Test(); + break; + + case 7: name = "IBM420Test"; + if (exec) IBM420Test(); + break; + default: name = ""; break; //needed to end loop } @@ -515,4 +523,95 @@ void CharsetDetectionTest::DetectionTest() #endif } +void CharsetDetectionTest::IBM424Test() +{ + UErrorCode status = U_ZERO_ERROR; + + static const UChar chars[] = { + 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, + 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, + 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, + 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, + 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, + 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, + 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, + 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, + 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, + 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, + 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, + 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, + 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, + 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, + 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, + 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, + 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 + }; + UnicodeString s(chars); + int32_t bLength = 0; + char *bytes = extractBytes(s, "IBM424", bLength); + UCharsetDetector *csd = ucsdet_open(&status); + const UCharsetMatch *match; + const char *name; + ucsdet_setText(csd, bytes, bLength, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + errln("Encoding detection failure for IBM424: got no matches.\n"); + goto bail; + } + + name = ucsdet_getName(match, &status); + if (strcmp(name, "IBM424") != 0) { + errln("Encoding detection failure for IBM424: got %s\n", name); + } + +bail: + freeBytes(bytes); + ucsdet_close(csd); +} + +void CharsetDetectionTest::IBM420Test() +{ + UErrorCode status = U_ZERO_ERROR; + + static const UChar chars[] = { + 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, + 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, + 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, + 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, + 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, + 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, + 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, + 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, + 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, + 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, + 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, + 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, + 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, + 0x0000 + }; + UnicodeString s(chars); + int32_t bLength = 0; + char *bytes = extractBytes(s, "IBM420", bLength); + UCharsetDetector *csd = ucsdet_open(&status); + const UCharsetMatch *match; + const char *name; + + ucsdet_setText(csd, bytes, bLength, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + errln("Encoding detection failure for IBM420: got no matches.\n"); + goto bail; + } + + name = ucsdet_getName(match, &status); + if (strcmp(name, "IBM420") != 0) { + errln("Encoding detection failure for IBM420: got %s\n", name); + } + +bail: + freeBytes(bytes); + ucsdet_close(csd); +} diff --git a/icu4c/source/test/intltest/csdetest.h b/icu4c/source/test/intltest/csdetest.h index 223a640a57b..a3e0c5a4976 100644 --- a/icu4c/source/test/intltest/csdetest.h +++ b/icu4c/source/test/intltest/csdetest.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2006, International Business Machines + * Copyright (C) 2005-2009, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -27,6 +27,8 @@ public: virtual void C1BytesTest(); virtual void InputFilterTest(); virtual void DetectionTest(); + virtual void IBM424Test(); + virtual void IBM420Test(); private: void checkEncoding(const UnicodeString &testString,