mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-6778 Port IBM420 and IBM424 CharsetDetector to ICU4C.
X-SVN-Rev: 25696
This commit is contained in:
parent
84db790ca6
commit
26f9de0065
6 changed files with 323 additions and 6 deletions
|
@ -120,7 +120,10 @@ void CharsetDetector::setRecognizers(UErrorCode &status)
|
|||
|
||||
new CharsetRecog_2022JP(),
|
||||
new CharsetRecog_2022KR(),
|
||||
new CharsetRecog_2022CN()
|
||||
new CharsetRecog_2022CN(),
|
||||
|
||||
new CharsetRecog_IBM424_he(),
|
||||
new CharsetRecog_IBM420_ar()
|
||||
};
|
||||
int32_t rCount = ARRAY_SIZE(tempArray);
|
||||
int32_t r;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2008, International Business Machines
|
||||
* Copyright (C) 2005-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -521,6 +521,60 @@ static const uint8_t charMap_KOI8_R[] = {
|
|||
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
|
||||
};
|
||||
|
||||
static const int32_t ngrams_IBM424_he[] = {
|
||||
0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
|
||||
0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
|
||||
0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
|
||||
0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
|
||||
};
|
||||
|
||||
static const uint8_t charMap_IBM424_he[] = {
|
||||
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
|
||||
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
|
||||
/* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
};
|
||||
|
||||
static const int32_t ngrams_IBM420_ar[] = {
|
||||
0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158, 0x56B167,
|
||||
0x56B169, 0x56B173, 0x56B173, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x624073, 0x6240AB, 0x6240BB, 0x634056,
|
||||
0x734056, 0x7356B1, 0x736240, 0x73BD40, 0x754056, 0x756240, 0x774540, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB17745, 0xB1DA40, 0xB1DC40, 0xBB5640,
|
||||
0xBB6240, 0xBBBD40, 0xBD4056, 0xCB4056, 0xCB5640, 0xD67940, 0xDA4056, 0xDC4056, 0xDC4073, 0xDC40BB, 0xDCBD40, 0xEAB140, 0x4056B1, 0x56B163, 0x6240B1, 0xBB4056,
|
||||
};
|
||||
|
||||
static const uint8_t charMap_IBM420_ar[]= {
|
||||
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
|
||||
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
|
||||
/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
|
||||
/* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
|
||||
/* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
|
||||
/* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
|
||||
/* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
|
||||
/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
|
||||
/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
|
||||
};
|
||||
|
||||
//ISO-8859-1,2,5,6,7,8,9 Ngrams
|
||||
static const int32_t ngrams_8859_1_en[] = {
|
||||
0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
|
||||
|
@ -1091,6 +1145,47 @@ int32_t CharsetRecog_KOI8_R::match(InputText *textIn)
|
|||
return match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
|
||||
}
|
||||
|
||||
CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
|
||||
{
|
||||
// nothing to do
|
||||
}
|
||||
|
||||
const char *CharsetRecog_IBM424_he::getName() const
|
||||
{
|
||||
return "IBM424";
|
||||
}
|
||||
|
||||
const char *CharsetRecog_IBM424_he::getLanguage() const
|
||||
{
|
||||
return "he";
|
||||
}
|
||||
|
||||
int32_t CharsetRecog_IBM424_he::match(InputText *textIn)
|
||||
{
|
||||
return match_sbcs(textIn, ngrams_IBM424_he, charMap_IBM424_he);
|
||||
}
|
||||
|
||||
CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
|
||||
{
|
||||
// nothing to do
|
||||
}
|
||||
|
||||
const char *CharsetRecog_IBM420_ar::getName() const
|
||||
{
|
||||
return "IBM420";
|
||||
}
|
||||
|
||||
const char *CharsetRecog_IBM420_ar::getLanguage() const
|
||||
{
|
||||
return "ar";
|
||||
}
|
||||
|
||||
int32_t CharsetRecog_IBM420_ar::match(InputText *textIn)
|
||||
{
|
||||
// TODO: May need to add shaping
|
||||
return match_sbcs(textIn, ngrams_IBM420_ar, charMap_IBM420_ar);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2006, International Business Machines
|
||||
* Copyright (C) 2005-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -358,6 +358,30 @@ public:
|
|||
int32_t match(InputText *textIn);
|
||||
};
|
||||
|
||||
class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
|
||||
{
|
||||
public:
|
||||
virtual ~CharsetRecog_IBM424_he();
|
||||
|
||||
const char *getName() const;
|
||||
|
||||
const char *getLanguage() const;
|
||||
|
||||
int32_t match(InputText *textIn);
|
||||
};
|
||||
|
||||
class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
|
||||
{
|
||||
public:
|
||||
virtual ~CharsetRecog_IBM420_ar();
|
||||
|
||||
const char *getName() const;
|
||||
|
||||
const char *getLanguage() const;
|
||||
|
||||
int32_t match(InputText *textIn);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
****************************************************************************
|
||||
* Copyright (c) 2005-2008, International Business Machines Corporation and *
|
||||
* Copyright (c) 2005-2009, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
****************************************************************************
|
||||
*/
|
||||
|
@ -28,6 +28,8 @@ static void TestC1Bytes(void);
|
|||
static void TestInputFilter(void);
|
||||
static void TestChaining(void);
|
||||
static void TestBufferOverflow(void);
|
||||
static void TestIBM424(void);
|
||||
static void TestIBM420(void);
|
||||
|
||||
void addUCsdetTest(TestNode** root);
|
||||
|
||||
|
@ -40,6 +42,8 @@ void addUCsdetTest(TestNode** root)
|
|||
addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
|
||||
addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
|
||||
addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
|
||||
addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
|
||||
addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
|
||||
}
|
||||
|
||||
static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
|
||||
|
@ -419,3 +423,93 @@ bail:
|
|||
ucsdet_close(csd);
|
||||
}
|
||||
|
||||
static void TestIBM424(void)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
static const UChar chars[] = {
|
||||
0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
|
||||
0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
|
||||
0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
|
||||
0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
|
||||
0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
|
||||
0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
|
||||
0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
|
||||
0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
|
||||
0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
|
||||
0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
|
||||
0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
|
||||
0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
|
||||
0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
|
||||
0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
|
||||
0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
|
||||
0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
|
||||
0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
|
||||
};
|
||||
int32_t bLength = 0, cLength = ARRAY_SIZE(chars);
|
||||
char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
|
||||
UCharsetDetector *csd = ucsdet_open(&status);
|
||||
const UCharsetMatch *match;
|
||||
const char *name;
|
||||
|
||||
ucsdet_setText(csd, bytes, bLength, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
log_err("Encoding detection failure for IBM424: got no matches.\n");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
name = ucsdet_getName(match, &status);
|
||||
if (strcmp(name, "IBM424") != 0) {
|
||||
log_err("Encoding detection failure for IBM424: got %s\n", name);
|
||||
}
|
||||
|
||||
bail:
|
||||
freeBytes(bytes);
|
||||
ucsdet_close(csd);
|
||||
}
|
||||
|
||||
static void TestIBM420(void)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
static const UChar chars[] = {
|
||||
0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
|
||||
0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
|
||||
0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
|
||||
0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
|
||||
0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
|
||||
0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
|
||||
0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
|
||||
0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
|
||||
0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
|
||||
0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
|
||||
0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
|
||||
0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
|
||||
0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
|
||||
0x0000
|
||||
};
|
||||
int32_t bLength = 0, cLength = ARRAY_SIZE(chars);
|
||||
char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
|
||||
UCharsetDetector *csd = ucsdet_open(&status);
|
||||
const UCharsetMatch *match;
|
||||
const char *name;
|
||||
|
||||
ucsdet_setText(csd, bytes, bLength, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
log_err("Encoding detection failure for IBM420: got no matches.\n");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
name = ucsdet_getName(match, &status);
|
||||
if (strcmp(name, "IBM420") != 0) {
|
||||
log_err("Encoding detection failure for IBM420: got %s\n", name);
|
||||
}
|
||||
|
||||
bail:
|
||||
freeBytes(bytes);
|
||||
ucsdet_close(csd);
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2008, International Business Machines
|
||||
* Copyright (C) 2005-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -76,6 +76,14 @@ void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char
|
|||
if (exec) DetectionTest();
|
||||
break;
|
||||
|
||||
case 6: name = "IBM424Test";
|
||||
if (exec) IBM424Test();
|
||||
break;
|
||||
|
||||
case 7: name = "IBM420Test";
|
||||
if (exec) IBM420Test();
|
||||
break;
|
||||
|
||||
default: name = "";
|
||||
break; //needed to end loop
|
||||
}
|
||||
|
@ -515,4 +523,95 @@ void CharsetDetectionTest::DetectionTest()
|
|||
#endif
|
||||
}
|
||||
|
||||
void CharsetDetectionTest::IBM424Test()
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
static const UChar chars[] = {
|
||||
0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
|
||||
0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
|
||||
0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
|
||||
0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
|
||||
0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
|
||||
0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
|
||||
0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
|
||||
0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
|
||||
0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
|
||||
0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
|
||||
0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
|
||||
0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
|
||||
0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
|
||||
0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
|
||||
0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
|
||||
0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
|
||||
0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
|
||||
};
|
||||
UnicodeString s(chars);
|
||||
int32_t bLength = 0;
|
||||
char *bytes = extractBytes(s, "IBM424", bLength);
|
||||
UCharsetDetector *csd = ucsdet_open(&status);
|
||||
const UCharsetMatch *match;
|
||||
const char *name;
|
||||
|
||||
ucsdet_setText(csd, bytes, bLength, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
errln("Encoding detection failure for IBM424: got no matches.\n");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
name = ucsdet_getName(match, &status);
|
||||
if (strcmp(name, "IBM424") != 0) {
|
||||
errln("Encoding detection failure for IBM424: got %s\n", name);
|
||||
}
|
||||
|
||||
bail:
|
||||
freeBytes(bytes);
|
||||
ucsdet_close(csd);
|
||||
}
|
||||
|
||||
void CharsetDetectionTest::IBM420Test()
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
static const UChar chars[] = {
|
||||
0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
|
||||
0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
|
||||
0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
|
||||
0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
|
||||
0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
|
||||
0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
|
||||
0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
|
||||
0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
|
||||
0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
|
||||
0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
|
||||
0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
|
||||
0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
|
||||
0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
|
||||
0x0000
|
||||
};
|
||||
UnicodeString s(chars);
|
||||
int32_t bLength = 0;
|
||||
char *bytes = extractBytes(s, "IBM420", bLength);
|
||||
UCharsetDetector *csd = ucsdet_open(&status);
|
||||
const UCharsetMatch *match;
|
||||
const char *name;
|
||||
|
||||
ucsdet_setText(csd, bytes, bLength, &status);
|
||||
match = ucsdet_detect(csd, &status);
|
||||
|
||||
if (match == NULL) {
|
||||
errln("Encoding detection failure for IBM420: got no matches.\n");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
name = ucsdet_getName(match, &status);
|
||||
if (strcmp(name, "IBM420") != 0) {
|
||||
errln("Encoding detection failure for IBM420: got %s\n", name);
|
||||
}
|
||||
|
||||
bail:
|
||||
freeBytes(bytes);
|
||||
ucsdet_close(csd);
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2006, International Business Machines
|
||||
* Copyright (C) 2005-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -27,6 +27,8 @@ public:
|
|||
virtual void C1BytesTest();
|
||||
virtual void InputFilterTest();
|
||||
virtual void DetectionTest();
|
||||
virtual void IBM424Test();
|
||||
virtual void IBM420Test();
|
||||
|
||||
private:
|
||||
void checkEncoding(const UnicodeString &testString,
|
||||
|
|
Loading…
Add table
Reference in a new issue