From ee7482efd7fa9cb768deedf5d6cc43a53a222655 Mon Sep 17 00:00:00 2001 From: Eric Mader Date: Fri, 10 Feb 2006 23:49:09 +0000 Subject: [PATCH] ICU-4639 Initial version of C test. Clean up error messages. X-SVN-Rev: 19137 --- icu4c/source/test/cintltst/calltest.c | 4 +- icu4c/source/test/cintltst/cintltst.vcproj | 3 + icu4c/source/test/cintltst/ucsdetst.c | 327 +++++++++++++++++++++ icu4c/source/test/intltest/csdetest.cpp | 4 +- 4 files changed, 335 insertions(+), 3 deletions(-) create mode 100644 icu4c/source/test/cintltst/ucsdetst.c diff --git a/icu4c/source/test/cintltst/calltest.c b/icu4c/source/test/cintltst/calltest.c index 3740b40e5c0..fa32f1aa427 100644 --- a/icu4c/source/test/cintltst/calltest.c +++ b/icu4c/source/test/cintltst/calltest.c @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1996-2005, International Business Machines Corporation and + * Copyright (c) 1996-2006, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ /******************************************************************************** @@ -38,6 +38,7 @@ void addHeapMutexTest(TestNode **root); void addUTraceTest(TestNode** root); void addURegexTest(TestNode** root); void addUTextTest(TestNode** root); +void addUCsdetTest(TestNode** root); void addAllTests(TestNode** root) @@ -76,5 +77,6 @@ void addAllTests(TestNode** root) addURegexTest(root); #endif addUTextTest(root); + addUCsdetTest(root); } diff --git a/icu4c/source/test/cintltst/cintltst.vcproj b/icu4c/source/test/cintltst/cintltst.vcproj index 4f18198afc2..55e28478534 100644 --- a/icu4c/source/test/cintltst/cintltst.vcproj +++ b/icu4c/source/test/cintltst/cintltst.vcproj @@ -395,6 +395,9 @@ + + diff --git a/icu4c/source/test/cintltst/ucsdetst.c b/icu4c/source/test/cintltst/ucsdetst.c new file mode 100644 index 00000000000..4421ccf7495 --- /dev/null +++ b/icu4c/source/test/cintltst/ucsdetst.c @@ -0,0 +1,327 @@ +/* + **************************************************************************** + * Copyright (c) 2005-2006, International Business Machines Corporation and * + * others. All Rights Reserved. * + **************************************************************************** + */ + +#include "unicode/utypes.h" + +#include "unicode/ucsdet.h" +#include "unicode/ucnv.h" +#include "unicode/ustring.h" + +#include "cintltst.h" + +#include +#include + +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) + +#define NEW_ARRAY(type,count) (type *) ctst_malloc((count) * sizeof(type)) +#define DELETE_ARRAY(array) + +static void TestConstruction(void); +static void TestUTF8(void); +static void TestUTF16(void); +static void TestC1Bytes(void); +static void TestInputFilter(void); + +void addUCsdetTest(TestNode** root); + +void addUCsdetTest(TestNode** root) +{ + addTest(root, &TestConstruction, "tscsdet/TestConstruction"); + addTest(root, &TestUTF8, "tscsdet/TestUTF8"); + addTest(root, &TestUTF16, "tscsdet/TestUTF16"); + addTest(root, &TestC1Bytes, "tscsdet/TestC1Bytes"); + addTest(root, &TestInputFilter, "tscsdet/TestInputFilter"); +} + +static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv) +{ + UErrorCode status; + char buffer[1024]; + char *dest, *destLimit = buffer + sizeof(buffer); + const UChar *srcLimit = src + length; + int32_t result = 0; + + do { + dest = buffer; + status = U_ZERO_ERROR; + ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status); + result += (int32_t) (dest - buffer); + } while (status == U_BUFFER_OVERFLOW_ERROR); + + return result; +} + +static UChar *unescape(const char *src, int32_t *length) +{ + int32_t charCount = u_unescape(src, NULL, 0); + UChar *chars = NEW_ARRAY(UChar, charCount + 1); + + u_unescape(src, chars, charCount); + + *length = charCount; + return chars; +} + +static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength) +{ + UErrorCode status = U_ZERO_ERROR; + UConverter *cnv = ucnv_open(codepage, &status); + int32_t byteCount = preflight(src, length, cnv); + const UChar *srcLimit = src + length; + char *bytes = NEW_ARRAY(char, byteCount + 1); + char *dest = bytes, *destLimit = bytes + byteCount + 1; + + ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status); + ucnv_close(cnv); + + *byteLength = byteCount; + return bytes; +} + +static void freeBytes(char *bytes) +{ + DELETE_ARRAY(bytes); +} + +static void TestConstruction(void) +{ + UErrorCode status = U_ZERO_ERROR; + UCharsetDetector *csd = ucsdet_open(&status); + UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status); + int32_t count = uenum_count(e, &status); + int32_t i; + + for(i = 0; i < count; i += 1) { + int32_t length; + const char *name = uenum_next(e, &length, &status); + + if(name == NULL || length <= 0) { + log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n"); + } + } + + uenum_close(e); + ucsdet_close(csd); +} + +static void TestUTF8(void) +{ + UErrorCode status = U_ZERO_ERROR; + char *ss = "This is a string with some non-ascii characters that will " + "be converted to UTF-8, then shoved through the detection process. " + "\\u0391\\u0392\\u0393\\u0394\\u0395" + "Sure would be nice if our source could contain Unicode directly!"; + int32_t byteLength = 0, sLength = 0, dLength = 0; + UChar *s = unescape(ss, &sLength); + char *bytes = extractBytes(s, sLength, "UTF-8", &byteLength); + UCharsetDetector *csd = ucsdet_open(&status); + const UCharsetMatch *match; + UChar *detected = NEW_ARRAY(UChar, sLength); + + ucsdet_setText(csd, bytes, byteLength, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + log_err("Detection failure for UTF-8: got no matches.\n"); + goto bail; + } + + dLength = ucsdet_getUChars(match, detected, sLength, &status); + + if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) { + log_err("Round-trip test failed!\n"); + } + + ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ + +bail: + DELETE_ARRAY(detected); + freeBytes(bytes); + ucsdet_close(csd); +} + +static void TestUTF16(void) +{ + UErrorCode status = U_ZERO_ERROR; + /* Notice the BOM on the start of this string */ + UChar chars[] = { + 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, + 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, + 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, + 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, + 0x064a, 0x062a, 0x0000}; + int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars); + char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength); + char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength); + UCharsetDetector *csd = ucsdet_open(&status); + const UCharsetMatch *match; + const char *name; + int32_t conf; + + ucsdet_setText(csd, beBytes, beLength, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + log_err("Encoding detection failure for UTF-16BE: got no matches.\n"); + goto try_le; + } + + name = ucsdet_getName(match, &status); + conf = ucsdet_getConfidence(match, &status); + + if (strcmp(name, "UTF-16BE") != 0) { + log_err("Encoding detection failure for UTF-16BE: got %s\n", name); + } + + if (conf != 100) { + log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf); + } + +try_le: + ucsdet_setText(csd, leBytes, leLength, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + log_err("Encoding detection failure for UTF-16LE: got no matches.\n"); + goto bail; + } + + name = ucsdet_getName(match, &status); + conf = ucsdet_getConfidence(match, &status); + + + if (strcmp(name, "UTF-16LE") != 0) { + log_err("Enconding detection failure for UTF-16LE: got %s\n", name); + } + + if (conf != 100) { + log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf); + } + +bail: + freeBytes(leBytes); + freeBytes(beBytes); + ucsdet_close(csd); +} + +static void TestC1Bytes(void) +{ + UErrorCode status = U_ZERO_ERROR; + char *ssISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; + char *ssWindows = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes."; + int32_t sISOLength = 0, sWindowsLength = 0; + UChar *sISO = unescape(ssISO, &sISOLength); + UChar *sWindows = unescape(ssWindows, &sWindowsLength); + int32_t lISO = 0, lWindows = 0; + char *bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO); + char *bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows); + UCharsetDetector *csd = ucsdet_open(&status); + const UCharsetMatch *match; + const char *name; + + ucsdet_setText(csd, bWindows, lWindows, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + log_err("English test with C1 bytes got no matches.\n"); + goto bail; + } + + name = ucsdet_getName(match, &status); + + if (strcmp(name, "windows-1252") != 0) { + log_err("English text with C1 bytes does not detect as windows-1252, but as %s\n", name); + } + + ucsdet_setText(csd, bISO, lISO, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + log_err("English text without C1 bytes got no matches.\n"); + goto bail; + } + + name = ucsdet_getName(match, &status); + + if (strcmp(name, "ISO-8859-1") != 0) { + log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name); + } + +bail: + freeBytes(bWindows); + freeBytes(bISO); + + ucsdet_close(csd); +} + +static void TestInputFilter(void) +{ + UErrorCode status = U_ZERO_ERROR; + char *ss = " Un tr\\u00E8s petit peu de Fran\\u00E7ais. "; + int32_t sLength = 0; + UChar *s = unescape(ss, &sLength); + int32_t byteLength = 0; + char *bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength); + UCharsetDetector *csd = ucsdet_open(&status); + const UCharsetMatch *match; + const char *lang, *name; + + ucsdet_enableInputFilter(csd, TRUE); + + if (!ucsdet_isInputFilterEnabled(csd)) { + log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n"); + } + + + ucsdet_setText(csd, bytes, byteLength, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + log_err("Turning on the input filter resulted in no matches.\n"); + goto turn_off; + } + + name = ucsdet_getName(match, &status); + + if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { + log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name); + } else { + lang = ucsdet_getLanguage(match, &status); + + if (lang == NULL || strcmp(lang, "fr") != 0) { + log_err("Input filter did not strip markup!\n"); + } + } + +turn_off: + ucsdet_enableInputFilter(csd, FALSE); + ucsdet_setText(csd, bytes, byteLength, &status); + match = ucsdet_detect(csd, &status); + + if (match == NULL) { + log_err("Turning off the input filter resulted in no matches.\n"); + goto bail; + } + + name = ucsdet_getName(match, &status); + + if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { + log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name); + } else { + lang = ucsdet_getLanguage(match, &status); + + if (lang == NULL || strcmp(lang, "en") != 0) { + log_err("Unfiltered input did not detect as English!\n"); + } + } + +bail: + freeBytes(bytes); + ucsdet_close(csd); +} + diff --git a/icu4c/source/test/intltest/csdetest.cpp b/icu4c/source/test/intltest/csdetest.cpp index aeb93f6d88a..722e948a1ec 100644 --- a/icu4c/source/test/intltest/csdetest.cpp +++ b/icu4c/source/test/intltest/csdetest.cpp @@ -375,7 +375,7 @@ void CharsetDetectionTest::InputFilterTest() name = ucsdet_getName(match, &status); if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { - errln("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name); + errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name); } else { lang = ucsdet_getLanguage(match, &status); @@ -397,7 +397,7 @@ turn_off: name = ucsdet_getName(match, &status); if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { - errln("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name); + errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name); } else { lang = ucsdet_getLanguage(match, &status);