From ee7482efd7fa9cb768deedf5d6cc43a53a222655 Mon Sep 17 00:00:00 2001
From: Eric Mader <emader@svn.icu-project.org>
Date: Fri, 10 Feb 2006 23:49:09 +0000
Subject: [PATCH] ICU-4639 Initial version of C test. Clean up error messages.

X-SVN-Rev: 19137
---
 icu4c/source/test/cintltst/calltest.c      |   4 +-
 icu4c/source/test/cintltst/cintltst.vcproj |   3 +
 icu4c/source/test/cintltst/ucsdetst.c      | 327 +++++++++++++++++++++
 icu4c/source/test/intltest/csdetest.cpp    |   4 +-
 4 files changed, 335 insertions(+), 3 deletions(-)
 create mode 100644 icu4c/source/test/cintltst/ucsdetst.c
diff --git a/icu4c/source/test/cintltst/calltest.c b/icu4c/source/test/cintltst/calltest.c
index 3740b40e5c0..fa32f1aa427 100644
--- a/icu4c/source/test/cintltst/calltest.c
+++ b/icu4c/source/test/cintltst/calltest.c
@@ -1,6 +1,6 @@
 /********************************************************************
  * COPYRIGHT: 
- * Copyright (c) 1996-2005, International Business Machines Corporation and
+ * Copyright (c) 1996-2006, International Business Machines Corporation and
  * others. All Rights Reserved.
  ********************************************************************/
 /********************************************************************************
@@ -38,6 +38,7 @@ void addHeapMutexTest(TestNode **root);
 void addUTraceTest(TestNode** root);
 void addURegexTest(TestNode** root);
 void addUTextTest(TestNode** root);
+void addUCsdetTest(TestNode** root);
 
 
 void addAllTests(TestNode** root)
@@ -76,5 +77,6 @@ void addAllTests(TestNode** root)
     addURegexTest(root);
 #endif
     addUTextTest(root);
+    addUCsdetTest(root);
 }
 
diff --git a/icu4c/source/test/cintltst/cintltst.vcproj b/icu4c/source/test/cintltst/cintltst.vcproj
index 4f18198afc2..55e28478534 100644
--- a/icu4c/source/test/cintltst/cintltst.vcproj
+++ b/icu4c/source/test/cintltst/cintltst.vcproj
@@ -395,6 +395,9 @@
 			<File
 				RelativePath=".\currtest.c">
 			</File>
+			<File
+				RelativePath=".\ucsdetst.c">
+			</File>
 			<File
 				RelativePath=".\utmstest.c">
 			</File>
diff --git a/icu4c/source/test/cintltst/ucsdetst.c b/icu4c/source/test/cintltst/ucsdetst.c
new file mode 100644
index 00000000000..4421ccf7495
--- /dev/null
+++ b/icu4c/source/test/cintltst/ucsdetst.c
@@ -0,0 +1,327 @@
+/*
+ ****************************************************************************
+ * Copyright (c) 2005-2006, International Business Machines Corporation and *
+ * others. All Rights Reserved.                                             *
+ ****************************************************************************
+ */
+
+#include "unicode/utypes.h"
+
+#include "unicode/ucsdet.h"
+#include "unicode/ucnv.h"
+#include "unicode/ustring.h"
+
+#include "cintltst.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0])) /* element count; valid for true arrays only */
+/* NEW_ARRAY: ctst_malloc'd array of count objects. DELETE_ARRAY: intentionally empty (presumably the harness reclaims ctst_malloc memory - TODO confirm). */
+#define NEW_ARRAY(type,count) ((type *) ctst_malloc((count) * sizeof(type)))
+#define DELETE_ARRAY(array)
+
+static void TestConstruction(void);  /* enumerates the detectable charset names */
+static void TestUTF8(void);          /* UTF-8 detection plus round-trip back to UChars */
+static void TestUTF16(void);         /* BOM-prefixed UTF-16BE and UTF-16LE detection */
+static void TestC1Bytes(void);       /* windows-1252 vs. ISO-8859-1 on English text */
+static void TestInputFilter(void);   /* language detection with the input filter on and off */
+
+void addUCsdetTest(TestNode** root); /* prototype; the definition follows immediately */
+
+/* Registers the five charset detection tests with addTest() under the "tscsdet/" path prefix. */
+void addUCsdetTest(TestNode** root) {
+    addTest(root, TestConstruction, "tscsdet/TestConstruction");
+    addTest(root, TestUTF8, "tscsdet/TestUTF8");
+    addTest(root, TestUTF16, "tscsdet/TestUTF16");
+    addTest(root, TestC1Bytes, "tscsdet/TestC1Bytes");
+    addTest(root, TestInputFilter, "tscsdet/TestInputFilter");
+}
+
+/* Counts the bytes that converting src[0..length) with cnv produces; the output itself is discarded. */
+static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
+{
+    UErrorCode errorCode;
+    char scratch[1024];
+    char *const scratchEnd = scratch + sizeof(scratch);
+    const UChar *const srcEnd = src + length;
+    int32_t total = 0;
+
+    do {
+        char *out = scratch;    /* rewind: every pass reuses the whole scratch buffer */
+        errorCode = U_ZERO_ERROR;
+        ucnv_fromUnicode(cnv, &out, scratchEnd, &src, srcEnd, 0, TRUE, &errorCode);
+        total += (int32_t) (out - scratch);
+    } while (errorCode == U_BUFFER_OVERFLOW_ERROR);   /* scratch filled up: go around again */
+    return total;
+}
+
+/* Unescapes src into a NEW_ARRAY'd, NUL-terminated UChar buffer; *length receives the count without the NUL. */
+static UChar *unescape(const char *src, int32_t *length)
+{
+    int32_t charCount = u_unescape(src, NULL, 0);   /* NULL destination: only measures */
+    UChar *chars = NEW_ARRAY(UChar, charCount + 1);
+
+    u_unescape(src, chars, charCount + 1);  /* capacity covers the extra slot so the terminator gets written */
+    *length = charCount;
+    return chars;
+}
+
+/* Converts src[0..length) to codepage; returns a NEW_ARRAY'd byte buffer and stores its size in *byteLength. */
+static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
+{
+    UErrorCode errorCode = U_ZERO_ERROR;
+    UConverter *converter = ucnv_open(codepage, &errorCode);
+    const int32_t needed = preflight(src, length, converter);  /* sizing pass */
+    char *const result = NEW_ARRAY(char, needed + 1);
+    char *out = result;
+
+    /* Second pass: the real conversion, into the buffer sized by the first pass. */
+    ucnv_fromUnicode(converter, &out, result + needed + 1, &src, src + length, 0, TRUE, &errorCode);
+    ucnv_close(converter);
+    *byteLength = needed;
+    return result;
+}
+
+/* Disposes of a buffer obtained from extractBytes() by handing it to DELETE_ARRAY. */
+static void freeBytes(char *bytes) {
+    DELETE_ARRAY(bytes);
+}
+
+/* Checks that the detector opens and that every detectable charset name is non-NULL and non-empty. */
+static void TestConstruction(void)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    UCharsetDetector *csd = ucsdet_open(&status);
+    UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
+    int32_t i, count = uenum_count(e, &status);
+    if (U_FAILURE(status)) { /* otherwise a failure here could go unreported */
+        log_err("ucsdet_open() or ucsdet_getAllDetectableCharsets() failed: %s\n", u_errorName(status));
+    }
+    for(i = 0; i < count; i += 1) {
+        int32_t length;
+        const char *name = uenum_next(e, &length, &status);
+        if(name == NULL || length <= 0) {
+            log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
+        }
+    }
+    uenum_close(e);
+    ucsdet_close(csd);
+}
+
+/* Round-trip check: encodes a sample (ASCII plus U+0391..U+0395) as UTF-8, runs detection on
+ * the bytes, and compares the UChars obtained from the returned match with the original text. */
+static void TestUTF8(void)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    const char *ss = "This is a string with some non-ascii characters that will "
+                     "be converted to UTF-8, then shoved through the detection process.  "
+                     "\\u0391\\u0392\\u0393\\u0394\\u0395"
+                     "Sure would be nice if our source could contain Unicode directly!";
+    int32_t byteLength = 0, sLength = 0, dLength = 0;
+    UChar *s = unescape(ss, &sLength);
+    char *bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
+    UCharsetDetector *csd = ucsdet_open(&status);
+    const UCharsetMatch *match;
+    UChar *detected = NEW_ARRAY(UChar, sLength);
+
+    ucsdet_setText(csd, bytes, byteLength, &status);
+    match = ucsdet_detect(csd, &status);
+    if (match == NULL) {
+        log_err("Detection failure for UTF-8: got no matches.\n");
+        goto bail;
+    }
+
+    dLength = ucsdet_getUChars(match, detected, sLength, &status);
+    if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
+        log_err("Round-trip test failed!\n");
+    }
+
+    ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
+bail:
+    DELETE_ARRAY(detected);
+    DELETE_ARRAY(s);
+    freeBytes(bytes);
+    ucsdet_close(csd);
+}
+
+/* Encodes a BOM-prefixed Arabic sample as UTF-16BE and as UTF-16LE and expects the detector to
+ * name each encoding with a confidence of 100. */
+static void TestUTF16(void)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    /* Notice the BOM on the start of this string */
+    UChar chars[] = {
+        0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
+        0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
+        0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
+        0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
+        0x064a, 0x062a, 0x0000};
+    int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars); /* count includes the final 0x0000 */
+    char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
+    char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
+    UCharsetDetector *csd = ucsdet_open(&status);
+    const UCharsetMatch *match;
+    const char *name;
+    int32_t conf;
+
+    ucsdet_setText(csd, beBytes, beLength, &status);
+    match = ucsdet_detect(csd, &status);
+    if (match == NULL) {
+        log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
+        goto try_le;
+    }
+
+    name = ucsdet_getName(match, &status);
+    conf = ucsdet_getConfidence(match, &status);
+
+    if (name == NULL || strcmp(name, "UTF-16BE") != 0) {
+        log_err("Encoding detection failure for UTF-16BE: got %s\n", name != NULL ? name : "(null)");
+    }
+
+    if (conf != 100) {
+        log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
+    }
+
+try_le:
+    /* Same checks for the little-endian bytes; reached even when the BE pass found no match. */
+    ucsdet_setText(csd, leBytes, leLength, &status);
+    match = ucsdet_detect(csd, &status);
+    if (match == NULL) {
+        log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
+        goto bail;
+    }
+
+    name = ucsdet_getName(match, &status);
+    conf = ucsdet_getConfidence(match, &status);
+
+    if (name == NULL || strcmp(name, "UTF-16LE") != 0) {
+        log_err("Encoding detection failure for UTF-16LE: got %s\n", name != NULL ? name : "(null)");
+    }
+
+    if (conf != 100) {
+        log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
+    }
+
+bail:
+    freeBytes(leBytes);
+    freeBytes(beBytes);
+    ucsdet_close(csd);
+}
+
+/* English text whose U+201C/U+201D quotes are encoded in windows-1252 must detect as windows-1252;
+ * English text without such characters must detect as ISO-8859-1. */
+static void TestC1Bytes(void)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    const char *ssISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
+    const char *ssWindows = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
+    int32_t sISOLength = 0, sWindowsLength = 0;
+    UChar *sISO = unescape(ssISO, &sISOLength);
+    UChar *sWindows = unescape(ssWindows, &sWindowsLength);
+    int32_t lISO = 0, lWindows = 0;
+    char *bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
+    char *bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
+    UCharsetDetector *csd = ucsdet_open(&status);
+    const UCharsetMatch *match;
+    const char *name;
+
+    ucsdet_setText(csd, bWindows, lWindows, &status);
+    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        log_err("English text with C1 bytes got no matches.\n");
+        goto bail;
+    }
+
+    name = ucsdet_getName(match, &status);
+    if (name == NULL || strcmp(name, "windows-1252") != 0) {
+        log_err("English text with C1 bytes does not detect as windows-1252, but as %s\n", name != NULL ? name : "(null)");
+    }
+
+    /* Second pass on the same detector: the sample without C1 bytes. */
+    ucsdet_setText(csd, bISO, lISO, &status);
+    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        log_err("English text without C1 bytes got no matches.\n");
+        goto bail;
+    }
+
+    name = ucsdet_getName(match, &status);
+    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
+        log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name != NULL ? name : "(null)");
+    }
+
+bail:
+    freeBytes(bWindows);
+    freeBytes(bISO);
+    ucsdet_close(csd);
+}
+
+/* ISO-8859-1 bytes of a French sentence wrapped in English-looking markup tags: with the input
+ * filter enabled the language must come back as "fr", with it disabled as "en". */
+static void TestInputFilter(void)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    const char *ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
+    int32_t sLength = 0;
+    UChar *s  = unescape(ss, &sLength);
+    int32_t byteLength = 0;
+    char *bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
+    UCharsetDetector *csd = ucsdet_open(&status);
+    const UCharsetMatch *match;
+    const char *lang, *name;
+
+    ucsdet_enableInputFilter(csd, TRUE);
+    if (!ucsdet_isInputFilterEnabled(csd)) {
+        log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
+    }
+
+    ucsdet_setText(csd, bytes, byteLength, &status);
+    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        log_err("Turning on the input filter resulted in no matches.\n");
+        goto turn_off;
+    }
+
+    name = ucsdet_getName(match, &status);
+
+    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
+        log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name != NULL ? name : "(null)"); /* never hand NULL to %s */
+    } else {
+        lang = ucsdet_getLanguage(match, &status);
+
+        if (lang == NULL || strcmp(lang, "fr") != 0) {
+            log_err("Input filter did not strip markup!\n");
+        }
+    }
+
+turn_off:
+    ucsdet_enableInputFilter(csd, FALSE);
+    ucsdet_setText(csd, bytes, byteLength, &status);
+    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        log_err("Turning off the input filter resulted in no matches.\n");
+        goto bail;
+    }
+
+    name = ucsdet_getName(match, &status);
+
+    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
+        log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name != NULL ? name : "(null)"); /* never hand NULL to %s */
+    } else {
+        lang = ucsdet_getLanguage(match, &status);
+
+        if (lang == NULL || strcmp(lang, "en") != 0) {
+            log_err("Unfiltered input did not detect as English!\n");
+        }
+    }
+
+bail:
+    freeBytes(bytes);
+    ucsdet_close(csd);
+}
+
diff --git a/icu4c/source/test/intltest/csdetest.cpp b/icu4c/source/test/intltest/csdetest.cpp
index aeb93f6d88a..722e948a1ec 100644
--- a/icu4c/source/test/intltest/csdetest.cpp
+++ b/icu4c/source/test/intltest/csdetest.cpp
@@ -375,7 +375,7 @@ void CharsetDetectionTest::InputFilterTest()
     name = ucsdet_getName(match, &status);
 
     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
-        errln("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
+        errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
     } else {
         lang = ucsdet_getLanguage(match, &status);
 
@@ -397,7 +397,7 @@ turn_off:
     name = ucsdet_getName(match, &status);
 
     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
-        errln("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
+        errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
     } else {
         lang = ucsdet_getLanguage(match, &status);