ICU-434 new sample [convsamp]

X-SVN-Rev: 1606
2025-04-08 06:53:45 +00:00 · 2000-06-12 20:29:04 +00:00 · 2000-06-12 20:29:04 +00:00 · baffa188a7
commit baffa188a7
parent 88aa217d7c
3 changed files with 424 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -75,6 +75,7 @@ icu4c/source/data/brkitr/wordBE.brk -text
 icu4c/source/data/brkitr/wordLE.brk -text
 icu4c/source/data/brkitr/word_thBE.brk -text
 icu4c/source/data/brkitr/word_thLE.brk -text
+icu4c/source/samples/ucnv/data01.ut8 -text
 icu4c/source/test/testdata/en_US.uni -text
 icu4c/source/test/testdata/uni-text.txt -text
 icu4j/src/com/ibm/demo/rbbi/english.dict -text
--- a/icu4c/source/samples/ucnv/convsamp.cpp
+++ b/icu4c/source/samples/ucnv/convsamp.cpp
@ -0,0 +1,408 @@
+/**************************************************************************
+*
+*   Copyright (C) 2000, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+***************************************************************************
+*   file name:  convsamp.c
+*   encoding:   ASCII (7-bit)
+*
+*   created on: 2000may30
+*   created by: Steven R. Loomis
+*
+*   Sample code for the ICU conversion routines.
+*
+* Note: Nothing special is needed to build this sample. Link with
+*       the icu UC and icu I18N libraries. 
+* 
+*       I use 'assert' for error checking, you probably will want
+*       something more flexible.  '***BEGIN SAMPLE***' and 
+*       '***END SAMPLE***' mark pieces suitable for stand alone
+*       code snippets.
+*
+*/
+
+#include <stdio.h>
+#include <ctype.h>            /* for isspace, etc.    */
+#include <assert.h>
+
+#include "unicode/utypes.h"   /* Basic ICU data types */
+#include "unicode/ucnv.h"     /* C   Converter API    */
+#include "unicode/convert.h"  /* C++ Converter API    */
+#include "unicode/ustring.h"  /* some more string fcns*/
+
+/* Some utility functions */
+
+static const UChar kNone[] = { 0x0000 };
+
+/* Print a UChar if possible, in seven characters. */
+void prettyPrintUChar(UChar c)
+{
+  if(  (c <= 0x007F) &&
+       (isgraph(c))  ) {
+    printf("  '%c'  ", (char)(0x00FF&c));
+  } else if ( c > 0x007F ) {
+    char buf[100];
+    UErrorCode status = U_ZERO_ERROR;
+    UTextOffset o;
+    
+    o = u_charName(c, U_UNICODE_CHAR_NAME, buf, 100, &status);
+    if(U_SUCCESS(status)) {
+      buf[6] = 0;
+      printf("%- 7s", buf);
+    } else {
+      printf("??????? ");
+    }
+  } else {
+    switch((char)(c & 0x007F)) {
+    case ' ':
+      printf("  ' '  ");
+      break;
+    case '\t':
+      printf("  \t   ");
+      break;
+    case '\n':
+      printf("  \n   ");
+      break;
+    default:
+      printf("       ");
+      break;
+    }
+  }
+}
+
+
+void printUChars(const char  *name = "?", 
+                 const UChar *uch  = kNone,
+                 int32_t     len   = -1 )
+{
+  int32_t i;
+
+  if( (len == -1) && (uch) ) {
+    len = u_strlen(uch);
+  }
+
+  printf("% 5s:", name);
+  for( i = 0; i <len; i++) {
+    printf("%- 6d ", i);
+  }
+  printf("\n");
+
+  printf("% 5s: ", "uni");
+  for( i = 0; i <len; i++) {
+    printf("\\u%04X ", (int)uch[i]);
+  }
+  printf("\n");
+
+  printf("% 5s: ", "ch");
+  for( i = 0; i <len; i++) {
+    prettyPrintUChar(uch[i]);
+  }
+  printf("\n");
+}
+
+void printBytes(const char  *name = "?", 
+                 const char *uch  = "",
+                 int32_t     len   = -1 )
+{
+  int32_t i;
+
+  if( (len == -1) && (uch) ) {
+    len = strlen(uch);
+  }
+
+  printf("% 5s:", name);
+  for( i = 0; i <len; i++) {
+    printf(" %- 4d", i);
+  }
+  printf("\n");
+
+  printf("% 5s: ", "uni");
+  for( i = 0; i <len; i++) {
+    printf("\\x%02X ", 0x00FF & (int)uch[i]);
+  }
+  printf("\n");
+
+  printf("% 5s: ", "ch");
+  for( i = 0; i <len; i++) {
+    if(isgraph(uch[i])) {
+      printf(" '%c' ", (char)uch[i]);
+    } else {
+      printf("     ");
+    }
+  }
+  printf("\n");
+}
+
+
+/*******************************************************************
+  Very simple C++ sample to convert the word 'Moscow' in Russian, followed
+  by an exclamation mark (!) into the KOI8-R Russian code page.
+
+  This example first creates a UnicodeString out of the Unicode chars.
+
+  targetSize must be set to the amount of space available in the target
+  buffer. After UnicodeConverterCPP::fromUnicodeString() is called, 
+  targetSize will contain the number of bytes in target[] which were
+  used in the resulting codepage.  In this case, there is a 1:1 mapping
+  between the input and output characters. The exclamation mark has the
+  same value in both KOI8-R and Unicode.
+
+  src: 0      1      2      3      4      5      6     
+  uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021 
+   ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'  
+
+ targ:  0    1    2    3    4    5    6  
+  uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21 
+   ch:                                '!' 
+
+
+ */
+UErrorCode convsample_01()
+{
+  printf("\n\n==============================================\n"
+         "Sample 01: C++: simple Unicode -> koi8-r conversion\n");
+
+
+  // **************************** START SAMPLE *******************
+  // "Moscva!" in cyrillic letters, to be converted to the KOI8-R
+  // Russian code page.
+  UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
+                     0x0430, 0x0021, 0x0000 };
+  char target[100];
+  int32_t targetSize = sizeof(target);
+  UnicodeString myString(source);
+  UErrorCode status = U_ZERO_ERROR;
+
+  // set up the converter
+  UnicodeConverterCPP conv("koi8-r", status);
+  assert(U_SUCCESS(status));
+
+  // convert to KOI8-R
+  conv.fromUnicodeString(target, targetSize, myString, status);
+  assert(U_SUCCESS(status));
+
+  // ***************************** END SAMPLE ********************
+  
+  // Print it out
+  printUChars("src", source);
+  printf("\n");
+  printBytes("targ", target, targetSize);
+
+  return U_ZERO_ERROR;
+}
+
+
+/******************************************************
+  Similar sample to the preceding one. 
+  You must call ucnv_close to clean up the memory used by the
+  converter.
+
+  'len' returns the number of OUTPUT bytes resulting from the 
+  conversion. In this case, it will be 9 even though there are
+  only 6 unicode characters going in. This is because the 
+  letters 'cat' each only take up one byte, but the remaining
+  three UChars each take up 2 bytes in the output codepage.
+
+  src: 0      1      2      3      4      5     
+  uni: \u0063 \u0061 \u0074 \u732B \uFF2F \uFF2B 
+   ch:   'c'    'a'    't'  CJK UN               
+
+ targ:  0    1    2    3    4    5    6    7    8  
+  uni: \x63 \x61 \x74 \x94 \x4C \x82 \x6E \x82 \x6A 
+   ch:  'c'  'a'  't'   [not readable here........]
+
+ */
+
+UErrorCode convsample_02()
+{
+  printf("\n\n==============================================\n"
+         "Sample 02: C: simple Unicode -> Shift_Jis conversion\n");
+
+
+  // **************************** START SAMPLE *******************
+  // "cat<cat>OK"
+  UChar source[] = {   0x0063, 0x0061, 0x0074, 0x732B, 0xFF2F, 0xFF2B,
+                     0x0000 };
+  char target[100];
+  UErrorCode status = U_ZERO_ERROR;
+  UConverter *conv;
+  int32_t     len;
+
+  // set up the converter
+  conv = ucnv_open("shift_jis", &status);
+  assert(U_SUCCESS(status));
+
+  // convert to shift-jis
+  len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
+  assert(U_SUCCESS(status));
+
+  // close the converter
+  ucnv_close(conv);
+
+  // ***************************** END SAMPLE ********************
+  
+  // Print it out
+  printUChars("src", source);
+  printf("\n");
+  printBytes("targ", target, len);
+
+  return U_ZERO_ERROR;
+}
+
+
+UErrorCode convsample_03()
+{
+  printf("\n\n==============================================\n"
+         "Sample 03: C: print out all converters\n");
+
+  int32_t count;
+  int32_t i;
+
+  // **************************** START SAMPLE *******************
+  count = ucnv_countAvailable();
+  printf("Available converters: %d\n", count);
+  
+  for(i=0;i<count;i++) 
+  {
+    printf("%s ", ucnv_getAvailableName(i));
+  }
+
+  // ***************************** END SAMPLE ********************
+  
+  printf("\n");
+
+  return U_ZERO_ERROR;
+}
+
+
+
+#define BUFFERSIZE 17 /* make it interesting :) */
+
+/*
+  Converting from a codepage to Unicode in bulk..
+  What is the best way to determine the buffer size?
+
+     The 'buffersize' is in bytes of input.
+    For a given converter, divinding this by the minimum char size
+    give you the maximum number of Unicode characters that could be
+    expected for a given number of input bytes.
+     see: ucnv_getMinCharSize()
+
+     For example, a single byte codepage like 'Latin-3' has a 
+    minimum char size of 1. (It takes at least 1 byte to represent
+    each Unicode char.) So the unicode buffer has the same number of
+    UChars as the input buffer has bytes.
+
+     In a strictly double byte codepage such as cp1362 (Windows
+    Korean), the minimum char size is 2. So, only half as many Unicode
+    chars as bytes are needed.
+
+     This work to calculate the buffer size is an optimization. Any
+    size of input and output buffer can be used, as long as the
+    program handles the following cases: If the input buffer is empty,
+    the source pointer will be equal to sourceLimit.  If the output
+    buffer is empty, U_INDEX_OUTOFBOUNDS_ERROR will be returned. 
+ */
+
+UErrorCode convsample_05()
+{
+  printf("\n\n==============================================\n"
+         "Sample 05: C: count the number of letters in a UTF-8 document\n");
+
+  FILE *f;
+  int32_t count;
+  char inBuf[BUFFERSIZE];
+  const char *source;
+  const char *sourceLimit;
+  UChar *uBuf;
+  UChar *target;
+  UChar *targetLimit;
+  UChar *p;
+  int32_t uBufSize = 0;
+  UConverter *conv;
+  UErrorCode status = U_ZERO_ERROR;
+  uint32_t letters=0, total=0;
+
+  f = fopen("data01.ut8", "r");
+  if(!f)
+  {
+    fprintf(stderr, "Couldn't open file 'data01.ut8' (UTF-8 data file).\n");
+    return U_FILE_ACCESS_ERROR;
+  }
+
+  // **************************** START SAMPLE *******************
+  conv = ucnv_open("utf-8", &status);
+  assert(U_SUCCESS(status));
+
+  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
+  printf("input bytes %d / min chars %d = %d UChars\n",
+         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
+  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
+  assert(uBuf!=NULL);
+
+  // grab another buffer's worth
+  while((!feof(f)) && 
+        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
+  {
+    // Convert bytes to unicode
+    source = inBuf;
+    sourceLimit = inBuf + count;
+    
+    do
+    {
+        target = uBuf;
+        targetLimit = uBuf + uBufSize;
+        
+        ucnv_toUnicode(conv, &target, targetLimit, 
+                       &source, sourceLimit, NULL,
+                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
+                                   /* is true (when no more data will come) */
+                       &status);
+      
+        if(status != U_INDEX_OUTOFBOUNDS_ERROR)
+        {
+          // simply ran out of space - we'll reset the target ptr the next
+          // time through the loop.
+          status = U_ZERO_ERROR;
+        }
+        else
+        {
+          //  Check other errors here.
+          assert(U_SUCCESS(status));
+          // Break out of the loop (by force)
+        }
+
+        // Process the Unicode
+        // Todo: handle UTF-16/surrogates
+
+        for(p = uBuf; p<target; p++)
+        {
+          if(u_isalpha(*p))
+            letters++;
+          total++;
+        }
+    } while (source < sourceLimit); // while simply out of space
+  }
+
+  printf("%d letters out of %d total UChars.\n", letters, total);
+  
+  // ***************************** END SAMPLE ********************
+  ucnv_close(conv);
+
+  printf("\n");
+
+  return U_ZERO_ERROR;
+}
+#undef BUFFERSIZE
+
+/* main */
+
+int main()
+{
+  convsample_01();
+  convsample_02();
+  convsample_03();
+//convsample_04();  /* not written yet */
+  convsample_05();
+}
--- a/icu4c/source/samples/ucnv/data01.ut8
+++ b/icu4c/source/samples/ucnv/data01.ut8
@ -0,0 +1,15 @@
+-*- Coding: utf-8 ; -*- // for emacs
+[some latin, devanagari, and cyrillic text]
+raj  Rajasthani  राजेस्थानी
+     Konkani     कोंकणी
+     Haryanvi    हरियानवी
+rm Rhaeto-Romance रहेय्टो-रोमान्स
+rn Kirundi किरून्दी
+ro Romanian रूमानीयन्
+ru Russian रुसी футбол
+том
+атом
+Нева
+Fid-dinja hawn aktar imġienen minn nies f'sensiehom.
+Il-mistoqsija oħt il-għerf. (Asking is the sister of knowing.)
+aфутбол중앙일보－の“日本語”