mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-434 new sample [convsamp]
X-SVN-Rev: 1606
This commit is contained in:
parent
88aa217d7c
commit
baffa188a7
3 changed files with 424 additions and 0 deletions
1
.gitattributes
vendored
1
.gitattributes
vendored
|
@ -75,6 +75,7 @@ icu4c/source/data/brkitr/wordBE.brk -text
|
|||
icu4c/source/data/brkitr/wordLE.brk -text
|
||||
icu4c/source/data/brkitr/word_thBE.brk -text
|
||||
icu4c/source/data/brkitr/word_thLE.brk -text
|
||||
icu4c/source/samples/ucnv/data01.ut8 -text
|
||||
icu4c/source/test/testdata/en_US.uni -text
|
||||
icu4c/source/test/testdata/uni-text.txt -text
|
||||
icu4j/src/com/ibm/demo/rbbi/english.dict -text
|
||||
|
|
408
icu4c/source/samples/ucnv/convsamp.cpp
Normal file
408
icu4c/source/samples/ucnv/convsamp.cpp
Normal file
|
@ -0,0 +1,408 @@
|
|||
/**************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
***************************************************************************
|
||||
* file name: convsamp.c
|
||||
* encoding: ASCII (7-bit)
|
||||
*
|
||||
* created on: 2000may30
|
||||
* created by: Steven R. Loomis
|
||||
*
|
||||
* Sample code for the ICU conversion routines.
|
||||
*
|
||||
* Note: Nothing special is needed to build this sample. Link with
|
||||
* the icu UC and icu I18N libraries.
|
||||
*
|
||||
* I use 'assert' for error checking, you probably will want
|
||||
* something more flexible. '***BEGIN SAMPLE***' and
|
||||
* '***END SAMPLE***' mark pieces suitable for stand alone
|
||||
* code snippets.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <ctype.h> /* for isspace, etc. */
|
||||
#include <assert.h>
|
||||
|
||||
#include "unicode/utypes.h" /* Basic ICU data types */
|
||||
#include "unicode/ucnv.h" /* C Converter API */
|
||||
#include "unicode/convert.h" /* C++ Converter API */
|
||||
#include "unicode/ustring.h" /* some more string fcns*/
|
||||
|
||||
/* Some utility functions */
|
||||
|
||||
static const UChar kNone[] = { 0x0000 };
|
||||
|
||||
/* Print a UChar if possible, in seven characters. */
|
||||
void prettyPrintUChar(UChar c)
|
||||
{
|
||||
if( (c <= 0x007F) &&
|
||||
(isgraph(c)) ) {
|
||||
printf(" '%c' ", (char)(0x00FF&c));
|
||||
} else if ( c > 0x007F ) {
|
||||
char buf[100];
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UTextOffset o;
|
||||
|
||||
o = u_charName(c, U_UNICODE_CHAR_NAME, buf, 100, &status);
|
||||
if(U_SUCCESS(status)) {
|
||||
buf[6] = 0;
|
||||
printf("%- 7s", buf);
|
||||
} else {
|
||||
printf("??????? ");
|
||||
}
|
||||
} else {
|
||||
switch((char)(c & 0x007F)) {
|
||||
case ' ':
|
||||
printf(" ' ' ");
|
||||
break;
|
||||
case '\t':
|
||||
printf(" \t ");
|
||||
break;
|
||||
case '\n':
|
||||
printf(" \n ");
|
||||
break;
|
||||
default:
|
||||
printf(" ");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void printUChars(const char *name = "?",
|
||||
const UChar *uch = kNone,
|
||||
int32_t len = -1 )
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
if( (len == -1) && (uch) ) {
|
||||
len = u_strlen(uch);
|
||||
}
|
||||
|
||||
printf("% 5s:", name);
|
||||
for( i = 0; i <len; i++) {
|
||||
printf("%- 6d ", i);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("% 5s: ", "uni");
|
||||
for( i = 0; i <len; i++) {
|
||||
printf("\\u%04X ", (int)uch[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("% 5s: ", "ch");
|
||||
for( i = 0; i <len; i++) {
|
||||
prettyPrintUChar(uch[i]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void printBytes(const char *name = "?",
|
||||
const char *uch = "",
|
||||
int32_t len = -1 )
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
if( (len == -1) && (uch) ) {
|
||||
len = strlen(uch);
|
||||
}
|
||||
|
||||
printf("% 5s:", name);
|
||||
for( i = 0; i <len; i++) {
|
||||
printf(" %- 4d", i);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("% 5s: ", "uni");
|
||||
for( i = 0; i <len; i++) {
|
||||
printf("\\x%02X ", 0x00FF & (int)uch[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("% 5s: ", "ch");
|
||||
for( i = 0; i <len; i++) {
|
||||
if(isgraph(uch[i])) {
|
||||
printf(" '%c' ", (char)uch[i]);
|
||||
} else {
|
||||
printf(" ");
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
Very simple C++ sample to convert the word 'Moscow' in Russian, followed
|
||||
by an exclamation mark (!) into the KOI8-R Russian code page.
|
||||
|
||||
This example first creates a UnicodeString out of the Unicode chars.
|
||||
|
||||
targetSize must be set to the amount of space available in the target
|
||||
buffer. After UnicodeConverterCPP::fromUnicodeString() is called,
|
||||
targetSize will contain the number of bytes in target[] which were
|
||||
used in the resulting codepage. In this case, there is a 1:1 mapping
|
||||
between the input and output characters. The exclamation mark has the
|
||||
same value in both KOI8-R and Unicode.
|
||||
|
||||
src: 0 1 2 3 4 5 6
|
||||
uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
|
||||
ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!'
|
||||
|
||||
targ: 0 1 2 3 4 5 6
|
||||
uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
|
||||
ch: '!'
|
||||
|
||||
|
||||
*/
|
||||
UErrorCode convsample_01()
|
||||
{
|
||||
printf("\n\n==============================================\n"
|
||||
"Sample 01: C++: simple Unicode -> koi8-r conversion\n");
|
||||
|
||||
|
||||
// **************************** START SAMPLE *******************
|
||||
// "Moscva!" in cyrillic letters, to be converted to the KOI8-R
|
||||
// Russian code page.
|
||||
UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
|
||||
0x0430, 0x0021, 0x0000 };
|
||||
char target[100];
|
||||
int32_t targetSize = sizeof(target);
|
||||
UnicodeString myString(source);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
// set up the converter
|
||||
UnicodeConverterCPP conv("koi8-r", status);
|
||||
assert(U_SUCCESS(status));
|
||||
|
||||
// convert to KOI8-R
|
||||
conv.fromUnicodeString(target, targetSize, myString, status);
|
||||
assert(U_SUCCESS(status));
|
||||
|
||||
// ***************************** END SAMPLE ********************
|
||||
|
||||
// Print it out
|
||||
printUChars("src", source);
|
||||
printf("\n");
|
||||
printBytes("targ", target, targetSize);
|
||||
|
||||
return U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
|
||||
/******************************************************
|
||||
Similar sample to the preceding one.
|
||||
You must call ucnv_close to clean up the memory used by the
|
||||
converter.
|
||||
|
||||
'len' returns the number of OUTPUT bytes resulting from the
|
||||
conversion. In this case, it will be 9 even though there are
|
||||
only 6 unicode characters going in. This is because the
|
||||
letters 'cat' each only take up one byte, but the remaining
|
||||
three UChars each take up 2 bytes in the output codepage.
|
||||
|
||||
src: 0 1 2 3 4 5
|
||||
uni: \u0063 \u0061 \u0074 \u732B \uFF2F \uFF2B
|
||||
ch: 'c' 'a' 't' CJK UN
|
||||
|
||||
targ: 0 1 2 3 4 5 6 7 8
|
||||
uni: \x63 \x61 \x74 \x94 \x4C \x82 \x6E \x82 \x6A
|
||||
ch: 'c' 'a' 't' [not readable here........]
|
||||
|
||||
*/
|
||||
|
||||
UErrorCode convsample_02()
|
||||
{
|
||||
printf("\n\n==============================================\n"
|
||||
"Sample 02: C: simple Unicode -> Shift_Jis conversion\n");
|
||||
|
||||
|
||||
// **************************** START SAMPLE *******************
|
||||
// "cat<cat>OK"
|
||||
UChar source[] = { 0x0063, 0x0061, 0x0074, 0x732B, 0xFF2F, 0xFF2B,
|
||||
0x0000 };
|
||||
char target[100];
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UConverter *conv;
|
||||
int32_t len;
|
||||
|
||||
// set up the converter
|
||||
conv = ucnv_open("shift_jis", &status);
|
||||
assert(U_SUCCESS(status));
|
||||
|
||||
// convert to shift-jis
|
||||
len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
|
||||
assert(U_SUCCESS(status));
|
||||
|
||||
// close the converter
|
||||
ucnv_close(conv);
|
||||
|
||||
// ***************************** END SAMPLE ********************
|
||||
|
||||
// Print it out
|
||||
printUChars("src", source);
|
||||
printf("\n");
|
||||
printBytes("targ", target, len);
|
||||
|
||||
return U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
|
||||
UErrorCode convsample_03()
|
||||
{
|
||||
printf("\n\n==============================================\n"
|
||||
"Sample 03: C: print out all converters\n");
|
||||
|
||||
int32_t count;
|
||||
int32_t i;
|
||||
|
||||
// **************************** START SAMPLE *******************
|
||||
count = ucnv_countAvailable();
|
||||
printf("Available converters: %d\n", count);
|
||||
|
||||
for(i=0;i<count;i++)
|
||||
{
|
||||
printf("%s ", ucnv_getAvailableName(i));
|
||||
}
|
||||
|
||||
// ***************************** END SAMPLE ********************
|
||||
|
||||
printf("\n");
|
||||
|
||||
return U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define BUFFERSIZE 17 /* make it interesting :) */
|
||||
|
||||
/*
|
||||
Converting from a codepage to Unicode in bulk..
|
||||
What is the best way to determine the buffer size?
|
||||
|
||||
The 'buffersize' is in bytes of input.
|
||||
For a given converter, divinding this by the minimum char size
|
||||
give you the maximum number of Unicode characters that could be
|
||||
expected for a given number of input bytes.
|
||||
see: ucnv_getMinCharSize()
|
||||
|
||||
For example, a single byte codepage like 'Latin-3' has a
|
||||
minimum char size of 1. (It takes at least 1 byte to represent
|
||||
each Unicode char.) So the unicode buffer has the same number of
|
||||
UChars as the input buffer has bytes.
|
||||
|
||||
In a strictly double byte codepage such as cp1362 (Windows
|
||||
Korean), the minimum char size is 2. So, only half as many Unicode
|
||||
chars as bytes are needed.
|
||||
|
||||
This work to calculate the buffer size is an optimization. Any
|
||||
size of input and output buffer can be used, as long as the
|
||||
program handles the following cases: If the input buffer is empty,
|
||||
the source pointer will be equal to sourceLimit. If the output
|
||||
buffer is empty, U_INDEX_OUTOFBOUNDS_ERROR will be returned.
|
||||
*/
|
||||
|
||||
UErrorCode convsample_05()
|
||||
{
|
||||
printf("\n\n==============================================\n"
|
||||
"Sample 05: C: count the number of letters in a UTF-8 document\n");
|
||||
|
||||
FILE *f;
|
||||
int32_t count;
|
||||
char inBuf[BUFFERSIZE];
|
||||
const char *source;
|
||||
const char *sourceLimit;
|
||||
UChar *uBuf;
|
||||
UChar *target;
|
||||
UChar *targetLimit;
|
||||
UChar *p;
|
||||
int32_t uBufSize = 0;
|
||||
UConverter *conv;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
uint32_t letters=0, total=0;
|
||||
|
||||
f = fopen("data01.ut8", "r");
|
||||
if(!f)
|
||||
{
|
||||
fprintf(stderr, "Couldn't open file 'data01.ut8' (UTF-8 data file).\n");
|
||||
return U_FILE_ACCESS_ERROR;
|
||||
}
|
||||
|
||||
// **************************** START SAMPLE *******************
|
||||
conv = ucnv_open("utf-8", &status);
|
||||
assert(U_SUCCESS(status));
|
||||
|
||||
uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
|
||||
printf("input bytes %d / min chars %d = %d UChars\n",
|
||||
BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
|
||||
uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
|
||||
assert(uBuf!=NULL);
|
||||
|
||||
// grab another buffer's worth
|
||||
while((!feof(f)) &&
|
||||
((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
|
||||
{
|
||||
// Convert bytes to unicode
|
||||
source = inBuf;
|
||||
sourceLimit = inBuf + count;
|
||||
|
||||
do
|
||||
{
|
||||
target = uBuf;
|
||||
targetLimit = uBuf + uBufSize;
|
||||
|
||||
ucnv_toUnicode(conv, &target, targetLimit,
|
||||
&source, sourceLimit, NULL,
|
||||
feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
|
||||
/* is true (when no more data will come) */
|
||||
&status);
|
||||
|
||||
if(status != U_INDEX_OUTOFBOUNDS_ERROR)
|
||||
{
|
||||
// simply ran out of space - we'll reset the target ptr the next
|
||||
// time through the loop.
|
||||
status = U_ZERO_ERROR;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Check other errors here.
|
||||
assert(U_SUCCESS(status));
|
||||
// Break out of the loop (by force)
|
||||
}
|
||||
|
||||
// Process the Unicode
|
||||
// Todo: handle UTF-16/surrogates
|
||||
|
||||
for(p = uBuf; p<target; p++)
|
||||
{
|
||||
if(u_isalpha(*p))
|
||||
letters++;
|
||||
total++;
|
||||
}
|
||||
} while (source < sourceLimit); // while simply out of space
|
||||
}
|
||||
|
||||
printf("%d letters out of %d total UChars.\n", letters, total);
|
||||
|
||||
// ***************************** END SAMPLE ********************
|
||||
ucnv_close(conv);
|
||||
|
||||
printf("\n");
|
||||
|
||||
return U_ZERO_ERROR;
|
||||
}
|
||||
#undef BUFFERSIZE
|
||||
|
||||
/* main */
|
||||
|
||||
int main()
|
||||
{
|
||||
convsample_01();
|
||||
convsample_02();
|
||||
convsample_03();
|
||||
//convsample_04(); /* not written yet */
|
||||
convsample_05();
|
||||
}
|
15
icu4c/source/samples/ucnv/data01.ut8
Normal file
15
icu4c/source/samples/ucnv/data01.ut8
Normal file
|
@ -0,0 +1,15 @@
|
|||
-*- Coding: utf-8 ; -*- // for emacs
|
||||
[some latin, devanagari, and cyrillic text]
|
||||
raj Rajasthani राजेस्थानी
|
||||
Konkani कोंकणी
|
||||
Haryanvi हरियानवी
|
||||
rm Rhaeto-Romance रहेय्टो-रोमान्स
|
||||
rn Kirundi किरून्दी
|
||||
ro Romanian रूमानीयन्
|
||||
ru Russian रुसी футбол
|
||||
том
|
||||
атом
|
||||
Нева
|
||||
Fid-dinja hawn aktar imġienen minn nies f'sensiehom.
|
||||
Il-mistoqsija oħt il-għerf. (Asking is the sister of knowing.)
|
||||
aфутбол중앙일보-の“日本語”
|
Loading…
Add table
Reference in a new issue