From 1569b465a5bbe4d06c9d9a3ea6bfe465d4324a9f Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Tue, 30 Nov 1999 23:25:49 +0000 Subject: [PATCH] ICU-68 implement unicode string literals in c X-SVN-Rev: 263 --- icu4c/source/common/putil.c | 76 ++++++++++++++++++++++++++ icu4c/source/common/putil.h | 39 ++++++++++++- icu4c/source/common/ustring.h | 50 +++++++++++++++-- icu4c/source/test/intltest/strtest.cpp | 20 +++++++ 4 files changed, 180 insertions(+), 5 deletions(-) diff --git a/icu4c/source/common/putil.c b/icu4c/source/common/putil.c index 39fdf6e8716..34d7cdf91f4 100644 --- a/icu4c/source/common/putil.c +++ b/icu4c/source/common/putil.c @@ -1378,3 +1378,79 @@ const char* icu_getDefaultCodepage() return DEFAULT_CONVERTER_NAME; } + +#if U_CHARSET_FAMILY==U_EBCDIC_FAMILY +/* + * These maps for ASCII to/from EBCDIC are from + * "UTF-EBCDIC - EBCDIC-Friendly Unicode (or UCS) Transformation Format" + * at http://www.unicode.org/unicode/reports/tr16/ + * but modified to explicitly exclude the variant graphical characters + * that are in ASCII at 0xa0 and above. 
+ */ + +static uint8_t asciiFromEbcdic[256]={ + 0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07, + 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, 0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2E, 0x3C, 0x28, 0x2B, 0x7C, + 0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E, + 0x2D, 0x2F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2C, 0x25, 0x5F, 0x3E, 0x3F, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22, + 0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x00, 0x00, 0x00, 0x5B, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, + 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x5C, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F +}; + +static uint8_t ebcdicFromAscii[256]={ + 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, + 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 
0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F, + 0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, + 0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xAD, 0xE0, 0xBD, 0x5F, 0x6D, + 0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, + 0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B, + 0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +#endif + +U_CAPI void U_EXPORT2 +u_charsToUChars(const char *cs, UChar *us, UTextOffset length) { + while(length>0) { +#if U_CHARSET_FAMILY==U_ASCII_FAMILY + *us++=(UChar)(uint8_t)(*cs++); +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY + *us++=(UChar)asciiFromEbcdic[(uint8_t)(*cs++)]; +#else +# error U_CHARSET_FAMILY is not valid +#endif + --length; + } +} + +U_CAPI void U_EXPORT2 +u_UCharsToChars(const UChar *us, char *cs, UTextOffset length) { + while(length>0) { +#if U_CHARSET_FAMILY==U_ASCII_FAMILY + *cs++=(char)(*us++); +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY + *cs++=(char)ebcdicFromAscii[*us++]; +#else +# error U_CHARSET_FAMILY is not valid +#endif + --length; + } +} diff --git a/icu4c/source/common/putil.h b/icu4c/source/common/putil.h index 6a623aad11e..495f862aba6 100644 --- a/icu4c/source/common/putil.h +++ b/icu4c/source/common/putil.h @@ -158,7 +158,7 @@ U_CAPI const char* U_EXPORT2 icu_getDefaultLocaleID(void); */ U_CAPI double U_EXPORT2 icu_nextDouble(double d, bool_t positive); -/* +/** * Filesystem file and path separator 
characters. * Example: '/' and ':' on Unix, '\\' and ';' on Windows. */ @@ -179,4 +179,41 @@ U_CAPI double U_EXPORT2 icu_nextDouble(double d, bool_t positive); # define U_PATH_SEP_STRING ":" #endif +/** + * Convert char characters to UChar characters. + * This utility function is useful only for "invariant characters" + * that are encoded in the platform default encoding. + * They are a small, constant subset of the encoding and include + * just the latin letters, digits, and some punctuation. + * For details, see utypes.h . + * + * @param cs Input string, points to length + * character bytes from a subset of the platform encoding. + * @param us Output string, points to memory for length + * Unicode characters. + * @param length The number of characters to convert; this may + * include the terminating NUL. + */ +U_CAPI void U_EXPORT2 +u_charsToUChars(const char *cs, UChar *us, UTextOffset length); + +/** + * Convert UChar characters to char characters. + * This utility function is useful only for "invariant characters" + * that can be encoded in the platform default encoding. + * They are a small, constant subset of the encoding and include + * just the latin letters, digits, and some punctuation. + * For details, see utypes.h . + * + * @param us Input string, points to length + * Unicode characters that can be encoded with the + * codepage-invariant subset of the platform encoding. + * @param cs Output string, points to memory for length + * character bytes. + * @param length The number of characters to convert; this may + * include the terminating NUL. 
+ */ +U_CAPI void U_EXPORT2 +u_UCharsToChars(const UChar *us, char *cs, UTextOffset length); + #endif diff --git a/icu4c/source/common/ustring.h b/icu4c/source/common/ustring.h index 595e4490f8c..48938586120 100644 --- a/icu4c/source/common/ustring.h +++ b/icu4c/source/common/ustring.h @@ -158,9 +158,51 @@ U_CAPI UChar* U_EXPORT2 u_uastrncpy(UChar *ucs1, */ U_CAPI char* U_EXPORT2 u_austrcpy(char *s1, const UChar *us2 ); + +/** + * Unicode String literals in C. + * We need one macro to declare a variable for the string + * and to statically preinitialize it if possible, + * and a second macro to dynamically initialize such a string variable if necessary. + * + * The macros are defined for maximum performance. + * They work only for strings that contain "invariant characters", i.e., + * only latin letters, digits, and some punctuation. + * See utypes.h for details. + * + * A pair of macros for a single string must be used with the same + * parameters. + * The string parameter must be a C string literal. + * The length of the string, not including the terminating + * NUL, must be specified as a constant. + * The U_STRING_DECL macro should be invoked exactly once for one + * such string variable before it is used. + * + * Usage: + *
+ *     U_STRING_DECL(ustringVar1, "Quick-Fox 2", 11);
+ *     U_STRING_DECL(ustringVar2, "jumps 5%", 8);
+ *     static bool_t didInit=FALSE;
+ *     
+ *     int32_t function() {
+ *         if(!didInit) {
+ *             U_STRING_INIT(ustringVar1, "Quick-Fox 2", 11);
+ *             U_STRING_INIT(ustringVar2, "jumps 5%", 8);
+ *             didInit=TRUE;
+ *         }
+ *         return u_strcmp(ustringVar1, ustringVar2);
+ *     }
+ * 
+ */ +#if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY +# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (UChar *)L ## cs } +# define U_STRING_INIT(var, cs, length) +#elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY +# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (UChar *)cs } +# define U_STRING_INIT(var, cs, length) +#else +# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1] +# define U_STRING_INIT(var, cs, length) u_charsToUChars(cs, var, length+1) #endif - - - - +#endif diff --git a/icu4c/source/test/intltest/strtest.cpp b/icu4c/source/test/intltest/strtest.cpp index 0926de1cc33..de4bcfa0384 100644 --- a/icu4c/source/test/intltest/strtest.cpp +++ b/icu4c/source/test/intltest/strtest.cpp @@ -22,6 +22,7 @@ #include "putil.h" #include "intltest.h" #include "strtest.h" +#include "ustring.h" void StringTest::TestEndian() { union { @@ -49,6 +50,8 @@ void StringTest::TestCharsetFamily() { } } +U_STRING_DECL(ustringVar, "aZ0 -", 5); + void StringTest::runIndexedTest(int32_t index, bool_t exec, char *&name, char *par) { if(exec) { logln("TestSuite Character and String Test: "); @@ -72,6 +75,23 @@ void StringTest::runIndexedTest(int32_t index, bool_t exec, char *&name, char *p TestCharsetFamily(); } break; + case 3: + name="Test_U_STRING"; + if(exec) { + U_STRING_INIT(ustringVar, "aZ0 -", 5); + if( sizeof(ustringVar)/sizeof(*ustringVar)!=6 || + ustringVar[0]!=0x61 || + ustringVar[1]!=0x5a || + ustringVar[2]!=0x30 || + ustringVar[3]!=0x20 || + ustringVar[4]!=0x2d || + ustringVar[5]!=0 + ) { + errln("Test_U_STRING: U_STRING_DECL with U_STRING_INIT does not work right! " + "See putil.h and utypes.h with platform.h."); + } + } + break; default: name=""; break;