From cf3e1e7f4097ce968c980c8abb2989a9fc439307 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 16 Mar 2017 19:52:20 +0000 Subject: [PATCH 1/5] ICU-12766 turn on escaping for z (mh-os390) X-SVN-Rev: 39840 --- icu4c/source/config/mh-os390 | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/icu4c/source/config/mh-os390 b/icu4c/source/config/mh-os390 index efcb5b4c035..9bd3f9bf4c9 100644 --- a/icu4c/source/config/mh-os390 +++ b/icu4c/source/config/mh-os390 @@ -208,10 +208,27 @@ endif %.o: $(srcdir)/%.c $(COMPILE.c) $(DYNAMICCPPFLAGS) $(DYNAMICCFLAGS) -o $@ $< +# This causes escapesrc to be built before other ICU targets. +NEED_ESCAPING=YES + +ifneq ($(SKIP_ESCAPING),) %.$(STATIC_O): $(srcdir)/%.cpp $(COMPILE.cc) $(STATICCPPFLAGS) $(STATICCXXFLAGS) -o $@ $< %.o: $(srcdir)/%.cpp $(COMPILE.cc) $(DYNAMICCPPFLAGS) $(DYNAMICCXXFLAGS) -o $@ $< +else +# convert *.cpp files to _*.cpp with \u / \U escaping +CLEANFILES += _*.cpp + +# the actual escaping +_%.cpp: $(srcdir)/%.cpp + @$(BINDIR)/escapesrc$(EXEEXT) $< $@ + +%.$(STATIC_O): _%.cpp + $(COMPILE.cc) $(STATICCPPFLAGS) $(STATICCXXFLAGS) -o $@ $< +%.o: _%.cpp + $(COMPILE.cc) $(DYNAMICCPPFLAGS) $(DYNAMICCXXFLAGS) -o $@ $< +endif ## Dependency rules %.d : %.u From 02cf91d3e6a2f739a8b7c3fd3ac1df211082aef5 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 16 Mar 2017 21:49:19 +0000 Subject: [PATCH 2/5] ICU-12766 support escaping on ebcdic (z) X-SVN-Rev: 39842 --- icu4c/source/common/unicode/platform.h | 2 +- icu4c/source/tools/escapesrc/Makefile.in | 6 + icu4c/source/tools/escapesrc/cptbl.h | 521 +++++++++++++++++++++ icu4c/source/tools/escapesrc/escapesrc.cpp | 29 +- icu4c/source/tools/escapesrc/tblgen.cpp | 80 ++++ 5 files changed, 633 insertions(+), 5 deletions(-) create mode 100644 icu4c/source/tools/escapesrc/cptbl.h create mode 100644 icu4c/source/tools/escapesrc/tblgen.cpp diff --git a/icu4c/source/common/unicode/platform.h b/icu4c/source/common/unicode/platform.h index 41b363093cc..dc7d386281a 100644 --- a/icu4c/source/common/unicode/platform.h +++ b/icu4c/source/common/unicode/platform.h @@ -486,7 +486,7 @@ # define U_CPLUSPLUS_VERSION 1 #endif -#if (U_PLATFORM == U_PF_AIX) && defined(__cplusplus) &&(U_CPLUSPLUS_VERSION < 11) +#if (U_PLATFORM == U_PF_AIX || U_PLATFORM == U_PF_OS390) && defined(__cplusplus) &&(U_CPLUSPLUS_VERSION < 11) // add in std::nullptr_t namespace std { typedef decltype(nullptr) nullptr_t; diff --git a/icu4c/source/tools/escapesrc/Makefile.in b/icu4c/source/tools/escapesrc/Makefile.in index 8a0e2c1f803..7580ccdc315 100644 --- a/icu4c/source/tools/escapesrc/Makefile.in +++ b/icu4c/source/tools/escapesrc/Makefile.in @@ -95,6 +95,12 @@ $(TARGET) : $(OBJECTS) cd $(top_builddir) \ && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status +# depends on ICU being built +gen-table: tblgen$(EXEEXT) + $(INVOKE) ./tblgen$(EXEEXT) > $(srcdir)/cptbl.h + +tblgen$(EXEEXT): tblgen.o + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) $(LIBICUUC) ifeq (,$(MAKECMDGOALS)) -include $(DEPS) diff --git a/icu4c/source/tools/escapesrc/cptbl.h b/icu4c/source/tools/escapesrc/cptbl.h new file mode 100644 index 00000000000..898e16c925d --- /dev/null +++ b/icu4c/source/tools/escapesrc/cptbl.h @@ -0,0 +1,521 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html +// generated by tblgen. You weren't going to edit it by hand, were you? + +static const char cp1047_8859_1[256] = { + (char)0x00, /* 00 */ + (char)0x01, /* 01 */ + (char)0x02, /* 02 */ + (char)0x03, /* 03 */ + (char)0x9C, /* 04 */ + (char)0x09, /* 05 */ + (char)0x86, /* 06 */ + (char)0x7F, /* 07 */ + (char)0x97, /* 08 */ + (char)0x8D, /* 09 */ + (char)0x8E, /* 0A */ + (char)0x0B, /* 0B */ + (char)0x0C, /* 0C */ + (char)0x0D, /* 0D */ + (char)0x0E, /* 0E */ + (char)0x0F, /* 0F */ + (char)0x10, /* 10 */ + (char)0x11, /* 11 */ + (char)0x12, /* 12 */ + (char)0x13, /* 13 */ + (char)0x9D, /* 14 */ + (char)0x85, /* 15 */ + (char)0x08, /* 16 */ + (char)0x87, /* 17 */ + (char)0x18, /* 18 */ + (char)0x19, /* 19 */ + (char)0x92, /* 1A */ + (char)0x8F, /* 1B */ + (char)0x1C, /* 1C */ + (char)0x1D, /* 1D */ + (char)0x1E, /* 1E */ + (char)0x1F, /* 1F */ + (char)0x80, /* 20 */ + (char)0x81, /* 21 */ + (char)0x82, /* 22 */ + (char)0x83, /* 23 */ + (char)0x84, /* 24 */ + (char)0x0A, /* 25 */ + (char)0x17, /* 26 */ + (char)0x1B, /* 27 */ + (char)0x88, /* 28 */ + (char)0x89, /* 29 */ + (char)0x8A, /* 2A */ + (char)0x8B, /* 2B */ + (char)0x8C, /* 2C */ + (char)0x05, /* 2D */ + (char)0x06, /* 2E */ + (char)0x07, /* 2F */ + (char)0x90, /* 30 */ + (char)0x91, /* 31 */ + (char)0x16, /* 32 */ + (char)0x93, /* 33 */ + (char)0x94, /* 34 */ + (char)0x95, /* 35 */ + (char)0x96, /* 36 */ + (char)0x04, /* 37 */ + (char)0x98, /* 38 */ + (char)0x99, /* 39 */ + (char)0x9A, /* 3A */ + (char)0x9B, /* 3B */ + (char)0x14, /* 3C */ + (char)0x15, /* 3D */ + (char)0x9E, /* 3E */ + (char)0x1A, /* 3F */ + (char)0x20, /* 40 */ + (char)0xA0, /* 41 */ + (char)0xE2, /* 42 */ + (char)0xE4, /* 43 */ + (char)0xE0, /* 44 */ + (char)0xE1, /* 45 */ + (char)0xE3, /* 46 */ + (char)0xE5, /* 47 */ + (char)0xE7, /* 48 */ + (char)0xF1, /* 49 */ + (char)0xA2, /* 4A */ + (char)0x2E, /* 4B */ + (char)0x3C, /* 4C */ + (char)0x28, /* 4D */ + (char)0x2B, /* 4E */ + (char)0x7C, /* 4F */ + (char)0x26, /* 50 */ + (char)0xE9, /* 51 */ + (char)0xEA, /* 52 */ + (char)0xEB, /* 53 */ + (char)0xE8, /* 54 */ + (char)0xED, /* 55 */ + (char)0xEE, /* 56 */ + (char)0xEF, /* 57 */ + (char)0xEC, /* 58 */ + (char)0xDF, /* 59 */ + (char)0x21, /* 5A */ + (char)0x24, /* 5B */ + (char)0x2A, /* 5C */ + (char)0x29, /* 5D */ + (char)0x3B, /* 5E */ + (char)0x5E, /* 5F */ + (char)0x2D, /* 60 */ + (char)0x2F, /* 61 */ + (char)0xC2, /* 62 */ + (char)0xC4, /* 63 */ + (char)0xC0, /* 64 */ + (char)0xC1, /* 65 */ + (char)0xC3, /* 66 */ + (char)0xC5, /* 67 */ + (char)0xC7, /* 68 */ + (char)0xD1, /* 69 */ + (char)0xA6, /* 6A */ + (char)0x2C, /* 6B */ + (char)0x25, /* 6C */ + (char)0x5F, /* 6D */ + (char)0x3E, /* 6E */ + (char)0x3F, /* 6F */ + (char)0xF8, /* 70 */ + (char)0xC9, /* 71 */ + (char)0xCA, /* 72 */ + (char)0xCB, /* 73 */ + (char)0xC8, /* 74 */ + (char)0xCD, /* 75 */ + (char)0xCE, /* 76 */ + (char)0xCF, /* 77 */ + (char)0xCC, /* 78 */ + (char)0x60, /* 79 */ + (char)0x3A, /* 7A */ + (char)0x23, /* 7B */ + (char)0x40, /* 7C */ + (char)0x27, /* 7D */ + (char)0x3D, /* 7E */ + (char)0x22, /* 7F */ + (char)0xD8, /* 80 */ + (char)0x61, /* 81 */ + (char)0x62, /* 82 */ + (char)0x63, /* 83 */ + (char)0x64, /* 84 */ + (char)0x65, /* 85 */ + (char)0x66, /* 86 */ + (char)0x67, /* 87 */ + (char)0x68, /* 88 */ + (char)0x69, /* 89 */ + (char)0xAB, /* 8A */ + (char)0xBB, /* 8B */ + (char)0xF0, /* 8C */ + (char)0xFD, /* 8D */ + (char)0xFE, /* 8E */ + (char)0xB1, /* 8F */ + (char)0xB0, /* 90 */ + (char)0x6A, /* 91 */ + (char)0x6B, /* 92 */ + (char)0x6C, /* 93 */ + (char)0x6D, /* 94 */ + (char)0x6E, /* 95 */ + (char)0x6F, /* 96 */ + (char)0x70, /* 97 */ + (char)0x71, /* 98 */ + (char)0x72, /* 99 */ + (char)0xAA, /* 9A */ + (char)0xBA, /* 9B */ + (char)0xE6, /* 9C */ + (char)0xB8, /* 9D */ + (char)0xC6, /* 9E */ + (char)0xA4, /* 9F */ + (char)0xB5, /* A0 */ + (char)0x7E, /* A1 */ + (char)0x73, /* A2 */ + (char)0x74, /* A3 */ + (char)0x75, /* A4 */ + (char)0x76, /* A5 */ + (char)0x77, /* A6 */ + (char)0x78, /* A7 */ + (char)0x79, /* A8 */ + (char)0x7A, /* A9 */ + (char)0xA1, /* AA */ + (char)0xBF, /* AB */ + (char)0xD0, /* AC */ + (char)0x5B, /* AD */ + (char)0xDE, /* AE */ + (char)0xAE, /* AF */ + (char)0xAC, /* B0 */ + (char)0xA3, /* B1 */ + (char)0xA5, /* B2 */ + (char)0xB7, /* B3 */ + (char)0xA9, /* B4 */ + (char)0xA7, /* B5 */ + (char)0xB6, /* B6 */ + (char)0xBC, /* B7 */ + (char)0xBD, /* B8 */ + (char)0xBE, /* B9 */ + (char)0xDD, /* BA */ + (char)0xA8, /* BB */ + (char)0xAF, /* BC */ + (char)0x5D, /* BD */ + (char)0xB4, /* BE */ + (char)0xD7, /* BF */ + (char)0x7B, /* C0 */ + (char)0x41, /* C1 */ + (char)0x42, /* C2 */ + (char)0x43, /* C3 */ + (char)0x44, /* C4 */ + (char)0x45, /* C5 */ + (char)0x46, /* C6 */ + (char)0x47, /* C7 */ + (char)0x48, /* C8 */ + (char)0x49, /* C9 */ + (char)0xAD, /* CA */ + (char)0xF4, /* CB */ + (char)0xF6, /* CC */ + (char)0xF2, /* CD */ + (char)0xF3, /* CE */ + (char)0xF5, /* CF */ + (char)0x7D, /* D0 */ + (char)0x4A, /* D1 */ + (char)0x4B, /* D2 */ + (char)0x4C, /* D3 */ + (char)0x4D, /* D4 */ + (char)0x4E, /* D5 */ + (char)0x4F, /* D6 */ + (char)0x50, /* D7 */ + (char)0x51, /* D8 */ + (char)0x52, /* D9 */ + (char)0xB9, /* DA */ + (char)0xFB, /* DB */ + (char)0xFC, /* DC */ + (char)0xF9, /* DD */ + (char)0xFA, /* DE */ + (char)0xFF, /* DF */ + (char)0x5C, /* E0 */ + (char)0xF7, /* E1 */ + (char)0x53, /* E2 */ + (char)0x54, /* E3 */ + (char)0x55, /* E4 */ + (char)0x56, /* E5 */ + (char)0x57, /* E6 */ + (char)0x58, /* E7 */ + (char)0x59, /* E8 */ + (char)0x5A, /* E9 */ + (char)0xB2, /* EA */ + (char)0xD4, /* EB */ + (char)0xD6, /* EC */ + (char)0xD2, /* ED */ + (char)0xD3, /* EE */ + (char)0xD5, /* EF */ + (char)0x30, /* F0 */ + (char)0x31, /* F1 */ + (char)0x32, /* F2 */ + (char)0x33, /* F3 */ + (char)0x34, /* F4 */ + (char)0x35, /* F5 */ + (char)0x36, /* F6 */ + (char)0x37, /* F7 */ + (char)0x38, /* F8 */ + (char)0x39, /* F9 */ + (char)0xB3, /* FA */ + (char)0xDB, /* FB */ + (char)0xDC, /* FC */ + (char)0xD9, /* FD */ + (char)0xDA, /* FE */ + (char)0x9F, /* FF */ +}; + +static const bool oldIllegal[256] = { + false, /* U+0000 */ + false, /* U+0001 */ + false, /* U+0002 */ + false, /* U+0003 */ + false, /* U+0004 */ + false, /* U+0005 */ + false, /* U+0006 */ + false, /* U+0007 */ + false, /* U+0008 */ + false, /* U+0009 */ + false, /* U+000A */ + false, /* U+000B */ + false, /* U+000C */ + false, /* U+000D */ + false, /* U+000E */ + false, /* U+000F */ + false, /* U+0010 */ + false, /* U+0011 */ + false, /* U+0012 */ + false, /* U+0013 */ + false, /* U+0014 */ + false, /* U+0015 */ + false, /* U+0016 */ + false, /* U+0017 */ + false, /* U+0018 */ + false, /* U+0019 */ + false, /* U+001A */ + false, /* U+001B */ + false, /* U+001C */ + false, /* U+001D */ + false, /* U+001E */ + false, /* U+001F */ + true, /* U+0020 */ + true, /* U+0021 */ + true, /* U+0022 */ + true, /* U+0023 */ + false, /* U+0024 */ + true, /* U+0025 */ + true, /* U+0026 */ + true, /* U+0027 */ + true, /* U+0028 */ + true, /* U+0029 */ + true, /* U+002A */ + true, /* U+002B */ + true, /* U+002C */ + true, /* U+002D */ + true, /* U+002E */ + true, /* U+002F */ + true, /* U+0030 */ + true, /* U+0031 */ + true, /* U+0032 */ + true, /* U+0033 */ + true, /* U+0034 */ + true, /* U+0035 */ + true, /* U+0036 */ + true, /* U+0037 */ + true, /* U+0038 */ + true, /* U+0039 */ + true, /* U+003A */ + true, /* U+003B */ + true, /* U+003C */ + true, /* U+003D */ + true, /* U+003E */ + true, /* U+003F */ + false, /* U+0040 */ + true, /* U+0041 */ + true, /* U+0042 */ + true, /* U+0043 */ + true, /* U+0044 */ + true, /* U+0045 */ + true, /* U+0046 */ + true, /* U+0047 */ + true, /* U+0048 */ + true, /* U+0049 */ + true, /* U+004A */ + true, /* U+004B */ + true, /* U+004C */ + true, /* U+004D */ + true, /* U+004E */ + true, /* U+004F */ + true, /* U+0050 */ + true, /* U+0051 */ + true, /* U+0052 */ + true, /* U+0053 */ + true, /* U+0054 */ + true, /* U+0055 */ + true, /* U+0056 */ + true, /* U+0057 */ + true, /* U+0058 */ + true, /* U+0059 */ + true, /* U+005A */ + true, /* U+005B */ + false, /* U+005C */ + true, /* U+005D */ + true, /* U+005E */ + true, /* U+005F */ + false, /* U+0060 */ + true, /* U+0061 */ + true, /* U+0062 */ + true, /* U+0063 */ + true, /* U+0064 */ + true, /* U+0065 */ + true, /* U+0066 */ + true, /* U+0067 */ + true, /* U+0068 */ + true, /* U+0069 */ + true, /* U+006A */ + true, /* U+006B */ + true, /* U+006C */ + true, /* U+006D */ + true, /* U+006E */ + true, /* U+006F */ + true, /* U+0070 */ + true, /* U+0071 */ + true, /* U+0072 */ + true, /* U+0073 */ + true, /* U+0074 */ + true, /* U+0075 */ + true, /* U+0076 */ + true, /* U+0077 */ + true, /* U+0078 */ + true, /* U+0079 */ + true, /* U+007A */ + true, /* U+007B */ + true, /* U+007C */ + true, /* U+007D */ + true, /* U+007E */ + false, /* U+007F */ + false, /* U+0080 */ + false, /* U+0081 */ + false, /* U+0082 */ + false, /* U+0083 */ + false, /* U+0084 */ + false, /* U+0085 */ + false, /* U+0086 */ + false, /* U+0087 */ + false, /* U+0088 */ + false, /* U+0089 */ + false, /* U+008A */ + false, /* U+008B */ + false, /* U+008C */ + false, /* U+008D */ + false, /* U+008E */ + false, /* U+008F */ + false, /* U+0090 */ + false, /* U+0091 */ + false, /* U+0092 */ + false, /* U+0093 */ + false, /* U+0094 */ + false, /* U+0095 */ + false, /* U+0096 */ + false, /* U+0097 */ + false, /* U+0098 */ + false, /* U+0099 */ + false, /* U+009A */ + false, /* U+009B */ + false, /* U+009C */ + false, /* U+009D */ + false, /* U+009E */ + false, /* U+009F */ + false, /* U+00A0 */ + false, /* U+00A1 */ + false, /* U+00A2 */ + false, /* U+00A3 */ + false, /* U+00A4 */ + false, /* U+00A5 */ + false, /* U+00A6 */ + false, /* U+00A7 */ + false, /* U+00A8 */ + false, /* U+00A9 */ + false, /* U+00AA */ + false, /* U+00AB */ + false, /* U+00AC */ + false, /* U+00AD */ + false, /* U+00AE */ + false, /* U+00AF */ + false, /* U+00B0 */ + false, /* U+00B1 */ + false, /* U+00B2 */ + false, /* U+00B3 */ + false, /* U+00B4 */ + false, /* U+00B5 */ + false, /* U+00B6 */ + false, /* U+00B7 */ + false, /* U+00B8 */ + false, /* U+00B9 */ + false, /* U+00BA */ + false, /* U+00BB */ + false, /* U+00BC */ + false, /* U+00BD */ + false, /* U+00BE */ + false, /* U+00BF */ + false, /* U+00C0 */ + false, /* U+00C1 */ + false, /* U+00C2 */ + false, /* U+00C3 */ + false, /* U+00C4 */ + false, /* U+00C5 */ + false, /* U+00C6 */ + false, /* U+00C7 */ + false, /* U+00C8 */ + false, /* U+00C9 */ + false, /* U+00CA */ + false, /* U+00CB */ + false, /* U+00CC */ + false, /* U+00CD */ + false, /* U+00CE */ + false, /* U+00CF */ + false, /* U+00D0 */ + false, /* U+00D1 */ + false, /* U+00D2 */ + false, /* U+00D3 */ + false, /* U+00D4 */ + false, /* U+00D5 */ + false, /* U+00D6 */ + false, /* U+00D7 */ + false, /* U+00D8 */ + false, /* U+00D9 */ + false, /* U+00DA */ + false, /* U+00DB */ + false, /* U+00DC */ + false, /* U+00DD */ + false, /* U+00DE */ + false, /* U+00DF */ + false, /* U+00E0 */ + false, /* U+00E1 */ + false, /* U+00E2 */ + false, /* U+00E3 */ + false, /* U+00E4 */ + false, /* U+00E5 */ + false, /* U+00E6 */ + false, /* U+00E7 */ + false, /* U+00E8 */ + false, /* U+00E9 */ + false, /* U+00EA */ + false, /* U+00EB */ + false, /* U+00EC */ + false, /* U+00ED */ + false, /* U+00EE */ + false, /* U+00EF */ + false, /* U+00F0 */ + false, /* U+00F1 */ + false, /* U+00F2 */ + false, /* U+00F3 */ + false, /* U+00F4 */ + false, /* U+00F5 */ + false, /* U+00F6 */ + false, /* U+00F7 */ + false, /* U+00F8 */ + false, /* U+00F9 */ + false, /* U+00FA */ + false, /* U+00FB */ + false, /* U+00FC */ + false, /* U+00FD */ + false, /* U+00FE */ + false, /* U+00FF */ +}; + diff --git a/icu4c/source/tools/escapesrc/escapesrc.cpp b/icu4c/source/tools/escapesrc/escapesrc.cpp index e28c5808cf9..4bca5025bd1 100644 --- a/icu4c/source/tools/escapesrc/escapesrc.cpp +++ b/icu4c/source/tools/escapesrc/escapesrc.cpp @@ -27,6 +27,10 @@ static const char kQUOT = 0x27, kDBLQ = 0x22; +# include "cptbl.h" + +# define cp1047_to_8859(c) cp1047_8859_1[c] + std::string prog; void usage() { @@ -150,7 +154,7 @@ bool appendUtf8(std::string &outstr, bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) { size_t pos = origpos + 3; std::string outstr; - outstr += (kDBLQ); + outstr += '\"'; // local encoding for(;pos0; pos++,trail--) { + linestr[pos2] = cp1047_to_8859(linestr[pos2]); + } +#endif + // Proceed to decode utf-8 const uint8_t *s = (const uint8_t*) (linestr.c_str()); int32_t i = pos; int32_t length = linestr.size(); UChar32 c; + if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) { +#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) + linestr[pos] = old_byte; // put it back +#endif + continue; // single code point not previously legal for \u escaping + } - if(U8_IS_SINGLE((uint8_t)s[i])) continue; // single code point - + // otherwise, convert it to \u / \U { U8_NEXT(s, i, length, c); } if(c<0) { fprintf(stderr, "Illegal utf-8 sequence\n"); + fprintf(stderr, "Line: >>%s<<\n", linestr.c_str()); return true; } diff --git a/icu4c/source/tools/escapesrc/tblgen.cpp b/icu4c/source/tools/escapesrc/tblgen.cpp new file mode 100644 index 00000000000..e94fa9036bc --- /dev/null +++ b/icu4c/source/tools/escapesrc/tblgen.cpp @@ -0,0 +1,80 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include "unicode/utypes.h" +#include "unicode/ucnv.h" +#include "unicode/uniset.h" +#include + +static const char *kConverter = "ibm-1047"; + +int main(int argc, const char *argv[]) { + printf("// %s\n", U_COPYRIGHT_STRING); + printf("// generated by tblgen. You weren't going to edit it by hand, were you?\n"); + printf("\n"); + + UErrorCode status = U_ZERO_ERROR; + LocalUConverterPointer cnv(ucnv_open(kConverter, &status)); + + if(U_FAILURE(status)) { + fprintf(stderr, "Failed to open %s: %s\n", kConverter, u_errorName(status)); + return 1; + } + + printf("static const char cp1047_8859_1[256] = { \n"); + for(int i=0x00; i<0x100; i++) { + char cp1047[1]; + cp1047[0] = i; + UChar u[1]; + UChar *target = u; + const char *source = cp1047; + ucnv_toUnicode(cnv.getAlias(), &target, u+1, &source, cp1047+1, nullptr, true, &status); + if(U_FAILURE(status)) { + fprintf(stderr, "Conversion failure at #%X: %s\n", i, u_errorName(status)); + return 2; + } + printf(" (char)0x%02X, /* %02X */\n", u[0], i); + } + printf("};\n\n"); + + // + // UnicodeSet oldIllegal("[:print:]", status); // [a-zA-Z0-9_}{#)(><%:;.?*+-/^&|~!=,\\u005b\\u005d\\u005c]", status); + UnicodeSet oldIllegal("[0-9 a-z A-Z " + "_ \\{ \\} \\[ \\] # \\( \\) < > % \\: ; . " + "? * + \\- / \\^ \\& | ~ ! = , \\ \" ' ]", status); + + /* + +http://www.lirmm.fr/~ducour/Doc-objets/ISO+IEC+14882-1998.pdf ( note: 1998 ) page 10, section 2.2 says: + +1 The basic source character set consists of 96 characters: the space character, the control characters repre- 15) +senting horizontal tab, vertical tab, form feed, and new-line, plus the following 91 graphical characters: +a b c d e f g h i j k l m n opqrstuvwxyz +A B C D E F G H I J K L M N OPQRSTUVWXYZ +0 12 3 4 5 6 7 8 9 + _ { } [ ] # ( ) < > % : ; . ?*+-/^&|~!=,\" +2 The universal-character-name construct provides a way to name other characters. hex-quad: +hexadecimal-digit hexadecimal-digit hexadecimal-digit hexadecimal-digit +universal-character-name: \u hex-quad +\U hex-quad hex-quad +The character designated by the universal-character-name \UNNNNNNNN is that character whose character short name in ISO/IEC 10646 is NNNNNNNN; the character designated by the universal-character-name \uNNNN is that character whose character short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value for a universal character name is less than 0x20 or in the range 0x7F-0x9F (inclusive), or if the uni- versal character name designates a character in the basic source character set, then the program is ill- formed. + + +So basically: printable ASCII plus 0x00-0x1F, 0x7F-0x9F, was all illegal. + +Some discussion at http://unicode.org/mail-arch/unicode-ml/y2003-m10/0471.html + + */ + + + + printf("static const bool oldIllegal[256] = { \n"); + for(UChar i=0x00; i<0x100;i++) { + printf(" %s, /* U+%04X */\n", + (oldIllegal.contains(i))?" true":"false", + i); + } + printf("};\n\n"); + + return 0; +} From ada968b02db129d877e3caf0637cda600d2f4e68 Mon Sep 17 00:00:00 2001 From: Jeff Genovy <29107334+jefgen@users.noreply.github.com> Date: Wed, 22 Mar 2017 05:21:19 +0000 Subject: [PATCH 3/5] ICU-13027 - Use int32_t instead of int for a few places for consistency. X-SVN-Rev: 39893 --- icu4c/source/common/locmap.cpp | 4 ++-- icu4c/source/common/putil.cpp | 2 +- icu4c/source/common/umapfile.cpp | 2 +- icu4c/source/i18n/windtfmt.cpp | 2 +- icu4c/source/i18n/winnmfmt.cpp | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/icu4c/source/common/locmap.cpp b/icu4c/source/common/locmap.cpp index 3ca00757628..eda04e57df0 100644 --- a/icu4c/source/common/locmap.cpp +++ b/icu4c/source/common/locmap.cpp @@ -1116,7 +1116,7 @@ uprv_convertToPosix(uint32_t hostid, char *posixID, int32_t posixIDCapacity, UEr // Note: LOCALE_ALLOW_NEUTRAL_NAMES was enabled in Windows7+, prior versions did not handle neutral (no-region) locale names. tmpLen = LCIDToLocaleName(hostid, (PWSTR)windowsLocaleName, UPRV_LENGTHOF(windowsLocaleName), LOCALE_ALLOW_NEUTRAL_NAMES); if (tmpLen > 1) { - int i = 0; + int32_t i = 0; // Only need to look up in table if have _, eg for de-de_phoneb type alternate sort. bLookup = FALSE; for (i = 0; i < UPRV_LENGTHOF(locName); i++) @@ -1253,7 +1253,7 @@ uprv_convertToLCIDPlatform(const char* localeID) { // Need it to be UTF-16, not 8-bit wchar_t bcp47Tag[LOCALE_NAME_MAX_LENGTH] = {}; - int i; + int32_t i; for (i = 0; i < UPRV_LENGTHOF(bcp47Tag); i++) { if (asciiBCP47Tag[i] == '\0') diff --git a/icu4c/source/common/putil.cpp b/icu4c/source/common/putil.cpp index 70618751ab9..011e6e178f8 100644 --- a/icu4c/source/common/putil.cpp +++ b/icu4c/source/common/putil.cpp @@ -1781,7 +1781,7 @@ The leftmost codepage (.xxx) wins. // First we need to go from UTF-16 to char (and also convert from _ to - while we're at it.) char modifiedWindowsLocale[LOCALE_NAME_MAX_LENGTH]; - int i; + int32_t i; for (i = 0; i < UPRV_LENGTHOF(modifiedWindowsLocale); i++) { if (windowsLocale[i] == '_') diff --git a/icu4c/source/common/umapfile.cpp b/icu4c/source/common/umapfile.cpp index bfa9b1b7190..df588a4f950 100644 --- a/icu4c/source/common/umapfile.cpp +++ b/icu4c/source/common/umapfile.cpp @@ -117,7 +117,7 @@ // First we need to go from char to UTF-16 // u_UCharsToChars could work but it requires length. WCHAR utf16Path[MAX_PATH]; - int i; + int32_t i; for (i = 0; i < UPRV_LENGTHOF(utf16Path); i++) { utf16Path[i] = path[i]; diff --git a/icu4c/source/i18n/windtfmt.cpp b/icu4c/source/i18n/windtfmt.cpp index f9b575f8ab0..d0599ab5147 100644 --- a/icu4c/source/i18n/windtfmt.cpp +++ b/icu4c/source/i18n/windtfmt.cpp @@ -109,7 +109,7 @@ static UErrorCode GetEquivalentWindowsLocaleName(const Locale& locale, UnicodeSt // Need it to be UTF-16, not 8-bit // TODO: This seems like a good thing for a helper wchar_t bcp47Tag[LOCALE_NAME_MAX_LENGTH] = {}; - int i; + int32_t i; for (i = 0; i < UPRV_LENGTHOF(bcp47Tag); i++) { if (asciiBCP47Tag[i] == '\0') diff --git a/icu4c/source/i18n/winnmfmt.cpp b/icu4c/source/i18n/winnmfmt.cpp index 3f8271efaf0..3e03cb1feef 100644 --- a/icu4c/source/i18n/winnmfmt.cpp +++ b/icu4c/source/i18n/winnmfmt.cpp @@ -154,7 +154,7 @@ static UErrorCode GetEquivalentWindowsLocaleName(const Locale& locale, UnicodeSt // Need it to be UTF-16, not 8-bit // TODO: This seems like a good thing for a helper wchar_t bcp47Tag[LOCALE_NAME_MAX_LENGTH] = {}; - int i; + int32_t i; for (i = 0; i < UPRV_LENGTHOF(bcp47Tag); i++) { if (asciiBCP47Tag[i] == '\0') From af5c90b64939b9a50ad20fd89677a7810919478f Mon Sep 17 00:00:00 2001 From: Jeff Genovy <29107334+jefgen@users.noreply.github.com> Date: Wed, 22 Mar 2017 17:36:53 +0000 Subject: [PATCH 4/5] ICU-13054 - ICU readme.html has typo for instructions on building from the command-line. X-SVN-Rev: 39895 --- icu4c/readme.html | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/icu4c/readme.html b/icu4c/readme.html index f7762bb8ef1..3bf795238ea 100644 --- a/icu4c/readme.html +++ b/icu4c/readme.html @@ -943,9 +943,11 @@ "HowToBuildWindowsCommandLine">Using MSDEV At The Command Line Note: You can build ICU from the command line. Assuming that you have properly installed Microsoft Visual C++ to support command line - execution, you can run the following command, 'devenv.com - <ICU>\source\allinone\allinone.sln /build "Win32|Release"'. You can also - use Cygwin with this compiler to build ICU, and you can refer to the 'devenv.com <ICU>\source\allinone\allinone.sln /build "Release|Win32"'. + Or to build the 64-bit Release version from the command line: + 'devenv.com <ICU>\source\allinone\allinone.sln /build "Release|x64"'. +
You can also use Cygwin with this compiler to build ICU, and you can refer to the
How To Build And Install On Windows with Cygwin section for more details.

From f18a0b68570b08188b55a57f919277467b1a454f Mon Sep 17 00:00:00 2001 From: Jeff Genovy <29107334+jefgen@users.noreply.github.com> Date: Wed, 22 Mar 2017 20:05:29 +0000 Subject: [PATCH 5/5] ICU-13017 - Typos in uloc.h comments X-SVN-Rev: 39898 --- icu4c/source/common/unicode/uloc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/icu4c/source/common/unicode/uloc.h b/icu4c/source/common/unicode/uloc.h index 6f94920d373..23a54a9981c 100644 --- a/icu4c/source/common/unicode/uloc.h +++ b/icu4c/source/common/unicode/uloc.h @@ -61,7 +61,7 @@ * http://www.ics.uci.edu/pub/ietf/http/related/iso639.txt * *

- * The second option includes an additonal ISO Country + * The second option includes an additional ISO Country * Code. These codes are the upper-case two-letter codes * as defined by ISO-3166. * You can find a full list of these codes at a number of sites, such as: @@ -69,7 +69,7 @@ * http://www.chemie.fu-berlin.de/diverse/doc/ISO_3166.html * *

- * The third option requires another additonal information--the + * The third option requires another additional information--the * Variant. * The Variant codes are vendor and browser-specific. * For example, use WIN for Windows, MAC for Macintosh, and POSIX for POSIX. @@ -157,7 +157,7 @@ * just a mechanism for identifying these services. * *

- * Each international serivce that performs locale-sensitive operations + * Each international service that performs locale-sensitive operations * allows you * to get all the available objects of that type. You can sift * through these objects by language, country, or variant, @@ -580,7 +580,7 @@ uloc_getDisplayLanguage(const char* locale, * if the locale's language code is "en", passing Locale::getFrench() for * inLocale would result in "", while passing Locale::getGerman() * for inLocale would result in "". NULL may be used to specify the default. - * @param script the displayable country code for localeID + * @param script the displayable script for the localeID * @param scriptCapacity the size of the script buffer to store the * displayable script code with * @param status error information if retrieving the displayable script code failed