From 9a4ae3b44073a859e9d9fc0f5b59b1114cc3446e Mon Sep 17 00:00:00 2001
From: Michael Ow
Date: Mon, 18 Aug 2014 21:26:34 +0000
Subject: [PATCH] ICU-10551 Make ICU converter callbacks ignore default ignorable code points

X-SVN-Rev: 36199
---
 icu4c/source/common/ucnv_err.c          | 105 +++++++++++++++++++++++-
 icu4c/source/test/intltest/convtest.cpp |  46 ++++++++++-
 icu4c/source/test/intltest/convtest.h   |   3 +-
 3 files changed, 147 insertions(+), 7 deletions(-)

diff --git a/icu4c/source/common/ucnv_err.c b/icu4c/source/common/ucnv_err.c
index 75659fb5716..42f31daa0e4 100644
--- a/icu4c/source/common/ucnv_err.c
+++ b/icu4c/source/common/ucnv_err.c
@@ -1,7 +1,7 @@
 /*
 *****************************************************************************
 *
- * Copyright (C) 1998-2007, International Business Machines
+ * Copyright (C) 1998-2014, International Business Machines
 * Corporation and others. All Rights Reserved.
 *
 *****************************************************************************
@@ -50,6 +50,76 @@
 #define UCNV_PRV_ESCAPE_CSS2 'S'
 #define UCNV_PRV_STOP_ON_ILLEGAL 'i'
 
+/*
+ * IS_DEFAULT_IGNORABLE_CODE_POINT
+ * This is to check if a code point has the default ignorable unicode property.
+ * As such, this list needs to be updated if the ignorable code point list ever
+ * changes. To avoid dependency on other code, this list is hard coded here.
+ * When an ignorable code point is found and is unmappable, the default callbacks
+ * will ignore it.
+ * The argument c is evaluated several times, so it must not have side effects.
+ * (c == 0x00AD) || \ (Latin-1 Punctuation and Symbols)
+ * (c == 0x034F) || \ (Combining Diacritical Marks Grapheme Joiner)
+ * (c == 0x061C) || \ (Arabic Format Character)
+ * (c == 0x115F) || \ (Hangul Jamo Old Initial Consonants)
+ * (c == 0x1160) || \ (Hangul Jamo Medial Vowels)
+ * (0x17B4 <= c && c <= 0x17B5) || \ (Khmer Inherent Vowels)
+ * (0x180B <= c && c <= 0x180E) || \ (Mongolian Format Controls)
+ * (0x200B <= c && c <= 0x200F) || \ (General Punctuation Format Characters)
+ * (0x202A <= c && c <= 0x202E) || \ (General Punctuation Format Characters)
+ * (c == 0x2060) || \ (General Punctuation Format Characters)
+ * (0x2066 <= c && c <= 0x2069) || \ (General Punctuation Format Characters)
+ * (0x2061 <= c && c <= 0x2064) || \ (General Punctuation Invisible Operators)
+ * (0x206A <= c && c <= 0x206F) || \ (General Punctuation Deprecated)
+ * (c == 0x3164) || \ (Hangul Compatibility Jamo)
+ * (0x0FE00 <= c && c <= 0x0FE0F) || \ (Variation Selectors)
+ * (c == 0x0FEFF) || \ (Arabic Presentation Forms B)
+ * (c == 0x0FFA0) || \ (Halfwidth and Fullwidth Forms)
+ * (0x01BCA0 <= c && c <= 0x01BCA3) || \ (Shorthand Format Controls)
+ * (0x01D173 <= c && c <= 0x01D17A) || \ (Musical Symbols)
+ * (c == 0x0E0001) || \ (Tag Identifiers)
+ * (0x0E0020 <= c && c <= 0x0E007F) || \ (Tag Components)
+ * (0x0E0100 <= c && c <= 0x0E01EF) || \ (Variation Selectors Supplement)
+ * (c == 0x2065) || \ (Unassigned)
+ * (0x0FFF0 <= c && c <= 0x0FFF8) || \ (Unassigned)
+ * (c == 0x0E0000) || \ (Unassigned)
+ * (0x0E0002 <= c && c <= 0x0E001F) || \ (Unassigned)
+ * (0x0E0080 <= c && c <= 0x0E00FF) || \ (Unassigned)
+ * (0x0E01F0 <= c && c <= 0x0E0FFF) \ (Unassigned)
+ */
+
+#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\
+    ((c) == 0x00AD) || \
+    ((c) == 0x034F) || \
+    ((c) == 0x061C) || \
+    ((c) == 0x115F) || \
+    ((c) == 0x1160) || \
+    (0x17B4 <= (c) && (c) <= 0x17B5) || \
+    (0x180B <= (c) && (c) <= 0x180E) || \
+    (0x200B <= (c) && (c) <= 0x200F) || \
+    (0x202A <= (c) && (c) <= 0x202E) || \
+    ((c) == 0x2060) || \
+    (0x2066 <= (c) && (c) <= 0x2069) || \
+    (0x2061 <= (c) && (c) <= 0x2064) || \
+    (0x206A <= (c) && (c) <= 0x206F) || \
+    ((c) == 0x3164) || \
+    (0x0FE00 <= (c) && (c) <= 0x0FE0F) || \
+    ((c) == 0x0FEFF) || \
+    ((c) == 0x0FFA0) || \
+    (0x01BCA0 <= (c) && (c) <= 0x01BCA3) || \
+    (0x01D173 <= (c) && (c) <= 0x01D17A) || \
+    ((c) == 0x0E0001) || \
+    (0x0E0020 <= (c) && (c) <= 0x0E007F) || \
+    (0x0E0100 <= (c) && (c) <= 0x0E01EF) || \
+    ((c) == 0x2065) || \
+    (0x0FFF0 <= (c) && (c) <= 0x0FFF8) || \
+    ((c) == 0x0E0000) || \
+    (0x0E0002 <= (c) && (c) <= 0x0E001F) || \
+    (0x0E0080 <= (c) && (c) <= 0x0E00FF) || \
+    (0x0E01F0 <= (c) && (c) <= 0x0E0FFF) \
+    )
+
+
 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
 U_CAPI void U_EXPORT2
 UCNV_FROM_U_CALLBACK_STOP (
@@ -61,6 +131,13 @@ UCNV_FROM_U_CALLBACK_STOP (
                   UConverterCallbackReason reason,
                   UErrorCode * err)
 {
+    if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
+    {
+        /*
+         * Skip if the codepoint has unicode property of default ignorable.
+         */
+        *err = U_ZERO_ERROR;
+    }
     /* the caller must have set the error code accordingly */
     return;
 }
@@ -92,7 +169,14 @@ UCNV_FROM_U_CALLBACK_SKIP (
 {
     if (reason <= UCNV_IRREGULAR)
     {
-        if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
+        if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
+        {
+            /*
+             * Skip if the codepoint has unicode property of default ignorable.
+             */
+            *err = U_ZERO_ERROR;
+        }
+        else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
         {
             *err = U_ZERO_ERROR;
         }
@@ -113,7 +197,14 @@ UCNV_FROM_U_CALLBACK_SUBSTITUTE (
 {
     if (reason <= UCNV_IRREGULAR)
     {
-        if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
+        if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
+        {
+            /*
+             * Skip if the codepoint has unicode property of default ignorable.
+             */
+            *err = U_ZERO_ERROR;
+        }
+        else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
         {
             *err = U_ZERO_ERROR;
             ucnv_cbFromUWriteSub(fromArgs, 0, err);
@@ -155,6 +246,14 @@ UCNV_FROM_U_CALLBACK_ESCAPE (
   {
       return;
   }
+  else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
+  {
+      /*
+       * Skip if the codepoint has unicode property of default ignorable.
+       */
+      *err = U_ZERO_ERROR;
+      return;
+  }
 
   ucnv_setFromUCallBack (fromArgs->converter,
                      (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
diff --git a/icu4c/source/test/intltest/convtest.cpp b/icu4c/source/test/intltest/convtest.cpp
index c16ab81ceee..481c38a5343 100644
--- a/icu4c/source/test/intltest/convtest.cpp
+++ b/icu4c/source/test/intltest/convtest.cpp
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-* Copyright (C) 2003-2013, International Business Machines
+* Copyright (C) 2003-2014, International Business Machines
 * Corporation and others. All Rights Reserved.
 *
 *******************************************************************************
@@ -71,12 +71,14 @@ ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
         case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
         case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
         case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
+        case 3: name="TestDefaultIgnorableCallback"; if (exec) TestDefaultIgnorableCallback(); break;
 #else
         case 0:
         case 1:
-        case 2: name="skip"; break;
+        case 2:
+        case 3: name="skip"; break;
 #endif
-        case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
+        case 4: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
         default: name=""; break; //needed to end loop
     }
 }
@@ -648,6 +650,44 @@ ConversionTest::TestGetUnicodeSet2() {
     delete [] s0;
 }
 
+// Test that all code points which have the default ignorable Unicode property are ignored if they have no mapping
+// If there are any failures, the hard coded list (IS_DEFAULT_IGNORABLE_CODE_POINT) in ucnv_err.c should be updated
+void
+ConversionTest::TestDefaultIgnorableCallback() {
+    UErrorCode status = U_ZERO_ERROR;
+    const char *name = "euc-jp-2007";
+    const char *pattern = "[:Default_Ignorable_Code_Point:]";
+    UnicodeSet set(pattern, status);  // stack object: nothing to free on the early returns below
+    if (U_FAILURE(status)) {
+        dataerrln("Unable to create Unicodeset: %s - %s\n", pattern, u_errorName(status));
+        return;
+    }
+    UConverter *cnv = cnv_open(name, status);
+    if (U_FAILURE(status)) {
+        errln("Unable to open converter: %s - %s\n", name, u_errorName(status));
+        return;
+    }
+    // Use the STOP callback so that any code point which is not ignored
+    // shows up as a failure status from ucnv_fromUChars() below.
+    ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
+
+    UChar32 input[1];
+    char output[10];
+    int size = set.size();
+    for (int i = 0; i < size; i++) {
+        status = U_ZERO_ERROR;
+
+        input[0] = set.charAt(i);
+
+        ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
+        if (U_FAILURE(status)) {
+            errln("Callback did not ignore code point: 0x%06X on failed conversion - %s", input[0], u_errorName(status));
+        }
+    }
+    ucnv_close(cnv);
+}
+
+
 
 // open testdata or ICU data converter ------------------------------------- ***
 UConverter *
diff --git a/icu4c/source/test/intltest/convtest.h b/icu4c/source/test/intltest/convtest.h
index b4f07b5474b..3b5932933ac 100644
--- a/icu4c/source/test/intltest/convtest.h
+++ b/icu4c/source/test/intltest/convtest.h
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
- * Copyright (C) 2003-2007, International Business Machines
+ * Copyright (C) 2003-2014, International Business Machines
 * Corporation and others. All Rights Reserved.
 *
 *******************************************************************************
@@ -73,6 +73,7 @@ public:
     void TestFromUnicode();
     void TestGetUnicodeSet();
     void TestGetUnicodeSet2();
+    void TestDefaultIgnorableCallback();
 
 private:
     UBool