diff --git a/icu4c/data/unidata/NormalizationTest.txt b/icu4c/data/unidata/NormalizationTest.txt index 08313d3e4fb..28f05c3accd 100644 --- a/icu4c/data/unidata/NormalizationTest.txt +++ b/icu4c/data/unidata/NormalizationTest.txt @@ -31,6 +31,13 @@ # X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X) # @Part0 # Specific cases + +# Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129 +0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; + +# Markus 2001aug30 +0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; + # 1E0A;1E0A;0044 0307;1E0A;0044 0307; 1E0C;1E0C;0044 0323;1E0C;0044 0323; diff --git a/icu4c/source/data/unidata/NormalizationTest.txt b/icu4c/source/data/unidata/NormalizationTest.txt index 08313d3e4fb..28f05c3accd 100644 --- a/icu4c/source/data/unidata/NormalizationTest.txt +++ b/icu4c/source/data/unidata/NormalizationTest.txt @@ -31,6 +31,13 @@ # X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X) # @Part0 # Specific cases + +# Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129 +0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; + +# Markus 2001aug30 +0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; + # 1E0A;1E0A;0044 0307;1E0A;0044 0307; 1E0C;1E0C;0044 0323;1E0C;0044 0323; diff --git a/icu4c/source/test/intltest/normconf.cpp b/icu4c/source/test/intltest/normconf.cpp index 548c2c3e78c..f274f9347a0 100644 --- a/icu4c/source/test/intltest/normconf.cpp +++ b/icu4c/source/test/intltest/normconf.cpp @@ -5,12 +5,14 @@ ************************************************************************ */ -#include "normconf.h" -#include "unicode/normlzr.h" +#include <stdio.h> +#include "unicode/utypes.h" #include "unicode/unicode.h" +#include "unicode/normlzr.h" +#include "unicode/uniset.h" #include "cstring.h" -#include "unicode/putil.h" #include "filestrm.h" +#include "normconf.h" #define ARRAY_LENGTH(array) 
(sizeof(array) / sizeof(array[0])) @@ -34,7 +36,7 @@ void NormalizerConformanceTest::runIndexedTest(int32_t index, UBool exec, const #define FIELD_COUNT 5 NormalizerConformanceTest::NormalizerConformanceTest() : - normalizer(UnicodeString("", ""), Normalizer::COMPOSE) {} + normalizer(UnicodeString(), UNORM_NFC) {} NormalizerConformanceTest::~NormalizerConformanceTest() {} @@ -78,6 +80,8 @@ void NormalizerConformanceTest::TestConformance(void) { } } + // UnicodeSet for all code points that are not mentioned in NormalizationTest.txt + UnicodeSet other(0, 0x10ffff); for (int32_t count = 1;;++count) { if (T_FileStream_eof(input)) { @@ -103,6 +107,12 @@ void NormalizerConformanceTest::TestConformance(void) { errln((UnicodeString)"Unable to parse line " + count); break; // Syntax error } + + // Remove a single code point from the "other" UnicodeSet + if(fields[0].length()==fields[0].moveIndex32(0, 1)) { + other.remove(fields[0].char32At(0)); + } + if (checkConformance(fields, UnicodeString(lineBuf, ""))) { ++passCount; } else { @@ -115,26 +125,43 @@ void NormalizerConformanceTest::TestConformance(void) { T_FileStream_close(input); + /* + * Test that all characters that are not mentioned + * as single code points in column 1 + * do not change under any normalization. 
+ */ + UChar32 c; + + // remove U+ffff because that is the end-of-iteration sentinel value + other.remove(0xffff); + + for(c=0; c<=0x10ffff; ++c) { + if(c==0x30000) { + c=0xe0000; + } + if(!other.contains(c)) { + continue; + } + + fields[0]=fields[1]=fields[2]=fields[3]=fields[4].setTo(c); + sprintf(lineBuf, "not mentioned code point U+%04lx", c); + + if (checkConformance(fields, UnicodeString(lineBuf, ""))) { + ++passCount; + } else { + ++failCount; + } + if ((count % 1000) == 0) { + logln((UnicodeString)"Line " + count); + } + } + if (failCount != 0) { errln((UnicodeString)"Total: " + failCount + " lines failed, " + passCount + " lines passed"); } else { logln((UnicodeString)"Total: " + passCount + " lines passed"); } - - /* - * ### TODO: test that all assigned characters that are not mentioned - * as single code points in column 1 - * do not change under any normalization. - * I.e., keep a list (UnicodeSet?) of all single code points in c1, - * then test that for all in (assigned-list) it is - * c1==NFC(c1)==NFD(c1)==NFKC(c1)==NFKD(c1)==FCD(c1) - * - * ### TODO: test FCD - * Idea: since FCD is not a normalization form with guaranteed results, - * test that quickCheck(NF*D(c1), isFCD)==TRUE and that quickCheck(FCD(NF*D(c1)), isNF*D)==TRUE. - * Also test special, controlled cases. - */ } /** @@ -156,40 +183,80 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field, const UnicodeString& line) { UBool pass = TRUE; UErrorCode status = U_ZERO_ERROR; - UnicodeString out; + UnicodeString out, fcd; int32_t fieldNum; for (int32_t i=0; i