From cb3d8ade6d6ea9e5e761cecd4cb2f43b931085ff Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Tue, 2 Mar 2010 22:59:05 +0000 Subject: [PATCH] ICU-7144 quick & dirty tool to recreate the UTS #46 data table according to the spec X-SVN-Rev: 27751 --- tools/unicode/c/genuts46/genuts46.cpp | 253 +++++++++++++++++++++++ tools/unicode/c/genuts46/genuts46.sln | 20 ++ tools/unicode/c/genuts46/genuts46.vcproj | 181 ++++++++++++++++ 3 files changed, 454 insertions(+) create mode 100644 tools/unicode/c/genuts46/genuts46.cpp create mode 100644 tools/unicode/c/genuts46/genuts46.sln create mode 100644 tools/unicode/c/genuts46/genuts46.vcproj diff --git a/tools/unicode/c/genuts46/genuts46.cpp b/tools/unicode/c/genuts46/genuts46.cpp new file mode 100644 index 00000000000..34c4a9b994f --- /dev/null +++ b/tools/unicode/c/genuts46/genuts46.cpp @@ -0,0 +1,253 @@ +/* +******************************************************************************* +* Copyright (C) 2010, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: genuts46.cpp +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2010mar02 +* created by: Markus W. Scherer +* +* quick & dirty tool to recreate the UTS #46 data table according to the spec +*/ + +#include +#include /* NOTE(review): the two header names above are missing in this copy of the patch */ +#include "unicode/utypes.h" +#include "unicode/errorcode.h" +#include "unicode/normalizer2.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" +#include "unicode/usetiter.h" + +/** + * icu::ErrorCode subclass for easy UErrorCode handling. + * The destructor calls handleFailure() which calls exit(errorCode) when isFailure(). + */ +class ExitingErrorCode : public icu::ErrorCode { +public: + /** + * @param loc A short string describing where the ExitingErrorCode is used. 
+ */ + ExitingErrorCode(const char *loc) : location(loc) {} + virtual ~ExitingErrorCode(); +protected: + virtual void handleFailure() const; +private: + const char *location; +}; + +ExitingErrorCode::~ExitingErrorCode() { + // Safe because our handleFailure() does not throw exceptions. + if(isFailure()) { handleFailure(); } +} + +void ExitingErrorCode::handleFailure() const { + fprintf(stderr, "error at %s: %s\n", location, errorName()); + exit(errorCode); +} + +enum Status { DISALLOWED, IGNORED, MAPPED, DEVIATION, VALID }; /* enumerator order must match statusNames[], which printLine() indexes by Status */ +static const char *const statusNames[]={ + "disallowed", "ignored", "mapped", "deviation", "valid" +}; + /* printLine(): prints the code point (when start==end) or the start..end range in hex, then the status name; for MAPPED, DEVIATION, or a non-empty mapping it continues with a second field read from the mapping buffer. NOTE(review): the rest of that branch is not visible in this copy. */ +static void +printLine(UChar32 start, UChar32 end, Status status, const icu::UnicodeString &mapping) { + if(start==end) { + printf("%04lX ", (long)start); + } else { + printf("%04lX..%04lX ", (long)start, (long)end); + } + printf("; %s", statusNames[status]); + if(status==MAPPED || status==DEVIATION || !mapping.isEmpty()) { + printf(" ;"); + const UChar *buffer=mapping.getBuffer(); + int32_t length=mapping.length(); + int32_t i=0; + UChar32 c; /* NOTE(review): text appears to be missing right after this line in this copy: the next token run is not valid C++, printLine() is never closed, and the statements that follow use names (cString, baseValidSet, disallowedSet, removeSet, ignoredSet, mappedSet, errorCode) whose declarations are not visible here. */ + while(inormalize(cString, mapping, errorCode); + if(!baseValidSet.containsAll(mapping)) { + fprintf(stderr, "U+%04lX mapped -> disallowed: mapping not wholly in base valid set\n", (long)c); + disallowedSet.add(c); + removeSet.add(c); + } else if(mapping.isEmpty()) { + ignoredSet.add(c); + } + } + mappedSet.removeAll(removeSet); + } + errorCode.assertSuccess(); + + icu::UnicodeSet validSet(baseValidSet); + validSet. + removeAll(labelSeparators). // non-ASCII label separators will be mapped in the end + removeAll(deviationSet). + removeAll(disallowedSet). + removeAll(mappedSet). 
+ add(0x2e); // not mapped, simply valid + UBool madeChange; + do { /* repeat both passes until neither one moves a code point to disallowedSet */ + madeChange=FALSE; + { /* pass 1: a valid code point whose NFD is not wholly in validSet becomes disallowed; removals are collected in removeSet and applied after the iteration */ + removeSet.clear(); + icu::UnicodeSetIterator iter(validSet); + while(iter.next()) { + UChar32 c=iter.getCodepoint(); + cString.setTo(c); + nfd->normalize(cString, nfdString, errorCode); + if(!validSet.containsAll(nfdString)) { + fprintf(stderr, "U+%04lX valid -> disallowed: NFD not wholly valid\n", (long)c); + disallowedSet.add(c); + removeSet.add(c); + madeChange=TRUE; + } + } + validSet.removeAll(removeSet); + } + { /* pass 2: the same check for mapped code points, applied to the NFD of the nfkc_cf result */ + removeSet.clear(); + icu::UnicodeSetIterator iter(mappedSet); + while(iter.next()) { + UChar32 c=iter.getCodepoint(); + cString.setTo(c); + nfkc_cf->normalize(cString, mapping, errorCode); + nfd->normalize(mapping, nfdString, errorCode); + if(!validSet.containsAll(nfdString)) { + fprintf(stderr, "U+%04lX mapped -> disallowed: NFD of mapping not wholly valid\n", (long)c); + disallowedSet.add(c); + removeSet.add(c); + madeChange=TRUE; + } + } + mappedSet.removeAll(removeSet); + } + } while(madeChange); + errorCode.assertSuccess(); + + // finish up + labelSeparators.remove(0x2e).freeze(); // U+002E is simply valid + deviationSet.freeze(); + ignoredSet.freeze(); + validSet.freeze(); + mappedSet.freeze(); + + // output + UChar32 prevStart=0, c=0; + Status prevStatus=DISALLOWED, status; + icu::UnicodeString prevMapping; + + icu::UnicodeSetIterator iter(disallowedSet); + while(iter.nextRange()) { + UChar32 start=iter.getCodepoint(); /* NOTE(review): text is missing after this line in this copy; the branch chain below starts mid-statement */ + while(cnormalize(cString, mapping, errorCode); + } else if(ignoredSet.contains(c)) { + status=IGNORED; + } else if(validSet.contains(c)) { + status=VALID; + } else if(mappedSet.contains(c)) { + status=MAPPED; + cString.setTo(c); + nfkc_cf->normalize(cString, mapping, errorCode); + } else { + fprintf(stderr, "*** undetermined status of U+%04lX\n", (long)c); /* NOTE(review): status is not assigned on this path; confirm how the code after this chain uses it */ + } /* NOTE(review): everything after the next token is missing from this copy; only patch markers remain */ + if(prevStart + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +