From d385b3c27318d5a6cced03e02d0d015af5e8c87e Mon Sep 17 00:00:00 2001 From: Iain Ireland Date: Mon, 8 Nov 2021 10:17:54 -0800 Subject: [PATCH] ICU-21855 Export case data for ICU4X --- icu4c/source/common/ucase.cpp | 32 +-- icu4c/source/common/ucase.h | 15 ++ .../tools/icuexportdata/icuexportdata.cpp | 222 ++++++++++++------ 3 files changed, 180 insertions(+), 89 deletions(-) diff --git a/icu4c/source/common/ucase.cpp b/icu4c/source/common/ucase.cpp index 1f354c18635..388c86b1bba 100644 --- a/icu4c/source/common/ucase.cpp +++ b/icu4c/source/common/ucase.cpp @@ -22,27 +22,14 @@ #include "unicode/utypes.h" #include "unicode/unistr.h" #include "unicode/uset.h" -#include "unicode/udata.h" /* UDataInfo */ #include "unicode/utf16.h" -#include "ucmndata.h" /* DataHeader */ -#include "udatamem.h" -#include "umutex.h" -#include "uassert.h" #include "cmemory.h" -#include "utrie2.h" +#include "uassert.h" #include "ucase.h" +#include "umutex.h" +#include "utrie2.h" -struct UCaseProps { - UDataMemory *mem; - const int32_t *indexes; - const uint16_t *exceptions; - const uint16_t *unfold; - - UTrie2 trie; - uint8_t formatVersion[4]; -}; - -/* ucase_props_data.h is machine-generated by gencase --csource */ +/* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */ #define INCLUDED_FROM_UCASE_CPP #include "ucase_props_data.h" @@ -77,6 +64,13 @@ ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { /* data access primitives --------------------------------------------------- */ +U_CAPI const struct UCaseProps * U_EXPORT2 +ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) { + *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions); + *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold); + return &ucase_props_singleton; +} + U_CFUNC const UTrie2 * U_EXPORT2 ucase_getTrie() { return &ucase_props_singleton.trie; @@ -690,7 +684,7 @@ ucase_isCaseSensitive(UChar32 c) { * - The general category of C is * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or * Letter Modifier (Lm), or Symbol Modifier (Sk) - * - C is one of the following characters + * - C is one of the following characters * U+0027 APOSTROPHE * U+00AD SOFT HYPHEN (SHY) * U+2019 RIGHT SINGLE QUOTATION MARK @@ -1546,7 +1540,7 @@ U_CAPI UChar32 U_EXPORT2 u_tolower(UChar32 c) { return ucase_tolower(c); } - + /* Transforms the Unicode character to its upper case equivalent.*/ U_CAPI UChar32 U_EXPORT2 u_toupper(UChar32 c) { diff --git a/icu4c/source/common/ucase.h b/icu4c/source/common/ucase.h index a018f82b81b..7bf57fd3706 100644 --- a/icu4c/source/common/ucase.h +++ b/icu4c/source/common/ucase.h @@ -312,6 +312,21 @@ UCaseMapFull(UChar32 c, U_CDECL_END +/* for icuexportdata -------------------------------------------------------- */ + +struct UCaseProps { + void *mem; // TODO: was unused, and type UDataMemory -- remove + const int32_t *indexes; + const uint16_t *exceptions; + const uint16_t *unfold; + + UTrie2 trie; + uint8_t formatVersion[4]; +}; + +U_CAPI const struct UCaseProps * U_EXPORT2 +ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength); + /* file definitions --------------------------------------------------------- */ #define UCASE_DATA_NAME "ucase" diff --git a/icu4c/source/tools/icuexportdata/icuexportdata.cpp b/icu4c/source/tools/icuexportdata/icuexportdata.cpp index 9b36d1031fc..d3794a59924 100644 --- a/icu4c/source/tools/icuexportdata/icuexportdata.cpp +++ b/icu4c/source/tools/icuexportdata/icuexportdata.cpp @@ -15,6 +15,7 @@ #include "unicode/uscript.h" #include "unicode/putil.h" #include "unicode/umutablecptrie.h" +#include "ucase.h" #include "writesrc.h" U_NAMESPACE_USE @@ -228,14 +229,14 @@ void dumpScriptExtensions(FILE* f) { } } - if (isScxValUnique) { + if (isScxValUnique) { outputDedupVec.push_back(scxValVec); usrc_writeArray(f, " [", scxValVec.data(), 16, scxValVec.size(), " ", "],\n"); } // We must update the value in the UCPTrie for the code point to contain: // 9..0 the Script code in the lower 10 bits when 11..10 is 0, else it is - // the index into the companion array + // the index into the companion array // 11..10 the same higher-order 2 bits in the trie in uprops.icu indicating whether // 3: other // 2: Script=Inherited @@ -333,35 +334,25 @@ static UOption options[]={ UOPTION_QUIET, }; -int main(int argc, char* argv[]) { - U_MAIN_INIT_ARGS(argc, argv); - - /* preset then read command line options */ - options[OPT_DESTDIR].value=u_getDataDirectory(); - argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); - - if(options[OPT_VERSION].doesOccur) { - printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n", - U_ICU_DATA_VERSION); - printf("%s\n", U_COPYRIGHT_STRING); - exit(0); - } - - /* error handling, printing usage message */ - if(argc<0) { - fprintf(stderr, - "error in command line argument \"%s\"\n", - argv[-argc]); - } else if(argc<2) { - argc=-1; - } - - /* get the options values */ - haveCopyright = options[OPT_COPYRIGHT].doesOccur; - destdir = options[OPT_DESTDIR].value; - VERBOSE = options[OPT_VERBOSE].doesOccur; - QUIET = options[OPT_QUIET].doesOccur; +void printHelp(FILE* stdfile, const char* program) { + fprintf(stdfile, + "usage: %s -m mode [-options] [--all | properties...]\n" + "\tdump Unicode property data to .toml files\n" + "options:\n" + "\t-h or -? or --help this usage text\n" + "\t-V or --version show a version message\n" + "\t-m or --mode mode: currently only 'uprops' and 'ucase', but more may be added\n" + "\t --trie-type set the trie type (small or fast, default small)\n" + "\t-d or --destdir destination directory, followed by the path\n" + "\t --all write out all properties known to icuexportdata\n" + "\t --index write an _index.toml summarizing all data exported\n" + "\t-c or --copyright include a copyright notice\n" + "\t-v or --verbose Turn on verbose output\n" + "\t-q or --quiet do not display warnings and progress\n", + program); +} +int exportUprops(int argc, char* argv[]) { // Load list of Unicode properties std::vector propNames; for (int i=1; iucptrie; + umutablecptrie_setRange(ucptrie, start, end, value, status); + handleError(status, "setRange"); + + return TRUE; +} + +int exportCase(int argc, char* argv[]) { + if (argc > 1) { + fprintf(stderr, "ucase mode does not expect additional arguments\n"); + return U_ILLEGAL_ARGUMENT_ERROR; + } + (void) argv; // Suppress unused variable warning + + IcuToolErrorCode status("icuexportdata"); + LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); + handleError(status, "exportCase"); + + int32_t exceptionsLength, unfoldLength; + const UCaseProps *caseProps = ucase_getSingleton(&exceptionsLength, &unfoldLength); + const UTrie2* caseTrie = &caseProps->trie; + + AddRangeHelper helper = { builder.getAlias() }; + utrie2_enum(caseTrie, NULL, addRangeToUCPTrie, &helper); + + UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_16; + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + trieType, + width, + status)); + handleError(status, "exportCase"); + + FILE* f = prepareOutputFile("ucase"); + + UVersionInfo versionInfo; + u_getUnicodeVersion(versionInfo); + char uvbuf[U_MAX_VERSION_STRING_LENGTH]; + u_versionToString(versionInfo, uvbuf); + fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n", + U_ICU_VERSION, + uvbuf); + + fputs("[ucase.code_point_trie]\n", f); + usrc_writeUCPTrie(f, "case_trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); + fputs("\n", f); + + const char* indent = " "; + const char* suffix = "\n]\n"; + + fputs("[ucase.exceptions]\n", f); + const char* exceptionsPrefix = "exceptions = [\n "; + int32_t exceptionsWidth = 16; + usrc_writeArray(f, exceptionsPrefix, caseProps->exceptions, exceptionsWidth, + exceptionsLength, indent, suffix); + fputs("\n", f); + + fputs("[ucase.unfold]\n", f); + const char* unfoldPrefix = "unfold = [\n "; + int32_t unfoldWidth = 16; + usrc_writeArray(f, unfoldPrefix, caseProps->unfold, unfoldWidth, + unfoldLength, indent, suffix); + + return 0; +} + +int main(int argc, char* argv[]) { + U_MAIN_INIT_ARGS(argc, argv); + + /* preset then read command line options */ + options[OPT_DESTDIR].value=u_getDataDirectory(); + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + if(options[OPT_VERSION].doesOccur) { + printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n", + U_ICU_DATA_VERSION); + printf("%s\n", U_COPYRIGHT_STRING); + exit(0); + } + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } + + if (argc < 0 + || options[OPT_HELP_H].doesOccur + || options[OPT_HELP_QUESTION_MARK].doesOccur + || !options[OPT_MODE].doesOccur) { + FILE *stdfile=argc<0 ? stderr : stdout; + printHelp(stdfile, argv[0]); + return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + /* get the options values */ + haveCopyright = options[OPT_COPYRIGHT].doesOccur; + destdir = options[OPT_DESTDIR].value; + VERBOSE = options[OPT_VERBOSE].doesOccur; + QUIET = options[OPT_QUIET].doesOccur; + + if (options[OPT_TRIE_TYPE].doesOccur) { + if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) { + trieType = UCPTRIE_TYPE_FAST; + } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) { + trieType = UCPTRIE_TYPE_SMALL; + } else { + fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n"); + return U_ILLEGAL_ARGUMENT_ERROR; + } + } + + const char* mode = options[OPT_MODE].value; + if (uprv_strcmp(mode, "uprops") == 0) { + return exportUprops(argc, argv); + } else if (uprv_strcmp(mode, "ucase") == 0) { + return exportCase(argc, argv); + } + + fprintf(stderr, "Invalid option for --mode (must be uprops or ucase)\n"); + return U_ILLEGAL_ARGUMENT_ERROR; +}