mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-21855 Export case data for ICU4X
This commit is contained in:
parent
704415402a
commit
d385b3c273
3 changed files with 180 additions and 89 deletions
|
@ -22,27 +22,14 @@
|
|||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "unicode/udata.h" /* UDataInfo */
|
||||
#include "unicode/utf16.h"
|
||||
#include "ucmndata.h" /* DataHeader */
|
||||
#include "udatamem.h"
|
||||
#include "umutex.h"
|
||||
#include "uassert.h"
|
||||
#include "cmemory.h"
|
||||
#include "utrie2.h"
|
||||
#include "uassert.h"
|
||||
#include "ucase.h"
|
||||
#include "umutex.h"
|
||||
#include "utrie2.h"
|
||||
|
||||
struct UCaseProps {
|
||||
UDataMemory *mem;
|
||||
const int32_t *indexes;
|
||||
const uint16_t *exceptions;
|
||||
const uint16_t *unfold;
|
||||
|
||||
UTrie2 trie;
|
||||
uint8_t formatVersion[4];
|
||||
};
|
||||
|
||||
/* ucase_props_data.h is machine-generated by gencase --csource */
|
||||
/* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
|
||||
#define INCLUDED_FROM_UCASE_CPP
|
||||
#include "ucase_props_data.h"
|
||||
|
||||
|
@ -77,6 +64,13 @@ ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
|
|||
|
||||
/* data access primitives --------------------------------------------------- */
|
||||
|
||||
/**
 * Gives access to the built-in case properties singleton and reports the
 * element counts of the two generated arrays.
 *
 * @param pExceptionsLength receives the number of elements in
 *                          ucase_props_exceptions; must not be NULL
 * @param pUnfoldLength     receives the number of elements in
 *                          ucase_props_unfold; must not be NULL
 * @return address of ucase_props_singleton
 */
U_CAPI const struct UCaseProps * U_EXPORT2
ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
    const int32_t exceptionsCount = UPRV_LENGTHOF(ucase_props_exceptions);
    const int32_t unfoldCount = UPRV_LENGTHOF(ucase_props_unfold);

    /* Same store order as before: exceptions first, then unfold. */
    *pExceptionsLength = exceptionsCount;
    *pUnfoldLength = unfoldCount;

    return &ucase_props_singleton;
}
|
||||
|
||||
U_CFUNC const UTrie2 * U_EXPORT2
|
||||
ucase_getTrie() {
|
||||
return &ucase_props_singleton.trie;
|
||||
|
@ -690,7 +684,7 @@ ucase_isCaseSensitive(UChar32 c) {
|
|||
* - The general category of C is
|
||||
* Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
|
||||
* Letter Modifier (Lm), or Symbol Modifier (Sk)
|
||||
* - C is one of the following characters
|
||||
* - C is one of the following characters
|
||||
* U+0027 APOSTROPHE
|
||||
* U+00AD SOFT HYPHEN (SHY)
|
||||
* U+2019 RIGHT SINGLE QUOTATION MARK
|
||||
|
@ -1546,7 +1540,7 @@ U_CAPI UChar32 U_EXPORT2
|
|||
u_tolower(UChar32 c) {
|
||||
return ucase_tolower(c);
|
||||
}
|
||||
|
||||
|
||||
/* Transforms the Unicode character to its upper case equivalent.*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
u_toupper(UChar32 c) {
|
||||
|
|
|
@ -312,6 +312,21 @@ UCaseMapFull(UChar32 c,
|
|||
|
||||
U_CDECL_END
|
||||
|
||||
/* for icuexportdata -------------------------------------------------------- */
|
||||
|
||||
struct UCaseProps {
|
||||
void *mem; // TODO: was unused, and type UDataMemory -- remove
|
||||
const int32_t *indexes;
|
||||
const uint16_t *exceptions;
|
||||
const uint16_t *unfold;
|
||||
|
||||
UTrie2 trie;
|
||||
uint8_t formatVersion[4];
|
||||
};
|
||||
|
||||
/**
 * Returns the case properties singleton.
 *
 * @param pExceptionsLength output: element count of the exceptions array
 * @param pUnfoldLength     output: element count of the unfold array
 * @return pointer to the singleton UCaseProps
 */
U_CAPI const struct UCaseProps * U_EXPORT2
ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength);
|
||||
|
||||
/* file definitions --------------------------------------------------------- */
|
||||
|
||||
#define UCASE_DATA_NAME "ucase"
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "unicode/uscript.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/umutablecptrie.h"
|
||||
#include "ucase.h"
|
||||
#include "writesrc.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
@ -228,14 +229,14 @@ void dumpScriptExtensions(FILE* f) {
|
|||
}
|
||||
}
|
||||
|
||||
if (isScxValUnique) {
|
||||
if (isScxValUnique) {
|
||||
outputDedupVec.push_back(scxValVec);
|
||||
usrc_writeArray(f, " [", scxValVec.data(), 16, scxValVec.size(), " ", "],\n");
|
||||
}
|
||||
|
||||
// We must update the value in the UCPTrie for the code point to contain:
|
||||
// 9..0 the Script code in the lower 10 bits when 11..10 is 0, else it is
|
||||
// the index into the companion array
|
||||
// the index into the companion array
|
||||
// 11..10 the same higher-order 2 bits in the trie in uprops.icu indicating whether
|
||||
// 3: other
|
||||
// 2: Script=Inherited
|
||||
|
@ -333,35 +334,25 @@ static UOption options[]={
|
|||
UOPTION_QUIET,
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
|
||||
/* preset then read command line options */
|
||||
options[OPT_DESTDIR].value=u_getDataDirectory();
|
||||
argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
|
||||
|
||||
if(options[OPT_VERSION].doesOccur) {
|
||||
printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n",
|
||||
U_ICU_DATA_VERSION);
|
||||
printf("%s\n", U_COPYRIGHT_STRING);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/* error handling, printing usage message */
|
||||
if(argc<0) {
|
||||
fprintf(stderr,
|
||||
"error in command line argument \"%s\"\n",
|
||||
argv[-argc]);
|
||||
} else if(argc<2) {
|
||||
argc=-1;
|
||||
}
|
||||
|
||||
/* get the options values */
|
||||
haveCopyright = options[OPT_COPYRIGHT].doesOccur;
|
||||
destdir = options[OPT_DESTDIR].value;
|
||||
VERBOSE = options[OPT_VERBOSE].doesOccur;
|
||||
QUIET = options[OPT_QUIET].doesOccur;
|
||||
/**
 * Writes the icuexportdata usage text to a stream.
 *
 * @param stdfile stream that receives the text
 * @param program program name substituted into the "usage:" line
 */
void printHelp(FILE* stdfile, const char* program) {
    // Only the first line needs formatting; the option list is fixed text
    // without any conversion specifiers, so it can be written verbatim.
    static const char optionList[] =
        "\tdump Unicode property data to .toml files\n"
        "options:\n"
        "\t-h or -? or --help this usage text\n"
        "\t-V or --version show a version message\n"
        "\t-m or --mode mode: currently only 'uprops' and 'ucase', but more may be added\n"
        "\t --trie-type set the trie type (small or fast, default small)\n"
        "\t-d or --destdir destination directory, followed by the path\n"
        "\t --all write out all properties known to icuexportdata\n"
        "\t --index write an _index.toml summarizing all data exported\n"
        "\t-c or --copyright include a copyright notice\n"
        "\t-v or --verbose Turn on verbose output\n"
        "\t-q or --quiet do not display warnings and progress\n";

    fprintf(stdfile, "usage: %s -m mode [-options] [--all | properties...]\n", program);
    fputs(optionList, stdfile);
}
|
||||
|
||||
int exportUprops(int argc, char* argv[]) {
|
||||
// Load list of Unicode properties
|
||||
std::vector<const char*> propNames;
|
||||
for (int i=1; i<argc; i++) {
|
||||
|
@ -386,46 +377,6 @@ int main(int argc, char* argv[]) {
|
|||
}
|
||||
}
|
||||
|
||||
if (propNames.empty()
|
||||
|| options[OPT_HELP_H].doesOccur
|
||||
|| options[OPT_HELP_QUESTION_MARK].doesOccur
|
||||
|| !options[OPT_MODE].doesOccur) {
|
||||
FILE *stdfile=argc<0 ? stderr : stdout;
|
||||
fprintf(stdfile,
|
||||
"usage: %s -m uprops [-options] [--all | properties...]\n"
|
||||
"\tdump Unicode property data to .toml files\n"
|
||||
"options:\n"
|
||||
"\t-h or -? or --help this usage text\n"
|
||||
"\t-V or --version show a version message\n"
|
||||
"\t-m or --mode mode: currently only 'uprops', but more may be added\n"
|
||||
"\t --trie-type set the trie type (small or fast, default small)\n"
|
||||
"\t-d or --destdir destination directory, followed by the path\n"
|
||||
"\t --all write out all properties known to icuexportdata\n"
|
||||
"\t --index write an _index.toml summarizing all data exported\n"
|
||||
"\t-c or --copyright include a copyright notice\n"
|
||||
"\t-v or --verbose Turn on verbose output\n"
|
||||
"\t-q or --quiet do not display warnings and progress\n",
|
||||
argv[0]);
|
||||
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
const char* mode = options[OPT_MODE].value;
|
||||
if (uprv_strcmp(mode, "uprops") != 0) {
|
||||
fprintf(stderr, "Invalid option for --mode (must be uprops)\n");
|
||||
return U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
|
||||
if (options[OPT_TRIE_TYPE].doesOccur) {
|
||||
if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
|
||||
trieType = UCPTRIE_TYPE_FAST;
|
||||
} else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
|
||||
trieType = UCPTRIE_TYPE_SMALL;
|
||||
} else {
|
||||
fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
|
||||
return U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
for (const char* propName : propNames) {
|
||||
UProperty propEnum = u_getPropertyEnum(propName);
|
||||
if (propEnum == UCHAR_INVALID_CODE) {
|
||||
|
@ -470,3 +421,134 @@ int main(int argc, char* argv[]) {
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct AddRangeHelper {
|
||||
UMutableCPTrie* ucptrie;
|
||||
};
|
||||
|
||||
/**
 * Range callback for utrie2_enum(): stores one [start, end] range with its
 * value into the mutable code point trie carried by the context.
 *
 * @param context points to an AddRangeHelper
 * @param start   first code point of the range
 * @param end     last code point of the range
 * @param value   value to set for the whole range
 * @return always TRUE
 *         (NOTE(review): presumably tells utrie2_enum to keep enumerating --
 *         confirm against the UTrie2EnumRange contract)
 */
static UBool U_CALLCONV
addRangeToUCPTrie(const void* context, UChar32 start, UChar32 end, uint32_t value) {
    const AddRangeHelper* helper = static_cast<const AddRangeHelper*>(context);

    IcuToolErrorCode errorCode("addRangeToUCPTrie");
    umutablecptrie_setRange(helper->ucptrie, start, end, value, errorCode);
    handleError(errorCode, "setRange");

    return TRUE;
}
|
||||
|
||||
int exportCase(int argc, char* argv[]) {
|
||||
if (argc > 1) {
|
||||
fprintf(stderr, "ucase mode does not expect additional arguments\n");
|
||||
return U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
(void) argv; // Suppress unused variable warning
|
||||
|
||||
IcuToolErrorCode status("icuexportdata");
|
||||
LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
|
||||
handleError(status, "exportCase");
|
||||
|
||||
int32_t exceptionsLength, unfoldLength;
|
||||
const UCaseProps *caseProps = ucase_getSingleton(&exceptionsLength, &unfoldLength);
|
||||
const UTrie2* caseTrie = &caseProps->trie;
|
||||
|
||||
AddRangeHelper helper = { builder.getAlias() };
|
||||
utrie2_enum(caseTrie, NULL, addRangeToUCPTrie, &helper);
|
||||
|
||||
UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_16;
|
||||
LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
|
||||
builder.getAlias(),
|
||||
trieType,
|
||||
width,
|
||||
status));
|
||||
handleError(status, "exportCase");
|
||||
|
||||
FILE* f = prepareOutputFile("ucase");
|
||||
|
||||
UVersionInfo versionInfo;
|
||||
u_getUnicodeVersion(versionInfo);
|
||||
char uvbuf[U_MAX_VERSION_STRING_LENGTH];
|
||||
u_versionToString(versionInfo, uvbuf);
|
||||
fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
|
||||
U_ICU_VERSION,
|
||||
uvbuf);
|
||||
|
||||
fputs("[ucase.code_point_trie]\n", f);
|
||||
usrc_writeUCPTrie(f, "case_trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
|
||||
fputs("\n", f);
|
||||
|
||||
const char* indent = " ";
|
||||
const char* suffix = "\n]\n";
|
||||
|
||||
fputs("[ucase.exceptions]\n", f);
|
||||
const char* exceptionsPrefix = "exceptions = [\n ";
|
||||
int32_t exceptionsWidth = 16;
|
||||
usrc_writeArray(f, exceptionsPrefix, caseProps->exceptions, exceptionsWidth,
|
||||
exceptionsLength, indent, suffix);
|
||||
fputs("\n", f);
|
||||
|
||||
fputs("[ucase.unfold]\n", f);
|
||||
const char* unfoldPrefix = "unfold = [\n ";
|
||||
int32_t unfoldWidth = 16;
|
||||
usrc_writeArray(f, unfoldPrefix, caseProps->unfold, unfoldWidth,
|
||||
unfoldLength, indent, suffix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
|
||||
/* preset then read command line options */
|
||||
options[OPT_DESTDIR].value=u_getDataDirectory();
|
||||
argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
|
||||
|
||||
if(options[OPT_VERSION].doesOccur) {
|
||||
printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n",
|
||||
U_ICU_DATA_VERSION);
|
||||
printf("%s\n", U_COPYRIGHT_STRING);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/* error handling, printing usage message */
|
||||
if(argc<0) {
|
||||
fprintf(stderr,
|
||||
"error in command line argument \"%s\"\n",
|
||||
argv[-argc]);
|
||||
}
|
||||
|
||||
if (argc < 0
|
||||
|| options[OPT_HELP_H].doesOccur
|
||||
|| options[OPT_HELP_QUESTION_MARK].doesOccur
|
||||
|| !options[OPT_MODE].doesOccur) {
|
||||
FILE *stdfile=argc<0 ? stderr : stdout;
|
||||
printHelp(stdfile, argv[0]);
|
||||
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
/* get the options values */
|
||||
haveCopyright = options[OPT_COPYRIGHT].doesOccur;
|
||||
destdir = options[OPT_DESTDIR].value;
|
||||
VERBOSE = options[OPT_VERBOSE].doesOccur;
|
||||
QUIET = options[OPT_QUIET].doesOccur;
|
||||
|
||||
if (options[OPT_TRIE_TYPE].doesOccur) {
|
||||
if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
|
||||
trieType = UCPTRIE_TYPE_FAST;
|
||||
} else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
|
||||
trieType = UCPTRIE_TYPE_SMALL;
|
||||
} else {
|
||||
fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
|
||||
return U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
const char* mode = options[OPT_MODE].value;
|
||||
if (uprv_strcmp(mode, "uprops") == 0) {
|
||||
return exportUprops(argc, argv);
|
||||
} else if (uprv_strcmp(mode, "ucase") == 0) {
|
||||
return exportCase(argc, argv);
|
||||
}
|
||||
|
||||
fprintf(stderr, "Invalid option for --mode (must be uprops or ucase)\n");
|
||||
return U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue