ICU-22028 Export collation and normalization data for ICU4X

This commit is contained in:
Henri Sivonen 2021-11-03 12:28:07 +02:00 committed by Elango
parent d7c424b00f
commit 3cefbd55c7
22 changed files with 1275 additions and 76 deletions

View file

@ -418,7 +418,7 @@ jobs:
timeoutInMinutes: 30
pool:
vmImage: 'windows-2019'
demands:
demands:
- msbuild
- visualstudio
- Cmd
@ -633,6 +633,8 @@ jobs:
cd icu4c/source
LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_full/fast --trie-type fast --all
LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_full/small --trie-type small --all
LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode norm --copyright --verbose --destdir icuexportdata_uprops_full/fast --trie-type fast --all
LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode norm --copyright --verbose --destdir icuexportdata_uprops_full/small --trie-type small --all
displayName: 'Build Unicode property data export file (Full)'
# In the sample file, include:
# - Basic binary properties: AHex WSpace
@ -646,6 +648,18 @@ jobs:
LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_sample/fast --trie-type fast AHex gc nt Basic_Emoji sc WSpace blank
LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_sample/small --trie-type small AHex gc nt Basic_Emoji sc WSpace blank
displayName: 'Build Unicode property data export file (Sample)'
- script: |
mkdir -p icu4c/source/icuexportdata_uprops_full/collation_unihan
mkdir -p icu4c/source/icuexportdata_uprops_full/collation_implicithan
cd icu4c/source
cd data/coll
FILES=`echo *.txt`
cd -
LD_LIBRARY_PATH=lib ./bin/genrb -X -s data/coll/ -d icuexportdata_uprops_full/collation_unihan --ucadata data/in/coll/ucadata-unihan-icu4x.icu $FILES
LD_LIBRARY_PATH=lib ./bin/genrb -X -s data/coll/ -d icuexportdata_uprops_full/collation_implicithan --ucadata data/in/coll/ucadata-implicithan-icu4x.icu $FILES
rm icuexportdata_uprops_full/collation_unihan/*.res
rm icuexportdata_uprops_full/collation_implicithan/*.res
displayName: 'Build collation data export file'
- task: PublishBuildArtifacts@1
displayName: 'Publish Artifact: icuexportdata_uprops_full'
inputs:

View file

@ -44,7 +44,7 @@ struct UDataMemory {
int32_t length; /* Length of the data in bytes; -1 if unknown. */
};
U_CFUNC UDataMemory *UDataMemory_createNewInstance(UErrorCode *pErr);
U_CAPI UDataMemory* U_EXPORT2 UDataMemory_createNewInstance(UErrorCode *pErr);
U_CFUNC void UDatamemory_assign (UDataMemory *dest, UDataMemory *source);
U_CFUNC void UDataMemory_init (UDataMemory *This);
U_CFUNC UBool UDataMemory_isLoaded(const UDataMemory *This);

View file

@ -29,7 +29,7 @@
#include "unicode/udata.h"
#include "putilimp.h"
U_CFUNC UBool uprv_mapFile(UDataMemory *pdm, const char *path, UErrorCode *status);
U_CAPI UBool U_EXPORT2 uprv_mapFile(UDataMemory *pdm, const char *path, UErrorCode *status);
U_CFUNC void uprv_unmapFile(UDataMemory *pData);
/* MAP_NONE: no memory mapping, no file access at all */

Binary file not shown.

View file

@ -44,3 +44,6 @@ bazelisk run //tools/unicode/c/genprops $ICU_SRC/icu4c
# We run it twice for different versions of the CLDR root sort order.
bazelisk run //tools/unicode/c/genuca -- --hanOrder implicit $ICU_SRC/icu4c
bazelisk run //tools/unicode/c/genuca -- --hanOrder radical-stroke $ICU_SRC/icu4c
# Also generate the ICU4X versions
bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder implicit $ICU_SRC/icu4c
bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder radical-stroke $ICU_SRC/icu4c

View file

@ -221,7 +221,8 @@ public:
/**
* Points to contraction data.
* Bits 31..13: Index into prefix/contraction data.
* Bits 12..11: Unused, 0.
* Bit 12: Unused, 0.
* Bit 11: CONTRACT_HAS_STARTER flag. (Used by ICU4X only.)
* Bit 10: CONTRACT_TRAILING_CCC flag.
* Bit 9: CONTRACT_NEXT_CCC flag.
* Bit 8: CONTRACT_SINGLE_CP_NO_MATCH flag.
@ -298,6 +299,8 @@ public:
static const uint32_t CONTRACT_NEXT_CCC = 0x200;
/** Set if any contraction suffix ends with lccc!=0. */
static const uint32_t CONTRACT_TRAILING_CCC = 0x400;
/** Set if any contraction suffix contains a starter. (Used by ICU4X only.) */
static const uint32_t CONTRACT_HAS_STARTER = 0x800;
/** For HANGUL_TAG: None of its Jamo CE32s isSpecialCE32(). */
static const uint32_t HANGUL_NO_SPECIAL_JAMO = 0x100;

View file

@ -198,7 +198,7 @@ const int32_t CollationBuilder::HAS_BEFORE2;
const int32_t CollationBuilder::HAS_BEFORE3;
#endif
CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
CollationBuilder::CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode)
: nfd(*Normalizer2::getNFDInstance(errorCode)),
fcd(*Normalizer2Factory::getFCDInstance(errorCode)),
nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
@ -206,7 +206,8 @@ CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &erro
baseData(b->data),
rootElements(b->data->rootElements, b->data->rootElementsLength),
variableTop(0),
dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE),
dataBuilder(new CollationDataBuilder(icu4xMode, errorCode)), fastLatinEnabled(TRUE),
icu4xMode(icu4xMode),
errorReason(NULL),
cesLength(0),
rootPrimaryIndexes(errorCode), nodes(errorCode) {
@ -225,6 +226,10 @@ CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &erro
}
}
// Two-argument convenience constructor: delegates to the three-argument
// constructor with icu4xMode fixed to FALSE, so callers using this signature
// do not opt into the ICU4X-specific build paths.
CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
: CollationBuilder(b, FALSE, errorCode)
{}
CollationBuilder::~CollationBuilder() {
delete dataBuilder;
}
@ -262,15 +267,19 @@ CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
if(U_FAILURE(errorCode)) { return NULL; }
if(dataBuilder->hasMappings()) {
makeTailoredCEs(errorCode);
closeOverComposites(errorCode);
if (!icu4xMode) {
closeOverComposites(errorCode);
}
finalizeCEs(errorCode);
// Copy all of ASCII, and Latin-1 letters, into each tailoring.
optimizeSet.add(0, 0x7f);
optimizeSet.add(0xc0, 0xff);
// Hangul is decomposed on the fly during collation,
// and the tailoring data is always built with HANGUL_TAG specials.
optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
dataBuilder->optimize(optimizeSet, errorCode);
if (!icu4xMode) {
// Copy all of ASCII, and Latin-1 letters, into each tailoring.
optimizeSet.add(0, 0x7f);
optimizeSet.add(0xc0, 0xff);
// Hangul is decomposed on the fly during collation,
// and the tailoring data is always built with HANGUL_TAG specials.
optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
dataBuilder->optimize(optimizeSet, errorCode);
}
tailoring->ensureOwnedData(errorCode);
if(U_FAILURE(errorCode)) { return NULL; }
if(fastLatinEnabled) { dataBuilder->enableFastLatin(); }
@ -743,14 +752,18 @@ CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix,
}
}
uint32_t ce32 = Collation::UNASSIGNED_CE32;
if((prefix != nfdPrefix || str != nfdString) &&
if(!icu4xMode && (prefix != nfdPrefix || str != nfdString) &&
!ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) {
// Map from the original input to the CEs.
// We do this in case the canonical closure is incomplete,
// so that it is possible to explicitly provide the missing mappings.
ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode);
}
addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
if (!icu4xMode) {
addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
} else {
addIfDifferent(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
}
if(U_FAILURE(errorCode)) {
parserErrorReason = "writing collation elements";
return;
@ -1608,7 +1621,7 @@ CEFinalizer::~CEFinalizer() {}
void
CollationBuilder::finalizeCEs(UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode), errorCode);
LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(icu4xMode, errorCode), errorCode);
if(U_FAILURE(errorCode)) {
return;
}

View file

@ -39,6 +39,7 @@ class Normalizer2Impl;
class U_I18N_API CollationBuilder : public CollationRuleParser::Sink {
public:
CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode);
CollationBuilder(const CollationTailoring *base, UErrorCode &errorCode);
virtual ~CollationBuilder();
@ -302,6 +303,7 @@ private:
CollationDataBuilder *dataBuilder;
UBool fastLatinEnabled;
UBool icu4xMode;
UnicodeSet optimizeSet;
const char *errorReason;

View file

@ -296,16 +296,19 @@ DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &
// ------------------------------------------------------------------------- ***
CollationDataBuilder::CollationDataBuilder(UErrorCode &errorCode)
CollationDataBuilder::CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
: nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
base(NULL), baseSettings(NULL),
trie(NULL),
ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode),
modified(FALSE),
icu4xMode(icu4xMode),
fastLatinEnabled(FALSE), fastLatinBuilder(NULL),
collIter(NULL) {
// Reserve the first CE32 for U+0000.
ce32s.addElement(0, errorCode);
if (!icu4xMode) {
ce32s.addElement(0, errorCode);
}
conditionalCE32s.setDeleter(uprv_deleteConditionalCE32);
}
@ -329,28 +332,32 @@ CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &error
base = b;
// For a tailoring, the default is to fall back to the base.
trie = utrie2_open(Collation::FALLBACK_CE32, Collation::FFFD_CE32, &errorCode);
// For ICU4X, use the same value for fallback as for the default
// to avoid having to have different blocks for the two.
trie = utrie2_open(Collation::FALLBACK_CE32, icu4xMode ? Collation::FALLBACK_CE32 : Collation::FFFD_CE32, &errorCode);
// Set the Latin-1 letters block so that it is allocated first in the data array,
// to try to improve locality of reference when sorting Latin-1 text.
// Do not use utrie2_setRange32() since that will not actually allocate blocks
// that are filled with the default value.
// ASCII (0..7F) is already preallocated anyway.
for(UChar32 c = 0xc0; c <= 0xff; ++c) {
utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode);
if (!icu4xMode) {
// Set the Latin-1 letters block so that it is allocated first in the data array,
// to try to improve locality of reference when sorting Latin-1 text.
// Do not use utrie2_setRange32() since that will not actually allocate blocks
// that are filled with the default value.
// ASCII (0..7F) is already preallocated anyway.
for(UChar32 c = 0xc0; c <= 0xff; ++c) {
utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode);
}
// Hangul syllables are not tailorable (except via tailoring Jamos).
// Always set the Hangul tag to help performance.
// Do this here, rather than in buildMappings(),
// so that we see the HANGUL_TAG in various assertions.
uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode);
// Copy the set contents but don't copy/clone the set as a whole because
// that would copy the isFrozen state too.
unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
}
// Hangul syllables are not tailorable (except via tailoring Jamos).
// Always set the Hangul tag to help performance.
// Do this here, rather than in buildMappings(),
// so that we see the HANGUL_TAG in various assertions.
uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode);
// Copy the set contents but don't copy/clone the set as a whole because
// that would copy the isFrozen state too.
unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
if(U_FAILURE(errorCode)) { return; }
}
@ -567,6 +574,98 @@ CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &
int32_t cLength = U16_LENGTH(c);
uint32_t oldCE32 = utrie2_get32(trie, c);
UBool hasContext = !prefix.isEmpty() || s.length() > cLength;
if (icu4xMode) {
if (base && c >= 0x1100 && c < 0x1200) {
// Omit jamo tailorings.
// TODO(https://github.com/unicode-org/icu4x/issues/1941).
}
const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(errorCode);
UnicodeString sInNfd;
nfdNormalizer->normalize(s, sInNfd, errorCode);
if (s != sInNfd) {
// s is not in NFD, so it cannot match in ICU4X, since ICU4X only
// does NFD lookups.
// Now check that we're only rejecting known cases.
if (s.length() == 2) {
char16_t second = s.charAt(1);
if (second == 0x0F73 || second == 0x0F75 || second == 0x0F81) {
// Second is a special decomposing Tibetan vowel sign.
// These also get added in the decomposed form, so ignoring
// this instance is OK.
return;
}
if (c == 0xFDD1 && second == 0xAC00) {
// This strange contraction exists in the root and
// doesn't have a decomposed counterpart there.
// This won't match in ICU4X anyway and is very strange:
// Unassigned Arabic presentation form contracting with
// the very first Hangul syllable. Let's ignore this
// explicitly.
return;
}
}
// Unknown case worth investigating if ever found.
errorCode = U_UNSUPPORTED_ERROR;
return;
}
if (!prefix.isEmpty()) {
UnicodeString prefixInNfd;
nfdNormalizer->normalize(prefix, prefixInNfd, errorCode);
if (prefix != prefixInNfd) {
errorCode = U_UNSUPPORTED_ERROR;
return;
}
int32_t count = prefix.countChar32();
if (count > 2) {
// Prefix too long for ICU4X.
errorCode = U_UNSUPPORTED_ERROR;
return;
}
UChar32 utf32[4];
int32_t len = prefix.toUTF32(utf32, 4, errorCode);
if (len != count) {
errorCode = U_INVALID_STATE_ERROR;
return;
}
UChar32 c = utf32[0];
if (u_getCombiningClass(c)) {
// Prefix must start with a starter for ICU4X.
errorCode = U_UNSUPPORTED_ERROR;
return;
}
// XXX: Korean searchjl has jamo in prefix, so commenting out this
// check for now. ICU4X currently ignores non-root jamo tables anyway.
// searchjl was added in
// https://unicode-org.atlassian.net/browse/CLDR-3560
// Contractions were changed to prefixes in
// https://unicode-org.atlassian.net/browse/CLDR-6546
//
// if ((c >= 0x1100 && c < 0x1200) || (c >= 0xAC00 && c < 0xD7A4)) {
// errorCode = U_UNSUPPORTED_ERROR;
// return;
// }
if ((len > 1) && !(utf32[1] == 0x3099 || utf32[1] == 0x309A)) {
// Second character in prefix, if present, must be a kana voicing mark for ICU4X.
errorCode = U_UNSUPPORTED_ERROR;
return;
}
}
if (s.length() > cLength) {
// Check that there's no modern Hangul in contractions.
for (int32_t i = 0; i < s.length(); ++i) {
UChar c = s.charAt(i);
if ((c >= 0x1100 && c < 0x1100 + 19) || (c >= 0x1161 && c < 0x1161 + 21) || (c >= 0x11A7 && c < 0x11A7 + 28) || (c >= 0xAC00 && c < 0xD7A4)) {
errorCode = U_UNSUPPORTED_ERROR;
return;
}
}
}
}
if(oldCE32 == Collation::FALLBACK_CE32) {
// First tailoring for c.
// If c has contextual base mappings or if we add a contextual mapping,
@ -688,8 +787,11 @@ CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength,
return encodeOneCEAsCE32(0);
} else if(cesLength == 1) {
return encodeOneCE(ces[0], errorCode);
} else if(cesLength == 2) {
} else if(cesLength == 2 && !icu4xMode) {
// Try to encode two CEs as one CE32.
// Turn this off for ICU4X, because without the canonical closure
// these are so rare that it doesn't make sense to spend a branch
// on checking this tag when using the data.
int64_t ce0 = ces[0];
int64_t ce1 = ces[1];
uint32_t p0 = (uint32_t)(ce0 >> 32);
@ -1297,9 +1399,11 @@ CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode)
setDigitTags(errorCode);
setLeadSurrogates(errorCode);
// For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0);
utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
if (!icu4xMode) {
// For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0);
utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
}
utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode);
if(U_FAILURE(errorCode)) { return; }
@ -1454,6 +1558,20 @@ CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode)
// The last suffix character has lccc!=0, allowing for discontiguous contractions.
flags |= Collation::CONTRACT_TRAILING_CCC;
}
if (icu4xMode && (flags & Collation::CONTRACT_HAS_STARTER) == 0) {
for (int32_t i = 0; i < suffix.length();) {
UChar32 c = suffix.char32At(i);
if (!u_getCombiningClass(c)) {
flags |= Collation::CONTRACT_HAS_STARTER;
break;
}
if (c > 0xFFFF) {
i += 2;
} else {
++i;
}
}
}
contractionBuilder.add(suffix, (int32_t)cond->ce32, errorCode);
if(cond == lastCond) { break; }
cond = getConditionalCE32(cond->next);

View file

@ -60,7 +60,7 @@ public:
virtual int64_t modifyCE(int64_t ce) const = 0;
};
CollationDataBuilder(UErrorCode &errorCode);
CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode);
virtual ~CollationDataBuilder();
@ -255,6 +255,7 @@ private:
protected:
UnicodeSet unsafeBackwardSet;
UBool modified;
UBool icu4xMode;
UBool fastLatinEnabled;
CollationFastLatinBuilder *fastLatinBuilder;

View file

@ -27,6 +27,7 @@
#include "ucln_in.h"
#include "udatamem.h"
#include "umutex.h"
#include "umapfile.h"
U_NAMESPACE_BEGIN
@ -47,17 +48,46 @@ static UBool U_CALLCONV uprv_collation_root_cleanup() {
U_CDECL_END
/**
 * Maps the ucadata file at ucadataPath and wraps it in a newly created
 * UDataMemory.
 *
 * @param ucadataPath Path of the file handed to uprv_mapFile().
 * @param errorCode   In/out ICU error code. Set to U_MISSING_RESOURCE_ERROR
 *                    when the file cannot be mapped and to
 *                    U_INVALID_FORMAT_ERROR when the header is rejected.
 * @return The new UDataMemory referring to the mapping, or NULL on failure.
 *         On every failure path after a successful mapping, the mapping is
 *         released again before returning.
 */
UDataMemory*
CollationRoot::loadFromFile(const char* ucadataPath, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return NULL;
    }

    UDataMemory dataMemory;
    if (!uprv_mapFile(&dataMemory, ucadataPath, &errorCode)) {
        errorCode = U_MISSING_RESOURCE_ERROR;
        return NULL;
    }

    // Check the data header magic bytes and let the collation data reader
    // decide whether the data format is acceptable.
    const UBool acceptable =
        dataMemory.pHeader->dataHeader.magic1 == 0xda &&
        dataMemory.pHeader->dataHeader.magic2 == 0x27 &&
        CollationDataReader::isAcceptable(NULL, "icu", "ucadata", &dataMemory.pHeader->info);
    if (!acceptable) {
        // Release the mapping instead of leaking it. udata_close() is used
        // rather than uprv_unmapFile() because it is public API.
        // NOTE(review): relies on udata_close() only unmapping (not freeing)
        // a UDataMemory that was not heap-allocated -- confirm.
        udata_close(&dataMemory);
        errorCode = U_INVALID_FORMAT_ERROR;
        return NULL;
    }

    UDataMemory *rDataMem = UDataMemory_createNewInstance(&errorCode);
    if (U_FAILURE(errorCode)) {
        // Nobody took over the mapping; release it here.
        udata_close(&dataMemory);
        return NULL;
    }

    // Hand the mapping over to the returned object; the stack copy is simply
    // dropped without unmapping.
    rDataMem->pHeader = dataMemory.pHeader;
    rDataMem->mapAddr = dataMemory.mapAddr;
    rDataMem->map = dataMemory.map;
    return rDataMem;
}
void U_CALLCONV
CollationRoot::load(UErrorCode &errorCode) {
CollationRoot::load(const char* ucadataPath, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
LocalPointer<CollationTailoring> t(new CollationTailoring(NULL));
if(t.isNull() || t->isBogus()) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
t->memory = udata_openChoice(U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll",
"icu", "ucadata",
CollationDataReader::isAcceptable, t->version, &errorCode);
t->memory = ucadataPath ? CollationRoot::loadFromFile(ucadataPath, errorCode) :
udata_openChoice(U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll",
"icu", "ucadata",
CollationDataReader::isAcceptable,
t->version, &errorCode);
if(U_FAILURE(errorCode)) { return; }
const uint8_t *inBytes = static_cast<const uint8_t *>(udata_getMemory(t->memory));
CollationDataReader::read(NULL, inBytes, udata_getLength(t->memory), *t, errorCode);
@ -73,14 +103,14 @@ CollationRoot::load(UErrorCode &errorCode) {
const CollationCacheEntry *
CollationRoot::getRootCacheEntry(UErrorCode &errorCode) {
umtx_initOnce(initOnce, CollationRoot::load, errorCode);
umtx_initOnce(initOnce, CollationRoot::load, static_cast<const char*>(NULL), errorCode);
if(U_FAILURE(errorCode)) { return NULL; }
return rootSingleton;
}
const CollationTailoring *
CollationRoot::getRoot(UErrorCode &errorCode) {
umtx_initOnce(initOnce, CollationRoot::load, errorCode);
umtx_initOnce(initOnce, CollationRoot::load, static_cast<const char*>(NULL), errorCode);
if(U_FAILURE(errorCode)) { return NULL; }
return rootSingleton->tailoring;
}
@ -99,6 +129,12 @@ CollationRoot::getSettings(UErrorCode &errorCode) {
return root->settings;
}
// Runs CollationRoot::load() with an explicit ucadata file path, going through
// the same initOnce that getRoot() and getRootCacheEntry() use with a NULL path.
// NOTE(review): presumably only the first call through initOnce has any effect,
// so this has to run before anything else touches the collation root -- confirm.
void
CollationRoot::forceLoadFromFile(const char* ucadataPath, UErrorCode &errorCode) {
umtx_initOnce(initOnce, CollationRoot::load, ucadataPath, errorCode);
}
U_NAMESPACE_END
#endif // !UCONFIG_NO_COLLATION

View file

@ -15,6 +15,7 @@
#define __COLLATIONROOT_H__
#include "unicode/utypes.h"
#include "unicode/udata.h"
#if !UCONFIG_NO_COLLATION
@ -34,9 +35,11 @@ public:
static const CollationTailoring *getRoot(UErrorCode &errorCode);
static const CollationData *getData(UErrorCode &errorCode);
static const CollationSettings *getSettings(UErrorCode &errorCode);
static void U_EXPORT2 forceLoadFromFile(const char* ucadataPath, UErrorCode &errorCode);
private:
static void U_CALLCONV load(UErrorCode &errorCode);
static void U_CALLCONV load(const char* ucadataPath, UErrorCode &errorCode);
static UDataMemory* loadFromFile(const char* ucadataPath, UErrorCode &errorCode);
};
U_NAMESPACE_END

View file

@ -33,6 +33,7 @@
#include "filterrb.h"
#include "reslist.h"
#include "ucmndata.h" /* TODO: for reading the pool bundle */
#include "collationroot.h"
U_NAMESPACE_USE
@ -84,7 +85,9 @@ enum
WRITE_POOL_BUNDLE,
USE_POOL_BUNDLE,
INCLUDE_UNIHAN_COLL,
FILTERDIR
FILTERDIR,
ICU4X_MODE,
UCADATA
};
UOption options[]={
@ -111,6 +114,8 @@ UOption options[]={
UOPTION_DEF("usePoolBundle", '\x01', UOPT_OPTIONAL_ARG),/* 20 */
UOPTION_DEF("includeUnihanColl", '\x01', UOPT_NO_ARG),/* 21 */ /* temporary, don't display in usage info */
UOPTION_DEF("filterDir", '\x01', UOPT_OPTIONAL_ARG), /* 22 */
UOPTION_DEF("icu4xMode", 'X', UOPT_NO_ARG),/* 23 */
UOPTION_DEF("ucadata", '\x01', UOPT_REQUIRES_ARG),/* 24 */
};
static UBool write_java = FALSE;
@ -152,6 +157,10 @@ main(int argc,
fprintf(stderr, "%s: cannot combine --writePoolBundle and --usePoolBundle\n", argv[0]);
illegalArg = TRUE;
}
if (options[ICU4X_MODE].doesOccur && !options[UCADATA].doesOccur) {
fprintf(stderr, "%s: --icu4xMode requires --ucadata\n", argv[0]);
illegalArg = TRUE;
}
if(options[FORMAT_VERSION].doesOccur) {
const char *s = options[FORMAT_VERSION].value;
if(uprv_strlen(s) != 1 || (s[0] < '1' && '3' < s[0])) {
@ -302,6 +311,15 @@ main(int argc,
}
}
if (options[UCADATA].doesOccur) {
#if !UCONFIG_NO_COLLATION
CollationRoot::forceLoadFromFile(options[UCADATA].value, status);
#else
fprintf(stderr, "--ucadata was used with UCONFIG_NO_COLLATION\n");
return status;
#endif
}
initParser();
/*added by Jing*/
@ -656,7 +674,7 @@ processFile(const char *filename, const char *cp,
}
/* Parse the data into an SRBRoot */
data.adoptInstead(parse(ucbuf.getAlias(), inputDir, outputDir, filename,
!omitBinaryCollation, options[NO_COLLATION_RULES].doesOccur, &status));
!omitBinaryCollation, options[NO_COLLATION_RULES].doesOccur, options[ICU4X_MODE].doesOccur, &status));
if (data.isNull() || U_FAILURE(status)) {
fprintf(stderr, "couldn't parse the file %s. Error:%s\n", filename, u_errorName(status));

View file

@ -21,6 +21,8 @@
*/
// Safer use of UnicodeString.
#include <cstdint>
#include <unicode/umachine.h>
#ifndef UNISTR_FROM_CHAR_EXPLICIT
# define UNISTR_FROM_CHAR_EXPLICIT explicit
#endif
@ -42,6 +44,7 @@
#include "reslist.h"
#include "rbt_pars.h"
#include "genrb.h"
#include "unicode/normalizer2.h"
#include "unicode/stringpiece.h"
#include "unicode/unistr.h"
#include "unicode/ustring.h"
@ -59,6 +62,7 @@
#include "collationruleparser.h"
#include "collationtailoring.h"
#include <stdio.h>
#include "writesrc.h"
/* Number of tokens to read ahead of the current stream position */
#define MAX_LOOKAHEAD 3
@ -76,6 +80,9 @@
#define OPENSQBRACKET 0x005B
#define CLOSESQBRACKET 0x005D
#define ICU4X_DIACRITIC_BASE 0x0300
#define ICU4X_DIACRITIC_LIMIT 0x034F
using icu::CharString;
using icu::LocalMemory;
using icu::LocalPointer;
@ -119,6 +126,7 @@ typedef struct {
const char *filename;
UBool makeBinaryCollation;
UBool omitCollationRules;
UBool icu4xMode;
} ParseState;
typedef struct SResource *
@ -764,7 +772,7 @@ GenrbImporter::getRules(
/* Parse the data into an SRBRoot */
LocalPointer<SRBRoot> data(
parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), FALSE, FALSE, &errorCode));
parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), FALSE, FALSE, FALSE, &errorCode));
if (U_FAILURE(errorCode)) {
return;
}
@ -807,6 +815,333 @@ escape(const UChar *s, char *buffer) {
} // namespace
static FILE*
openTOML(const char* outputdir, const char* name, const char* collationType, const char* structType, UErrorCode *status) {
CharString baseName;
baseName.append(name, *status);
baseName.append("_", *status);
baseName.append(collationType, *status);
baseName.append("_", *status);
baseName.append(structType, *status);
CharString outFileName;
if (outputdir && *outputdir) {
outFileName.append(outputdir, *status).ensureEndsWithFileSeparator(*status);
}
outFileName.append(baseName, *status);
outFileName.append(".toml", *status);
if (U_FAILURE(*status)) {
return NULL;
}
FILE* f = fopen(outFileName.data(), "w");
if (!f) {
*status = U_FILE_ACCESS_ERROR;
return NULL;
}
usrc_writeFileNameGeneratedBy(f, "#", baseName.data(), "genrb -X");
return f;
}
/**
 * Writes the "meta" TOML file, which holds a single "bits" entry with
 * metadataBits printed as upper-case hexadecimal.
 * Does nothing beyond what openTOML() reports when the file cannot be opened.
 */
static void
writeCollationMetadataTOML(const char* outputdir, const char* name, const char* collationType, const uint32_t metadataBits, UErrorCode *status) {
    FILE* out = openTOML(outputdir, name, collationType, "meta", status);
    if (out != NULL) {
        fprintf(out, "bits = 0x%X\n", metadataBits);
        fclose(out);
    }
}
/**
 * Writes the "dia" TOML file: one 16-bit secondary weight per code point,
 * starting at ICU4X_DIACRITIC_BASE.
 *
 * The table stops at the first code point whose CE32 is not simple/long, or
 * whose CE differs from COMMON_TERTIARY_CE in anything but the secondary
 * weight. For the bundle named "root" a non-simple CE32 is a hard error.
 *
 * @return The exclusive upper limit of the written table
 *         (ICU4X_DIACRITIC_LIMIT when nothing cut it short, when the file
 *         could not be opened, or on the root error path).
 */
static UChar32
writeCollationDiacriticsTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
    UChar32 limit = ICU4X_DIACRITIC_LIMIT;
    FILE* out = openTOML(outputdir, name, collationType, "dia", status);
    if (out == NULL) {
        return limit;
    }

    uint16_t secondaries[ICU4X_DIACRITIC_LIMIT-ICU4X_DIACRITIC_BASE];
    for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
        // Look the character up in this data, falling back to the base data.
        uint32_t ce32 = data->getCE32(c);
        if (ce32 == icu::Collation::FALLBACK_CE32) {
            ce32 = data->base->getCE32(c);
        }

        uint16_t secondary = 0;
        // These four never occur in NFD data; they keep a zero entry.
        const UBool neverInNfd = (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344);
        if (!neverInNfd) {
            if (!icu::Collation::isSimpleOrLongCE32(ce32)) {
                if (uprv_strcmp(name, "root") == 0) {
                    printf("UNSUPPORTED DIACRITIC CE32 in root: TAG: %X CE32: %X char: %X\n", icu::Collation::tagFromCE32(ce32), ce32, c);
                    fclose(out);
                    *status = U_INTERNAL_PROGRAM_ERROR;
                    return limit;
                }
                limit = c;
                break;
            }
            const uint64_t ce = uint64_t(icu::Collation::ceFromCE32(ce32));
            if ((ce & 0xFFFFFFFF0000FFFF) != uint64_t(icu::Collation::COMMON_TERTIARY_CE)) {
                // Something other than the secondary weight deviates from the
                // expected pattern, so the table ends here.
                limit = c;
                break;
            }
            secondary = uint16_t(ce >> 16);
        }
        secondaries[c - ICU4X_DIACRITIC_BASE] = secondary;
    }

    usrc_writeArray(out, "secondaries = [\n ", secondaries, 16, limit-ICU4X_DIACRITIC_BASE, " ", "\n]\n");
    fclose(out);
    return limit;
}
/**
 * Writes the "reord" TOML file from the given settings: minHighNoReorder,
 * the 256-entry reorderTable (8-bit values) and the reorderRanges array
 * (32-bit values, reorderRangesLength entries).
 */
static void
writeCollationReorderingTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationSettings* settings, UErrorCode *status) {
    FILE* out = openTOML(outputdir, name, collationType, "reord", status);
    if (out == NULL) {
        return;
    }

    fprintf(out, "min_high_no_reorder = 0x%X\n", settings->minHighNoReorder);
    usrc_writeArray(out, "reorder_table = [\n ", settings->reorderTable, 8, 256, " ", "\n]\n");
    usrc_writeArray(out, "reorder_ranges = [\n ", settings->reorderRanges, 32, settings->reorderRangesLength, " ", "\n]\n");

    fclose(out);
}
/**
 * Writes the "jamo" TOML file: the CE32 of every code point in
 * U+1100..U+11FF, taken from data or, where data says FALLBACK_CE32,
 * from data->base.
 */
static void
writeCollationJamoTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
    FILE* out = openTOML(outputdir, name, collationType, "jamo", status);
    if (out == NULL) {
        printf("writeCollationJamoTOML FAILED TO OPEN FILE %s %s\n", name, collationType);
        return;
    }

    const UChar32 jamoBase = 0x1100;
    const UChar32 jamoLimit = 0x1200;
    uint32_t jamo[jamoLimit - jamoBase];
    for (int32_t i = 0; i < jamoLimit - jamoBase; ++i) {
        const UChar32 c = jamoBase + i;
        uint32_t ce32 = data->getCE32(c);
        if (ce32 == icu::Collation::FALLBACK_CE32) {
            ce32 = data->base->getCE32(c);
        }
        // Complex CE32s cannot be rejected here, because search collations
        // have expansions. Those expansions refer to the tailoring, which
        // foils the reuse of these jamo tables.
        // XXX Figure out what to do. Perhaps instead of having Latin mini
        // expansions, there should be Hangul mini expansions.
        // XXX In any case, validate that modern jamo are self-contained.
        jamo[i] = ce32;
    }

    usrc_writeArray(out, "ce32s = [\n ", jamo, 32, jamoLimit - jamoBase, " ", "\n]\n");
    fclose(out);
}
/**
 * Range callback passed to utrie2_enum(): copies [start, end] with its value
 * into the UMutableCPTrie passed as context, except for ranges that lie
 * entirely inside U+1100..U+11FF, which are skipped.
 *
 * @return TRUE for a skipped range; otherwise whether setRange succeeded.
 *         NOTE(review): presumably TRUE tells utrie2_enum() to keep
 *         enumerating -- confirm against the UTrie2EnumRange contract.
 */
static UBool
convertTrie(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    const UBool startIsJamo = (0x1100 <= start && start < 0x1200);
    const UBool endIsJamo = (0x1100 <= end && end < 0x1200);
    if (startIsJamo && endIsJamo) {
        // Range entirely in the conjoining jamo block.
        return TRUE;
    }

    UMutableCPTrie *target =
        const_cast<UMutableCPTrie *>(static_cast<const UMutableCPTrie *>(context));
    icu::IcuToolErrorCode status("genrb: convertTrie");
    umutablecptrie_setRange(target, start, end, value, status);
    return U_SUCCESS(*status);
}
/**
 * Writes the "data" TOML file: the contexts, ce32s and ces arrays of data,
 * followed by a [trie] table holding a small-type, 32-bit-value UCPTrie
 * built from data->trie.
 *
 * @param data            Collation data to export.
 * @param root            TRUE selects UNASSIGNED_CE32 as the trie default,
 *                        FALSE selects FALLBACK_CE32.
 * @param diacriticLimit  Exclusive limit of the separately written diacritic
 *                        table. Code points from ICU4X_DIACRITIC_BASE up to
 *                        this limit are reset to the trie default.
 * @param status          In/out ICU error code. If building the trie fails,
 *                        the file is closed and nothing further is written.
 */
static void
writeCollationDataTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UBool root, UChar32 diacriticLimit, UErrorCode *status) {
    FILE* f = openTOML(outputdir, name, collationType, "data", status);
    if (!f) {
        return;
    }

    // Use the same value for out-of-range and default in the hope of not having to allocate
    // different blocks, since ICU4X never does out-of-range queries.
    uint32_t trieDefault = root ? icu::Collation::UNASSIGNED_CE32 : icu::Collation::FALLBACK_CE32;
    icu::LocalUMutableCPTriePointer builder(umutablecptrie_open(trieDefault, trieDefault, status));
    if (U_FAILURE(*status)) {
        // Without a builder, the enumeration callback would have no trie to write into.
        fclose(f);
        return;
    }

    utrie2_enum(data->trie, NULL, &convertTrie, builder.getAlias());

    // If the diacritic table was cut short, copy CE32s between the lowered
    // limit and the max limit from the root to the tailoring. As of June 2022,
    // no collation in CLDR needs this.
    for (UChar32 c = diacriticLimit; c < ICU4X_DIACRITIC_LIMIT; ++c) {
        if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
            // These never occur in NFD data.
            continue;
        }
        uint32_t ce32 = data->getCE32(c);
        // There is nothing to copy from when there is no base (root data).
        if (ce32 == icu::Collation::FALLBACK_CE32 && data->base != NULL) {
            ce32 = data->base->getCE32(c);
            umutablecptrie_set(builder.getAlias(), c, ce32, status);
        }
    }

    // Ensure that the range covered by the diacritic table isn't duplicated
    // in the trie.
    for (UChar32 c = ICU4X_DIACRITIC_BASE; c < diacriticLimit; ++c) {
        if (umutablecptrie_get(builder.getAlias(), c) != trieDefault) {
            umutablecptrie_set(builder.getAlias(), c, trieDefault, status);
        }
    }

    icu::LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
        builder.getAlias(),
        UCPTRIE_TYPE_SMALL,
        UCPTRIE_VALUE_BITS_32,
        status));
    if (U_FAILURE(*status)) {
        // Do not hand a missing trie to the writer; report the failure instead.
        fclose(f);
        return;
    }

    usrc_writeArray(f, "contexts = [\n ", data->contexts, 16, data->contextsLength, " ", "\n]\n");
    usrc_writeArray(f, "ce32s = [\n ", data->ce32s, 32, data->ce32sLength, " ", "\n]\n");
    usrc_writeArray(f, "ces = [\n ", data->ces, 64, data->cesLength, " ", "\n]\n");
    fprintf(f, "[trie]\n");
    usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
    fclose(f);
}
/**
 * Writes the "prim" TOML file with two entries:
 *  - last_primaries: for each of the four groups starting at
 *    UCOL_REORDER_CODE_FIRST, the upper 16 bits of
 *    (getLastPrimaryForGroup() + 1);
 *  - numeric_primary: the top byte of data->numericPrimary.
 *
 * Sets *status to U_INTERNAL_PROGRAM_ERROR, and writes neither entry, when
 * any of the lower 24 bits of the numeric primary are set.
 */
static void
writeCollationSpecialPrimariesTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
    FILE* f = openTOML(outputdir, name, collationType, "prim", status);
    if (!f) {
        return;
    }

    uint16_t lastPrimaries[4];
    for (int32_t i = 0; i < 4; ++i) {
        // getLastPrimaryForGroup subtracts one from a 16-bit value, so we add one
        // back to get a value that fits in 16 bits.
        lastPrimaries[i] = (uint16_t)((data->getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) + 1) >> 16);
    }

    uint32_t numericPrimary = data->numericPrimary;
    if (numericPrimary & 0xFFFFFF) {
        printf("Lower 24 bits set in numeric primary");
        *status = U_INTERNAL_PROGRAM_ERROR;
        // Close the stream on this early exit too, so it is not leaked.
        fclose(f);
        return;
    }

    usrc_writeArray(f, "last_primaries = [\n ", lastPrimaries, 16, 4, " ", "\n]\n");
    fprintf(f, "numeric_primary = 0x%X\n", numericPrimary >> 24);
    fclose(f);
}
/**
 * Exports one collation (locale `name`, type `collationType`) for ICU4X as a
 * group of TOML files and finishes with a metadata file summarising what was
 * written.
 *
 * Depending on the input, the following writers are invoked, in this order:
 *  - root without a base: diacritics, jamo and special primaries;
 *  - tailoring with a base (except "lt"): diacritics, but only if some code
 *    point in [ICU4X_DIACRITIC_BASE, ICU4X_DIACRITIC_LIMIT) has a CE32 that
 *    is neither the fallback marker nor equal to the base CE32;
 *  - reordering, when the settings carry one;
 *  - collation data, when there is a base or the name is "root";
 *  - metadata, always (unless an earlier step failed).
 *
 * Metadata bit layout: bits 0..2 max variable, 3 tailored, 4 tailored
 * diacritics, 5 reordering, 6 Lithuanian dot above, 7 backward secondary,
 * 8 alternate shifted, 9 case first enabled, 10 upper first.
 *
 * @param status set to U_INTERNAL_PROGRAM_ERROR when max variable is out of
 *               range or the case-first setting is unexpected; any failure
 *               reported by a writer stops the export immediately.
 */
static void
writeCollationTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, const icu::CollationSettings* settings, UErrorCode *status) {
    const UBool isLithuanian = (uprv_strcmp(name, "lt") == 0);
    const UBool nameIsRoot = (uprv_strcmp(name, "root") == 0);
    const UBool standaloneRoot = (!data->base && nameIsRoot);

    UBool hasTailoredDiacritics = FALSE;
    UChar32 diacriticLimit = ICU4X_DIACRITIC_LIMIT;

    if (standaloneRoot) {
        diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
        if (U_FAILURE(*status)) {
            return;
        }
        writeCollationJamoTOML(outputdir, name, collationType, data, status);
        if (U_FAILURE(*status)) {
            return;
        }
        writeCollationSpecialPrimariesTOML(outputdir, name, collationType, data, status);
        if (U_FAILURE(*status)) {
            return;
        }
    } else if (data->base && !isLithuanian) {
        // Scan the diacritic range until the first tailored code point is found.
        for (UChar32 c = ICU4X_DIACRITIC_BASE; !hasTailoredDiacritics && c < ICU4X_DIACRITIC_LIMIT; ++c) {
            const UBool absentFromNfd = (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344);
            if (absentFromNfd) {
                // These never occur in NFD data.
                continue;
            }
            const uint32_t ce32 = data->getCE32(c);
            if (ce32 == icu::Collation::FALLBACK_CE32 || ce32 == data->base->getCE32(c)) {
                continue;
            }
            hasTailoredDiacritics = TRUE;
        }
        if (hasTailoredDiacritics) {
            diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
            if (U_FAILURE(*status)) {
                return;
            }
        }
    }

    const UBool hasReordering = settings->hasReordering();
    if (hasReordering) {
        // Note: There are duplicate reorderings. Expecting the ICU4X provider
        // to take care of deduplication.
        writeCollationReorderingTOML(outputdir, name, collationType, settings, status);
        if (U_FAILURE(*status)) {
            return;
        }
    }

    // Collation data is written when there is a base or when the name is root.
    // Languages that only reorder scripts are otherwise root-like and have a
    // null base, so they get no data file and are not marked as tailored.
    UBool isTailored = FALSE;
    if (data->base || nameIsRoot) {
        isTailored = !nameIsRoot;
        writeCollationDataTOML(outputdir, name, collationType, data, standaloneRoot, diacriticLimit, status);
        if (U_FAILURE(*status)) {
            return;
        }
    }

    const uint32_t maxVariable = static_cast<uint32_t>(settings->getMaxVariable());
    if (maxVariable >= 4) {
        printf("Max variable out of range");
        *status = U_INTERNAL_PROGRAM_ERROR;
        return;
    }

    uint32_t metadataBits = maxVariable;
    metadataBits |= uint32_t(isTailored ? 1 : 0) << 3;
    metadataBits |= uint32_t(hasTailoredDiacritics ? 1 : 0) << 4;
    metadataBits |= uint32_t(hasReordering ? 1 : 0) << 5;
    metadataBits |= uint32_t(isLithuanian ? 1 : 0) << 6;
    metadataBits |= uint32_t((settings->options & icu::CollationSettings::BACKWARD_SECONDARY) != 0 ? 1 : 0) << 7;
    metadataBits |= uint32_t(settings->getAlternateHandling() == UCOL_SHIFTED ? 1 : 0) << 8;

    const auto caseFirst = settings->getCaseFirst();
    if (caseFirst == UCOL_UPPER_FIRST) {
        metadataBits |= (1 << 9) | (1 << 10);
    } else if (caseFirst == UCOL_LOWER_FIRST) {
        metadataBits |= (1 << 9);
    } else if (caseFirst != UCOL_OFF) {
        *status = U_INTERNAL_PROGRAM_ERROR;
        return;
    }

    writeCollationMetadataTOML(outputdir, name, collationType, metadataBits, status);
}
#endif // !UCONFIG_NO_COLLATION
static TableResource *
@ -952,9 +1287,9 @@ addCollation(ParseState* state, TableResource *result, const char *collationTyp
res_close(result);
return NULL; // TODO: use LocalUResourceBundlePointer for result
}
icu::CollationBuilder builder(base, intStatus);
if(uprv_strncmp(collationType, "search", 6) == 0) {
builder.disableFastLatin(); // build fast-Latin table unless search collator
icu::CollationBuilder builder(base, state->icu4xMode, intStatus);
if(state->icu4xMode || (uprv_strncmp(collationType, "search", 6) == 0)) {
builder.disableFastLatin(); // build fast-Latin table unless search collator or ICU4X
}
LocalPointer<icu::CollationTailoring> t(
builder.parseAndBuild(rules, version, &importer, &parseError, intStatus));
@ -977,6 +1312,19 @@ addCollation(ParseState* state, TableResource *result, const char *collationTyp
return NULL;
}
}
if (state->icu4xMode) {
    // Export the collation for ICU4X under the input file's name with its
    // extension removed. Work on a private copy because the name is edited
    // in place.
    char *nameWithoutSuffix = static_cast<char *>(uprv_malloc(uprv_strlen(state->filename) + 1));
    if (nameWithoutSuffix == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        res_close(result);
        return NULL;
    }
    uprv_strcpy(nameWithoutSuffix, state->filename);
    // A file name without any '.' has no suffix to strip; use it unchanged
    // instead of writing through a NULL pointer.
    char *suffix = uprv_strrchr(nameWithoutSuffix, '.');
    if (suffix != NULL) {
        *suffix = 0;
    }
    writeCollationTOML(state->outputdir, nameWithoutSuffix, collationType, t->data, t->settings, status);
    uprv_free(nameWithoutSuffix);
}
icu::LocalMemory<uint8_t> buffer;
int32_t capacity = 100000;
uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
@ -1966,7 +2314,7 @@ parseResource(ParseState* state, char *tag, const struct UString *comment, UErro
/* parse the top-level resource */
struct SRBRoot *
parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *filename,
UBool makeBinaryCollation, UBool omitCollationRules, UErrorCode *status)
UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status)
{
struct UString *tokenValue;
struct UString comment;
@ -1992,6 +2340,7 @@ parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *fi
state.filename = filename;
state.makeBinaryCollation = makeBinaryCollation;
state.omitCollationRules = omitCollationRules;
state.icu4xMode = icu4xMode;
ustr_init(&comment);
expect(&state, TOK_STRING, &tokenValue, &comment, NULL, status);

View file

@ -31,7 +31,7 @@ void initParser();
/* Parse a ResourceBundle text file */
struct SRBRoot* parse(UCHARBUF *buf, const char* inputDir, const char* outputDir,
const char *filename,
UBool makeBinaryCollation, UBool omitCollationRules, UErrorCode *status);
UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status);
U_CDECL_END

View file

@ -1,7 +1,15 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <unicode/localpointer.h>
#include <unicode/umachine.h>
#include <unicode/unistr.h>
#include <unicode/urename.h>
#include <unicode/uset.h>
#include <vector>
#include <algorithm>
#include "toolutil.h"
@ -15,7 +23,10 @@
#include "unicode/uscript.h"
#include "unicode/putil.h"
#include "unicode/umutablecptrie.h"
#include "unicode/ucharstriebuilder.h"
#include "ucase.h"
#include "unicode/normalizer2.h"
#include "normalizer2impl.h"
#include "writesrc.h"
U_NAMESPACE_USE
@ -299,6 +310,470 @@ FILE* prepareOutputFile(const char* basename) {
return f;
}
#if !UCONFIG_NO_NORMALIZATION
// One trie entry produced by computeDecompositions() and inserted into the
// code point trie later by writeDecompositionData(). Instances are created
// with brace initialisation, so the member order must not change.
struct PendingDescriptor {
// Code point whose trie value is being set.
UChar32 scalar;
// Trie value for `scalar`. When the upper 16 bits are zero, the low 12 bits
// are a storage offset that writeDecompositionData() still adjusts.
uint32_t descriptor;
// TRUE if the decomposition contains a scalar above U+FFFF, in which case it
// was stored in the 32-bit storage rather than the 16-bit storage.
UBool supplementary;
};
/**
 * Writes the "compositions" output file: a UCharsTrie, serialised as an array
 * of 16-bit units, that maps (second character, starter) to the composite
 * for every code point whose raw NFC decomposition is exactly two scalars
 * that compose back into it. Hangul syllables are left out of the trie.
 *
 * @param backwardCombiningStarters receives every second character of such a
 *        pair whose combining class is zero (Hangul pairs included).
 */
void writeCanonicalCompositions(USet* backwardCombiningStarters) {
    IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions");
    const char* basename = "compositions";
    FILE* f = prepareOutputFile(basename);

    LocalPointer<UCharsTrieBuilder> backwardBuilder(new UCharsTrieBuilder(status), status);

    const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
    UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];

    const Normalizer2* nfc = Normalizer2::getNFCInstance(status);
    for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
        const bool isSurrogate = (0xD800 <= c && c <= 0xDFFF);
        if (isSurrogate) {
            continue;
        }
        UnicodeString rawDecomposition;
        if (!nfc->getRawDecomposition(c, rawDecomposition)) {
            continue;
        }
        // Only decompositions consisting of exactly two scalars can be pairs.
        if (rawDecomposition.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status) != 2) {
            continue;
        }
        const UChar32 first = utf32[0];
        const UChar32 trailing = utf32[1];
        const UChar32 recomposed = nfc->composePair(first, trailing);
        if (recomposed < 0) {
            // The pair does not compose.
            continue;
        }
        if (recomposed != c) {
            // The pair must compose back to the character it came from.
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }
        if (u_getCombiningClass(trailing) == 0) {
            uset_add(backwardCombiningStarters, trailing);
        }
        const bool isHangulSyllable = (0xAC00 <= recomposed && recomposed <= 0xD7A3);
        if (!isHangulSyllable) {
            // The trie key is the pair in reverse order: second, then starter.
            UnicodeString reversedPair;
            reversedPair.append(trailing).append(first);
            backwardBuilder->add(reversedPair, int32_t(recomposed), status);
        }
    }

    UnicodeString canonicalCompositionTrie;
    backwardBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, canonicalCompositionTrie, status);

    usrc_writeArray(f, "compositions = [\n  ", canonicalCompositionTrie.getBuffer(), 16, canonicalCompositionTrie.length(), "  ", "\n]\n");
    fclose(f);
    handleError(status, basename);
}
/**
 * Writes the output file `basename` containing two arrays: "scalars16" with
 * `len16` 16-bit values read from `ptr16`, followed by "scalars32" with
 * `len32` 32-bit values read from `ptr32`.
 */
void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) {
    // Both arrays share the same line indentation and closing bracket.
    const char* const lineIndent = "  ";
    const char* const arrayEnd = "\n]\n";

    FILE* out = prepareOutputFile(basename);
    usrc_writeArray(out, "scalars16 = [\n  ", ptr16, 16, len16, lineIndent, arrayEnd);
    usrc_writeArray(out, "scalars32 = [\n  ", ptr32, 32, len32, lineIndent, arrayEnd);
    fclose(out);
}
/**
 * Writes the output file `basename` with decomposition data: either a code
 * point set or a flags value, followed by a code point trie with 32-bit
 * values holding the decomposition descriptors.
 *
 * @param basename              output name, also used in error reports
 * @param baseSize16            length of the base 16-bit storage
 * @param baseSize32            length of the base 32-bit storage
 * @param supplementSize16      length of the supplementary 16-bit storage
 * @param uset                  set computed for this normalization form
 * @param reference             null to write `uset` itself; otherwise the
 *                              set to compare `uset` against, in which case
 *                              only a flags value is written:
 *                              bit 0 = `uset` additionally contains exactly
 *                              U+FF9E and U+FF9F, bit 1 = `uset` lacks
 *                              exactly U+0345
 * @param pendingTrieInsertions trie entries; storage offsets in them are
 *                              rebased here onto the concatenation
 *                              scalars16, scalars32, supplementary scalars16,
 *                              supplementary scalars32
 */
void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions) {
    IcuToolErrorCode status("icuexportdata: writeDecompositionData");
    FILE* f = prepareOutputFile(basename);

    // Zero is a magic number that means the character decomposes to itself.
    LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));

    // Iterate backwards to insert lower code points in the trie first in case it matters
    // for trie block allocation.
    for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) {
        const PendingDescriptor& pending = pendingTrieInsertions[i];
        uint32_t additional = 0;
        // Only descriptors with an empty upper half carry a storage offset.
        if (!(pending.descriptor & 0xFFFF0000)) {
            uint32_t offset = pending.descriptor & 0xFFF;
            if (!pending.supplementary) {
                if (offset >= baseSize16) {
                    // This is a offset to supplementary 16-bit data. We have
                    // 16-bit base data and 32-bit base data before. However,
                    // the 16-bit base data length is already part of offset.
                    additional = baseSize32;
                }
            } else {
                if (offset >= baseSize32) {
                    // This is an offset to supplementary 32-bit data. We have 16-bit
                    // base data, 32-bit base data, and 16-bit supplementary data before.
                    // However, the 32-bit base data length is already part
                    // of offset.
                    additional = baseSize16 + supplementSize16;
                } else {
                    // This is an offset to 32-bit base data. We have 16-bit
                    // base data before.
                    additional = baseSize16;
                }
            }
            if (offset + additional > 0xFFF) {
                // The rebased offset no longer fits in the 12-bit field.
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
        }
        umutablecptrie_set(builder.getAlias(), pending.scalar, pending.descriptor + additional, status);
    }
    LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
        builder.getAlias(),
        trieType,
        UCPTRIE_VALUE_BITS_32,
        status));
    handleError(status, basename);

    if (!reference) {
        usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML);
    } else {
        if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) {
            // NFD expectations don't hold. The set must not contain the half-width
            // kana voicing marks and must contain iota subscript.
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }

        USet* halfWidthVoicing = uset_openEmpty();
        uset_add(halfWidthVoicing, 0xFF9E);
        uset_add(halfWidthVoicing, 0xFF9F);

        USet* iotaSubscript = uset_openEmpty();
        uset_add(iotaSubscript, 0x0345);

        uint8_t flags = 0;

        // Code points in uset that are missing from reference.
        USet* halfWidthCheck = uset_cloneAsThawed(uset);
        uset_removeAll(halfWidthCheck, reference);
        if (uset_equals(halfWidthCheck, halfWidthVoicing)) {
            flags |= 1;
        } else if (!uset_isEmpty(halfWidthCheck)) {
            // The result was neither empty nor contained exactly
            // the two half-width voicing marks. The ICU4X
            // normalizer doesn't know how to deal with this case.
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }
        uset_close(halfWidthCheck);

        // Code points in reference that are missing from uset.
        USet* iotaCheck = uset_cloneAsThawed(reference);
        uset_removeAll(iotaCheck, uset);
        if (uset_equals(iotaCheck, iotaSubscript)) {
            flags |= (1 << 1);
        } else if (!uset_isEmpty(iotaCheck)) {
            // The result was neither empty nor contained exactly
            // the iota subscript. The ICU4X normalizer doesn't
            // know how to deal with this case.
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }
        // Close iotaCheck here; halfWidthCheck was already closed above and
        // must not be closed a second time.
        uset_close(iotaCheck);

        uset_close(iotaSubscript);
        uset_close(halfWidthVoicing);

        fprintf(f, "flags = 0x%X\n", flags);
    }
    fprintf(f, "[trie]\n");
    usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
    fclose(f);
    handleError(status, basename);
}
void writePotentialCompositionPassThrough(const char* basename, const Normalizer2* norm, const USet* decompositionStartsWithNonStarter, const USet* decompositionStartsWithBackwardCombiningStarter, USet* potentialPassthroughAndNotBackwardCombining) {
IcuToolErrorCode status("icuexportdata: writePotentialCompositionPassThrough");
FILE* f = prepareOutputFile(basename);
const Normalizer2* nfc = nullptr;
if (!norm) {
// UTS 46 case
norm = Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, status);
nfc = Normalizer2::getNFCInstance(status);
}
for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
if (c >= 0xD800 && c < 0xE000) {
// Surrogate
continue;
}
if (uset_contains(decompositionStartsWithNonStarter, c) || uset_contains(decompositionStartsWithBackwardCombiningStarter, c)) {
continue;
}
UnicodeString src;
UnicodeString dst;
src.append(c);
norm->normalize(src, dst, status);
if (nfc && (dst.isEmpty() || (dst == u"\uFFFD" && c != 0xFFFD))) {
// UTS 46 ignored and disallowed fall back to NFC for data
// overlap.
dst.truncate(0);
nfc->normalize(src, dst, status);
}
if (src == dst) {
uset_add(potentialPassthroughAndNotBackwardCombining, c);
}
}
// The surrogate range forms a useless discontinuity. The code
// that reads from the set never looks up by surrage, so let's
// put the surrogate range in the set as a micro-optimization.
uset_addRange(potentialPassthroughAndNotBackwardCombining, 0xD800, 0xDFFF);
usrc_writeUnicodeSet(f, potentialPassthroughAndNotBackwardCombining, UPRV_TARGET_SYNTAX_TOML);
fclose(f);
handleError(status, basename);
}
// Computes data for canonical decompositions
void computeDecompositions(const char* basename, const USet* backwardCombiningStarters, std::vector<uint16_t>& storage16, std::vector<uint32_t>& storage32, USet* decompositionStartsWithNonStarter, USet* decompositionStartsWithBackwardCombiningStarter, std::vector<PendingDescriptor>& pendingTrieInsertions) {
IcuToolErrorCode status("icuexportdata: computeDecompositions");
const Normalizer2* mainNormalizer;
const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status);
if (uprv_strcmp(basename, "nfkd") == 0) {
mainNormalizer = Normalizer2::getNFKDInstance(status);
} else if (uprv_strcmp(basename, "uts46d") == 0) {
mainNormalizer = Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, status);
} else {
mainNormalizer = nfdNormalizer;
}
// Max length as of Unicode 14 is 4 for NFD. For NFKD the max
// is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB).
const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9;
const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8;
const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
// Iterate over all scalar values excluding Hangul syllables.
//
// We go backwards in order to better find overlapping decompositions.
//
// As of Unicode 14:
// Iterate forward without overlap search:
// nfd: 16 size: 896, 32 size: 173
// nfkd: 16 size: 3854, 32 size: 179
//
// Iterate forward with overlap search:
// nfd: 16 size: 888, 32 size: 173
// nfkd: 16 size: 3266, 32 size: 179
//
// Iterate backward with overlap search:
// nfd: 16 size: 776, 32 size: 173
// nfkd: 16 size: 2941, 32 size: 179
//
// UChar32 is signed!
for (UChar32 c = 0x10FFFF; c >= 0; --c) {
if (c >= 0xAC00 && c <= 0xD7A3) {
// Hangul syllable
continue;
}
if (c >= 0xD800 && c < 0xE000) {
// Surrogate
continue;
}
UnicodeString src;
UnicodeString dst;
src.append(c);
if (mainNormalizer != nfdNormalizer) {
UnicodeString inter;
mainNormalizer->normalize(src, inter, status);
nfdNormalizer->normalize(inter, dst, status);
} else {
nfdNormalizer->normalize(src, dst, status);
}
int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
// Characters that normalize to nothing or to U+FFFD (without the
// input being U+FFFD) in ICU4C's UTS 46 normalization normalize
// as in NFD in ICU4X's UTF 46 normalization in the interest
// of data size and ICU4X's normalizer being unable to handle
// normalizing to nothing.
// When UTS 46 is implemented on top of ICU4X, a preprocessing
// step is supposed to remove these characters before the
// normalization step.
if (uprv_strcmp(basename, "uts46d") != 0) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
nfdNormalizer->normalize(src, dst, status);
len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
}
if (len > DECOMPOSITION_BUFFER_SIZE) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
bool startsWithNonStarter = u_getCombiningClass(utf32[0]);
if (startsWithNonStarter) {
uset_add(decompositionStartsWithNonStarter, c);
} else if (uset_contains(backwardCombiningStarters, c)) {
uset_add(decompositionStartsWithBackwardCombiningStarter, c);
}
if (mainNormalizer != nfdNormalizer) {
UnicodeString nfd;
nfdNormalizer->normalize(src, nfd, status);
if (dst == nfd) {
continue;
}
} else {
if (src == dst) {
continue;
}
}
if (startsWithNonStarter && !(c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F)) {
// A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
if (len == 1 && utf32[0] <= 0xFFFF) {
if (utf32[0] == 1) {
// 1 is reserved as a marker for the expansion of U+FDFA.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, FALSE});
} else if (len == 2 && utf32[0] <= 0xFFFF && utf32[1] <= 0xFFFF && !u_getCombiningClass(utf32[0]) && u_getCombiningClass(utf32[1])) {
pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), FALSE});
} else {
UBool supplementary = FALSE;
UBool nonInitialStarter = FALSE;
for (int32_t i = 0; i < len; ++i) {
if (utf32[i] > 0xFFFF) {
supplementary = TRUE;
}
if (utf32[i] == 0) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
if (i != 0 && !u_getCombiningClass(utf32[i])) {
nonInitialStarter = TRUE;
}
}
if (!supplementary) {
if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) {
if (len == 18 && c == 0xFDFA) {
// Special marker for the one character whose decomposition
// is too long.
pendingTrieInsertions.push_back({c, 1 << 16, supplementary});
continue;
} else {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
}
} else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
// Complex decomposition
// Format for 16-bit value:
// 15..13: length minus two for 16-bit case and length minus one for
// the 32-bit case. Length 8 needs to fit in three bits in
// the 16-bit case, and this way the value is future-proofed
// up to 9 in the 16-bit case. Zero is unused and length one
// in the 16-bit case goes directly into the trie.
// 12: 1 if all trailing characters are guaranteed non-starters,
// 0 if no guarantees about non-starterness.
// Note: The bit choice is this way around to allow for
// dynamically falling back to not having this but instead
// having one more bit for length by merely choosing
// different masks.
// 11..0: Start offset in storage. The offset is to the logical
// sequence of scalars16, scalars32, supplementary_scalars16,
// supplementary_scalars32.
uint32_t descriptor = uint32_t(!nonInitialStarter) << 12;
if (!supplementary) {
descriptor |= (uint32_t(len) - 2) << 13;
} else {
descriptor |= (uint32_t(len) - 1) << 13;
}
if (descriptor & 0xFFF) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
size_t index = 0;
bool writeToStorage = FALSE;
// Sadly, C++ lacks break and continue by label, so using goto in the
// inner loops to break or continue the outer loop.
if (!supplementary) {
outer16: for (;;) {
if (index == storage16.size()) {
writeToStorage = TRUE;
break;
}
if (storage16[index] == utf32[0]) {
for (int32_t i = 1; i < len; ++i) {
if (storage16[index + i] != uint32_t(utf32[i])) {
++index;
// continue outer
goto outer16;
}
}
// break outer
goto after;
}
++index;
}
} else {
outer32: for (;;) {
if (index == storage32.size()) {
writeToStorage = TRUE;
break;
}
if (storage32[index] == uint32_t(utf32[0])) {
for (int32_t i = 1; i < len; ++i) {
if (storage32[index + i] != uint32_t(utf32[i])) {
++index;
// continue outer
goto outer32;
}
}
// break outer
goto after;
}
++index;
}
}
after:
if (index > 0xFFF) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
descriptor |= uint32_t(index);
if (!descriptor || descriptor > 0xFFFF) {
// > 0xFFFF should never happen if the code above is correct.
// == 0 should not happen due to the nature of the data.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
if (writeToStorage) {
if (!supplementary) {
for (int32_t i = 0; i < len; ++i) {
storage16.push_back(uint16_t(utf32[i]));
}
} else {
for (int32_t i = 0; i < len; ++i) {
storage32.push_back(uint32_t(utf32[i]));
}
}
}
pendingTrieInsertions.push_back({c, descriptor, supplementary});
}
}
if (storage16.size() + storage32.size() > 0xFFF) {
status.set(U_INTERNAL_PROGRAM_ERROR);
}
handleError(status, basename);
}
#endif // !UCONFIG_NO_NORMALIZATION
enum {
OPT_HELP_H,
OPT_HELP_QUESTION_MARK,
@ -341,7 +816,7 @@ void printHelp(FILE* stdfile, const char* program) {
"options:\n"
"\t-h or -? or --help this usage text\n"
"\t-V or --version show a version message\n"
"\t-m or --mode mode: currently only 'uprops' and 'ucase', but more may be added\n"
"\t-m or --mode mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n"
"\t --trie-type set the trie type (small or fast, default small)\n"
"\t-d or --destdir destination directory, followed by the path\n"
"\t --all write out all properties known to icuexportdata\n"
@ -387,6 +862,46 @@ int exportUprops(int argc, char* argv[]) {
}
}
if (propNames.empty()
|| options[OPT_HELP_H].doesOccur
|| options[OPT_HELP_QUESTION_MARK].doesOccur
|| !options[OPT_MODE].doesOccur) {
FILE *stdfile=argc<0 ? stderr : stdout;
fprintf(stdfile,
"usage: %s -m uprops [-options] [--all | properties...]\n"
"\tdump Unicode property data to .toml files\n"
"options:\n"
"\t-h or -? or --help this usage text\n"
"\t-V or --version show a version message\n"
"\t-m or --mode mode: currently only 'uprops', but more may be added\n"
"\t --trie-type set the trie type (small or fast, default small)\n"
"\t-d or --destdir destination directory, followed by the path\n"
"\t --all write out all properties known to icuexportdata\n"
"\t --index write an _index.toml summarizing all data exported\n"
"\t-c or --copyright include a copyright notice\n"
"\t-v or --verbose Turn on verbose output\n"
"\t-q or --quiet do not display warnings and progress\n",
argv[0]);
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}
const char* mode = options[OPT_MODE].value;
if (uprv_strcmp(mode, "uprops") != 0) {
fprintf(stderr, "Invalid option for --mode (must be uprops)\n");
return U_ILLEGAL_ARGUMENT_ERROR;
}
if (options[OPT_TRIE_TYPE].doesOccur) {
if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
trieType = UCPTRIE_TYPE_FAST;
} else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
trieType = UCPTRIE_TYPE_SMALL;
} else {
fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
return U_ILLEGAL_ARGUMENT_ERROR;
}
}
for (const char* propName : propNames) {
UProperty propEnum = u_getPropertyEnum(propName);
if (propEnum == UCHAR_INVALID_CODE) {
@ -505,6 +1020,81 @@ int exportCase(int argc, char* argv[]) {
return 0;
}
#if !UCONFIG_NO_NORMALIZATION
// Exports the normalization data: canonical compositions, the NFD, NFKD and
// UTS 46 decomposition data and tables, and the potential pass-through sets
// for NFC, NFKC and UTS 46. Returns 0; failures are reported through
// handleError().
//
// The order of the calls matters: storage16/storage32 are shared by the three
// computeDecompositions() calls and the base sizes are captured after the
// first one.
int exportNorm() {
IcuToolErrorCode status("icuexportdata: exportNorm");
USet* backwardCombiningStarters = uset_openEmpty();
// Fills backwardCombiningStarters as a side effect of writing the compositions.
writeCanonicalCompositions(backwardCombiningStarters);
std::vector<uint16_t> storage16;
std::vector<uint32_t> storage32;
// NFD goes first: whatever it stores is the "base" part of the storage.
USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty();
USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
std::vector<PendingDescriptor> nfdPendingTrieInsertions;
computeDecompositions("nfd", backwardCombiningStarters, storage16, storage32, nfdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithBackwardCombiningStarter, nfdPendingTrieInsertions);
uint32_t baseSize16 = storage16.size();
uint32_t baseSize32 = storage32.size();
// NFKD and UTS 46 append to the same storage; what they add is the
// "supplementary" part.
USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty();
USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
std::vector<PendingDescriptor> nfkdPendingTrieInsertions;
computeDecompositions("nfkd", backwardCombiningStarters, storage16, storage32, nfkdDecompositionStartsWithNonStarter, nfkdDecompositionStartsWithBackwardCombiningStarter, nfkdPendingTrieInsertions);
USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty();
USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
std::vector<PendingDescriptor> uts46PendingTrieInsertions;
computeDecompositions("uts46d", backwardCombiningStarters, storage16, storage32, uts46DecompositionStartsWithNonStarter, uts46DecompositionStartsWithBackwardCombiningStarter, uts46PendingTrieInsertions);
uint32_t supplementSize16 = storage16.size() - baseSize16;
uint32_t supplementSize32 = storage32.size() - baseSize32;
// NFD is written with its set; NFKD and UTS 46 are written relative to the NFD set.
writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions);
writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions);
writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions);
// "nfdex" holds the base part of the storage, "nfkdex" the supplementary part.
writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32);
writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32);
USet* nfcPotentialPassthroughAndNotBackwardCombining = uset_openEmpty();
const Normalizer2* nfc = Normalizer2::getNFCInstance(status);
writePotentialCompositionPassThrough("nfc", nfc, nfdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithBackwardCombiningStarter, nfcPotentialPassthroughAndNotBackwardCombining);
USet* nfkcPotentialPassthroughAndNotBackwardCombining = uset_openEmpty();
const Normalizer2* nfkc = Normalizer2::getNFKCInstance(status);
writePotentialCompositionPassThrough("nfkc", nfkc, nfkdDecompositionStartsWithNonStarter, nfkdDecompositionStartsWithBackwardCombiningStarter, nfkcPotentialPassthroughAndNotBackwardCombining);
USet* uts46PotentialPassthroughAndNotBackwardCombining = uset_openEmpty();
// A null normalizer selects the UTS 46 case.
writePotentialCompositionPassThrough("uts46", nullptr, uts46DecompositionStartsWithNonStarter, uts46DecompositionStartsWithBackwardCombiningStarter, uts46PotentialPassthroughAndNotBackwardCombining);
// Check that NFKC set has no characters that NFC doesn't also have.
// This modifies the NFKC set, which has already been written out above.
uset_removeAll(nfkcPotentialPassthroughAndNotBackwardCombining, nfcPotentialPassthroughAndNotBackwardCombining);
if (!uset_isEmpty(nfkcPotentialPassthroughAndNotBackwardCombining)) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, "exportNorm");
}
// Release every set opened above.
uset_close(nfcPotentialPassthroughAndNotBackwardCombining);
uset_close(nfkcPotentialPassthroughAndNotBackwardCombining);
uset_close(uts46PotentialPassthroughAndNotBackwardCombining);
uset_close(nfdDecompositionStartsWithNonStarter);
uset_close(nfkdDecompositionStartsWithNonStarter);
uset_close(uts46DecompositionStartsWithNonStarter);
uset_close(nfdDecompositionStartsWithBackwardCombiningStarter);
uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter);
uset_close(uts46DecompositionStartsWithBackwardCombiningStarter);
uset_close(backwardCombiningStarters);
handleError(status, "exportNorm");
return 0;
}
#endif // !UCONFIG_NO_NORMALIZATION
int main(int argc, char* argv[]) {
U_MAIN_INIT_ARGS(argc, argv);
@ -553,12 +1143,20 @@ int main(int argc, char* argv[]) {
}
const char* mode = options[OPT_MODE].value;
if (uprv_strcmp(mode, "norm") == 0) {
#if !UCONFIG_NO_NORMALIZATION
return exportNorm();
#else
fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n");
return U_ILLEGAL_ARGUMENT_ERROR;
#endif
}
if (uprv_strcmp(mode, "uprops") == 0) {
return exportUprops(argc, argv);
} else if (uprv_strcmp(mode, "ucase") == 0) {
return exportCase(argc, argv);
}
fprintf(stderr, "Invalid option for --mode (must be uprops or ucase)\n");
fprintf(stderr, "Invalid option for --mode (must be uprops, ucase, or norm)\n");
return U_ILLEGAL_ARGUMENT_ERROR;
}

View file

@ -19,6 +19,7 @@
*/
#include <stdio.h>
#include <inttypes.h>
#include <time.h>
#include "unicode/utypes.h"
#include "unicode/putil.h"
@ -143,12 +144,14 @@ usrc_writeArray(FILE *f,
const uint8_t *p8;
const uint16_t *p16;
const uint32_t *p32;
uint32_t value;
const int64_t *p64; // Signed due to TOML!
int64_t value; // Signed due to TOML!
int32_t i, col;
p8=NULL;
p16=NULL;
p32=NULL;
p64=NULL;
switch(width) {
case 8:
p8=(const uint8_t *)p;
@ -159,6 +162,9 @@ usrc_writeArray(FILE *f,
case 32:
p32=(const uint32_t *)p;
break;
case 64:
p64=(const int64_t *)p;
break;
default:
fprintf(stderr, "usrc_writeArray(width=%ld) unrecognized width\n", (long)width);
return;
@ -186,11 +192,14 @@ usrc_writeArray(FILE *f,
case 32:
value=p32[i];
break;
case 64:
value=p64[i];
break;
default:
value=0; /* unreachable */
break;
}
fprintf(f, value<=9 ? "%lu" : "0x%lx", (unsigned long)value);
fprintf(f, value<=9 ? "%" PRId64 : "0x%" PRIx64, value);
}
if(postfix!=NULL) {
fputs(postfix, f);

View file

@ -69,7 +69,7 @@ usrc_writeFileNameGeneratedBy(
const char *generator);
/**
* Writes the contents of an array of 8/16/32-bit words.
* Writes the contents of an array of 8/16/32/64-bit words.
* The prefix and postfix are optional (can be NULL) and are written first/last.
* The prefix may contain a %ld or similar field for the array length.
* The {} and declaration etc. need to be included in prefix/postfix or

View file

@ -83,14 +83,15 @@ binarySearch(const UVector64 &list, int64_t ce) {
} // namespace
CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode)
: CollationDataBuilder(errorCode),
CollationBaseDataBuilder::CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
: CollationDataBuilder(icu4xMode, errorCode),
numericPrimary(0x12000000),
firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
rootElements(errorCode),
scriptStartsLength(1) {
uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex));
uprv_memset(scriptStarts, 0, sizeof(scriptStarts));
this->icu4xMode = icu4xMode;
}
CollationBaseDataBuilder::~CollationBaseDataBuilder() {
@ -119,7 +120,9 @@ CollationBaseDataBuilder::init(UErrorCode &errorCode) {
trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode);
// Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches.
for(UChar32 c = 0; c < 0x180; ++c) {
// In the ICU4X case, only preallocate ASCII, because we don't store CE32s for
// precomposed characters.
for(UChar32 c = 0; c < (icu4xMode ? 0x80 : 0x180); ++c) {
utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode);
}
@ -128,8 +131,10 @@ CollationBaseDataBuilder::init(UErrorCode &errorCode) {
// Some code assumes that the root first primary CE is the "space first primary"
// from FractionalUCA.txt.
uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
if (!icu4xMode) {
uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
}
// Add a mapping for the first-unassigned boundary,
// which is the AlphabeticIndex overflow boundary.

View file

@ -37,7 +37,7 @@ U_NAMESPACE_BEGIN
*/
class U_I18N_API CollationBaseDataBuilder : public CollationDataBuilder {
public:
CollationBaseDataBuilder(UErrorCode &errorCode);
CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode);
virtual ~CollationBaseDataBuilder();

View file

@ -24,6 +24,7 @@
#define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1
#include <stdio.h>
#include <stdint.h>
#include "unicode/utypes.h"
#include "unicode/errorcode.h"
#include "unicode/localpointer.h"
@ -69,7 +70,7 @@ enum HanOrderValue {
HAN_RADICAL_STROKE
};
static UBool beVerbose=FALSE, withCopyright=TRUE;
static UBool beVerbose=FALSE, withCopyright=TRUE, icu4xMode=FALSE;
static HanOrderValue hanOrder = HAN_NO_ORDER;
@ -832,6 +833,11 @@ parseFractionalUCA(const char *filename,
int32_t lineNumber = 0;
char buffer[30000];
// NFD normalizer, fetched only in ICU4X mode; it stays nullptr otherwise.
// It is used later in this function to detect code points that decompose.
const Normalizer2* norm = nullptr;
if (icu4xMode) {
norm = Normalizer2::getNFDInstance(*status);
}
UChar32 maxCodePoint = 0;
while(!feof(data)) {
if(U_FAILURE(*status)) {
@ -889,6 +895,24 @@ parseFractionalUCA(const char *filename,
// CollationBaseDataBuilder::init() maps them to special CEs.
// Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt.
if(0xfffd <= c && c <= 0xffff) { continue; }
// ICU4X mode: skip this line when c is a Hangul syllable, a surrogate code
// point, or a code point whose NFD form differs from the code point itself.
if (icu4xMode) {
if (c >= 0xAC00 && c <= 0xD7A3) {
// Hangul syllable
continue;
}
if (c >= 0xD800 && c < 0xE000) {
// Surrogate
continue;
}
// Normalize the single code point and compare the result with the input.
// NOTE(review): *status is not checked between normalize() and the
// comparison; a failure is only seen by the U_FAILURE check at the top of
// the loop on the next iteration. Confirm that this is acceptable.
UnicodeString src;
UnicodeString dst;
src.append(c);
norm->normalize(src, dst, *status);
if (src != dst) {
// c decomposed, skip it
continue;
}
}
if(s.length() >= 2 && c == 0xFDD1) {
UChar32 c2 = s.char32At(1);
int32_t script = getCharScript(c2);
@ -923,7 +947,6 @@ parseFractionalUCA(const char *filename,
(int)lineNumber, filename, line);
exit(U_INVALID_FORMAT_ERROR);
}
builder.add(prefix, s, ces, cesLength, *status);
}
}
@ -1126,8 +1149,9 @@ buildAndWriteBaseData(CollationBaseDataBuilder &builder,
CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion);
// Choose the output data name from the Han ordering (implicit vs. unihan).
// In ICU4X mode the name carries an additional "-icu4x" suffix.
const char *dataName =
hanOrder == HAN_IMPLICIT ? "ucadata-implicithan" :
"ucadata-unihan";
hanOrder == HAN_IMPLICIT ?
(icu4xMode ? "ucadata-implicithan-icu4x" : "ucadata-implicithan") :
(icu4xMode ? "ucadata-unihan-icu4x" : "ucadata-unihan");
UNewDataMemory *pData=udata_create(path, "icu", dataName, &ucaDataInfo,
withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
if(U_FAILURE(errorCode)) {
@ -1275,7 +1299,7 @@ parseAndWriteCollationRootData(
const char *sourceCodePath,
UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
CollationBaseDataBuilder builder(errorCode);
CollationBaseDataBuilder builder(icu4xMode, errorCode);
builder.init(errorCode);
parseFractionalUCA(fracUCAPath, builder, &errorCode);
buildAndWriteBaseData(builder, binaryDataPath, errorCode);
@ -1289,7 +1313,8 @@ enum {
HELP_QUESTION_MARK,
VERBOSE,
COPYRIGHT,
HAN_ORDER
HAN_ORDER,
ICU4X
};
static UOption options[]={
@ -1297,7 +1322,8 @@ static UOption options[]={
UOPTION_HELP_QUESTION_MARK,
UOPTION_VERBOSE,
UOPTION_COPYRIGHT,
UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG)
// hanOrder takes an argument; icu4x (short form 'X') is a flag without one.
// main() reads the icu4x entry via options[ICU4X].doesOccur into icu4xMode.
UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG),
UOPTION_DEF("icu4x", 'X', UOPT_NO_ARG)
};
extern "C" int
@ -1348,6 +1374,7 @@ main(int argc, char* argv[]) {
beVerbose=options[VERBOSE].doesOccur;
withCopyright=options[COPYRIGHT].doesOccur;
icu4xMode=options[ICU4X].doesOccur;
IcuToolErrorCode errorCode("genuca");