mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-11 08:01:32 +00:00
ICU-22028 Export collation and normalization data for ICU4X
This commit is contained in:
parent
d7c424b00f
commit
3cefbd55c7
22 changed files with 1275 additions and 76 deletions
|
@ -418,7 +418,7 @@ jobs:
|
|||
timeoutInMinutes: 30
|
||||
pool:
|
||||
vmImage: 'windows-2019'
|
||||
demands:
|
||||
demands:
|
||||
- msbuild
|
||||
- visualstudio
|
||||
- Cmd
|
||||
|
@ -633,6 +633,8 @@ jobs:
|
|||
cd icu4c/source
|
||||
LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_full/fast --trie-type fast --all
|
||||
LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_full/small --trie-type small --all
|
||||
LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode norm --copyright --verbose --destdir icuexportdata_uprops_full/fast --trie-type fast --all
|
||||
LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode norm --copyright --verbose --destdir icuexportdata_uprops_full/small --trie-type small --all
|
||||
displayName: 'Build Unicode property data export file (Full)'
|
||||
# In the sample file, include:
|
||||
# - Basic binary properties: AHex WSpace
|
||||
|
@ -646,6 +648,18 @@ jobs:
|
|||
LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_sample/fast --trie-type fast AHex gc nt Basic_Emoji sc WSpace blank
|
||||
LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_sample/small --trie-type small AHex gc nt Basic_Emoji sc WSpace blank
|
||||
displayName: 'Build Unicode property data export file (Sample)'
|
||||
- script: |
|
||||
mkdir -p icu4c/source/icuexportdata_uprops_full/collation_unihan
|
||||
mkdir -p icu4c/source/icuexportdata_uprops_full/collation_implicithan
|
||||
cd icu4c/source
|
||||
cd data/coll
|
||||
FILES=`echo *.txt`
|
||||
cd -
|
||||
LD_LIBRARY_PATH=lib ./bin/genrb -X -s data/coll/ -d icuexportdata_uprops_full/collation_unihan --ucadata data/in/coll/ucadata-unihan-icu4x.icu $FILES
|
||||
LD_LIBRARY_PATH=lib ./bin/genrb -X -s data/coll/ -d icuexportdata_uprops_full/collation_implicithan --ucadata data/in/coll/ucadata-implicithan-icu4x.icu $FILES
|
||||
rm icuexportdata_uprops_full/collation_unihan/*.res
|
||||
rm icuexportdata_uprops_full/collation_implicithan/*.res
|
||||
displayName: 'Build collation data export file'
|
||||
- task: PublishBuildArtifacts@1
|
||||
displayName: 'Publish Artifact: icuexportdata_uprops_full'
|
||||
inputs:
|
||||
|
|
|
@ -44,7 +44,7 @@ struct UDataMemory {
|
|||
int32_t length; /* Length of the data in bytes; -1 if unknown. */
|
||||
};
|
||||
|
||||
U_CFUNC UDataMemory *UDataMemory_createNewInstance(UErrorCode *pErr);
|
||||
U_CAPI UDataMemory* U_EXPORT2 UDataMemory_createNewInstance(UErrorCode *pErr);
|
||||
U_CFUNC void UDatamemory_assign (UDataMemory *dest, UDataMemory *source);
|
||||
U_CFUNC void UDataMemory_init (UDataMemory *This);
|
||||
U_CFUNC UBool UDataMemory_isLoaded(const UDataMemory *This);
|
||||
|
|
|
@ -29,7 +29,7 @@
|
|||
#include "unicode/udata.h"
|
||||
#include "putilimp.h"
|
||||
|
||||
U_CFUNC UBool uprv_mapFile(UDataMemory *pdm, const char *path, UErrorCode *status);
|
||||
U_CAPI UBool U_EXPORT2 uprv_mapFile(UDataMemory *pdm, const char *path, UErrorCode *status);
|
||||
U_CFUNC void uprv_unmapFile(UDataMemory *pData);
|
||||
|
||||
/* MAP_NONE: no memory mapping, no file access at all */
|
||||
|
|
BIN
icu4c/source/data/in/coll/ucadata-implicithan-icu4x.icu
Normal file
BIN
icu4c/source/data/in/coll/ucadata-implicithan-icu4x.icu
Normal file
Binary file not shown.
BIN
icu4c/source/data/in/coll/ucadata-unihan-icu4x.icu
Normal file
BIN
icu4c/source/data/in/coll/ucadata-unihan-icu4x.icu
Normal file
Binary file not shown.
|
@ -44,3 +44,6 @@ bazelisk run //tools/unicode/c/genprops $ICU_SRC/icu4c
|
|||
# We run it twice for different versions of the CLDR root sort order.
|
||||
bazelisk run //tools/unicode/c/genuca -- --hanOrder implicit $ICU_SRC/icu4c
|
||||
bazelisk run //tools/unicode/c/genuca -- --hanOrder radical-stroke $ICU_SRC/icu4c
|
||||
# Also generate the ICU4X versions
|
||||
bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder implicit $ICU_SRC/icu4c
|
||||
bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder radical-stroke $ICU_SRC/icu4c
|
||||
|
|
|
@ -221,7 +221,8 @@ public:
|
|||
/**
|
||||
* Points to contraction data.
|
||||
* Bits 31..13: Index into prefix/contraction data.
|
||||
* Bits 12..11: Unused, 0.
|
||||
* Bit 12: Unused, 0.
|
||||
* Bit 11: CONTRACT_HAS_STARTER flag. (Used by ICU4X only.)
|
||||
* Bit 10: CONTRACT_TRAILING_CCC flag.
|
||||
* Bit 9: CONTRACT_NEXT_CCC flag.
|
||||
* Bit 8: CONTRACT_SINGLE_CP_NO_MATCH flag.
|
||||
|
@ -298,6 +299,8 @@ public:
|
|||
static const uint32_t CONTRACT_NEXT_CCC = 0x200;
|
||||
/** Set if any contraction suffix ends with lccc!=0. */
|
||||
static const uint32_t CONTRACT_TRAILING_CCC = 0x400;
|
||||
/** Set if any contraction suffix contains a starter. (Used by ICU4X only.) */
|
||||
static const uint32_t CONTRACT_HAS_STARTER = 0x800;
|
||||
|
||||
/** For HANGUL_TAG: None of its Jamo CE32s isSpecialCE32(). */
|
||||
static const uint32_t HANGUL_NO_SPECIAL_JAMO = 0x100;
|
||||
|
|
|
@ -198,7 +198,7 @@ const int32_t CollationBuilder::HAS_BEFORE2;
|
|||
const int32_t CollationBuilder::HAS_BEFORE3;
|
||||
#endif
|
||||
|
||||
CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
|
||||
CollationBuilder::CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode)
|
||||
: nfd(*Normalizer2::getNFDInstance(errorCode)),
|
||||
fcd(*Normalizer2Factory::getFCDInstance(errorCode)),
|
||||
nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
|
||||
|
@ -206,7 +206,8 @@ CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &erro
|
|||
baseData(b->data),
|
||||
rootElements(b->data->rootElements, b->data->rootElementsLength),
|
||||
variableTop(0),
|
||||
dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE),
|
||||
dataBuilder(new CollationDataBuilder(icu4xMode, errorCode)), fastLatinEnabled(TRUE),
|
||||
icu4xMode(icu4xMode),
|
||||
errorReason(NULL),
|
||||
cesLength(0),
|
||||
rootPrimaryIndexes(errorCode), nodes(errorCode) {
|
||||
|
@ -225,6 +226,10 @@ CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &erro
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Constructs a builder for regular (non-ICU4X) collation data.
 * Delegates to the three-argument constructor with icu4xMode=FALSE.
 *
 * @param b         the base tailoring handed to the delegated constructor
 * @param errorCode in/out ICU error code, forwarded unchanged
 */
CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
        : CollationBuilder(b, /* icu4xMode= */ FALSE, errorCode) {
}
|
||||
|
||||
CollationBuilder::~CollationBuilder() {
|
||||
delete dataBuilder;
|
||||
}
|
||||
|
@ -262,15 +267,19 @@ CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
|
|||
if(U_FAILURE(errorCode)) { return NULL; }
|
||||
if(dataBuilder->hasMappings()) {
|
||||
makeTailoredCEs(errorCode);
|
||||
closeOverComposites(errorCode);
|
||||
if (!icu4xMode) {
|
||||
closeOverComposites(errorCode);
|
||||
}
|
||||
finalizeCEs(errorCode);
|
||||
// Copy all of ASCII, and Latin-1 letters, into each tailoring.
|
||||
optimizeSet.add(0, 0x7f);
|
||||
optimizeSet.add(0xc0, 0xff);
|
||||
// Hangul is decomposed on the fly during collation,
|
||||
// and the tailoring data is always built with HANGUL_TAG specials.
|
||||
optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
|
||||
dataBuilder->optimize(optimizeSet, errorCode);
|
||||
if (!icu4xMode) {
|
||||
// Copy all of ASCII, and Latin-1 letters, into each tailoring.
|
||||
optimizeSet.add(0, 0x7f);
|
||||
optimizeSet.add(0xc0, 0xff);
|
||||
// Hangul is decomposed on the fly during collation,
|
||||
// and the tailoring data is always built with HANGUL_TAG specials.
|
||||
optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
|
||||
dataBuilder->optimize(optimizeSet, errorCode);
|
||||
}
|
||||
tailoring->ensureOwnedData(errorCode);
|
||||
if(U_FAILURE(errorCode)) { return NULL; }
|
||||
if(fastLatinEnabled) { dataBuilder->enableFastLatin(); }
|
||||
|
@ -743,14 +752,18 @@ CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix,
|
|||
}
|
||||
}
|
||||
uint32_t ce32 = Collation::UNASSIGNED_CE32;
|
||||
if((prefix != nfdPrefix || str != nfdString) &&
|
||||
if(!icu4xMode && (prefix != nfdPrefix || str != nfdString) &&
|
||||
!ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) {
|
||||
// Map from the original input to the CEs.
|
||||
// We do this in case the canonical closure is incomplete,
|
||||
// so that it is possible to explicitly provide the missing mappings.
|
||||
ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode);
|
||||
}
|
||||
addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
|
||||
if (!icu4xMode) {
|
||||
addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
|
||||
} else {
|
||||
addIfDifferent(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
parserErrorReason = "writing collation elements";
|
||||
return;
|
||||
|
@ -1608,7 +1621,7 @@ CEFinalizer::~CEFinalizer() {}
|
|||
void
|
||||
CollationBuilder::finalizeCEs(UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode), errorCode);
|
||||
LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(icu4xMode, errorCode), errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -39,6 +39,7 @@ class Normalizer2Impl;
|
|||
|
||||
class U_I18N_API CollationBuilder : public CollationRuleParser::Sink {
|
||||
public:
|
||||
CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode);
|
||||
CollationBuilder(const CollationTailoring *base, UErrorCode &errorCode);
|
||||
virtual ~CollationBuilder();
|
||||
|
||||
|
@ -302,6 +303,7 @@ private:
|
|||
|
||||
CollationDataBuilder *dataBuilder;
|
||||
UBool fastLatinEnabled;
|
||||
UBool icu4xMode;
|
||||
UnicodeSet optimizeSet;
|
||||
const char *errorReason;
|
||||
|
||||
|
|
|
@ -296,16 +296,19 @@ DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &
|
|||
|
||||
// ------------------------------------------------------------------------- ***
|
||||
|
||||
CollationDataBuilder::CollationDataBuilder(UErrorCode &errorCode)
|
||||
CollationDataBuilder::CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
|
||||
: nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
|
||||
base(NULL), baseSettings(NULL),
|
||||
trie(NULL),
|
||||
ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode),
|
||||
modified(FALSE),
|
||||
icu4xMode(icu4xMode),
|
||||
fastLatinEnabled(FALSE), fastLatinBuilder(NULL),
|
||||
collIter(NULL) {
|
||||
// Reserve the first CE32 for U+0000.
|
||||
ce32s.addElement(0, errorCode);
|
||||
if (!icu4xMode) {
|
||||
ce32s.addElement(0, errorCode);
|
||||
}
|
||||
conditionalCE32s.setDeleter(uprv_deleteConditionalCE32);
|
||||
}
|
||||
|
||||
|
@ -329,28 +332,32 @@ CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &error
|
|||
base = b;
|
||||
|
||||
// For a tailoring, the default is to fall back to the base.
|
||||
trie = utrie2_open(Collation::FALLBACK_CE32, Collation::FFFD_CE32, &errorCode);
|
||||
// For ICU4X, use the same value for fallback as for the default
|
||||
// to avoid having to have different blocks for the two.
|
||||
trie = utrie2_open(Collation::FALLBACK_CE32, icu4xMode ? Collation::FALLBACK_CE32 : Collation::FFFD_CE32, &errorCode);
|
||||
|
||||
// Set the Latin-1 letters block so that it is allocated first in the data array,
|
||||
// to try to improve locality of reference when sorting Latin-1 text.
|
||||
// Do not use utrie2_setRange32() since that will not actually allocate blocks
|
||||
// that are filled with the default value.
|
||||
// ASCII (0..7F) is already preallocated anyway.
|
||||
for(UChar32 c = 0xc0; c <= 0xff; ++c) {
|
||||
utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode);
|
||||
if (!icu4xMode) {
|
||||
// Set the Latin-1 letters block so that it is allocated first in the data array,
|
||||
// to try to improve locality of reference when sorting Latin-1 text.
|
||||
// Do not use utrie2_setRange32() since that will not actually allocate blocks
|
||||
// that are filled with the default value.
|
||||
// ASCII (0..7F) is already preallocated anyway.
|
||||
for(UChar32 c = 0xc0; c <= 0xff; ++c) {
|
||||
utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode);
|
||||
}
|
||||
|
||||
// Hangul syllables are not tailorable (except via tailoring Jamos).
|
||||
// Always set the Hangul tag to help performance.
|
||||
// Do this here, rather than in buildMappings(),
|
||||
// so that we see the HANGUL_TAG in various assertions.
|
||||
uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
|
||||
utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode);
|
||||
|
||||
// Copy the set contents but don't copy/clone the set as a whole because
|
||||
// that would copy the isFrozen state too.
|
||||
unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
|
||||
}
|
||||
|
||||
// Hangul syllables are not tailorable (except via tailoring Jamos).
|
||||
// Always set the Hangul tag to help performance.
|
||||
// Do this here, rather than in buildMappings(),
|
||||
// so that we see the HANGUL_TAG in various assertions.
|
||||
uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
|
||||
utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode);
|
||||
|
||||
// Copy the set contents but don't copy/clone the set as a whole because
|
||||
// that would copy the isFrozen state too.
|
||||
unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
|
||||
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
}
|
||||
|
||||
|
@ -567,6 +574,98 @@ CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &
|
|||
int32_t cLength = U16_LENGTH(c);
|
||||
uint32_t oldCE32 = utrie2_get32(trie, c);
|
||||
UBool hasContext = !prefix.isEmpty() || s.length() > cLength;
|
||||
|
||||
if (icu4xMode) {
|
||||
if (base && c >= 0x1100 && c < 0x1200) {
|
||||
// Omit jamo tailorings.
|
||||
// TODO(https://github.com/unicode-org/icu4x/issues/1941).
|
||||
}
|
||||
const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(errorCode);
|
||||
UnicodeString sInNfd;
|
||||
nfdNormalizer->normalize(s, sInNfd, errorCode);
|
||||
if (s != sInNfd) {
|
||||
// s is not in NFD, so it cannot match in ICU4X, since ICU4X only
|
||||
// does NFD lookups.
|
||||
// Now check that we're only rejecting known cases.
|
||||
if (s.length() == 2) {
|
||||
char16_t second = s.charAt(1);
|
||||
if (second == 0x0F73 || second == 0x0F75 || second == 0x0F81) {
|
||||
// Second is a special decomposing Tibetan vowel sign.
|
||||
// These also get added in the decomposed form, so ignoring
|
||||
// this instance is OK.
|
||||
return;
|
||||
}
|
||||
if (c == 0xFDD1 && second == 0xAC00) {
|
||||
// This strange contraction exists in the root and
|
||||
// doesn't have a decomposed counterpart there.
|
||||
// This won't match in ICU4X anyway and is very strange:
|
||||
// Unassigned Arabic presentation form contracting with
|
||||
// the very first Hangul syllable. Let's ignore this
|
||||
// explicitly.
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Unknown case worth investigating if ever found.
|
||||
errorCode = U_UNSUPPORTED_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!prefix.isEmpty()) {
|
||||
UnicodeString prefixInNfd;
|
||||
nfdNormalizer->normalize(prefix, prefixInNfd, errorCode);
|
||||
if (prefix != prefixInNfd) {
|
||||
errorCode = U_UNSUPPORTED_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t count = prefix.countChar32();
|
||||
if (count > 2) {
|
||||
// Prefix too long for ICU4X.
|
||||
errorCode = U_UNSUPPORTED_ERROR;
|
||||
return;
|
||||
}
|
||||
UChar32 utf32[4];
|
||||
int32_t len = prefix.toUTF32(utf32, 4, errorCode);
|
||||
if (len != count) {
|
||||
errorCode = U_INVALID_STATE_ERROR;
|
||||
return;
|
||||
}
|
||||
UChar32 c = utf32[0];
|
||||
if (u_getCombiningClass(c)) {
|
||||
// Prefix must start with a starter for ICU4X.
|
||||
errorCode = U_UNSUPPORTED_ERROR;
|
||||
return;
|
||||
}
|
||||
// XXX: Korean searchjl has jamo in prefix, so commenting out this
|
||||
// check for now. ICU4X currently ignores non-root jamo tables anyway.
|
||||
// searchjl was added in
|
||||
// https://unicode-org.atlassian.net/browse/CLDR-3560
|
||||
// Contractions were changed to prefixes in
|
||||
// https://unicode-org.atlassian.net/browse/CLDR-6546
|
||||
//
|
||||
// if ((c >= 0x1100 && c < 0x1200) || (c >= 0xAC00 && c < 0xD7A4)) {
|
||||
// errorCode = U_UNSUPPORTED_ERROR;
|
||||
// return;
|
||||
// }
|
||||
if ((len > 1) && !(utf32[1] == 0x3099 || utf32[1] == 0x309A)) {
|
||||
// Second character in prefix, if present, must be a kana voicing mark for ICU4X.
|
||||
errorCode = U_UNSUPPORTED_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (s.length() > cLength) {
|
||||
// Check that there's no modern Hangul in contractions.
|
||||
for (int32_t i = 0; i < s.length(); ++i) {
|
||||
UChar c = s.charAt(i);
|
||||
if ((c >= 0x1100 && c < 0x1100 + 19) || (c >= 0x1161 && c < 0x1161 + 21) || (c >= 0x11A7 && c < 0x11A7 + 28) || (c >= 0xAC00 && c < 0xD7A4)) {
|
||||
errorCode = U_UNSUPPORTED_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(oldCE32 == Collation::FALLBACK_CE32) {
|
||||
// First tailoring for c.
|
||||
// If c has contextual base mappings or if we add a contextual mapping,
|
||||
|
@ -688,8 +787,11 @@ CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength,
|
|||
return encodeOneCEAsCE32(0);
|
||||
} else if(cesLength == 1) {
|
||||
return encodeOneCE(ces[0], errorCode);
|
||||
} else if(cesLength == 2) {
|
||||
} else if(cesLength == 2 && !icu4xMode) {
|
||||
// Try to encode two CEs as one CE32.
|
||||
// Turn this off for ICU4X, because without the canonical closure
|
||||
// these are so rare that it doesn't make sense to spend a branch
|
||||
// on checking this tag when using the data.
|
||||
int64_t ce0 = ces[0];
|
||||
int64_t ce1 = ces[1];
|
||||
uint32_t p0 = (uint32_t)(ce0 >> 32);
|
||||
|
@ -1297,9 +1399,11 @@ CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode)
|
|||
setDigitTags(errorCode);
|
||||
setLeadSurrogates(errorCode);
|
||||
|
||||
// For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
|
||||
ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0);
|
||||
utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
|
||||
if (!icu4xMode) {
|
||||
// For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
|
||||
ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0);
|
||||
utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
|
||||
}
|
||||
|
||||
utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode);
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
|
@ -1454,6 +1558,20 @@ CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode)
|
|||
// The last suffix character has lccc!=0, allowing for discontiguous contractions.
|
||||
flags |= Collation::CONTRACT_TRAILING_CCC;
|
||||
}
|
||||
if (icu4xMode && (flags & Collation::CONTRACT_HAS_STARTER) == 0) {
|
||||
for (int32_t i = 0; i < suffix.length();) {
|
||||
UChar32 c = suffix.char32At(i);
|
||||
if (!u_getCombiningClass(c)) {
|
||||
flags |= Collation::CONTRACT_HAS_STARTER;
|
||||
break;
|
||||
}
|
||||
if (c > 0xFFFF) {
|
||||
i += 2;
|
||||
} else {
|
||||
++i;
|
||||
}
|
||||
}
|
||||
}
|
||||
contractionBuilder.add(suffix, (int32_t)cond->ce32, errorCode);
|
||||
if(cond == lastCond) { break; }
|
||||
cond = getConditionalCE32(cond->next);
|
||||
|
|
|
@ -60,7 +60,7 @@ public:
|
|||
virtual int64_t modifyCE(int64_t ce) const = 0;
|
||||
};
|
||||
|
||||
CollationDataBuilder(UErrorCode &errorCode);
|
||||
CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode);
|
||||
|
||||
virtual ~CollationDataBuilder();
|
||||
|
||||
|
@ -255,6 +255,7 @@ private:
|
|||
protected:
|
||||
UnicodeSet unsafeBackwardSet;
|
||||
UBool modified;
|
||||
UBool icu4xMode;
|
||||
|
||||
UBool fastLatinEnabled;
|
||||
CollationFastLatinBuilder *fastLatinBuilder;
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
#include "ucln_in.h"
|
||||
#include "udatamem.h"
|
||||
#include "umutex.h"
|
||||
#include "umapfile.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -47,17 +48,46 @@ static UBool U_CALLCONV uprv_collation_root_cleanup() {
|
|||
|
||||
U_CDECL_END
|
||||
|
||||
/**
 * Memory-maps the ucadata file at ucadataPath and, if its header is acceptable,
 * returns a newly created UDataMemory that refers to the mapping.
 *
 * @param ucadataPath file system path of the ucadata file to map
 * @param errorCode   in/out ICU error code. Nothing is done if it already
 *                    indicates a failure. Set to U_MISSING_RESOURCE_ERROR if the
 *                    file cannot be mapped and to U_INVALID_FORMAT_ERROR if the
 *                    magic bytes or CollationDataReader::isAcceptable() reject
 *                    the header.
 * @return the new UDataMemory, or NULL on any failure. On failure the mapping is
 *         released again so that it does not leak.
 */
UDataMemory*
CollationRoot::loadFromFile(const char* ucadataPath, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return NULL;
    }
    // Start from an all-zero struct so that udata_close() below never sees
    // indeterminate fields.
    // NOTE(review): udata_close() is expected to only unmap (not free) an
    // instance whose heapAllocated flag is false -- confirm against udatamem.cpp.
    UDataMemory dataMemory = {};
    if (!uprv_mapFile(&dataMemory, ucadataPath, &errorCode)) {
        errorCode = U_MISSING_RESOURCE_ERROR;
        return NULL;
    }
    if (dataMemory.pHeader->dataHeader.magic1 == 0xda &&
            dataMemory.pHeader->dataHeader.magic2 == 0x27 &&
            CollationDataReader::isAcceptable(NULL, "icu", "ucadata", &dataMemory.pHeader->info)) {
        UDataMemory *rDataMem = UDataMemory_createNewInstance(&errorCode);
        if (U_SUCCESS(errorCode)) {
            // Hand the mapping over to the heap instance; the stack copy is
            // simply dropped without unmapping.
            rDataMem->pHeader = dataMemory.pHeader;
            rDataMem->mapAddr = dataMemory.mapAddr;
            rDataMem->map = dataMemory.map;
            return rDataMem;
        }
        // Allocation failed: errorCode was set by UDataMemory_createNewInstance().
    } else {
        errorCode = U_INVALID_FORMAT_ERROR;
    }
    // No UDataMemory took over the mapping, so release it here.
    udata_close(&dataMemory);
    return NULL;
}
|
||||
|
||||
void U_CALLCONV
|
||||
CollationRoot::load(UErrorCode &errorCode) {
|
||||
CollationRoot::load(const char* ucadataPath, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
LocalPointer<CollationTailoring> t(new CollationTailoring(NULL));
|
||||
if(t.isNull() || t->isBogus()) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
t->memory = udata_openChoice(U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll",
|
||||
"icu", "ucadata",
|
||||
CollationDataReader::isAcceptable, t->version, &errorCode);
|
||||
t->memory = ucadataPath ? CollationRoot::loadFromFile(ucadataPath, errorCode) :
|
||||
udata_openChoice(U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll",
|
||||
"icu", "ucadata",
|
||||
CollationDataReader::isAcceptable,
|
||||
t->version, &errorCode);
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
const uint8_t *inBytes = static_cast<const uint8_t *>(udata_getMemory(t->memory));
|
||||
CollationDataReader::read(NULL, inBytes, udata_getLength(t->memory), *t, errorCode);
|
||||
|
@ -73,14 +103,14 @@ CollationRoot::load(UErrorCode &errorCode) {
|
|||
|
||||
const CollationCacheEntry *
|
||||
CollationRoot::getRootCacheEntry(UErrorCode &errorCode) {
|
||||
umtx_initOnce(initOnce, CollationRoot::load, errorCode);
|
||||
umtx_initOnce(initOnce, CollationRoot::load, static_cast<const char*>(NULL), errorCode);
|
||||
if(U_FAILURE(errorCode)) { return NULL; }
|
||||
return rootSingleton;
|
||||
}
|
||||
|
||||
const CollationTailoring *
|
||||
CollationRoot::getRoot(UErrorCode &errorCode) {
|
||||
umtx_initOnce(initOnce, CollationRoot::load, errorCode);
|
||||
umtx_initOnce(initOnce, CollationRoot::load, static_cast<const char*>(NULL), errorCode);
|
||||
if(U_FAILURE(errorCode)) { return NULL; }
|
||||
return rootSingleton->tailoring;
|
||||
}
|
||||
|
@ -99,6 +129,12 @@ CollationRoot::getSettings(UErrorCode &errorCode) {
|
|||
return root->settings;
|
||||
}
|
||||
|
||||
void
|
||||
CollationRoot::forceLoadFromFile(const char* ucadataPath, UErrorCode &errorCode) {
|
||||
umtx_initOnce(initOnce, CollationRoot::load, ucadataPath, errorCode);
|
||||
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // !UCONFIG_NO_COLLATION
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#define __COLLATIONROOT_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/udata.h"
|
||||
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
|
||||
|
@ -34,9 +35,11 @@ public:
|
|||
static const CollationTailoring *getRoot(UErrorCode &errorCode);
|
||||
static const CollationData *getData(UErrorCode &errorCode);
|
||||
static const CollationSettings *getSettings(UErrorCode &errorCode);
|
||||
static void U_EXPORT2 forceLoadFromFile(const char* ucadataPath, UErrorCode &errorCode);
|
||||
|
||||
private:
|
||||
static void U_CALLCONV load(UErrorCode &errorCode);
|
||||
static void U_CALLCONV load(const char* ucadataPath, UErrorCode &errorCode);
|
||||
static UDataMemory* loadFromFile(const char* ucadataPath, UErrorCode &errorCode);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
#include "filterrb.h"
|
||||
#include "reslist.h"
|
||||
#include "ucmndata.h" /* TODO: for reading the pool bundle */
|
||||
#include "collationroot.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
|
@ -84,7 +85,9 @@ enum
|
|||
WRITE_POOL_BUNDLE,
|
||||
USE_POOL_BUNDLE,
|
||||
INCLUDE_UNIHAN_COLL,
|
||||
FILTERDIR
|
||||
FILTERDIR,
|
||||
ICU4X_MODE,
|
||||
UCADATA
|
||||
};
|
||||
|
||||
UOption options[]={
|
||||
|
@ -111,6 +114,8 @@ UOption options[]={
|
|||
UOPTION_DEF("usePoolBundle", '\x01', UOPT_OPTIONAL_ARG),/* 20 */
|
||||
UOPTION_DEF("includeUnihanColl", '\x01', UOPT_NO_ARG),/* 21 */ /* temporary, don't display in usage info */
|
||||
UOPTION_DEF("filterDir", '\x01', UOPT_OPTIONAL_ARG), /* 22 */
|
||||
UOPTION_DEF("icu4xMode", 'X', UOPT_NO_ARG),/* 23 */
|
||||
UOPTION_DEF("ucadata", '\x01', UOPT_REQUIRES_ARG),/* 24 */
|
||||
};
|
||||
|
||||
static UBool write_java = FALSE;
|
||||
|
@ -152,6 +157,10 @@ main(int argc,
|
|||
fprintf(stderr, "%s: cannot combine --writePoolBundle and --usePoolBundle\n", argv[0]);
|
||||
illegalArg = TRUE;
|
||||
}
|
||||
if (options[ICU4X_MODE].doesOccur && !options[UCADATA].doesOccur) {
|
||||
fprintf(stderr, "%s: --icu4xMode requires --ucadata\n", argv[0]);
|
||||
illegalArg = TRUE;
|
||||
}
|
||||
if(options[FORMAT_VERSION].doesOccur) {
|
||||
const char *s = options[FORMAT_VERSION].value;
|
||||
if(uprv_strlen(s) != 1 || (s[0] < '1' && '3' < s[0])) {
|
||||
|
@ -302,6 +311,15 @@ main(int argc,
|
|||
}
|
||||
}
|
||||
|
||||
if (options[UCADATA].doesOccur) {
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
CollationRoot::forceLoadFromFile(options[UCADATA].value, status);
|
||||
#else
|
||||
fprintf(stderr, "--ucadata was used with UCONFIG_NO_COLLATION\n");
|
||||
return status;
|
||||
#endif
|
||||
}
|
||||
|
||||
initParser();
|
||||
|
||||
/*added by Jing*/
|
||||
|
@ -656,7 +674,7 @@ processFile(const char *filename, const char *cp,
|
|||
}
|
||||
/* Parse the data into an SRBRoot */
|
||||
data.adoptInstead(parse(ucbuf.getAlias(), inputDir, outputDir, filename,
|
||||
!omitBinaryCollation, options[NO_COLLATION_RULES].doesOccur, &status));
|
||||
!omitBinaryCollation, options[NO_COLLATION_RULES].doesOccur, options[ICU4X_MODE].doesOccur, &status));
|
||||
|
||||
if (data.isNull() || U_FAILURE(status)) {
|
||||
fprintf(stderr, "couldn't parse the file %s. Error:%s\n", filename, u_errorName(status));
|
||||
|
|
|
@ -21,6 +21,8 @@
|
|||
*/
|
||||
|
||||
// Safer use of UnicodeString.
|
||||
#include <cstdint>
|
||||
#include <unicode/umachine.h>
|
||||
#ifndef UNISTR_FROM_CHAR_EXPLICIT
|
||||
# define UNISTR_FROM_CHAR_EXPLICIT explicit
|
||||
#endif
|
||||
|
@ -42,6 +44,7 @@
|
|||
#include "reslist.h"
|
||||
#include "rbt_pars.h"
|
||||
#include "genrb.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/ustring.h"
|
||||
|
@ -59,6 +62,7 @@
|
|||
#include "collationruleparser.h"
|
||||
#include "collationtailoring.h"
|
||||
#include <stdio.h>
|
||||
#include "writesrc.h"
|
||||
|
||||
/* Number of tokens to read ahead of the current stream position */
|
||||
#define MAX_LOOKAHEAD 3
|
||||
|
@ -76,6 +80,9 @@
|
|||
#define OPENSQBRACKET 0x005B
|
||||
#define CLOSESQBRACKET 0x005D
|
||||
|
||||
#define ICU4X_DIACRITIC_BASE 0x0300
|
||||
#define ICU4X_DIACRITIC_LIMIT 0x034F
|
||||
|
||||
using icu::CharString;
|
||||
using icu::LocalMemory;
|
||||
using icu::LocalPointer;
|
||||
|
@ -119,6 +126,7 @@ typedef struct {
|
|||
const char *filename;
|
||||
UBool makeBinaryCollation;
|
||||
UBool omitCollationRules;
|
||||
UBool icu4xMode;
|
||||
} ParseState;
|
||||
|
||||
typedef struct SResource *
|
||||
|
@ -764,7 +772,7 @@ GenrbImporter::getRules(
|
|||
|
||||
/* Parse the data into an SRBRoot */
|
||||
LocalPointer<SRBRoot> data(
|
||||
parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), FALSE, FALSE, &errorCode));
|
||||
parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), FALSE, FALSE, FALSE, &errorCode));
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
|
@ -807,6 +815,333 @@ escape(const UChar *s, char *buffer) {
|
|||
|
||||
} // namespace
|
||||
|
||||
static FILE*
|
||||
openTOML(const char* outputdir, const char* name, const char* collationType, const char* structType, UErrorCode *status) {
|
||||
CharString baseName;
|
||||
baseName.append(name, *status);
|
||||
baseName.append("_", *status);
|
||||
baseName.append(collationType, *status);
|
||||
baseName.append("_", *status);
|
||||
baseName.append(structType, *status);
|
||||
|
||||
CharString outFileName;
|
||||
if (outputdir && *outputdir) {
|
||||
outFileName.append(outputdir, *status).ensureEndsWithFileSeparator(*status);
|
||||
}
|
||||
outFileName.append(baseName, *status);
|
||||
outFileName.append(".toml", *status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
FILE* f = fopen(outFileName.data(), "w");
|
||||
if (!f) {
|
||||
*status = U_FILE_ACCESS_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
usrc_writeFileNameGeneratedBy(f, "#", baseName.data(), "genrb -X");
|
||||
|
||||
return f;
|
||||
}
|
||||
|
||||
static void
|
||||
writeCollationMetadataTOML(const char* outputdir, const char* name, const char* collationType, const uint32_t metadataBits, UErrorCode *status) {
|
||||
FILE* f = openTOML(outputdir, name, collationType, "meta", status);
|
||||
if (!f) {
|
||||
return;
|
||||
}
|
||||
// printf("writeCollationMetadataTOML %s %s\n", name, collationType);
|
||||
fprintf(f, "bits = 0x%X\n", metadataBits);
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
/**
 * Writes the "dia" TOML file: the secondary weights of the code points from
 * ICU4X_DIACRITIC_BASE up to (excluding) the returned limit.
 *
 * Scanning stops at the first code point whose CE32 is not a simple/long CE32,
 * or whose CE differs from COMMON_TERTIARY_CE in anything but the secondary
 * weight; that code point becomes the limit. For name "root" a non-simple CE32
 * is treated as an error instead.
 *
 * @param outputdir     output directory passed on to openTOML()
 * @param name          name component of the file name; "root" enables the
 *                      strict check described above
 * @param collationType collation type component of the file name
 * @param data          collation data to read CE32s from; FALLBACK_CE32 values
 *                      are looked up in data->base
 * @param status        in/out ICU error code; set to U_INTERNAL_PROGRAM_ERROR
 *                      for an unsupported root diacritic
 * @return the exclusive upper bound of the code points written;
 *         ICU4X_DIACRITIC_LIMIT if the file could not be opened or on the
 *         root error
 */
static UChar32
writeCollationDiacriticsTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
    UChar32 limit = ICU4X_DIACRITIC_LIMIT;
    FILE* out = openTOML(outputdir, name, collationType, "dia", status);
    if (out == NULL) {
        return limit;
    }

    uint16_t secondaries[ICU4X_DIACRITIC_LIMIT-ICU4X_DIACRITIC_BASE];
    for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
        // Resolve the CE32, following the fallback to the base data.
        uint32_t ce32 = data->getCE32(c);
        if (ce32 == icu::Collation::FALLBACK_CE32) {
            ce32 = data->base->getCE32(c);
        }
        const int32_t slot = c - ICU4X_DIACRITIC_BASE;

        const bool neverInNfd = c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344;
        if (neverInNfd) {
            // These never occur in NFD data
            secondaries[slot] = 0;
            continue;
        }

        if (!icu::Collation::isSimpleOrLongCE32(ce32)) {
            if (uprv_strcmp(name, "root") == 0) {
                printf("UNSUPPORTED DIACRITIC CE32 in root: TAG: %X CE32: %X char: %X\n", icu::Collation::tagFromCE32(ce32), ce32, c);
                fclose(out);
                *status = U_INTERNAL_PROGRAM_ERROR;
                return limit;
            }
            limit = c;
            break;
        }

        const uint64_t ce = uint64_t(icu::Collation::ceFromCE32(ce32));
        if ((ce & 0xFFFFFFFF0000FFFF) != uint64_t(icu::Collation::COMMON_TERTIARY_CE)) {
            // Not a CE where only the secondary weight differs from the expected
            // pattern.
            limit = c;
            break;
        }
        secondaries[slot] = uint16_t(ce >> 16);
    }

    usrc_writeArray(out, "secondaries = [\n ", secondaries, 16, limit-ICU4X_DIACRITIC_BASE, " ", "\n]\n");
    fclose(out);
    return limit;
}
|
||||
|
||||
static void
|
||||
writeCollationReorderingTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationSettings* settings, UErrorCode *status) {
|
||||
FILE* f = openTOML(outputdir, name, collationType, "reord", status);
|
||||
if (!f) {
|
||||
return;
|
||||
}
|
||||
// printf("writeCollationReorderingTOML %s %s\n", name, collationType);
|
||||
fprintf(f, "min_high_no_reorder = 0x%X\n", settings->minHighNoReorder);
|
||||
usrc_writeArray(f, "reorder_table = [\n ", settings->reorderTable, 8, 256, " ", "\n]\n");
|
||||
usrc_writeArray(f, "reorder_ranges = [\n ", settings->reorderRanges, 32, settings->reorderRangesLength, " ", "\n]\n");
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
writeCollationJamoTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
|
||||
FILE* f = openTOML(outputdir, name, collationType, "jamo", status);
|
||||
if (!f) {
|
||||
printf("writeCollationJamoTOML FAILED TO OPEN FILE %s %s\n", name, collationType);
|
||||
return;
|
||||
}
|
||||
uint32_t jamo[0x1200-0x1100];
|
||||
for (UChar32 c = 0x1100; c < 0x1200; ++c) {
|
||||
uint32_t ce32 = data->getCE32(c);
|
||||
if (ce32 == icu::Collation::FALLBACK_CE32) {
|
||||
ce32 = data->base->getCE32(c);
|
||||
}
|
||||
// Can't reject complex CE32s, because search collations have expansions.
|
||||
// These expansions refer to the tailoring, which foils the reuse of the
|
||||
// these jamo tables.
|
||||
// XXX Figure out what to do. Perhaps instead of having Latin mini expansions,
|
||||
// there should be Hangul mini expansions.
|
||||
// XXX in any case, validate that modern jamo are self-contained.
|
||||
jamo[c - 0x1100] = ce32;
|
||||
|
||||
}
|
||||
usrc_writeArray(f, "ce32s = [\n ", jamo, 32, 0x1200-0x1100, " ", "\n]\n");
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
/**
 * utrie2_enum() range callback that copies one range into the UMutableCPTrie
 * passed as `context`.
 *
 * Ranges lying entirely inside the conjoining jamo block (U+1100..U+11FF) are
 * skipped and reported as handled.
 *
 * @return TRUE to continue the enumeration; FALSE if setting the range failed.
 */
static UBool
convertTrie(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    const bool startIsJamo = (0x1100 <= start && start < 0x1200);
    const bool endIsJamo = (0x1100 <= end && end < 0x1200);
    if (startIsJamo && endIsJamo) {
        return TRUE;
    }

    icu::IcuToolErrorCode status("genrb: convertTrie");
    UMutableCPTrie *target = (UMutableCPTrie*)context;
    umutablecptrie_setRange(target, start, end, value, status);
    return !U_FAILURE(*status);
}
|
||||
|
||||
static void
|
||||
writeCollationDataTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UBool root, UChar32 diacriticLimit, UErrorCode *status) {
|
||||
FILE* f = openTOML(outputdir, name, collationType, "data", status);
|
||||
if (!f) {
|
||||
return;
|
||||
}
|
||||
// printf("writeCollationDataTOML %s %s\n", name, collationType);
|
||||
|
||||
icu::UnicodeSet tailoringSet;
|
||||
|
||||
if (data->base) {
|
||||
tailoringSet.addAll(*(data->unsafeBackwardSet));
|
||||
tailoringSet.removeAll(*(data->base->unsafeBackwardSet));
|
||||
} else {
|
||||
tailoringSet.addAll(*(data->unsafeBackwardSet));
|
||||
}
|
||||
|
||||
// Use the same value for out-of-range and default in the hope of not having to allocate
|
||||
// different blocks, since ICU4X never does out-of-range queries.
|
||||
uint32_t trieDefault = root ? icu::Collation::UNASSIGNED_CE32 : icu::Collation::FALLBACK_CE32;
|
||||
icu::LocalUMutableCPTriePointer builder(umutablecptrie_open(trieDefault, trieDefault, status));
|
||||
|
||||
utrie2_enum(data->trie, NULL, &convertTrie, builder.getAlias());
|
||||
|
||||
// If the diacritic table was cut short, copy CE32s between the lowered
|
||||
// limit and the max limit from the root to the tailoring. As of June 2022,
|
||||
// no collation in CLDR needs this.
|
||||
for (UChar32 c = diacriticLimit; c < ICU4X_DIACRITIC_LIMIT; ++c) {
|
||||
if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
|
||||
// These never occur in NFD data.
|
||||
continue;
|
||||
}
|
||||
uint32_t ce32 = data->getCE32(c);
|
||||
if (ce32 == icu::Collation::FALLBACK_CE32) {
|
||||
ce32 = data->base->getCE32(c);
|
||||
umutablecptrie_set(builder.getAlias(), c, ce32, status);
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that the range covered by the diacritic table isn't duplicated
|
||||
// in the trie.
|
||||
for (UChar32 c = ICU4X_DIACRITIC_BASE; c < diacriticLimit; ++c) {
|
||||
if (umutablecptrie_get(builder.getAlias(), c) != trieDefault) {
|
||||
umutablecptrie_set(builder.getAlias(), c, trieDefault, status);
|
||||
}
|
||||
}
|
||||
|
||||
icu::LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
|
||||
builder.getAlias(),
|
||||
UCPTRIE_TYPE_SMALL,
|
||||
UCPTRIE_VALUE_BITS_32,
|
||||
status));
|
||||
usrc_writeArray(f, "contexts = [\n ", data->contexts, 16, data->contextsLength, " ", "\n]\n");
|
||||
usrc_writeArray(f, "ce32s = [\n ", data->ce32s, 32, data->ce32sLength, " ", "\n]\n");
|
||||
usrc_writeArray(f, "ces = [\n ", data->ces, 64, data->cesLength, " ", "\n]\n");
|
||||
fprintf(f, "[trie]\n");
|
||||
usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
|
||||
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
static void
|
||||
writeCollationSpecialPrimariesTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
|
||||
FILE* f = openTOML(outputdir, name, collationType, "prim", status);
|
||||
if (!f) {
|
||||
return;
|
||||
}
|
||||
// printf("writeCollationSpecialPrimariesTOML %s %s\n", name, collationType);
|
||||
|
||||
uint16_t lastPrimaries[4];
|
||||
for (int32_t i = 0; i < 4; ++i) {
|
||||
// getLastPrimaryForGroup subtracts one from a 16-bit value, so we add one
|
||||
// back to get a value that fits in 16 bits.
|
||||
lastPrimaries[i] = (uint16_t)((data->getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) + 1) >> 16);
|
||||
}
|
||||
|
||||
uint32_t numericPrimary = data->numericPrimary;
|
||||
if (numericPrimary & 0xFFFFFF) {
|
||||
printf("Lower 24 bits set in numeric primary");
|
||||
*status = U_INTERNAL_PROGRAM_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
usrc_writeArray(f, "last_primaries = [\n ", lastPrimaries, 16, 4, " ", "\n]\n");
|
||||
fprintf(f, "numeric_primary = 0x%X\n", numericPrimary >> 24);
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
static void
|
||||
writeCollationTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, const icu::CollationSettings* settings, UErrorCode *status) {
|
||||
UBool tailored = FALSE;
|
||||
UBool tailoredDiacritics = FALSE;
|
||||
UBool lithuanianDotAbove = (uprv_strcmp(name, "lt") == 0);
|
||||
UBool reordering = FALSE;
|
||||
UBool isRoot = uprv_strcmp(name, "root") == 0;
|
||||
UChar32 diacriticLimit = ICU4X_DIACRITIC_LIMIT;
|
||||
if (!data->base && isRoot) {
|
||||
diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
writeCollationJamoTOML(outputdir, name, collationType, data, status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
writeCollationSpecialPrimariesTOML(outputdir, name, collationType, data, status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
} else if (data->base && !lithuanianDotAbove) {
|
||||
for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
|
||||
if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
|
||||
// These never occur in NFD data.
|
||||
continue;
|
||||
}
|
||||
uint32_t ce32 = data->getCE32(c);
|
||||
if ((ce32 != icu::Collation::FALLBACK_CE32) && (ce32 != data->base->getCE32(c))) {
|
||||
tailoredDiacritics = TRUE;
|
||||
diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (settings->hasReordering()) {
|
||||
reordering = TRUE;
|
||||
// Note: There are duplicate reorderings. Expecting the ICU4X provider
|
||||
// to take care of deduplication.
|
||||
writeCollationReorderingTOML(outputdir, name, collationType, settings, status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Write collation data if either base is non-null or the name is root.
|
||||
// Languages that only reorder scripts are otherwise root-like and have
|
||||
// null base.
|
||||
if (data->base || isRoot) {
|
||||
tailored = !isRoot;
|
||||
writeCollationDataTOML(outputdir, name, collationType, data, (!data->base && isRoot), diacriticLimit, status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t maxVariable = (uint32_t)settings->getMaxVariable();
|
||||
if (maxVariable >= 4) {
|
||||
printf("Max variable out of range");
|
||||
*status = U_INTERNAL_PROGRAM_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t metadataBits = maxVariable;
|
||||
if (tailored) {
|
||||
metadataBits |= (1 << 3);
|
||||
}
|
||||
if (tailoredDiacritics) {
|
||||
metadataBits |= (1 << 4);
|
||||
}
|
||||
if (reordering) {
|
||||
metadataBits |= (1 << 5);
|
||||
}
|
||||
if (lithuanianDotAbove) {
|
||||
metadataBits |= (1 << 6);
|
||||
}
|
||||
if ((settings->options & icu::CollationSettings::BACKWARD_SECONDARY) != 0) {
|
||||
metadataBits |= (1 << 7);
|
||||
}
|
||||
if (settings->getAlternateHandling() == UCOL_SHIFTED) {
|
||||
metadataBits |= (1 << 8);
|
||||
}
|
||||
switch (settings->getCaseFirst()) {
|
||||
case UCOL_OFF:
|
||||
break;
|
||||
case UCOL_UPPER_FIRST:
|
||||
metadataBits |= (1 << 9);
|
||||
metadataBits |= (1 << 10);
|
||||
break;
|
||||
case UCOL_LOWER_FIRST:
|
||||
metadataBits |= (1 << 9);
|
||||
break;
|
||||
default:
|
||||
*status = U_INTERNAL_PROGRAM_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
writeCollationMetadataTOML(outputdir, name, collationType, metadataBits, status);
|
||||
}
|
||||
|
||||
#endif // !UCONFIG_NO_COLLATION
|
||||
|
||||
static TableResource *
|
||||
|
@ -952,9 +1287,9 @@ addCollation(ParseState* state, TableResource *result, const char *collationTyp
|
|||
res_close(result);
|
||||
return NULL; // TODO: use LocalUResourceBundlePointer for result
|
||||
}
|
||||
icu::CollationBuilder builder(base, intStatus);
|
||||
if(uprv_strncmp(collationType, "search", 6) == 0) {
|
||||
builder.disableFastLatin(); // build fast-Latin table unless search collator
|
||||
icu::CollationBuilder builder(base, state->icu4xMode, intStatus);
|
||||
if(state->icu4xMode || (uprv_strncmp(collationType, "search", 6) == 0)) {
|
||||
builder.disableFastLatin(); // build fast-Latin table unless search collator or ICU4X
|
||||
}
|
||||
LocalPointer<icu::CollationTailoring> t(
|
||||
builder.parseAndBuild(rules, version, &importer, &parseError, intStatus));
|
||||
|
@ -977,6 +1312,19 @@ addCollation(ParseState* state, TableResource *result, const char *collationTyp
|
|||
return NULL;
|
||||
}
|
||||
}
|
||||
if (state->icu4xMode) {
|
||||
char *nameWithoutSuffix = static_cast<char *>(uprv_malloc(uprv_strlen(state->filename) + 1));
|
||||
if (nameWithoutSuffix == NULL) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
res_close(result);
|
||||
return NULL;
|
||||
}
|
||||
uprv_strcpy(nameWithoutSuffix, state->filename);
|
||||
*uprv_strrchr(nameWithoutSuffix, '.') = 0;
|
||||
|
||||
writeCollationTOML(state->outputdir, nameWithoutSuffix, collationType, t->data, t->settings, status);
|
||||
uprv_free(nameWithoutSuffix);
|
||||
}
|
||||
icu::LocalMemory<uint8_t> buffer;
|
||||
int32_t capacity = 100000;
|
||||
uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
|
||||
|
@ -1966,7 +2314,7 @@ parseResource(ParseState* state, char *tag, const struct UString *comment, UErro
|
|||
/* parse the top-level resource */
|
||||
struct SRBRoot *
|
||||
parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *filename,
|
||||
UBool makeBinaryCollation, UBool omitCollationRules, UErrorCode *status)
|
||||
UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status)
|
||||
{
|
||||
struct UString *tokenValue;
|
||||
struct UString comment;
|
||||
|
@ -1992,6 +2340,7 @@ parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *fi
|
|||
state.filename = filename;
|
||||
state.makeBinaryCollation = makeBinaryCollation;
|
||||
state.omitCollationRules = omitCollationRules;
|
||||
state.icu4xMode = icu4xMode;
|
||||
|
||||
ustr_init(&comment);
|
||||
expect(&state, TOK_STRING, &tokenValue, &comment, NULL, status);
|
||||
|
|
|
@ -31,7 +31,7 @@ void initParser();
|
|||
/* Parse a ResourceBundle text file */
|
||||
struct SRBRoot* parse(UCHARBUF *buf, const char* inputDir, const char* outputDir,
|
||||
const char *filename,
|
||||
UBool makeBinaryCollation, UBool omitCollationRules, UErrorCode *status);
|
||||
UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status);
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
|
|
|
@ -1,7 +1,15 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <unicode/localpointer.h>
|
||||
#include <unicode/umachine.h>
|
||||
#include <unicode/unistr.h>
|
||||
#include <unicode/urename.h>
|
||||
#include <unicode/uset.h>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "toolutil.h"
|
||||
|
@ -15,7 +23,10 @@
|
|||
#include "unicode/uscript.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/umutablecptrie.h"
|
||||
#include "unicode/ucharstriebuilder.h"
|
||||
#include "ucase.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "writesrc.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
@ -299,6 +310,470 @@ FILE* prepareOutputFile(const char* basename) {
|
|||
return f;
|
||||
}
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
struct PendingDescriptor {
|
||||
UChar32 scalar;
|
||||
uint32_t descriptor;
|
||||
UBool supplementary;
|
||||
};
|
||||
|
||||
// Writes the "compositions" file: a UCharsTrie, serialized as an array of
// 16-bit units, that maps (second character, starter) to the composite.
//
// Only characters whose raw NFC decomposition is exactly two code points and
// that compose back from that pair are considered. Hangul syllables are left
// out of the trie. As a side effect, every second character that has
// combining class zero is added to backwardCombiningStarters.
void writeCanonicalCompositions(USet* backwardCombiningStarters) {
    IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions");
    const char* basename = "compositions";
    FILE* f = prepareOutputFile(basename);

    LocalPointer<UCharsTrieBuilder> trieBuilder(new UCharsTrieBuilder(status), status);

    const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
    UChar32 scalars[DECOMPOSITION_BUFFER_SIZE];

    const Normalizer2* nfc = Normalizer2::getNFCInstance(status);
    for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
        const bool isSurrogate = (0xD800 <= c && c < 0xE000);
        if (isSurrogate) {
            continue;
        }

        UnicodeString rawDecomposition;
        if (!nfc->getRawDecomposition(c, rawDecomposition)) {
            continue;
        }
        if (rawDecomposition.toUTF32(scalars, DECOMPOSITION_BUFFER_SIZE, status) != 2) {
            continue;
        }

        const UChar32 starter = scalars[0];
        const UChar32 second = scalars[1];
        const UChar32 composite = nfc->composePair(starter, second);
        if (composite < 0) {
            continue;
        }
        if (composite != c) {
            // The pair composes, but not back to the character it came from.
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }

        if (u_getCombiningClass(second) == 0) {
            uset_add(backwardCombiningStarters, second);
        }

        const bool isHangulSyllable = (0xAC00 <= composite && composite <= 0xD7A3);
        if (isHangulSyllable) {
            continue;
        }

        // The trie key is the pair in reverse order: second character first.
        UnicodeString key;
        key.append(second).append(starter);
        trieBuilder->add(key, int32_t(composite), status);
    }

    UnicodeString serializedTrie;
    trieBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, serializedTrie, status);

    usrc_writeArray(f, "compositions = [\n  ", serializedTrie.getBuffer(), 16, serializedTrie.length(), "  ", "\n]\n");
    fclose(f);
    handleError(status, basename);
}
|
||||
|
||||
// Writes one decomposition-table file containing two arrays: `scalars16`
// (len16 16-bit values from ptr16) followed by `scalars32` (len32 32-bit
// values from ptr32).
void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) {
    FILE* out = prepareOutputFile(basename);

    usrc_writeArray(out, "scalars16 = [\n  ", ptr16, 16, len16, "  ", "\n]\n");
    usrc_writeArray(out, "scalars32 = [\n  ", ptr32, 32, len32, "  ", "\n]\n");

    fclose(out);
}
|
||||
|
||||
void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions) {
|
||||
IcuToolErrorCode status("icuexportdata: writeDecompositionData");
|
||||
FILE* f = prepareOutputFile(basename);
|
||||
|
||||
// Zero is a magic number that means the character decomposes to itself.
|
||||
LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
|
||||
|
||||
// Iterate backwards to insert lower code points in the trie first in case it matters
|
||||
// for trie block allocation.
|
||||
for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) {
|
||||
const PendingDescriptor& pending = pendingTrieInsertions[i];
|
||||
uint32_t additional = 0;
|
||||
if (!(pending.descriptor & 0xFFFF0000)) {
|
||||
uint32_t offset = pending.descriptor & 0xFFF;
|
||||
if (!pending.supplementary) {
|
||||
if (offset >= baseSize16) {
|
||||
// This is a offset to supplementary 16-bit data. We have
|
||||
// 16-bit base data and 32-bit base data before. However,
|
||||
// the 16-bit base data length is already part of offset.
|
||||
additional = baseSize32;
|
||||
}
|
||||
} else {
|
||||
if (offset >= baseSize32) {
|
||||
// This is an offset to supplementary 32-bit data. We have 16-bit
|
||||
// base data, 32-bit base data, and 16-bit supplementary data before.
|
||||
// However, the 32-bit base data length is already part
|
||||
// of offset.
|
||||
additional = baseSize16 + supplementSize16;
|
||||
} else {
|
||||
// This is an offset to 32-bit base data. We have 16-bit
|
||||
// base data before.
|
||||
additional = baseSize16;
|
||||
}
|
||||
}
|
||||
if (offset + additional > 0xFFF) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
}
|
||||
umutablecptrie_set(builder.getAlias(), pending.scalar, pending.descriptor + additional, status);
|
||||
}
|
||||
LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
|
||||
builder.getAlias(),
|
||||
trieType,
|
||||
UCPTRIE_VALUE_BITS_32,
|
||||
status));
|
||||
handleError(status, basename);
|
||||
|
||||
if (!reference) {
|
||||
usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML);
|
||||
} else {
|
||||
if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) {
|
||||
// NFD expectations don't hold. The set must not contain the half-width
|
||||
// kana voicing marks and must contain iota subscript.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
|
||||
USet* halfWidthVoicing = uset_openEmpty();
|
||||
uset_add(halfWidthVoicing, 0xFF9E);
|
||||
uset_add(halfWidthVoicing, 0xFF9F);
|
||||
|
||||
USet* iotaSubscript = uset_openEmpty();
|
||||
uset_add(iotaSubscript, 0x0345);
|
||||
|
||||
uint8_t flags = 0;
|
||||
|
||||
USet* halfWidthCheck = uset_cloneAsThawed(uset);
|
||||
uset_removeAll(halfWidthCheck, reference);
|
||||
if (uset_equals(halfWidthCheck, halfWidthVoicing)) {
|
||||
flags |= 1;
|
||||
} else if (!uset_isEmpty(halfWidthCheck)) {
|
||||
// The result was neither empty nor contained exactly
|
||||
// the two half-width voicing marks. The ICU4X
|
||||
// normalizer doesn't know how to deal with this case.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
uset_close(halfWidthCheck);
|
||||
|
||||
USet* iotaCheck = uset_cloneAsThawed(reference);
|
||||
uset_removeAll(iotaCheck, uset);
|
||||
if (uset_equals(iotaCheck, iotaSubscript)) {
|
||||
flags |= (1 << 1);
|
||||
} else if (!uset_isEmpty(iotaCheck)) {
|
||||
// The result was neither empty nor contained exactly
|
||||
// the iota subscript. The ICU4X normalizer doesn't
|
||||
// know how to deal with this case.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
uset_close(halfWidthCheck);
|
||||
|
||||
uset_close(iotaSubscript);
|
||||
uset_close(halfWidthVoicing);
|
||||
|
||||
fprintf(f, "flags = 0x%X\n", flags);
|
||||
}
|
||||
fprintf(f, "[trie]\n");
|
||||
usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
|
||||
fclose(f);
|
||||
handleError(status, basename);
|
||||
}
|
||||
|
||||
void writePotentialCompositionPassThrough(const char* basename, const Normalizer2* norm, const USet* decompositionStartsWithNonStarter, const USet* decompositionStartsWithBackwardCombiningStarter, USet* potentialPassthroughAndNotBackwardCombining) {
|
||||
IcuToolErrorCode status("icuexportdata: writePotentialCompositionPassThrough");
|
||||
FILE* f = prepareOutputFile(basename);
|
||||
|
||||
const Normalizer2* nfc = nullptr;
|
||||
if (!norm) {
|
||||
// UTS 46 case
|
||||
norm = Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, status);
|
||||
nfc = Normalizer2::getNFCInstance(status);
|
||||
}
|
||||
for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
|
||||
if (c >= 0xD800 && c < 0xE000) {
|
||||
// Surrogate
|
||||
continue;
|
||||
}
|
||||
if (uset_contains(decompositionStartsWithNonStarter, c) || uset_contains(decompositionStartsWithBackwardCombiningStarter, c)) {
|
||||
continue;
|
||||
}
|
||||
UnicodeString src;
|
||||
UnicodeString dst;
|
||||
src.append(c);
|
||||
norm->normalize(src, dst, status);
|
||||
if (nfc && (dst.isEmpty() || (dst == u"\uFFFD" && c != 0xFFFD))) {
|
||||
// UTS 46 ignored and disallowed fall back to NFC for data
|
||||
// overlap.
|
||||
dst.truncate(0);
|
||||
nfc->normalize(src, dst, status);
|
||||
}
|
||||
if (src == dst) {
|
||||
uset_add(potentialPassthroughAndNotBackwardCombining, c);
|
||||
}
|
||||
}
|
||||
|
||||
// The surrogate range forms a useless discontinuity. The code
|
||||
// that reads from the set never looks up by surrage, so let's
|
||||
// put the surrogate range in the set as a micro-optimization.
|
||||
uset_addRange(potentialPassthroughAndNotBackwardCombining, 0xD800, 0xDFFF);
|
||||
|
||||
usrc_writeUnicodeSet(f, potentialPassthroughAndNotBackwardCombining, UPRV_TARGET_SYNTAX_TOML);
|
||||
fclose(f);
|
||||
handleError(status, basename);
|
||||
}
|
||||
|
||||
// Computes data for canonical decompositions
|
||||
void computeDecompositions(const char* basename, const USet* backwardCombiningStarters, std::vector<uint16_t>& storage16, std::vector<uint32_t>& storage32, USet* decompositionStartsWithNonStarter, USet* decompositionStartsWithBackwardCombiningStarter, std::vector<PendingDescriptor>& pendingTrieInsertions) {
|
||||
IcuToolErrorCode status("icuexportdata: computeDecompositions");
|
||||
const Normalizer2* mainNormalizer;
|
||||
const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status);
|
||||
if (uprv_strcmp(basename, "nfkd") == 0) {
|
||||
mainNormalizer = Normalizer2::getNFKDInstance(status);
|
||||
} else if (uprv_strcmp(basename, "uts46d") == 0) {
|
||||
mainNormalizer = Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, status);
|
||||
} else {
|
||||
mainNormalizer = nfdNormalizer;
|
||||
}
|
||||
|
||||
// Max length as of Unicode 14 is 4 for NFD. For NFKD the max
|
||||
// is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB).
|
||||
const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9;
|
||||
const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8;
|
||||
const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
|
||||
UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
|
||||
|
||||
// Iterate over all scalar values excluding Hangul syllables.
|
||||
//
|
||||
// We go backwards in order to better find overlapping decompositions.
|
||||
//
|
||||
// As of Unicode 14:
|
||||
// Iterate forward without overlap search:
|
||||
// nfd: 16 size: 896, 32 size: 173
|
||||
// nfkd: 16 size: 3854, 32 size: 179
|
||||
//
|
||||
// Iterate forward with overlap search:
|
||||
// nfd: 16 size: 888, 32 size: 173
|
||||
// nfkd: 16 size: 3266, 32 size: 179
|
||||
//
|
||||
// Iterate backward with overlap search:
|
||||
// nfd: 16 size: 776, 32 size: 173
|
||||
// nfkd: 16 size: 2941, 32 size: 179
|
||||
//
|
||||
// UChar32 is signed!
|
||||
for (UChar32 c = 0x10FFFF; c >= 0; --c) {
|
||||
if (c >= 0xAC00 && c <= 0xD7A3) {
|
||||
// Hangul syllable
|
||||
continue;
|
||||
}
|
||||
if (c >= 0xD800 && c < 0xE000) {
|
||||
// Surrogate
|
||||
continue;
|
||||
}
|
||||
UnicodeString src;
|
||||
UnicodeString dst;
|
||||
src.append(c);
|
||||
if (mainNormalizer != nfdNormalizer) {
|
||||
UnicodeString inter;
|
||||
mainNormalizer->normalize(src, inter, status);
|
||||
nfdNormalizer->normalize(inter, dst, status);
|
||||
} else {
|
||||
nfdNormalizer->normalize(src, dst, status);
|
||||
}
|
||||
int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
|
||||
if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
|
||||
// Characters that normalize to nothing or to U+FFFD (without the
|
||||
// input being U+FFFD) in ICU4C's UTS 46 normalization normalize
|
||||
// as in NFD in ICU4X's UTF 46 normalization in the interest
|
||||
// of data size and ICU4X's normalizer being unable to handle
|
||||
// normalizing to nothing.
|
||||
// When UTS 46 is implemented on top of ICU4X, a preprocessing
|
||||
// step is supposed to remove these characters before the
|
||||
// normalization step.
|
||||
if (uprv_strcmp(basename, "uts46d") != 0) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
nfdNormalizer->normalize(src, dst, status);
|
||||
len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
|
||||
if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
}
|
||||
if (len > DECOMPOSITION_BUFFER_SIZE) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
bool startsWithNonStarter = u_getCombiningClass(utf32[0]);
|
||||
if (startsWithNonStarter) {
|
||||
uset_add(decompositionStartsWithNonStarter, c);
|
||||
} else if (uset_contains(backwardCombiningStarters, c)) {
|
||||
uset_add(decompositionStartsWithBackwardCombiningStarter, c);
|
||||
}
|
||||
if (mainNormalizer != nfdNormalizer) {
|
||||
UnicodeString nfd;
|
||||
nfdNormalizer->normalize(src, nfd, status);
|
||||
if (dst == nfd) {
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if (src == dst) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (startsWithNonStarter && !(c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F)) {
|
||||
// A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
if (len == 1 && utf32[0] <= 0xFFFF) {
|
||||
if (utf32[0] == 1) {
|
||||
// 1 is reserved as a marker for the expansion of U+FDFA.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, FALSE});
|
||||
} else if (len == 2 && utf32[0] <= 0xFFFF && utf32[1] <= 0xFFFF && !u_getCombiningClass(utf32[0]) && u_getCombiningClass(utf32[1])) {
|
||||
pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), FALSE});
|
||||
} else {
|
||||
UBool supplementary = FALSE;
|
||||
UBool nonInitialStarter = FALSE;
|
||||
for (int32_t i = 0; i < len; ++i) {
|
||||
if (utf32[i] > 0xFFFF) {
|
||||
supplementary = TRUE;
|
||||
}
|
||||
if (utf32[i] == 0) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
if (i != 0 && !u_getCombiningClass(utf32[i])) {
|
||||
nonInitialStarter = TRUE;
|
||||
}
|
||||
}
|
||||
if (!supplementary) {
|
||||
if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) {
|
||||
if (len == 18 && c == 0xFDFA) {
|
||||
// Special marker for the one character whose decomposition
|
||||
// is too long.
|
||||
pendingTrieInsertions.push_back({c, 1 << 16, supplementary});
|
||||
continue;
|
||||
} else {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
}
|
||||
} else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
// Complex decomposition
|
||||
// Format for 16-bit value:
|
||||
// 15..13: length minus two for 16-bit case and length minus one for
|
||||
// the 32-bit case. Length 8 needs to fit in three bits in
|
||||
// the 16-bit case, and this way the value is future-proofed
|
||||
// up to 9 in the 16-bit case. Zero is unused and length one
|
||||
// in the 16-bit case goes directly into the trie.
|
||||
// 12: 1 if all trailing characters are guaranteed non-starters,
|
||||
// 0 if no guarantees about non-starterness.
|
||||
// Note: The bit choice is this way around to allow for
|
||||
// dynamically falling back to not having this but instead
|
||||
// having one more bit for length by merely choosing
|
||||
// different masks.
|
||||
// 11..0: Start offset in storage. The offset is to the logical
|
||||
// sequence of scalars16, scalars32, supplementary_scalars16,
|
||||
// supplementary_scalars32.
|
||||
uint32_t descriptor = uint32_t(!nonInitialStarter) << 12;
|
||||
if (!supplementary) {
|
||||
descriptor |= (uint32_t(len) - 2) << 13;
|
||||
} else {
|
||||
descriptor |= (uint32_t(len) - 1) << 13;
|
||||
}
|
||||
if (descriptor & 0xFFF) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
size_t index = 0;
|
||||
bool writeToStorage = FALSE;
|
||||
// Sadly, C++ lacks break and continue by label, so using goto in the
|
||||
// inner loops to break or continue the outer loop.
|
||||
if (!supplementary) {
|
||||
outer16: for (;;) {
|
||||
if (index == storage16.size()) {
|
||||
writeToStorage = TRUE;
|
||||
break;
|
||||
}
|
||||
if (storage16[index] == utf32[0]) {
|
||||
for (int32_t i = 1; i < len; ++i) {
|
||||
if (storage16[index + i] != uint32_t(utf32[i])) {
|
||||
++index;
|
||||
// continue outer
|
||||
goto outer16;
|
||||
}
|
||||
}
|
||||
// break outer
|
||||
goto after;
|
||||
}
|
||||
++index;
|
||||
}
|
||||
} else {
|
||||
outer32: for (;;) {
|
||||
if (index == storage32.size()) {
|
||||
writeToStorage = TRUE;
|
||||
break;
|
||||
}
|
||||
if (storage32[index] == uint32_t(utf32[0])) {
|
||||
for (int32_t i = 1; i < len; ++i) {
|
||||
if (storage32[index + i] != uint32_t(utf32[i])) {
|
||||
++index;
|
||||
// continue outer
|
||||
goto outer32;
|
||||
}
|
||||
}
|
||||
// break outer
|
||||
goto after;
|
||||
}
|
||||
++index;
|
||||
}
|
||||
}
|
||||
after:
|
||||
if (index > 0xFFF) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
descriptor |= uint32_t(index);
|
||||
if (!descriptor || descriptor > 0xFFFF) {
|
||||
// > 0xFFFF should never happen if the code above is correct.
|
||||
// == 0 should not happen due to the nature of the data.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
if (writeToStorage) {
|
||||
if (!supplementary) {
|
||||
for (int32_t i = 0; i < len; ++i) {
|
||||
storage16.push_back(uint16_t(utf32[i]));
|
||||
}
|
||||
} else {
|
||||
for (int32_t i = 0; i < len; ++i) {
|
||||
storage32.push_back(uint32_t(utf32[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
pendingTrieInsertions.push_back({c, descriptor, supplementary});
|
||||
}
|
||||
}
|
||||
if (storage16.size() + storage32.size() > 0xFFF) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
handleError(status, basename);
|
||||
}
|
||||
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
enum {
|
||||
OPT_HELP_H,
|
||||
OPT_HELP_QUESTION_MARK,
|
||||
|
@ -341,7 +816,7 @@ void printHelp(FILE* stdfile, const char* program) {
|
|||
"options:\n"
|
||||
"\t-h or -? or --help this usage text\n"
|
||||
"\t-V or --version show a version message\n"
|
||||
"\t-m or --mode mode: currently only 'uprops' and 'ucase', but more may be added\n"
|
||||
"\t-m or --mode mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n"
|
||||
"\t --trie-type set the trie type (small or fast, default small)\n"
|
||||
"\t-d or --destdir destination directory, followed by the path\n"
|
||||
"\t --all write out all properties known to icuexportdata\n"
|
||||
|
@ -387,6 +862,46 @@ int exportUprops(int argc, char* argv[]) {
|
|||
}
|
||||
}
|
||||
|
||||
if (propNames.empty()
|
||||
|| options[OPT_HELP_H].doesOccur
|
||||
|| options[OPT_HELP_QUESTION_MARK].doesOccur
|
||||
|| !options[OPT_MODE].doesOccur) {
|
||||
FILE *stdfile=argc<0 ? stderr : stdout;
|
||||
fprintf(stdfile,
|
||||
"usage: %s -m uprops [-options] [--all | properties...]\n"
|
||||
"\tdump Unicode property data to .toml files\n"
|
||||
"options:\n"
|
||||
"\t-h or -? or --help this usage text\n"
|
||||
"\t-V or --version show a version message\n"
|
||||
"\t-m or --mode mode: currently only 'uprops', but more may be added\n"
|
||||
"\t --trie-type set the trie type (small or fast, default small)\n"
|
||||
"\t-d or --destdir destination directory, followed by the path\n"
|
||||
"\t --all write out all properties known to icuexportdata\n"
|
||||
"\t --index write an _index.toml summarizing all data exported\n"
|
||||
"\t-c or --copyright include a copyright notice\n"
|
||||
"\t-v or --verbose Turn on verbose output\n"
|
||||
"\t-q or --quiet do not display warnings and progress\n",
|
||||
argv[0]);
|
||||
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
const char* mode = options[OPT_MODE].value;
|
||||
if (uprv_strcmp(mode, "uprops") != 0) {
|
||||
fprintf(stderr, "Invalid option for --mode (must be uprops)\n");
|
||||
return U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
|
||||
if (options[OPT_TRIE_TYPE].doesOccur) {
|
||||
if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
|
||||
trieType = UCPTRIE_TYPE_FAST;
|
||||
} else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
|
||||
trieType = UCPTRIE_TYPE_SMALL;
|
||||
} else {
|
||||
fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
|
||||
return U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
for (const char* propName : propNames) {
|
||||
UProperty propEnum = u_getPropertyEnum(propName);
|
||||
if (propEnum == UCHAR_INVALID_CODE) {
|
||||
|
@ -505,6 +1020,81 @@ int exportCase(int argc, char* argv[]) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
// Entry point for `--mode norm`: runs the canonical-composition writer, the
// three decomposition passes (nfd, nfkd, uts46d) with their data/table
// writers, and the composition pass-through writers, in that order.
// Takes no arguments. The only return statement returns 0; failures are
// reported by handing `status` to handleError().
// NOTE(review): handleError() presumably terminates the tool on failure --
// confirm against its definition.
int exportNorm() {
|
||||
IcuToolErrorCode status("icuexportdata: exportNorm");
|
||||
// This set is passed to writeCanonicalCompositions() and then to every
// computeDecompositions() call below.
// NOTE(review): presumably populated by writeCanonicalCompositions() -- confirm.
USet* backwardCombiningStarters = uset_openEmpty();
|
||||
writeCanonicalCompositions(backwardCombiningStarters);
|
||||
|
||||
// The same two vectors are passed to all three computeDecompositions() calls,
// so the nfkd and uts46d passes operate on storage the nfd pass already used.
std::vector<uint16_t> storage16;
|
||||
std::vector<uint32_t> storage32;
|
||||
|
||||
USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty();
|
||||
USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
|
||||
std::vector<PendingDescriptor> nfdPendingTrieInsertions;
|
||||
computeDecompositions("nfd", backwardCombiningStarters, storage16, storage32, nfdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithBackwardCombiningStarter, nfdPendingTrieInsertions);
|
||||
|
||||
// Vector sizes after the nfd pass only; they mark the boundary between the
// "base" part and whatever the later passes append.
uint32_t baseSize16 = storage16.size();
|
||||
uint32_t baseSize32 = storage32.size();
|
||||
|
||||
USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty();
|
||||
USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
|
||||
std::vector<PendingDescriptor> nfkdPendingTrieInsertions;
|
||||
computeDecompositions("nfkd", backwardCombiningStarters, storage16, storage32, nfkdDecompositionStartsWithNonStarter, nfkdDecompositionStartsWithBackwardCombiningStarter, nfkdPendingTrieInsertions);
|
||||
|
||||
USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty();
|
||||
USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
|
||||
std::vector<PendingDescriptor> uts46PendingTrieInsertions;
|
||||
computeDecompositions("uts46d", backwardCombiningStarters, storage16, storage32, uts46DecompositionStartsWithNonStarter, uts46DecompositionStartsWithBackwardCombiningStarter, uts46PendingTrieInsertions);
|
||||
|
||||
// Growth of each vector since the nfd pass (nfkd and uts46d passes combined).
uint32_t supplementSize16 = storage16.size() - baseSize16;
|
||||
uint32_t supplementSize32 = storage32.size() - baseSize32;
|
||||
|
||||
// The nfd call passes nullptr where the nfkd and uts46d calls pass the NFD
// starts-with-non-starter set. supplementSize32 is not passed to these calls.
writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions);
|
||||
writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions);
|
||||
writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions);
|
||||
|
||||
// "nfdex" receives the leading base-sized part of each vector; "nfkdex"
// receives the part that starts at the base offset and has the supplement size.
writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32);
|
||||
writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32);
|
||||
|
||||
USet* nfcPotentialPassthroughAndNotBackwardCombining = uset_openEmpty();
|
||||
// NOTE(review): status is not inspected between obtaining the normalizer and
// using it; it is only handed to handleError() further down.
const Normalizer2* nfc = Normalizer2::getNFCInstance(status);
|
||||
writePotentialCompositionPassThrough("nfc", nfc, nfdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithBackwardCombiningStarter, nfcPotentialPassthroughAndNotBackwardCombining);
|
||||
|
||||
USet* nfkcPotentialPassthroughAndNotBackwardCombining = uset_openEmpty();
|
||||
const Normalizer2* nfkc = Normalizer2::getNFKCInstance(status);
|
||||
writePotentialCompositionPassThrough("nfkc", nfkc, nfkdDecompositionStartsWithNonStarter, nfkdDecompositionStartsWithBackwardCombiningStarter, nfkcPotentialPassthroughAndNotBackwardCombining);
|
||||
|
||||
USet* uts46PotentialPassthroughAndNotBackwardCombining = uset_openEmpty();
|
||||
// Unlike the nfc/nfkc calls, no Normalizer2 instance is supplied here (nullptr).
writePotentialCompositionPassThrough("uts46", nullptr, uts46DecompositionStartsWithNonStarter, uts46DecompositionStartsWithBackwardCombiningStarter, uts46PotentialPassthroughAndNotBackwardCombining);
|
||||
|
||||
// Check that NFKC set has no characters that NFC doesn't also have.
|
||||
// Implemented destructively: the NFC set's members are removed from the NFKC
// set, and anything left over is treated as an internal error.
uset_removeAll(nfkcPotentialPassthroughAndNotBackwardCombining, nfcPotentialPassthroughAndNotBackwardCombining);
|
||||
if (!uset_isEmpty(nfkcPotentialPassthroughAndNotBackwardCombining)) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, "exportNorm");
|
||||
}
|
||||
|
||||
// Close every USet opened in this function.
uset_close(nfcPotentialPassthroughAndNotBackwardCombining);
|
||||
uset_close(nfkcPotentialPassthroughAndNotBackwardCombining);
|
||||
uset_close(uts46PotentialPassthroughAndNotBackwardCombining);
|
||||
|
||||
uset_close(nfdDecompositionStartsWithNonStarter);
|
||||
uset_close(nfkdDecompositionStartsWithNonStarter);
|
||||
uset_close(uts46DecompositionStartsWithNonStarter);
|
||||
|
||||
uset_close(nfdDecompositionStartsWithBackwardCombiningStarter);
|
||||
uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter);
|
||||
uset_close(uts46DecompositionStartsWithBackwardCombiningStarter);
|
||||
|
||||
uset_close(backwardCombiningStarters);
|
||||
// Final check of anything recorded in status (e.g. by the normalizer getters).
handleError(status, "exportNorm");
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
|
||||
|
@ -553,12 +1143,20 @@ int main(int argc, char* argv[]) {
|
|||
}
|
||||
|
||||
const char* mode = options[OPT_MODE].value;
|
||||
if (uprv_strcmp(mode, "norm") == 0) {
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
return exportNorm();
|
||||
#else
|
||||
fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n");
|
||||
return U_ILLEGAL_ARGUMENT_ERROR;
|
||||
#endif
|
||||
}
|
||||
if (uprv_strcmp(mode, "uprops") == 0) {
|
||||
return exportUprops(argc, argv);
|
||||
} else if (uprv_strcmp(mode, "ucase") == 0) {
|
||||
return exportCase(argc, argv);
|
||||
}
|
||||
|
||||
fprintf(stderr, "Invalid option for --mode (must be uprops or ucase)\n");
|
||||
fprintf(stderr, "Invalid option for --mode (must be uprops, ucase, or norm)\n");
|
||||
return U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
#include <time.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/putil.h"
|
||||
|
@ -143,12 +144,14 @@ usrc_writeArray(FILE *f,
|
|||
const uint8_t *p8;
|
||||
const uint16_t *p16;
|
||||
const uint32_t *p32;
|
||||
uint32_t value;
|
||||
const int64_t *p64; // Signed due to TOML!
|
||||
int64_t value; // Signed due to TOML!
|
||||
int32_t i, col;
|
||||
|
||||
p8=NULL;
|
||||
p16=NULL;
|
||||
p32=NULL;
|
||||
p64=NULL;
|
||||
switch(width) {
|
||||
case 8:
|
||||
p8=(const uint8_t *)p;
|
||||
|
@ -159,6 +162,9 @@ usrc_writeArray(FILE *f,
|
|||
case 32:
|
||||
p32=(const uint32_t *)p;
|
||||
break;
|
||||
case 64:
|
||||
p64=(const int64_t *)p;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "usrc_writeArray(width=%ld) unrecognized width\n", (long)width);
|
||||
return;
|
||||
|
@ -186,11 +192,14 @@ usrc_writeArray(FILE *f,
|
|||
case 32:
|
||||
value=p32[i];
|
||||
break;
|
||||
case 64:
|
||||
value=p64[i];
|
||||
break;
|
||||
default:
|
||||
value=0; /* unreachable */
|
||||
break;
|
||||
}
|
||||
fprintf(f, value<=9 ? "%lu" : "0x%lx", (unsigned long)value);
|
||||
fprintf(f, value<=9 ? "%" PRId64 : "0x%" PRIx64, value);
|
||||
}
|
||||
if(postfix!=NULL) {
|
||||
fputs(postfix, f);
|
||||
|
|
|
@ -69,7 +69,7 @@ usrc_writeFileNameGeneratedBy(
|
|||
const char *generator);
|
||||
|
||||
/**
|
||||
* Writes the contents of an array of 8/16/32-bit words.
|
||||
* Writes the contents of an array of 8/16/32/64-bit words.
|
||||
* The prefix and postfix are optional (can be NULL) and are written first/last.
|
||||
* The prefix may contain a %ld or similar field for the array length.
|
||||
* The {} and declaration etc. need to be included in prefix/postfix or
|
||||
|
|
|
@ -83,14 +83,15 @@ binarySearch(const UVector64 &list, int64_t ce) {
|
|||
|
||||
} // namespace
|
||||
|
||||
CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode)
|
||||
: CollationDataBuilder(errorCode),
|
||||
CollationBaseDataBuilder::CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
|
||||
: CollationDataBuilder(icu4xMode, errorCode),
|
||||
numericPrimary(0x12000000),
|
||||
firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
|
||||
rootElements(errorCode),
|
||||
scriptStartsLength(1) {
|
||||
uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex));
|
||||
uprv_memset(scriptStarts, 0, sizeof(scriptStarts));
|
||||
this->icu4xMode = icu4xMode;
|
||||
}
|
||||
|
||||
CollationBaseDataBuilder::~CollationBaseDataBuilder() {
|
||||
|
@ -119,7 +120,9 @@ CollationBaseDataBuilder::init(UErrorCode &errorCode) {
|
|||
trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode);
|
||||
|
||||
// Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches.
|
||||
for(UChar32 c = 0; c < 0x180; ++c) {
|
||||
// In the ICU4X case, only preallocate ASCII, because we don't store CE32s for
|
||||
// precomposed characters.
|
||||
for(UChar32 c = 0; c < (icu4xMode ? 0x80 : 0x180); ++c) {
|
||||
utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode);
|
||||
}
|
||||
|
||||
|
@ -128,8 +131,10 @@ CollationBaseDataBuilder::init(UErrorCode &errorCode) {
|
|||
// Some code assumes that the root first primary CE is the "space first primary"
|
||||
// from FractionalUCA.txt.
|
||||
|
||||
uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
|
||||
utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
|
||||
if (!icu4xMode) {
|
||||
uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
|
||||
utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
|
||||
}
|
||||
|
||||
// Add a mapping for the first-unassigned boundary,
|
||||
// which is the AlphabeticIndex overflow boundary.
|
||||
|
|
|
@ -37,7 +37,7 @@ U_NAMESPACE_BEGIN
|
|||
*/
|
||||
class U_I18N_API CollationBaseDataBuilder : public CollationDataBuilder {
|
||||
public:
|
||||
CollationBaseDataBuilder(UErrorCode &errorCode);
|
||||
CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode);
|
||||
|
||||
virtual ~CollationBaseDataBuilder();
|
||||
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
#define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/localpointer.h"
|
||||
|
@ -69,7 +70,7 @@ enum HanOrderValue {
|
|||
HAN_RADICAL_STROKE
|
||||
};
|
||||
|
||||
static UBool beVerbose=FALSE, withCopyright=TRUE;
|
||||
static UBool beVerbose=FALSE, withCopyright=TRUE, icu4xMode=FALSE;
|
||||
|
||||
static HanOrderValue hanOrder = HAN_NO_ORDER;
|
||||
|
||||
|
@ -832,6 +833,11 @@ parseFractionalUCA(const char *filename,
|
|||
int32_t lineNumber = 0;
|
||||
char buffer[30000];
|
||||
|
||||
const Normalizer2* norm = nullptr;
|
||||
if (icu4xMode) {
|
||||
norm = Normalizer2::getNFDInstance(*status);
|
||||
}
|
||||
|
||||
UChar32 maxCodePoint = 0;
|
||||
while(!feof(data)) {
|
||||
if(U_FAILURE(*status)) {
|
||||
|
@ -889,6 +895,24 @@ parseFractionalUCA(const char *filename,
|
|||
// CollationBaseDataBuilder::init() maps them to special CEs.
|
||||
// Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt.
|
||||
if(0xfffd <= c && c <= 0xffff) { continue; }
|
||||
if (icu4xMode) {
|
||||
if (c >= 0xAC00 && c <= 0xD7A3) {
|
||||
// Hangul syllable
|
||||
continue;
|
||||
}
|
||||
if (c >= 0xD800 && c < 0xE000) {
|
||||
// Surrogate
|
||||
continue;
|
||||
}
|
||||
UnicodeString src;
|
||||
UnicodeString dst;
|
||||
src.append(c);
|
||||
norm->normalize(src, dst, *status);
|
||||
if (src != dst) {
|
||||
// c decomposed, skip it
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if(s.length() >= 2 && c == 0xFDD1) {
|
||||
UChar32 c2 = s.char32At(1);
|
||||
int32_t script = getCharScript(c2);
|
||||
|
@ -923,7 +947,6 @@ parseFractionalUCA(const char *filename,
|
|||
(int)lineNumber, filename, line);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
|
||||
builder.add(prefix, s, ces, cesLength, *status);
|
||||
}
|
||||
}
|
||||
|
@ -1126,8 +1149,9 @@ buildAndWriteBaseData(CollationBaseDataBuilder &builder,
|
|||
|
||||
CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion);
|
||||
const char *dataName =
|
||||
hanOrder == HAN_IMPLICIT ? "ucadata-implicithan" :
|
||||
"ucadata-unihan";
|
||||
hanOrder == HAN_IMPLICIT ?
|
||||
(icu4xMode ? "ucadata-implicithan-icu4x" : "ucadata-implicithan") :
|
||||
(icu4xMode ? "ucadata-unihan-icu4x" : "ucadata-unihan");
|
||||
UNewDataMemory *pData=udata_create(path, "icu", dataName, &ucaDataInfo,
|
||||
withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
|
@ -1275,7 +1299,7 @@ parseAndWriteCollationRootData(
|
|||
const char *sourceCodePath,
|
||||
UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
CollationBaseDataBuilder builder(errorCode);
|
||||
CollationBaseDataBuilder builder(icu4xMode, errorCode);
|
||||
builder.init(errorCode);
|
||||
parseFractionalUCA(fracUCAPath, builder, &errorCode);
|
||||
buildAndWriteBaseData(builder, binaryDataPath, errorCode);
|
||||
|
@ -1289,7 +1313,8 @@ enum {
|
|||
HELP_QUESTION_MARK,
|
||||
VERBOSE,
|
||||
COPYRIGHT,
|
||||
HAN_ORDER
|
||||
HAN_ORDER,
|
||||
ICU4X
|
||||
};
|
||||
|
||||
static UOption options[]={
|
||||
|
@ -1297,7 +1322,8 @@ static UOption options[]={
|
|||
UOPTION_HELP_QUESTION_MARK,
|
||||
UOPTION_VERBOSE,
|
||||
UOPTION_COPYRIGHT,
|
||||
UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG)
|
||||
UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG),
|
||||
UOPTION_DEF("icu4x", 'X', UOPT_NO_ARG)
|
||||
};
|
||||
|
||||
extern "C" int
|
||||
|
@ -1348,6 +1374,7 @@ main(int argc, char* argv[]) {
|
|||
|
||||
beVerbose=options[VERBOSE].doesOccur;
|
||||
withCopyright=options[COPYRIGHT].doesOccur;
|
||||
icu4xMode=options[ICU4X].doesOccur;
|
||||
|
||||
IcuToolErrorCode errorCode("genuca");
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue