diff --git a/.ci-builds/.azure-pipelines.yml b/.ci-builds/.azure-pipelines.yml index 811b531365a..70d711504df 100644 --- a/.ci-builds/.azure-pipelines.yml +++ b/.ci-builds/.azure-pipelines.yml @@ -445,7 +445,7 @@ jobs: timeoutInMinutes: 30 pool: vmImage: 'windows-2019' - demands: + demands: - msbuild - visualstudio - Cmd @@ -606,6 +606,8 @@ jobs: cd icu4c/source LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_full/fast --trie-type fast --all LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_full/small --trie-type small --all + LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode norm --copyright --verbose --destdir icuexportdata_uprops_full/fast --trie-type fast --all + LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode norm --copyright --verbose --destdir icuexportdata_uprops_full/small --trie-type small --all displayName: 'Build Unicode property data export file (Full)' # In the sample file, include: # - Basic binary properties: AHex WSpace @@ -619,6 +621,18 @@ jobs: LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_sample/fast --trie-type fast AHex gc nt Basic_Emoji sc WSpace blank LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_sample/small --trie-type small AHex gc nt Basic_Emoji sc WSpace blank displayName: 'Build Unicode property data export file (Sample)' + - script: | + mkdir -p icu4c/source/icuexportdata_uprops_full/collation_unihan + mkdir -p icu4c/source/icuexportdata_uprops_full/collation_implicithan + cd icu4c/source + cd data/coll + FILES=`echo *.txt` + cd - + LD_LIBRARY_PATH=lib ./bin/genrb -X -s data/coll/ -d icuexportdata_uprops_full/collation_unihan --ucadata data/in/coll/ucadata-unihan-icu4x.icu $FILES + LD_LIBRARY_PATH=lib ./bin/genrb -X -s data/coll/ -d 
icuexportdata_uprops_full/collation_implicithan --ucadata data/in/coll/ucadata-implicithan-icu4x.icu $FILES + rm icuexportdata_uprops_full/collation_unihan/*.res + rm icuexportdata_uprops_full/collation_implicithan/*.res + displayName: 'Build collation data export file' - task: PublishBuildArtifacts@1 displayName: 'Publish Artifact: icuexportdata_uprops_full' inputs: diff --git a/icu4c/source/common/udatamem.h b/icu4c/source/common/udatamem.h index a05dd697568..3db2af43aad 100644 --- a/icu4c/source/common/udatamem.h +++ b/icu4c/source/common/udatamem.h @@ -44,7 +44,7 @@ struct UDataMemory { int32_t length; /* Length of the data in bytes; -1 if unknown. */ }; -U_CFUNC UDataMemory *UDataMemory_createNewInstance(UErrorCode *pErr); +U_CAPI UDataMemory* U_EXPORT2 UDataMemory_createNewInstance(UErrorCode *pErr); U_CFUNC void UDatamemory_assign (UDataMemory *dest, UDataMemory *source); U_CFUNC void UDataMemory_init (UDataMemory *This); U_CFUNC UBool UDataMemory_isLoaded(const UDataMemory *This); diff --git a/icu4c/source/common/umapfile.h b/icu4c/source/common/umapfile.h index adc265203dc..042e71374c1 100644 --- a/icu4c/source/common/umapfile.h +++ b/icu4c/source/common/umapfile.h @@ -29,7 +29,7 @@ #include "unicode/udata.h" #include "putilimp.h" -U_CFUNC UBool uprv_mapFile(UDataMemory *pdm, const char *path, UErrorCode *status); +U_CAPI UBool U_EXPORT2 uprv_mapFile(UDataMemory *pdm, const char *path, UErrorCode *status); U_CFUNC void uprv_unmapFile(UDataMemory *pData); /* MAP_NONE: no memory mapping, no file access at all */ diff --git a/icu4c/source/data/in/coll/ucadata-implicithan-icu4x.icu b/icu4c/source/data/in/coll/ucadata-implicithan-icu4x.icu new file mode 100644 index 00000000000..5c1a111424d Binary files /dev/null and b/icu4c/source/data/in/coll/ucadata-implicithan-icu4x.icu differ diff --git a/icu4c/source/data/in/coll/ucadata-unihan-icu4x.icu b/icu4c/source/data/in/coll/ucadata-unihan-icu4x.icu new file mode 100644 index 00000000000..29e5b00e14c Binary files 
/dev/null and b/icu4c/source/data/in/coll/ucadata-unihan-icu4x.icu differ diff --git a/icu4c/source/data/unidata/generate.sh b/icu4c/source/data/unidata/generate.sh index 9782cd14d2e..f13492b2dd6 100755 --- a/icu4c/source/data/unidata/generate.sh +++ b/icu4c/source/data/unidata/generate.sh @@ -44,3 +44,6 @@ bazelisk run //tools/unicode/c/genprops $ICU_SRC/icu4c # We run it twice for different versions of the CLDR root sort order. bazelisk run //tools/unicode/c/genuca -- --hanOrder implicit $ICU_SRC/icu4c bazelisk run //tools/unicode/c/genuca -- --hanOrder radical-stroke $ICU_SRC/icu4c +# Also generate the ICU4X versions +bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder implicit $ICU_SRC/icu4c +bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder radical-stroke $ICU_SRC/icu4c diff --git a/icu4c/source/i18n/collation.h b/icu4c/source/i18n/collation.h index 6a449a3eb69..034cb76a3c2 100644 --- a/icu4c/source/i18n/collation.h +++ b/icu4c/source/i18n/collation.h @@ -221,7 +221,8 @@ public: /** * Points to contraction data. * Bits 31..13: Index into prefix/contraction data. - * Bits 12..11: Unused, 0. + * Bit 12: Unused, 0. + * Bit 11: CONTRACT_HAS_STARTER flag. (Used by ICU4X only.) * Bit 10: CONTRACT_TRAILING_CCC flag. * Bit 9: CONTRACT_NEXT_CCC flag. * Bit 8: CONTRACT_SINGLE_CP_NO_MATCH flag. @@ -298,6 +299,8 @@ public: static const uint32_t CONTRACT_NEXT_CCC = 0x200; /** Set if any contraction suffix ends with lccc!=0. */ static const uint32_t CONTRACT_TRAILING_CCC = 0x400; + /** Set if any contraction suffix contains a starter. (Used by ICU4X only.) */ + static const uint32_t CONTRACT_HAS_STARTER = 0x800; /** For HANGUL_TAG: None of its Jamo CE32s isSpecialCE32(). 
*/ static const uint32_t HANGUL_NO_SPECIAL_JAMO = 0x100; diff --git a/icu4c/source/i18n/collationbuilder.cpp b/icu4c/source/i18n/collationbuilder.cpp index 5d4611b851d..97de83c6ccf 100644 --- a/icu4c/source/i18n/collationbuilder.cpp +++ b/icu4c/source/i18n/collationbuilder.cpp @@ -198,7 +198,7 @@ const int32_t CollationBuilder::HAS_BEFORE2; const int32_t CollationBuilder::HAS_BEFORE3; #endif -CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode) +CollationBuilder::CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode) : nfd(*Normalizer2::getNFDInstance(errorCode)), fcd(*Normalizer2Factory::getFCDInstance(errorCode)), nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), @@ -206,7 +206,8 @@ CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &erro baseData(b->data), rootElements(b->data->rootElements, b->data->rootElementsLength), variableTop(0), - dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE), + dataBuilder(new CollationDataBuilder(icu4xMode, errorCode)), fastLatinEnabled(TRUE), + icu4xMode(icu4xMode), errorReason(NULL), cesLength(0), rootPrimaryIndexes(errorCode), nodes(errorCode) { @@ -225,6 +226,10 @@ CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &erro } } +CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode) + : CollationBuilder(b, FALSE, errorCode) +{} + CollationBuilder::~CollationBuilder() { delete dataBuilder; } @@ -262,15 +267,19 @@ CollationBuilder::parseAndBuild(const UnicodeString &ruleString, if(U_FAILURE(errorCode)) { return NULL; } if(dataBuilder->hasMappings()) { makeTailoredCEs(errorCode); - closeOverComposites(errorCode); + if (!icu4xMode) { + closeOverComposites(errorCode); + } finalizeCEs(errorCode); - // Copy all of ASCII, and Latin-1 letters, into each tailoring. 
- optimizeSet.add(0, 0x7f); - optimizeSet.add(0xc0, 0xff); - // Hangul is decomposed on the fly during collation, - // and the tailoring data is always built with HANGUL_TAG specials. - optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); - dataBuilder->optimize(optimizeSet, errorCode); + if (!icu4xMode) { + // Copy all of ASCII, and Latin-1 letters, into each tailoring. + optimizeSet.add(0, 0x7f); + optimizeSet.add(0xc0, 0xff); + // Hangul is decomposed on the fly during collation, + // and the tailoring data is always built with HANGUL_TAG specials. + optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); + dataBuilder->optimize(optimizeSet, errorCode); + } tailoring->ensureOwnedData(errorCode); if(U_FAILURE(errorCode)) { return NULL; } if(fastLatinEnabled) { dataBuilder->enableFastLatin(); } @@ -743,14 +752,18 @@ CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix, } } uint32_t ce32 = Collation::UNASSIGNED_CE32; - if((prefix != nfdPrefix || str != nfdString) && + if(!icu4xMode && (prefix != nfdPrefix || str != nfdString) && !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) { // Map from the original input to the CEs. // We do this in case the canonical closure is incomplete, // so that it is possible to explicitly provide the missing mappings. 
ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode); } - addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode); + if (!icu4xMode) { + addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode); + } else { + addIfDifferent(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode); + } if(U_FAILURE(errorCode)) { parserErrorReason = "writing collation elements"; return; @@ -1608,7 +1621,7 @@ CEFinalizer::~CEFinalizer() {} void CollationBuilder::finalizeCEs(UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } - LocalPointer newBuilder(new CollationDataBuilder(errorCode), errorCode); + LocalPointer newBuilder(new CollationDataBuilder(icu4xMode, errorCode), errorCode); if(U_FAILURE(errorCode)) { return; } diff --git a/icu4c/source/i18n/collationbuilder.h b/icu4c/source/i18n/collationbuilder.h index 59d3c5d24b0..22e24ddb813 100644 --- a/icu4c/source/i18n/collationbuilder.h +++ b/icu4c/source/i18n/collationbuilder.h @@ -39,6 +39,7 @@ class Normalizer2Impl; class U_I18N_API CollationBuilder : public CollationRuleParser::Sink { public: + CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode); CollationBuilder(const CollationTailoring *base, UErrorCode &errorCode); virtual ~CollationBuilder(); @@ -302,6 +303,7 @@ private: CollationDataBuilder *dataBuilder; UBool fastLatinEnabled; + UBool icu4xMode; UnicodeSet optimizeSet; const char *errorReason; diff --git a/icu4c/source/i18n/collationdatabuilder.cpp b/icu4c/source/i18n/collationdatabuilder.cpp index b10de993c27..e2eb554f7c2 100644 --- a/icu4c/source/i18n/collationdatabuilder.cpp +++ b/icu4c/source/i18n/collationdatabuilder.cpp @@ -283,16 +283,19 @@ DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode & // ------------------------------------------------------------------------- *** -CollationDataBuilder::CollationDataBuilder(UErrorCode &errorCode) +CollationDataBuilder::CollationDataBuilder(UBool icu4xMode, UErrorCode 
&errorCode) : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), base(NULL), baseSettings(NULL), trie(NULL), ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode), modified(FALSE), + icu4xMode(icu4xMode), fastLatinEnabled(FALSE), fastLatinBuilder(NULL), collIter(NULL) { // Reserve the first CE32 for U+0000. - ce32s.addElement(0, errorCode); + if (!icu4xMode) { + ce32s.addElement(0, errorCode); + } conditionalCE32s.setDeleter(uprv_deleteConditionalCE32); } @@ -316,28 +319,32 @@ CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &error base = b; // For a tailoring, the default is to fall back to the base. - trie = utrie2_open(Collation::FALLBACK_CE32, Collation::FFFD_CE32, &errorCode); + // For ICU4X, use the same value for fallback as for the default + // to avoid having to have different blocks for the two. + trie = utrie2_open(Collation::FALLBACK_CE32, icu4xMode ? Collation::FALLBACK_CE32 : Collation::FFFD_CE32, &errorCode); - // Set the Latin-1 letters block so that it is allocated first in the data array, - // to try to improve locality of reference when sorting Latin-1 text. - // Do not use utrie2_setRange32() since that will not actually allocate blocks - // that are filled with the default value. - // ASCII (0..7F) is already preallocated anyway. - for(UChar32 c = 0xc0; c <= 0xff; ++c) { - utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode); + if (!icu4xMode) { + // Set the Latin-1 letters block so that it is allocated first in the data array, + // to try to improve locality of reference when sorting Latin-1 text. + // Do not use utrie2_setRange32() since that will not actually allocate blocks + // that are filled with the default value. + // ASCII (0..7F) is already preallocated anyway. + for(UChar32 c = 0xc0; c <= 0xff; ++c) { + utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode); + } + + // Hangul syllables are not tailorable (except via tailoring Jamos). + // Always set the Hangul tag to help performance. 
+ // Do this here, rather than in buildMappings(), + // so that we see the HANGUL_TAG in various assertions. + uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); + utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode); + + // Copy the set contents but don't copy/clone the set as a whole because + // that would copy the isFrozen state too. + unsafeBackwardSet.addAll(*b->unsafeBackwardSet); } - // Hangul syllables are not tailorable (except via tailoring Jamos). - // Always set the Hangul tag to help performance. - // Do this here, rather than in buildMappings(), - // so that we see the HANGUL_TAG in various assertions. - uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); - utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode); - - // Copy the set contents but don't copy/clone the set as a whole because - // that would copy the isFrozen state too. - unsafeBackwardSet.addAll(*b->unsafeBackwardSet); - if(U_FAILURE(errorCode)) { return; } } @@ -554,6 +561,98 @@ CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString & int32_t cLength = U16_LENGTH(c); uint32_t oldCE32 = utrie2_get32(trie, c); UBool hasContext = !prefix.isEmpty() || s.length() > cLength; + + if (icu4xMode) { + if (base && c >= 0x1100 && c < 0x1200) { + // Omit jamo tailorings. NOTE(review): this branch body is currently empty, so nothing is omitted here yet. + // TODO(https://github.com/unicode-org/icu4x/issues/1941). + } + const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(errorCode); + UnicodeString sInNfd; + nfdNormalizer->normalize(s, sInNfd, errorCode); + if (s != sInNfd) { + // s is not in NFD, so it cannot match in ICU4X, since ICU4X only + // does NFD lookups. + // Now check that we're only rejecting known cases. + if (s.length() == 2) { + char16_t second = s.charAt(1); + if (second == 0x0F73 || second == 0x0F75 || second == 0x0F81) { + // Second is a special decomposing Tibetan vowel sign. 
+ // These also get added in the decomposed form, so ignoring + // this instance is OK. + return; + } + if (c == 0xFDD1 && second == 0xAC00) { + // This strange contraction exists in the root and + // doesn't have a decomposed counterpart there. + // This won't match in ICU4X anyway and is very strange: + // Unassigned Arabic presentation form contracting with + // the very first Hangul syllable. Let's ignore this + // explicitly. + return; + } + } + // Unknown case worth investigating if ever found. + errorCode = U_UNSUPPORTED_ERROR; + return; + } + + if (!prefix.isEmpty()) { + UnicodeString prefixInNfd; + nfdNormalizer->normalize(prefix, prefixInNfd, errorCode); + if (prefix != prefixInNfd) { + errorCode = U_UNSUPPORTED_ERROR; + return; + } + + int32_t count = prefix.countChar32(); + if (count > 2) { + // Prefix too long for ICU4X. + errorCode = U_UNSUPPORTED_ERROR; + return; + } + UChar32 utf32[4]; + int32_t len = prefix.toUTF32(utf32, 4, errorCode); + if (len != count) { + errorCode = U_INVALID_STATE_ERROR; + return; + } + UChar32 c = utf32[0]; + if (u_getCombiningClass(c)) { + // Prefix must start with a starter for ICU4X. + errorCode = U_UNSUPPORTED_ERROR; + return; + } + // XXX: Korean searchjl has jamo in prefix, so commenting out this + // check for now. ICU4X currently ignores non-root jamo tables anyway. + // searchjl was added in + // https://unicode-org.atlassian.net/browse/CLDR-3560 + // Contractions were changed to prefixes in + // https://unicode-org.atlassian.net/browse/CLDR-6546 + // + // if ((c >= 0x1100 && c < 0x1200) || (c >= 0xAC00 && c < 0xD7A4)) { + // errorCode = U_UNSUPPORTED_ERROR; + // return; + // } + if ((len > 1) && !(utf32[1] == 0x3099 || utf32[1] == 0x309A)) { + // Second character in prefix, if present, must be a kana voicing mark for ICU4X. + errorCode = U_UNSUPPORTED_ERROR; + return; + } + } + + if (s.length() > cLength) { + // Check that there's no modern Hangul in contractions. 
+ for (int32_t i = 0; i < s.length(); ++i) { + UChar c = s.charAt(i); + if ((c >= 0x1100 && c < 0x1100 + 19) || (c >= 0x1161 && c < 0x1161 + 21) || (c >= 0x11A7 && c < 0x11A7 + 28) || (c >= 0xAC00 && c < 0xD7A4)) { + errorCode = U_UNSUPPORTED_ERROR; + return; + } + } + } + } + if(oldCE32 == Collation::FALLBACK_CE32) { // First tailoring for c. // If c has contextual base mappings or if we add a contextual mapping, @@ -675,8 +774,11 @@ CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, return encodeOneCEAsCE32(0); } else if(cesLength == 1) { return encodeOneCE(ces[0], errorCode); - } else if(cesLength == 2) { + } else if(cesLength == 2 && !icu4xMode) { // Try to encode two CEs as one CE32. + // Turn this off for ICU4X, because without the canonical closure + // these are so rare that it doesn't make sense to spend a branch + // on checking this tag when using the data. int64_t ce0 = ces[0]; int64_t ce1 = ces[1]; uint32_t p0 = (uint32_t)(ce0 >> 32); @@ -1284,9 +1386,11 @@ CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) setDigitTags(errorCode); setLeadSurrogates(errorCode); - // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG. - ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0); - utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode); + if (!icu4xMode) { + // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG. + ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0); + utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode); + } utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode); if(U_FAILURE(errorCode)) { return; } @@ -1428,6 +1532,20 @@ CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) // The last suffix character has lccc!=0, allowing for discontiguous contractions. 
flags |= Collation::CONTRACT_TRAILING_CCC; } + if (icu4xMode && (flags & Collation::CONTRACT_HAS_STARTER) == 0) { + for (int32_t i = 0; i < suffix.length();) { + UChar32 c = suffix.char32At(i); + if (!u_getCombiningClass(c)) { + flags |= Collation::CONTRACT_HAS_STARTER; + break; + } + if (c > 0xFFFF) { + i += 2; + } else { + ++i; + } + } + } contractionBuilder.add(suffix, (int32_t)cond->ce32, errorCode); if(cond == lastCond) { break; } cond = getConditionalCE32(cond->next); diff --git a/icu4c/source/i18n/collationdatabuilder.h b/icu4c/source/i18n/collationdatabuilder.h index 6ae77772fd5..c272118a577 100644 --- a/icu4c/source/i18n/collationdatabuilder.h +++ b/icu4c/source/i18n/collationdatabuilder.h @@ -60,7 +60,7 @@ public: virtual int64_t modifyCE(int64_t ce) const = 0; }; - CollationDataBuilder(UErrorCode &errorCode); + CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode); virtual ~CollationDataBuilder(); @@ -246,6 +246,7 @@ protected: UnicodeString contexts; UnicodeSet unsafeBackwardSet; UBool modified; + UBool icu4xMode; UBool fastLatinEnabled; CollationFastLatinBuilder *fastLatinBuilder; diff --git a/icu4c/source/i18n/collationroot.cpp b/icu4c/source/i18n/collationroot.cpp index 71753bd6f4b..62bee63c3f4 100644 --- a/icu4c/source/i18n/collationroot.cpp +++ b/icu4c/source/i18n/collationroot.cpp @@ -27,6 +27,7 @@ #include "ucln_in.h" #include "udatamem.h" #include "umutex.h" +#include "umapfile.h" U_NAMESPACE_BEGIN @@ -47,17 +48,46 @@ static UBool U_CALLCONV uprv_collation_root_cleanup() { U_CDECL_END +UDataMemory* +CollationRoot::loadFromFile(const char* ucadataPath, UErrorCode &errorCode) { + UDataMemory dataMemory; + UDataMemory *rDataMem = NULL; + if (U_FAILURE(errorCode)) { + return NULL; + } + if (uprv_mapFile(&dataMemory, ucadataPath, &errorCode)) { + if (dataMemory.pHeader->dataHeader.magic1 == 0xda && + dataMemory.pHeader->dataHeader.magic2 == 0x27 && + CollationDataReader::isAcceptable(NULL, "icu", "ucadata", &dataMemory.pHeader->info)) { + 
rDataMem = UDataMemory_createNewInstance(&errorCode); + if (U_FAILURE(errorCode)) { + return NULL; + } + rDataMem->pHeader = dataMemory.pHeader; + rDataMem->mapAddr = dataMemory.mapAddr; + rDataMem->map = dataMemory.map; + return rDataMem; + } + errorCode = U_INVALID_FORMAT_ERROR; + return NULL; + } + errorCode = U_MISSING_RESOURCE_ERROR; + return NULL; +} + void U_CALLCONV -CollationRoot::load(UErrorCode &errorCode) { +CollationRoot::load(const char* ucadataPath, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } LocalPointer t(new CollationTailoring(NULL)); if(t.isNull() || t->isBogus()) { errorCode = U_MEMORY_ALLOCATION_ERROR; return; } - t->memory = udata_openChoice(U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll", - "icu", "ucadata", - CollationDataReader::isAcceptable, t->version, &errorCode); + t->memory = ucadataPath ? CollationRoot::loadFromFile(ucadataPath, errorCode) : + udata_openChoice(U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll", + "icu", "ucadata", + CollationDataReader::isAcceptable, + t->version, &errorCode); if(U_FAILURE(errorCode)) { return; } const uint8_t *inBytes = static_cast(udata_getMemory(t->memory)); CollationDataReader::read(NULL, inBytes, udata_getLength(t->memory), *t, errorCode); @@ -73,14 +103,14 @@ CollationRoot::load(UErrorCode &errorCode) { const CollationCacheEntry * CollationRoot::getRootCacheEntry(UErrorCode &errorCode) { - umtx_initOnce(initOnce, CollationRoot::load, errorCode); + umtx_initOnce(initOnce, CollationRoot::load, static_cast(NULL), errorCode); if(U_FAILURE(errorCode)) { return NULL; } return rootSingleton; } const CollationTailoring * CollationRoot::getRoot(UErrorCode &errorCode) { - umtx_initOnce(initOnce, CollationRoot::load, errorCode); + umtx_initOnce(initOnce, CollationRoot::load, static_cast(NULL), errorCode); if(U_FAILURE(errorCode)) { return NULL; } return rootSingleton->tailoring; } @@ -99,6 +129,12 @@ CollationRoot::getSettings(UErrorCode &errorCode) { return root->settings; } +void 
+CollationRoot::forceLoadFromFile(const char* ucadataPath, UErrorCode &errorCode) { + umtx_initOnce(initOnce, CollationRoot::load, ucadataPath, errorCode); +} + + U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION diff --git a/icu4c/source/i18n/collationroot.h b/icu4c/source/i18n/collationroot.h index 8cd3046cdf1..b203f612b35 100644 --- a/icu4c/source/i18n/collationroot.h +++ b/icu4c/source/i18n/collationroot.h @@ -15,6 +15,7 @@ #define __COLLATIONROOT_H__ #include "unicode/utypes.h" +#include "unicode/udata.h" #if !UCONFIG_NO_COLLATION @@ -34,9 +35,11 @@ public: static const CollationTailoring *getRoot(UErrorCode &errorCode); static const CollationData *getData(UErrorCode &errorCode); static const CollationSettings *getSettings(UErrorCode &errorCode); + static void U_EXPORT2 forceLoadFromFile(const char* ucadataPath, UErrorCode &errorCode); private: - static void U_CALLCONV load(UErrorCode &errorCode); + static void U_CALLCONV load(const char* ucadataPath, UErrorCode &errorCode); + static UDataMemory* loadFromFile(const char* ucadataPath, UErrorCode &errorCode); }; U_NAMESPACE_END diff --git a/icu4c/source/tools/genrb/genrb.cpp b/icu4c/source/tools/genrb/genrb.cpp index a739b91143c..319484213ee 100644 --- a/icu4c/source/tools/genrb/genrb.cpp +++ b/icu4c/source/tools/genrb/genrb.cpp @@ -33,6 +33,7 @@ #include "filterrb.h" #include "reslist.h" #include "ucmndata.h" /* TODO: for reading the pool bundle */ +#include "collationroot.h" U_NAMESPACE_USE @@ -84,7 +85,9 @@ enum WRITE_POOL_BUNDLE, USE_POOL_BUNDLE, INCLUDE_UNIHAN_COLL, - FILTERDIR + FILTERDIR, + ICU4X_MODE, + UCADATA }; UOption options[]={ @@ -111,6 +114,8 @@ UOption options[]={ UOPTION_DEF("usePoolBundle", '\x01', UOPT_OPTIONAL_ARG),/* 20 */ UOPTION_DEF("includeUnihanColl", '\x01', UOPT_NO_ARG),/* 21 */ /* temporary, don't display in usage info */ UOPTION_DEF("filterDir", '\x01', UOPT_OPTIONAL_ARG), /* 22 */ + UOPTION_DEF("icu4xMode", 'X', UOPT_NO_ARG),/* 23 */ + UOPTION_DEF("ucadata", '\x01', 
UOPT_REQUIRES_ARG),/* 24 */ }; static UBool write_java = FALSE; @@ -152,6 +157,10 @@ main(int argc, fprintf(stderr, "%s: cannot combine --writePoolBundle and --usePoolBundle\n", argv[0]); illegalArg = TRUE; } + if (options[ICU4X_MODE].doesOccur && !options[UCADATA].doesOccur) { + fprintf(stderr, "%s: --icu4xMode requires --ucadata\n", argv[0]); + illegalArg = TRUE; + } if(options[FORMAT_VERSION].doesOccur) { const char *s = options[FORMAT_VERSION].value; if(uprv_strlen(s) != 1 || (s[0] < '1' && '3' < s[0])) { @@ -302,6 +311,15 @@ main(int argc, } } + if (options[UCADATA].doesOccur) { +#if !UCONFIG_NO_COLLATION + CollationRoot::forceLoadFromFile(options[UCADATA].value, status); +#else + fprintf(stderr, "--ucadata was used with UCONFIG_NO_COLLATION\n"); + return status; +#endif + } + initParser(); /*added by Jing*/ @@ -656,7 +674,7 @@ processFile(const char *filename, const char *cp, } /* Parse the data into an SRBRoot */ data.adoptInstead(parse(ucbuf.getAlias(), inputDir, outputDir, filename, - !omitBinaryCollation, options[NO_COLLATION_RULES].doesOccur, &status)); + !omitBinaryCollation, options[NO_COLLATION_RULES].doesOccur, options[ICU4X_MODE].doesOccur, &status)); if (data.isNull() || U_FAILURE(status)) { fprintf(stderr, "couldn't parse the file %s. Error:%s\n", filename, u_errorName(status)); diff --git a/icu4c/source/tools/genrb/parse.cpp b/icu4c/source/tools/genrb/parse.cpp index 7d5ffe1fc78..a66f8ef914a 100644 --- a/icu4c/source/tools/genrb/parse.cpp +++ b/icu4c/source/tools/genrb/parse.cpp @@ -21,6 +21,8 @@ */ // Safer use of UnicodeString. 
+#include +#include #ifndef UNISTR_FROM_CHAR_EXPLICIT # define UNISTR_FROM_CHAR_EXPLICIT explicit #endif @@ -42,6 +44,7 @@ #include "reslist.h" #include "rbt_pars.h" #include "genrb.h" +#include "unicode/normalizer2.h" #include "unicode/stringpiece.h" #include "unicode/unistr.h" #include "unicode/ustring.h" @@ -59,6 +62,7 @@ #include "collationruleparser.h" #include "collationtailoring.h" #include +#include "writesrc.h" /* Number of tokens to read ahead of the current stream position */ #define MAX_LOOKAHEAD 3 @@ -76,6 +80,9 @@ #define OPENSQBRACKET 0x005B #define CLOSESQBRACKET 0x005D +#define ICU4X_DIACRITIC_BASE 0x0300 +#define ICU4X_DIACRITIC_LIMIT 0x034F + using icu::CharString; using icu::LocalMemory; using icu::LocalPointer; @@ -119,6 +126,7 @@ typedef struct { const char *filename; UBool makeBinaryCollation; UBool omitCollationRules; + UBool icu4xMode; } ParseState; typedef struct SResource * @@ -764,7 +772,7 @@ GenrbImporter::getRules( /* Parse the data into an SRBRoot */ LocalPointer data( - parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), FALSE, FALSE, &errorCode)); + parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), FALSE, FALSE, FALSE, &errorCode)); if (U_FAILURE(errorCode)) { return; } @@ -807,6 +815,333 @@ escape(const UChar *s, char *buffer) { } // namespace +static FILE* +openTOML(const char* outputdir, const char* name, const char* collationType, const char* structType, UErrorCode *status) { + CharString baseName; + baseName.append(name, *status); + baseName.append("_", *status); + baseName.append(collationType, *status); + baseName.append("_", *status); + baseName.append(structType, *status); + + CharString outFileName; + if (outputdir && *outputdir) { + outFileName.append(outputdir, *status).ensureEndsWithFileSeparator(*status); + } + outFileName.append(baseName, *status); + outFileName.append(".toml", *status); + if (U_FAILURE(*status)) { + return NULL; + } + + FILE* f = fopen(outFileName.data(), "w"); + if (!f) { + 
*status = U_FILE_ACCESS_ERROR; + return NULL; + } + usrc_writeFileNameGeneratedBy(f, "#", baseName.data(), "genrb -X"); + + return f; +} + +static void +writeCollationMetadataTOML(const char* outputdir, const char* name, const char* collationType, const uint32_t metadataBits, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "meta", status); + if (!f) { + return; + } + // printf("writeCollationMetadataTOML %s %s\n", name, collationType); + fprintf(f, "bits = 0x%X\n", metadataBits); + fclose(f); +} + +static UChar32 +writeCollationDiacriticsTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) { + UChar32 limit = ICU4X_DIACRITIC_LIMIT; + FILE* f = openTOML(outputdir, name, collationType, "dia", status); + if (!f) { + return limit; + } + // printf("writeCollationDiacriticsTOML %s %s\n", name, collationType); + uint16_t secondaries[ICU4X_DIACRITIC_LIMIT-ICU4X_DIACRITIC_BASE]; + for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) { + uint16_t secondary = 0; + uint32_t ce32 = data->getCE32(c); + if (ce32 == icu::Collation::FALLBACK_CE32) { + ce32 = data->base->getCE32(c); + } + if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { + // These never occur in NFD data + } else if (!icu::Collation::isSimpleOrLongCE32(ce32)) { + if (uprv_strcmp(name, "root") == 0) { + printf("UNSUPPORTED DIACRITIC CE32 in root: TAG: %X CE32: %X char: %X\n", icu::Collation::tagFromCE32(ce32), ce32, c); + fclose(f); + *status = U_INTERNAL_PROGRAM_ERROR; + return limit; + } + limit = c; + break; + } else { + uint64_t ce = uint64_t(icu::Collation::ceFromCE32(ce32)); + if ((ce & 0xFFFFFFFF0000FFFF) != uint64_t(icu::Collation::COMMON_TERTIARY_CE)) { + // Not a CE where only the secondary weight differs from the expected + // pattern. 
+ limit = c; + break; + } + secondary = uint16_t(ce >> 16); + } + secondaries[c - ICU4X_DIACRITIC_BASE] = secondary; + + } + usrc_writeArray(f, "secondaries = [\n ", secondaries, 16, limit-ICU4X_DIACRITIC_BASE, " ", "\n]\n"); + fclose(f); + return limit; +} + +static void +writeCollationReorderingTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationSettings* settings, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "reord", status); + if (!f) { + return; + } + // printf("writeCollationReorderingTOML %s %s\n", name, collationType); + fprintf(f, "min_high_no_reorder = 0x%X\n", settings->minHighNoReorder); + usrc_writeArray(f, "reorder_table = [\n ", settings->reorderTable, 8, 256, " ", "\n]\n"); + usrc_writeArray(f, "reorder_ranges = [\n ", settings->reorderRanges, 32, settings->reorderRangesLength, " ", "\n]\n"); + fclose(f); +} + + +static void +writeCollationJamoTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "jamo", status); + if (!f) { + printf("writeCollationJamoTOML FAILED TO OPEN FILE %s %s\n", name, collationType); + return; + } + uint32_t jamo[0x1200-0x1100]; + for (UChar32 c = 0x1100; c < 0x1200; ++c) { + uint32_t ce32 = data->getCE32(c); + if (ce32 == icu::Collation::FALLBACK_CE32) { + ce32 = data->base->getCE32(c); + } + // Can't reject complex CE32s, because search collations have expansions. + // These expansions refer to the tailoring, which foils the reuse of + // these jamo tables. + // XXX Figure out what to do. Perhaps instead of having Latin mini expansions, + // there should be Hangul mini expansions. + // XXX in any case, validate that modern jamo are self-contained. 
+ jamo[c - 0x1100] = ce32; + + } + usrc_writeArray(f, "ce32s = [\n ", jamo, 32, 0x1200-0x1100, " ", "\n]\n"); + fclose(f); +} + +static UBool +convertTrie(const void *context, UChar32 start, UChar32 end, uint32_t value) { + if (start >= 0x1100 && start < 0x1200 && end >= 0x1100 && end < 0x1200) { + // Range entirely in conjoining jamo block. + return TRUE; + } + icu::IcuToolErrorCode status("genrb: convertTrie"); + umutablecptrie_setRange((UMutableCPTrie*)context, start, end, value, status); + return !U_FAILURE(*status); +} + +static void +writeCollationDataTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UBool root, UChar32 diacriticLimit, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "data", status); + if (!f) { + return; + } + // printf("writeCollationDataTOML %s %s\n", name, collationType); + + icu::UnicodeSet tailoringSet; + + if (data->base) { + tailoringSet.addAll(*(data->unsafeBackwardSet)); + tailoringSet.removeAll(*(data->base->unsafeBackwardSet)); + } else { + tailoringSet.addAll(*(data->unsafeBackwardSet)); + } + + // Use the same value for out-of-range and default in the hope of not having to allocate + // different blocks, since ICU4X never does out-of-range queries. + uint32_t trieDefault = root ? icu::Collation::UNASSIGNED_CE32 : icu::Collation::FALLBACK_CE32; + icu::LocalUMutableCPTriePointer builder(umutablecptrie_open(trieDefault, trieDefault, status)); + + utrie2_enum(data->trie, NULL, &convertTrie, builder.getAlias()); + + // If the diacritic table was cut short, copy CE32s between the lowered + // limit and the max limit from the root to the tailoring. As of June 2022, + // no collation in CLDR needs this. + for (UChar32 c = diacriticLimit; c < ICU4X_DIACRITIC_LIMIT; ++c) { + if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { + // These never occur in NFD data. 
+ continue; + } + uint32_t ce32 = data->getCE32(c); + if (ce32 == icu::Collation::FALLBACK_CE32) { + ce32 = data->base->getCE32(c); + umutablecptrie_set(builder.getAlias(), c, ce32, status); + } + } + + // Ensure that the range covered by the diacritic table isn't duplicated + // in the trie. + for (UChar32 c = ICU4X_DIACRITIC_BASE; c < diacriticLimit; ++c) { + if (umutablecptrie_get(builder.getAlias(), c) != trieDefault) { + umutablecptrie_set(builder.getAlias(), c, trieDefault, status); + } + } + + icu::LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + UCPTRIE_TYPE_SMALL, + UCPTRIE_VALUE_BITS_32, + status)); + usrc_writeArray(f, "contexts = [\n ", data->contexts, 16, data->contextsLength, " ", "\n]\n"); + usrc_writeArray(f, "ce32s = [\n ", data->ce32s, 32, data->ce32sLength, " ", "\n]\n"); + usrc_writeArray(f, "ces = [\n ", data->ces, 64, data->cesLength, " ", "\n]\n"); + fprintf(f, "[trie]\n"); + usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); + + fclose(f); +} + +static void +writeCollationSpecialPrimariesTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "prim", status); + if (!f) { + return; + } + // printf("writeCollationSpecialPrimariesTOML %s %s\n", name, collationType); + + uint16_t lastPrimaries[4]; + for (int32_t i = 0; i < 4; ++i) { + // getLastPrimaryForGroup subtracts one from a 16-bit value, so we add one + // back to get a value that fits in 16 bits. 
+ lastPrimaries[i] = (uint16_t)((data->getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) + 1) >> 16); + } + + uint32_t numericPrimary = data->numericPrimary; + if (numericPrimary & 0xFFFFFF) { + printf("Lower 24 bits set in numeric primary"); + *status = U_INTERNAL_PROGRAM_ERROR; + return; + } + + usrc_writeArray(f, "last_primaries = [\n ", lastPrimaries, 16, 4, " ", "\n]\n"); + fprintf(f, "numeric_primary = 0x%X\n", numericPrimary >> 24); + fclose(f); +} + +static void +writeCollationTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, const icu::CollationSettings* settings, UErrorCode *status) { + UBool tailored = FALSE; + UBool tailoredDiacritics = FALSE; + UBool lithuanianDotAbove = (uprv_strcmp(name, "lt") == 0); + UBool reordering = FALSE; + UBool isRoot = uprv_strcmp(name, "root") == 0; + UChar32 diacriticLimit = ICU4X_DIACRITIC_LIMIT; + if (!data->base && isRoot) { + diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status); + if (U_FAILURE(*status)) { + return; + } + writeCollationJamoTOML(outputdir, name, collationType, data, status); + if (U_FAILURE(*status)) { + return; + } + writeCollationSpecialPrimariesTOML(outputdir, name, collationType, data, status); + if (U_FAILURE(*status)) { + return; + } + } else if (data->base && !lithuanianDotAbove) { + for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) { + if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { + // These never occur in NFD data. + continue; + } + uint32_t ce32 = data->getCE32(c); + if ((ce32 != icu::Collation::FALLBACK_CE32) && (ce32 != data->base->getCE32(c))) { + tailoredDiacritics = TRUE; + diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status); + if (U_FAILURE(*status)) { + return; + } + break; + } + } + } + + if (settings->hasReordering()) { + reordering = TRUE; + // Note: There are duplicate reorderings. 
Expecting the ICU4X provider + // to take care of deduplication. + writeCollationReorderingTOML(outputdir, name, collationType, settings, status); + if (U_FAILURE(*status)) { + return; + } + } + + // Write collation data if either base is non-null or the name is root. + // Languages that only reorder scripts are otherwise root-like and have + // null base. + if (data->base || isRoot) { + tailored = !isRoot; + writeCollationDataTOML(outputdir, name, collationType, data, (!data->base && isRoot), diacriticLimit, status); + if (U_FAILURE(*status)) { + return; + } + } + + uint32_t maxVariable = (uint32_t)settings->getMaxVariable(); + if (maxVariable >= 4) { + printf("Max variable out of range"); + *status = U_INTERNAL_PROGRAM_ERROR; + return; + } + + uint32_t metadataBits = maxVariable; + if (tailored) { + metadataBits |= (1 << 3); + } + if (tailoredDiacritics) { + metadataBits |= (1 << 4); + } + if (reordering) { + metadataBits |= (1 << 5); + } + if (lithuanianDotAbove) { + metadataBits |= (1 << 6); + } + if ((settings->options & icu::CollationSettings::BACKWARD_SECONDARY) != 0) { + metadataBits |= (1 << 7); + } + if (settings->getAlternateHandling() == UCOL_SHIFTED) { + metadataBits |= (1 << 8); + } + switch (settings->getCaseFirst()) { + case UCOL_OFF: + break; + case UCOL_UPPER_FIRST: + metadataBits |= (1 << 9); + metadataBits |= (1 << 10); + break; + case UCOL_LOWER_FIRST: + metadataBits |= (1 << 9); + break; + default: + *status = U_INTERNAL_PROGRAM_ERROR; + return; + } + + writeCollationMetadataTOML(outputdir, name, collationType, metadataBits, status); +} + #endif // !UCONFIG_NO_COLLATION static TableResource * @@ -952,9 +1287,9 @@ addCollation(ParseState* state, TableResource *result, const char *collationTyp res_close(result); return NULL; // TODO: use LocalUResourceBundlePointer for result } - icu::CollationBuilder builder(base, intStatus); - if(uprv_strncmp(collationType, "search", 6) == 0) { - builder.disableFastLatin(); // build fast-Latin table unless 
search collator + icu::CollationBuilder builder(base, state->icu4xMode, intStatus); + if(state->icu4xMode || (uprv_strncmp(collationType, "search", 6) == 0)) { + builder.disableFastLatin(); // build fast-Latin table unless search collator or ICU4X } LocalPointer t( builder.parseAndBuild(rules, version, &importer, &parseError, intStatus)); @@ -977,6 +1312,19 @@ addCollation(ParseState* state, TableResource *result, const char *collationTyp return NULL; } } + if (state->icu4xMode) { + char *nameWithoutSuffix = static_cast(uprv_malloc(uprv_strlen(state->filename) + 1)); + if (nameWithoutSuffix == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + res_close(result); + return NULL; + } + uprv_strcpy(nameWithoutSuffix, state->filename); + *uprv_strrchr(nameWithoutSuffix, '.') = 0; + + writeCollationTOML(state->outputdir, nameWithoutSuffix, collationType, t->data, t->settings, status); + uprv_free(nameWithoutSuffix); + } icu::LocalMemory buffer; int32_t capacity = 100000; uint8_t *dest = buffer.allocateInsteadAndCopy(capacity); @@ -1966,7 +2314,7 @@ parseResource(ParseState* state, char *tag, const struct UString *comment, UErro /* parse the top-level resource */ struct SRBRoot * parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *filename, - UBool makeBinaryCollation, UBool omitCollationRules, UErrorCode *status) + UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status) { struct UString *tokenValue; struct UString comment; @@ -1992,6 +2340,7 @@ parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *fi state.filename = filename; state.makeBinaryCollation = makeBinaryCollation; state.omitCollationRules = omitCollationRules; + state.icu4xMode = icu4xMode; ustr_init(&comment); expect(&state, TOK_STRING, &tokenValue, &comment, NULL, status); diff --git a/icu4c/source/tools/genrb/parse.h b/icu4c/source/tools/genrb/parse.h index bcd8e798f9a..fa90ede9d20 100644 --- a/icu4c/source/tools/genrb/parse.h 
+++ b/icu4c/source/tools/genrb/parse.h @@ -31,7 +31,7 @@ void initParser(); /* Parse a ResourceBundle text file */ struct SRBRoot* parse(UCHARBUF *buf, const char* inputDir, const char* outputDir, const char *filename, - UBool makeBinaryCollation, UBool omitCollationRules, UErrorCode *status); + UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status); U_CDECL_END diff --git a/icu4c/source/tools/icuexportdata/icuexportdata.cpp b/icu4c/source/tools/icuexportdata/icuexportdata.cpp index 7431ac74ab8..20ec0324738 100644 --- a/icu4c/source/tools/icuexportdata/icuexportdata.cpp +++ b/icu4c/source/tools/icuexportdata/icuexportdata.cpp @@ -1,7 +1,15 @@ // © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html +#include +#include +#include #include +#include +#include +#include +#include +#include #include #include #include "toolutil.h" @@ -15,7 +23,10 @@ #include "unicode/uscript.h" #include "unicode/putil.h" #include "unicode/umutablecptrie.h" +#include "unicode/ucharstriebuilder.h" #include "ucase.h" +#include "unicode/normalizer2.h" +#include "normalizer2impl.h" #include "writesrc.h" U_NAMESPACE_USE @@ -299,6 +310,470 @@ FILE* prepareOutputFile(const char* basename) { return f; } +#if !UCONFIG_NO_NORMALIZATION + +struct PendingDescriptor { + UChar32 scalar; + uint32_t descriptor; + UBool supplementary; +}; + +void writeCanonicalCompositions(USet* backwardCombiningStarters) { + IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions"); + const char* basename = "compositions"; + FILE* f = prepareOutputFile(basename); + + LocalPointer backwardBuilder(new UCharsTrieBuilder(status), status); + + const int32_t DECOMPOSITION_BUFFER_SIZE = 20; + UChar32 utf32[DECOMPOSITION_BUFFER_SIZE]; + + const Normalizer2* nfc = Normalizer2::getNFCInstance(status); + for (UChar32 c = 0; c <= 0x10FFFF; ++c) { + if (c >= 0xD800 && c < 0xE000) { + // Surrogate + continue; + } + UnicodeString 
decomposition; + if (!nfc->getRawDecomposition(c, decomposition)) { + continue; + } + int32_t len = decomposition.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); + if (len != 2) { + continue; + } + UChar32 starter = utf32[0]; + UChar32 second = utf32[1]; + UChar32 composite = nfc->composePair(starter, second); + if (composite < 0) { + continue; + } + if (c != composite) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (!u_getCombiningClass(second)) { + uset_add(backwardCombiningStarters, second); + } + if (composite >= 0xAC00 && composite <= 0xD7A3) { + // Hangul syllable + continue; + } + + UnicodeString backward; + backward.append(second); + backward.append(starter); + backwardBuilder->add(backward, int32_t(composite), status); + } + UnicodeString canonicalCompositionTrie; + backwardBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, canonicalCompositionTrie, status); + + usrc_writeArray(f, "compositions = [\n ", canonicalCompositionTrie.getBuffer(), 16, canonicalCompositionTrie.length(), " ", "\n]\n"); + fclose(f); + handleError(status, basename); +} + +void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) { + FILE* f = prepareOutputFile(basename); + usrc_writeArray(f, "scalars16 = [\n ", ptr16, 16, len16, " ", "\n]\n"); + usrc_writeArray(f, "scalars32 = [\n ", ptr32, 32, len32, " ", "\n]\n"); + fclose(f); +} + +void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector& pendingTrieInsertions) { + IcuToolErrorCode status("icuexportdata: writeDecompositionData"); + FILE* f = prepareOutputFile(basename); + + // Zero is a magic number that means the character decomposes to itself. 
+ LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); + + // Iterate backwards to insert lower code points in the trie first in case it matters + // for trie block allocation. + for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) { + const PendingDescriptor& pending = pendingTrieInsertions[i]; + uint32_t additional = 0; + if (!(pending.descriptor & 0xFFFF0000)) { + uint32_t offset = pending.descriptor & 0xFFF; + if (!pending.supplementary) { + if (offset >= baseSize16) { + // This is an offset to supplementary 16-bit data. We have + // 16-bit base data and 32-bit base data before. However, + // the 16-bit base data length is already part of offset. + additional = baseSize32; + } + } else { + if (offset >= baseSize32) { + // This is an offset to supplementary 32-bit data. We have 16-bit + // base data, 32-bit base data, and 16-bit supplementary data before. + // However, the 32-bit base data length is already part + // of offset. + additional = baseSize16 + supplementSize16; + } else { + // This is an offset to 32-bit base data. We have 16-bit + // base data before. + additional = baseSize16; + } + } + if (offset + additional > 0xFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + umutablecptrie_set(builder.getAlias(), pending.scalar, pending.descriptor + additional, status); + } + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + trieType, + UCPTRIE_VALUE_BITS_32, + status)); + handleError(status, basename); + + if (!reference) { + usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML); + } else { + if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) { + // NFD expectations don't hold. The set must not contain the half-width + // kana voicing marks and must contain iota subscript.
+ status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + + USet* halfWidthVoicing = uset_openEmpty(); + uset_add(halfWidthVoicing, 0xFF9E); + uset_add(halfWidthVoicing, 0xFF9F); + + USet* iotaSubscript = uset_openEmpty(); + uset_add(iotaSubscript, 0x0345); + + uint8_t flags = 0; + + USet* halfWidthCheck = uset_cloneAsThawed(uset); + uset_removeAll(halfWidthCheck, reference); + if (uset_equals(halfWidthCheck, halfWidthVoicing)) { + flags |= 1; + } else if (!uset_isEmpty(halfWidthCheck)) { + // The result was neither empty nor contained exactly + // the two half-width voicing marks. The ICU4X + // normalizer doesn't know how to deal with this case. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + uset_close(halfWidthCheck); + + USet* iotaCheck = uset_cloneAsThawed(reference); + uset_removeAll(iotaCheck, uset); + if (uset_equals(iotaCheck, iotaSubscript)) { + flags |= (1 << 1); + } else if (!uset_isEmpty(iotaCheck)) { + // The result was neither empty nor contained exactly + // the iota subscript. The ICU4X normalizer doesn't + // know how to deal with this case. 
+ status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + uset_close(iotaCheck); + + uset_close(iotaSubscript); + uset_close(halfWidthVoicing); + + fprintf(f, "flags = 0x%X\n", flags); + } + fprintf(f, "[trie]\n"); + usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); + fclose(f); + handleError(status, basename); +} + +void writePotentialCompositionPassThrough(const char* basename, const Normalizer2* norm, const USet* decompositionStartsWithNonStarter, const USet* decompositionStartsWithBackwardCombiningStarter, USet* potentialPassthroughAndNotBackwardCombining) { + IcuToolErrorCode status("icuexportdata: writePotentialCompositionPassThrough"); + FILE* f = prepareOutputFile(basename); + + const Normalizer2* nfc = nullptr; + if (!norm) { + // UTS 46 case + norm = Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, status); + nfc = Normalizer2::getNFCInstance(status); + } + for (UChar32 c = 0; c <= 0x10FFFF; ++c) { + if (c >= 0xD800 && c < 0xE000) { + // Surrogate + continue; + } + if (uset_contains(decompositionStartsWithNonStarter, c) || uset_contains(decompositionStartsWithBackwardCombiningStarter, c)) { + continue; + } + UnicodeString src; + UnicodeString dst; + src.append(c); + norm->normalize(src, dst, status); + if (nfc && (dst.isEmpty() || (dst == u"\uFFFD" && c != 0xFFFD))) { + // UTS 46 ignored and disallowed fall back to NFC for data + // overlap. + dst.truncate(0); + nfc->normalize(src, dst, status); + } + if (src == dst) { + uset_add(potentialPassthroughAndNotBackwardCombining, c); + } + } + + // The surrogate range forms a useless discontinuity. The code + // that reads from the set never looks up by surrogate, so let's + // put the surrogate range in the set as a micro-optimization.
+ uset_addRange(potentialPassthroughAndNotBackwardCombining, 0xD800, 0xDFFF); + + usrc_writeUnicodeSet(f, potentialPassthroughAndNotBackwardCombining, UPRV_TARGET_SYNTAX_TOML); + fclose(f); + handleError(status, basename); +} + +// Computes data for canonical decompositions +void computeDecompositions(const char* basename, const USet* backwardCombiningStarters, std::vector& storage16, std::vector& storage32, USet* decompositionStartsWithNonStarter, USet* decompositionStartsWithBackwardCombiningStarter, std::vector& pendingTrieInsertions) { + IcuToolErrorCode status("icuexportdata: computeDecompositions"); + const Normalizer2* mainNormalizer; + const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status); + if (uprv_strcmp(basename, "nfkd") == 0) { + mainNormalizer = Normalizer2::getNFKDInstance(status); + } else if (uprv_strcmp(basename, "uts46d") == 0) { + mainNormalizer = Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, status); + } else { + mainNormalizer = nfdNormalizer; + } + + // Max length as of Unicode 14 is 4 for NFD. For NFKD the max + // is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB). + const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9; + const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8; + const int32_t DECOMPOSITION_BUFFER_SIZE = 20; + UChar32 utf32[DECOMPOSITION_BUFFER_SIZE]; + + // Iterate over all scalar values excluding Hangul syllables. + // + // We go backwards in order to better find overlapping decompositions. + // + // As of Unicode 14: + // Iterate forward without overlap search: + // nfd: 16 size: 896, 32 size: 173 + // nfkd: 16 size: 3854, 32 size: 179 + // + // Iterate forward with overlap search: + // nfd: 16 size: 888, 32 size: 173 + // nfkd: 16 size: 3266, 32 size: 179 + // + // Iterate backward with overlap search: + // nfd: 16 size: 776, 32 size: 173 + // nfkd: 16 size: 2941, 32 size: 179 + // + // UChar32 is signed! 
+ for (UChar32 c = 0x10FFFF; c >= 0; --c) { + if (c >= 0xAC00 && c <= 0xD7A3) { + // Hangul syllable + continue; + } + if (c >= 0xD800 && c < 0xE000) { + // Surrogate + continue; + } + UnicodeString src; + UnicodeString dst; + src.append(c); + if (mainNormalizer != nfdNormalizer) { + UnicodeString inter; + mainNormalizer->normalize(src, inter, status); + nfdNormalizer->normalize(inter, dst, status); + } else { + nfdNormalizer->normalize(src, dst, status); + } + int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); + if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { + // Characters that normalize to nothing or to U+FFFD (without the + // input being U+FFFD) in ICU4C's UTS 46 normalization normalize + // as in NFD in ICU4X's UTS 46 normalization in the interest + // of data size and ICU4X's normalizer being unable to handle + // normalizing to nothing. + // When UTS 46 is implemented on top of ICU4X, a preprocessing + // step is supposed to remove these characters before the + // normalization step.
+ if (uprv_strcmp(basename, "uts46d") != 0) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + nfdNormalizer->normalize(src, dst, status); + len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); + if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + if (len > DECOMPOSITION_BUFFER_SIZE) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + bool startsWithNonStarter = u_getCombiningClass(utf32[0]); + if (startsWithNonStarter) { + uset_add(decompositionStartsWithNonStarter, c); + } else if (uset_contains(backwardCombiningStarters, c)) { + uset_add(decompositionStartsWithBackwardCombiningStarter, c); + } + if (mainNormalizer != nfdNormalizer) { + UnicodeString nfd; + nfdNormalizer->normalize(src, nfd, status); + if (dst == nfd) { + continue; + } + } else { + if (src == dst) { + continue; + } + } + if (startsWithNonStarter && !(c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F)) { + // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (len == 1 && utf32[0] <= 0xFFFF) { + if (utf32[0] == 1) { + // 1 is reserved as a marker for the expansion of U+FDFA. 
+ status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, FALSE}); + } else if (len == 2 && utf32[0] <= 0xFFFF && utf32[1] <= 0xFFFF && !u_getCombiningClass(utf32[0]) && u_getCombiningClass(utf32[1])) { + pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), FALSE}); + } else { + UBool supplementary = FALSE; + UBool nonInitialStarter = FALSE; + for (int32_t i = 0; i < len; ++i) { + if (utf32[i] > 0xFFFF) { + supplementary = TRUE; + } + if (utf32[i] == 0) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (i != 0 && !u_getCombiningClass(utf32[i])) { + nonInitialStarter = TRUE; + } + } + if (!supplementary) { + if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) { + if (len == 18 && c == 0xFDFA) { + // Special marker for the one character whose decomposition + // is too long. + pendingTrieInsertions.push_back({c, 1 << 16, supplementary}); + continue; + } else { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + } else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + // Complex decomposition + // Format for 16-bit value: + // 15..13: length minus two for 16-bit case and length minus one for + // the 32-bit case. Length 8 needs to fit in three bits in + // the 16-bit case, and this way the value is future-proofed + // up to 9 in the 16-bit case. Zero is unused and length one + // in the 16-bit case goes directly into the trie. + // 12: 1 if all trailing characters are guaranteed non-starters, + // 0 if no guarantees about non-starterness. + // Note: The bit choice is this way around to allow for + // dynamically falling back to not having this but instead + // having one more bit for length by merely choosing + // different masks. + // 11..0: Start offset in storage. 
The offset is to the logical + // sequence of scalars16, scalars32, supplementary_scalars16, + // supplementary_scalars32. + uint32_t descriptor = uint32_t(!nonInitialStarter) << 12; + if (!supplementary) { + descriptor |= (uint32_t(len) - 2) << 13; + } else { + descriptor |= (uint32_t(len) - 1) << 13; + } + if (descriptor & 0xFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + size_t index = 0; + bool writeToStorage = FALSE; + // Sadly, C++ lacks break and continue by label, so using goto in the + // inner loops to break or continue the outer loop. + if (!supplementary) { + outer16: for (;;) { + if (index == storage16.size()) { + writeToStorage = TRUE; + break; + } + if (storage16[index] == utf32[0]) { + for (int32_t i = 1; i < len; ++i) { + if (index + i >= storage16.size() || storage16[index + i] != uint32_t(utf32[i])) { // running past the end of storage counts as a mismatch + ++index; + // continue outer + goto outer16; + } + } + // break outer + goto after; + } + ++index; + } + } else { + outer32: for (;;) { + if (index == storage32.size()) { + writeToStorage = TRUE; + break; + } + if (storage32[index] == uint32_t(utf32[0])) { + for (int32_t i = 1; i < len; ++i) { + if (index + i >= storage32.size() || storage32[index + i] != uint32_t(utf32[i])) { // running past the end of storage counts as a mismatch + ++index; + // continue outer + goto outer32; + } + } + // break outer + goto after; + } + ++index; + } + } + after: + if (index > 0xFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + descriptor |= uint32_t(index); + if (!descriptor || descriptor > 0xFFFF) { + // > 0xFFFF should never happen if the code above is correct. + // == 0 should not happen due to the nature of the data.
+ status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (writeToStorage) { + if (!supplementary) { + for (int32_t i = 0; i < len; ++i) { + storage16.push_back(uint16_t(utf32[i])); + } + } else { + for (int32_t i = 0; i < len; ++i) { + storage32.push_back(uint32_t(utf32[i])); + } + } + } + pendingTrieInsertions.push_back({c, descriptor, supplementary}); + } + } + if (storage16.size() + storage32.size() > 0xFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + } + handleError(status, basename); +} + +#endif // !UCONFIG_NO_NORMALIZATION + enum { OPT_HELP_H, OPT_HELP_QUESTION_MARK, @@ -341,7 +816,7 @@ void printHelp(FILE* stdfile, const char* program) { "options:\n" "\t-h or -? or --help this usage text\n" "\t-V or --version show a version message\n" - "\t-m or --mode mode: currently only 'uprops' and 'ucase', but more may be added\n" + "\t-m or --mode mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n" "\t --trie-type set the trie type (small or fast, default small)\n" "\t-d or --destdir destination directory, followed by the path\n" "\t --all write out all properties known to icuexportdata\n" @@ -387,6 +862,46 @@ int exportUprops(int argc, char* argv[]) { } } + if (propNames.empty() + || options[OPT_HELP_H].doesOccur + || options[OPT_HELP_QUESTION_MARK].doesOccur + || !options[OPT_MODE].doesOccur) { + FILE *stdfile=argc<0 ? stderr : stdout; + fprintf(stdfile, + "usage: %s -m uprops [-options] [--all | properties...]\n" + "\tdump Unicode property data to .toml files\n" + "options:\n" + "\t-h or -? 
or --help this usage text\n" + "\t-V or --version show a version message\n" + "\t-m or --mode mode: currently only 'uprops', but more may be added\n" + "\t --trie-type set the trie type (small or fast, default small)\n" + "\t-d or --destdir destination directory, followed by the path\n" + "\t --all write out all properties known to icuexportdata\n" + "\t --index write an _index.toml summarizing all data exported\n" + "\t-c or --copyright include a copyright notice\n" + "\t-v or --verbose Turn on verbose output\n" + "\t-q or --quiet do not display warnings and progress\n", + argv[0]); + return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + const char* mode = options[OPT_MODE].value; + if (uprv_strcmp(mode, "uprops") != 0) { + fprintf(stderr, "Invalid option for --mode (must be uprops)\n"); + return U_ILLEGAL_ARGUMENT_ERROR; + } + + if (options[OPT_TRIE_TYPE].doesOccur) { + if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) { + trieType = UCPTRIE_TYPE_FAST; + } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) { + trieType = UCPTRIE_TYPE_SMALL; + } else { + fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n"); + return U_ILLEGAL_ARGUMENT_ERROR; + } + } + for (const char* propName : propNames) { UProperty propEnum = u_getPropertyEnum(propName); if (propEnum == UCHAR_INVALID_CODE) { @@ -505,6 +1020,81 @@ int exportCase(int argc, char* argv[]) { return 0; } +#if !UCONFIG_NO_NORMALIZATION + +int exportNorm() { + IcuToolErrorCode status("icuexportdata: exportNorm"); + USet* backwardCombiningStarters = uset_openEmpty(); + writeCanonicalCompositions(backwardCombiningStarters); + + std::vector storage16; + std::vector storage32; + + USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty(); + USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); + std::vector nfdPendingTrieInsertions; + computeDecompositions("nfd", backwardCombiningStarters, storage16, storage32, 
nfdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithBackwardCombiningStarter, nfdPendingTrieInsertions); + + uint32_t baseSize16 = storage16.size(); + uint32_t baseSize32 = storage32.size(); + + USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty(); + USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); + std::vector nfkdPendingTrieInsertions; + computeDecompositions("nfkd", backwardCombiningStarters, storage16, storage32, nfkdDecompositionStartsWithNonStarter, nfkdDecompositionStartsWithBackwardCombiningStarter, nfkdPendingTrieInsertions); + + USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty(); + USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); + std::vector uts46PendingTrieInsertions; + computeDecompositions("uts46d", backwardCombiningStarters, storage16, storage32, uts46DecompositionStartsWithNonStarter, uts46DecompositionStartsWithBackwardCombiningStarter, uts46PendingTrieInsertions); + + uint32_t supplementSize16 = storage16.size() - baseSize16; + uint32_t supplementSize32 = storage32.size() - baseSize32; + + writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions); + writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions); + writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions); + + writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32); + writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32); + + USet* nfcPotentialPassthroughAndNotBackwardCombining = uset_openEmpty(); + const Normalizer2* nfc = 
Normalizer2::getNFCInstance(status); + writePotentialCompositionPassThrough("nfc", nfc, nfdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithBackwardCombiningStarter, nfcPotentialPassthroughAndNotBackwardCombining); + + USet* nfkcPotentialPassthroughAndNotBackwardCombining = uset_openEmpty(); + const Normalizer2* nfkc = Normalizer2::getNFKCInstance(status); + writePotentialCompositionPassThrough("nfkc", nfkc, nfkdDecompositionStartsWithNonStarter, nfkdDecompositionStartsWithBackwardCombiningStarter, nfkcPotentialPassthroughAndNotBackwardCombining); + + USet* uts46PotentialPassthroughAndNotBackwardCombining = uset_openEmpty(); + writePotentialCompositionPassThrough("uts46", nullptr, uts46DecompositionStartsWithNonStarter, uts46DecompositionStartsWithBackwardCombiningStarter, uts46PotentialPassthroughAndNotBackwardCombining); + + // Check that NFKC set has no characters that NFC doesn't also have. + uset_removeAll(nfkcPotentialPassthroughAndNotBackwardCombining, nfcPotentialPassthroughAndNotBackwardCombining); + if (!uset_isEmpty(nfkcPotentialPassthroughAndNotBackwardCombining)) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + + uset_close(nfcPotentialPassthroughAndNotBackwardCombining); + uset_close(nfkcPotentialPassthroughAndNotBackwardCombining); + uset_close(uts46PotentialPassthroughAndNotBackwardCombining); + + uset_close(nfdDecompositionStartsWithNonStarter); + uset_close(nfkdDecompositionStartsWithNonStarter); + uset_close(uts46DecompositionStartsWithNonStarter); + + uset_close(nfdDecompositionStartsWithBackwardCombiningStarter); + uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter); + uset_close(uts46DecompositionStartsWithBackwardCombiningStarter); + + uset_close(backwardCombiningStarters); + handleError(status, "exportNorm"); + return 0; +} + +#endif // !UCONFIG_NO_NORMALIZATION + int main(int argc, char* argv[]) { U_MAIN_INIT_ARGS(argc, argv); @@ -553,12 +1143,20 @@ int main(int argc, char* argv[]) 
{ } const char* mode = options[OPT_MODE].value; + if (uprv_strcmp(mode, "norm") == 0) { +#if !UCONFIG_NO_NORMALIZATION + return exportNorm(); +#else + fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n"); + return U_ILLEGAL_ARGUMENT_ERROR; +#endif + } if (uprv_strcmp(mode, "uprops") == 0) { return exportUprops(argc, argv); } else if (uprv_strcmp(mode, "ucase") == 0) { return exportCase(argc, argv); } - fprintf(stderr, "Invalid option for --mode (must be uprops or ucase)\n"); + fprintf(stderr, "Invalid option for --mode (must be uprops, ucase, or norm)\n"); return U_ILLEGAL_ARGUMENT_ERROR; } diff --git a/icu4c/source/tools/toolutil/writesrc.cpp b/icu4c/source/tools/toolutil/writesrc.cpp index 143254a7f34..4e8989a02c2 100644 --- a/icu4c/source/tools/toolutil/writesrc.cpp +++ b/icu4c/source/tools/toolutil/writesrc.cpp @@ -19,6 +19,7 @@ */ #include +#include #include #include "unicode/utypes.h" #include "unicode/putil.h" @@ -143,12 +144,14 @@ usrc_writeArray(FILE *f, const uint8_t *p8; const uint16_t *p16; const uint32_t *p32; - uint32_t value; + const int64_t *p64; // Signed due to TOML! + int64_t value; // Signed due to TOML! int32_t i, col; p8=NULL; p16=NULL; p32=NULL; + p64=NULL; switch(width) { case 8: p8=(const uint8_t *)p; @@ -159,6 +162,9 @@ usrc_writeArray(FILE *f, case 32: p32=(const uint32_t *)p; break; + case 64: + p64=(const int64_t *)p; + break; default: fprintf(stderr, "usrc_writeArray(width=%ld) unrecognized width\n", (long)width); return; @@ -186,11 +192,14 @@ usrc_writeArray(FILE *f, case 32: value=p32[i]; break; + case 64: + value=p64[i]; + break; default: value=0; /* unreachable */ break; } - fprintf(f, value<=9 ? "%lu" : "0x%lx", (unsigned long)value); + fprintf(f, value<=9 ? 
"%" PRId64 : "0x%" PRIx64, value); } if(postfix!=NULL) { fputs(postfix, f); diff --git a/icu4c/source/tools/toolutil/writesrc.h b/icu4c/source/tools/toolutil/writesrc.h index 784a9b9c7a7..9c0be5a1007 100644 --- a/icu4c/source/tools/toolutil/writesrc.h +++ b/icu4c/source/tools/toolutil/writesrc.h @@ -69,7 +69,7 @@ usrc_writeFileNameGeneratedBy( const char *generator); /** - * Writes the contents of an array of 8/16/32-bit words. + * Writes the contents of an array of 8/16/32/64-bit words. * The prefix and postfix are optional (can be NULL) and are written first/last. * The prefix may contain a %ld or similar field for the array length. * The {} and declaration etc. need to be included in prefix/postfix or diff --git a/tools/unicode/c/genuca/collationbasedatabuilder.cpp b/tools/unicode/c/genuca/collationbasedatabuilder.cpp index c92ebd9aadc..b20d0310e89 100644 --- a/tools/unicode/c/genuca/collationbasedatabuilder.cpp +++ b/tools/unicode/c/genuca/collationbasedatabuilder.cpp @@ -83,14 +83,15 @@ binarySearch(const UVector64 &list, int64_t ce) { } // namespace -CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode) - : CollationDataBuilder(errorCode), +CollationBaseDataBuilder::CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode) + : CollationDataBuilder(icu4xMode, errorCode), numericPrimary(0x12000000), firstHanPrimary(0), lastHanPrimary(0), hanStep(2), rootElements(errorCode), scriptStartsLength(1) { uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex)); uprv_memset(scriptStarts, 0, sizeof(scriptStarts)); + this->icu4xMode = icu4xMode; } CollationBaseDataBuilder::~CollationBaseDataBuilder() { @@ -119,7 +120,9 @@ CollationBaseDataBuilder::init(UErrorCode &errorCode) { trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode); // Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches. 
- for(UChar32 c = 0; c < 0x180; ++c) { + // In the ICU4X case, only preallocate ASCII, because we don't store CE32s for + // precomposed characters. + for(UChar32 c = 0; c < (icu4xMode ? 0x80 : 0x180); ++c) { utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode); } @@ -128,8 +131,10 @@ CollationBaseDataBuilder::init(UErrorCode &errorCode) { // Some code assumes that the root first primary CE is the "space first primary" // from FractionalUCA.txt. - uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); - utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode); + if (!icu4xMode) { + uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); + utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode); + } // Add a mapping for the first-unassigned boundary, // which is the AlphabeticIndex overflow boundary. diff --git a/tools/unicode/c/genuca/collationbasedatabuilder.h b/tools/unicode/c/genuca/collationbasedatabuilder.h index c2c9564bae9..6d57b9c43f0 100644 --- a/tools/unicode/c/genuca/collationbasedatabuilder.h +++ b/tools/unicode/c/genuca/collationbasedatabuilder.h @@ -37,7 +37,7 @@ U_NAMESPACE_BEGIN */ class U_I18N_API CollationBaseDataBuilder : public CollationDataBuilder { public: - CollationBaseDataBuilder(UErrorCode &errorCode); + CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode); virtual ~CollationBaseDataBuilder(); diff --git a/tools/unicode/c/genuca/genuca.cpp b/tools/unicode/c/genuca/genuca.cpp index baa27c3f2ed..41e253f3cd1 100644 --- a/tools/unicode/c/genuca/genuca.cpp +++ b/tools/unicode/c/genuca/genuca.cpp @@ -24,6 +24,7 @@ #define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1 #include +#include #include "unicode/utypes.h" #include "unicode/errorcode.h" #include "unicode/localpointer.h" @@ -69,7 +70,7 @@ enum HanOrderValue { HAN_RADICAL_STROKE }; -static UBool beVerbose=FALSE, withCopyright=TRUE; +static UBool 
beVerbose=FALSE, withCopyright=TRUE, icu4xMode=FALSE; static HanOrderValue hanOrder = HAN_NO_ORDER; @@ -832,6 +833,11 @@ parseFractionalUCA(const char *filename, int32_t lineNumber = 0; char buffer[30000]; + const Normalizer2* norm = nullptr; + if (icu4xMode) { + norm = Normalizer2::getNFDInstance(*status); + } + UChar32 maxCodePoint = 0; while(!feof(data)) { if(U_FAILURE(*status)) { @@ -889,6 +895,24 @@ parseFractionalUCA(const char *filename, // CollationBaseDataBuilder::init() maps them to special CEs. // Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt. if(0xfffd <= c && c <= 0xffff) { continue; } + if (icu4xMode) { + if (c >= 0xAC00 && c <= 0xD7A3) { + // Hangul syllable + continue; + } + if (c >= 0xD800 && c < 0xE000) { + // Surrogate + continue; + } + UnicodeString src; + UnicodeString dst; + src.append(c); + norm->normalize(src, dst, *status); + if (src != dst) { + // c decomposed, skip it + continue; + } + } if(s.length() >= 2 && c == 0xFDD1) { UChar32 c2 = s.char32At(1); int32_t script = getCharScript(c2); @@ -923,7 +947,6 @@ parseFractionalUCA(const char *filename, (int)lineNumber, filename, line); exit(U_INVALID_FORMAT_ERROR); } - builder.add(prefix, s, ces, cesLength, *status); } } @@ -1126,8 +1149,9 @@ buildAndWriteBaseData(CollationBaseDataBuilder &builder, CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion); const char *dataName = - hanOrder == HAN_IMPLICIT ? "ucadata-implicithan" : - "ucadata-unihan"; + hanOrder == HAN_IMPLICIT ? + (icu4xMode ? "ucadata-implicithan-icu4x" : "ucadata-implicithan") : + (icu4xMode ? "ucadata-unihan-icu4x" : "ucadata-unihan"); UNewDataMemory *pData=udata_create(path, "icu", dataName, &ucaDataInfo, withCopyright ? 
U_COPYRIGHT_STRING : NULL, &errorCode); if(U_FAILURE(errorCode)) { @@ -1275,7 +1299,7 @@ parseAndWriteCollationRootData( const char *sourceCodePath, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } - CollationBaseDataBuilder builder(errorCode); + CollationBaseDataBuilder builder(icu4xMode, errorCode); builder.init(errorCode); parseFractionalUCA(fracUCAPath, builder, &errorCode); buildAndWriteBaseData(builder, binaryDataPath, errorCode); @@ -1289,7 +1313,8 @@ enum { HELP_QUESTION_MARK, VERBOSE, COPYRIGHT, - HAN_ORDER + HAN_ORDER, + ICU4X }; static UOption options[]={ @@ -1297,7 +1322,8 @@ static UOption options[]={ UOPTION_HELP_QUESTION_MARK, UOPTION_VERBOSE, UOPTION_COPYRIGHT, - UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG) + UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG), + UOPTION_DEF("icu4x", 'X', UOPT_NO_ARG) }; extern "C" int @@ -1348,6 +1374,7 @@ main(int argc, char* argv[]) { beVerbose=options[VERBOSE].doesOccur; withCopyright=options[COPYRIGHT].doesOccur; + icu4xMode=options[ICU4X].doesOccur; IcuToolErrorCode errorCode("genuca");