ICU-22028 Export collation and normalization data for ICU4X

2025-04-11 08:01:32 +00:00 · 2021-11-03 12:28:07 +02:00 · 2021-11-03 12:28:07 +02:00 · 3cefbd55c7
commit 3cefbd55c7
parent d7c424b00f
22 changed files with 1275 additions and 76 deletions
--- a/.ci-builds/.azure-pipelines.yml
+++ b/.ci-builds/.azure-pipelines.yml
@ -418,7 +418,7 @@ jobs:
  timeoutInMinutes: 30
  pool:
    vmImage: 'windows-2019'
-    demands: 
+    demands:
      - msbuild
      - visualstudio
      - Cmd
@ -633,6 +633,8 @@ jobs:
        cd icu4c/source
        LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_full/fast --trie-type fast --all
        LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_full/small --trie-type small --all
+        LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode norm --copyright --verbose --destdir icuexportdata_uprops_full/fast --trie-type fast --all
+        LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode norm --copyright --verbose --destdir icuexportdata_uprops_full/small --trie-type small --all
      displayName: 'Build Unicode property data export file (Full)'
    # In the sample file, include:
    # - Basic binary properties: AHex WSpace
@ -646,6 +648,18 @@ jobs:
        LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_sample/fast --trie-type fast AHex gc nt Basic_Emoji sc WSpace blank
        LD_LIBRARY_PATH=lib ./bin/icuexportdata --mode uprops --index --copyright --verbose --destdir icuexportdata_uprops_sample/small --trie-type small AHex gc nt Basic_Emoji sc WSpace blank
      displayName: 'Build Unicode property data export file (Sample)'
+    - script: |
+        mkdir -p icu4c/source/icuexportdata_uprops_full/collation_unihan
+        mkdir -p icu4c/source/icuexportdata_uprops_full/collation_implicithan
+        cd icu4c/source
+        cd data/coll
+        FILES=`echo *.txt`
+        cd -
+        LD_LIBRARY_PATH=lib ./bin/genrb -X -s data/coll/ -d icuexportdata_uprops_full/collation_unihan --ucadata data/in/coll/ucadata-unihan-icu4x.icu $FILES
+        LD_LIBRARY_PATH=lib ./bin/genrb -X -s data/coll/ -d icuexportdata_uprops_full/collation_implicithan --ucadata data/in/coll/ucadata-implicithan-icu4x.icu $FILES
+        rm icuexportdata_uprops_full/collation_unihan/*.res
+        rm icuexportdata_uprops_full/collation_implicithan/*.res
+      displayName: 'Build collation data export file'
    - task: PublishBuildArtifacts@1
      displayName: 'Publish Artifact: icuexportdata_uprops_full'
      inputs:
--- a/icu4c/source/common/udatamem.h
+++ b/icu4c/source/common/udatamem.h
@ -44,7 +44,7 @@ struct UDataMemory {
    int32_t           length;      /* Length of the data in bytes; -1 if unknown.     */
 };

-U_CFUNC UDataMemory *UDataMemory_createNewInstance(UErrorCode *pErr);
+U_CAPI  UDataMemory* U_EXPORT2 UDataMemory_createNewInstance(UErrorCode *pErr);
 U_CFUNC void         UDatamemory_assign  (UDataMemory *dest, UDataMemory *source);
 U_CFUNC void         UDataMemory_init    (UDataMemory *This);
 U_CFUNC UBool        UDataMemory_isLoaded(const UDataMemory *This);
--- a/icu4c/source/common/umapfile.h
+++ b/icu4c/source/common/umapfile.h
@ -29,7 +29,7 @@
 #include "unicode/udata.h"
 #include "putilimp.h"

-U_CFUNC UBool uprv_mapFile(UDataMemory *pdm, const char *path, UErrorCode *status);
+U_CAPI  UBool U_EXPORT2 uprv_mapFile(UDataMemory *pdm, const char *path, UErrorCode *status);
 U_CFUNC void  uprv_unmapFile(UDataMemory *pData);

 /* MAP_NONE: no memory mapping, no file access at all */
--- a/icu4c/source/data/in/coll/ucadata-implicithan-icu4x.icu
+++ b/icu4c/source/data/in/coll/ucadata-implicithan-icu4x.icu
--- a/icu4c/source/data/in/coll/ucadata-unihan-icu4x.icu
+++ b/icu4c/source/data/in/coll/ucadata-unihan-icu4x.icu
--- a/icu4c/source/data/unidata/generate.sh
+++ b/icu4c/source/data/unidata/generate.sh
@ -44,3 +44,6 @@ bazelisk run //tools/unicode/c/genprops $ICU_SRC/icu4c
 # We run it twice for different versions of the CLDR root sort order.
 bazelisk run //tools/unicode/c/genuca -- --hanOrder implicit $ICU_SRC/icu4c
 bazelisk run //tools/unicode/c/genuca -- --hanOrder radical-stroke $ICU_SRC/icu4c
+# Also generate the ICU4X versions
+bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder implicit $ICU_SRC/icu4c
+bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder radical-stroke $ICU_SRC/icu4c
--- a/icu4c/source/i18n/collation.h
+++ b/icu4c/source/i18n/collation.h
@ -221,7 +221,8 @@ public:
        /**
         * Points to contraction data.
         * Bits 31..13: Index into prefix/contraction data.
-         * Bits 12..11: Unused, 0.
+         * Bit      12: Unused, 0.
+         * Bit      11: CONTRACT_HAS_STARTER flag. (Used by ICU4X only.)
         * Bit      10: CONTRACT_TRAILING_CCC flag.
         * Bit       9: CONTRACT_NEXT_CCC flag.
         * Bit       8: CONTRACT_SINGLE_CP_NO_MATCH flag.
@ -298,6 +299,8 @@ public:
    static const uint32_t CONTRACT_NEXT_CCC = 0x200;
    /** Set if any contraction suffix ends with lccc!=0. */
    static const uint32_t CONTRACT_TRAILING_CCC = 0x400;
+    /** Set if any contraction suffix contains a starter. (Used by ICU4X only.) */
+    static const uint32_t CONTRACT_HAS_STARTER = 0x800;

    /** For HANGUL_TAG: None of its Jamo CE32s isSpecialCE32(). */
    static const uint32_t HANGUL_NO_SPECIAL_JAMO = 0x100;
--- a/icu4c/source/i18n/collationbuilder.cpp
+++ b/icu4c/source/i18n/collationbuilder.cpp
@ -198,7 +198,7 @@ const int32_t CollationBuilder::HAS_BEFORE2;
 const int32_t CollationBuilder::HAS_BEFORE3;
 #endif

-CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
+CollationBuilder::CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode)
        : nfd(*Normalizer2::getNFDInstance(errorCode)),
          fcd(*Normalizer2Factory::getFCDInstance(errorCode)),
          nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
@ -206,7 +206,8 @@ CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &erro
          baseData(b->data),
          rootElements(b->data->rootElements, b->data->rootElementsLength),
          variableTop(0),
-          dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE),
+          dataBuilder(new CollationDataBuilder(icu4xMode, errorCode)), fastLatinEnabled(TRUE),
+          icu4xMode(icu4xMode),
          errorReason(NULL),
          cesLength(0),
          rootPrimaryIndexes(errorCode), nodes(errorCode) {
@ -225,6 +226,10 @@ CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &erro
    }
 }

+// Two-argument overload: delegates to the three-argument constructor
+// with icu4xMode switched off.
+CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
+        : CollationBuilder(b, /* icu4xMode= */ FALSE, errorCode) {
+}
+
 CollationBuilder::~CollationBuilder() {
    delete dataBuilder;
 }
@ -262,15 +267,19 @@ CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
    if(U_FAILURE(errorCode)) { return NULL; }
    if(dataBuilder->hasMappings()) {
        makeTailoredCEs(errorCode);
-        closeOverComposites(errorCode);
+        if (!icu4xMode) {
+            closeOverComposites(errorCode);
+        }
        finalizeCEs(errorCode);
-        // Copy all of ASCII, and Latin-1 letters, into each tailoring.
-        optimizeSet.add(0, 0x7f);
-        optimizeSet.add(0xc0, 0xff);
-        // Hangul is decomposed on the fly during collation,
-        // and the tailoring data is always built with HANGUL_TAG specials.
-        optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
-        dataBuilder->optimize(optimizeSet, errorCode);
+        if (!icu4xMode) {
+            // Copy all of ASCII, and Latin-1 letters, into each tailoring.
+            optimizeSet.add(0, 0x7f);
+            optimizeSet.add(0xc0, 0xff);
+            // Hangul is decomposed on the fly during collation,
+            // and the tailoring data is always built with HANGUL_TAG specials.
+            optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
+            dataBuilder->optimize(optimizeSet, errorCode);
+        }
        tailoring->ensureOwnedData(errorCode);
        if(U_FAILURE(errorCode)) { return NULL; }
        if(fastLatinEnabled) { dataBuilder->enableFastLatin(); }
@ -743,14 +752,18 @@ CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix,
        }
    }
    uint32_t ce32 = Collation::UNASSIGNED_CE32;
-    if((prefix != nfdPrefix || str != nfdString) &&
+    if(!icu4xMode && (prefix != nfdPrefix || str != nfdString) &&
            !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) {
        // Map from the original input to the CEs.
        // We do this in case the canonical closure is incomplete,
        // so that it is possible to explicitly provide the missing mappings.
        ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode);
    }
-    addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
+    if (!icu4xMode) {
+        addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
+    } else {
+        addIfDifferent(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
+    }
    if(U_FAILURE(errorCode)) {
        parserErrorReason = "writing collation elements";
        return;
@ -1608,7 +1621,7 @@ CEFinalizer::~CEFinalizer() {}
 void
 CollationBuilder::finalizeCEs(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
-    LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode), errorCode);
+    LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(icu4xMode, errorCode), errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }
--- a/icu4c/source/i18n/collationbuilder.h
+++ b/icu4c/source/i18n/collationbuilder.h
@ -39,6 +39,7 @@ class Normalizer2Impl;

 class U_I18N_API CollationBuilder : public CollationRuleParser::Sink {
 public:
+    CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode);
    CollationBuilder(const CollationTailoring *base, UErrorCode &errorCode);
    virtual ~CollationBuilder();

@ -302,6 +303,7 @@ private:

    CollationDataBuilder *dataBuilder;
    UBool fastLatinEnabled;
+    UBool icu4xMode;
    UnicodeSet optimizeSet;
    const char *errorReason;

--- a/icu4c/source/i18n/collationdatabuilder.cpp
+++ b/icu4c/source/i18n/collationdatabuilder.cpp
@ -296,16 +296,19 @@ DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &

 // ------------------------------------------------------------------------- ***

-CollationDataBuilder::CollationDataBuilder(UErrorCode &errorCode)
+CollationDataBuilder::CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
        : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
          base(NULL), baseSettings(NULL),
          trie(NULL),
          ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode),
          modified(FALSE),
+          icu4xMode(icu4xMode),
          fastLatinEnabled(FALSE), fastLatinBuilder(NULL),
          collIter(NULL) {
    // Reserve the first CE32 for U+0000.
-    ce32s.addElement(0, errorCode);
+    if (!icu4xMode) {
+        ce32s.addElement(0, errorCode);
+    }
    conditionalCE32s.setDeleter(uprv_deleteConditionalCE32);
 }

@ -329,28 +332,32 @@ CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &error
    base = b;

    // For a tailoring, the default is to fall back to the base.
-    trie = utrie2_open(Collation::FALLBACK_CE32, Collation::FFFD_CE32, &errorCode);
+    // For ICU4X, use the same value for fallback as for the default
+    // to avoid having to have different blocks for the two.
+    trie = utrie2_open(Collation::FALLBACK_CE32, icu4xMode ? Collation::FALLBACK_CE32 : Collation::FFFD_CE32, &errorCode);

-    // Set the Latin-1 letters block so that it is allocated first in the data array,
-    // to try to improve locality of reference when sorting Latin-1 text.
-    // Do not use utrie2_setRange32() since that will not actually allocate blocks
-    // that are filled with the default value.
-    // ASCII (0..7F) is already preallocated anyway.
-    for(UChar32 c = 0xc0; c <= 0xff; ++c) {
-        utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode);
+    if (!icu4xMode) {
+        // Set the Latin-1 letters block so that it is allocated first in the data array,
+        // to try to improve locality of reference when sorting Latin-1 text.
+        // Do not use utrie2_setRange32() since that will not actually allocate blocks
+        // that are filled with the default value.
+        // ASCII (0..7F) is already preallocated anyway.
+        for(UChar32 c = 0xc0; c <= 0xff; ++c) {
+            utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode);
+        }
+
+        // Hangul syllables are not tailorable (except via tailoring Jamos).
+        // Always set the Hangul tag to help performance.
+        // Do this here, rather than in buildMappings(),
+        // so that we see the HANGUL_TAG in various assertions.
+        uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
+        utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode);
+
+        // Copy the set contents but don't copy/clone the set as a whole because
+        // that would copy the isFrozen state too.
+        unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
    }

-    // Hangul syllables are not tailorable (except via tailoring Jamos).
-    // Always set the Hangul tag to help performance.
-    // Do this here, rather than in buildMappings(),
-    // so that we see the HANGUL_TAG in various assertions.
-    uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
-    utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode);
-
-    // Copy the set contents but don't copy/clone the set as a whole because
-    // that would copy the isFrozen state too.
-    unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
-
    if(U_FAILURE(errorCode)) { return; }
 }

@ -567,6 +574,98 @@ CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &
    int32_t cLength = U16_LENGTH(c);
    uint32_t oldCE32 = utrie2_get32(trie, c);
    UBool hasContext = !prefix.isEmpty() || s.length() > cLength;
+
+    if (icu4xMode) {
+        if (base && c >= 0x1100 && c < 0x1200) {
+            // Omit jamo tailorings.
+            // TODO(https://github.com/unicode-org/icu4x/issues/1941).
+        }
+        const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(errorCode);
+        UnicodeString sInNfd;
+        nfdNormalizer->normalize(s, sInNfd, errorCode);
+        if (s != sInNfd) {
+            // s is not in NFD, so it cannot match in ICU4X, since ICU4X only
+            // does NFD lookups.
+            // Now check that we're only rejecting known cases.
+            if (s.length() == 2) {
+                char16_t second = s.charAt(1);
+                if (second == 0x0F73 || second == 0x0F75 || second == 0x0F81) {
+                    // Second is a special decomposing Tibetan vowel sign.
+                    // These also get added in the decomposed form, so ignoring
+                    // this instance is OK.
+                    return;
+                }
+                if (c == 0xFDD1 && second == 0xAC00) {
+                    // This strange contraction exists in the root and
+                    // doesn't have a decomposed counterpart there.
+                    // This won't match in ICU4X anyway and is very strange:
+                    // Unassigned Arabic presentation form contracting with
+                    // the very first Hangul syllable. Let's ignore this
+                    // explicitly.
+                    return;
+                }
+            }
+            // Unknown case worth investigating if ever found.
+            errorCode = U_UNSUPPORTED_ERROR;
+            return;
+        }
+
+        if (!prefix.isEmpty()) {
+            UnicodeString prefixInNfd;
+            nfdNormalizer->normalize(prefix, prefixInNfd, errorCode);
+            if (prefix != prefixInNfd) {
+                errorCode = U_UNSUPPORTED_ERROR;
+                return;
+            }
+
+            int32_t count = prefix.countChar32();
+            if (count > 2) {
+                // Prefix too long for ICU4X.
+                errorCode = U_UNSUPPORTED_ERROR;
+                return;
+            }
+            UChar32 utf32[4];
+            int32_t len = prefix.toUTF32(utf32, 4, errorCode);
+            if (len != count) {
+                errorCode = U_INVALID_STATE_ERROR;
+                return;
+            }
+            UChar32 c = utf32[0];
+            if (u_getCombiningClass(c)) {
+                // Prefix must start with a starter for ICU4X.
+                errorCode = U_UNSUPPORTED_ERROR;
+                return;
+            }
+            // XXX: Korean searchjl has jamo in prefix, so commenting out this
+            // check for now. ICU4X currently ignores non-root jamo tables anyway.
+            // searchjl was added in
+            // https://unicode-org.atlassian.net/browse/CLDR-3560
+            // Contractions were changed to prefixes in
+            // https://unicode-org.atlassian.net/browse/CLDR-6546
+            //
+            // if ((c >= 0x1100 && c < 0x1200) || (c >= 0xAC00 && c < 0xD7A4)) {
+            //     errorCode = U_UNSUPPORTED_ERROR;
+            //     return;
+            // }
+            if ((len > 1) && !(utf32[1] == 0x3099 || utf32[1] == 0x309A)) {
+                // Second character in prefix, if present, must be a kana voicing mark for ICU4X.
+                errorCode = U_UNSUPPORTED_ERROR;
+                return;
+            }
+        }
+
+        if (s.length() > cLength) {
+            // Check that there's no modern Hangul in contractions.
+            for (int32_t i = 0; i < s.length(); ++i) {
+                UChar c = s.charAt(i);
+                if ((c >= 0x1100 && c < 0x1100 + 19) || (c >= 0x1161 && c < 0x1161 + 21) || (c >= 0x11A7 && c < 0x11A7 + 28) || (c >= 0xAC00 && c < 0xD7A4)) {
+                    errorCode = U_UNSUPPORTED_ERROR;
+                    return;
+                }
+            }
+        }
+    }
+
    if(oldCE32 == Collation::FALLBACK_CE32) {
        // First tailoring for c.
        // If c has contextual base mappings or if we add a contextual mapping,
@ -688,8 +787,11 @@ CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength,
        return encodeOneCEAsCE32(0);
    } else if(cesLength == 1) {
        return encodeOneCE(ces[0], errorCode);
-    } else if(cesLength == 2) {
+    } else if(cesLength == 2 && !icu4xMode) {
        // Try to encode two CEs as one CE32.
+        // Turn this off for ICU4X, because without the canonical closure
+        // these are so rare that it doesn't make sense to spend a branch
+        // on checking this tag when using the data.
        int64_t ce0 = ces[0];
        int64_t ce1 = ces[1];
        uint32_t p0 = (uint32_t)(ce0 >> 32);
@ -1297,9 +1399,11 @@ CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode)
    setDigitTags(errorCode);
    setLeadSurrogates(errorCode);

-    // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
-    ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0);
-    utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
+    if (!icu4xMode) {
+        // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
+        ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0);
+        utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
+    }

    utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode);
    if(U_FAILURE(errorCode)) { return; }
@ -1454,6 +1558,20 @@ CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode)
                    // The last suffix character has lccc!=0, allowing for discontiguous contractions.
                    flags |= Collation::CONTRACT_TRAILING_CCC;
                }
+                if (icu4xMode && (flags & Collation::CONTRACT_HAS_STARTER) == 0) {
+                    for (int32_t i = 0; i < suffix.length();) {
+                        UChar32 c = suffix.char32At(i);
+                            if (!u_getCombiningClass(c)) {
+                                flags |= Collation::CONTRACT_HAS_STARTER;
+                                break;
+                            }
+                        if (c > 0xFFFF) {
+                            i += 2;
+                        } else {
+                            ++i;
+                        }
+                    }
+                }
                contractionBuilder.add(suffix, (int32_t)cond->ce32, errorCode);
                if(cond == lastCond) { break; }
                cond = getConditionalCE32(cond->next);
--- a/icu4c/source/i18n/collationdatabuilder.h
+++ b/icu4c/source/i18n/collationdatabuilder.h
@ -60,7 +60,7 @@ public:
        virtual int64_t modifyCE(int64_t ce) const = 0;
    };

-    CollationDataBuilder(UErrorCode &errorCode);
+    CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode);

    virtual ~CollationDataBuilder();

@ -255,6 +255,7 @@ private:
 protected:
    UnicodeSet unsafeBackwardSet;
    UBool modified;
+    UBool icu4xMode;

    UBool fastLatinEnabled;
    CollationFastLatinBuilder *fastLatinBuilder;
--- a/icu4c/source/i18n/collationroot.cpp
+++ b/icu4c/source/i18n/collationroot.cpp
@ -27,6 +27,7 @@
 #include "ucln_in.h"
 #include "udatamem.h"
 #include "umutex.h"
+#include "umapfile.h"

 U_NAMESPACE_BEGIN

@ -47,17 +48,46 @@ static UBool U_CALLCONV uprv_collation_root_cleanup() {

 U_CDECL_END

+// Maps the file at ucadataPath and wraps the mapping in a newly created UDataMemory.
+//
+// Returns NULL without doing anything if errorCode already indicates a failure.
+// Otherwise returns NULL and sets errorCode as follows:
+// - U_MISSING_RESOURCE_ERROR if uprv_mapFile() does not map the file;
+// - U_INVALID_FORMAT_ERROR if the header magic bytes are not 0xda 0x27, or if
+//   CollationDataReader::isAcceptable() rejects the data info;
+// - whatever UDataMemory_createNewInstance() reports if it fails.
+//
+// NOTE(review): no unmap call is made on the invalid-format and
+// createNewInstance-failure paths, so the mapping stays in place there.
+UDataMemory*
+CollationRoot::loadFromFile(const char* ucadataPath, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) {
+        return NULL;
+    }
+
+    UDataMemory mapped;
+    if (!uprv_mapFile(&mapped, ucadataPath, &errorCode)) {
+        errorCode = U_MISSING_RESOURCE_ERROR;
+        return NULL;
+    }
+
+    // Check the magic bytes first, then let the collation reader vet the data info.
+    if (mapped.pHeader->dataHeader.magic1 != 0xda ||
+            mapped.pHeader->dataHeader.magic2 != 0x27 ||
+            !CollationDataReader::isAcceptable(NULL, "icu", "ucadata", &mapped.pHeader->info)) {
+        errorCode = U_INVALID_FORMAT_ERROR;
+        return NULL;
+    }
+
+    UDataMemory *wrapper = UDataMemory_createNewInstance(&errorCode);
+    if (U_FAILURE(errorCode)) {
+        return NULL;
+    }
+    // Copy the mapping fields from the stack-local struct into the returned instance.
+    wrapper->pHeader = mapped.pHeader;
+    wrapper->mapAddr = mapped.mapAddr;
+    wrapper->map = mapped.map;
+    return wrapper;
+}
+
 void U_CALLCONV
-CollationRoot::load(UErrorCode &errorCode) {
+CollationRoot::load(const char* ucadataPath, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    LocalPointer<CollationTailoring> t(new CollationTailoring(NULL));
    if(t.isNull() || t->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
-    t->memory = udata_openChoice(U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll",
-                                 "icu", "ucadata",
-                                 CollationDataReader::isAcceptable, t->version, &errorCode);
+    t->memory = ucadataPath ? CollationRoot::loadFromFile(ucadataPath, errorCode) :
+                              udata_openChoice(U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll",
+                                               "icu", "ucadata",
+                                               CollationDataReader::isAcceptable,
+                                               t->version, &errorCode);
    if(U_FAILURE(errorCode)) { return; }
    const uint8_t *inBytes = static_cast<const uint8_t *>(udata_getMemory(t->memory));
    CollationDataReader::read(NULL, inBytes, udata_getLength(t->memory), *t, errorCode);
@ -73,14 +103,14 @@ CollationRoot::load(UErrorCode &errorCode) {

 const CollationCacheEntry *
 CollationRoot::getRootCacheEntry(UErrorCode &errorCode) {
-    umtx_initOnce(initOnce, CollationRoot::load, errorCode);
+    umtx_initOnce(initOnce, CollationRoot::load, static_cast<const char*>(NULL), errorCode);
    if(U_FAILURE(errorCode)) { return NULL; }
    return rootSingleton;
 }

 const CollationTailoring *
 CollationRoot::getRoot(UErrorCode &errorCode) {
-    umtx_initOnce(initOnce, CollationRoot::load, errorCode);
+    umtx_initOnce(initOnce, CollationRoot::load, static_cast<const char*>(NULL), errorCode);
    if(U_FAILURE(errorCode)) { return NULL; }
    return rootSingleton->tailoring;
 }
@ -99,6 +129,12 @@ CollationRoot::getSettings(UErrorCode &errorCode) {
    return root->settings;
 }

+// Runs the one-time root initialization (CollationRoot::load) with an explicit
+// ucadata file path instead of the NULL path that getRoot() and
+// getRootCacheEntry() pass.
+// NOTE(review): all three go through the same initOnce; presumably only the
+// first initialization takes effect, so this should be called before any
+// other root access -- confirm.
+void
+CollationRoot::forceLoadFromFile(const char* ucadataPath, UErrorCode &errorCode) {
+    umtx_initOnce(initOnce, &CollationRoot::load, ucadataPath, errorCode);
+}
+
+
 U_NAMESPACE_END

 #endif  // !UCONFIG_NO_COLLATION
--- a/icu4c/source/i18n/collationroot.h
+++ b/icu4c/source/i18n/collationroot.h
@ -15,6 +15,7 @@
 #define __COLLATIONROOT_H__

 #include "unicode/utypes.h"
+#include "unicode/udata.h"

 #if !UCONFIG_NO_COLLATION

@ -34,9 +35,11 @@ public:
    static const CollationTailoring *getRoot(UErrorCode &errorCode);
    static const CollationData *getData(UErrorCode &errorCode);
    static const CollationSettings *getSettings(UErrorCode &errorCode);
+    static void U_EXPORT2 forceLoadFromFile(const char* ucadataPath, UErrorCode &errorCode);

 private:
-    static void U_CALLCONV load(UErrorCode &errorCode);
+    static void U_CALLCONV load(const char* ucadataPath, UErrorCode &errorCode);
+    static UDataMemory* loadFromFile(const char* ucadataPath, UErrorCode &errorCode);
 };

 U_NAMESPACE_END
--- a/icu4c/source/tools/genrb/genrb.cpp
+++ b/icu4c/source/tools/genrb/genrb.cpp
@ -33,6 +33,7 @@
 #include "filterrb.h"
 #include "reslist.h"
 #include "ucmndata.h"  /* TODO: for reading the pool bundle */
+#include "collationroot.h"

 U_NAMESPACE_USE

@ -84,7 +85,9 @@ enum
    WRITE_POOL_BUNDLE,
    USE_POOL_BUNDLE,
    INCLUDE_UNIHAN_COLL,
-    FILTERDIR
+    FILTERDIR,
+    ICU4X_MODE,
+    UCADATA
 };

 UOption options[]={
@ -111,6 +114,8 @@ UOption options[]={
                      UOPTION_DEF("usePoolBundle", '\x01', UOPT_OPTIONAL_ARG),/* 20 */
                      UOPTION_DEF("includeUnihanColl", '\x01', UOPT_NO_ARG),/* 21 */ /* temporary, don't display in usage info */
                      UOPTION_DEF("filterDir", '\x01', UOPT_OPTIONAL_ARG), /* 22 */
+                      UOPTION_DEF("icu4xMode", 'X', UOPT_NO_ARG),/* 23 */
+                      UOPTION_DEF("ucadata", '\x01', UOPT_REQUIRES_ARG),/* 24 */
                  };

 static     UBool       write_java = FALSE;
@ -152,6 +157,10 @@ main(int argc,
        fprintf(stderr, "%s: cannot combine --writePoolBundle and --usePoolBundle\n", argv[0]);
        illegalArg = TRUE;
    }
+    if (options[ICU4X_MODE].doesOccur && !options[UCADATA].doesOccur) {
+        fprintf(stderr, "%s: --icu4xMode requires --ucadata\n", argv[0]);
+        illegalArg = TRUE;
+    }
    if(options[FORMAT_VERSION].doesOccur) {
        const char *s = options[FORMAT_VERSION].value;
        if(uprv_strlen(s) != 1 || (s[0] < '1' && '3' < s[0])) {
@ -302,6 +311,15 @@ main(int argc,
        }
    }

+    if (options[UCADATA].doesOccur) {
+#if !UCONFIG_NO_COLLATION
+        CollationRoot::forceLoadFromFile(options[UCADATA].value, status);
+#else
+        fprintf(stderr, "--ucadata was used with UCONFIG_NO_COLLATION\n");
+        return status;
+#endif
+    }
+
    initParser();

    /*added by Jing*/
@ -656,7 +674,7 @@ processFile(const char *filename, const char *cp,
    }
    /* Parse the data into an SRBRoot */
    data.adoptInstead(parse(ucbuf.getAlias(), inputDir, outputDir, filename,
-            !omitBinaryCollation, options[NO_COLLATION_RULES].doesOccur, &status));
+            !omitBinaryCollation, options[NO_COLLATION_RULES].doesOccur, options[ICU4X_MODE].doesOccur, &status));

    if (data.isNull() || U_FAILURE(status)) {
        fprintf(stderr, "couldn't parse the file %s. Error:%s\n", filename, u_errorName(status));
--- a/icu4c/source/tools/genrb/parse.cpp
+++ b/icu4c/source/tools/genrb/parse.cpp
@ -21,6 +21,8 @@
 */

 // Safer use of UnicodeString.
+#include <cstdint>
+#include <unicode/umachine.h>
 #ifndef UNISTR_FROM_CHAR_EXPLICIT
 #   define UNISTR_FROM_CHAR_EXPLICIT explicit
 #endif
@ -42,6 +44,7 @@
 #include "reslist.h"
 #include "rbt_pars.h"
 #include "genrb.h"
+#include "unicode/normalizer2.h"
 #include "unicode/stringpiece.h"
 #include "unicode/unistr.h"
 #include "unicode/ustring.h"
@ -59,6 +62,7 @@
 #include "collationruleparser.h"
 #include "collationtailoring.h"
 #include <stdio.h>
+#include "writesrc.h"

 /* Number of tokens to read ahead of the current stream position */
 #define MAX_LOOKAHEAD   3
@ -76,6 +80,9 @@
 #define OPENSQBRACKET    0x005B
 #define CLOSESQBRACKET   0x005D

+#define ICU4X_DIACRITIC_BASE  0x0300
+#define ICU4X_DIACRITIC_LIMIT 0x034F
+
 using icu::CharString;
 using icu::LocalMemory;
 using icu::LocalPointer;
@ -119,6 +126,7 @@ typedef struct {
    const char     *filename;
    UBool           makeBinaryCollation;
    UBool           omitCollationRules;
+    UBool           icu4xMode;
 } ParseState;

 typedef struct SResource *
@ -764,7 +772,7 @@ GenrbImporter::getRules(

    /* Parse the data into an SRBRoot */
    LocalPointer<SRBRoot> data(
-            parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), FALSE, FALSE, &errorCode));
+            parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), FALSE, FALSE, FALSE, &errorCode));
    if (U_FAILURE(errorCode)) {
        return;
    }
@ -807,6 +815,333 @@ escape(const UChar *s, char *buffer) {

 }  // namespace

+/**
+ * Opens "<name>_<collationType>_<structType>.toml" for writing, placed inside
+ * outputdir when outputdir is non-null and non-empty, and hands the new
+ * stream to usrc_writeFileNameGeneratedBy() for the "#"-prefixed header.
+ *
+ * Returns the open stream (the caller closes it), or NULL when *status holds
+ * a failure after assembling the path or when fopen() fails; in the latter
+ * case *status is set to U_FILE_ACCESS_ERROR.
+ */
+static FILE*
+openTOML(const char* outputdir, const char* name, const char* collationType, const char* structType, UErrorCode *status) {
+    // File name without directory and extension; also used in the header.
+    CharString stem;
+    stem.append(name, *status)
+        .append("_", *status)
+        .append(collationType, *status)
+        .append("_", *status)
+        .append(structType, *status);
+
+    CharString path;
+    if (outputdir && *outputdir) {
+        path.append(outputdir, *status).ensureEndsWithFileSeparator(*status);
+    }
+    path.append(stem, *status).append(".toml", *status);
+    if (U_FAILURE(*status)) {
+        return NULL;
+    }
+
+    FILE* out = fopen(path.data(), "w");
+    if (out == NULL) {
+        *status = U_FILE_ACCESS_ERROR;
+        return NULL;
+    }
+    usrc_writeFileNameGeneratedBy(out, "#", stem.data(), "genrb -X");
+    return out;
+}
+
+/**
+ * Writes the "meta" TOML file for one collation: a single `bits` entry
+ * holding metadataBits in upper-case hexadecimal.
+ * Nothing is written when openTOML() cannot provide a stream.
+ */
+static void
+writeCollationMetadataTOML(const char* outputdir, const char* name, const char* collationType, const uint32_t metadataBits, UErrorCode *status) {
+    FILE* out = openTOML(outputdir, name, collationType, "meta", status);
+    if (out != NULL) {
+        fprintf(out, "bits = 0x%X\n", metadataBits);
+        fclose(out);
+    }
+}
+
+// Writes the "dia" TOML file: a table of 16-bit secondary weights for the
+// code points starting at ICU4X_DIACRITIC_BASE.
+//
+// Returns the exclusive upper bound of the code points actually covered by
+// the table. This is ICU4X_DIACRITIC_LIMIT unless a code point was found whose
+// CE32 cannot be represented, in which case the table is cut short at that
+// code point. ICU4X_DIACRITIC_LIMIT is also returned when the file cannot be
+// opened. For "root", a CE32 that is not simple-or-long is treated as an
+// error (*status = U_INTERNAL_PROGRAM_ERROR) instead of shortening the table.
+static UChar32
+writeCollationDiacriticsTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
+    UChar32 limit = ICU4X_DIACRITIC_LIMIT;
+    FILE* f = openTOML(outputdir, name, collationType, "dia", status);
+    if (!f) {
+        return limit;
+    }
+    // printf("writeCollationDiacriticsTOML %s %s\n", name, collationType);
+    uint16_t secondaries[ICU4X_DIACRITIC_LIMIT-ICU4X_DIACRITIC_BASE];
+    for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
+        uint16_t secondary = 0;
+        // Look the code point up in this data first and fall back to the base
+        // data when this data has no entry of its own.
+        uint32_t ce32 = data->getCE32(c);
+        if (ce32 == icu::Collation::FALLBACK_CE32) {
+            ce32 = data->base->getCE32(c);
+        }
+        if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
+            // These never occur in NFD data
+            // (their table entry stays 0).
+        } else if (!icu::Collation::isSimpleOrLongCE32(ce32)) {
+            if (uprv_strcmp(name, "root") == 0) {
+                printf("UNSUPPORTED DIACRITIC CE32 in root: TAG: %X CE32: %X char: %X\n", icu::Collation::tagFromCE32(ce32), ce32, c);
+                fclose(f);
+                *status = U_INTERNAL_PROGRAM_ERROR;
+                return limit;
+            }
+            // Tailoring: end the table just before this code point.
+            limit = c;
+            break;
+        } else {
+            uint64_t ce = uint64_t(icu::Collation::ceFromCE32(ce32));
+            // Everything except bits 16..31 must match COMMON_TERTIARY_CE.
+            if ((ce & 0xFFFFFFFF0000FFFF) != uint64_t(icu::Collation::COMMON_TERTIARY_CE)) {
+                // Not a CE where only the secondary weight differs from the expected
+                // pattern.
+                limit = c;
+                break;
+            }
+            // Bits 16..31 of the CE are stored as the table entry.
+            secondary = uint16_t(ce >> 16);
+        }
+        secondaries[c - ICU4X_DIACRITIC_BASE] = secondary;
+
+    }
+    // Only the entries below the (possibly lowered) limit are written.
+    usrc_writeArray(f, "secondaries = [\n  ", secondaries, 16, limit-ICU4X_DIACRITIC_BASE, "  ", "\n]\n");
+    fclose(f);
+    return limit;
+}
+
+/**
+ * Writes the "reord" TOML file for one collation from its settings:
+ * `min_high_no_reorder`, the 256-entry 8-bit `reorder_table`, and the 32-bit
+ * `reorder_ranges` array.
+ * Returns without writing when openTOML() yields no stream.
+ */
+static void
+writeCollationReorderingTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationSettings* settings, UErrorCode *status) {
+    FILE* out = openTOML(outputdir, name, collationType, "reord", status);
+    if (out != NULL) {
+        fprintf(out, "min_high_no_reorder = 0x%X\n", settings->minHighNoReorder);
+
+        const void* table = settings->reorderTable;
+        usrc_writeArray(out, "reorder_table = [\n  ", table, 8, 256, "  ", "\n]\n");
+
+        const void* ranges = settings->reorderRanges;
+        usrc_writeArray(out, "reorder_ranges = [\n  ", ranges, 32, settings->reorderRangesLength, "  ", "\n]\n");
+
+        fclose(out);
+    }
+}
+
+
+/**
+ * Writes the "jamo" TOML file: the CE32 of every code point in
+ * U+1100..U+11FF as the 32-bit array `ce32s`.
+ * A code point without its own entry in `data` (FALLBACK_CE32) takes the
+ * value from `data->base`.
+ * Prints a diagnostic and returns when the file cannot be opened.
+ */
+static void
+writeCollationJamoTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
+    FILE* out = openTOML(outputdir, name, collationType, "jamo", status);
+    if (out == NULL) {
+        printf("writeCollationJamoTOML FAILED TO OPEN FILE %s %s\n", name, collationType);
+        return;
+    }
+
+    const UChar32 jamoStart = 0x1100;
+    const UChar32 jamoLimit = 0x1200;
+    uint32_t ce32s[jamoLimit - jamoStart];
+    for (int32_t i = 0; i < jamoLimit - jamoStart; ++i) {
+        const UChar32 c = jamoStart + i;
+        const uint32_t own = data->getCE32(c);
+        // Complex CE32s are deliberately not rejected: search collations have
+        // expansions here. Those expansions refer to the tailoring, which
+        // gets in the way of reusing these jamo tables.
+        // XXX Decide how to handle that; Hangul mini expansions (instead of
+        // Latin mini expansions) are one option.
+        // XXX Either way, validate that modern jamo are self-contained.
+        ce32s[i] = (own == icu::Collation::FALLBACK_CE32) ? data->base->getCE32(c) : own;
+    }
+
+    usrc_writeArray(out, "ce32s = [\n  ", ce32s, 32, jamoLimit - jamoStart, "  ", "\n]\n");
+    fclose(out);
+}
+
+/**
+ * Range callback passed to utrie2_enum(): copies [start, end] with `value`
+ * into the UMutableCPTrie passed as `context`.
+ * Ranges lying entirely inside U+1100..U+11FF (conjoining jamo) are skipped.
+ * Returns TRUE for skipped ranges, otherwise whether setting the range
+ * succeeded.
+ */
+static UBool
+convertTrie(const void *context, UChar32 start, UChar32 end, uint32_t value) {
+    const bool startInJamoBlock = start >= 0x1100 && start < 0x1200;
+    const bool endInJamoBlock = end >= 0x1100 && end < 0x1200;
+    if (!(startInJamoBlock && endInJamoBlock)) {
+        icu::IcuToolErrorCode errorCode("genrb: convertTrie");
+        UMutableCPTrie* builder = static_cast<UMutableCPTrie*>(const_cast<void*>(context));
+        umutablecptrie_setRange(builder, start, end, value, errorCode);
+        return U_SUCCESS(*errorCode);
+    }
+    // Range entirely in conjoining jamo block: nothing to copy.
+    return TRUE;
+}
+
+// Writes the "data" TOML file for one collation: the `contexts`, `ce32s` and
+// `ces` arrays of `data`, followed by a [trie] section holding a small
+// 32-bit code point trie rebuilt from data->trie.
+//
+// root           selects the trie's default/error value: UNASSIGNED_CE32
+//                when TRUE, FALLBACK_CE32 otherwise.
+// diacriticLimit exclusive end of the range (starting at
+//                ICU4X_DIACRITIC_BASE) that is covered by the separate
+//                diacritics table; that range is reset to the default value
+//                in the trie.
+// status         receives any failure from opening the file or from
+//                building the trie; on failure the trie and arrays are not
+//                written.
+static void
+writeCollationDataTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UBool root, UChar32 diacriticLimit, UErrorCode *status) {
+    FILE* f = openTOML(outputdir, name, collationType, "data", status);
+    if (!f) {
+        return;
+    }
+    // printf("writeCollationDataTOML %s %s\n", name, collationType);
+
+    // Use the same value for out-of-range and default in the hope of not having to allocate
+    // different blocks, since ICU4X never does out-of-range queries.
+    uint32_t trieDefault = root ? icu::Collation::UNASSIGNED_CE32 : icu::Collation::FALLBACK_CE32;
+    icu::LocalUMutableCPTriePointer builder(umutablecptrie_open(trieDefault, trieDefault, status));
+    if (U_FAILURE(*status)) {
+        // Without a builder the enumeration callback below would be handed
+        // a null trie.
+        fclose(f);
+        return;
+    }
+
+    utrie2_enum(data->trie, NULL, &convertTrie, builder.getAlias());
+
+    // If the diacritic table was cut short, copy CE32s between the lowered
+    // limit and the max limit from the root to the tailoring. As of June 2022,
+    // no collation in CLDR needs this.
+    for (UChar32 c = diacriticLimit; c < ICU4X_DIACRITIC_LIMIT; ++c) {
+        if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
+            // These never occur in NFD data.
+            continue;
+        }
+        uint32_t ce32 = data->getCE32(c);
+        // Only data with a base has anything to copy from.
+        if (ce32 == icu::Collation::FALLBACK_CE32 && data->base != NULL) {
+            ce32 = data->base->getCE32(c);
+            umutablecptrie_set(builder.getAlias(), c, ce32, status);
+        }
+    }
+
+    // Ensure that the range covered by the diacritic table isn't duplicated
+    // in the trie.
+    for (UChar32 c = ICU4X_DIACRITIC_BASE; c < diacriticLimit; ++c) {
+        if (umutablecptrie_get(builder.getAlias(), c) != trieDefault) {
+            umutablecptrie_set(builder.getAlias(), c, trieDefault, status);
+        }
+    }
+
+    icu::LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
+    builder.getAlias(),
+    UCPTRIE_TYPE_SMALL,
+    UCPTRIE_VALUE_BITS_32,
+    status));
+    if (U_FAILURE(*status)) {
+        // No usable trie to serialize; let the caller see the failure.
+        fclose(f);
+        return;
+    }
+    usrc_writeArray(f, "contexts = [\n  ", data->contexts, 16, data->contextsLength, "  ", "\n]\n");
+    usrc_writeArray(f, "ce32s = [\n  ", data->ce32s, 32, data->ce32sLength, "  ", "\n]\n");
+    usrc_writeArray(f, "ces = [\n  ", data->ces, 64, data->cesLength, "  ", "\n]\n");
+    fprintf(f, "[trie]\n");
+    usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
+
+    fclose(f);
+}
+
+// Writes the "prim" TOML file: `last_primaries` (the upper 16 bits of the
+// last primary of each of the four groups starting at
+// UCOL_REORDER_CODE_FIRST) and `numeric_primary` (the top byte of
+// data->numericPrimary).
+//
+// Sets *status to U_INTERNAL_PROGRAM_ERROR when the numeric primary has any
+// of its lower 24 bits set. The output file is closed on every path.
+static void
+writeCollationSpecialPrimariesTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
+    FILE* f = openTOML(outputdir, name, collationType, "prim", status);
+    if (!f) {
+        return;
+    }
+    // printf("writeCollationSpecialPrimariesTOML %s %s\n", name, collationType);
+
+    uint16_t lastPrimaries[4];
+    for (int32_t i = 0; i < 4; ++i) {
+        // getLastPrimaryForGroup subtracts one from a 16-bit value, so we add one
+        // back to get a value that fits in 16 bits.
+        lastPrimaries[i] = (uint16_t)((data->getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) + 1) >> 16);
+    }
+
+    uint32_t numericPrimary = data->numericPrimary;
+    if (numericPrimary & 0xFFFFFF) {
+        printf("Lower 24 bits set in numeric primary");
+        *status = U_INTERNAL_PROGRAM_ERROR;
+        // Do not leak the stream on the error path.
+        fclose(f);
+        return;
+    }
+
+    usrc_writeArray(f, "last_primaries = [\n  ", lastPrimaries, 16, 4, "  ", "\n]\n");
+    fprintf(f, "numeric_primary = 0x%X\n", numericPrimary >> 24);
+    fclose(f);
+}
+
+// Writes all ICU4X TOML files for one collation (`name` + `collationType`).
+//
+// Depending on the data and settings this produces:
+//   - for root without a base: the diacritics, jamo and special-primaries
+//     files;
+//   - for a tailoring (other than "lt") that changes a CE32 in the diacritic
+//     range: its own diacritics file;
+//   - a reordering file when the settings have reordering;
+//   - a data file when there is a base or the name is "root";
+//   - always (unless an earlier step failed) the metadata file.
+//
+// Layout of the metadata bits as assembled below:
+//   bits 0..1  max variable (values >= 4 are rejected)
+//   bit 3      tailored (data file written for a non-root name)
+//   bit 4      tailored diacritics
+//   bit 5      reordering
+//   bit 6      name is "lt"
+//   bit 7      BACKWARD_SECONDARY option set
+//   bit 8      alternate handling is UCOL_SHIFTED
+//   bit 9      case first is not UCOL_OFF
+//   bit 10     case first is UCOL_UPPER_FIRST
+//
+// Returns early, leaving *status as a failure, as soon as any step fails.
+static void
+writeCollationTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, const icu::CollationSettings* settings, UErrorCode *status) {
+    UBool tailored = FALSE;
+    UBool tailoredDiacritics = FALSE;
+    UBool lithuanianDotAbove = (uprv_strcmp(name, "lt") == 0);
+    UBool reordering = FALSE;
+    UBool isRoot = uprv_strcmp(name, "root") == 0;
+    // Exclusive end of the range covered by a diacritics table; lowered by
+    // writeCollationDiacriticsTOML if the table had to be cut short.
+    UChar32 diacriticLimit = ICU4X_DIACRITIC_LIMIT;
+    if (!data->base && isRoot) {
+        diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
+        if (U_FAILURE(*status)) {
+            return;
+        }
+        writeCollationJamoTOML(outputdir, name, collationType, data, status);
+        if (U_FAILURE(*status)) {
+            return;
+        }
+        writeCollationSpecialPrimariesTOML(outputdir, name, collationType, data, status);
+        if (U_FAILURE(*status)) {
+            return;
+        }
+    } else if (data->base && !lithuanianDotAbove) {
+        // Write a tailoring-specific diacritics table only if at least one
+        // code point in the range has a CE32 that differs from the base.
+        for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
+            if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
+                // These never occur in NFD data.
+                continue;
+            }
+            uint32_t ce32 = data->getCE32(c);
+            if ((ce32 != icu::Collation::FALLBACK_CE32) && (ce32 != data->base->getCE32(c))) {
+                tailoredDiacritics = TRUE;
+                diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
+                if (U_FAILURE(*status)) {
+                    return;
+                }
+                // One difference is enough; the table was written in full.
+                break;
+            }
+        }
+    }
+
+    if (settings->hasReordering()) {
+        reordering = TRUE;
+        // Note: There are duplicate reorderings. Expecting the ICU4X provider
+        // to take care of deduplication.
+        writeCollationReorderingTOML(outputdir, name, collationType, settings, status);
+        if (U_FAILURE(*status)) {
+            return;
+        }
+    }
+
+    // Write collation data if either base is non-null or the name is root.
+    // Languages that only reorder scripts are otherwise root-like and have
+    // null base.
+    if (data->base || isRoot) {
+        tailored = !isRoot;
+        writeCollationDataTOML(outputdir, name, collationType, data, (!data->base && isRoot), diacriticLimit, status);
+        if (U_FAILURE(*status)) {
+            return;
+        }
+    }
+
+    // The max variable setting occupies the lowest bits of the metadata.
+    uint32_t maxVariable = (uint32_t)settings->getMaxVariable();
+    if (maxVariable >= 4) {
+        printf("Max variable out of range");
+        *status = U_INTERNAL_PROGRAM_ERROR;
+        return;
+    }
+
+    uint32_t metadataBits = maxVariable;
+    if (tailored) {
+        metadataBits |= (1 << 3);
+    }
+    if (tailoredDiacritics) {
+        metadataBits |= (1 << 4);
+    }
+    if (reordering) {
+        metadataBits |= (1 << 5);
+    }
+    if (lithuanianDotAbove) {
+        metadataBits |= (1 << 6);
+    }
+    if ((settings->options & icu::CollationSettings::BACKWARD_SECONDARY) != 0) {
+        metadataBits |= (1 << 7);
+    }
+    if (settings->getAlternateHandling() == UCOL_SHIFTED) {
+        metadataBits |= (1 << 8);
+    }
+    switch (settings->getCaseFirst()) {
+        case UCOL_OFF:
+            break;
+        case UCOL_UPPER_FIRST:
+            // Upper first sets both the "case first" bit and the "upper" bit.
+            metadataBits |= (1 << 9);
+            metadataBits |= (1 << 10);
+            break;
+        case UCOL_LOWER_FIRST:
+            metadataBits |= (1 << 9);
+            break;
+        default:
+            *status = U_INTERNAL_PROGRAM_ERROR;
+            return;
+    }
+
+    writeCollationMetadataTOML(outputdir, name, collationType, metadataBits, status);
+}
+
 #endif  // !UCONFIG_NO_COLLATION

 static TableResource *
@ -952,9 +1287,9 @@ addCollation(ParseState* state, TableResource  *result, const char *collationTyp
        res_close(result);
        return NULL;  // TODO: use LocalUResourceBundlePointer for result
    }
-    icu::CollationBuilder builder(base, intStatus);
-    if(uprv_strncmp(collationType, "search", 6) == 0) {
-        builder.disableFastLatin();  // build fast-Latin table unless search collator
+    icu::CollationBuilder builder(base, state->icu4xMode, intStatus);
+    if(state->icu4xMode || (uprv_strncmp(collationType, "search", 6) == 0)) {
+        builder.disableFastLatin();  // build fast-Latin table unless search collator or ICU4X
    }
    LocalPointer<icu::CollationTailoring> t(
            builder.parseAndBuild(rules, version, &importer, &parseError, intStatus));
@ -977,6 +1312,19 @@ addCollation(ParseState* state, TableResource  *result, const char *collationTyp
            return NULL;
        }
    }
+    if (state->icu4xMode) {
+        char *nameWithoutSuffix = static_cast<char *>(uprv_malloc(uprv_strlen(state->filename) + 1));
+        if (nameWithoutSuffix == NULL) {
+            *status = U_MEMORY_ALLOCATION_ERROR;
+            res_close(result);
+            return NULL;
+        }
+        uprv_strcpy(nameWithoutSuffix, state->filename);
+        *uprv_strrchr(nameWithoutSuffix, '.') = 0;
+
+        writeCollationTOML(state->outputdir, nameWithoutSuffix, collationType, t->data, t->settings, status);
+        uprv_free(nameWithoutSuffix);
+    }
    icu::LocalMemory<uint8_t> buffer;
    int32_t capacity = 100000;
    uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
@ -1966,7 +2314,7 @@ parseResource(ParseState* state, char *tag, const struct UString *comment, UErro
 /* parse the top-level resource */
 struct SRBRoot *
 parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *filename,
-      UBool makeBinaryCollation, UBool omitCollationRules, UErrorCode *status)
+      UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status)
 {
    struct UString    *tokenValue;
    struct UString    comment;
@ -1992,6 +2340,7 @@ parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *fi
    state.filename = filename;
    state.makeBinaryCollation = makeBinaryCollation;
    state.omitCollationRules = omitCollationRules;
+    state.icu4xMode = icu4xMode;

    ustr_init(&comment);
    expect(&state, TOK_STRING, &tokenValue, &comment, NULL, status);
--- a/icu4c/source/tools/genrb/parse.h
+++ b/icu4c/source/tools/genrb/parse.h
@ -31,7 +31,7 @@ void initParser();
 /* Parse a ResourceBundle text file */
 struct SRBRoot* parse(UCHARBUF *buf, const char* inputDir, const char* outputDir,
                      const char *filename,
-                      UBool makeBinaryCollation, UBool omitCollationRules, UErrorCode *status);
+                      UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status);

 U_CDECL_END

--- a/icu4c/source/tools/icuexportdata/icuexportdata.cpp
+++ b/icu4c/source/tools/icuexportdata/icuexportdata.cpp
@ -1,7 +1,15 @@
 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html

+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
 #include <iostream>
+#include <unicode/localpointer.h>
+#include <unicode/umachine.h>
+#include <unicode/unistr.h>
+#include <unicode/urename.h>
+#include <unicode/uset.h>
 #include <vector>
 #include <algorithm>
 #include "toolutil.h"
@ -15,7 +23,10 @@
 #include "unicode/uscript.h"
 #include "unicode/putil.h"
 #include "unicode/umutablecptrie.h"
+#include "unicode/ucharstriebuilder.h"
 #include "ucase.h"
+#include "unicode/normalizer2.h"
+#include "normalizer2impl.h"
 #include "writesrc.h"

 U_NAMESPACE_USE
@ -299,6 +310,470 @@ FILE* prepareOutputFile(const char* basename) {
    return f;
 }

+#if !UCONFIG_NO_NORMALIZATION
+
+// A trie value that has been computed for a code point but not yet inserted
+// into the decomposition trie; writeDecompositionData() performs the
+// insertion after adjusting storage offsets.
+struct PendingDescriptor {
+    // Code point used as the key when the value is set in the trie.
+    UChar32 scalar;
+    // Value to store. When the upper 16 bits are zero, the low 12 bits are
+    // treated by writeDecompositionData() as a storage offset to be adjusted.
+    uint32_t descriptor;
+    // Selects which storage the offset is relative to when it is adjusted:
+    // TRUE for the 32-bit storage, FALSE for the 16-bit storage.
+    UBool supplementary;
+};
+
+// Writes the "compositions" output file and collects backward-combining
+// starters.
+//
+// For every non-surrogate code point whose raw NFC decomposition consists of
+// exactly two code points that compose back to that code point:
+//   - the second code point is added to backwardCombiningStarters if its
+//     canonical combining class is zero;
+//   - unless the composite is a Hangul syllable (U+AC00..U+D7A3), the pair is
+//     added to a UCharsTrie keyed by (second, first) with the composite as
+//     the value.
+// The built trie is written as the 16-bit array `compositions`.
+void writeCanonicalCompositions(USet* backwardCombiningStarters) {
+    IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions");
+    const char* basename = "compositions";
+    FILE* f = prepareOutputFile(basename);
+
+    LocalPointer<UCharsTrieBuilder> backwardBuilder(new UCharsTrieBuilder(status), status);
+
+    const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
+    UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
+
+    const Normalizer2* nfc = Normalizer2::getNFCInstance(status);
+    for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
+        if (c >= 0xD800 && c < 0xE000) {
+            // Surrogate
+            continue;
+        }
+        UnicodeString decomposition;
+        if (!nfc->getRawDecomposition(c, decomposition)) {
+            // No decomposition, so nothing composes to c.
+            continue;
+        }
+        int32_t len = decomposition.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
+        if (len != 2) {
+            // Only pairs are of interest here.
+            continue;
+        }
+        UChar32 starter = utf32[0];
+        UChar32 second = utf32[1];
+        UChar32 composite = nfc->composePair(starter, second);
+        if (composite < 0) {
+            // The pair does not compose.
+            continue;
+        }
+        if (c != composite) {
+            // The pair composes to something other than c: unexpected data.
+            status.set(U_INTERNAL_PROGRAM_ERROR);
+            handleError(status, basename);
+        }
+        if (!u_getCombiningClass(second)) {
+            // Recorded before the Hangul check below, so starters that only
+            // take part in Hangul compositions are included as well.
+            uset_add(backwardCombiningStarters, second);
+        }
+        if (composite >= 0xAC00 && composite <= 0xD7A3) {
+            // Hangul syllable
+            continue;
+        }
+
+        // Key is the second code point followed by the first.
+        UnicodeString backward;
+        backward.append(second);
+        backward.append(starter);
+        backwardBuilder->add(backward, int32_t(composite), status);
+    }
+    UnicodeString canonicalCompositionTrie;
+    backwardBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, canonicalCompositionTrie, status);
+
+    usrc_writeArray(f, "compositions = [\n  ", canonicalCompositionTrie.getBuffer(), 16, canonicalCompositionTrie.length(), "  ", "\n]\n");
+    fclose(f);
+    handleError(status, basename);
+}
+
+// Writes one output file named after `basename` that contains two arrays:
+// `scalars16` (len16 16-bit units from ptr16) followed by `scalars32`
+// (len32 32-bit units from ptr32).
+void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) {
+    FILE* out = prepareOutputFile(basename);
+
+    const int32_t count16 = static_cast<int32_t>(len16);
+    usrc_writeArray(out, "scalars16 = [\n  ", ptr16, 16, count16, "  ", "\n]\n");
+
+    const int32_t count32 = static_cast<int32_t>(len32);
+    usrc_writeArray(out, "scalars32 = [\n  ", ptr32, 32, count32, "  ", "\n]\n");
+
+    fclose(out);
+}
+
+// Writes the decomposition data file named after `basename`.
+//
+// The pending trie values are inserted into a 32-bit code point trie (type
+// taken from the file-level `trieType`). Values whose upper 16 bits are zero
+// carry a storage offset in their low 12 bits; that offset is rebased so
+// that it addresses the logical concatenation of 16-bit base data, 32-bit
+// base data, 16-bit supplementary data and 32-bit supplementary data.
+//
+// baseSize16 / baseSize32  lengths of the base 16-bit and 32-bit storage.
+// supplementSize16         length of the supplementary 16-bit storage.
+// uset                     set for this normalization form.
+// reference                when null, `uset` itself is written; otherwise
+//                          only a `flags` value describing how `uset`
+//                          differs from `reference` is written:
+//                          bit 0: uset additionally contains exactly
+//                                 U+FF9E and U+FF9F;
+//                          bit 1: uset lacks exactly U+0345.
+//                          Any other difference is reported as an error.
+// pendingTrieInsertions    values to insert, processed from last to first.
+void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions) {
+    IcuToolErrorCode status("icuexportdata: writeDecompositionData");
+    FILE* f = prepareOutputFile(basename);
+
+    // Zero is a magic number that means the character decomposes to itself.
+    LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
+
+    // Iterate backwards to insert lower code points in the trie first in case it matters
+    // for trie block allocation.
+    for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) {
+        const PendingDescriptor& pending = pendingTrieInsertions[i];
+        uint32_t additional = 0;
+        if (!(pending.descriptor & 0xFFFF0000)) {
+            uint32_t offset = pending.descriptor & 0xFFF;
+            if (!pending.supplementary) {
+                if (offset >= baseSize16) {
+                    // This is an offset to supplementary 16-bit data. We have
+                    // 16-bit base data and 32-bit base data before. However,
+                    // the 16-bit base data length is already part of offset.
+                    additional = baseSize32;
+                }
+            } else {
+                if (offset >= baseSize32) {
+                    // This is an offset to supplementary 32-bit data. We have 16-bit
+                    // base data, 32-bit base data, and 16-bit supplementary data before.
+                    // However, the 32-bit base data length is already part
+                    // of offset.
+                    additional = baseSize16 + supplementSize16;
+                } else {
+                    // This is an offset to 32-bit base data. We have 16-bit
+                    // base data before.
+                    additional = baseSize16;
+                }
+            }
+            // The rebased offset must still fit in the 12-bit offset field.
+            if (offset + additional > 0xFFF) {
+                status.set(U_INTERNAL_PROGRAM_ERROR);
+                handleError(status, basename);
+            }
+        }
+        umutablecptrie_set(builder.getAlias(), pending.scalar, pending.descriptor + additional, status);
+    }
+    LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
+        builder.getAlias(),
+        trieType,
+        UCPTRIE_VALUE_BITS_32,
+        status));
+    handleError(status, basename);
+
+    if (!reference) {
+        usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML);
+    } else {
+        if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) {
+            // NFD expectations don't hold. The set must not contain the half-width
+            // kana voicing marks and must contain iota subscript.
+            status.set(U_INTERNAL_PROGRAM_ERROR);
+            handleError(status, basename);
+        }
+
+        USet* halfWidthVoicing = uset_openEmpty();
+        uset_add(halfWidthVoicing, 0xFF9E);
+        uset_add(halfWidthVoicing, 0xFF9F);
+
+        USet* iotaSubscript = uset_openEmpty();
+        uset_add(iotaSubscript, 0x0345);
+
+        uint8_t flags = 0;
+
+        // What does `uset` have that `reference` lacks?
+        USet* halfWidthCheck = uset_cloneAsThawed(uset);
+        uset_removeAll(halfWidthCheck, reference);
+        if (uset_equals(halfWidthCheck, halfWidthVoicing)) {
+            flags |= 1;
+        } else if (!uset_isEmpty(halfWidthCheck)) {
+            // The result was neither empty nor contained exactly
+            // the two half-width voicing marks. The ICU4X
+            // normalizer doesn't know how to deal with this case.
+            status.set(U_INTERNAL_PROGRAM_ERROR);
+            handleError(status, basename);
+        }
+        uset_close(halfWidthCheck);
+
+        // What does `reference` have that `uset` lacks?
+        USet* iotaCheck = uset_cloneAsThawed(reference);
+        uset_removeAll(iotaCheck, uset);
+        if (uset_equals(iotaCheck, iotaSubscript)) {
+            flags |= (1 << 1);
+        } else if (!uset_isEmpty(iotaCheck)) {
+            // The result was neither empty nor contained exactly
+            // the iota subscript. The ICU4X normalizer doesn't
+            // know how to deal with this case.
+            status.set(U_INTERNAL_PROGRAM_ERROR);
+            handleError(status, basename);
+        }
+        // Close the set cloned for this check; halfWidthCheck was already
+        // closed above and must not be closed a second time.
+        uset_close(iotaCheck);
+
+        uset_close(iotaSubscript);
+        uset_close(halfWidthVoicing);
+
+        fprintf(f, "flags = 0x%X\n", flags);
+    }
+    fprintf(f, "[trie]\n");
+    usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
+    fclose(f);
+    handleError(status, basename);
+}
+
+// Collects every code point that the composing normalizer leaves unchanged
+// and that is in neither of the two exclusion sets, adds the surrogate range,
+// and writes the resulting set to the output file named after `basename`.
+//
+// A null `norm` selects the UTS 46 case: the "uts46" composing normalizer is
+// used, and code points it maps to nothing or to U+FFFD are re-normalized
+// with NFC instead.
+void writePotentialCompositionPassThrough(const char* basename, const Normalizer2* norm, const USet* decompositionStartsWithNonStarter, const USet* decompositionStartsWithBackwardCombiningStarter, USet* potentialPassthroughAndNotBackwardCombining) {
+    IcuToolErrorCode status("icuexportdata: writePotentialCompositionPassThrough");
+    FILE* out = prepareOutputFile(basename);
+
+    // Stays null outside of the UTS 46 case.
+    const Normalizer2* nfcFallback = nullptr;
+    if (norm == nullptr) {
+        norm = Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, status);
+        nfcFallback = Normalizer2::getNFCInstance(status);
+    }
+
+    for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
+        const bool isSurrogate = (c >= 0xD800 && c < 0xE000);
+        if (isSurrogate
+                || uset_contains(decompositionStartsWithNonStarter, c)
+                || uset_contains(decompositionStartsWithBackwardCombiningStarter, c)) {
+            continue;
+        }
+
+        UnicodeString input;
+        input.append(c);
+        UnicodeString normalized;
+        norm->normalize(input, normalized, status);
+
+        if (nfcFallback != nullptr) {
+            // UTS 46 ignored and disallowed characters fall back to NFC so
+            // that the data overlaps.
+            const bool mappedToReplacement = (c != 0xFFFD) && (normalized == u"\uFFFD");
+            if (normalized.isEmpty() || mappedToReplacement) {
+                normalized.truncate(0);
+                nfcFallback->normalize(input, normalized, status);
+            }
+        }
+
+        if (input == normalized) {
+            uset_add(potentialPassthroughAndNotBackwardCombining, c);
+        }
+    }
+
+    // The surrogate range forms a useless discontinuity. The code that reads
+    // from the set never looks up by surrogate, so the surrogate range is put
+    // in the set as a micro-optimization.
+    uset_addRange(potentialPassthroughAndNotBackwardCombining, 0xD800, 0xDFFF);
+
+    usrc_writeUnicodeSet(out, potentialPassthroughAndNotBackwardCombining, UPRV_TARGET_SYNTAX_TOML);
+    fclose(out);
+    handleError(status, basename);
+}
+
+// Computes data for canonical decompositions
+void computeDecompositions(const char* basename, const USet* backwardCombiningStarters, std::vector<uint16_t>& storage16, std::vector<uint32_t>& storage32, USet* decompositionStartsWithNonStarter, USet* decompositionStartsWithBackwardCombiningStarter, std::vector<PendingDescriptor>& pendingTrieInsertions) {
+    IcuToolErrorCode status("icuexportdata: computeDecompositions");
+    const Normalizer2* mainNormalizer;
+    const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status);
+    if (uprv_strcmp(basename, "nfkd") == 0) {
+        mainNormalizer = Normalizer2::getNFKDInstance(status);
+    } else if (uprv_strcmp(basename, "uts46d") == 0) {
+        mainNormalizer = Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, status);
+    } else {
+        mainNormalizer = nfdNormalizer;
+    }
+
+    // Max length as of Unicode 14 is 4 for NFD. For NFKD the max
+    // is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB).
+    const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9;
+    const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8;
+    const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
+    UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
+
+    // Iterate over all scalar values excluding Hangul syllables.
+    //
+    // We go backwards in order to better find overlapping decompositions.
+    //
+    // As of Unicode 14:
+    // Iterate forward without overlap search:
+    // nfd: 16 size: 896, 32 size: 173
+    // nfkd: 16 size: 3854, 32 size: 179
+    //
+    // Iterate forward with overlap search:
+    // nfd: 16 size: 888, 32 size: 173
+    // nfkd: 16 size: 3266, 32 size: 179
+    //
+    // Iterate backward with overlap search:
+    // nfd: 16 size: 776, 32 size: 173
+    // nfkd: 16 size: 2941, 32 size: 179
+    //
+    // UChar32 is signed!
+    for (UChar32 c = 0x10FFFF; c >= 0; --c) {
+        if (c >= 0xAC00 && c <= 0xD7A3) {
+            // Hangul syllable
+            continue;
+        }
+        if (c >= 0xD800 && c < 0xE000) {
+            // Surrogate
+            continue;
+        }
+        UnicodeString src;
+        UnicodeString dst;
+        src.append(c);
+        if (mainNormalizer != nfdNormalizer) {
+            UnicodeString inter;
+            mainNormalizer->normalize(src, inter, status);
+            nfdNormalizer->normalize(inter, dst, status);
+        } else {
+            nfdNormalizer->normalize(src, dst, status);
+        }
+        int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
+        if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
+            // Characters that normalize to nothing or to U+FFFD (without the
+            // input being U+FFFD) in ICU4C's UTS 46 normalization normalize
+            // as in NFD in ICU4X's UTS 46 normalization in the interest
+            // of data size and ICU4X's normalizer being unable to handle
+            // normalizing to nothing.
+            // When UTS 46 is implemented on top of ICU4X, a preprocessing
+            // step is supposed to remove these characters before the
+            // normalization step.
+            if (uprv_strcmp(basename, "uts46d") != 0) {
+                status.set(U_INTERNAL_PROGRAM_ERROR);
+                handleError(status, basename);
+            }
+            nfdNormalizer->normalize(src, dst, status);
+            len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
+            if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
+                status.set(U_INTERNAL_PROGRAM_ERROR);
+                handleError(status, basename);
+            }
+        }
+        if (len > DECOMPOSITION_BUFFER_SIZE) {
+            status.set(U_INTERNAL_PROGRAM_ERROR);
+            handleError(status, basename);
+        }
+        bool startsWithNonStarter = u_getCombiningClass(utf32[0]);
+        if (startsWithNonStarter) {
+            uset_add(decompositionStartsWithNonStarter, c);
+        } else if (uset_contains(backwardCombiningStarters, c)) {
+            uset_add(decompositionStartsWithBackwardCombiningStarter, c);
+        }
+        if (mainNormalizer != nfdNormalizer) {
+            UnicodeString nfd;
+            nfdNormalizer->normalize(src, nfd, status);
+            if (dst == nfd) {
+                continue;
+            }
+        } else {
+            if (src == dst) {
+                continue;
+            }
+        }
+        if (startsWithNonStarter && !(c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F)) {
+            // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
+            status.set(U_INTERNAL_PROGRAM_ERROR);
+            handleError(status, basename);
+        }
+        if (len == 1 && utf32[0] <= 0xFFFF) {
+            if (utf32[0] == 1) {
+                // 1 is reserved as a marker for the expansion of U+FDFA.
+                status.set(U_INTERNAL_PROGRAM_ERROR);
+                handleError(status, basename);
+            }
+            pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, FALSE});
+        } else if (len == 2 && utf32[0] <= 0xFFFF && utf32[1] <= 0xFFFF && !u_getCombiningClass(utf32[0]) && u_getCombiningClass(utf32[1])) {
+            pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), FALSE});
+        } else {
+            UBool supplementary = FALSE;
+            UBool nonInitialStarter = FALSE;
+            for (int32_t i = 0; i < len; ++i) {
+                if (utf32[i] > 0xFFFF) {
+                    supplementary = TRUE;
+                }
+                if (utf32[i] == 0) {
+                    status.set(U_INTERNAL_PROGRAM_ERROR);
+                    handleError(status, basename);
+                }
+                if (i != 0 && !u_getCombiningClass(utf32[i])) {
+                    nonInitialStarter = TRUE;
+                }
+            }
+            if (!supplementary) {
+                if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) {
+                    if (len == 18 && c == 0xFDFA) {
+                        // Special marker for the one character whose decomposition
+                        // is too long.
+                        pendingTrieInsertions.push_back({c, 1 << 16, supplementary});
+                        continue;
+                    } else {
+                        status.set(U_INTERNAL_PROGRAM_ERROR);
+                        handleError(status, basename);
+                    }
+                }
+            } else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) {
+                status.set(U_INTERNAL_PROGRAM_ERROR);
+                handleError(status, basename);
+            }
+            // Complex decomposition
+            // Format for 16-bit value:
+            // 15..13: length minus two for 16-bit case and length minus one for
+            //         the 32-bit case. Length 8 needs to fit in three bits in
+            //         the 16-bit case, and this way the value is future-proofed
+            //         up to 9 in the 16-bit case. Zero is unused and length one
+            //         in the 16-bit case goes directly into the trie.
+            //     12: 1 if all trailing characters are guaranteed non-starters,
+            //         0 if no guarantees about non-starterness.
+            //         Note: The bit choice is this way around to allow for
+            //         dynamically falling back to not having this but instead
+            //         having one more bit for length by merely choosing
+            //         different masks.
+            //  11..0: Start offset in storage. The offset is to the logical
+            //         sequence of scalars16, scalars32, supplementary_scalars16,
+            //         supplementary_scalars32.
+            uint32_t descriptor = uint32_t(!nonInitialStarter) << 12;
+            if (!supplementary) {
+                descriptor |= (uint32_t(len) - 2) << 13;
+            } else {
+                descriptor |= (uint32_t(len) - 1) << 13;
+            }
+            if (descriptor & 0xFFF) {
+                status.set(U_INTERNAL_PROGRAM_ERROR);
+                handleError(status, basename);
+            }
+            size_t index = 0;
+            bool writeToStorage = FALSE;
+            // Sadly, C++ lacks break and continue by label, so using goto in the
+            // inner loops to break or continue the outer loop.
+            if (!supplementary) {
+                outer16: for (;;) {
+                    if (index == storage16.size()) {
+                        writeToStorage = TRUE;
+                        break;
+                    }
+                    if (storage16[index] == utf32[0]) {
+                        for (int32_t i = 1; i < len; ++i) {
+                            if (storage16[index + i] != uint32_t(utf32[i])) {
+                                ++index;
+                                // continue outer
+                                goto outer16;
+                            }
+                        }
+                        // break outer
+                        goto after;
+                    }
+                    ++index;
+                }
+            } else {
+                outer32: for (;;) {
+                    if (index == storage32.size()) {
+                        writeToStorage = TRUE;
+                        break;
+                    }
+                    if (storage32[index] == uint32_t(utf32[0])) {
+                        for (int32_t i = 1; i < len; ++i) {
+                            if (storage32[index + i] != uint32_t(utf32[i])) {
+                                ++index;
+                                // continue outer
+                                goto outer32;
+                            }
+                        }
+                        // break outer
+                        goto after;
+                    }
+                    ++index;
+                }
+            }
+            after:
+            if (index > 0xFFF) {
+                status.set(U_INTERNAL_PROGRAM_ERROR);
+                handleError(status, basename);
+            }
+            descriptor |= uint32_t(index);
+            if (!descriptor || descriptor > 0xFFFF) {
+                // > 0xFFFF should never happen if the code above is correct.
+                // == 0 should not happen due to the nature of the data.
+                status.set(U_INTERNAL_PROGRAM_ERROR);
+                handleError(status, basename);
+            }
+            if (writeToStorage) {
+                if (!supplementary) {
+                    for (int32_t i = 0; i < len; ++i) {
+                        storage16.push_back(uint16_t(utf32[i]));
+                    }
+                } else {
+                    for (int32_t i = 0; i < len; ++i) {
+                        storage32.push_back(uint32_t(utf32[i]));
+                    }
+                }
+            }
+            pendingTrieInsertions.push_back({c, descriptor, supplementary});
+        }
+    }
+    if (storage16.size() + storage32.size() > 0xFFF) {
+        status.set(U_INTERNAL_PROGRAM_ERROR);
+    }
+    handleError(status, basename);
+}
+
+#endif // !UCONFIG_NO_NORMALIZATION
+
 enum {
    OPT_HELP_H,
    OPT_HELP_QUESTION_MARK,
@ -341,7 +816,7 @@ void printHelp(FILE* stdfile, const char* program) {
          "options:\n"
          "\t-h or -? or --help  this usage text\n"
          "\t-V or --version     show a version message\n"
-          "\t-m or --mode        mode: currently only 'uprops' and 'ucase', but more may be added\n"
+          "\t-m or --mode        mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n"
          "\t      --trie-type   set the trie type (small or fast, default small)\n"
          "\t-d or --destdir     destination directory, followed by the path\n"
          "\t      --all         write out all properties known to icuexportdata\n"
@ -387,6 +862,46 @@ int exportUprops(int argc, char* argv[]) {
        }
    }

+    if (propNames.empty()
+            || options[OPT_HELP_H].doesOccur
+            || options[OPT_HELP_QUESTION_MARK].doesOccur
+            || !options[OPT_MODE].doesOccur) {
+        FILE *stdfile=argc<0 ? stderr : stdout;
+        fprintf(stdfile,
+            "usage: %s -m uprops [-options] [--all | properties...]\n"
+            "\tdump Unicode property data to .toml files\n"
+            "options:\n"
+            "\t-h or -? or --help  this usage text\n"
+            "\t-V or --version     show a version message\n"
+            "\t-m or --mode        mode: currently only 'uprops', but more may be added\n"
+            "\t      --trie-type   set the trie type (small or fast, default small)\n"
+            "\t-d or --destdir     destination directory, followed by the path\n"
+            "\t      --all         write out all properties known to icuexportdata\n"
+            "\t      --index       write an _index.toml summarizing all data exported\n"
+            "\t-c or --copyright   include a copyright notice\n"
+            "\t-v or --verbose     Turn on verbose output\n"
+            "\t-q or --quiet       do not display warnings and progress\n",
+            argv[0]);
+        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
+    }
+
+    const char* mode = options[OPT_MODE].value;
+    if (uprv_strcmp(mode, "uprops") != 0) {
+        fprintf(stderr, "Invalid option for --mode (must be uprops)\n");
+        return U_ILLEGAL_ARGUMENT_ERROR;
+    }
+
+    if (options[OPT_TRIE_TYPE].doesOccur) {
+        if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
+            trieType = UCPTRIE_TYPE_FAST;
+        } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
+            trieType = UCPTRIE_TYPE_SMALL;
+        } else {
+            fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
+            return U_ILLEGAL_ARGUMENT_ERROR;
+        }
+    }
+
    for (const char* propName : propNames) {
        UProperty propEnum = u_getPropertyEnum(propName);
        if (propEnum == UCHAR_INVALID_CODE) {
@ -505,6 +1020,81 @@ int exportCase(int argc, char* argv[]) {
    return 0;
 }

+#if !UCONFIG_NO_NORMALIZATION
+
+/**
+ * Runs the "norm" export mode: computes the NFD, NFKD and UTS 46 decomposition
+ * data over shared 16-bit/32-bit scalar storage, hands the results to the
+ * write* helpers, and then writes the NFC/NFKC/UTS 46 pass-through sets.
+ *
+ * Takes no arguments; unlike exportUprops()/exportCase() it is called from
+ * main() without argc/argv.
+ *
+ * @return 0 when every step completed. Failures are reported through
+ *         handleError(), which is expected not to return on a failed status
+ *         (NOTE(review): confirm against handleError's definition).
+ */
+int exportNorm() {
+    IcuToolErrorCode status("icuexportdata: exportNorm");
+    // This set is passed to writeCanonicalCompositions() first and then reused
+    // as an input by all three decomposition passes below.
+    USet* backwardCombiningStarters = uset_openEmpty();
+    writeCanonicalCompositions(backwardCombiningStarters);
+
+    // Scalar storage shared by the NFD, NFKD and UTS 46 passes; each pass may
+    // append to it.
+    std::vector<uint16_t> storage16;
+    std::vector<uint32_t> storage32;
+
+    USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty();
+    USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
+    std::vector<PendingDescriptor> nfdPendingTrieInsertions;
+    computeDecompositions("nfd", backwardCombiningStarters, storage16, storage32, nfdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithBackwardCombiningStarter, nfdPendingTrieInsertions);
+
+    // Everything stored so far came from the NFD pass. Remember where it ends so
+    // that what NFKD and UTS 46 add can be written as a separate table.
+    uint32_t baseSize16 = storage16.size();
+    uint32_t baseSize32 = storage32.size();
+
+    USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty();
+    USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
+    std::vector<PendingDescriptor> nfkdPendingTrieInsertions;
+    computeDecompositions("nfkd", backwardCombiningStarters, storage16, storage32, nfkdDecompositionStartsWithNonStarter, nfkdDecompositionStartsWithBackwardCombiningStarter, nfkdPendingTrieInsertions);
+
+    USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty();
+    USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
+    std::vector<PendingDescriptor> uts46PendingTrieInsertions;
+    computeDecompositions("uts46d", backwardCombiningStarters, storage16, storage32, uts46DecompositionStartsWithNonStarter, uts46DecompositionStartsWithBackwardCombiningStarter, uts46PendingTrieInsertions);
+
+    // Number of entries appended after the NFD pass (by NFKD and UTS 46 together).
+    uint32_t supplementSize16 = storage16.size() - baseSize16;
+    uint32_t supplementSize32 = storage32.size() - baseSize32;
+
+    // NFKD and UTS 46 additionally receive the NFD starts-with-non-starter set;
+    // NFD itself passes nullptr in that position.
+    writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions);
+    writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions);
+    writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions);
+
+    // "nfdex" gets the NFD-only prefix of the storage, "nfkdex" the part that
+    // was appended afterwards.
+    writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32);
+    writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32);
+
+    USet* nfcPotentialPassthroughAndNotBackwardCombining = uset_openEmpty();
+    const Normalizer2* nfc = Normalizer2::getNFCInstance(status);
+    // Stop here if the normalizer could not be obtained: a null normalizer is
+    // what the "uts46" call below passes on purpose, so a failed load must not
+    // be handed on as if it were that case.
+    handleError(status, "exportNorm");
+    writePotentialCompositionPassThrough("nfc", nfc, nfdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithBackwardCombiningStarter, nfcPotentialPassthroughAndNotBackwardCombining);
+
+    USet* nfkcPotentialPassthroughAndNotBackwardCombining = uset_openEmpty();
+    const Normalizer2* nfkc = Normalizer2::getNFKCInstance(status);
+    // Same early check as for NFC above.
+    handleError(status, "exportNorm");
+    writePotentialCompositionPassThrough("nfkc", nfkc, nfkdDecompositionStartsWithNonStarter, nfkdDecompositionStartsWithBackwardCombiningStarter, nfkcPotentialPassthroughAndNotBackwardCombining);
+
+    USet* uts46PotentialPassthroughAndNotBackwardCombining = uset_openEmpty();
+    writePotentialCompositionPassThrough("uts46", nullptr, uts46DecompositionStartsWithNonStarter, uts46DecompositionStartsWithBackwardCombiningStarter, uts46PotentialPassthroughAndNotBackwardCombining);
+
+    // Check that NFKC set has no characters that NFC doesn't also have.
+    // Note that this empties the NFKC set as a side effect; it is only closed
+    // afterwards.
+    uset_removeAll(nfkcPotentialPassthroughAndNotBackwardCombining, nfcPotentialPassthroughAndNotBackwardCombining);
+    if (!uset_isEmpty(nfkcPotentialPassthroughAndNotBackwardCombining)) {
+        status.set(U_INTERNAL_PROGRAM_ERROR);
+        handleError(status, "exportNorm");
+    }
+
+    uset_close(nfcPotentialPassthroughAndNotBackwardCombining);
+    uset_close(nfkcPotentialPassthroughAndNotBackwardCombining);
+    uset_close(uts46PotentialPassthroughAndNotBackwardCombining);
+
+    uset_close(nfdDecompositionStartsWithNonStarter);
+    uset_close(nfkdDecompositionStartsWithNonStarter);
+    uset_close(uts46DecompositionStartsWithNonStarter);
+
+    uset_close(nfdDecompositionStartsWithBackwardCombiningStarter);
+    uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter);
+    uset_close(uts46DecompositionStartsWithBackwardCombiningStarter);
+
+    uset_close(backwardCombiningStarters);
+    handleError(status, "exportNorm");
+    return 0;
+}
+
+#endif // !UCONFIG_NO_NORMALIZATION
+
 int main(int argc, char* argv[]) {
    U_MAIN_INIT_ARGS(argc, argv);

@ -553,12 +1143,20 @@ int main(int argc, char* argv[]) {
    }

    const char* mode = options[OPT_MODE].value;
+    if (uprv_strcmp(mode, "norm") == 0) {
+#if !UCONFIG_NO_NORMALIZATION
+        return exportNorm();
+#else
+    fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n");
+    return U_ILLEGAL_ARGUMENT_ERROR;
+#endif
+    }
    if (uprv_strcmp(mode, "uprops") == 0) {
        return exportUprops(argc, argv);
    } else if (uprv_strcmp(mode, "ucase") == 0) {
        return exportCase(argc, argv);
    }

-    fprintf(stderr, "Invalid option for --mode (must be uprops or ucase)\n");
+    fprintf(stderr, "Invalid option for --mode (must be uprops, ucase, or norm)\n");
    return U_ILLEGAL_ARGUMENT_ERROR;
 }
--- a/icu4c/source/tools/toolutil/writesrc.cpp
+++ b/icu4c/source/tools/toolutil/writesrc.cpp
@ -19,6 +19,7 @@
 */

 #include <stdio.h>
+#include <inttypes.h>
 #include <time.h>
 #include "unicode/utypes.h"
 #include "unicode/putil.h"
@ -143,12 +144,14 @@ usrc_writeArray(FILE *f,
    const uint8_t *p8;
    const uint16_t *p16;
    const uint32_t *p32;
-    uint32_t value;
+    const int64_t *p64; // Signed due to TOML!
+    int64_t value; // Signed due to TOML!
    int32_t i, col;

    p8=NULL;
    p16=NULL;
    p32=NULL;
+    p64=NULL;
    switch(width) {
    case 8:
        p8=(const uint8_t *)p;
@ -159,6 +162,9 @@ usrc_writeArray(FILE *f,
    case 32:
        p32=(const uint32_t *)p;
        break;
+    case 64:
+        p64=(const int64_t *)p;
+        break;
    default:
        fprintf(stderr, "usrc_writeArray(width=%ld) unrecognized width\n", (long)width);
        return;
@ -186,11 +192,14 @@ usrc_writeArray(FILE *f,
        case 32:
            value=p32[i];
            break;
+        case 64:
+            value=p64[i];
+            break;
        default:
            value=0; /* unreachable */
            break;
        }
-        fprintf(f, value<=9 ? "%lu" : "0x%lx", (unsigned long)value);
+        fprintf(f, value<=9 ? "%" PRId64 : "0x%" PRIx64, value);
    }
    if(postfix!=NULL) {
        fputs(postfix, f);
--- a/icu4c/source/tools/toolutil/writesrc.h
+++ b/icu4c/source/tools/toolutil/writesrc.h
@ -69,7 +69,7 @@ usrc_writeFileNameGeneratedBy(
        const char *generator);

 /**
- * Writes the contents of an array of 8/16/32-bit words.
+ * Writes the contents of an array of 8/16/32/64-bit words.
 * The prefix and postfix are optional (can be NULL) and are written first/last.
 * The prefix may contain a %ld or similar field for the array length.
 * The {} and declaration etc. need to be included in prefix/postfix or
--- a/tools/unicode/c/genuca/collationbasedatabuilder.cpp
+++ b/tools/unicode/c/genuca/collationbasedatabuilder.cpp
@ -83,14 +83,15 @@ binarySearch(const UVector64 &list, int64_t ce) {

 }  // namespace

-CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode)
-        : CollationDataBuilder(errorCode),
+// Constructs the base-data builder.
+// icu4xMode is forwarded to the CollationDataBuilder constructor and also
+// stored on this object; errorCode is passed to the base class and to
+// rootElements. The script index/start tables are zero-filled.
+CollationBaseDataBuilder::CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
+        : CollationDataBuilder(icu4xMode, errorCode),
          numericPrimary(0x12000000),
          firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
          rootElements(errorCode),
          scriptStartsLength(1) {
    uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex));
    uprv_memset(scriptStarts, 0, sizeof(scriptStarts));
+    // init() reads icu4xMode to decide how much of the trie to preallocate and
+    // whether to add the Hangul range.
+    // NOTE(review): the same flag is already passed to the CollationDataBuilder
+    // constructor above; confirm whether this assignment is redundant.
+    this->icu4xMode = icu4xMode;
 }

 CollationBaseDataBuilder::~CollationBaseDataBuilder() {
@ -119,7 +120,9 @@ CollationBaseDataBuilder::init(UErrorCode &errorCode) {
    trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode);

    // Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches.
-    for(UChar32 c = 0; c < 0x180; ++c) {
+    // In the ICU4X case, only preallocate ASCII, because we don't store CE32s for
+    // precomposed characters.
+    for(UChar32 c = 0; c < (icu4xMode ? 0x80 : 0x180); ++c) {
        utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode);
    }

@ -128,8 +131,10 @@ CollationBaseDataBuilder::init(UErrorCode &errorCode) {
    // Some code assumes that the root first primary CE is the "space first primary"
    // from FractionalUCA.txt.

-    uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
-    utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
+    if (!icu4xMode) {
+        uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
+        utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
+    }

    // Add a mapping for the first-unassigned boundary,
    // which is the AlphabeticIndex overflow boundary.
--- a/tools/unicode/c/genuca/collationbasedatabuilder.h
+++ b/tools/unicode/c/genuca/collationbasedatabuilder.h
@ -37,7 +37,7 @@ U_NAMESPACE_BEGIN
 */
 class U_I18N_API CollationBaseDataBuilder : public CollationDataBuilder {
 public:
-    CollationBaseDataBuilder(UErrorCode &errorCode);
+    CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode);

    virtual ~CollationBaseDataBuilder();

--- a/tools/unicode/c/genuca/genuca.cpp
+++ b/tools/unicode/c/genuca/genuca.cpp
@ -24,6 +24,7 @@
 #define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1

 #include <stdio.h>
+#include <stdint.h>
 #include "unicode/utypes.h"
 #include "unicode/errorcode.h"
 #include "unicode/localpointer.h"
@ -69,7 +70,7 @@ enum HanOrderValue {
    HAN_RADICAL_STROKE
 };

-static UBool beVerbose=FALSE, withCopyright=TRUE;
+static UBool beVerbose=FALSE, withCopyright=TRUE, icu4xMode=FALSE;

 static HanOrderValue hanOrder = HAN_NO_ORDER;

@ -832,6 +833,11 @@ parseFractionalUCA(const char *filename,
    int32_t lineNumber = 0;
    char buffer[30000];

+    const Normalizer2* norm = nullptr;
+    if (icu4xMode) {
+        norm = Normalizer2::getNFDInstance(*status);
+    }
+
    UChar32 maxCodePoint = 0;
    while(!feof(data)) {
        if(U_FAILURE(*status)) {
@ -889,6 +895,24 @@ parseFractionalUCA(const char *filename,
                // CollationBaseDataBuilder::init() maps them to special CEs.
                // Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt.
                if(0xfffd <= c && c <= 0xffff) { continue; }
+                if (icu4xMode) {
+                    if (c >= 0xAC00 && c <= 0xD7A3) {
+                        // Hangul syllable
+                        continue;
+                    }
+                    if (c >= 0xD800 && c < 0xE000) {
+                        // Surrogate
+                        continue;
+                    }
+                    UnicodeString src;
+                    UnicodeString dst;
+                    src.append(c);
+                    norm->normalize(src, dst, *status);
+                    if (src != dst) {
+                        // c decomposed, skip it
+                        continue;
+                    }
+                }
                if(s.length() >= 2 && c == 0xFDD1) {
                    UChar32 c2 = s.char32At(1);
                    int32_t script = getCharScript(c2);
@ -923,7 +947,6 @@ parseFractionalUCA(const char *filename,
                            (int)lineNumber, filename, line);
                    exit(U_INVALID_FORMAT_ERROR);
                }
-
                builder.add(prefix, s, ces, cesLength, *status);
            }
        }
@ -1126,8 +1149,9 @@ buildAndWriteBaseData(CollationBaseDataBuilder &builder,

    CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion);
    const char *dataName =
-        hanOrder == HAN_IMPLICIT ? "ucadata-implicithan" :
-        "ucadata-unihan";
+        hanOrder == HAN_IMPLICIT ?
+            (icu4xMode ? "ucadata-implicithan-icu4x" : "ucadata-implicithan") :
+            (icu4xMode ? "ucadata-unihan-icu4x" : "ucadata-unihan");
    UNewDataMemory *pData=udata_create(path, "icu", dataName, &ucaDataInfo,
                                       withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
    if(U_FAILURE(errorCode)) {
@ -1275,7 +1299,7 @@ parseAndWriteCollationRootData(
        const char *sourceCodePath,
        UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
-    CollationBaseDataBuilder builder(errorCode);
+    CollationBaseDataBuilder builder(icu4xMode, errorCode);
    builder.init(errorCode);
    parseFractionalUCA(fracUCAPath, builder, &errorCode);
    buildAndWriteBaseData(builder, binaryDataPath, errorCode);
@ -1289,7 +1313,8 @@ enum {
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
-    HAN_ORDER
+    HAN_ORDER,
+    ICU4X
 };

 static UOption options[]={
@ -1297,7 +1322,8 @@ static UOption options[]={
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
-    UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG)
+    UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG),
+    UOPTION_DEF("icu4x", 'X', UOPT_NO_ARG)
 };

 extern "C" int
@ -1348,6 +1374,7 @@ main(int argc, char* argv[]) {

    beVerbose=options[VERBOSE].doesOccur;
    withCopyright=options[COPYRIGHT].doesOccur;
+    icu4xMode=options[ICU4X].doesOccur;

    IcuToolErrorCode errorCode("genuca");