From f4234577ad723ec27c251a5b88733728aad212e8 Mon Sep 17 00:00:00 2001
From: Markus Scherer
Date: Fri, 2 Jun 2023 19:18:35 +0000
Subject: [PATCH] ICU-22404 initial changes.txt for Unicode 15.1

See #2490
- copy the 15.0 change log to the top, unchanged
- adjust changes.txt for 15.1, incl. diffs from the CLDR 43 root collation update
---
 icu4c/source/data/unidata/changes.txt | 349 ++++++++++++++++++++++++++
 1 file changed, 349 insertions(+)

diff --git a/icu4c/source/data/unidata/changes.txt b/icu4c/source/data/unidata/changes.txt
index 9345f26bf53..27cc337c158 100644
--- a/icu4c/source/data/unidata/changes.txt
+++ b/icu4c/source/data/unidata/changes.txt
@@ -38,6 +38,355 @@ and see the change logs below.

---------------------------------------------------------------------------- ***

Unicode 15.1 update for ICU 74

https://www.unicode.org/versions/Unicode15.1.0/
https://www.unicode.org/versions/beta-15.1.0.html
https://www.unicode.org/Public/draft/
https://www.unicode.org/reports/uax-proposed-updates.html
https://www.unicode.org/reports/tr44/tr44-31.html

https://unicode-org.atlassian.net/browse/ICU-22404 Unicode 15.1
https://unicode-org.atlassian.net/browse/CLDR-16669 BRS Unicode 15.1

https://github.com/unicode-org/unicodetools/issues/492 adjust cldr/*BreakTest generation for Unicode 15.1

* Command-line environment setup

export UNICODE_DATA=~/unidata/uni15.1/snapshot
export CLDR_SRC=~/cldr/uni/src
export ICU_ROOT=~/icu/uni
export ICU_SRC=$ICU_ROOT/src
export ICUDT=icudt73b
export ICU4C_DATA_IN=$ICU_SRC/icu4c/source/data/in
export ICU4C_UNIDATA=$ICU_SRC/icu4c/source/data/unidata
export LD_LIBRARY_PATH=$ICU_ROOT/dbg/icu4c/lib

TODO

*** Unicode version numbers
- makedata.mak
- uchar.h
- com.ibm.icu.util.VersionInfo (see the sketch below)
- com.ibm.icu.dev.test.lang.UCharacterTest.VERSION_
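
  Illustrative only, not an upstream step: a minimal ICU4J sketch of the kind of
  version constant this touches, plus a quick sanity check once the new data is built.
  VersionInfo.getInstance() and UCharacter.getUnicodeVersion() are existing ICU4J API;
  the class name below is invented for the example.

    import com.ibm.icu.lang.UCharacter;
    import com.ibm.icu.util.VersionInfo;

    public class UnicodeVersionSketch {
        // shape of the born-@stable constant added to com.ibm.icu.util.VersionInfo
        public static final VersionInfo UNICODE_15_1 = VersionInfo.getInstance(15, 1, 0, 0);

        public static void main(String[] args) {
            // after regenerating the data files, the character properties
            // should report the new Unicode version
            System.out.println("UCD version in data: " + UCharacter.getUnicodeVersion());
            System.out.println("expected:            " + UNICODE_15_1);
        }
    }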

*** Configure: Build Unicode data for ICU4J
- Run ICU4C "configure" _after_ updating the Unicode version number in uchar.h
  so that the makefiles see the new version number.
    cd $ICU_ROOT/dbg/icu4c
    ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data ../../../doconfig-clang-dbg.sh

*** data files & enums & parser code

* download files
- same as for the early Unicode Tools setup and data refresh:
  https://github.com/unicode-org/unicodetools/blob/main/docs/index.md
  https://github.com/unicode-org/unicodetools/blob/main/docs/inputdata.md
- mkdir -p $UNICODE_DATA
- download Unicode files into $UNICODE_DATA
  + new since Unicode 15.1:
    for the pre-release (alpha, beta) data files,
    download all of https://www.unicode.org/Public/draft/
    (you can omit or discard the UCD/charts/ and UCD/ucdxml/ files/folders)
  + for final-release data files, the source of truth is the set of files in
    https://www.unicode.org/Public/(version) [=UCD],
    https://www.unicode.org/Public/UCA/(version),
    https://www.unicode.org/Public/idna/(version),
    etc.
  + use an FTP client; anonymous FTP from www.unicode.org at /Public/draft etc.
  + subfolders: emoji, idna, security, ucd, uca
  + whichever way you download the files:
    ~ inside ucd: extract Unihan.zip to "here" (.../ucd/Unihan/*.txt), delete Unihan.zip
    ~ split Unihan into single-property files
        ~/unitools/mine/src$ py/splitunihan.py $UNICODE_DATA/ucd/Unihan
  + alternate way of fetching files, if available:
    copy the files from a Unicode Tools workspace that is up to date with
    https://github.com/unicode-org/unicodetools
    and which might at this point be *ahead* of "Public"
    ~ before the Unicode release, copy files from "dev" subfolders, for example
      https://github.com/unicode-org/unicodetools/tree/main/unicodetools/data/ucd/dev
  + get GraphemeBreakTest-cldr.txt from $CLDR_SRC/common/properties/segments/GraphemeBreakTest.txt
    or from the UCD/cldr/ output folder of the Unicode Tools:
    From Unicode 12/CLDR 35/ICU 64 to Unicode 15.0/CLDR 43/ICU 73,
    CLDR used modified grapheme break rules.
      cp $CLDR_SRC/common/properties/segments/GraphemeBreakTest.txt icu4c/source/test/testdata
    or
      cp ~/unitools/mine/Generated/UCD/15.0.0/cldr/GraphemeBreakTest-cldr.txt icu4c/source/test/testdata/GraphemeBreakTest.txt
  + TODO: figure out whether we need a CLDR version of LineBreakTest.txt:
    unicodetools issue #492

* Note: Since Unicode 15.1, data files are no longer published with version suffixes,
  even during the alpha or beta.
  Thus we no longer need steps & tools to remove those suffixes.
  (remove this note next time)

* process and/or copy files
- $ICU_SRC/tools/unicode$ py/preparseucd.py $UNICODE_DATA $ICU_SRC
  This writes files (especially ppucd.txt) to the ICU4C unidata and testdata subfolders.
  For debugging, and for tweaking how ppucd.txt is written,
  the tool has an --only_ppucd option:
    py/preparseucd.py $UNICODE_DATA --only_ppucd path/to/ppucd/outputfile

- cp -v $UNICODE_DATA/security/confusables.txt $ICU4C_UNIDATA

* new constants for new property values
- preparseucd.py error:
    ValueError: missing uchar.h enum constants for some property values:
    [('blk', {'Nag_Mundari', 'CJK_Ext_H', 'Kawi', 'Kaktovik_Numerals', 'Devanagari_Ext_A', 'Arabic_Ext_C', 'Cyrillic_Ext_D'}), ('sc', {'Nagm', 'Kawi'})]
  = PropertyValueAliases.txt new property values (diff old & new .txt files)
    ~/unidata$ diff -u uni14/20210922/ucd/PropertyValueAliases.txt uni15/beta/ucd/PropertyValueAliases.txt | egrep '^[-+][a-zA-Z]'
    +age; 15.0                 ; V15_0
    +blk; Arabic_Ext_C         ; Arabic_Extended_C
    +blk; CJK_Ext_H            ; CJK_Unified_Ideographs_Extension_H
    +blk; Cyrillic_Ext_D       ; Cyrillic_Extended_D
    +blk; Devanagari_Ext_A     ; Devanagari_Extended_A
    +blk; Kaktovik_Numerals    ; Kaktovik_Numerals
    +blk; Kawi                 ; Kawi
    +blk; Nag_Mundari          ; Nag_Mundari
    +sc ; Kawi                 ; Kawi
    +sc ; Nagm                 ; Nag_Mundari
  -> add new blocks to uchar.h before UBLOCK_COUNT;
     use long property names for the enum constants;
     for the trailing comment, get the block start code point: diff old & new Blocks.txt
    ~/unidata$ diff -u uni14/20210922/ucd/Blocks.txt uni15/beta/ucd/Blocks.txt | egrep '^[-+][0-9A-Z]'
    +10EC0..10EFF; Arabic Extended-C
    +11B00..11B5F; Devanagari Extended-A
    +11F00..11F5F; Kawi
    -13430..1343F; Egyptian Hieroglyph Format Controls
    +13430..1345F; Egyptian Hieroglyph Format Controls
    +1D2C0..1D2DF; Kaktovik Numerals
    +1E030..1E08F; Cyrillic Extended-D
    +1E4D0..1E4FF; Nag Mundari
    +31350..323AF; CJK Unified Ideographs Extension H
    (ignore blocks whose end code point changed)
  -> add new blocks to UCharacter.UnicodeBlock IDs (see the sketch below)
     Eclipse find     UBLOCK_([^ ]+) = ([0-9]+), (/.+)
          replace     public static final int \1_ID = \2; \3
  -> add new blocks to UCharacter.UnicodeBlock objects
     Eclipse find     UBLOCK_([^ ]+) = [0-9]+, (/.+)
          replace     public static final UnicodeBlock \1 = new UnicodeBlock("\1", \1_ID); \2
  -> add new scripts to uscript.h & com.ibm.icu.lang.UScript
     Eclipse find     USCRIPT_([^ ]+) *= ([0-9]+),(/.+)
          replace     public static final int \1 = \2; \3
  -> for new scripts: fix expectedLong names in cintltst/cucdapi.c/TestUScriptCodeAPI()
     and in com.ibm.icu.dev.test.lang.TestUScript.java
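
  Illustrative only: the shape of the ICU4J constants that the Eclipse find/replace
  patterns above produce, plus a tiny lookup check. The numeric IDs are placeholders
  (real values continue the existing UBlockCode/UScriptCode sequences), and the class
  name is invented; UnicodeBlock.of() and UScript.getScript()/getName() are existing API.

    // In com.ibm.icu.lang.UCharacter.UnicodeBlock (IDs are placeholders):
    //   public static final int ARABIC_EXTENDED_C_ID = <next id>; /*[10EC0]*/
    //   public static final UnicodeBlock ARABIC_EXTENDED_C =
    //           new UnicodeBlock("ARABIC_EXTENDED_C", ARABIC_EXTENDED_C_ID); /*[10EC0]*/
    // In com.ibm.icu.lang.UScript:
    //   public static final int KAWI = <next code>; /* Kawi */

    import com.ibm.icu.lang.UCharacter;
    import com.ibm.icu.lang.UScript;

    public class NewBlocksSketch {
        public static void main(String[] args) {
            // with the new constants and the regenerated data in place,
            // lookups by code point resolve to the new blocks and scripts
            System.out.println(UCharacter.UnicodeBlock.of(0x10EC0));          // Arabic Extended-C
            System.out.println(UScript.getName(UScript.getScript(0x11F00)));  // Kawi
        }
    }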

* update Script metadata: SCRIPT_PROPS[] in uscript_props.cpp & UScript.ScriptMetadata
  (not strictly necessary for NOT_ENCODED scripts)
    $ICU_SRC/tools/unicode$ py/parsescriptmetadata.py $ICU_SRC/icu4c/source/common/unicode/uscript.h $CLDR_SRC/common/properties/scriptMetadata.txt

* build ICU
  to make sure that there are no syntax errors
    $ICU_ROOT/dbg/icu4c$ echo;echo; date; make -j7 tests &> out.txt ; tail -n 30 out.txt ; date

* update spoof checker UnicodeSet initializers:
    inclusionPat & recommendedPat in i18n/uspoof.cpp
    INCLUSION & RECOMMENDED in SpoofChecker.java
- make sure that the Unicode Tools tree contains the latest security data files
- go to Unicode Tools org.unicode.text.tools.RecommendedSetGenerator
- run the tool (no special environment variables needed)
- copy & paste from the Console output into the .cpp & .java files

* Bazel build process

See https://unicode-org.github.io/icu/processes/unicode-update#bazel-build-process
for an overview and for setup instructions.

Consider running `bazelisk --version` outside of the $ICU_SRC folder
to find out the latest `bazel` version, and
copying that version number into the $ICU_SRC/.bazeliskrc config file.
(Revert if you find incompatibilities, or, better, update our build & config files.)

* generate data files

- remember to define the environment variables
  (see the start of the section for this Unicode version)
- cd $ICU_SRC
- optional but not necessary:
    bazelisk clean
  or even
    bazelisk clean --expunge
- build/bootstrap/generate new files:
    icu4c/source/data/unidata/generate.sh

* update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to
  sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar)
- grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" on non-ASCII characters
    ~/unitools/mine/src$ grep disallowed_STD3_valid unicodetools/data/idna/dev/IdnaMappingTable.txt
- Unicode 6.0..15.0: U+2260, U+226E, U+226F
- TODO: Since Unicode 15.1, the UTS #46 data derivation no longer looks at the decompositions (NFD).
  These characters are now just valid, no longer disallowed_STD3_valid.
  (the behavior these tests cover is illustrated below)
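
  Illustrative only: a hedged ICU4J snippet showing the UTS #46 behavior that
  uts46test.cpp/UTS46Test.java exercise for these characters. IDNA.getUTS46Instance(),
  IDNA.USE_STD3_RULES and IDNA.Info are existing ICU4J API; the class name is invented.

    import com.ibm.icu.text.IDNA;

    public class Uts46Std3Sketch {
        public static void main(String[] args) {
            IDNA std3 = IDNA.getUTS46Instance(IDNA.USE_STD3_RULES);
            IDNA.Info info = new IDNA.Info();
            // U+2260 decomposes to '=' + U+0338; through Unicode 15.0 it was
            // disallowed_STD3_valid, so STD3 processing reported an error here.
            std3.nameToASCII("a\u2260b.example", new StringBuilder(), info);
            System.out.println("errors: " + info.getErrors());
        }
    }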

* run & fix ICU4C tests
- Note: Some of the collation data and test data will be updated below,
  so at this time we might get some collation test failures.
  Ignore these for now.
- fix Unicode Tools class Segmenter to generate correct *BreakTest.txt files
- update CLDR GraphemeBreakTest.txt (a small grapheme-cluster sanity check is sketched below)
    cd ~/unitools/mine/Generated
    cp UCD/15.1.0/cldr/GraphemeBreakTest-cldr.txt $CLDR_SRC/common/properties/segments/GraphemeBreakTest.txt
    cp UCD/15.1.0/cldr/GraphemeBreakTest-cldr.html $CLDR_SRC/common/properties/segments/GraphemeBreakTest.html
    cp $CLDR_SRC/common/properties/segments/GraphemeBreakTest.txt $ICU_SRC/icu4c/source/test/testdata
- Robin or Andy helps with RBBI & spoof check test failures
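
  Illustrative only: a quick ICU4J check of grapheme cluster segmentation after the
  break data has been rebuilt; it is not part of the upstream test suite, and the
  class name is invented. BreakIterator.getCharacterInstance() is existing API.

    import com.ibm.icu.text.BreakIterator;
    import com.ibm.icu.util.ULocale;

    public class GraphemeClusterSketch {
        public static void main(String[] args) {
            // a regional-indicator pair (flag) followed by e + combining acute:
            // both rule sets segment this into two grapheme clusters
            String s = "\uD83C\uDDE9\uD83C\uDDEA" + "e\u0301";
            BreakIterator bi = BreakIterator.getCharacterInstance(ULocale.ROOT);
            bi.setText(s);
            int count = 0;
            for (int end = bi.next(); end != BreakIterator.DONE; end = bi.next()) {
                count++;
            }
            System.out.println("grapheme clusters: " + count);  // expect 2
        }
    }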

* collation: CLDR collation root, UCA DUCET

- UCA DUCET goes into Mark's Unicode Tools,
  and a tool-tailored version goes into CLDR, see
  https://github.com/unicode-org/unicodetools/blob/main/docs/uca/index.md

- update source/data/unidata/FractionalUCA.txt with FractionalUCA_SHORT.txt
    cp -v $CLDR_SRC/common/uca/FractionalUCA_SHORT.txt $ICU4C_UNIDATA/FractionalUCA.txt
- update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt
  (note that the underscore before "Rules" is dropped in the target file name)
    cp -v $ICU4C_UNIDATA/UCARules.txt /tmp/UCARules-old.txt
    cp -v $CLDR_SRC/common/uca/UCA_Rules_SHORT.txt $ICU4C_UNIDATA/UCARules.txt
- restore TODO diffs in UCARules.txt
    meld /tmp/UCARules-old.txt $ICU4C_UNIDATA/UCARules.txt
- update (ICU4C)/source/test/testdata/CollationTest_*.txt
  and (ICU4J)/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_*.txt
  from the CLDR root files (..._CLDR_..._SHORT.txt)
    cp -v $CLDR_SRC/common/uca/CollationTest_CLDR_NON_IGNORABLE_SHORT.txt $ICU_SRC/icu4c/source/test/testdata/CollationTest_NON_IGNORABLE_SHORT.txt
    cp -v $CLDR_SRC/common/uca/CollationTest_CLDR_SHIFTED_SHORT.txt $ICU_SRC/icu4c/source/test/testdata/CollationTest_SHIFTED_SHORT.txt
    cp -v $ICU_SRC/icu4c/source/test/testdata/CollationTest_*.txt $ICU_SRC/icu4j/main/tests/collate/src/com/ibm/icu/dev/data
- if CLDR common/uca/unihan-index.txt changes, then update
  CLDR common/collation/root.xml
  and regenerate (or update in parallel) $ICU_SRC/icu4c/source/data/coll/root.txt

- generate data files, as above (generate.sh), now to pick up the new collation data
- update CollationFCD.java:
  copy & paste the initializers of lcccIndex[] etc. from
  ICU4C/source/i18n/collationfcd.cpp to
  ICU4J/main/classes/collate/src/com/ibm/icu/impl/coll/CollationFCD.java
- rebuild ICU4C (make clean, make check, as usual)

* Unihan collators
  https://github.com/unicode-org/unicodetools/blob/main/docs/unihan.md
- run Unicode Tools GenerateUnihanCollators & GenerateUnihanCollatorFiles;
  check CLDR diffs, copy to CLDR, test CLDR, ... as documented there
- generate ICU zh collation data
  instructions inspired by
  https://github.com/unicode-org/icu/blob/main/tools/cldr/cldr-to-icu/README.txt and
  https://github.com/unicode-org/icu/blob/main/icu4c/source/data/cldr-icu-readme.txt
  + setup:
      export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
        (didn't work without setting JAVA_HOME,
        nor with the Google default of /usr/local/buildtools/java/jdk
        [Google security limitations in the XML parser])
      export TOOLS_ROOT=~/icu/uni/src/tools
      export CLDR_DIR=~/cldr/uni/src
      export CLDR_DATA_DIR=~/cldr/uni/src
        (pointing to the "raw" data, not to cldr-staging/.../production, should be ok for the relevant files)
      cd "$TOOLS_ROOT/cldr/lib"
      ./install-cldr-jars.sh "$CLDR_DIR"
  + generate the files we need
      cd "$TOOLS_ROOT/cldr/cldr-to-icu"
      ant -f build-icu-data.xml -DoutDir=/tmp/icu -DoutputTypes=coll,transforms -DlocaleIdFilter='zh.*'
  + diff
      cd $ICU_SRC
      meld icu4c/source/data/coll/zh.txt /tmp/icu/coll/zh.txt
      meld icu4c/source/data/translit/Hani_Latn.txt /tmp/icu/translit/Hani_Latn.txt
  + copy into the source tree
      cd $ICU_SRC
      cp /tmp/icu/coll/zh.txt icu4c/source/data/coll/zh.txt
      cp /tmp/icu/translit/Hani_Latn.txt icu4c/source/data/translit/Hani_Latn.txt
- rebuild ICU4C

* run & fix ICU4C tests, now with new CLDR collation root data
- run all tests with the collation test data *_SHORT.txt or the full files
  (the full ones have comments, useful for debugging)
  (a quick root-collator sanity check is sketched below)
- note on intltest: if collate/UCAConformanceTest fails, then
  utility/MultithreadTest/TestCollators will fail as well;
  fix the conformance test before looking into the multi-thread test
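
  Illustrative only: a manual ICU4J sanity check of the refreshed root collation,
  assuming the rebuilt data is on the class path. Collator.getInstance(ULocale.ROOT)
  is existing API; the class name is invented.

    import com.ibm.icu.text.Collator;
    import com.ibm.icu.util.ULocale;

    public class RootCollationSanityCheck {
        public static void main(String[] args) {
            // the root collation orders these as cote < coté < côte < côté
            Collator root = Collator.getInstance(ULocale.ROOT);
            String[] words = { "cote", "cot\u00E9", "c\u00F4te", "c\u00F4t\u00E9" };
            for (int i = 1; i < words.length; i++) {
                // expect a negative value for each adjacent pair
                System.out.println(words[i - 1] + " < " + words[i] + " ? "
                        + root.compare(words[i - 1], words[i]));
            }
        }
    }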

* update Java data files
- refresh just the UCD/UCA-related/derived files, just to be safe
- see (ICU4C)/source/data/icu4j-readme.txt
- mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT
- $ICU_ROOT/dbg/icu4c$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install
  NOTE: If you get the error "No rule to make target 'out/build/icudt70l/uprops.icu'",
  you need to reconfigure with unicore data; see the "configure" line above.
  output:
    ...
    make[1]: Entering directory '/usr/local/google/home/mscherer/icu/uni/dbg/icu4c/data'
    mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt73b
    mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt73b
    LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt73l.dat ./out/icu4j/icudt73b.dat -s ./out/build/icudt73l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt73b
    mv ./out/icu4j/"com/ibm/icu/impl/data/icudt73b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt73b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt73b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt73b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt73b"
    jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt73b/
    mkdir -p /tmp/icu4j/main/shared/data
    cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data
    jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt73b/
    mkdir -p /tmp/icu4j/main/shared/data
    cp ./out/icu4j/icutzdata.jar /tmp/icu4j/main/shared/data
    make[1]: Leaving directory '/usr/local/google/home/mscherer/icu/uni/dbg/icu4c/data'
- copy the big-endian Unicode data files to another location,
  separate from the other data files,
  and then refresh ICU4J
    cd $ICU_ROOT/dbg/icu4c/data/out/icu4j
    mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT/coll
    mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT/brkitr
    cp -v com/ibm/icu/impl/data/$ICUDT/confusables.cfu /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT
    cp -v com/ibm/icu/impl/data/$ICUDT/*.icu /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT
    rm /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT/cnvalias.icu
    cp -v com/ibm/icu/impl/data/$ICUDT/*.nrm /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT
    cp -v com/ibm/icu/impl/data/$ICUDT/coll/* /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT/coll
    cp -v com/ibm/icu/impl/data/$ICUDT/brkitr/* /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT/brkitr
    jar uvf $ICU_SRC/icu4j/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/$ICUDT
- new since ICU 73: also copy the binary data files directly into the ICU4J tree
    cp -v com/ibm/icu/impl/data/$ICUDT/coll/* $ICU_SRC/icu4j/maven-build/maven-icu4j-datafiles/src/main/resources/com/ibm/icu/impl/data/$ICUDT/coll

* When refreshing all of ICU4J data from ICU4C
- $ICU_ROOT/dbg/icu4c$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install
- cp /tmp/icu4j/main/shared/data/icudata.jar $ICU_SRC/icu4j/main/shared/data
or
- $ICU_ROOT/dbg/icu4c$ make ICU4J_ROOT=$ICU_SRC/icu4j icu4j-data-install

* refresh Java test .txt files
- copy new .txt files into ICU4J's main/tests/core/src/com/ibm/icu/dev/data/unicode
    cd $ICU_SRC/icu4c/source/data/unidata
    cp -v confusables.txt confusablesWholeScript.txt NormalizationCorrections.txt NormalizationTest.txt SpecialCasing.txt UnicodeData.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode
    cd ../../test/testdata
    cp -v BidiCharacterTest.txt BidiTest.txt IdnaTestV2.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode
    cp -v $UNICODE_DATA/ucd/CompositionExclusions.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode

* run & fix ICU4J tests

*** API additions
- send notice to icu-design about new born-@stable API (enum constants etc.)

*** CLDR numbering systems
- look for new sets of decimal digits (gc=Nd & nv=4) and add them to CLDR
  (an equivalent ICU4J scan is sketched below)
  for example:
    ~/icu/mine/src$ egrep ';gc=Nd.+;nv=4' icu4c/source/data/unidata/ppucd.txt > /tmp/icu/nv4-15.txt
    ~/icu/uni/src$ egrep ';gc=Nd.+;nv=4' icu4c/source/data/unidata/ppucd.txt > /tmp/icu/nv4-15.1.txt
    ~/icu/uni/src$ diff -u /tmp/icu/nv4-15.txt /tmp/icu/nv4-15.1.txt
    -->
    (empty this time)
  or:
    ~/unitools/mine/src$ diff -u unicodetools/data/ucd/15.0.0/extracted/DerivedGeneralCategory.txt unicodetools/data/ucd/dev/extracted/DerivedGeneralCategory.txt | grep '; Nd' | egrep '^\+'
    -->
    (empty this time)
  Unicode 15.1:
    (none this time)
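
  Illustrative only: the same gc=Nd & nv=4 scan expressed against the ICU4J API once
  the new data is in place. The class name is invented; UCharacter.getType(),
  UCharacterCategory.DECIMAL_DIGIT_NUMBER and UCharacter.getNumericValue() are existing API.

    import com.ibm.icu.lang.UCharacter;
    import com.ibm.icu.lang.UCharacterCategory;

    public class NewDigitSetsSketch {
        public static void main(String[] args) {
            // list code points with gc=Nd and numeric value 4;
            // each hit identifies a candidate decimal-digit set for CLDR
            for (int cp = 0; cp <= 0x10FFFF; ++cp) {
                if (UCharacter.getType(cp) == UCharacterCategory.DECIMAL_DIGIT_NUMBER
                        && UCharacter.getNumericValue(cp) == 4) {
                    System.out.printf("U+%04X  %s%n", cp, UCharacter.getName(cp));
                }
            }
        }
    }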

*** merge the Unicode update branch back onto the main branch
- do not merge the icudata.jar and testdata.jar files;
  instead, rebuild them from the merged & tested ICU4C
- if there is a merge conflict in icudata.jar, here is one way to deal with it:
  + remove icudata.jar from the commit so that rebasing is trivial
  + ~/icu/uni/src$ git restore --source=main icu4j/main/shared/data/icudata.jar
  + ~/icu/uni/src$ git commit -a --amend
  + switch to main, pull updates, switch back to the dev branch
  + ~/icu/uni/src$ git rebase main
  + rebuild icudata.jar
  + ~/icu/uni/src$ git commit -a --amend
  + ~/icu/uni/src$ git push -f
- make sure that changes to the Unicode Tools are checked in:
  https://github.com/unicode-org/unicodetools

---------------------------------------------------------------------------- ***

CLDR 43 root collation update for ICU 73

Partial update only for the root collation.