From 4959b9b3a33f2ac0410edf9f3127bd7042c5b1a3 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 8 Feb 2018 21:17:18 +0000 Subject: [PATCH] ICU-13569 rbbi table compression, work in progress. X-SVN-Rev: 40873 --- icu4c/source/common/rbbirb.cpp | 10 ++++++++-- icu4c/source/common/rbbitblb.cpp | 2 +- icu4c/source/test/intltest/rbbitst.cpp | 19 +++++++++++++++++-- icu4c/source/test/testdata/rbbitst.txt | 22 ++++++++++------------ 4 files changed, 36 insertions(+), 17 deletions(-) diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp index 817a955a969..3becda31278 100644 --- a/icu4c/source/common/rbbirb.cpp +++ b/icu4c/source/common/rbbirb.cpp @@ -357,11 +357,17 @@ void RBBIRuleBuilder::optimizeTables() { int32_t leftClass; int32_t rightClass; - leftClass = 1; - rightClass = 2; + leftClass = 3; + rightClass = 4; + printf("Optimizing tables ...\n"); while (fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) { + printf("Merging duplicate columns (%d, %d)\n", leftClass, rightClass); + fSetBuilder->mergeCategories(leftClass, rightClass); fForwardTables->removeColumn(rightClass); + fReverseTables->removeColumn(rightClass); + fSafeFwdTables->removeColumn(rightClass); + fSafeRevTables->removeColumn(rightClass); } diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index a8bc0486199..b71921a9dee 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -1090,7 +1090,7 @@ bool RBBITableBuilder::findDuplCharClassFrom(int32_t &baseCategory, int32_t &dup uint16_t table_base; uint16_t table_dupl; for (; baseCategory < numCols-1; ++baseCategory) { - for (; duplCategory < numCols; ++duplCategory) { + for (duplCategory=baseCategory+1; duplCategory < numCols; ++duplCategory) { for (int32_t state=0; state<numStates; state++) { RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); table_base = (uint16_t)sd->fDtran->elementAti(baseCategory); diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 
2565ef4f61c..c6e0f457fbe 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -4461,9 +4461,24 @@ void RBBITest::TestBug12677() { void RBBITest::TestTableRedundancies() { UErrorCode status = U_ZERO_ERROR; + + UnicodeString rules {u"$s0=[;,*]; \n" + "$s1=[a-z]; \n" + "$s2=[i-n]; \n" + "$s3=[x-z]; \n" + "!!forward; \n" + "($s0 | '?')*; \n" + "($s1 | $s2 | $s3)*; \n" }; + RuleBasedBreakIterator *lbi = + (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); + rules = lbi->getRules(); + UParseError pe {}; RuleBasedBreakIterator *bi = - (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); - // bi->dumpTables(); + // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); + new RuleBasedBreakIterator(rules, pe, status); + assertSuccess(WHERE, status); + if (U_FAILURE(status)) return; + bi->dumpTables(); RBBIDataWrapper *dw = bi->fData; const RBBIStateTable *fwtbl = dw->fForwardTable; diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 1450a98d7be..0d4c1633b62 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -39,18 +39,16 @@ # Temp debugging tests - -<0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\ -コンピューター<400>は<400>、<0>文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>\ -よう<400>にし<400>ます<400>。<0>ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、<0>これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>\ -何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。<0>どの<400>一つ<400>を<400>とっても<400>、<0>十分<400>な<400>文字<400>を<400>含<400>\ -んで<400>は<400>いま<400>せん<400>で<400>した<400>。<0>例えば<400>、<0>欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、<0>その<400>\ 
-すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、<0>いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>\ -が<400>必要<400>で<400>した<400>。<0>英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、<0>一つ<400>だけ<400>\ -の<400>符号<400>化<400>の<400>仕組み<400>では<400>、<0>一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、<0>句読点<400>、<0>\ -。<0> - -#<0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\ + +$s0=[;,*]; +$s1=[a-z]; +$s2=[i-n]; +$s3=[x-z]; +!!forward; +($s0 | '?')* +($s1 | $s2 | $s3)*; + +•hello• • ## FILTERED BREAK TESTS