diff --git a/.github/lstm_for_th_my.json b/.github/lstm_for_th_my.json new file mode 100644 index 00000000000..75d2f4ede53 --- /dev/null +++ b/.github/lstm_for_th_my.json @@ -0,0 +1,21 @@ +// © 2021 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml +// +// Remove the Burmese and Thai dictionaries and replace them with LSTM models. +{ + "featureFilters": { + "brkitr_dictionaries": { + "excludelist": [ + "burmesedict", + "thaidict" + ] + }, + "brkitr_lstm": { + "includelist": [ + "Thai_graphclust_model4_heavy", + "Burmese_graphclust_model5_heavy" + ] + } + } +} diff --git a/.github/workflows/icu_ci.yml b/.github/workflows/icu_ci.yml index a84f0372e8b..17e0daa292a 100644 --- a/.github/workflows/icu_ci.yml +++ b/.github/workflows/icu_ci.yml @@ -216,6 +216,17 @@ jobs: make clean; make -j2 check + # Test LSTM + lstm-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - run: | + cd icu4c/source; + ICU_DATA_FILTER_FILE=../../.github/lstm_for_th_my.json ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex; + make clean; + make -j2 check + # Build and run testmap testmap: runs-on: ubuntu-latest diff --git a/icu4c/source/common/lstmbe.cpp b/icu4c/source/common/lstmbe.cpp index b051936fbe8..be7eee624e7 100644 --- a/icu4c/source/common/lstmbe.cpp +++ b/icu4c/source/common/lstmbe.cpp @@ -614,10 +614,11 @@ static const int32_t MIN_WORD = 2; static const int32_t MIN_WORD_SPAN = MIN_WORD * 2; int32_t -LSTMBreakEngine::divideUpDictionaryRange( UText *text, - int32_t startPos, - int32_t endPos, - UVector32 &foundBreaks ) const { +LSTMBreakEngine::divideUpDictionaryRange(UText *text, + int32_t startPos, + int32_t endPos, + UVector32 &foundBreaks) const { + int32_t beginFoundBreakSize = foundBreaks.size(); utext_setNativeIndex(text, startPos); utext_moveIndex32(text, MIN_WORD_SPAN); if (utext_getNativeIndex(text) >= endPos) { @@ -704,7 
+705,7 @@ LSTMBreakEngine::divideUpDictionaryRange( UText *text, } } } - return foundBreaks.size(); + return foundBreaks.size() - beginFoundBreakSize; } Vectorizer* createVectorizer(const LSTMData* data, UErrorCode &status) { @@ -809,6 +810,10 @@ U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data) delete data; } +U_CAPI const UChar* U_EXPORT2 LSTMDataName(const LSTMData* data) +{ + return data->fName; +} U_NAMESPACE_END diff --git a/icu4c/source/common/lstmbe.h b/icu4c/source/common/lstmbe.h index 8a1acda53d8..38c75d6db98 100644 --- a/icu4c/source/common/lstmbe.h +++ b/icu4c/source/common/lstmbe.h @@ -76,6 +76,7 @@ U_CAPI const LSTMData* U_EXPORT2 CreateLSTMDataForScript( UScriptCode script, UErrorCode& status); U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data); +U_CAPI const UChar* U_EXPORT2 LSTMDataName(const LSTMData* data); U_NAMESPACE_END diff --git a/icu4c/source/test/cintltst/udatatst.c b/icu4c/source/test/cintltst/udatatst.c index 5c57d5aa437..a52078d0477 100644 --- a/icu4c/source/test/cintltst/udatatst.c +++ b/icu4c/source/test/cintltst/udatatst.c @@ -1353,7 +1353,7 @@ static const struct { #if !UCONFIG_NO_BREAK_ITERATION {"char", "brk", ubrk_swap}, - {"thaidict", "dict",udict_swap}, + {"laodict", "dict",udict_swap}, #endif #if 0 diff --git a/icu4c/source/test/intltest/intltest.cpp b/icu4c/source/test/intltest/intltest.cpp index aafe7f8ed96..02521972fc6 100644 --- a/icu4c/source/test/intltest/intltest.cpp +++ b/icu4c/source/test/intltest/intltest.cpp @@ -39,6 +39,7 @@ #include "cmemory.h" #include "cstring.h" #include "itmajor.h" +#include "lstmbe.h" #include "mutex.h" #include "putilimp.h" // for uprv_getRawUTCtime() #include "uassert.h" @@ -2428,6 +2429,27 @@ cleanUpAndReturn: return retPtr; } +#if !UCONFIG_NO_BREAK_ITERATION +UBool LSTMDataIsBuilt() { + // If we can find the LSTM data, the RBBI will use the LSTM engine. + // So we skip the test which depending on the dictionary data. 
+ UErrorCode status = U_ZERO_ERROR; + DeleteLSTMData(CreateLSTMDataForScript(USCRIPT_THAI, status)); + UBool thaiDataIsBuilt = U_SUCCESS(status); + status = U_ZERO_ERROR; + DeleteLSTMData(CreateLSTMDataForScript(USCRIPT_MYANMAR, status)); + UBool burmeseDataIsBuilt = U_SUCCESS(status); + return thaiDataIsBuilt | burmeseDataIsBuilt; +} + +UBool IntlTest::skipLSTMTest() { + return ! LSTMDataIsBuilt(); +} +UBool IntlTest::skipDictionaryTest() { + return LSTMDataIsBuilt(); +} +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ + /* * Hey, Emacs, please set the following: * diff --git a/icu4c/source/test/intltest/intltest.h b/icu4c/source/test/intltest/intltest.h index af06cd3cd9a..4c485fe8d1a 100644 --- a/icu4c/source/test/intltest/intltest.h +++ b/icu4c/source/test/intltest/intltest.h @@ -192,6 +192,11 @@ public: */ UBool logKnownIssue( const char *ticket, const char *fmt, ...); +#if !UCONFIG_NO_BREAK_ITERATION + UBool skipDictionaryTest(); + UBool skipLSTMTest(); +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ + virtual void info( const UnicodeString &message ); virtual void infoln( const UnicodeString &message ); diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index b0a5773646a..f2c60f047a9 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -14,6 +14,7 @@ #include "unicode/utypes.h" #if !UCONFIG_NO_BREAK_ITERATION +#include #include #include #include @@ -42,6 +43,7 @@ #include "cmemory.h" #include "cstr.h" #include "intltest.h" +#include "lstmbe.h" #include "rbbitst.h" #include "rbbidata.h" #include "utypeinfo.h" // for 'typeid' to work @@ -135,6 +137,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha TESTCASE_AUTO(TestTable_8_16_Bits); TESTCASE_AUTO(TestBug13590); TESTCASE_AUTO(TestUnpairedSurrogate); + TESTCASE_AUTO(TestLSTMThai); + TESTCASE_AUTO(TestLSTMBurmese); #if U_ENABLE_TRACING TESTCASE_AUTO(TestTraceCreateCharacter); @@ -715,8 +719,12 @@ 
void RBBITest::executeTest(TestParams *t, UErrorCode &status) { } } - void RBBITest::TestExtended() { + // The expectations in this test heavily depends on the Thai dictionary. + // Therefore, we skip this test under the LSTM configuration. + if (skipDictionaryTest()) { + return; + } // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This // data driven test closely entangles filtered and regular data. #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION @@ -1126,7 +1134,6 @@ end_test: #endif } - //------------------------------------------------------------------------------- // // TestDictRules create a break iterator from source rules that includes a @@ -5243,4 +5250,158 @@ void RBBITest::TestUnpairedSurrogate() { assertEquals(WHERE, rules, rtRules); } +// Read file generated by +// https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py +// as test cases and compare the Output. +// Format of the file +// Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')] +// Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')] +// Input:\t[source text] +// Output:\t[expected output separated by | ] +// Input: ... +// Output: ... + +void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) { + // The expectation in this test depends on LSTM, skip the test if the + // configuration is not build with LSTM data. + if (skipLSTMTest()) { + return; + } + UErrorCode status = U_ZERO_ERROR; + LocalPointer iterator(BreakIterator::createWordInstance(Locale(), status)); + if (U_FAILURE(status)) { + errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status)); + return; + } + // Open and read the test data file. 
+ const char *testDataDirectory = IntlTest::getSourceTestData(status); + CharString testFileName(testDataDirectory, -1, status); + testFileName.append(filename, -1, status); + + int len; + UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status); + if (U_FAILURE(status)) { + errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename); + return; + } + + // Put the test data into a UnicodeString + UnicodeString testString(FALSE, testFile, len); + + int32_t start = 0; + + UnicodeString line; + int32_t end; + std::string actual_sep_str; + int32_t caseNum = 0; + // Iterate through all the lines in the test file. + do { + int32_t cr = testString.indexOf(u'\r', start); + int32_t lf = testString.indexOf(u'\n', start); + end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf; + line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start); + if (line.length() > 0) { + // Separate each line to key and value by TAB. + int32_t tab = line.indexOf(u'\t'); + UnicodeString key = line.tempSubString(0, tab); + const UnicodeString value = line.tempSubString(tab+1); + + if (key == "Model:") { + // Verify the expectation in the test file match the LSTM model + // we are using now. 
+ const LSTMData* data = CreateLSTMDataForScript(script, status); + if (U_FAILURE(status)) { + dataerrln("%s:%d Error %s Cannot create LSTM data for script %s", + __FILE__, __LINE__, u_errorName(status), uscript_getName(script)); + return; + } + UnicodeString name(LSTMDataName(data)); + DeleteLSTMData(data); + if (value != name) { + std::string utf8Name, utf8Value; + dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s", + __FILE__, __LINE__, u_errorName(status), uscript_getName(script), + name.toUTF8String(utf8Name).c_str(), + value.toUTF8String(utf8Value).c_str()); + return; + } + } else if (key == "Input:") { + UnicodeString input("prefix "); + input += value + " suffix"; + std::stringstream ss; + + // Construct the UText which is expected by the the engine as + // input from the UnicodeString. + UText ut = UTEXT_INITIALIZER; + utext_openConstUnicodeString(&ut, &input, &status); + if (U_FAILURE(status)) { + dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status))); + return; + } + + iterator->setText(&ut, status); + if (U_FAILURE(status)) { + errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status)); + return; + } + + int32_t bp; + for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) { + ss << bp; + if (bp != input.length()) { + ss << ", "; + } + } + + utext_close(&ut); + // Turn the break points into a string for easy comparions + // output. + actual_sep_str = "{" + ss.str() + "}"; + } else if (key == "Output:" && !actual_sep_str.empty()) { + UnicodeString input("prefix| |"); + input += value + "| |suffix"; + std::string d; + int32_t sep; + int32_t start = 0; + int32_t curr = 0; + std::stringstream ss; + // Incude 0 as the break point. 
+ ss << "0, "; + while ((sep = input.indexOf(u'|', start)) >= 0) { + int32_t len = sep - start; + if (len > 0) { + if (curr > 0) { + ss << ", "; + } + curr += len; + ss << curr; + } + start = sep + 1; + } + // Include end of the string as break point. + ss << ", " << curr + input.length() - start; + // Turn the break points into a string for easy comparions + // output. + std::string expected = "{" + ss.str() + "}"; + std::string utf8; + + assertEquals((input + " Test Case#" + caseNum).toUTF8String(utf8).c_str(), + expected.c_str(), actual_sep_str.c_str()); + actual_sep_str.clear(); + } + } + start = std::max(cr, lf) + 1; + } while (end >= 0); + + delete [] testFile; +} + +void RBBITest::TestLSTMThai() { + runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI); +} + +void RBBITest::TestLSTMBurmese() { + runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR); +} + #endif // #if !UCONFIG_NO_BREAK_ITERATION diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index 821e2a41755..7fa14e6a49f 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -22,6 +22,7 @@ #include "intltest.h" #include "unicode/brkiter.h" #include "unicode/rbbi.h" +#include "unicode/uscript.h" class Enumeration; class BITestData; @@ -92,6 +93,8 @@ public: void Test16BitsTrieWith16BitStateTable(); void TestTable_8_16_Bits(); void TestBug13590(); + void TestLSTMThai(); + void TestLSTMBurmese(); #if U_ENABLE_TRACING void TestTraceCreateCharacter(); @@ -117,6 +120,9 @@ private: // Run one of the Unicode Consortium boundary test data files. void runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi); + // Run tests from one of the LSTM test files. + void runLSTMTestFromFile(const char* filename, UScriptCode script); + // Run a single test case from one of the Unicode Consortium test files. 
void checkUnicodeTestCase(const char *testFileName, int lineNumber, const UnicodeString &testString, diff --git a/icu4c/source/test/intltest/transtst.cpp b/icu4c/source/test/intltest/transtst.cpp index 8e7bcb09174..d446b20a271 100644 --- a/icu4c/source/test/intltest/transtst.cpp +++ b/icu4c/source/test/intltest/transtst.cpp @@ -4678,6 +4678,11 @@ void TransliteratorTest::TestHalfwidthFullwidth(void) { */ void TransliteratorTest::TestThai(void) { #if !UCONFIG_NO_BREAK_ITERATION + // The expectations in this test heavily depend on the Thai dictionary. + // Therefore, we skip this test under the LSTM configuration. + if (skipDictionaryTest()) { + return; + } UParseError parseError; UErrorCode status = U_ZERO_ERROR; Transliterator* tr = Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);