ICU-21569 Add GA to test LSTM configuration

1. Add a GitHub Actions (GA) job to test BreakIterator under the LSTM configuration (remove the Thai and Burmese dictionaries and include the Thai and Burmese LSTM models) 2. Add LSTMDataName for the purpose of testing. 3. Add file-based test code to check that BreakIterator matches the results in a test file generated by the Python code in https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py 4. Fix a LSTMBreakEngine::divideUpDictionaryRange bug: the return value should contain only the number of words found, even when the passed-in foundBreaks already contains some data. 5. Change the cintltest TestSwapData from testing thaidict to laodict so it will not break while we filter out thaidict under the LSTM configuration.
2025-04-05 05:25:34 +00:00 · 2021-04-29 00:07:45 -07:00 · 2021-04-29 00:07:45 -07:00 · 9a2177c575
commit 9a2177c575
parent 253c54ab81
10 changed files with 245 additions and 8 deletions
--- a/.github/lstm_for_th_my.json
+++ b/.github/lstm_for_th_my.json
@ -0,0 +1,21 @@
+// © 2021 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
+//
+// Remove the Burmese and Thai dictionaries and replace them with LSTM models.
+{
+  "featureFilters": {
+    "brkitr_dictionaries": {
+      "excludelist": [
+        "burmesedict",
+        "thaidict"
+      ]
+    },
+    "brkitr_lstm": {
+      "includelist": [
+	"Thai_graphclust_model4_heavy",
+	"Burmese_graphclust_model5_heavy"
+      ]
+    }
+  }
+}
--- a/.github/workflows/icu_ci.yml
+++ b/.github/workflows/icu_ci.yml
@ -216,6 +216,17 @@ jobs:
          make clean;
          make -j2 check

+  # Test LSTM
+  lstm-test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - run: |
+          cd icu4c/source;
+          ICU_DATA_FILTER_FILE=../../.github/lstm_for_th_my.json ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex;
+          make clean;
+          make -j2 check
+
  # Build and run testmap
  testmap:
    runs-on: ubuntu-latest
--- a/icu4c/source/common/lstmbe.cpp
+++ b/icu4c/source/common/lstmbe.cpp
@ -614,10 +614,11 @@ static const int32_t MIN_WORD = 2;
 static const int32_t MIN_WORD_SPAN = MIN_WORD * 2;

 int32_t
-LSTMBreakEngine::divideUpDictionaryRange( UText *text,
-                                                int32_t startPos,
-                                                int32_t endPos,
-                                                UVector32 &foundBreaks ) const {
+LSTMBreakEngine::divideUpDictionaryRange(UText *text,
+                                         int32_t startPos,
+                                         int32_t endPos,
+                                         UVector32 &foundBreaks) const {
+    int32_t beginFoundBreakSize = foundBreaks.size();
    utext_setNativeIndex(text, startPos);
    utext_moveIndex32(text, MIN_WORD_SPAN);
    if (utext_getNativeIndex(text) >= endPos) {
@ -704,7 +705,7 @@ LSTMBreakEngine::divideUpDictionaryRange( UText *text,
            }
        }
    }
-    return foundBreaks.size();
+    return foundBreaks.size() - beginFoundBreakSize;
 }

 Vectorizer* createVectorizer(const LSTMData* data, UErrorCode &status) {
@ -809,6 +810,10 @@ U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data)
    delete data;
 }

+U_CAPI const UChar* U_EXPORT2 LSTMDataName(const LSTMData* data)
+{
+    return data->fName;  // Hands back the stored name pointer; data is dereferenced unchecked.
+}

 U_NAMESPACE_END

--- a/icu4c/source/common/lstmbe.h
+++ b/icu4c/source/common/lstmbe.h
@ -76,6 +76,7 @@ U_CAPI const LSTMData* U_EXPORT2 CreateLSTMDataForScript(
    UScriptCode script, UErrorCode& status);

 U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data);
+U_CAPI const UChar* U_EXPORT2 LSTMDataName(const LSTMData* data);

 U_NAMESPACE_END

--- a/icu4c/source/test/cintltst/udatatst.c
+++ b/icu4c/source/test/cintltst/udatatst.c
@ -1353,7 +1353,7 @@ static const struct {

 #if !UCONFIG_NO_BREAK_ITERATION
    {"char",                     "brk", ubrk_swap},
-    {"thaidict",                 "dict",udict_swap},
+    {"laodict",                  "dict",udict_swap},
 #endif

 #if 0
--- a/icu4c/source/test/intltest/intltest.cpp
+++ b/icu4c/source/test/intltest/intltest.cpp
@ -39,6 +39,7 @@
 #include "cmemory.h"
 #include "cstring.h"
 #include "itmajor.h"
+#include "lstmbe.h"
 #include "mutex.h"
 #include "putilimp.h" // for uprv_getRawUTCtime()
 #include "uassert.h"
@ -2428,6 +2429,27 @@ cleanUpAndReturn:
    return retPtr;
 }

+#if !UCONFIG_NO_BREAK_ITERATION
+UBool LSTMDataIsBuilt() {
+  // Reports whether LSTM data can be loaded; when it can, the RBBI uses the
+  // LSTM engine, so tests which depend on the dictionary data are skipped.
+  UErrorCode status = U_ZERO_ERROR;
+  DeleteLSTMData(CreateLSTMDataForScript(USCRIPT_THAI, status));
+  if (U_FAILURE(status)) {  // No Thai model: fall back to probing Burmese.
+    status = U_ZERO_ERROR;
+    DeleteLSTMData(CreateLSTMDataForScript(USCRIPT_MYANMAR, status));
+  }
+  return U_SUCCESS(status);
+}
+
+UBool IntlTest::skipLSTMTest() {
+   return ! LSTMDataIsBuilt();  // LSTM expectations only hold when LSTM data was built.
+}
+UBool IntlTest::skipDictionaryTest() {
+   return LSTMDataIsBuilt();  // Dictionary expectations do not hold once LSTM data is built.
+}
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+
 /*
 * Hey, Emacs, please set the following:
 *
--- a/icu4c/source/test/intltest/intltest.h
+++ b/icu4c/source/test/intltest/intltest.h
@ -192,6 +192,11 @@ public:
     */
    UBool logKnownIssue( const char *ticket, const char *fmt, ...);

+#if !UCONFIG_NO_BREAK_ITERATION
+    UBool skipDictionaryTest();
+    UBool skipLSTMTest();
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+
    virtual void info( const UnicodeString &message );

    virtual void infoln( const UnicodeString &message );
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -14,6 +14,7 @@
 #include "unicode/utypes.h"
 #if !UCONFIG_NO_BREAK_ITERATION

+#include <algorithm>
 #include <sstream>
 #include <stdio.h>
 #include <stdlib.h>
@ -42,6 +43,7 @@
 #include "cmemory.h"
 #include "cstr.h"
 #include "intltest.h"
+#include "lstmbe.h"
 #include "rbbitst.h"
 #include "rbbidata.h"
 #include "utypeinfo.h"  // for 'typeid' to work
@ -135,6 +137,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
    TESTCASE_AUTO(TestTable_8_16_Bits);
    TESTCASE_AUTO(TestBug13590);
    TESTCASE_AUTO(TestUnpairedSurrogate);
+    TESTCASE_AUTO(TestLSTMThai);
+    TESTCASE_AUTO(TestLSTMBurmese);

 #if U_ENABLE_TRACING
    TESTCASE_AUTO(TestTraceCreateCharacter);
@ -715,8 +719,12 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
    }
 }

-
 void RBBITest::TestExtended() {
+     // The expectations in this test depend heavily on the Thai dictionary.
+     // Therefore, we skip this test under the LSTM configuration.
+     if (skipDictionaryTest()) {
+         return;
+     }
  // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
  // data driven test closely entangles filtered and regular data.
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
@ -1126,7 +1134,6 @@ end_test:
 #endif
 }

-
 //-------------------------------------------------------------------------------
 //
 //  TestDictRules   create a break iterator from source rules that includes a
@ -5243,4 +5250,158 @@ void RBBITest::TestUnpairedSurrogate() {
    assertEquals(WHERE, rules, rtRules);
 }

+// Run the word BreakIterator over each "Input:" of a test file generated by
+// https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
+// and compare the boundaries found with the "Output:" line that follows.
+// filename is appended to the source test data directory. The name of the
+// LSTM data for script must equal the "Model:" value of the file.
+// Format of the file (key and value are separated by a TAB):
+//   Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
+//   Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
+//   Input:\t[source text]
+//   Output:\t[expected output separated by | ]

+void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
+    // The expectation in this test depends on LSTM, skip the test if the
+    // configuration is not built with LSTM data.
+    if (skipLSTMTest()) {
+        return;
+    }
+    UErrorCode   status = U_ZERO_ERROR;
+    LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
+    if (U_FAILURE(status)) {
+        errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+    //  Open and read the test data file.
+    const char *testDataDirectory = IntlTest::getSourceTestData(status);
+    CharString testFileName(testDataDirectory, -1, status);
+    testFileName.append(filename, -1, status);

+    int len;
+    UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
+    if (U_FAILURE(status)) {
+        errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
+        return;
+    }
+    // Own the buffer so that every early return below releases it.
+    LocalArray<UChar> testFileOwner(testFile);

+    //  Put the test data into a UnicodeString
+    UnicodeString testString(FALSE, testFile, len);

+    int32_t start = 0;
+    UnicodeString line;
+    int32_t end;
+    std::string actual_sep_str;
+    int32_t caseNum = 0;
+    // Iterate through all the lines in the test file.
+    do {
+        int32_t cr = testString.indexOf(u'\r', start);
+        int32_t lf = testString.indexOf(u'\n', start);
+        end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
+        line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
+        if (line.length() > 0) {
+            // Separate each line to key and value by TAB.
+            int32_t tab = line.indexOf(u'\t');
+            UnicodeString key = line.tempSubString(0, tab);
+            const UnicodeString value = line.tempSubString(tab+1);

+            if (key == "Model:") {
+                // Verify that the expectations in the test file match the LSTM
+                // model we are using now.
+                const LSTMData* data = CreateLSTMDataForScript(script, status);
+                if (U_FAILURE(status)) {
+                    dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
+                              __FILE__, __LINE__, u_errorName(status), uscript_getName(script));
+                    return;
+                }
+                UnicodeString name(LSTMDataName(data));
+                DeleteLSTMData(data);
+                if (value != name) {
+                    std::string utf8Name, utf8Value;
+                    dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
+                              __FILE__, __LINE__, u_errorName(status), uscript_getName(script),
+                              name.toUTF8String<std::string>(utf8Name).c_str(),
+                              value.toUTF8String<std::string>(utf8Value).c_str());
+                    return;
+                }
+            } else if (key == "Input:") {
+                UnicodeString input("prefix ");
+                input += value + " suffix";
+                std::stringstream ss;

+                // Construct the UText which is expected by the engine as
+                // input from the UnicodeString.
+                UText ut = UTEXT_INITIALIZER;
+                utext_openConstUnicodeString(&ut, &input, &status);
+                if (U_FAILURE(status)) {
+                    dataerrln("Could not utext_openConstUnicodeString for " + value + ": " + UnicodeString(u_errorName(status)));
+                    return;
+                }

+                iterator->setText(&ut, status);
+                if (U_FAILURE(status)) {
+                    errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
+                    utext_close(&ut);
+                    return;
+                }

+                for (int32_t bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
+                    ss << bp;
+                    if (bp != input.length()) {
+                        ss << ", ";
+                    }
+                }

+                utext_close(&ut);
+                // Turn the break points into a string for easy comparison.
+                actual_sep_str = "{" + ss.str() + "}";
+            } else if (key == "Output:" && !actual_sep_str.empty()) {
+                UnicodeString input("prefix| |");
+                input += value + "| |suffix";
+                int32_t sep;
+                int32_t segStart = 0;
+                int32_t curr = 0;
+                std::stringstream ss;
+                // Include 0 as the break point.
+                ss << "0, ";
+                while ((sep = input.indexOf(u'|', segStart)) >= 0) {
+                    int32_t segLen = sep - segStart;
+                    if (segLen > 0) {
+                        if (curr > 0) {
+                            ss << ", ";
+                        }
+                        curr += segLen;
+                        ss << curr;
+                    }
+                    segStart = sep + 1;
+                }
+                // Include end of the string as break point.
+                ss << ", " << curr + input.length() - segStart;
+                // Turn the break points into a string for easy comparison.
+                std::string expected = "{" + ss.str() + "}";
+                std::string utf8;

+                assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
+                             expected.c_str(), actual_sep_str.c_str());
+                actual_sep_str.clear();
+                // Number the cases so that a failure message identifies its case.
+                ++caseNum;
+            }
+        }
+        // Resume after the terminator found above; a CR LF pair counts as one.
+        // (std::max(cr, lf) would skip lines when the file mixes CR and LF.)
+        start = (cr >= 0 && lf == cr + 1) ? lf + 1 : end + 1;
+    } while (end >= 0);
+}
+
+void RBBITest::TestLSTMThai() {  // Runs the file-driven LSTM test with the Thai test file.
+    runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
+}
+
+void RBBITest::TestLSTMBurmese() {  // Runs the file-driven LSTM test with the Burmese test file.
+    runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
+}
+
 #endif // #if !UCONFIG_NO_BREAK_ITERATION
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -22,6 +22,7 @@
 #include "intltest.h"
 #include "unicode/brkiter.h"
 #include "unicode/rbbi.h"
+#include "unicode/uscript.h"

 class  Enumeration;
 class  BITestData;
@ -92,6 +93,8 @@ public:
    void Test16BitsTrieWith16BitStateTable();
    void TestTable_8_16_Bits();
    void TestBug13590();
+    void TestLSTMThai();
+    void TestLSTMBurmese();

 #if U_ENABLE_TRACING
    void TestTraceCreateCharacter();
@ -117,6 +120,9 @@ private:
    // Run one of the Unicode Consortium boundary test data files.
    void runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi);

+    // Run tests from one of the LSTM test files.
+    void runLSTMTestFromFile(const char* filename, UScriptCode script);
+
    // Run a single test case from one of the Unicode Consortium test files.
    void checkUnicodeTestCase(const char *testFileName, int lineNumber,
                         const UnicodeString &testString,
--- a/icu4c/source/test/intltest/transtst.cpp
+++ b/icu4c/source/test/intltest/transtst.cpp
@ -4678,6 +4678,11 @@ void TransliteratorTest::TestHalfwidthFullwidth(void) {
     */
 void TransliteratorTest::TestThai(void) {
 #if !UCONFIG_NO_BREAK_ITERATION
+    // The expectations in this test depend heavily on the Thai dictionary.
+    // Therefore, we skip this test under the LSTM configuration.
+    if (skipDictionaryTest()) {
+        return;
+    }
    UParseError parseError;
    UErrorCode status = U_ZERO_ERROR;
    Transliterator* tr = Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);