ICU-21569 Add GA to test LSTM configuration

1. Add a GitHub Action (GA) job to test BreakIterator under the LSTM
configuration (remove the Thai and Burmese dictionaries and include the
Thai and Burmese LSTM models).
2. Add LSTMDataName for the purpose of testing.
3. Add file-based test code to verify that BreakIterator results match the
test files generated by the Python code in
https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
4. Fix a bug in LSTMBreakEngine::divideUpDictionaryRange: the return value
should contain only the number of words found, even when the passed-in
foundBreaks already contains some data.
5. Change the cintltest TestSwapData from testing thaidict to laodict so
it will not break while we filter out thaidict under the LSTM
configuration.
This commit is contained in:
Frank Tang 2021-04-29 00:07:45 -07:00 committed by Frank Yung-Fong Tang
parent 253c54ab81
commit 9a2177c575
10 changed files with 245 additions and 8 deletions

21
.github/lstm_for_th_my.json vendored Normal file
View file

@ -0,0 +1,21 @@
// © 2021 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
//
// Removes the Burmese and Thai dictionaries and replaces them with LSTM models.
{
"featureFilters": {
"brkitr_dictionaries": {
"excludelist": [
"burmesedict",
"thaidict"
]
},
"brkitr_lstm": {
"includelist": [
"Thai_graphclust_model4_heavy",
"Burmese_graphclust_model5_heavy"
]
}
}
}

View file

@ -216,6 +216,17 @@ jobs:
make clean;
make -j2 check
# Test LSTM
lstm-test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- run: |
cd icu4c/source;
ICU_DATA_FILTER_FILE=../../.github/lstm_for_th_my.json ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex;
make clean;
make -j2 check
# Build and run testmap
testmap:
runs-on: ubuntu-latest

View file

@ -614,10 +614,11 @@ static const int32_t MIN_WORD = 2;
static const int32_t MIN_WORD_SPAN = MIN_WORD * 2;
int32_t
LSTMBreakEngine::divideUpDictionaryRange( UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks ) const {
LSTMBreakEngine::divideUpDictionaryRange(UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks) const {
int32_t beginFoundBreakSize = foundBreaks.size();
utext_setNativeIndex(text, startPos);
utext_moveIndex32(text, MIN_WORD_SPAN);
if (utext_getNativeIndex(text) >= endPos) {
@ -704,7 +705,7 @@ LSTMBreakEngine::divideUpDictionaryRange( UText *text,
}
}
}
return foundBreaks.size();
return foundBreaks.size() - beginFoundBreakSize;
}
Vectorizer* createVectorizer(const LSTMData* data, UErrorCode &status) {
@ -809,6 +810,10 @@ U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data)
delete data;
}
// Test-support accessor: hands back the name stored in the given LSTMData
// (its fName member) as-is. The pointer is dereferenced unconditionally, so
// callers must pass a valid, non-null LSTMData.
U_CAPI const UChar* U_EXPORT2 LSTMDataName(const LSTMData* data)
{
    const UChar* modelName = data->fName;
    return modelName;
}
U_NAMESPACE_END

View file

@ -76,6 +76,7 @@ U_CAPI const LSTMData* U_EXPORT2 CreateLSTMDataForScript(
UScriptCode script, UErrorCode& status);
U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data);
// Returns the name of the given LSTM data (its fName field). Added for testing.
U_CAPI const UChar* U_EXPORT2 LSTMDataName(const LSTMData* data);
U_NAMESPACE_END

View file

@ -1353,7 +1353,7 @@ static const struct {
#if !UCONFIG_NO_BREAK_ITERATION
{"char", "brk", ubrk_swap},
{"thaidict", "dict",udict_swap},
{"laodict", "dict",udict_swap},
#endif
#if 0

View file

@ -39,6 +39,7 @@
#include "cmemory.h"
#include "cstring.h"
#include "itmajor.h"
#include "lstmbe.h"
#include "mutex.h"
#include "putilimp.h" // for uprv_getRawUTCtime()
#include "uassert.h"
@ -2428,6 +2429,27 @@ cleanUpAndReturn:
return retPtr;
}
#if !UCONFIG_NO_BREAK_ITERATION
// Reports whether LSTM model data can be created for Thai or for Burmese
// (Myanmar). If the LSTM data can be found, RBBI will use the LSTM engine,
// so tests that depend on the dictionary data have to be skipped.
UBool LSTMDataIsBuilt() {
    // Probe each script with its own fresh error code. The data object is
    // released right away because only the success of the creation matters.
    UErrorCode thaiStatus = U_ZERO_ERROR;
    DeleteLSTMData(CreateLSTMDataForScript(USCRIPT_THAI, thaiStatus));

    UErrorCode burmeseStatus = U_ZERO_ERROR;
    DeleteLSTMData(CreateLSTMDataForScript(USCRIPT_MYANMAR, burmeseStatus));

    return U_SUCCESS(thaiStatus) || U_SUCCESS(burmeseStatus);
}
// Tells a test whose expectations rely on the LSTM engine to skip itself:
// true exactly when LSTMDataIsBuilt() reports that no LSTM data is available.
UBool IntlTest::skipLSTMTest() {
    const UBool lstmAvailable = LSTMDataIsBuilt();
    return !lstmAvailable;
}
// Tells a test whose expectations rely on the dictionary data to skip itself:
// true exactly when LSTMDataIsBuilt() reports that LSTM data is available.
UBool IntlTest::skipDictionaryTest() {
    const UBool lstmAvailable = LSTMDataIsBuilt();
    return lstmAvailable;
}
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
/*
* Hey, Emacs, please set the following:
*

View file

@ -192,6 +192,11 @@ public:
*/
UBool logKnownIssue( const char *ticket, const char *fmt, ...);
#if !UCONFIG_NO_BREAK_ITERATION
// Returns true when LSTM data for Thai or Burmese is built, in which case
// tests whose expectations depend on the dictionary data should be skipped.
UBool skipDictionaryTest();
// Returns true when no LSTM data for Thai or Burmese is built, in which case
// tests whose expectations depend on the LSTM engine should be skipped.
UBool skipLSTMTest();
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
virtual void info( const UnicodeString &message );
virtual void infoln( const UnicodeString &message );

View file

@ -14,6 +14,7 @@
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include <algorithm>
#include <sstream>
#include <stdio.h>
#include <stdlib.h>
@ -42,6 +43,7 @@
#include "cmemory.h"
#include "cstr.h"
#include "intltest.h"
#include "lstmbe.h"
#include "rbbitst.h"
#include "rbbidata.h"
#include "utypeinfo.h" // for 'typeid' to work
@ -135,6 +137,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestTable_8_16_Bits);
TESTCASE_AUTO(TestBug13590);
TESTCASE_AUTO(TestUnpairedSurrogate);
TESTCASE_AUTO(TestLSTMThai);
TESTCASE_AUTO(TestLSTMBurmese);
#if U_ENABLE_TRACING
TESTCASE_AUTO(TestTraceCreateCharacter);
@ -715,8 +719,12 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
}
}
void RBBITest::TestExtended() {
// The expectations in this test heavily depends on the Thai dictionary.
// Therefore, we skip this test under the LSTM configuration.
if (skipDictionaryTest()) {
return;
}
// Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
// data driven test closely entangles filtered and regular data.
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
@ -1126,7 +1134,6 @@ end_test:
#endif
}
//-------------------------------------------------------------------------------
//
// TestDictRules create a break iterator from source rules that includes a
@ -5243,4 +5250,158 @@ void RBBITest::TestUnpairedSurrogate() {
assertEquals(WHERE, rules, rtRules);
}
// Read file generated by
// https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
// as test cases and compare the Output.
// Format of the file
// Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
// Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
// Input:\t[source text]
// Output:\t[expected output separated by | ]
// Input: ...
// Output: ...
void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
// The expectation in this test depends on LSTM, skip the test if the
// configuration is not build with LSTM data.
if (skipLSTMTest()) {
return;
}
UErrorCode status = U_ZERO_ERROR;
LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
if (U_FAILURE(status)) {
errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
return;
}
// Open and read the test data file.
const char *testDataDirectory = IntlTest::getSourceTestData(status);
CharString testFileName(testDataDirectory, -1, status);
testFileName.append(filename, -1, status);
int len;
UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
if (U_FAILURE(status)) {
errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
return;
}
// Put the test data into a UnicodeString
UnicodeString testString(FALSE, testFile, len);
int32_t start = 0;
UnicodeString line;
int32_t end;
std::string actual_sep_str;
int32_t caseNum = 0;
// Iterate through all the lines in the test file.
do {
int32_t cr = testString.indexOf(u'\r', start);
int32_t lf = testString.indexOf(u'\n', start);
end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
if (line.length() > 0) {
// Separate each line to key and value by TAB.
int32_t tab = line.indexOf(u'\t');
UnicodeString key = line.tempSubString(0, tab);
const UnicodeString value = line.tempSubString(tab+1);
if (key == "Model:") {
// Verify the expectation in the test file match the LSTM model
// we are using now.
const LSTMData* data = CreateLSTMDataForScript(script, status);
if (U_FAILURE(status)) {
dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
__FILE__, __LINE__, u_errorName(status), uscript_getName(script));
return;
}
UnicodeString name(LSTMDataName(data));
DeleteLSTMData(data);
if (value != name) {
std::string utf8Name, utf8Value;
dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
__FILE__, __LINE__, u_errorName(status), uscript_getName(script),
name.toUTF8String<std::string>(utf8Name).c_str(),
value.toUTF8String<std::string>(utf8Value).c_str());
return;
}
} else if (key == "Input:") {
UnicodeString input("prefix ");
input += value + " suffix";
std::stringstream ss;
// Construct the UText which is expected by the the engine as
// input from the UnicodeString.
UText ut = UTEXT_INITIALIZER;
utext_openConstUnicodeString(&ut, &input, &status);
if (U_FAILURE(status)) {
dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
return;
}
iterator->setText(&ut, status);
if (U_FAILURE(status)) {
errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
return;
}
int32_t bp;
for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
ss << bp;
if (bp != input.length()) {
ss << ", ";
}
}
utext_close(&ut);
// Turn the break points into a string for easy comparions
// output.
actual_sep_str = "{" + ss.str() + "}";
} else if (key == "Output:" && !actual_sep_str.empty()) {
UnicodeString input("prefix| |");
input += value + "| |suffix";
std::string d;
int32_t sep;
int32_t start = 0;
int32_t curr = 0;
std::stringstream ss;
// Incude 0 as the break point.
ss << "0, ";
while ((sep = input.indexOf(u'|', start)) >= 0) {
int32_t len = sep - start;
if (len > 0) {
if (curr > 0) {
ss << ", ";
}
curr += len;
ss << curr;
}
start = sep + 1;
}
// Include end of the string as break point.
ss << ", " << curr + input.length() - start;
// Turn the break points into a string for easy comparions
// output.
std::string expected = "{" + ss.str() + "}";
std::string utf8;
assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
expected.c_str(), actual_sep_str.c_str());
actual_sep_str.clear();
}
}
start = std::max(cr, lf) + 1;
} while (end >= 0);
delete [] testFile;
}
// Runs the file-based LSTM test cases for the Thai script.
void RBBITest::TestLSTMThai() {
    const char* const testFile = "Thai_graphclust_model4_heavy_Test.txt";
    runLSTMTestFromFile(testFile, USCRIPT_THAI);
}
// Runs the file-based LSTM test cases for the Myanmar (Burmese) script.
void RBBITest::TestLSTMBurmese() {
    const char* const testFile = "Burmese_graphclust_model5_heavy_Test.txt";
    runLSTMTestFromFile(testFile, USCRIPT_MYANMAR);
}
#endif // #if !UCONFIG_NO_BREAK_ITERATION

View file

@ -22,6 +22,7 @@
#include "intltest.h"
#include "unicode/brkiter.h"
#include "unicode/rbbi.h"
#include "unicode/uscript.h"
class Enumeration;
class BITestData;
@ -92,6 +93,8 @@ public:
void Test16BitsTrieWith16BitStateTable();
void TestTable_8_16_Bits();
void TestBug13590();
void TestLSTMThai();
void TestLSTMBurmese();
#if U_ENABLE_TRACING
void TestTraceCreateCharacter();
@ -117,6 +120,9 @@ private:
// Run one of the Unicode Consortium boundary test data files.
void runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi);
// Run tests from one of the LSTM test files.
void runLSTMTestFromFile(const char* filename, UScriptCode script);
// Run a single test case from one of the Unicode Consortium test files.
void checkUnicodeTestCase(const char *testFileName, int lineNumber,
const UnicodeString &testString,

View file

@ -4678,6 +4678,11 @@ void TransliteratorTest::TestHalfwidthFullwidth(void) {
*/
void TransliteratorTest::TestThai(void) {
#if !UCONFIG_NO_BREAK_ITERATION
// The expectations in this test heavily depends on the Thai dictionary.
// Therefore, we skip this test under the LSTM configuration.
if (skipDictionaryTest()) {
return;
}
UParseError parseError;
UErrorCode status = U_ZERO_ERROR;
Transliterator* tr = Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);