mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
parent
7577899ff3
commit
19c52a4fe3
21 changed files with 28022 additions and 5 deletions
|
@ -25,6 +25,7 @@
|
|||
#include "brkeng.h"
|
||||
#include "cmemory.h"
|
||||
#include "dictbe.h"
|
||||
#include "lstmbe.h"
|
||||
#include "charstr.h"
|
||||
#include "dictionarydata.h"
|
||||
#include "mutex.h"
|
||||
|
@ -163,9 +164,26 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
UScriptCode code = uscript_getScript(c, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
const LanguageBreakEngine *engine = nullptr;
|
||||
// Try to use LSTM first
|
||||
const LSTMData *data = CreateLSTMDataForScript(code, status);
|
||||
if (U_SUCCESS(status)) {
|
||||
if (data != nullptr) {
|
||||
engine = CreateLSTMBreakEngine(code, data, status);
|
||||
if (U_SUCCESS(status) && engine != nullptr) {
|
||||
return engine;
|
||||
}
|
||||
if (engine != nullptr) {
|
||||
delete engine;
|
||||
engine = nullptr;
|
||||
} else {
|
||||
DeleteLSTMData(data);
|
||||
}
|
||||
}
|
||||
}
|
||||
status = U_ZERO_ERROR; // fallback to dictionary based
|
||||
DictionaryMatcher *m = loadDictionaryMatcherFor(code);
|
||||
if (m != NULL) {
|
||||
const LanguageBreakEngine *engine = NULL;
|
||||
switch(code) {
|
||||
case USCRIPT_THAI:
|
||||
engine = new ThaiBreakEngine(m, status);
|
||||
|
|
|
@ -87,6 +87,7 @@
|
|||
<ClCompile Include="brkeng.cpp" />
|
||||
<ClCompile Include="brkiter.cpp" />
|
||||
<ClCompile Include="dictbe.cpp" />
|
||||
<ClCompile Include="lstmbe.cpp" />
|
||||
<ClCompile Include="pluralmap.cpp" />
|
||||
<ClCompile Include="rbbi.cpp" />
|
||||
<ClCompile Include="rbbidata.cpp" />
|
||||
|
@ -279,6 +280,7 @@
|
|||
<ClInclude Include="ubidiimp.h" />
|
||||
<ClInclude Include="brkeng.h" />
|
||||
<ClInclude Include="dictbe.h" />
|
||||
<ClInclude Include="lstmbe.h" />
|
||||
<ClInclude Include="rbbidata.h" />
|
||||
<ClInclude Include="rbbinode.h" />
|
||||
<ClInclude Include="rbbirb.h" />
|
||||
|
|
|
@ -73,6 +73,9 @@
|
|||
<ClCompile Include="dictbe.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="lstmbe.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="rbbi.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
|
@ -651,6 +654,9 @@
|
|||
<ClInclude Include="dictbe.h">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="lstmbe.h">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="rbbidata.h">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClInclude>
|
||||
|
|
|
@ -221,6 +221,7 @@
|
|||
<ClCompile Include="brkeng.cpp" />
|
||||
<ClCompile Include="brkiter.cpp" />
|
||||
<ClCompile Include="dictbe.cpp" />
|
||||
<ClCompile Include="lstmbe.cpp" />
|
||||
<ClCompile Include="pluralmap.cpp" />
|
||||
<ClCompile Include="rbbi.cpp" />
|
||||
<ClCompile Include="rbbidata.cpp" />
|
||||
|
@ -414,6 +415,7 @@
|
|||
<ClInclude Include="ubidiimp.h" />
|
||||
<ClInclude Include="brkeng.h" />
|
||||
<ClInclude Include="dictbe.h" />
|
||||
<ClInclude Include="lstmbe.h" />
|
||||
<ClInclude Include="rbbidata.h" />
|
||||
<ClInclude Include="rbbinode.h" />
|
||||
<ClInclude Include="rbbirb.h" />
|
||||
|
|
815
icu4c/source/common/lstmbe.cpp
Normal file
815
icu4c/source/common/lstmbe.cpp
Normal file
|
@ -0,0 +1,815 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#include <utility>
|
||||
#include <ctgmath>
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "lstmbe.h"
|
||||
#include "putilimp.h"
|
||||
#include "uassert.h"
|
||||
#include "ubrkimpl.h"
|
||||
#include "uresimp.h"
|
||||
#include "uvectr32.h"
|
||||
#include "uvector.h"
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/resbund.h"
|
||||
#include "unicode/ubrk.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utf.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Uncomment the following #defines to enable debug output.
|
||||
// #define LSTM_DEBUG 1
|
||||
// #define LSTM_VECTORIZER_DEBUG 1
|
||||
|
||||
/**
 * Abstract read-only view of a one-dimensional sequence of floats.
 */
class ReadArray1D {
public:
    virtual ~ReadArray1D();

    /** Number of elements in the sequence. */
    virtual int32_t d1() const = 0;

    /** Element stored at position i. */
    virtual float get(int32_t i) const = 0;

#ifdef LSTM_DEBUG
    // Debug aid: dump every element, starting a new output line after each
    // group of four values.
    void print() const {
        printf("\n[");
        int32_t pos = 0;
        while (pos < d1()) {
            printf("%0.8e ", get(pos));
            if (pos % 4 == 3) {
                printf("\n");
            }
            ++pos;
        }
        printf("]\n");
    }
#endif
};

// Defined out of line, like the other destructors in this file.
ReadArray1D::~ReadArray1D() {}
|
||||
|
||||
/**
 * Abstract read-only view of a two-dimensional array of floats.
 */
class ReadArray2D {
public:
    virtual ~ReadArray2D();

    /** Size of the first dimension. */
    virtual int32_t d1() const = 0;

    /** Size of the second dimension. */
    virtual int32_t d2() const = 0;

    /** Element addressed by first-dimension index i and second-dimension index j. */
    virtual float get(int32_t i, int32_t j) const = 0;
};

// Defined out of line, like the other destructors in this file.
ReadArray2D::~ReadArray2D() {}
|
||||
|
||||
/**
|
||||
* A class to index a float array as a 1D Array without owning the pointer or
|
||||
* copy the data.
|
||||
*/
|
||||
class ConstArray1D : public ReadArray1D {
|
||||
public:
|
||||
ConstArray1D() : data_(nullptr), d1_(0) {}
|
||||
|
||||
ConstArray1D(const float* data, int32_t d1) : data_(data), d1_(d1) {}
|
||||
|
||||
virtual ~ConstArray1D();
|
||||
|
||||
// Init the object, the object does not own the data nor copy.
|
||||
// It is designed to directly use data from memory mapped resources.
|
||||
void init(const int32_t* data, int32_t d1) {
|
||||
U_ASSERT(IEEE_754 == 1);
|
||||
data_ = reinterpret_cast<const float*>(data);
|
||||
d1_ = d1;
|
||||
}
|
||||
|
||||
// ReadArray1D methods.
|
||||
virtual int32_t d1() const { return d1_; }
|
||||
virtual float get(int32_t i) const {
|
||||
U_ASSERT(i < d1_);
|
||||
return data_[i];
|
||||
}
|
||||
|
||||
private:
|
||||
const float* data_;
|
||||
int32_t d1_;
|
||||
};
|
||||
|
||||
ConstArray1D::~ConstArray1D()
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* A class to index a float array as a 2D Array without owning the pointer or
|
||||
* copy the data.
|
||||
*/
|
||||
class ConstArray2D : public ReadArray2D {
|
||||
public:
|
||||
ConstArray2D() : data_(nullptr), d1_(0), d2_(0) {}
|
||||
|
||||
ConstArray2D(const float* data, int32_t d1, int32_t d2)
|
||||
: data_(data), d1_(d1), d2_(d2) {}
|
||||
|
||||
virtual ~ConstArray2D();
|
||||
|
||||
// Init the object, the object does not own the data nor copy.
|
||||
// It is designed to directly use data from memory mapped resources.
|
||||
void init(const int32_t* data, int32_t d1, int32_t d2) {
|
||||
U_ASSERT(IEEE_754 == 1);
|
||||
data_ = reinterpret_cast<const float*>(data);
|
||||
d1_ = d1;
|
||||
d2_ = d2;
|
||||
}
|
||||
|
||||
// ReadArray2D methods.
|
||||
inline int32_t d1() const { return d1_; }
|
||||
inline int32_t d2() const { return d2_; }
|
||||
float get(int32_t i, int32_t j) const {
|
||||
U_ASSERT(i < d1_);
|
||||
U_ASSERT(j < d2_);
|
||||
return data_[i * d2_ + j];
|
||||
}
|
||||
|
||||
// Expose the ith row as a ConstArray1D
|
||||
inline ConstArray1D row(int32_t i) const {
|
||||
U_ASSERT(i < d1_);
|
||||
return ConstArray1D(data_ + i * d2_, d2_);
|
||||
}
|
||||
|
||||
private:
|
||||
const float* data_;
|
||||
int32_t d1_;
|
||||
int32_t d2_;
|
||||
};
|
||||
|
||||
ConstArray2D::~ConstArray2D()
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* A class to allocate data as a writable 1D array.
|
||||
* This is the main class implement matrix operation.
|
||||
*/
|
||||
class Array1D : public ReadArray1D {
|
||||
public:
|
||||
Array1D() : memory_(nullptr), data_(nullptr), d1_(0) {}
|
||||
Array1D(int32_t d1)
|
||||
: memory_(uprv_malloc(d1 * sizeof(float))),
|
||||
data_((float*)memory_), d1_(d1) {
|
||||
clear();
|
||||
}
|
||||
|
||||
virtual ~Array1D();
|
||||
|
||||
// A special constructor which does not own the memory but writeable
|
||||
// as a slice of an array.
|
||||
Array1D(float* data, int32_t d1)
|
||||
: memory_(nullptr), data_(data), d1_(d1) {}
|
||||
|
||||
// ReadArray1D methods.
|
||||
virtual int32_t d1() const { return d1_; }
|
||||
virtual float get(int32_t i) const {
|
||||
U_ASSERT(i < d1_);
|
||||
return data_[i];
|
||||
}
|
||||
|
||||
// Return the index which point to the max data in the array.
|
||||
inline int32_t maxIndex() const {
|
||||
int32_t index = 0;
|
||||
float max = data_[0];
|
||||
for (int32_t i = 1; i < d1_; i++) {
|
||||
if (data_[i] > max) {
|
||||
max = data_[i];
|
||||
index = i;
|
||||
}
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
// Slice part of the array to a new one.
|
||||
inline Array1D slice(int32_t from, int32_t size) const {
|
||||
U_ASSERT(from >= 0);
|
||||
U_ASSERT(from < d1_);
|
||||
U_ASSERT(from + size <= d1_);
|
||||
return Array1D(data_ + from, size);
|
||||
}
|
||||
|
||||
// Dot product of a 1D array and a 2D array into this one.
|
||||
inline Array1D& dotProduct(const ReadArray1D& a, const ReadArray2D& b) {
|
||||
U_ASSERT(a.d1() == b.d1());
|
||||
U_ASSERT(b.d2() == d1());
|
||||
for (int32_t i = 0; i < d1(); i++) {
|
||||
data_[i] = 0;
|
||||
for (int32_t j = 0; j < a.d1(); j++) {
|
||||
data_[i] += a.get(j) * b.get(j, i);
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Hadamard Product the values of another array of the same size into this one.
|
||||
inline Array1D& hadamardProduct(const ReadArray1D& a) {
|
||||
U_ASSERT(a.d1() == d1());
|
||||
for (int32_t i = 0; i < d1(); i++) {
|
||||
data_[i] *= a.get(i);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Add the values of another array of the same size into this one.
|
||||
inline Array1D& add(const ReadArray1D& a) {
|
||||
U_ASSERT(a.d1() == d1());
|
||||
for (int32_t i = 0; i < d1(); i++) {
|
||||
data_[i] += a.get(i);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Assign the values of another array of the same size into this one.
|
||||
inline Array1D& assign(const ReadArray1D& a) {
|
||||
U_ASSERT(a.d1() == d1());
|
||||
for (int32_t i = 0; i < d1(); i++) {
|
||||
data_[i] = a.get(i);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Apply tanh to all the elements in the array.
|
||||
inline Array1D& tanh() {
|
||||
for (int32_t i = 0; i < d1_; i++) {
|
||||
data_[i] = std::tanh(data_[i]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Apply sigmoid to all the elements in the array.
|
||||
inline Array1D& sigmoid() {
|
||||
for (int32_t i = 0; i < d1_; i++) {
|
||||
data_[i] = 1.0f/(1.0f + expf(-data_[i]));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Array1D& clear() {
|
||||
uprv_memset(data_, 0, d1_ * sizeof(float));
|
||||
return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
void* memory_;
|
||||
float* data_;
|
||||
int32_t d1_;
|
||||
};
|
||||
|
||||
Array1D::~Array1D()
|
||||
{
|
||||
uprv_free(memory_);
|
||||
}
|
||||
|
||||
class Array2D : public ReadArray2D {
|
||||
public:
|
||||
Array2D() : memory_(nullptr), data_(nullptr), d1_(0), d2_(0) {}
|
||||
Array2D(int32_t d1, int32_t d2)
|
||||
: memory_(uprv_malloc(d1 * d2 * sizeof(float))),
|
||||
data_((float*)memory_), d1_(d1), d2_(d2) {
|
||||
clear();
|
||||
}
|
||||
virtual ~Array2D();
|
||||
|
||||
// ReadArray2D methods.
|
||||
virtual int32_t d1() const { return d1_; }
|
||||
virtual int32_t d2() const { return d2_; }
|
||||
virtual float get(int32_t i, int32_t j) const {
|
||||
U_ASSERT(i < d1_);
|
||||
U_ASSERT(j < d2_);
|
||||
return data_[i * d2_ + j];
|
||||
}
|
||||
|
||||
inline Array1D row(int32_t i) const {
|
||||
U_ASSERT(i < d1_);
|
||||
return Array1D(data_ + i * d2_, d2_);
|
||||
}
|
||||
|
||||
inline Array2D& clear() {
|
||||
uprv_memset(data_, 0, d1_ * d2_ * sizeof(float));
|
||||
return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
void* memory_;
|
||||
float* data_;
|
||||
int32_t d1_;
|
||||
int32_t d2_;
|
||||
};
|
||||
|
||||
Array2D::~Array2D()
|
||||
{
|
||||
uprv_free(memory_);
|
||||
}
|
||||
|
||||
// Output classes of the model. The numeric values are used as indices: the
// break engine casts the arg-max position of the output vector to this type.
enum LSTMClass {
    BEGIN = 0,
    INSIDE = 1,
    END = 2,
    SINGLE = 3
};
|
||||
|
||||
// Kind of text unit a model was built for; selected from the "type" string
// of the model resource ("codepoints" or "graphclust").
enum EmbeddingType {
    UNKNOWN = 0,
    CODE_POINTS = 1,
    GRAPHEME_CLUSTER = 2
};
|
||||
|
||||
struct LSTMData : public UMemory {
|
||||
LSTMData(UResourceBundle* rb, UErrorCode &status);
|
||||
~LSTMData();
|
||||
UHashtable* fDict;
|
||||
EmbeddingType fType;
|
||||
const UChar* fName;
|
||||
ConstArray2D fEmbedding;
|
||||
ConstArray2D fForwardW;
|
||||
ConstArray2D fForwardU;
|
||||
ConstArray1D fForwardB;
|
||||
ConstArray2D fBackwardW;
|
||||
ConstArray2D fBackwardU;
|
||||
ConstArray1D fBackwardB;
|
||||
ConstArray2D fOutputW;
|
||||
ConstArray1D fOutputB;
|
||||
|
||||
private:
|
||||
UResourceBundle* fDataRes;
|
||||
UResourceBundle* fDictRes;
|
||||
};
|
||||
|
||||
/**
 * Load a model from resource bundle `rb`.
 *
 * Reads "embeddings", "hunits", "type", "model", "data" and "dict" from
 * `rb`, builds the string -> index table from "dict", and points the weight
 * matrices at consecutive sections of the "data" int vector.
 *
 * On any problem `status` is set to a failure code and the object is left
 * partially initialized; it is still safe to destroy.
 *
 * @param rb     Resource bundle holding the model.
 * @param status In/out error code. U_UNSUPPORTED_ERROR when floats are not
 *               IEEE 754, U_INVALID_FORMAT_ERROR when the "type" string is
 *               not recognized or the "data" vector has the wrong length.
 */
LSTMData::LSTMData(UResourceBundle* rb, UErrorCode &status)
    : fDict(nullptr), fType(UNKNOWN), fName(nullptr),
      fDataRes(nullptr), fDictRes(nullptr)
{
    if (U_FAILURE(status)) {
        return;
    }
    if (IEEE_754 != 1) {
        // The weights are read by reinterpreting int32_t words as floats.
        status = U_UNSUPPORTED_ERROR;
        return;
    }
    LocalUResourceBundlePointer embeddings_res(
        ures_getByKey(rb, "embeddings", nullptr, &status));
    int32_t embedding_size = ures_getInt(embeddings_res.getAlias(), &status);
    LocalUResourceBundlePointer hunits_res(
        ures_getByKey(rb, "hunits", nullptr, &status));
    int32_t hunits = ures_getInt(hunits_res.getAlias(), &status);
    const UChar* type = ures_getStringByKey(rb, "type", nullptr, &status);
    if (U_FAILURE(status)) {
        // Stop before `type` (not valid after a failed lookup) is compared.
        return;
    }
    if (u_strCompare(type, -1, u"codepoints", -1, false) == 0) {
        fType = CODE_POINTS;
    } else if (u_strCompare(type, -1, u"graphclust", -1, false) == 0) {
        fType = GRAPHEME_CLUSTER;
    } else {
        // No vectorizer exists for any other type.
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    fName = ures_getStringByKey(rb, "model", nullptr, &status);
    fDataRes = ures_getByKey(rb, "data", nullptr, &status);
    int32_t data_len = 0;
    const int32_t* data = ures_getIntVector(fDataRes, &data_len, &status);
    // Keep the "dict" bundle in the member (closed by the destructor) rather
    // than in a local that shadows it: the hash table below stores the string
    // pointers handed out by this bundle without copying them.
    fDictRes = ures_getByKey(rb, "dict", nullptr, &status);
    int32_t num_index = ures_getSize(fDictRes);
    fDict = uhash_open(uhash_hashUChars, uhash_compareUChars, nullptr, &status);
    if (U_FAILURE(status)) {
        return;
    }

    ures_resetIterator(fDictRes);
    int32_t idx = 0;
    // put dict into hash
    while(ures_hasNext(fDictRes)) {
        const char *tempKey = nullptr;
        const UChar* str = ures_getNextString(fDictRes, nullptr, &tempKey, &status);
        uhash_putiAllowZero(fDict, (void*)str, idx++, &status);
        if (U_FAILURE(status)) {
            return;
        }
#ifdef LSTM_VECTORIZER_DEBUG
        printf("Assign [");
        while (*str != 0x0000) {
            printf("U+%04x ", *str);
            str++;
        }
        printf("] map to %d\n", idx-1);
#endif
    }
    // Sizes, in floats, of the nine consecutive sections of "data".
    int32_t mat1_size = (num_index + 1) * embedding_size;
    int32_t mat2_size = embedding_size * 4 * hunits;
    int32_t mat3_size = hunits * 4 * hunits;
    int32_t mat4_size = 4 * hunits;
    int32_t mat5_size = mat2_size;
    int32_t mat6_size = mat3_size;
    int32_t mat7_size = mat4_size;
    int32_t mat8_size = 2 * hunits * 4;
    int32_t mat9_size = 4;
    // Checked at run time (not only in debug builds) so that a malformed
    // bundle cannot make the views below extend past the end of "data".
    if (data_len != mat1_size + mat2_size + mat3_size + mat4_size + mat5_size +
                    mat6_size + mat7_size + mat8_size + mat9_size) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }

    fEmbedding.init(data, (num_index + 1), embedding_size);
    data += mat1_size;
    fForwardW.init(data, embedding_size, 4 * hunits);
    data += mat2_size;
    fForwardU.init(data, hunits, 4 * hunits);
    data += mat3_size;
    fForwardB.init(data, 4 * hunits);
    data += mat4_size;
    fBackwardW.init(data, embedding_size, 4 * hunits);
    data += mat5_size;
    fBackwardU.init(data, hunits, 4 * hunits);
    data += mat6_size;
    fBackwardB.init(data, 4 * hunits);
    data += mat7_size;
    fOutputW.init(data, 2 * hunits, 4);
    data += mat8_size;
    fOutputB.init(data, 4);
}
|
||||
|
||||
// Release the lookup table first, then the two resource bundle handles.
LSTMData::~LSTMData()
{
    uhash_close(fDict);
    ures_close(fDictRes);
    ures_close(fDataRes);
}
|
||||
|
||||
class Vectorizer : public UMemory {
|
||||
public:
|
||||
Vectorizer(UHashtable* dict) : fDict(dict) {}
|
||||
virtual ~Vectorizer();
|
||||
virtual void vectorize(UText *text, int32_t startPos, int32_t endPos,
|
||||
UVector32 &offsets, UVector32 &indices,
|
||||
UErrorCode &status) const = 0;
|
||||
protected:
|
||||
int32_t stringToIndex(const UChar* str) const {
|
||||
UBool found = false;
|
||||
int32_t ret = uhash_getiAndFound(fDict, (const void*)str, &found);
|
||||
if (!found) {
|
||||
ret = fDict->count;
|
||||
}
|
||||
#ifdef LSTM_VECTORIZER_DEBUG
|
||||
printf("[");
|
||||
while (*str != 0x0000) {
|
||||
printf("U+%04x ", *str);
|
||||
str++;
|
||||
}
|
||||
printf("] map to %d\n", ret);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
private:
|
||||
UHashtable* fDict;
|
||||
};
|
||||
|
||||
Vectorizer::~Vectorizer()
|
||||
{
|
||||
}
|
||||
|
||||
class CodePointsVectorizer : public Vectorizer {
|
||||
public:
|
||||
CodePointsVectorizer(UHashtable* dict) : Vectorizer(dict) {}
|
||||
virtual ~CodePointsVectorizer();
|
||||
virtual void vectorize(UText *text, int32_t startPos, int32_t endPos,
|
||||
UVector32 &offsets, UVector32 &indices,
|
||||
UErrorCode &status) const;
|
||||
};
|
||||
|
||||
CodePointsVectorizer::~CodePointsVectorizer()
|
||||
{
|
||||
}
|
||||
|
||||
// Emit one (offset, index) pair for every code point in [startPos, endPos).
void CodePointsVectorizer::vectorize(
    UText *text, int32_t startPos, int32_t endPos,
    UVector32 &offsets, UVector32 &indices, UErrorCode &status) const
{
    const int32_t span = endPos - startPos;
    if (!offsets.ensureCapacity(span, status)) {
        return;
    }
    if (!indices.ensureCapacity(span, status)) {
        return;
    }
    utext_setNativeIndex(text, startPos);
    // One code unit followed by a terminator, as stringToIndex() expects.
    UChar unit[2] = {0, 0};
    for (;;) {
        if (U_FAILURE(status)) {
            break;
        }
        const int32_t pos = (int32_t)utext_getNativeIndex(text);
        if (pos >= endPos) {
            break;
        }
        // Since the LSTMBreakEngine currently only accepts chars in the BMP,
        // we can ignore the possibility of hitting a supplementary code
        // point.
        unit[0] = (UChar) utext_next32(text);
        U_ASSERT(!U_IS_SURROGATE(unit[0]));
        offsets.addElement(pos, status);
        indices.addElement(stringToIndex(unit), status);
    }
}
|
||||
|
||||
class GraphemeClusterVectorizer : public Vectorizer {
|
||||
public:
|
||||
GraphemeClusterVectorizer(UHashtable* dict)
|
||||
: Vectorizer(dict)
|
||||
{
|
||||
}
|
||||
virtual ~GraphemeClusterVectorizer();
|
||||
virtual void vectorize(UText *text, int32_t startPos, int32_t endPos,
|
||||
UVector32 &offsets, UVector32 &indices,
|
||||
UErrorCode &status) const;
|
||||
};
|
||||
|
||||
GraphemeClusterVectorizer::~GraphemeClusterVectorizer()
|
||||
{
|
||||
}
|
||||
|
||||
constexpr int32_t MAX_GRAPHEME_CLSTER_LENTH = 10;
|
||||
|
||||
/**
 * Emit one (offset, index) pair for every grapheme cluster in
 * [startPos, endPos), using a character BreakIterator to find the cluster
 * boundaries.
 *
 * A cluster longer than MAX_GRAPHEME_CLSTER_LENTH code units makes
 * utext_extract() fail; `status` is then left with that failure and
 * vectorizing stops.
 */
void GraphemeClusterVectorizer::vectorize(
    UText *text, int32_t startPos, int32_t endPos,
    UVector32 &offsets, UVector32 &indices, UErrorCode &status) const
{
    if (U_FAILURE(status)) {
        return;
    }
    if (!offsets.ensureCapacity(endPos - startPos, status) ||
            !indices.ensureCapacity(endPos - startPos, status)) {
        return;
    }
    LocalPointer<BreakIterator> graphemeIter(BreakIterator::createCharacterInstance(Locale(), status));
    if (U_FAILURE(status)) {
        return;
    }
    graphemeIter->setText(text, status);
    if (U_FAILURE(status)) {
        return;
    }

    if (startPos != 0) {
        graphemeIter->preceding(startPos);
    }
    int32_t last = startPos;
    int32_t current = startPos;
    // One slot more than the extraction capacity: a cluster of exactly
    // MAX_GRAPHEME_CLSTER_LENTH code units fills the capacity without a
    // terminator (reported only as a warning), and stringToIndex() needs a
    // NUL-terminated string.
    UChar str[MAX_GRAPHEME_CLSTER_LENTH + 1];
    while ((current = graphemeIter->next()) != BreakIterator::DONE) {
        if (current >= endPos) {
            break;
        }
        if (current > startPos) {
            int32_t len = utext_extract(text, last, current, str, MAX_GRAPHEME_CLSTER_LENTH, &status);
            if (U_FAILURE(status)) {
                break;
            }
            U_ASSERT(len >= 0 && len <= MAX_GRAPHEME_CLSTER_LENTH);
            str[len] = 0;
            offsets.addElement(last, status);
            indices.addElement(stringToIndex(str), status);
        }
        last = current;
    }
    if (U_FAILURE(status) || last >= endPos) {
        return;
    }
    // Remaining text between the last boundary seen and endPos.
    int32_t len = utext_extract(text, last, endPos, str, MAX_GRAPHEME_CLSTER_LENTH, &status);
    if (U_SUCCESS(status)) {
        U_ASSERT(len >= 0 && len <= MAX_GRAPHEME_CLSTER_LENTH);
        str[len] = 0;
        offsets.addElement(last, status);
        indices.addElement(stringToIndex(str), status);
    }
}
|
||||
|
||||
// Computing LSTM as stated in
|
||||
// https://en.wikipedia.org/wiki/Long_short-term_memory#LSTM_with_a_forget_gate
|
||||
void compute(
|
||||
const ReadArray2D& W, const ReadArray2D& U, const ReadArray1D& b,
|
||||
const ReadArray1D& x, Array1D& h, Array1D& c)
|
||||
{
|
||||
// ifco = x * W + h * U + b
|
||||
Array1D ifco(b.d1());
|
||||
ifco.dotProduct(x, W)
|
||||
.add(Array1D(b.d1()).dotProduct(h, U))
|
||||
.add(b);
|
||||
|
||||
int32_t hunits = b.d1() / 4;
|
||||
ifco.slice(0*hunits, hunits).sigmoid(); // i: sigmod
|
||||
ifco.slice(1*hunits, hunits).sigmoid(); // f: sigmoid
|
||||
ifco.slice(2*hunits, hunits).tanh(); // c_: tanh
|
||||
ifco.slice(3*hunits, hunits).sigmoid(); // o: sigmod
|
||||
|
||||
c.hadamardProduct(ifco.slice(hunits, hunits))
|
||||
.add(Array1D(c.d1())
|
||||
.assign(ifco.slice(0, hunits))
|
||||
.hadamardProduct(ifco.slice(2*hunits, hunits)));
|
||||
|
||||
h.assign(c)
|
||||
.tanh()
|
||||
.hadamardProduct(ifco.slice(3*hunits, hunits));
|
||||
}
|
||||
|
||||
// Smallest number of characters treated as a word.
static constexpr int32_t MIN_WORD = 2;

// Smallest number of characters that can hold two words.
static constexpr int32_t MIN_WORD_SPAN = 2 * MIN_WORD;
|
||||
|
||||
int32_t
|
||||
LSTMBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &foundBreaks ) const {
|
||||
utext_setNativeIndex(text, startPos);
|
||||
utext_moveIndex32(text, MIN_WORD_SPAN);
|
||||
if (utext_getNativeIndex(text) >= endPos) {
|
||||
return 0; // Not enough characters for two words
|
||||
}
|
||||
utext_setNativeIndex(text, startPos);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
UVector32 offsets(status);
|
||||
UVector32 indices(status);
|
||||
fVectorizer->vectorize(text, startPos, endPos, offsets, indices, status);
|
||||
int32_t* offsetsBuf = offsets.getBuffer();
|
||||
int32_t* indicesBuf = indices.getBuffer();
|
||||
|
||||
int32_t input_seq_len = indices.size();
|
||||
int32_t hunits = fData->fForwardU.d1();
|
||||
|
||||
// To save the needed memory usage, the following is different from the
|
||||
// Python or ICU4X implementation. We first perform the Backward LSTM
|
||||
// and then merge the iteration of the forward LSTM and the output layer
|
||||
// together because we only neetdto remember the h[t-1] for Forward LSTM.
|
||||
Array1D c(hunits);
|
||||
|
||||
// TODO: limit size of hBackward. If input_seq_len is too big, we could
|
||||
// run out of memory.
|
||||
// Backward LSTM
|
||||
Array2D hBackward(input_seq_len, hunits);
|
||||
for (int32_t i = input_seq_len - 1; i >= 0; i--) {
|
||||
Array1D hRow = hBackward.row(i);
|
||||
if (i != input_seq_len - 1) {
|
||||
hRow.assign(hBackward.row(i+1));
|
||||
}
|
||||
#ifdef LSTM_DEBUG
|
||||
printf("hRow %d\n", i);
|
||||
hRow.print();
|
||||
printf("indicesBuf[%d] = %d\n", i, indicesBuf[i]);
|
||||
printf("fData->fEmbedding.row(indicesBuf[%d]):\n", i);
|
||||
fData->fEmbedding.row(indicesBuf[i]).print();
|
||||
#endif // LSTM_DEBUG
|
||||
compute(fData->fBackwardW, fData->fBackwardU, fData->fBackwardB,
|
||||
fData->fEmbedding.row(indicesBuf[i]),
|
||||
hRow, c);
|
||||
}
|
||||
|
||||
Array1D logp(4);
|
||||
|
||||
// Allocate fbRow and slice the internal array in two.
|
||||
Array1D fbRow(2 * hunits);
|
||||
Array1D forwardRow = fbRow.slice(0, hunits); // point to first half of data in fbRow.
|
||||
Array1D backwardRow = fbRow.slice(hunits, hunits); // point to second half of data n fbRow.
|
||||
|
||||
// The following iteration merge the forward LSTM and the output layer
|
||||
// together.
|
||||
c.clear(); // reuse c since it is the same size.
|
||||
for (int32_t i = 0; i < input_seq_len; i++) {
|
||||
#ifdef LSTM_DEBUG
|
||||
printf("forwardRow %d\n", i);
|
||||
forwardRow.print();
|
||||
#endif // LSTM_DEBUG
|
||||
// Forward LSTM
|
||||
// Calculate the result into forwardRow, which point to the data in the first half
|
||||
// of fbRow.
|
||||
compute(fData->fForwardW, fData->fForwardU, fData->fForwardB,
|
||||
fData->fEmbedding.row(indicesBuf[i]),
|
||||
forwardRow, c);
|
||||
|
||||
// assign the data from hBackward.row(i) to second half of fbRowa.
|
||||
backwardRow.assign(hBackward.row(i));
|
||||
|
||||
logp.dotProduct(fbRow, fData->fOutputW).add(fData->fOutputB);
|
||||
#ifdef LSTM_DEBUG
|
||||
printf("backwardRow %d\n", i);
|
||||
backwardRow.print();
|
||||
printf("logp %d\n", i);
|
||||
logp.print();
|
||||
#endif // LSTM_DEBUG
|
||||
|
||||
// current = argmax(logp)
|
||||
LSTMClass current = (LSTMClass)logp.maxIndex();
|
||||
// BIES logic.
|
||||
if (current == BEGIN || current == SINGLE) {
|
||||
if (i != 0) {
|
||||
foundBreaks.addElement(offsetsBuf[i], status);
|
||||
}
|
||||
}
|
||||
}
|
||||
return foundBreaks.size();
|
||||
}
|
||||
|
||||
/**
 * Create the Vectorizer that matches the embedding type of `data`.
 *
 * @param data   Model data; its fType selects the vectorizer and its fDict
 *               is handed to it (borrowed).
 * @param status In/out error code. Set to U_UNSUPPORTED_ERROR when fType is
 *               neither CODE_POINTS nor GRAPHEME_CLUSTER, and to
 *               U_MEMORY_ALLOCATION_ERROR when the allocation fails.
 * @return A new Vectorizer owned by the caller, or nullptr on failure.
 */
Vectorizer* createVectorizer(const LSTMData* data, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }
    Vectorizer* vectorizer = nullptr;
    switch (data->fType) {
        case CODE_POINTS:
            vectorizer = new CodePointsVectorizer(data->fDict);
            break;
        case GRAPHEME_CLUSTER:
            vectorizer = new GraphemeClusterVectorizer(data->fDict);
            break;
        default:
            // The type comes from resource data, so report it as an error
            // instead of treating it as unreachable.
            status = U_UNSUPPORTED_ERROR;
            return nullptr;
    }
    if (vectorizer == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    return vectorizer;
}
|
||||
|
||||
// Build an engine around `data` for the characters in `set`.
// fData is initialized before fVectorizer, so the vectorizer is created from
// the data that was just stored.
LSTMBreakEngine::LSTMBreakEngine(const LSTMData* data, const UnicodeSet& set, UErrorCode &status)
    : DictionaryBreakEngine(), fData(data), fVectorizer(createVectorizer(fData, status))
{
    if (U_SUCCESS(status)) {
        setCharacters(set);
    } else {
        // On failure the caller deletes the data, so the destructor of this
        // object must not do it as well.
        fData = nullptr;
    }
}
|
||||
|
||||
// Destroy the model data and the vectorizer held by this engine.
LSTMBreakEngine::~LSTMBreakEngine()
{
    delete fData;
    delete fVectorizer;
}
|
||||
|
||||
// Name of the model: the "model" string stored in the model data.
const UChar* LSTMBreakEngine::name() const
{
    const LSTMData* model = fData;
    return model->fName;
}
|
||||
|
||||
// Look up, in the "lstm" table of the root break iterator bundle, the string
// stored under the short name of `script`.
UnicodeString defaultLSTM(UScriptCode script, UErrorCode& status) {
    // open root from brkitr tree.
    UResourceBundle *bundle = ures_open(U_ICUDATA_BRKITR, "", &status);
    // The root handle doubles as the fill-in, so a single close is enough.
    bundle = ures_getByKeyWithFallback(bundle, "lstm", bundle, &status);
    const char* scriptKey = uscript_getShortName(script);
    UnicodeString modelName = ures_getUnicodeStringByKey(bundle, scriptKey, &status);
    ures_close(bundle);
    return modelName;
}
|
||||
|
||||
/**
 * Load the default LSTM model for `script`.
 *
 * Only Khmer, Lao, Myanmar and Thai are considered. The resource name is
 * the default model name with everything from its last '.' removed.
 *
 * @param script The script to load a model for.
 * @param status In/out error code.
 * @return The model data, or nullptr if the script is not considered or the
 *         bundle could not be opened.
 */
U_CAPI const LSTMData* U_EXPORT2 CreateLSTMDataForScript(UScriptCode script, UErrorCode& status)
{
    if (script != USCRIPT_KHMER && script != USCRIPT_LAO && script != USCRIPT_MYANMAR && script != USCRIPT_THAI) {
        return nullptr;
    }
    UnicodeString name = defaultLSTM(script, status);
    if (U_FAILURE(status)) {
        return nullptr;
    }
    CharString namebuf;
    // Append first and search afterwards, as two statements: written as one
    // chained call, the argument lastIndexOf('.') may be evaluated before
    // the append on compilers that predate the C++17 sequencing rules, which
    // would truncate the name to an empty string.
    namebuf.appendInvariantChars(name, status);
    if (U_FAILURE(status)) {
        return nullptr;
    }
    namebuf.truncate(namebuf.lastIndexOf('.'));

    LocalUResourceBundlePointer rb(
        ures_openDirect(U_ICUDATA_BRKITR, namebuf.data(), &status));
    if (U_FAILURE(status)) {
        return nullptr;
    }

    return CreateLSTMData(rb.getAlias(), status);
}
|
||||
|
||||
/**
 * Create model data from resource bundle `rb`.
 *
 * @param rb     Resource bundle holding the model.
 * @param status In/out error code; set to U_MEMORY_ALLOCATION_ERROR when the
 *               object cannot be allocated.
 * @return The new model data, owned by the caller, or nullptr when `status`
 *         indicates a failure. A partially constructed object is destroyed
 *         here so that callers that only check `status` do not leak it.
 */
U_CAPI const LSTMData* U_EXPORT2 CreateLSTMData(UResourceBundle* rb, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return nullptr;
    }
    // LocalPointer reports a failed allocation through status and deletes
    // the object again if we return early.
    LocalPointer<LSTMData> data(new LSTMData(rb, status), status);
    if (U_FAILURE(status)) {
        return nullptr;
    }
    return data.orphan();
}
|
||||
|
||||
/**
 * Create an LSTM break engine for `script` that uses `data`.
 *
 * Ownership of `data`: when an engine is returned it has adopted `data`.
 * Whenever nullptr is returned (unsupported script or failure), `data` is
 * left untouched and still belongs to the caller. The unsupported-script
 * path used to delete `data` itself, which made a caller that releases the
 * data after a nullptr result free it twice.
 *
 * @param script Thai and Myanmar are supported.
 * @param data   Model data for the engine.
 * @param status In/out error code.
 * @return The new engine, or nullptr.
 */
U_CAPI const LanguageBreakEngine* U_EXPORT2
CreateLSTMBreakEngine(UScriptCode script, const LSTMData* data, UErrorCode& status)
{
    UnicodeString unicodeSetString;
    switch(script) {
        case USCRIPT_THAI:
            unicodeSetString = UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]");
            break;
        case USCRIPT_MYANMAR:
            unicodeSetString = UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]");
            break;
        default:
            // Unsupported script: the caller keeps ownership of data.
            return nullptr;
    }
    UnicodeSet unicodeSet;
    unicodeSet.applyPattern(unicodeSetString, status);
    const LanguageBreakEngine* engine = new LSTMBreakEngine(data, unicodeSet, status);
    if (U_FAILURE(status) || engine == nullptr) {
        if (engine != nullptr) {
            // A failed engine has already dropped its reference to data, so
            // deleting it does not delete data.
            delete engine;
        } else {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return nullptr;
    }
    return engine;
}
|
||||
|
||||
// Destroy model data created by CreateLSTMData() or CreateLSTMDataForScript().
// Passing nullptr is a no-op.
U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data)
{
    if (data != nullptr) {
        delete data;
    }
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
84
icu4c/source/common/lstmbe.h
Normal file
84
icu4c/source/common/lstmbe.h
Normal file
|
@ -0,0 +1,84 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#ifndef LSTMBE_H
|
||||
#define LSTMBE_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/utext.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "dictbe.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class Vectorizer;
|
||||
struct LSTMData;
|
||||
|
||||
/*******************************************************************
|
||||
* LSTMBreakEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>LSTMBreakEngine is a kind of DictionaryBreakEngine that uses a
|
||||
* LSTM to determine language-specific breaks.</p>
|
||||
*
|
||||
* <p>After it is constructed a LSTMBreakEngine may be shared between
|
||||
* threads without synchronization.</p>
|
||||
*/
|
||||
class LSTMBreakEngine : public DictionaryBreakEngine {
|
||||
public:
|
||||
/**
|
||||
* <p>Constructor.</p>
|
||||
*/
|
||||
LSTMBreakEngine(const LSTMData* data, const UnicodeSet& set, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~LSTMBreakEngine();
|
||||
|
||||
virtual const UChar* name() const;
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
||||
*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
virtual int32_t divideUpDictionaryRange(UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks ) const;
|
||||
private:
|
||||
const LSTMData* fData;
|
||||
const Vectorizer* fVectorizer;
|
||||
};
|
||||
|
||||
/**
 * Create an LSTM break engine for the given script from model data.
 * Returns nullptr if the script is not supported or an error occurred.
 * A successfully created engine deletes the data when it is destroyed.
 */
U_CAPI const LanguageBreakEngine* U_EXPORT2
CreateLSTMBreakEngine(UScriptCode script, const LSTMData* data, UErrorCode& status);

/**
 * Create model data from a resource bundle holding an LSTM model.
 */
U_CAPI const LSTMData* U_EXPORT2
CreateLSTMData(UResourceBundle* rb, UErrorCode& status);

/**
 * Create model data from the default LSTM model of the given script.
 * Returns nullptr for scripts without LSTM support.
 */
U_CAPI const LSTMData* U_EXPORT2
CreateLSTMDataForScript(UScriptCode script, UErrorCode& status);

/**
 * Destroy model data returned by one of the Create functions above.
 */
U_CAPI void U_EXPORT2
DeleteLSTMData(const LSTMData* data);
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
||||
#endif /* LSTMBE_H */
|
|
@ -40,6 +40,7 @@ locmap.cpp
|
|||
locresdata.cpp
|
||||
locutil.cpp
|
||||
lsr.cpp
|
||||
lstmbe.cpp
|
||||
messagepattern.cpp
|
||||
normalizer2.cpp
|
||||
normalizer2impl.cpp
|
||||
|
|
|
@ -19,6 +19,7 @@ system_symbols:
|
|||
PIC system_misc system_debug malloc_functions ubsan
|
||||
c_strings c_string_formatting
|
||||
int_functions floating_point trigonometry
|
||||
exp_and_tanhf
|
||||
stdlib_qsort
|
||||
system_locale
|
||||
stdio_input stdio_output file_io readlink_function dir_io mmap_functions dlfcn
|
||||
|
@ -76,6 +77,9 @@ group: int_functions
|
|||
group: floating_point
|
||||
abs fabs floor ceil modf fmod log pow round sqrt trunc
|
||||
|
||||
group: exp_and_tanhf
|
||||
expf tanhf
|
||||
|
||||
group: trigonometry
|
||||
acos asin atan atan2 cos sin tan
|
||||
# Additional symbols in an optimized build.
|
||||
|
@ -207,7 +211,7 @@ group: breakiterator
|
|||
brkiter.o brkeng.o ubrk.o
|
||||
rbbi.o rbbinode.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o
|
||||
rbbidata.o rbbirb.o rbbi_cache.o
|
||||
dictionarydata.o dictbe.o
|
||||
dictionarydata.o dictbe.o lstmbe.o
|
||||
# BreakIterator::makeInstance() factory implementation makes for circular dependency
|
||||
# between BreakIterator base and FilteredBreakIteratorBuilder.
|
||||
filteredbrk.o
|
||||
|
@ -219,6 +223,7 @@ group: breakiterator
|
|||
ucharstriebuilder # for filteredbrk.o
|
||||
normlzr # for dictbe.o, should switch to Normalizer2
|
||||
uvector32 # for dictbe.o
|
||||
exp_and_tanhf # for lstmbe.o
|
||||
|
||||
group: unormcmp # unorm_compare()
|
||||
unormcmp.o
|
||||
|
|
|
@ -52,7 +52,7 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \
|
|||
tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o \
|
||||
tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
|
||||
bytestrietest.o ucharstrietest.o \
|
||||
itrbbi.o rbbiapts.o rbbitst.o rbbimonkeytest.o ittrans.o transapi.o cpdtrtst.o \
|
||||
itrbbi.o lstmbetst.o rbbiapts.o rbbitst.o rbbimonkeytest.o ittrans.o transapi.o cpdtrtst.o \
|
||||
testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
|
||||
jamotest.o srchtest.o reptest.o regextst.o \
|
||||
itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \
|
||||
|
|
|
@ -82,6 +82,7 @@
|
|||
<ClCompile Include="regiontst.cpp" />
|
||||
<ClCompile Include="ucharstrietest.cpp" />
|
||||
<ClCompile Include="itrbbi.cpp" />
|
||||
<ClCompile Include="lstmbetst.cpp" />
|
||||
<ClCompile Include="rbbiapts.cpp">
|
||||
<DisableLanguageExtensions>false</DisableLanguageExtensions>
|
||||
</ClCompile>
|
||||
|
|
|
@ -64,6 +64,9 @@
|
|||
<ClCompile Include="itrbbi.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="lstmbetst.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="rbbiapts.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
|
||||
#include "intltest.h"
|
||||
#include "itrbbi.h"
|
||||
#include "lstmbetst.h"
|
||||
#include "rbbiapts.h"
|
||||
#include "rbbitst.h"
|
||||
#include "rbbimonkeytest.h"
|
||||
|
@ -36,6 +37,7 @@ void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name,
|
|||
#if !UCONFIG_NO_FORMATTING
|
||||
TESTCASE_AUTO_CLASS(RBBIMonkeyTest);
|
||||
#endif
|
||||
TESTCASE_AUTO_CLASS(LSTMBETest);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
|
246
icu4c/source/test/intltest/lstmbetst.cpp
Normal file
246
icu4c/source/test/intltest/lstmbetst.cpp
Normal file
|
@ -0,0 +1,246 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "lstmbetst.h"
|
||||
#include "lstmbe.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
#include "charstr.h"
|
||||
|
||||
//---------------------------------------------
|
||||
// runIndexedTest
|
||||
//---------------------------------------------
|
||||
|
||||
|
||||
// Test-framework dispatch: remembers the invocation parameters and registers
// each LSTM break engine test case with the TESTCASE_AUTO machinery.
void LSTMBETest::runIndexedTest(int32_t index, UBool exec, const char*& name, char* params)
{
    fTestParams = params;

    TESTCASE_AUTO_BEGIN;

    TESTCASE_AUTO(TestThaiGraphclust);
    TESTCASE_AUTO(TestThaiCodepoints);
    TESTCASE_AUTO(TestBurmeseGraphclust);

    TESTCASE_AUTO_END;
}
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------------
|
||||
//
|
||||
// LSTMBETest constructor and destructor
|
||||
//
|
||||
//--------------------------------------------------------------------------------------
|
||||
|
||||
// Starts with no test parameters; runIndexedTest() stores them later.
LSTMBETest::LSTMBETest()
    : fTestParams(nullptr) {
}
|
||||
|
||||
|
||||
// Nothing to release: fTestParams is not freed here.
LSTMBETest::~LSTMBETest() = default;
|
||||
|
||||
// Maps a model name to its script by looking at the name's prefix:
// "Thai..." -> USCRIPT_THAI, "Burmese..." -> USCRIPT_MYANMAR.
// Any other name hits UPRV_UNREACHABLE.
UScriptCode getScriptFromModelName(const std::string& modelName) {
    // compare(0, n, prefix) == 0 holds exactly when modelName begins with prefix.
    const bool isThai = modelName.compare(0, 4, "Thai") == 0;
    if (isThai) {
        return USCRIPT_THAI;
    }
    const bool isBurmese = modelName.compare(0, 7, "Burmese") == 0;
    if (isBurmese) {
        return USCRIPT_MYANMAR;
    }
    // Add for other script codes.
    UPRV_UNREACHABLE;
}
|
||||
|
||||
// Read file generated by
|
||||
// https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
|
||||
// as test cases and compare the Output.
|
||||
// Format of the file
|
||||
// Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
|
||||
// Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
|
||||
// Input:\t[source text]
|
||||
// Output:\t[expected output separated by | ]
|
||||
// Input: ...
|
||||
// Output: ...
|
||||
// The test will ensure the Input contains only the characters can be handled by
|
||||
// the model. Since by default the LSTM models are not included, all the tested
|
||||
// models need to be included under source/test/testdata.
|
||||
|
||||
void LSTMBETest::runTestFromFile(const char* filename) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
LocalPointer<const LanguageBreakEngine> engine;
|
||||
// Open and read the test data file.
|
||||
const char *testDataDirectory = IntlTest::getSourceTestData(status);
|
||||
CharString testFileName(testDataDirectory, -1, status);
|
||||
testFileName.append(filename, -1, status);
|
||||
|
||||
int len;
|
||||
UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
|
||||
return;
|
||||
}
|
||||
|
||||
// Put the test data into a UnicodeString
|
||||
UnicodeString testString(FALSE, testFile, len);
|
||||
|
||||
int32_t start = 0;
|
||||
|
||||
UnicodeString line;
|
||||
int32_t end;
|
||||
std::string actual_sep_str;
|
||||
int32_t caseNum = 0;
|
||||
// Iterate through all the lines in the test file.
|
||||
do {
|
||||
int32_t cr = testString.indexOf(u'\r', start);
|
||||
int32_t lf = testString.indexOf(u'\n', start);
|
||||
end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
|
||||
line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
|
||||
if (line.length() > 0) {
|
||||
// Separate each line to key and value by TAB.
|
||||
int32_t tab = line.indexOf(u'\t');
|
||||
UnicodeString key = line.tempSubString(0, tab);
|
||||
const UnicodeString value = line.tempSubString(tab+1);
|
||||
|
||||
if (key == "Model:") {
|
||||
std::string modelName;
|
||||
value.toUTF8String<std::string>(modelName);
|
||||
engine.adoptInstead(createEngineFromTestData(modelName.c_str(), getScriptFromModelName(modelName), status));
|
||||
if (U_FAILURE(status)) {
|
||||
dataerrln("Could not CreateLSTMBreakEngine for " + line + UnicodeString(u_errorName(status)));
|
||||
return;
|
||||
}
|
||||
} else if (key == "Input:") {
|
||||
// First, we ensure all the char in the Input lines are accepted
|
||||
// by the engine before we test them.
|
||||
caseNum++;
|
||||
bool canHandleAllChars = true;
|
||||
for (int32_t i = 0; i < value.length(); i++) {
|
||||
if (!engine->handles(value.charAt(i))) {
|
||||
errln(UnicodeString("Test Case#") + caseNum + " contains char '" +
|
||||
UnicodeString(value.charAt(i)) +
|
||||
"' cannot be handled by the engine in offset " + i + "\n" + line);
|
||||
canHandleAllChars = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (! canHandleAllChars) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If the engine can handle all the chars in the Input line, we
|
||||
// then find the break points by calling the engine.
|
||||
std::stringstream ss;
|
||||
|
||||
// Construct the UText which is expected by the the engine as
|
||||
// input from the UnicodeString.
|
||||
UText ut = UTEXT_INITIALIZER;
|
||||
utext_openConstUnicodeString(&ut, &value, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
|
||||
return;
|
||||
}
|
||||
|
||||
UVector32 actual(status);
|
||||
if (U_FAILURE(status)) {
|
||||
dataerrln("%s:%d Error %s Could not allocate UVextor32", __FILE__, __LINE__, u_errorName(status));
|
||||
return;
|
||||
}
|
||||
engine->findBreaks(&ut, 0, value.length(), actual);
|
||||
utext_close(&ut);
|
||||
for (int32_t i = 0; i < actual.size(); i++) {
|
||||
ss << actual.elementAti(i) << ", ";
|
||||
}
|
||||
ss << value.length();
|
||||
// Turn the break points into a string for easy comparions
|
||||
// output.
|
||||
actual_sep_str = "{" + ss.str() + "}";
|
||||
} else if (key == "Output:" && !actual_sep_str.empty()) {
|
||||
std::string d;
|
||||
int32_t sep;
|
||||
int32_t start = 0;
|
||||
int32_t curr = 0;
|
||||
std::stringstream ss;
|
||||
while ((sep = value.indexOf(u'|', start)) >= 0) {
|
||||
int32_t len = sep - start;
|
||||
if (len > 0) {
|
||||
if (curr > 0) {
|
||||
ss << ", ";
|
||||
}
|
||||
curr += len;
|
||||
ss << curr;
|
||||
}
|
||||
start = sep + 1;
|
||||
}
|
||||
// Turn the break points into a string for easy comparions
|
||||
// output.
|
||||
std::string expected = "{" + ss.str() + "}";
|
||||
std::string utf8;
|
||||
|
||||
assertEquals((value + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
|
||||
expected.c_str(), actual_sep_str.c_str());
|
||||
actual_sep_str.clear();
|
||||
}
|
||||
}
|
||||
start = std::max(cr, lf) + 1;
|
||||
} while (end >= 0);
|
||||
|
||||
delete [] testFile;
|
||||
}
|
||||
|
||||
// Runs the cases listed in the Thai grapheme-cluster model's test file.
void LSTMBETest::TestThaiGraphclust() {
    runTestFromFile("Thai_graphclust_model4_heavy_Test.txt");
}
|
||||
|
||||
// Runs the cases listed in the Thai code-point model's test file.
void LSTMBETest::TestThaiCodepoints() {
    runTestFromFile("Thai_codepoints_exclusive_model5_heavy_Test.txt");
}
|
||||
|
||||
// Runs the cases listed in the Burmese grapheme-cluster model's test file.
void LSTMBETest::TestBurmeseGraphclust() {
    runTestFromFile("Burmese_graphclust_model5_heavy_Test.txt");
}
|
||||
|
||||
// Builds an LSTM break engine from a model resource bundle in the test data.
//
// @param model  Name of the model bundle to open under the test data path.
// @param script Script passed to CreateLSTMBreakEngine().
// @param status Receives any error; failures are also reported via dataerrln().
// @return The new engine (the caller takes ownership), or nullptr on failure.
//         Also nullptr, without any report, when CreateLSTMData() yields no data.
const LanguageBreakEngine* LSTMBETest::createEngineFromTestData(
        const char* model, UScriptCode script, UErrorCode& status) {
    const char* testdatapath = loadTestData(status);
    if (U_FAILURE(status)) {
        dataerrln("Could not load testdata.dat " + UnicodeString(testdatapath) + ", " +
                  UnicodeString(u_errorName(status)));
        return nullptr;
    }

    // Open the model's bundle directly.
    LocalUResourceBundlePointer bundle(ures_openDirect(testdatapath, model, &status));
    if (U_FAILURE(status)) {
        dataerrln("Could not open " + UnicodeString(model) + " under " + UnicodeString(testdatapath) + ", " +
                  UnicodeString(u_errorName(status)));
        return nullptr;
    }

    const LSTMData* modelData = CreateLSTMData(bundle.getAlias(), status);
    if (U_FAILURE(status)) {
        dataerrln("Could not CreateLSTMData " + UnicodeString(model) + " under " + UnicodeString(testdatapath) + ", " +
                  UnicodeString(u_errorName(status)));
        return nullptr;
    }
    if (modelData == nullptr) {
        return nullptr;
    }

    LocalPointer<const LanguageBreakEngine> result(CreateLSTMBreakEngine(script, modelData, status));
    const bool created = U_SUCCESS(status) && !result.isNull();
    if (!created) {
        dataerrln("Could not CreateLSTMBreakEngine " + UnicodeString(testdatapath) + ", " +
                  UnicodeString(u_errorName(status)));
        // Engine creation did not succeed, so release the model data here.
        DeleteLSTMData(modelData);
        return nullptr;
    }
    return result.orphan();
}
|
||||
|
||||
#endif // #if !UCONFIG_NO_BREAK_ITERATION
|
49
icu4c/source/test/intltest/lstmbetst.h
Normal file
49
icu4c/source/test/intltest/lstmbetst.h
Normal file
|
@ -0,0 +1,49 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#ifndef LSTMBETEST_H
|
||||
#define LSTMBETEST_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "intltest.h"
|
||||
|
||||
#include "unicode/uscript.h"
|
||||
|
||||
struct TestParams;
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
class LanguageBreakEngine;
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
||||
/**
|
||||
* Test the LSTMBreakEngine class giving different rules
|
||||
*/
|
||||
class LSTMBETest: public IntlTest {
|
||||
public:
|
||||
|
||||
LSTMBETest();
|
||||
virtual ~LSTMBETest();
|
||||
|
||||
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
|
||||
|
||||
void TestThaiGraphclust();
|
||||
void TestThaiCodepoints();
|
||||
void TestBurmeseGraphclust();
|
||||
void runTestFromFile(const char* filename);
|
||||
|
||||
private:
|
||||
const LanguageBreakEngine* createEngineFromTestData(const char* model, UScriptCode script, UErrorCode& status);
|
||||
|
||||
// Test parameters, from the test framework and test invocation.
|
||||
const char* fTestParams;
|
||||
};
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
||||
#endif
|
7
icu4c/source/test/testdata/BUILDRULES.py
vendored
7
icu4c/source/test/testdata/BUILDRULES.py
vendored
|
@ -44,6 +44,10 @@ def generate_rb(config, io, common_vars):
|
|||
"testaliases",
|
||||
"testempty",
|
||||
"testtypes",
|
||||
# LSTM models
|
||||
"Thai_graphclust_model4_heavy",
|
||||
"Thai_codepoints_exclusive_model5_heavy",
|
||||
"Burmese_graphclust_model5_heavy"
|
||||
# "metaZones",
|
||||
# "timezoneTypes",
|
||||
# "windowsZones",
|
||||
|
@ -59,7 +63,7 @@ def generate_rb(config, io, common_vars):
|
|||
input_files = [InFile("%s.txt" % bn) for bn in basenames],
|
||||
output_files = [OutFile("%s.res" % bn) for bn in basenames],
|
||||
tool = IcuTool("genrb"),
|
||||
args = "-q -s {IN_DIR} -d {OUT_DIR} {INPUT_FILE}",
|
||||
args = "-q -s {IN_DIR} -eUTF-8 -d {OUT_DIR} {INPUT_FILE}",
|
||||
format_with = {},
|
||||
repeat_with = {}
|
||||
),
|
||||
|
@ -169,7 +173,6 @@ def generate_conv(config, io, common_vars):
|
|||
)
|
||||
]
|
||||
|
||||
|
||||
def generate_copy(config, io, common_vars):
|
||||
return [
|
||||
CopyRequest(
|
||||
|
|
7061
icu4c/source/test/testdata/Burmese_graphclust_model5_heavy.txt
vendored
Normal file
7061
icu4c/source/test/testdata/Burmese_graphclust_model5_heavy.txt
vendored
Normal file
File diff suppressed because it is too large
Load diff
8
icu4c/source/test/testdata/Burmese_graphclust_model5_heavy_Test.txt
vendored
Normal file
8
icu4c/source/test/testdata/Burmese_graphclust_model5_heavy_Test.txt
vendored
Normal file
|
@ -0,0 +1,8 @@
|
|||
# Copyright (C) 2021 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
Model: Burmese_graphclust_model5_heavy
|
||||
Embedding: grapheme_clusters_tf
|
||||
Input: အပြည်ပြည်ဆိုင်ရာလူ့အခွင့်အရေးကြေညာစာတမ်း
|
||||
Output: |အပြည်|ပြည်|ဆိုင်ရာ|လူ့|အခွင့်အရေး|ကြေညာစာတမ်း|
|
||||
Input: မျိုးရိုးဂုဏ်သိက္ခာနှင့်တကွ
|
||||
Output: |မျိုး|ရိုး|ဂုဏ်|သိက္ခာ|နှင့်|တ|ကွ|
|
6009
icu4c/source/test/testdata/Thai_codepoints_exclusive_model5_heavy.txt
vendored
Normal file
6009
icu4c/source/test/testdata/Thai_codepoints_exclusive_model5_heavy.txt
vendored
Normal file
File diff suppressed because it is too large
Load diff
97
icu4c/source/test/testdata/Thai_codepoints_exclusive_model5_heavy_Test.txt
vendored
Normal file
97
icu4c/source/test/testdata/Thai_codepoints_exclusive_model5_heavy_Test.txt
vendored
Normal file
|
@ -0,0 +1,97 @@
|
|||
# Copyright (C) 2020 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
Note: model Thai_codepoints_exclusive_model5_heavy has been trained using an exclusive data set. However, if you like you can still test it by other types of data sets (not recommended).
|
||||
Model: Thai_codepoints_exclusive_model5_heavy
|
||||
Embedding: codepoints
|
||||
Input: ปฏิญญาสากลว่าด้วยสิทธิมนุษยชน
|
||||
Output: |ปฏิญญา|สากลว่า|ด้วย|สิทธิ|มนุษย|ชน|
|
||||
Input: คำปรารภ
|
||||
Output: |คำ|ปรารภ|
|
||||
Input: โดยที่การยอมรับนับถือเกียรติศักดิ์ประจำตัว
|
||||
Output: |โดย|ที่|การ|ยอม|รับ|นับ|ถือ|เกียรติศักดิ์|ประจำ|ตัว|
|
||||
Input: และสิทธิเท่าเทียมกันและโอนมิได้ของบรรดา
|
||||
Output: |และ|สิทธิ|เท่า|เทียม|กัน|และ|โอน|มิ|ได้|ของ|บรรดา|
|
||||
Input: สมาชิก
|
||||
Output: |สมา|ชิก|
|
||||
Input: ทั้ง
|
||||
Output: |ทั้ง|
|
||||
Input: หลายแห่งครอบครัว
|
||||
Output: |หลาย|แห่ง|ครอบครัว|
|
||||
Input: มนุษย์เป็นหลักมูลเหตุแห่งอิสรภาพ
|
||||
Output: |มนุษย์|เป็น|หลักมูล|เหตุ|แห่ง|อิสรภาพ|
|
||||
Input: ความยุติธรรม
|
||||
Output: |ความ|ยุติ|ธรรม|
|
||||
Input: และสันติภาพในโลก
|
||||
Output: |และ|สันติภาพ|ใน|โลก|
|
||||
Input: โดยที่การไม่นำพาและการเหยียดหยามต่อสิทธิมนุษยชน
|
||||
Output: |โดย|ที่|การ|ไม่|นำ|พา|และ|การ|เหยียด|หยาม|ต่อ|สิทธิ|มนุษยชน|
|
||||
Input: ยังผลให้มีการหระทำอันป่าเถื่อน
|
||||
Output: |ยัง|ผล|ให้|มี|การ|หระทำ|อัน|ป่า|เถื่อ|น|
|
||||
Input: ซี่งเป็นการละเมิดมโนธรรมของมนุษยชาติอย่างร้ายแรง
|
||||
Output: |ซี่ง|เป็น|การ|ละเมิดมโนธรรม|ของ|มนุษยชาติ|อย่าง|ร้าย|แรง|
|
||||
Input: และใต้
|
||||
Output: |และ|ใต้|
|
||||
Input: ได้
|
||||
Output: |ได้|
|
||||
Input: มีการประกาศว่า
|
||||
Output: |มี|การ|ประกาศ|ว่า|
|
||||
Input: ปณิธานสูงสุดของสามัญชนได้แก่ความต้องการให้มนุษย์มีชีวิตอยู่ในโลกด้วยอิสรภาพในการพูด
|
||||
Output: |ปณิธาน|สูงสุด|ของ|สามัญชน|ได้|แก่|ความ|ต้องการ|ให้|มนุษย์|มี|ชีวิต|อยู่|ใน|โลก|ด้วย|อิสรภาพ|ใน|การ|พูด|
|
||||
Input: และความเชื่อถือ
|
||||
Output: |และ|ความ|เชื่อถือ|
|
||||
Input: และอิสรภาพพ้นจากความหวาดกลัวและความต้องการ
|
||||
Output: |และ|อิสรภาพ|พ้น|จาก|ความ|หวาด|กลัว|และ|ความ|ต้องการ|
|
||||
Input: โดยที่เป็นการจำเป็นอย่างยิ่งที่สิทธิมนุษยชนควรได้รับความคุ้มครองโดยหลักบังคับของกฎหมาย
|
||||
Output: |โดย|ที่|เป็น|การ|จำเป็น|อย่าง|ยิ่ง|ที่|สิทธิ|มนุษยชน|ควร|ได้|รับ|ความ|คุ้มครอง|โดย|หลัก|บังคับ|ของ|กฎหมาย|
|
||||
Input: ถ้าไม่ประสงค์จะให้คนตกอยู่ในบังคับให้หันเข้าหาการขบถขัดขืนต่อทรราชและการกดขี่เป็นวิถีทางสุดท้าย
|
||||
Output: |ถ้า|ไม่|ประสงค์|จะ|ให้|คน|ตก|อยู่|ใน|บังคับ|ให้|หัน|เข้า|หา|การ|ขบถ|ขัด|ขืน|ต่อทรราช|และ|การ|กด|ขี่|เป็น|วิถี|ทาง|สุด|ท้าย|
|
||||
Input: โดยที่ประชากรแห่งสหประชาชาติได้ยืนยันไว้ในกฎบัตรถึงความเชื่อมั่นในสิทธิมนุษยชนอันเป็นหลักมูล
|
||||
Output: |โดย|ที่|ประชากร|แห่ง|สหประชา|ชาติ|ได้|ยืน|ยัน|ไว้|ใน|กฎบัตร|ถึง|ความ|เชื่อมั่น|ใน|สิทธิ|มนุษยชน|อัน|เป็น|หลัก|มู|ล|
|
||||
Input: ในเกียรติศักดิ์และคุณค่าของมนุษย์และในสิทธิเท่าเทียมกันของบรรดาชายและหญิง
|
||||
Output: |ใน|เกียรติศักดิ์|และ|คุณค่า|ของ|มนุษย์|และ|ใน|สิทธิ|เท่า|เทียม|กัน|ของ|บรรดา|ชาย|และ|หญิง|
|
||||
Input: และได้ตกลงใจที่จะส่งเสริมความก้าวหน้าทางสังคม
|
||||
Output: |และ|ได้|ตก|ลงใจ|ที่|จะ|ส่ง|เสริม|ความ|ก้าว|หน้า|ทาง|สังคม|
|
||||
Input: และมาตรฐานแห่งชีวิตที่ดีขึ้นด้วยในอิสรภาพ
|
||||
Output: |และ|มาตรฐาน|แห่ง|ชีวิต|ที่|ดี|ขึ้น|ด้วย|ใน|อิสรภาพ|
|
||||
Input: อันกว้างขวางยิ่งขึ้น
|
||||
Output: |อัน|กว้าง|ขวาง|ยิ่ง|ขึ้น|
|
||||
Input: โดยที่รัฐสมาชิกต่างปฎิญาณจะให้บรรลุถึงซึ่งการส่งเสริมการเคารพและการปฎิบัติตามทั่วสากลต่อสิทธิมนุษยชนและอิสรภาพหลักมูล
|
||||
Output: |โดย|ที่|รัฐสมา|ชิก|ต่าง|ปฎิญาณ|จะ|ให้|บรรลุ|ถึง|ซึ่ง|การ|ส่ง|เสริม|การ|เคารพ|และ|การ|ปฎิบัติ|ตาม|ทั่วสากล|ต่อ|สิทธิ|มนุษยชน|และ|อิสรภาพ|หลัก|มู|ล|
|
||||
Input: โดยร่วมมือกับสหประชาชาติ
|
||||
Output: |โดย|ร่วม|มือ|กับ|สหประชา|ชาติ|
|
||||
Input: โดยที่ความเข้าใจร่วมกันในสิทธิ
|
||||
Output: |โดย|ที่|ความ|เข้าใจ|ร่วม|กัน|ใน|สิทธิ|
|
||||
Input: และอิสรภาพเหล่านี้เป็นสิ่งสำคัญอย่างยิ่ง
|
||||
Output: |และ|อิสรภาพ|เหล่า|นี้|เป็น|สิ่ง|สำคัญ|อย่าง|ยิ่ง|
|
||||
Input: เพื่อให้ปฏิญาณนี้สำเร็จผลเต็มบริบูรณ์
|
||||
Output: |เพื่อ|ให้|ปฏิญาณ|นี้|สำเร็จ|ผล|เต็ม|บริบูรณ์|
|
||||
Input: ฉะนั้น
|
||||
Output: |ฉะนั้น|
|
||||
Input: บัดนี้สมัชชาจึงประกาศว่า
|
||||
Output: |บัด|นี้|สมัชชา|จึง|ประกาศ|ว่า|
|
||||
Input: ปฏิญญาสากลว่าด้วยสิทธิมนุษยชนนี้
|
||||
Output: |ปฏิญญา|สากลว่า|ด้วย|สิทธิ|มนุษยชน|นี้|
|
||||
Input: เป็นมาตรฐานร่วมกันแห่งความสำเร็จสำหรับบรรดาประชากรและประชาชาติทั้งหลาย
|
||||
Output: |เป็น|มาตรฐาน|ร่วม|กัน|แห่ง|ความ|สำเร็จ|สำหรับ|บรรดา|ประชากร|และ|ประชาชาติ|ทั้ง|หลาย|
|
||||
Input: เพื่อจุดหมายปลายทางที่ว่า
|
||||
Output: |เพื่อ|จุดหมาย|ปลาย|ทาง|ที่|ว่า|
|
||||
Input: เอกชนทุกคนและองค์การชองสังคมทุกองค์การ
|
||||
Output: |เอกชน|ทุก|คน|และ|องค์|การ|ชอง|สังคม|ทุก|องค์|การ|
|
||||
Input: โดยการรำลึกถึงปฏิญญานี้เป็นเนืองนิจ
|
||||
Output: |โดย|การ|รำลึก|ถึง|ปฏิญญา|นี้|เป็น|เนือง|นิจ|
|
||||
Input: จะบากบั่นพยายามด้วยการสอนและศึกษา
|
||||
Output: |จะ|บาก|บั่นพยายาม|ด้วย|การ|สอน|และ|ศึกษา|
|
||||
Input: ในอันที่จะส่งเสริมการเคารพสิทธิและอิสรภาพเหล่านี้
|
||||
Output: |ใน|อัน|ที่|จะ|ส่ง|เสริม|การ|เคารพ|สิทธิ|และ|อิสรภาพ|เหล่า|นี้|
|
||||
Input: และด้วยมาตรการอันก้าวหน้าทั้งในประเทศและระหว่างประเทศ
|
||||
Output: |และ|ด้วย|มาตรการ|อัน|ก้าว|หน้า|ทั้ง|ใน|ประเทศ|และ|ระหว่าง|ประเทศ|
|
||||
Input: ในอันที่จะให้มีการยอมรับนับถือ
|
||||
Output: |ใน|อัน|ที่|จะ|ให้|มี|การ|ยอม|รับ|นับ|ถือ|
|
||||
Input: และการปฏิบัติตามโดยสากลและอย่างเป็นผลจริงจัง
|
||||
Output: |และ|การ|ปฏิบัติ|ตาม|โดย|สากล|และ|อย่าง|เป็น|ผล|จริง|จัง|
|
||||
Input: ทั้งในบรรดาประชาชนของรัฐสมาชิกด้วยกันเอง
|
||||
Output: |ทั้ง|ใน|บรรดา|ประชาชน|ของ|รัฐสมา|ชิก|ด้วย|กัน|เอง|
|
||||
Input: และในบรรดาประชาชนของดินแดนที่อยู่ใตัอำนาจของรัฐนั้น
|
||||
Output: |และ|ใน|บรรดา|ประชาชน|ของ|ดิน|แดน|ที่|อยู่|ใตัอำนาจ|ของ|รัฐ|นั้น|
|
||||
Input: ๆ
|
||||
Output: |ๆ|
|
13509
icu4c/source/test/testdata/Thai_graphclust_model4_heavy.txt
vendored
Normal file
13509
icu4c/source/test/testdata/Thai_graphclust_model4_heavy.txt
vendored
Normal file
File diff suppressed because it is too large
Load diff
96
icu4c/source/test/testdata/Thai_graphclust_model4_heavy_Test.txt
vendored
Normal file
96
icu4c/source/test/testdata/Thai_graphclust_model4_heavy_Test.txt
vendored
Normal file
|
@ -0,0 +1,96 @@
|
|||
# Copyright (C) 2018 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
Model: Thai_graphclust_model4_heavy
|
||||
Embedding: grapheme_clusters_tf
|
||||
Input: ปฏิญญาสากลว่าด้วยสิทธิมนุษยชน
|
||||
Output: |ปฏิญญา|สากลว่า|ด้วย|สิทธิ|มนุ|ษย|ชน|
|
||||
Input: คำปรารภ
|
||||
Output: คำปราร|ภ|
|
||||
Input: โดยที่การยอมรับนับถือเกียรติศักดิ์ประจำตัว
|
||||
Output: |โดย|ที่|การ|ยอม|รับ|นับถือเกียรติ|ศักดิ์|ประจำ|ตัว|
|
||||
Input: และสิทธิเท่าเทียมกันและโอนมิได้ของบรรดา
|
||||
Output: |และ|สิทธิ|เท่า|เทีย|มกัน|และ|โอน|มิได้|ของ|บรรดา|
|
||||
Input: สมาชิก
|
||||
Output: |สมาชิก|
|
||||
Input: ทั้ง
|
||||
Output: ทั้ง|
|
||||
Input: หลายแห่งครอบครัว
|
||||
Output: |หลาย|แห่ง|ครอบ|ครัว|
|
||||
Input: มนุษย์เป็นหลักมูลเหตุแห่งอิสรภาพ
|
||||
Output: ม|นุ|ษย์|เป็น|หลักมูล|เหตุแห่งอิ|สรภาพ|
|
||||
Input: ความยุติธรรม
|
||||
Output: |ความ|ยุติธรรม|
|
||||
Input: และสันติภาพในโลก
|
||||
Output: |และ|สันติภาพ|ใน|โลก|
|
||||
Input: โดยที่การไม่นำพาและการเหยียดหยามต่อสิทธิมนุษยชน
|
||||
Output: |โดย|ที่|การ|ไม่|นำ|พา|และ|การ|เหยียด|หยาม|ต่อ|สิทธิ|มนุ|ษย|ชน|
|
||||
Input: ยังผลให้มีการหระทำอันป่าเถื่อน
|
||||
Output: |ยังผล|ให้|มี|การ|หระทำ|อัน|ป่า|เถื่อน|
|
||||
Input: ซี่งเป็นการละเมิดมโนธรรมของมนุษยชาติอย่างร้ายแรง
|
||||
Output: |ซี่ง|เป็น|การ|ละเมิดม|โนธรรม|ของ|มนุษย|ชาติ|อย่าง|ร้าย|แรง|
|
||||
Input: และใต้
|
||||
Output: |และ|ใต้|
|
||||
Input: ได้
|
||||
Output: |ได้|
|
||||
Input: มีการประกาศว่า
|
||||
Output: |มี|การ|ประกาศ|ว่า|
|
||||
Input: ปณิธานสูงสุดของสามัญชนได้แก่ความต้องการให้มนุษย์มีชีวิตอยู่ในโลกด้วยอิสรภาพในการพูด
|
||||
Output: |ปณิธา|นสูงสุด|ของ|สามัญชน|ได้|แก่|ความ|ต้อง|การ|ให้|ม|นุษย์|มี|ชีวิต|อยู่|ใน|โลก|ด้วยอิ|สรภาพ|ใน|การ|พูด|
|
||||
Input: และความเชื่อถือ
|
||||
Output: |และ|ความ|เชื่อถือ|
|
||||
Input: และอิสรภาพพ้นจากความหวาดกลัวและความต้องการ
|
||||
Output: และอิ|สรภาพพ้น|จาก|ความ|หวาดกลัว|และ|ความ|ต้องการ|
|
||||
Input: โดยที่เป็นการจำเป็นอย่างยิ่งที่สิทธิมนุษยชนควรได้รับความคุ้มครองโดยหลักบังคับของกฎหมาย
|
||||
Output: |โดย|ที่|เป็น|การ|จำเป็น|อย่าง|ยิ่งที่|สิทธิม|นุ|ษย|ชน|ควร|ได้|รับ|ความ|คุ้มครอง|โดย|หลักบัง|คับ|ของ|กฎหมา|ย|
|
||||
Input: ถ้าไม่ประสงค์จะให้คนตกอยู่ในบังคับให้หันเข้าหาการขบถขัดขืนต่อทรราชและการกดขี่เป็นวิถีทางสุดท้าย
|
||||
Output: |ถ้า|ไม่|ประสงค์|จะ|ให้|คน|ตก|อยู่|ใน|บังคับ|ให้|หั|นเข้า|หา|การ|ขบ|ถขัด|ขืน|ต่อ|ทรราช|และ|การ|กดขี่|เป็น|วิ|ถี|ทาง|สุดท้าย|
|
||||
Input: โดยที่ประชากรแห่งสหประชาชาติได้ยืนยันไว้ในกฎบัตรถึงความเชื่อมั่นในสิทธิมนุษยชนอันเป็นหลักมูล
|
||||
Output: |โดย|ที่|ประชากร|แห่ง|สหประชาชาติ|ได้|ยืนยัน|ไว้|ใน|กฎบัตร|ถึง|ความ|เชื่อมั่น|ใน|สิทธิ|มนุ|ษย|ชน|อัน|เป็น|หลักมูล|
|
||||
Input: ในเกียรติศักดิ์และคุณค่าของมนุษย์และในสิทธิเท่าเทียมกันของบรรดาชายและหญิง
|
||||
Output: |ใน|เกียรติ|ศักดิ์|และ|คุณค่า|ของ|มนุษย์|และ|ใน|สิทธิ|เท่า|เทีย|มกัน|ของ|บรรดา|ชาย|และ|หญิง|
|
||||
Input: และได้ตกลงใจที่จะส่งเสริมความก้าวหน้าทางสังคม
|
||||
Output: |และ|ได้|ตกลงใจ|ที่|จะ|ส่ง|เสริม|ความ|ก้าว|หน้าทาง|สังคม|
|
||||
Input: และมาตรฐานแห่งชีวิตที่ดีขึ้นด้วยในอิสรภาพ
|
||||
Output: |และ|มาตรฐาน|แห่งชีวิต|ที่|ดี|ขึ้น|ด้วย|ในอิ|สรภาพ|
|
||||
Input: อันกว้างขวางยิ่งขึ้น
|
||||
Output: |อัน|กว้าง|ขวาง|ยิ่ง|ขึ้น|
|
||||
Input: โดยที่รัฐสมาชิกต่างปฎิญาณจะให้บรรลุถึงซึ่งการส่งเสริมการเคารพและการปฎิบัติตามทั่วสากลต่อสิทธิมนุษยชนและอิสรภาพหลักมูล
|
||||
Output: |โดย|ที่|รัฐส|มา|ชิก|ต่าง|ปฎิญาณ|จะ|ให้|บรรลุ|ถึง|ซึ่ง|การ|ส่ง|เสริม|การ|เคา|รพ|และ|การ|ปฎิบัติ|ตา|มทั่วสาก|ล|ต่อ|สิทธิม|นุ|ษย|ชนและอิ|สรภาพ|หลักมูล|
|
||||
Input: โดยร่วมมือกับสหประชาชาติ
|
||||
Output: |โดย|ร่วมมือ|กับ|สหประชาชาติ|
|
||||
Input: โดยที่ความเข้าใจร่วมกันในสิทธิ
|
||||
Output: |โดย|ที่|ความ|เข้าใจ|ร่วม|กัน|ใน|สิทธิ|
|
||||
Input: และอิสรภาพเหล่านี้เป็นสิ่งสำคัญอย่างยิ่ง
|
||||
Output: และอิ|สรภาพ|เหล่า|นี้|เป็น|สิ่ง|สำคัญ|อย่าง|ยิ่ง|
|
||||
Input: เพื่อให้ปฏิญาณนี้สำเร็จผลเต็มบริบูรณ์
|
||||
Output: |เพื่อ|ให้|ปฏิญาณ|นี้|สำเร็จผล|เต็ม|บริบูรณ์|
|
||||
Input: ฉะนั้น
|
||||
Output: ฉะนั้น|
|
||||
Input: บัดนี้สมัชชาจึงประกาศว่า
|
||||
Output: |บัด|นี้|สมัชชา|จึง|ประกาศ|ว่า|
|
||||
Input: ปฏิญญาสากลว่าด้วยสิทธิมนุษยชนนี้
|
||||
Output: |ปฏิญญา|สากลว่า|ด้วย|สิทธิ|มนุ|ษย|ชน|นี้|
|
||||
Input: เป็นมาตรฐานร่วมกันแห่งความสำเร็จสำหรับบรรดาประชากรและประชาชาติทั้งหลาย
|
||||
Output: |เป็น|มาตรฐาน|ร่วม|กัน|แห่ง|ความ|สำเร็จ|สำหรับ|บรรดา|ประชากร|และ|ประชาชาติ|ทั้งหลา|ย|
|
||||
Input: เพื่อจุดหมายปลายทางที่ว่า
|
||||
Output: |เพื่อจุดหมาย|ปลาย|ทาง|ที่|ว่า|
|
||||
Input: เอกชนทุกคนและองค์การชองสังคมทุกองค์การ
|
||||
Output: |เอกชน|ทุก|คน|และ|องค์การ|ชอง|สังคม|ทุกองค์การ|
|
||||
Input: โดยการรำลึกถึงปฏิญญานี้เป็นเนืองนิจ
|
||||
Output: |โดย|การ|รำลึก|ถึง|ปฏิญญานี้|เป็น|เนือง|นิ|จ|
|
||||
Input: จะบากบั่นพยายามด้วยการสอนและศึกษา
|
||||
Output: |จะ|บาก|บั่น|พยายาม|ด้วย|การ|สอน|และ|ศึก|ษา|
|
||||
Input: ในอันที่จะส่งเสริมการเคารพสิทธิและอิสรภาพเหล่านี้
|
||||
Output: |ใน|อัน|ที่|จะ|ส่ง|เสริม|การ|เคารพ|สิทธิ|และอิ|สรภาพ|เหล่า|นี้|
|
||||
Input: และด้วยมาตรการอันก้าวหน้าทั้งในประเทศและระหว่างประเทศ
|
||||
Output: |และ|ด้วย|มาตรการ|อัน|ก้าว|หน้าทั้ง|ใน|ประเทศ|และ|ระหว่าง|ประเทศ|
|
||||
Input: ในอันที่จะให้มีการยอมรับนับถือ
|
||||
Output: |ใน|อัน|ที่|จะ|ให้|มี|การ|ยอม|รับ|นับถือ|
|
||||
Input: และการปฏิบัติตามโดยสากลและอย่างเป็นผลจริงจัง
|
||||
Output: |และ|การ|ปฏิบัติตาม|โดย|สากล|และ|อย่าง|เป็นผล|จริง|จัง|
|
||||
Input: ทั้งในบรรดาประชาชนของรัฐสมาชิกด้วยกันเอง
|
||||
Output: ทั้ง|ใน|บรรดา|ประชาชน|ของ|รัฐส|มาชิก|ด้วย|กัน|เอง|
|
||||
Input: และในบรรดาประชาชนของดินแดนที่อยู่ใตัอำนาจของรัฐนั้น
|
||||
Output: |และ|ใน|บรรดา|ประชาชน|ของ|ดินแดน|ที่|อยู่|ใตัอำนาจ|ของ|รัฐนั้น|
|
||||
Input: ๆ
|
||||
Output: |ๆ|
|
Loading…
Add table
Reference in a new issue