mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
Merge c140c6dde5
into 770c4b8042
This commit is contained in:
commit
676a809a5e
6 changed files with 132 additions and 3 deletions
|
@ -161,11 +161,14 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
|
|||
}
|
||||
|
||||
const LanguageBreakEngine *
|
||||
ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
|
||||
ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char* locale) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UScriptCode code = uscript_getScript(c, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
const LanguageBreakEngine *engine = nullptr;
|
||||
if (DictionaryBreakEngine::suppressScriptBreak(locale, code)) {
|
||||
return nullptr; // -u-dx was requested
|
||||
}
|
||||
// Try to use LSTM first
|
||||
const LSTMData *data = CreateLSTMDataForScript(code, status);
|
||||
if (U_SUCCESS(status)) {
|
||||
|
@ -186,7 +189,7 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
|
|||
DictionaryMatcher *m = loadDictionaryMatcherFor(code);
|
||||
if (m != nullptr) {
|
||||
switch(code) {
|
||||
case USCRIPT_THAI:
|
||||
case USCRIPT_THAI:
|
||||
engine = new ThaiBreakEngine(m, status);
|
||||
break;
|
||||
case USCRIPT_LAO:
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
#include "uassert.h"
|
||||
#include "unicode/normlzr.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "dictionarydata.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
@ -42,7 +43,11 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
|
|||
}
|
||||
|
||||
/**
 * Reports whether this engine handles the character c for the given locale.
 * A character whose script is suppressed by the locale (-u-dx) is never
 * handled; otherwise the answer is membership in this engine's character set.
 */
UBool
DictionaryBreakEngine::handles(UChar32 c, const char* locale) const {
    const UBool suppressedByLocale = DictionaryBreakEngine::suppressScriptBreak(locale, c);
    if (suppressedByLocale) {
        // suppressed by ID
        return false;
    }
    return fSet.contains(c);
}
|
||||
|
||||
|
@ -85,6 +90,40 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
|
|||
fSet.compact();
|
||||
}
|
||||
|
||||
/**
 * Tells whether dictionary-based breaking for a script is turned off by the
 * "dx" keyword of a locale ID (the "-u-dx-" extension of a language tag).
 *
 * The keyword value is scanned as a '-' separated list of 4-letter script
 * codes, for example "thai-laoo". The code "zyyy" suppresses every script.
 * Subtags that are not exactly four characters long are ignored.
 *
 * @param locale locale ID whose "dx" keyword is examined.
 * @param code   script to look for in the keyword value.
 * @return true if the value lists "zyyy" or the short name of code;
 *         false otherwise, including when the keyword value cannot be read.
 */
UBool DictionaryBreakEngine::suppressScriptBreak(const char *locale, UScriptCode code) {
    static constexpr int32_t kScriptCodeLength = 4;

    // get the keyword value
    UErrorCode status = U_ZERO_ERROR;
    char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
    const int32_t len = uloc_getKeywordValue(locale, "dx", buf, ULOC_KEYWORD_AND_VALUES_CAPACITY, &status);
    if (U_FAILURE(status) || len <= 0) {
        return false;
    }

    // Scan only the bytes that were actually written. The value may fill the
    // buffer without a terminating NUL, so the scan never relies on one.
    const char *const limit =
        buf + (len < ULOC_KEYWORD_AND_VALUES_CAPACITY ? len : ULOC_KEYWORD_AND_VALUES_CAPACITY);

    // May be nullptr when code is not a valid script code.
    const char *const shortName = uscript_getShortName(code);

    // loop over the keyword values, one '-' delimited subtag at a time
    const char *subtag = buf;
    while (subtag < limit) {
        const char *subtagEnd = subtag;
        while (subtagEnd < limit && *subtagEnd != '-') {
            ++subtagEnd;
        }
        // Only whole 4-letter subtags can name a script; this also prevents a
        // longer subtag such as "thaix" from matching on its first 4 letters.
        if (subtagEnd - subtag == kScriptCodeLength) {
            if (uprv_strnicmp(subtag, "zyyy", kScriptCodeLength) == 0) {
                return true;  // matched 'all'
            }
            if (shortName != nullptr && uprv_strnicmp(subtag, shortName, kScriptCodeLength) == 0) {
                return true;  // matched the specific script
            }
        }
        subtag = subtagEnd + 1;  // step over the '-' (or past the end)
    }
    return false;
}
|
||||
|
||||
/**
 * Character overload: looks up the script of c and delegates to the
 * script-code overload. Returns false when the script lookup fails.
 */
UBool DictionaryBreakEngine::suppressScriptBreak(const char *locale, UChar32 c) {
    UErrorCode lookupStatus = U_ZERO_ERROR;
    const UScriptCode script = uscript_getScript(c, &lookupStatus);
    if (U_SUCCESS(lookupStatus)) {
        return suppressScriptBreak(locale, script);
    }
    return false;
}
|
||||
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
* PossibleWord
|
||||
|
|
|
@ -113,6 +113,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
|
|||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const = 0;
|
||||
|
||||
public:
|
||||
/** @returns true if dictionary-based breaking for the specified script code is suppressed by the -u-dx keyword of the specified locale */
|
||||
static UBool suppressScriptBreak(const char *locale, UScriptCode code);
|
||||
/** @returns true if dictionary-based breaking for the script of the specified character is suppressed by the -u-dx keyword of the specified locale */
|
||||
static UBool suppressScriptBreak(const char *locale, UChar32 c);
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
|
|
|
@ -25,10 +25,12 @@
|
|||
#include "unicode/ustring.h"
|
||||
#include "unicode/utext.h"
|
||||
#include "cmemory.h"
|
||||
#include "dictbe.h"
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
#include "unicode/filteredbrk.h"
|
||||
#include <stdio.h> // for snprintf
|
||||
#endif
|
||||
#include <unicode/uscript.h>
|
||||
/**
|
||||
* API Test the RuleBasedBreakIterator class
|
||||
*/
|
||||
|
@ -1438,6 +1440,56 @@ void RBBIAPITest::TestFilteredBreakIteratorBuilder() {
|
|||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** helper function for testing*/
|
||||
/**
 * Test helper: converts a BCP 47 language tag into an ICU locale ID.
 *
 * @param buf    caller-supplied buffer that receives the locale ID.
 * @param locale language tag to convert; may be nullptr.
 * @return nullptr if locale is nullptr, otherwise buf. If the conversion
 *         fails or the result does not fit with its terminating NUL, a test
 *         failure is recorded and buf holds an empty string.
 */
const char *RBBIAPITest::forLangTag(char buf[ULOC_FULLNAME_CAPACITY], const char *locale) {
    if (locale == nullptr) {
        return nullptr;
    }
    UErrorCode status = U_ZERO_ERROR;
    // The parsed length is not needed here, so no out-parameter is passed.
    uloc_forLanguageTag(locale, buf, ULOC_FULLNAME_CAPACITY, nullptr, &status);
    // An unterminated result is as unusable to callers as an outright failure.
    const UBool failed = U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING;
    // verify no err
    assertFalse(u_errorName(status), failed);
    if (failed) {
        buf[0] = 0;  // never hand back an unterminated or unspecified buffer
    }
    return buf;
}
|
||||
|
||||
void RBBIAPITest::TestSuppressDictionary() {
|
||||
char buf[ULOC_FULLNAME_CAPACITY];
|
||||
|
||||
// sanity checks of our internal function
|
||||
{
|
||||
const char *t = forLangTag(buf, "en");
|
||||
assertEquals(WHERE, "en", t);
|
||||
}
|
||||
{
|
||||
const char *t = forLangTag(buf, "en-u-dx-Thai");
|
||||
assertEquals(WHERE, "en@dx=thai", t);
|
||||
}
|
||||
{
|
||||
const char *t = forLangTag(buf, "sss-u-dx-Thai-Laoo");
|
||||
assertEquals(WHERE, "sss@dx=thai-laoo", t);
|
||||
}
|
||||
|
||||
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(nullptr, USCRIPT_COMMON));
|
||||
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai"), USCRIPT_THAI));
|
||||
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai"), USCRIPT_HANGUL));
|
||||
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_COMMON));
|
||||
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_THAI));
|
||||
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_HANGUL));
|
||||
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_THAI));
|
||||
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_LAO));
|
||||
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_HANGUL));
|
||||
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Laoo-Zyyy"), USCRIPT_THAI));
|
||||
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo"), USCRIPT_THAI));
|
||||
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo-tz-gblon"), USCRIPT_THAI));
|
||||
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo"), USCRIPT_COMMON));
|
||||
|
||||
// try where there's no -u-dx
|
||||
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh"), USCRIPT_MYANMAR));
|
||||
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh-t-k0-plqdkbd"), USCRIPT_MYANMAR));
|
||||
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh-u-tz-gblon-"), USCRIPT_MYANMAR));
|
||||
}
|
||||
|
||||
//---------------------------------------------
|
||||
// runIndexedTest
|
||||
//---------------------------------------------
|
||||
|
@ -1469,6 +1521,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
|
|||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
TESTCASE_AUTO(TestFilteredBreakIteratorBuilder);
|
||||
#endif
|
||||
TESTCASE_AUTO(TestSuppressDictionary);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
|
|
@ -91,6 +91,8 @@ public:
|
|||
|
||||
void TestRefreshInputText();
|
||||
|
||||
void TestSuppressDictionary();
|
||||
|
||||
/**
|
||||
*Internal subroutines
|
||||
**/
|
||||
|
@ -100,6 +102,8 @@ public:
|
|||
/*Internal subroutine used for comparison of expected and acquired results */
|
||||
void doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expected);
|
||||
|
||||
/** Helper: convert the language tag */
|
||||
const char *forLangTag(char buf[ULOC_FULLNAME_CAPACITY], const char *locale);
|
||||
|
||||
};
|
||||
|
||||
|
|
25
icu4c/source/test/testdata/rbbitst.txt
vendored
25
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -1516,6 +1516,31 @@ Bangkok)•</data>
|
|||
#
|
||||
####################################################################################
|
||||
|
||||
# -u-dx (exclude script)
|
||||
#<locale th>
|
||||
|
||||
|
||||
#<line>
|
||||
|
||||
<locale sss@dx=thai>
|
||||
<line>
|
||||
# Should no longer break at the dictionary points - it's not the Thai language
|
||||
# Short Test
|
||||
<data>•โอํน• อะไป •จู่วาม •โล่น•</data>
|
||||
#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
|
||||
#<word>
|
||||
# Should no longer break at the dictionary points - it's not the Thai language
|
||||
#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
|
||||
|
||||
#<locale sss@dx=zyyy>
|
||||
#<line>
|
||||
# Should no longer break at the dictionary points - it's not Thai language
|
||||
#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
|
||||
#<word>
|
||||
# Should no longer break at the dictionary points - it's not the Thai language
|
||||
#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
|
||||
|
||||
|
||||
# Japanese line break tailoring test
|
||||
|
||||
<locale ja>
|
||||
|
|
Loading…
Add table
Reference in a new issue