This commit is contained in:
Steven R. Loomis 2025-04-03 16:04:08 -07:00 committed by GitHub
commit 676a809a5e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 132 additions and 3 deletions

View file

@ -161,11 +161,14 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
}
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char* locale) {
UErrorCode status = U_ZERO_ERROR;
UScriptCode code = uscript_getScript(c, &status);
if (U_SUCCESS(status)) {
const LanguageBreakEngine *engine = nullptr;
if (DictionaryBreakEngine::suppressScriptBreak(locale, code)) {
return nullptr; // -u-dx was requested
}
// Try to use LSTM first
const LSTMData *data = CreateLSTMDataForScript(code, status);
if (U_SUCCESS(status)) {
@ -186,7 +189,7 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
DictionaryMatcher *m = loadDictionaryMatcherFor(code);
if (m != nullptr) {
switch(code) {
case USCRIPT_THAI:
case USCRIPT_THAI:
engine = new ThaiBreakEngine(m, status);
break;
case USCRIPT_LAO:

View file

@ -27,6 +27,7 @@
#include "uassert.h"
#include "unicode/normlzr.h"
#include "cmemory.h"
#include "cstring.h"
#include "dictionarydata.h"
U_NAMESPACE_BEGIN
@ -42,7 +43,11 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
}
UBool
DictionaryBreakEngine::handles(UChar32 c, const char*) const {
DictionaryBreakEngine::handles(UChar32 c, const char* locale) const {
    // Reports whether this engine should process character c for the given
    // locale: c must be in the engine's character set AND its script must
    // not be excluded by the locale's -u-dx keyword.
    //
    // Set membership is tested first. suppressScriptBreak() re-reads the
    // locale keyword on every call, so characters outside the set (the
    // common case) now skip that work. The result is the same as testing
    // suppression first, because suppressScriptBreak() only touches its
    // own local buffer.
    if (!fSet.contains(c)) {
        return false;
    }
    // In the set, but possibly suppressed by ID.
    return !DictionaryBreakEngine::suppressScriptBreak(locale, c);
}
@ -85,6 +90,40 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
fSet.compact();
}
/**
 * Reports whether dictionary-based breaking for a script is switched off by
 * the "dx" keyword of a locale ID (the -u-dx extension of a language tag).
 *
 * The keyword value is scanned as a list of 4-letter script codes separated
 * by hyphens, e.g. "thai-laoo". Each entry therefore occupies 5 bytes (4
 * letters plus separator), except the last one.
 *
 * @param locale locale ID handed to uloc_getKeywordValue()
 * @param code   script to look for in the list
 * @return true if the list contains "zyyy" (which matches every script) or
 *         the short name of code; false otherwise, including when the
 *         keyword value cannot be retrieved.
 */
UBool DictionaryBreakEngine::suppressScriptBreak(const char *locale, UScriptCode code) {
    // get the keyword value
    UErrorCode status = U_ZERO_ERROR;
    char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
    const int32_t len = uloc_getKeywordValue(locale, "dx", buf, ULOC_KEYWORD_AND_VALUES_CAPACITY, &status);
    if (U_FAILURE(status)) {
        return false;
    }

    // Loop-invariant: the 4-letter short name of the script being queried.
    const char *shortName = uscript_getShortName(code);

    // loop over the keyword values, one 5-byte slot at a time
    for (int32_t i = 0; i < len; i += 5) {
        // Turn the separator into a terminator (in buffer):
        // 'hira-kata' -> 'hira\0kata'.
        // Only look at the separator position when it lies inside the
        // returned value; for a short or malformed trailing entry the byte
        // at i+4 was never written by uloc_getKeywordValue().
        if (i + 4 < len && buf[i + 4] == '-') {
            buf[i + 4] = 0;
        } // else: last entry, or possibly malformed - let the match fail

        const char *scriptName = buf + i;
        // Both comparisons ignore case, so a value that was not lower-cased
        // on the way in (e.g. "Zyyy") is treated like a specific script code
        // such as "Thai".
        if (!uprv_strnicmp(scriptName, "zyyy", 4)) {
            return true; // matched 'all'
        }
        if (!uprv_strnicmp(scriptName, shortName, 4)) {
            return true; // matched the specific script
        }
    }
    return false;
}
/**
 * Character-based convenience overload: looks up the script of c and asks
 * the script-based overload whether that script is suppressed by the locale.
 *
 * @param locale locale ID to inspect
 * @param c      code point whose script is tested
 * @return the script-based overload's answer, or false if the script of c
 *         could not be determined.
 */
UBool DictionaryBreakEngine::suppressScriptBreak(const char *locale, UChar32 c) {
    UErrorCode errorCode = U_ZERO_ERROR;
    const UScriptCode script = uscript_getScript(c, &errorCode);
    if (U_SUCCESS(errorCode)) {
        return suppressScriptBreak(locale, script);
    }
    // No script available for this code point: nothing to suppress.
    return false;
}
/*
******************************************************************
* PossibleWord

View file

@ -113,6 +113,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
UBool isPhraseBreaking,
UErrorCode& status) const = 0;
public:
/**
 * Checks the locale's "dx" keyword (-u-dx) for the given script.
 * @param locale locale ID to inspect
 * @param code script to test
 * @returns true if the specified code is suppressed by the specified locale, -u-dx
 */
static UBool suppressScriptBreak(const char *locale, UScriptCode code);
/**
 * Checks the locale's "dx" keyword (-u-dx) for the script of the given character.
 * @param locale locale ID to inspect
 * @param c code point whose script is tested
 * @returns true if the specified char is suppressed by the specified locale, -u-dx
 */
static UBool suppressScriptBreak(const char *locale, UChar32 c);
};
/*******************************************************************

View file

@ -25,10 +25,12 @@
#include "unicode/ustring.h"
#include "unicode/utext.h"
#include "cmemory.h"
#include "dictbe.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/filteredbrk.h"
#include <stdio.h> // for snprintf
#endif
#include <unicode/uscript.h>
/**
* API Test the RuleBasedBreakIterator class
*/
@ -1438,6 +1440,56 @@ void RBBIAPITest::TestFilteredBreakIteratorBuilder() {
#endif
}
/**
 * Test helper: converts a language tag into a locale ID using
 * uloc_forLanguageTag(), writing the result into the caller's buffer.
 *
 * @param buf    output buffer of ULOC_FULLNAME_CAPACITY chars
 * @param locale language tag to convert; may be null
 * @return buf, or null when locale is null. A conversion failure is reported
 *         as a test failure labelled with the error name; buf is still returned.
 */
const char *RBBIAPITest::forLangTag(char buf[ULOC_FULLNAME_CAPACITY], const char *locale) {
    if (locale != nullptr) {
        int32_t consumed;
        UErrorCode errorCode = U_ZERO_ERROR;
        uloc_forLanguageTag(locale, buf, ULOC_FULLNAME_CAPACITY, &consumed, &errorCode);
        // the conversion is expected to succeed
        assertFalse(u_errorName(errorCode), U_FAILURE(errorCode));
        return buf;
    }
    // nothing to convert: hand the null straight back
    return nullptr;
}
/**
 * Exercises DictionaryBreakEngine::suppressScriptBreak() with locale IDs
 * produced from language tags carrying the -u-dx extension.
 */
void RBBIAPITest::TestSuppressDictionary() {
// scratch buffer reused by every forLangTag() call below
char buf[ULOC_FULLNAME_CAPACITY];
// sanity checks of our internal function
{
// a plain language tag comes back unchanged
const char *t = forLangTag(buf, "en");
assertEquals(WHERE, "en", t);
}
{
// -u-dx-Thai becomes the lower-cased keyword dx=thai
const char *t = forLangTag(buf, "en-u-dx-Thai");
assertEquals(WHERE, "en@dx=thai", t);
}
{
// several scripts stay hyphen-separated inside the keyword value
const char *t = forLangTag(buf, "sss-u-dx-Thai-Laoo");
assertEquals(WHERE, "sss@dx=thai-laoo", t);
}
// null locale: expected not to suppress
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(nullptr, USCRIPT_COMMON));
// a single script in the list suppresses only that script
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai"), USCRIPT_THAI));
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai"), USCRIPT_HANGUL));
// Zyyy suppresses every script
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_COMMON));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_THAI));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_HANGUL));
// two scripts in the list: both are suppressed, others are not
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_THAI));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_LAO));
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_HANGUL));
// Zyyy matches regardless of its position in the list, and when another keyword follows
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Laoo-Zyyy"), USCRIPT_THAI));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo"), USCRIPT_THAI));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo-tz-gblon"), USCRIPT_THAI));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo"), USCRIPT_COMMON));
// try where there's no -u-dx
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh"), USCRIPT_MYANMAR));
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh-t-k0-plqdkbd"), USCRIPT_MYANMAR));
// NOTE(review): the trailing hyphen looks like a deliberately malformed tag - confirm
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh-u-tz-gblon-"), USCRIPT_MYANMAR));
}
//---------------------------------------------
// runIndexedTest
//---------------------------------------------
@ -1469,6 +1521,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
#if !UCONFIG_NO_BREAK_ITERATION
TESTCASE_AUTO(TestFilteredBreakIteratorBuilder);
#endif
TESTCASE_AUTO(TestSuppressDictionary);
TESTCASE_AUTO_END;
}

View file

@ -91,6 +91,8 @@ public:
void TestRefreshInputText();
/** Tests suppression of dictionary breaking via the -u-dx locale keyword. */
void TestSuppressDictionary();
/**
*Internal subroutines
**/
@ -100,6 +102,8 @@ public:
/*Internal subroutine used for comparison of expected and acquired results */
void doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expected);
/**
 * Helper: convert the language tag to a locale ID.
 * @param buf output buffer of ULOC_FULLNAME_CAPACITY chars that receives the locale ID
 * @param locale language tag to convert; a null pointer is returned as-is
 * @return buf, or null if locale was null
 */
const char *forLangTag(char buf[ULOC_FULLNAME_CAPACITY], const char *locale);
};

View file

@ -1516,6 +1516,31 @@ Bangkok)•</data>
#
####################################################################################
# -u-dx (exclude script)
#<locale th
#<line>
<locale sss@dx=thai>
<line>
# Should no longer break at the dictionary points - it's not the Thai language
# Short Test
<data>•โอํน• อะไป •จู่วาม •โล่น•</data>
#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
#<word>
# Should no longer break at the dictionary points - it's not the Thai language
#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
#<locale sss@dx=zyyy>
#<line>
# Should no longer break at the dictionary points - it's not the Thai language
#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
#<word>
# Should no longer break at the dictionary points - it's not the Thai language
#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
# Japanese line break tailoring test
<locale ja>