Merge c140c6dde5 into 770c4b8042

2025-04-10 07:39:16 +00:00 · 2025-04-03 16:04:08 -07:00 · 2025-04-03 16:04:08 -07:00 · 676a809a5e
commit 676a809a5e
parent 770c4b8042 c140c6dde5
6 changed files with 132 additions and 3 deletions
--- a/icu4c/source/common/brkeng.cpp
+++ b/icu4c/source/common/brkeng.cpp
@ -161,11 +161,14 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
 }

 const LanguageBreakEngine *
-ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
+ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char* locale) {
    UErrorCode status = U_ZERO_ERROR;
    UScriptCode code = uscript_getScript(c, &status);
    if (U_SUCCESS(status)) {
        const LanguageBreakEngine *engine = nullptr;
+        if (DictionaryBreakEngine::suppressScriptBreak(locale, code)) {
+            return nullptr; // -u-dx was requested
+        }
        // Try to use LSTM first
        const LSTMData *data = CreateLSTMDataForScript(code, status);
        if (U_SUCCESS(status)) {
@ -186,7 +189,7 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
        DictionaryMatcher *m = loadDictionaryMatcherFor(code);
        if (m != nullptr) {
            switch(code) {
-            case USCRIPT_THAI:
+            case USCRIPT_THAI:                
                engine = new ThaiBreakEngine(m, status);
                break;
            case USCRIPT_LAO:
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@ -27,6 +27,7 @@
 #include "uassert.h"
 #include "unicode/normlzr.h"
 #include "cmemory.h"
+#include "cstring.h"
 #include "dictionarydata.h"

 U_NAMESPACE_BEGIN
@ -42,7 +43,11 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
 }

 UBool
-DictionaryBreakEngine::handles(UChar32 c, const char*) const {
+DictionaryBreakEngine::handles(UChar32 c, const char* locale) const {
+    if (DictionaryBreakEngine::suppressScriptBreak(locale, c)) {
+        // the "dx" keyword of the locale ID excludes this character's script: report it as not handled
+        return false;
+    }
     return fSet.contains(c);
 }

@ -85,6 +90,40 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
    fSet.compact();
 }

+// Returns true if the "dx" keyword of the locale ID lists `code` or "zyyy" (every script).
+// The value is read as 4-letter script codes joined by '-', e.g. "thai-laoo". A keyword that
+// cannot be read suppresses nothing; an entry not ending at '-' or at the value's end is skipped.
+UBool DictionaryBreakEngine::suppressScriptBreak(const char *locale, UScriptCode code) {
+    UErrorCode status = U_ZERO_ERROR;
+    char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
+    const int32_t len = uloc_getKeywordValue(locale, "dx", buf, ULOC_KEYWORD_AND_VALUES_CAPACITY, &status);
+    if (U_FAILURE(status)) return false;
+    const char *wanted = uscript_getShortName(code); // nullptr when code has no short name
+    // Each entry spans 5 bytes (4 letters + '-' or the end); only the len bytes written are read.
+    for (int32_t i = 0; i + 4 <= len; i += 5) {
+        if (i + 4 < len && buf[i+4] != '-') continue; // malformed entry, let match fail
+        const char *scriptName = buf + i;
+        if (uprv_strnicmp(scriptName, "zyyy", 4) == 0) {
+            return true; // matched 'all', in any letter case
+        }
+        if (wanted != nullptr && uprv_strnicmp(scriptName, wanted, 4) == 0) {
+            return true; // matched the specific script
+        }
+    }
+    return false;
+}
+
+UBool DictionaryBreakEngine::suppressScriptBreak(const char *locale, UChar32 c) {
+    // Map the character to its script, then defer to the UScriptCode overload.
+    UErrorCode ec = U_ZERO_ERROR;
+    const UScriptCode script = uscript_getScript(c, &ec);
+    if (U_SUCCESS(ec)) {
+        return suppressScriptBreak(locale, script);
+    }
+    return false; // script lookup failed, so nothing is suppressed
+}
+
+
 /*
 ******************************************************************
 * PossibleWord
--- a/icu4c/source/common/dictbe.h
+++ b/icu4c/source/common/dictbe.h
@ -113,6 +113,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
                                           UBool isPhraseBreaking,
                                           UErrorCode& status) const = 0;

+public:
+   /** @return true if the "dx" keyword (-u-dx) of the locale ID lists the given script code, or lists Zyyy */
+   static UBool suppressScriptBreak(const char *locale, UScriptCode code);
+   /** @return true if the script of the given character is suppressed per the UScriptCode overload; false if its script cannot be determined */
+   static UBool suppressScriptBreak(const char *locale, UChar32 c);
 };

 /*******************************************************************
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@ -25,10 +25,12 @@
 #include "unicode/ustring.h"
 #include "unicode/utext.h"
 #include "cmemory.h"
+#include "dictbe.h"
 #if !UCONFIG_NO_BREAK_ITERATION
 #include "unicode/filteredbrk.h"
 #include <stdio.h> // for snprintf
 #endif
+#include <unicode/uscript.h>
 /**
 * API Test the RuleBasedBreakIterator class
 */
@ -1438,6 +1440,56 @@ void RBBIAPITest::TestFilteredBreakIteratorBuilder() {
 #endif
 }

+
+
+/** Test helper: converts a language tag into a locale ID written to buf, and returns buf. */
+const char *RBBIAPITest::forLangTag(char buf[ULOC_FULLNAME_CAPACITY], const char *locale) {
+    if (locale == nullptr) return nullptr; // nothing to convert
+    int32_t parsedLength;
+    UErrorCode status = U_ZERO_ERROR;
+    uloc_forLanguageTag(locale, buf, ULOC_FULLNAME_CAPACITY, &parsedLength, &status);
+    // The conversion must not fail; a failure is reported under the error's name.
+    assertFalse(u_errorName(status), U_FAILURE(status));
+    return buf;
+}
+
+void RBBIAPITest::TestSuppressDictionary() {
+    // Checks DictionaryBreakEngine::suppressScriptBreak() with locale IDs converted from -u-dx language tags.
+    char buf[ULOC_FULLNAME_CAPACITY];
+    // sanity checks of forLangTag(): the subtags after -u-dx- are expected back lowercased under "@dx="
+    {
+        const char *t = forLangTag(buf, "en");
+        assertEquals(WHERE, "en", t);
+    }
+    {
+        const char *t = forLangTag(buf, "en-u-dx-Thai");
+        assertEquals(WHERE, "en@dx=thai", t);
+    }
+    {
+        const char *t = forLangTag(buf, "sss-u-dx-Thai-Laoo");
+        assertEquals(WHERE, "sss@dx=thai-laoo", t);
+    }
+    // a null locale ID first, then IDs listing one or more scripts; Zyyy is expected to suppress Thai and Hangul too
+    assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(nullptr, USCRIPT_COMMON));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai"), USCRIPT_THAI));
+    assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai"), USCRIPT_HANGUL));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_COMMON));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_THAI));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_HANGUL));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_THAI));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_LAO));
+    assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_HANGUL));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Laoo-Zyyy"), USCRIPT_THAI));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo"), USCRIPT_THAI));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo-tz-gblon"), USCRIPT_THAI));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo"), USCRIPT_COMMON));
+
+    // tags that carry no -u-dx keyword are expected to suppress nothing
+    assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh"), USCRIPT_MYANMAR));
+    assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh-t-k0-plqdkbd"), USCRIPT_MYANMAR));
+    assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh-u-tz-gblon-"), USCRIPT_MYANMAR));
+}
+
 //---------------------------------------------
 // runIndexedTest
 //---------------------------------------------
@ -1469,6 +1521,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
 #if !UCONFIG_NO_BREAK_ITERATION
    TESTCASE_AUTO(TestFilteredBreakIteratorBuilder);
 #endif
+    TESTCASE_AUTO(TestSuppressDictionary);
    TESTCASE_AUTO_END;
 }

--- a/icu4c/source/test/intltest/rbbiapts.h
+++ b/icu4c/source/test/intltest/rbbiapts.h
@ -91,6 +91,8 @@ public:

    void TestRefreshInputText();

+    void TestSuppressDictionary();
+
    /**
     *Internal subroutines
     **/
@ -100,6 +102,8 @@ public:
    /*Internal subroutine used for comparison of expected and acquired results */
    void doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expected);

+    /** Helper: converts the language tag into a locale ID stored in buf; returns buf, or nullptr when locale is nullptr */
+    const char *forLangTag(char buf[ULOC_FULLNAME_CAPACITY], const char *locale);

 };

--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@ -1516,6 +1516,31 @@ Bangkok)•</data>
 #
 ####################################################################################

+# -u-dx (exclude script)
+#<locale th>
+
+
+#<line>
+
+<locale sss@dx=thai>
+<line>
+# Should no longer break at the dictionary points - it's not the Thai language
+# Short Test
+<data>•โอํน• อะไป •จู่วาม •โล่น•</data>
+#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
+#<word>
+# Should no longer break at the dictionary points - it's not the Thai language
+#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
+
+#<locale sss@dx=zyyy>
+#<line>
+# Should no longer break at the dictionary points - it's not the Thai language
+#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
+#<word>
+# Should no longer break at the dictionary points - it's not the Thai language
+#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
+
+
 # Japanese line break tailoring test

 <locale ja>