ICU-10326 Add dictionary-based word/line break for Burmese/Myanmar

X-SVN-Rev: 36397
2025-04-10 07:39:16 +00:00 · 2014-09-08 22:16:21 +00:00 · 2014-09-08 22:16:21 +00:00 · d87c86274c
commit d87c86274c
parent 4db4766158
12 changed files with 41499 additions and 10 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -56,6 +56,7 @@ icu4c/source/common/common.vcxproj.filters -text
 icu4c/source/common/uloc_keytype.cpp -text
 icu4c/source/common/unifiedcache.cpp -text
 icu4c/source/common/unifiedcache.h -text
+icu4c/source/data/brkitr/burmesedict.txt -text
 icu4c/source/data/coll/dsb.txt -text
 icu4c/source/data/coll/hsb.txt -text
 icu4c/source/data/coll/lb.txt -text
--- a/icu4c/license.html
+++ b/icu4c/license.html
@ -4,7 +4,7 @@
 <head>
 <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
 <title>ICU License - ICU 1.8.1 and later</title>
-	<link type="text/css" href="./icu4c.css" rel="stylesheet"/>
+	<link type="text/css" href="./icu4c.css" rel="stylesheet">
 </head>

 <body BGCOLOR="#ffffff">
@ -316,7 +316,46 @@ written authorization of the copyright holder.</pre>
 #	--------------------------------------------------------------------------------
 </pre>

-<h3>4. Time Zone Database</h3>
+<h3>4. Burmese Word Break Dictionary Data (burmesedict.txt)</h3>
+<pre>
+ #	Copyright (c) 2014 International Business Machines Corporation
+ #	and others. All Rights Reserved.
+ #
+ #	This list is part of a project hosted at:
+ #	  github.com/kanyawtech/myanmar-karen-word-lists
+ #
+ #	--------------------------------------------------------------------------------
+ #	Copyright (c) 2013, LeRoy Benjamin Sharon
+ #	All rights reserved.
+ #
+ #	Redistribution and use in source and binary forms, with or without modification,
+ #	are permitted provided that the following conditions are met:
+ #
+ #	  Redistributions of source code must retain the above copyright notice, this
+ #	  list of conditions and the following disclaimer.
+ #
+ #	  Redistributions in binary form must reproduce the above copyright notice, this
+ #	  list of conditions and the following disclaimer in the documentation and/or
+ #	  other materials provided with the distribution.
+ #
+ #	  Neither the name Myanmar Karen Word Lists, nor the names of its
+ #	  contributors may be used to endorse or promote products derived from
+ #	  this software without specific prior written permission.
+ #
+ #	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ #	ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ #	WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ #	DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ #	ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ #	(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ #	LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ #	ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ #	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ #	SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #	--------------------------------------------------------------------------------
+</pre>
+
+<h3>5. Time Zone Database</h3>
 <p>ICU uses the public domain data and code derived from <a href="http://www.iana.org/time-zones">
 Time Zone Database</a> for its time zone support. The ownership of the TZ database is explained
 in <a href="http://tools.ietf.org/html/rfc6557">BCP 175: Procedure for Maintaining the Time Zone
--- a/icu4c/source/common/brkeng.cpp
+++ b/icu4c/source/common/brkeng.cpp
@ -1,6 +1,6 @@
 /*
 ************************************************************************************
- * Copyright (C) 2006-2013, International Business Machines Corporation
+ * Copyright (C) 2006-2014, International Business Machines Corporation
 * and others. All Rights Reserved.
 ************************************************************************************
 */
@ -232,6 +232,9 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
            case USCRIPT_LAO:
                engine = new LaoBreakEngine(m, status);
                break;
+            case USCRIPT_MYANMAR:
+                engine = new BurmeseBreakEngine(m, status);
+                break;
            case USCRIPT_KHMER:
                engine = new KhmerBreakEngine(m, status);
                break;
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@ -630,6 +630,199 @@ foundBest:
    return wordsFound;
 }

+/*
+ ******************************************************************
+ * BurmeseBreakEngine
+ */
+
+// How many words in a row are "good enough"? Also the size of the look-ahead words[] array.
+static const int32_t BURMESE_LOOKAHEAD = 3;
+
+// Will not combine a non-word with a preceding dictionary word of this length or longer
+static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;    // compared against markedCPLength()
+
+// Will not combine a non-word that shares at least this much prefix with a
+// dictionary word, with a preceding word
+static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;  // compared against longestPrefix()
+
+// Minimum word size
+static const int32_t BURMESE_MIN_WORD = 2;
+
+// Minimum number of characters for two words; shorter ranges are left undivided
+static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
+
+BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
+    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),   // bit mask of break types: word and line
+      fDictionary(adoptDictionary)                              // adopted; deleted in the destructor
+{
+    fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);   // Myanmar script with LineBreak=SA
+    if (U_SUCCESS(status)) {
+        setCharacters(fBurmeseWordSet);     // only hand the set to the base class if the pattern applied cleanly
+    }
+    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);   // the marks within that set
+    fMarkSet.add(0x0020);                   // U+0020 SPACE is treated like a mark: no break is placed before it
+    fEndWordSet = fBurmeseWordSet;          // any handled character may end a word
+    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
+
+    // Compact for caching.
+    fMarkSet.compact();
+    fEndWordSet.compact();
+    fBeginWordSet.compact();
+}
+
+BurmeseBreakEngine::~BurmeseBreakEngine() {
+    delete fDictionary;     // the engine owns the matcher adopted by its constructor
+}
+
+int32_t
+BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
+                                                int32_t rangeStart,
+                                                int32_t rangeEnd,
+                                                UStack &foundBreaks ) const {   // pushes break positions onto foundBreaks; returns the word count
+    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
+        return 0;       // Not enough characters for two words
+    }
+
+    uint32_t wordsFound = 0;        // words accepted so far; also selects the current slot of words[]
+    int32_t cpWordLength = 0;       // markedCPLength() of the word accepted in this iteration
+    int32_t cuWordLength = 0;       // acceptMarked() length plus anything combined into the word below
+    int32_t current;                // native index at the start of the current iteration
+    UErrorCode status = U_ZERO_ERROR;
+    PossibleWord words[BURMESE_LOOKAHEAD];      // look-ahead slots, always indexed modulo BURMESE_LOOKAHEAD
+    
+    utext_setNativeIndex(text, rangeStart);
+    
+    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
+        cuWordLength = 0;
+        cpWordLength = 0;
+
+        // Look for candidate words at the current position
+        int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
+        
+        // If we found exactly one, use that
+        if (candidates == 1) {
+            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
+            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
+            wordsFound += 1;
+        }
+        // If there was more than one, see which one can take us forward the most words
+        else if (candidates > 1) {
+            // If we're already at the end of the range, we're done
+            if (utext_getNativeIndex(text) >= rangeEnd) {   // NOTE(review): no (int32_t) cast here, unlike the same test below
+                goto foundBest;
+            }
+            do {
+                int32_t wordsMatched = 1;
+                if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
+                    if (wordsMatched < 2) {     // NOTE(review): always true, wordsMatched was set to 1 just above
+                        // Followed by another dictionary word; mark first word as a good candidate
+                        words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
+                        wordsMatched = 2;
+                    }
+                    
+                    // If we're already at the end of the range, we're done
+                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
+                        goto foundBest;
+                    }
+                    
+                    // See if any of the possible second words is followed by a third word
+                    do {
+                        // If we find a third word, stop right away
+                        if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
+                            words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
+                            goto foundBest;
+                        }
+                    }
+                    while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
+                }
+            }
+            while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
+foundBest:
+            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
+            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
+            wordsFound += 1;
+        }
+        
+        // We come here after having either found a word or not. We look ahead to the
+        // next word. If it's not a dictionary word, we will combine it with the word we
+        // just found (if there is one), but only if the preceding word is shorter than
+        // the threshold (BURMESE_ROOT_COMBINE_THRESHOLD).
+        // The text iterator should now be positioned at the end of the word we found.
+        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
+            // if it is a dictionary word, do nothing. If it isn't, then if there is
+            // no preceding word, or the non-word shares less than the minimum threshold
+            // of characters with a dictionary word, then scan to resynchronize
+            if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
+                  && (cuWordLength == 0
+                      || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
+                // Look for a plausible word boundary
+                int32_t remaining = rangeEnd - (current + cuWordLength);   // native-index units left after the accepted word
+                UChar32 pc;     // character just passed over
+                UChar32 uc;     // character at the new position
+                int32_t chars = 0;      // native-index units passed over by this scan
+                for (;;) {
+                    int32_t pcIndex = utext_getNativeIndex(text);   // NOTE(review): no (int32_t) cast, unlike other calls in this function
+                    pc = utext_next32(text);
+                    int32_t pcSize = utext_getNativeIndex(text) - pcIndex;   // native-index width of pc
+                    chars += pcSize;
+                    remaining -= pcSize;
+                    if (remaining <= 0) {
+                        break;
+                    }
+                    uc = utext_current32(text);
+                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
+                        // Maybe. See if it's in the dictionary.
+                        // TODO: this looks iffy; compare with old code.
+                        int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
+                        utext_setNativeIndex(text, current + cuWordLength + chars);   // reposition just past the characters scanned so far
+                        if (candidates > 0) {
+                            break;
+                        }
+                    }
+                }
+                
+                // Bump the word count if there wasn't already one
+                if (cuWordLength <= 0) {
+                    wordsFound += 1;
+                }
+                
+                // Update the length with the passed-over characters
+                cuWordLength += chars;
+            }
+            else {
+                // Back up to where we were for next iteration
+                utext_setNativeIndex(text, current + cuWordLength);
+            }
+        }
+        
+        // Never stop before a combining mark.
+        int32_t currPos;
+        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {   // fMarkSet also holds U+0020
+            utext_next32(text);
+            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
+        }
+        
+        // Look ahead for possible suffixes if a dictionary word does not follow.
+        // We do this in code rather than using a rule so that the heuristic
+        // resynch continues to function. For example, one of the suffix characters
+        // could be a typo in the middle of a word.
+        // NOT CURRENTLY APPLICABLE TO BURMESE
+
+        // Did we find a word on this iteration? If so, push it on the break stack
+        if (cuWordLength > 0) {
+            foundBreaks.push((current+cuWordLength), status);   // break position = iteration start + accumulated length
+        }
+    }
+
+    // Don't return a break for the end of the dictionary range if there is one there.
+    if (foundBreaks.peeki() >= rangeEnd) {
+        (void) foundBreaks.popi();
+        wordsFound -= 1;    // NOTE(review): wordsFound is uint32_t but is returned as int32_t; confirm it cannot be 0 here
+    }
+
+    return wordsFound;
+}
+
 /*
 ******************************************************************
 * KhmerBreakEngine
--- a/icu4c/source/common/dictbe.h
+++ b/icu4c/source/common/dictbe.h
@ -243,6 +243,62 @@ class LaoBreakEngine : public DictionaryBreakEngine {

 };

+/******************************************************************* 
+ * BurmeseBreakEngine 
+ */ 
+ 
+/** 
+ * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a 
+ * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> 
+ * 
+ * <p>After it is constructed a BurmeseBreakEngine may be shared between 
+ * threads without synchronization.</p> 
+ */ 
+class BurmeseBreakEngine : public DictionaryBreakEngine { 
+ private: 
+    /** 
+     * Character sets used by this engine; fBurmeseWordSet is the set of characters it handles 
+     * @internal 
+     */ 
+ 
+  UnicodeSet                fBurmeseWordSet;   // characters handled by this engine 
+  UnicodeSet                fEndWordSet;       // characters that may end a word 
+  UnicodeSet                fBeginWordSet;     // characters that may begin a word 
+  UnicodeSet                fMarkSet;          // characters a break must not be placed before 
+  DictionaryMatcher  *fDictionary;             // adopted dictionary; deleted in the destructor 
+ 
+ public: 
+ 
+  /** 
+   * <p>Constructor.</p> 
+   * 
+   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted. 
+   * @param status Receives any error raised while building the engine's character sets. 
+   */ 
+  BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 
+ 
+  /** 
+   * <p>Virtual destructor.</p> 
+   */ 
+  virtual ~BurmeseBreakEngine(); 
+ 
+ protected: 
+ /** 
+  * <p>Divide up a range of known dictionary characters.</p> 
+  * 
+  * @param text A UText representing the text 
+  * @param rangeStart The start of the range of dictionary characters 
+  * @param rangeEnd The end of the range of dictionary characters 
+  * @param foundBreaks Output stack onto which the break positions found are pushed 
+  * @return The number of breaks found 
+  */ 
+  virtual int32_t divideUpDictionaryRange( UText *text, 
+                                           int32_t rangeStart, 
+                                           int32_t rangeEnd, 
+                                           UStack &foundBreaks ) const; 
+ 
+}; 
+ 
 /******************************************************************* 
 * KhmerBreakEngine 
 */ 
--- a/icu4c/source/data/Makefile.in
+++ b/icu4c/source/data/Makefile.in
@ -543,6 +543,9 @@ $(BRKBLDDIR)/thaidict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
 $(BRKBLDDIR)/laodict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
 	$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x0e80 -c -i $(BUILDDIR) $(BRKSRCDIR)/laodict.txt $(BRKBLDDIR)/laodict.dict

+$(BRKBLDDIR)/burmesedict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
+	$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1000 -c -i $(BUILDDIR) $(BRKSRCDIR)/burmesedict.txt $(BRKBLDDIR)/burmesedict.dict
+
 # TODO: figure out why combining characters are here?
 $(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
 	$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(BRKSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
--- a/icu4c/source/data/brkitr/brkfiles.mk
+++ b/icu4c/source/data/brkitr/brkfiles.mk
@ -34,7 +34,7 @@ BRK_RES_ALIAS_SOURCE = $(BRK_RES_SYNTHETIC_ALIAS)


 # List of dictionary files (dict).
-BRK_DICT_SOURCE = cjdict.txt khmerdict.txt laodict.txt thaidict.txt
+BRK_DICT_SOURCE = burmesedict.txt cjdict.txt khmerdict.txt laodict.txt thaidict.txt


 # List of break iterator files (brk).
--- a/icu4c/source/data/brkitr/burmesedict.txt
+++ b/icu4c/source/data/brkitr/burmesedict.txt
--- a/icu4c/source/data/brkitr/root.txt
+++ b/icu4c/source/data/brkitr/root.txt
@ -21,6 +21,7 @@ root{
        Kata:process(dependency){"cjdict.dict"}
        Khmr:process(dependency){"khmerdict.dict"}
        Laoo:process(dependency){"laodict.dict"}
+        Mymr:process(dependency){"burmesedict.dict"}
        Thai:process(dependency){"thaidict.dict"}
    }
 }
--- a/icu4c/source/data/makedata.mak
+++ b/icu4c/source/data/makedata.mak
@ -739,11 +739,15 @@ CLEAN : GODATA

 $(ICUBRK)\thaidict.dict:
 	@echo Creating $(ICUBRK)\thaidict.dict
-	@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0xe00 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\thaidict.txt "$(ICUBLD_PKG)\$(ICUBRK)\thaidict.dict"
+	@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0x0e00 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\thaidict.txt "$(ICUBLD_PKG)\$(ICUBRK)\thaidict.dict"

 $(ICUBRK)\laodict.dict:
 	@echo Creating $(ICUBRK)\laodict.dict
-	@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0xe00 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\laodict.txt "$(ICUBLD_PKG)\$(ICUBRK)\laodict.dict"
+	@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0x0e80 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\laodict.txt "$(ICUBLD_PKG)\$(ICUBRK)\laodict.dict"
+
+$(ICUBRK)\burmesedict.dict:
+	@echo Creating $(ICUBRK)\burmesedict.dict
+	@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0x1000 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\burmesedict.txt "$(ICUBLD_PKG)\$(ICUBRK)\burmesedict.dict"

 $(ICUBRK)\khmerdict.dict:
 	@echo Creating $(ICUBRK)\khmerdict.dict
--- a/icu4c/source/data/xml/brkitr/root.xml
+++ b/icu4c/source/data/xml/brkitr/root.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <!--
- Copyright (c) 2010-2013 International Business Machines Corporation and others. All rights reserved.
+ Copyright (c) 2010-2014 International Business Machines Corporation and others. All rights reserved.
 -->
 <!DOCTYPE ldml SYSTEM "http://www.unicode.org/repos/cldr/trunk/common/dtd/ldml.dtd"
 [
@ -24,12 +24,13 @@
                <icu:title    icu:dependency="title.brk"/>
            </icu:boundaries>
            <icu:dictionaries>
-                <icu:dictionary type="Thai" icu:dependency="thaidict.dict"/>
-                <icu:dictionary type="Laoo" icu:dependency="laodict.dict"/>
-                <icu:dictionary type="Khmr" icu:dependency="khmerdict.dict"/>
                <icu:dictionary type="Hani" icu:dependency="cjdict.dict"/>
                <icu:dictionary type="Hira" icu:dependency="cjdict.dict"/>
                <icu:dictionary type="Kata" icu:dependency="cjdict.dict"/>
+                <icu:dictionary type="Khmr" icu:dependency="khmerdict.dict"/>
+                <icu:dictionary type="Laoo" icu:dependency="laodict.dict"/>
+                <icu:dictionary type="Mymr" icu:dependency="burmesedict.dict"/>
+                <icu:dictionary type="Thai" icu:dependency="thaidict.dict"/>
            </icu:dictionaries>
        </icu:breakIteratorData>
    </special>
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@ -717,6 +717,19 @@ Bangkok)•</data>
 <data>•ເຈົ້າ•ເວົ້າ•ພາສາ•ອັງກິດ•ໄດ້•ບໍ່•</data>
 <data>•ກະລຸນາ•ເວົ້າ•ຊ້າ•ໆ•</data>

+##########################################################################################
+#
+#   Burmese/Myanmar Tests
+#
+##########################################################################################
+<locale en>
+# Basic sanity check for #10326 (some text from http://www.unicode.org/udhr/d/udhr_mya.txt)
+<line>
+<data>•လူ•တိုင်း•သည် •တူညီ •လွတ်လပ်•သော •ဂုဏ်•သိ•က္•ခါ•ဖြ•င့် •လည်းကောင်း၊ •</data>
+<data>•တူညီ•လွတ်လပ်•သော •အ•ခွ•င့်•အရေး•များ•ဖြ•င့် •လည်းကောင်း၊ •မွေး•ဖွား•လာ•သူများ •ဖြစ်သည်။•</data>
+<data>•ထို•သူ•တို့၌ •ပိုင်းခြား •ဝေဖန်•တတ်•သော •ဉာဏ်•နှ•င့် •ကျ•င့်•ဝတ် •သိတတ်•သော •စိတ်•တို့•ရှိ•ကြ၍ •</data>
+<data>•ထို•သူ•တို့သည် •အချင်းချင်း •မေတ္တာ•ထား၍ •ဆက်ဆံ•ကျ•င့်•သုံး•</data>
+
 ##########################################################################################
 #
 #   Khmer Tests