ICU-8329 Roll in Khmer dictionary word break code from George, data from Nathan/sbbic.org

X-SVN-Rev: 30019
2025-04-10 07:39:16 +00:00 · 2011-05-04 13:25:37 +00:00 · 2011-05-04 13:25:37 +00:00 · 7aaca9b950
commit 7aaca9b950
parent 332037ef5b
9 changed files with 79037 additions and 9 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -53,6 +53,7 @@ icu4c/source/allinone/icucheck.bat -text
 icu4c/source/common/common.vcxproj -text
 icu4c/source/common/common.vcxproj.filters -text
 icu4c/source/config/mh-haiku -text
+icu4c/source/data/brkitr/khmerdict.txt -text
 icu4c/source/data/curr/pool.res -text
 icu4c/source/data/in/coll/invuca.icu -text
 icu4c/source/data/in/coll/ucadata.icu -text
--- a/icu4c/source/common/brkeng.cpp
+++ b/icu4c/source/common/brkeng.cpp
@ -1,7 +1,7 @@
 /**
 ************************************************************************************
- * Copyright (C) 2006-2009, International Business Machines Corporation and others. *
- * All Rights Reserved.                                                             *
+ * Copyright (C) 2006-2009,2011, International Business Machines Corporation        *
+ * and others. All Rights Reserved.                                                 *
 ************************************************************************************
 */

@ -226,6 +226,9 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
            case USCRIPT_THAI:
                engine = new ThaiBreakEngine(dict, status);
                break;
+            case USCRIPT_KHMER:
+                engine = new KhmerBreakEngine(dict, status);
+                break;
            default:
                break;
            }
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@ -1,7 +1,7 @@
 /**
 *******************************************************************************
- * Copyright (C) 2006-2008, International Business Machines Corporation and others. *
- * All Rights Reserved.                                                        *
+ * Copyright (C) 2006-2008,2011, International Business Machines Corporation   *
+ * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */

@ -422,6 +422,234 @@ foundBest:
    return wordsFound;
 }

+// How many words in a row are "good enough"? Also the size of the PossibleWord array used for lookahead. 
+#define KHMER_LOOKAHEAD 3 
+ 
+// Will not combine a non-word with a preceding dictionary word of this length or longer 
+#define KHMER_ROOT_COMBINE_THRESHOLD 3 
+ 
+// Will not combine a non-word that shares at least this much prefix with a 
+// dictionary word, with a preceding word 
+#define KHMER_PREFIX_COMBINE_THRESHOLD 3 
+ 
+// Minimum word size 
+#define KHMER_MIN_WORD 2 
+ 
+// Minimum number of characters for two words; shorter ranges are not divided 
+#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2) 
+ 
+KhmerBreakEngine::KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status) 
+    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), // bit mask with the UBRK_WORD and UBRK_LINE bits set
+      fDictionary(adoptDictionary) // pointer is kept; the destructor deletes it
+{ 
+    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); 
+    if (U_SUCCESS(status)) { // register the set only if the pattern was applied successfully
+        setCharacters(fKhmerWordSet); 
+    } 
+    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); // NOTE(review): status is not re-checked after this call
+    fMarkSet.add(0x0020); // U+0020 SPACE is added to the mark set as well
+    fEndWordSet = fKhmerWordSet; // starts as a copy of the word set; nothing is removed from it below
+    fBeginWordSet.add(0x1780, 0x17B3); // code points U+1780 through U+17B3; the disabled lines below use Thai-range values and THAI_* names
+//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT 
+//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI 
+//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK 
+//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI 
+//    fSuffixSet.add(THAI_PAIYANNOI); 
+//    fSuffixSet.add(THAI_MAIYAMOK); 
+ 
+    // Compact for caching. 
+    fMarkSet.compact(); 
+    fEndWordSet.compact(); 
+    fBeginWordSet.compact(); 
+    fSuffixSet.compact(); // nothing is added to fSuffixSet in this constructor (the adds above are commented out)
+} 
+ 
+KhmerBreakEngine::~KhmerBreakEngine() { 
+    delete fDictionary; // frees the dictionary pointer stored by the constructor
+} 
+ 
+int32_t 
+KhmerBreakEngine::divideUpDictionaryRange( UText *text, 
+                                                int32_t rangeStart, 
+                                                int32_t rangeEnd, 
+                                                UStack &foundBreaks ) const { 
+    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { 
+        return 0;       // Not enough characters for two words 
+    } 
+    /* Overview: starting at rangeStart, each pass of the loop below looks for a 
+     * dictionary word at the current position (using lookahead when there is 
+     * more than one candidate), may extend it over following non-dictionary 
+     * text and marks, and pushes current+wordLength onto foundBreaks. */ 
+    uint32_t wordsFound = 0; // running word count; wordsFound % KHMER_LOOKAHEAD also selects the slot in words[]
+    int32_t wordLength; // length of the word found in the current pass; 0 when none was found
+    int32_t current; // native index of the text at the start of the current pass
+    UErrorCode status = U_ZERO_ERROR; 
+    PossibleWord words[KHMER_LOOKAHEAD]; 
+    UChar32 uc; 
+     
+    utext_setNativeIndex(text, rangeStart); 
+     
+    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 
+        wordLength = 0; 
+ 
+        // Look for candidate words at the current position 
+        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 
+         
+        // If we found exactly one, use that 
+        if (candidates == 1) { 
+            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text); 
+            wordsFound += 1; 
+        } 
+         
+        // If there was more than one, see which one can take us forward the most words 
+        else if (candidates > 1) { 
+            // If we're already at the end of the range, we're done 
+            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 
+                goto foundBest; 
+            } 
+            do { 
+                int wordsMatched = 1; 
+                if (words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 
+                    if (wordsMatched < 2) { // always true here: wordsMatched was set to 1 at the top of this pass
+                        // Followed by another dictionary word; mark first word as a good candidate 
+                        words[wordsFound%KHMER_LOOKAHEAD].markCurrent(); 
+                        wordsMatched = 2; 
+                    } 
+                     
+                    // If we're already at the end of the range, we're done 
+                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 
+                        goto foundBest; 
+                    } 
+                     
+                    // See if any of the possible second words is followed by a third word 
+                    do { 
+                        // If we find a third word, stop right away 
+                        if (words[(wordsFound+2)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 
+                            words[wordsFound%KHMER_LOOKAHEAD].markCurrent(); 
+                            goto foundBest; 
+                        } 
+                    } 
+                    while (words[(wordsFound+1)%KHMER_LOOKAHEAD].backUp(text)); 
+                } 
+            } 
+            while (words[wordsFound%KHMER_LOOKAHEAD].backUp(text)); 
+foundBest: // reached by goto from the lookahead code above, or by falling out of the outer do/while
+            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text); 
+            wordsFound += 1; 
+        } 
+         
+        // We come here after having either found a word or not. We look ahead to the 
+        // next word. If it's not a dictionary word, we will combine it with the word we 
+        // just found (if there is one), but only if the preceding word does not exceed 
+        // the threshold. 
+        // The text iterator should now be positioned at the end of the word we found. 
+        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) { 
+            // if it is a dictionary word, do nothing. If it isn't, then if there is 
+            // no preceding word, or the non-word shares less than the minimum threshold 
+            // of characters with a dictionary word, then scan to resynchronize 
+            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 
+                  && (wordLength == 0 
+                      || words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { 
+                // Look for a plausible word boundary 
+                //TODO: This section will need a rework for UText. 
+                int32_t remaining = rangeEnd - (current+wordLength); // distance from the end of the found word to rangeEnd
+                UChar32 pc = utext_current32(text); // previous character in the scan below
+                int32_t chars = 0; // number of characters stepped over by the scan below
+                for (;;) { 
+                    utext_next32(text); 
+                    uc = utext_current32(text); 
+                    // TODO: Here we're counting on the fact that the SA languages are all 
+                    // in the BMP. This should get fixed with the UText rework. 
+                    chars += 1; 
+                    if (--remaining <= 0) { 
+                        break; 
+                    } 
+                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 
+                        // Maybe. See if it's in the dictionary. 
+                        // NOTE: The original Apple code first checked that the next two 
+                        // characters after uc were not 0x0E4C THANTHAKHAT; that was only a 
+                        // performance filter and it is not clear it beats checking the trie. 
+                        // NOTE(review): 0x0E4C is a Thai code point - presumably inherited from the Thai engine; confirm. 
+                        int candidates = words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 
+                        utext_setNativeIndex(text, current+wordLength+chars); // put the iterator back at the scan position after the dictionary probe
+                        if (candidates > 0) { 
+                            break; 
+                        } 
+                    } 
+                    pc = uc; 
+                } 
+                 
+                // Bump the word count if there wasn't already one 
+                if (wordLength <= 0) { 
+                    wordsFound += 1; 
+                } 
+                 
+                // Update the length with the passed-over characters 
+                wordLength += chars; 
+            } 
+            else { 
+                // Back up to where we were for next iteration 
+                utext_setNativeIndex(text, current+wordLength); 
+            } 
+        } 
+         
+        // Never stop before a combining mark. 
+        int32_t currPos; 
+        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 
+            utext_next32(text); 
+            wordLength += (int32_t)utext_getNativeIndex(text) - currPos; // grow the word by the native-index distance just advanced
+        } 
+         
+        // Look ahead for possible suffixes if a dictionary word does not follow. 
+        // We do this in code rather than using a rule so that the heuristic 
+        // resynch continues to function. For example, one of the suffix characters 
+        // could be a typo in the middle of a word. 
+        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 
+            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 
+                && fSuffixSet.contains(uc = utext_current32(text))) { // the body of this branch is entirely commented out, so it does nothing
+//                if (uc == KHMER_PAIYANNOI) { 
+//                    if (!fSuffixSet.contains(utext_previous32(text))) { 
+//                        // Skip over previous end and PAIYANNOI 
+//                        utext_next32(text); 
+//                        utext_next32(text); 
+//                        wordLength += 1;            // Add PAIYANNOI to word 
+//                        uc = utext_current32(text);     // Fetch next character 
+//                    } 
+//                    else { 
+//                        // Restore prior position 
+//                        utext_next32(text); 
+//                    } 
+//                } 
+//                if (uc == KHMER_MAIYAMOK) { 
+//                    if (utext_previous32(text) != KHMER_MAIYAMOK) { 
+//                        // Skip over previous end and MAIYAMOK 
+//                        utext_next32(text); 
+//                        utext_next32(text); 
+//                        wordLength += 1;            // Add MAIYAMOK to word 
+//                    } 
+//                    else { 
+//                        // Restore prior position 
+//                        utext_next32(text); 
+//                    } 
+//                } 
+            } 
+            else { 
+                utext_setNativeIndex(text, current+wordLength); // reposition the iterator at the end of the word found in this pass
+            } 
+        } 
+         
+        // Did we find a word on this iteration? If so, push it on the break stack 
+        if (wordLength > 0) { 
+            foundBreaks.push((current+wordLength), status); 
+        } 
+    } 
+     
+    // Don't return a break for the end of the dictionary range if there is one there. 
+    if (foundBreaks.peeki() >= rangeEnd) { 
+        (void) foundBreaks.popi(); 
+        wordsFound -= 1; 
+    } 
+ 
+    return wordsFound; // NOTE(review): wordsFound is uint32_t but is returned through the int32_t return type
+} 
+ 
 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- a/icu4c/source/common/dictbe.h
+++ b/icu4c/source/common/dictbe.h
@ -1,7 +1,7 @@
 /**
 *******************************************************************************
- * Copyright (C) 2006, International Business Machines Corporation and others. *
- * All Rights Reserved.                                                        *
+ * Copyright (C) 2006,2011, International Business Machines Corporation        *
+ * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */

@ -187,6 +187,64 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
 };


+/******************************************************************* 
+ * KhmerBreakEngine 
+ */ 
+ 
+/** 
+ * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 
+ * TrieWordDictionary and heuristics to determine Khmer-specific breaks.</p> 
+ * 
+ * <p>After it is constructed a KhmerBreakEngine may be shared between 
+ * threads without synchronization.</p> 
+ */ 
+class KhmerBreakEngine : public DictionaryBreakEngine { 
+ private: 
+    /** 
+     * The set of characters handled by this engine 
+     * @internal 
+     */ 
+ 
+  UnicodeSet                fKhmerWordSet; 
+  UnicodeSet                fEndWordSet; // tested on the previous character when scanning for a plausible word boundary
+  UnicodeSet                fBeginWordSet; // tested on the next character when scanning for a plausible word boundary
+  UnicodeSet                fSuffixSet; // tested on the character following a word when no dictionary word follows
+  UnicodeSet                fMarkSet; // characters a break is never placed in front of
+  const TrieWordDictionary  *fDictionary; // adopted in the constructor, deleted in the destructor
+ 
+ public: 
+ 
+  /** 
+   * <p>Constructor.</p> 
+   * 
+   * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the 
+   * engine is deleted. 
+   * @param status Error code passed to the set-pattern calls that build the 
+   * engine's character sets. 
+   */ 
+  KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status); 
+ 
+  /** 
+   * <p>Virtual destructor.</p> 
+   */ 
+  virtual ~KhmerBreakEngine(); 
+ 
+ protected: 
+ /** 
+  * <p>Divide up a range of known dictionary characters.</p> 
+  * 
+  * @param text A UText representing the text 
+  * @param rangeStart The start of the range of dictionary characters 
+  * @param rangeEnd The end of the range of dictionary characters 
+  * @param foundBreaks Stack onto which the break positions found are pushed 
+  * @return The number of breaks found 
+  */ 
+  virtual int32_t divideUpDictionaryRange( UText *text, 
+                                           int32_t rangeStart, 
+                                           int32_t rangeEnd, 
+                                           UStack &foundBreaks ) const; 
+ 
+}; 
+ 
+ 
 U_NAMESPACE_END

    /* DICTBE_H */
--- a/icu4c/source/data/Makefile.in
+++ b/icu4c/source/data/Makefile.in
@ -1,5 +1,5 @@
 ## Makefile.in for ICU data
-## Copyright (c) 1999-2010, International Business Machines Corporation and
+## Copyright (c) 1999-2011, International Business Machines Corporation and
 ## others. All Rights Reserved.

 ## Source directory information
@ -505,6 +505,9 @@ $(BUILDDIR)/%.spp: $(SPREPSRCDIR)/%.txt $(TOOLBINDIR)/gensprep$(TOOLEXEEXT) $(BU
 #thaidict.brk: $(SRCDATADIR)/thaidict.brk
 #	$(RMV) $@ && ln -s $(BUILDDIR) $@

+#khmerdict.brk: $(SRCDATADIR)/khmerdict.brk
+#	$(RMV) $@ && ln -s $(BUILDDIR) $@
+
 $(BRKBLDDIR)/%.brk: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genbrk$(TOOLEXEEXT) $(DAT_FILES)
 	$(INVOKE) $(TOOLBINDIR)/genbrk -c -i $(BUILDDIR) -r $< -o $@

--- a/icu4c/source/data/brkitr/brkfiles.mk
+++ b/icu4c/source/data/brkitr/brkfiles.mk
@ -34,7 +34,7 @@ BRK_RES_ALIAS_SOURCE = $(BRK_RES_SYNTHETIC_ALIAS)


 # List of compact trie dictionary files (ctd).
-BRK_CTD_SOURCE =  thaidict.txt 
+BRK_CTD_SOURCE =  thaidict.txt khmerdict.txt


 # List of break iterator files (brk).
--- a/icu4c/source/data/brkitr/khmerdict.txt
+++ b/icu4c/source/data/brkitr/khmerdict.txt
--- a/icu4c/source/data/brkitr/root.txt
+++ b/icu4c/source/data/brkitr/root.txt
@ -17,5 +17,6 @@ root{
    }
    dictionaries{
        Thai:process(dependency){"thaidict.ctd"}
+        Khmr:process(dependency){"khmerdict.ctd"}
    }
 }
--- a/icu4c/source/data/xml/brkitr/root.xml
+++ b/icu4c/source/data/xml/brkitr/root.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <!--
- Copyright (c) 2010 International Business Machines Corporation and others. All rights reserved.
+ Copyright (c) 2010-2011 International Business Machines Corporation and others. All rights reserved.
 -->
 <!DOCTYPE ldml SYSTEM "http://www.unicode.org/repos/cldr/trunk/common/dtd/ldml.dtd"
 [
@ -25,6 +25,7 @@
            </icu:boundaries>
            <icu:dictionaries>
                <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>
+                <icu:dictionary type="Khmr" icu:dependency="khmerdict.ctd"/>
            </icu:dictionaries>
        </icu:breakIteratorData>
    </special>