ICU-10326 Add dictionary-based word/line break for Burmese/Myanmar

X-SVN-Rev: 36397
This commit is contained in:
Peter Edberg 2014-09-08 22:16:21 +00:00
parent 4db4766158
commit d87c86274c
12 changed files with 41499 additions and 10 deletions

1
.gitattributes vendored
View file

@ -56,6 +56,7 @@ icu4c/source/common/common.vcxproj.filters -text
icu4c/source/common/uloc_keytype.cpp -text
icu4c/source/common/unifiedcache.cpp -text
icu4c/source/common/unifiedcache.h -text
icu4c/source/data/brkitr/burmesedict.txt -text
icu4c/source/data/coll/dsb.txt -text
icu4c/source/data/coll/hsb.txt -text
icu4c/source/data/coll/lb.txt -text

View file

@ -4,7 +4,7 @@
<head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
<title>ICU License - ICU 1.8.1 and later</title>
<link type="text/css" href="./icu4c.css" rel="stylesheet"/>
<link type="text/css" href="./icu4c.css" rel="stylesheet">
</head>
<body BGCOLOR="#ffffff">
@ -316,7 +316,46 @@ written authorization of the copyright holder.</pre>
# --------------------------------------------------------------------------------
</pre>
<h3>4. Time Zone Database</h3>
<h3>4. Burmese Word Break Dictionary Data (burmesedict.txt)</h3>
<pre>
# Copyright (c) 2014 International Business Machines Corporation
# and others. All Rights Reserved.
#
# This list is part of a project hosted at:
# github.com/kanyawtech/myanmar-karen-word-lists
#
# --------------------------------------------------------------------------------
# Copyright (c) 2013, LeRoy Benjamin Sharon
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice, this
# list of conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
# Neither the name Myanmar Karen Word Lists, nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# --------------------------------------------------------------------------------
</pre>
<h3>5. Time Zone Database</h3>
<p>ICU uses the public domain data and code derived from <a href="http://www.iana.org/time-zones">
Time Zone Database</a> for its time zone support. The ownership of the TZ database is explained
in <a href="http://tools.ietf.org/html/rfc6557">BCP 175: Procedure for Maintaining the Time Zone

View file

@ -1,6 +1,6 @@
/*
************************************************************************************
* Copyright (C) 2006-2013, International Business Machines Corporation
* Copyright (C) 2006-2014, International Business Machines Corporation
* and others. All Rights Reserved.
************************************************************************************
*/
@ -232,6 +232,9 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
case USCRIPT_LAO:
engine = new LaoBreakEngine(m, status);
break;
case USCRIPT_MYANMAR:
engine = new BurmeseBreakEngine(m, status);
break;
case USCRIPT_KHMER:
engine = new KhmerBreakEngine(m, status);
break;

View file

@ -630,6 +630,199 @@ foundBest:
return wordsFound;
}
/*
******************************************************************
* BurmeseBreakEngine
*/
// Number of consecutive dictionary words that is considered "good enough";
// also the size of the rotating PossibleWord window used while matching.
static const int32_t BURMESE_LOOKAHEAD = 3;

// A non-word is only considered for combining with the preceding dictionary
// word when that word is shorter than this.
static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;

// A non-word is only combined with the preceding word when the prefix it
// shares with a dictionary word is shorter than this.
static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;

// Smallest size of a single word.
static const int32_t BURMESE_MIN_WORD = 2;

// Smallest number of characters that can hold two words.
static const int32_t BURMESE_MIN_WORD_SPAN = 2 * BURMESE_MIN_WORD;
/*
 * Builds the character sets used by the Burmese engine and adopts the
 * dictionary matcher.
 *
 * adoptDictionary  Matcher stored in fDictionary; the destructor deletes it.
 * status           Passed to both UnicodeSet::applyPattern() calls. The
 *                  engine's handled-character set is only installed when the
 *                  first pattern was applied successfully.
 */
BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Characters that may begin a word: basic consonants and independent vowels.
    // This set does not depend on status, so it is built (and compacted for
    // caching) up front.
    fBeginWordSet.add(0x1000, 0x102A);
    fBeginWordSet.compact();

    // Every Myanmar-script character with LineBreak=SA is handled by this engine.
    fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fBurmeseWordSet);
    }

    // Any handled character may end a word.
    fEndWordSet = fBurmeseWordSet;
    fEndWordSet.compact();

    // Marks that a break must never precede. U+0020 is added as well, so the
    // "never stop before a combining mark" scan in divideUpDictionaryRange()
    // also steps over a following space.
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fMarkSet.compact();
}
/*
 * Releases the dictionary matcher that was adopted by the constructor.
 */
BurmeseBreakEngine::~BurmeseBreakEngine()
{
    delete fDictionary;
}
/*
 * Divide the run of dictionary characters [rangeStart, rangeEnd) into words.
 *
 * At each position the dictionary is asked for candidate words. With several
 * candidates, the one that is followed by further dictionary words (looking
 * ahead through a rotating window of BURMESE_LOOKAHEAD PossibleWord slots) is
 * preferred. Text that is not a dictionary word is either combined with the
 * word just found or scanned until a plausible word boundary is reached.
 *
 * text         Text being analysed; its native index is moved by this function.
 * rangeStart   Native index of the start of the range.
 * rangeEnd     Native index just past the end of the range.
 * foundBreaks  Stack onto which the end position of every word found is pushed.
 *              A break that would fall at (or past) rangeEnd is popped again
 *              before returning.
 *
 * Returns the number of words found, less one if a trailing break at rangeEnd
 * was removed; 0 when the range is shorter than BURMESE_MIN_WORD_SPAN.
 */
int32_t
BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;
    int32_t cuWordLength = 0;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[BURMESE_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        cuWordLength = 0;
        cpWordLength = 0;

        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    // Followed by another dictionary word; mark first word as a good candidate
                    words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
foundBest:
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound % BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current + cuWordLength);
                UChar32 pc;
                UChar32 uc;
                int32_t chars = 0;
                for (;;) {
                    // Step over one code point and account for its size in native units
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
                    chars += pcSize;
                    remaining -= pcSize;
                    if (remaining <= 0) {
                        break;
                    }
                    uc = utext_current32(text);
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // TODO: this looks iffy; compare with old code.
                        int32_t resyncCandidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        // Return to the position just past the characters scanned so far
                        utext_setNativeIndex(text, current + cuWordLength + chars);
                        if (resyncCandidates > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                cuWordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current + cuWordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO BURMESE

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current + cuWordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
/*
******************************************************************
* KhmerBreakEngine

View file

@ -243,6 +243,62 @@ class LaoBreakEngine : public DictionaryBreakEngine {
};
/*******************************************************************
* BurmeseBreakEngine
*/
/**
* <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
* DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
*
* <p>After it is constructed a BurmeseBreakEngine may be shared between
* threads without synchronization.</p>
*
* <p>The engine adopts the DictionaryMatcher given to its constructor and
* deletes it in its destructor.</p>
*
* NOTE(review): fDictionary is a raw owning pointer and this class declares
* no copy constructor or assignment operator; copying an engine would delete
* the matcher twice unless a base class already forbids copying - confirm.
*/
class BurmeseBreakEngine : public DictionaryBreakEngine {
private:
/**
* The set of characters handled by this engine
* (Myanmar script with LineBreak=SA).
* @internal
*/
UnicodeSet fBurmeseWordSet;
/** Characters that may end a word; initialized as a copy of fBurmeseWordSet. */
UnicodeSet fEndWordSet;
/** Characters that may begin a word: U+1000..U+102A. */
UnicodeSet fBeginWordSet;
/** Marks (plus U+0020) that a break is never placed in front of. */
UnicodeSet fMarkSet;
/** The adopted dictionary used to look up candidate words; owned by this engine. */
DictionaryMatcher *fDictionary;
public:
/**
* <p>Constructor.</p>
*
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
* engine is deleted.
* @param status Error code passed to the UnicodeSet pattern setup; on
* failure the engine's handled-character set is not installed.
*/
BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
/**
* <p>Virtual destructor. Deletes the adopted dictionary.</p>
*/
virtual ~BurmeseBreakEngine();
protected:
/**
* <p>Divide up a range of known dictionary characters.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks Output stack onto which the break positions found are
* pushed; a break at the end of the range is not reported
* @return The number of breaks found
*/
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const;
};
/*******************************************************************
* KhmerBreakEngine
*/

View file

@ -543,6 +543,9 @@ $(BRKBLDDIR)/thaidict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
$(BRKBLDDIR)/laodict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x0e80 -c -i $(BUILDDIR) $(BRKSRCDIR)/laodict.txt $(BRKBLDDIR)/laodict.dict
$(BRKBLDDIR)/burmesedict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1000 -c -i $(BUILDDIR) $(BRKSRCDIR)/burmesedict.txt $(BRKBLDDIR)/burmesedict.dict
# TODO: figure out why combining characters are here?
$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(BRKSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict

View file

@ -34,7 +34,7 @@ BRK_RES_ALIAS_SOURCE = $(BRK_RES_SYNTHETIC_ALIAS)
# List of dictionary files (dict).
BRK_DICT_SOURCE = cjdict.txt khmerdict.txt laodict.txt thaidict.txt
BRK_DICT_SOURCE = burmesedict.txt cjdict.txt khmerdict.txt laodict.txt thaidict.txt
# List of break iterator files (brk).

File diff suppressed because it is too large Load diff

View file

@ -21,6 +21,7 @@ root{
Kata:process(dependency){"cjdict.dict"}
Khmr:process(dependency){"khmerdict.dict"}
Laoo:process(dependency){"laodict.dict"}
Mymr:process(dependency){"burmesedict.dict"}
Thai:process(dependency){"thaidict.dict"}
}
}

View file

@ -739,11 +739,15 @@ CLEAN : GODATA
$(ICUBRK)\thaidict.dict:
@echo Creating $(ICUBRK)\thaidict.dict
@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0xe00 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\thaidict.txt "$(ICUBLD_PKG)\$(ICUBRK)\thaidict.dict"
@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0x0e00 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\thaidict.txt "$(ICUBLD_PKG)\$(ICUBRK)\thaidict.dict"
$(ICUBRK)\laodict.dict:
@echo Creating $(ICUBRK)\laodict.dict
@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0xe00 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\laodict.txt "$(ICUBLD_PKG)\$(ICUBRK)\laodict.dict"
@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0x0e80 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\laodict.txt "$(ICUBLD_PKG)\$(ICUBRK)\laodict.dict"
$(ICUBRK)\burmesedict.dict:
@echo Creating $(ICUBRK)\burmesedict.dict
@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0x1000 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\burmesedict.txt "$(ICUBLD_PKG)\$(ICUBRK)\burmesedict.dict"
$(ICUBRK)\khmerdict.dict:
@echo Creating $(ICUBRK)\khmerdict.dict

View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Copyright (c) 2010-2013 International Business Machines Corporation and others. All rights reserved.
Copyright (c) 2010-2014 International Business Machines Corporation and others. All rights reserved.
-->
<!DOCTYPE ldml SYSTEM "http://www.unicode.org/repos/cldr/trunk/common/dtd/ldml.dtd"
[
@ -24,12 +24,13 @@
<icu:title icu:dependency="title.brk"/>
</icu:boundaries>
<icu:dictionaries>
<icu:dictionary type="Thai" icu:dependency="thaidict.dict"/>
<icu:dictionary type="Laoo" icu:dependency="laodict.dict"/>
<icu:dictionary type="Khmr" icu:dependency="khmerdict.dict"/>
<icu:dictionary type="Hani" icu:dependency="cjdict.dict"/>
<icu:dictionary type="Hira" icu:dependency="cjdict.dict"/>
<icu:dictionary type="Kata" icu:dependency="cjdict.dict"/>
<icu:dictionary type="Khmr" icu:dependency="khmerdict.dict"/>
<icu:dictionary type="Laoo" icu:dependency="laodict.dict"/>
<icu:dictionary type="Mymr" icu:dependency="burmesedict.dict"/>
<icu:dictionary type="Thai" icu:dependency="thaidict.dict"/>
</icu:dictionaries>
</icu:breakIteratorData>
</special>

View file

@ -717,6 +717,19 @@ Bangkok)•</data>
<data>•ເຈົ້າ•ເວົ້າ•ພາສາ•ອັງກິດ•ໄດ້•ບໍ່•</data>
<data>•ກະລຸນາ•ເວົ້າ•ຊ້າ•ໆ•</data>
##########################################################################################
#
# Burmese/Myanmar Tests
#
##########################################################################################
<locale en>
# Basic sanity check for #10326 (some text from http://www.unicode.org/udhr/d/udhr_mya.txt)
<line>
<data>•လူ•တိုင်း•သည် •တူညီ •လွတ်လပ်•သော •ဂုဏ်•သိ•က္•ခါ•ဖြ•င့် •လည်းကောင်း၊ •</data>
<data>•တူညီ•လွတ်လပ်•သော •အ•ခွ•င့်•အရေး•များ•ဖြ•င့် •လည်းကောင်း၊ •မွေး•ဖွား•လာ•သူများ •ဖြစ်သည်။•</data>
<data>•ထို•သူ•တို့၌ •ပိုင်းခြား •ဝေဖန်•တတ်•သော •ဉာဏ်•နှ•င့် •ကျ•င့်•ဝတ် •သိတတ်•သော •စိတ်•တို့•ရှိ•ကြ၍ •</data>
<data>•ထို•သူ•တို့သည် •အချင်းချင်း •မေတ္တာ•ထား၍ •ဆက်ဆံ•ကျ•င့်•သုံး•</data>
##########################################################################################
#
# Khmer Tests