mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
parent
57d41c92a1
commit
704415402a
7 changed files with 88 additions and 147 deletions
|
@ -199,13 +199,13 @@ ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode
|
|||
{
|
||||
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
|
||||
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
|
||||
fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
|
||||
UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status);
|
||||
if (U_SUCCESS(status)) {
|
||||
setCharacters(fThaiWordSet);
|
||||
setCharacters(thaiWordSet);
|
||||
}
|
||||
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.add(0x0020);
|
||||
fEndWordSet = fThaiWordSet;
|
||||
fEndWordSet = thaiWordSet;
|
||||
fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
|
||||
fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
|
||||
fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
|
||||
|
@ -441,13 +441,13 @@ LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s
|
|||
{
|
||||
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
|
||||
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
|
||||
fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
|
||||
UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status);
|
||||
if (U_SUCCESS(status)) {
|
||||
setCharacters(fLaoWordSet);
|
||||
setCharacters(laoWordSet);
|
||||
}
|
||||
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.add(0x0020);
|
||||
fEndWordSet = fLaoWordSet;
|
||||
fEndWordSet = laoWordSet;
|
||||
fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
|
||||
fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
|
||||
fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
|
||||
|
@ -637,14 +637,13 @@ BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro
|
|||
{
|
||||
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
|
||||
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
|
||||
fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
|
||||
if (U_SUCCESS(status)) {
|
||||
setCharacters(fBurmeseWordSet);
|
||||
}
|
||||
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.add(0x0020);
|
||||
fEndWordSet = fBurmeseWordSet;
|
||||
fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
|
||||
fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status);
|
||||
fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.add(0x0020);
|
||||
if (U_SUCCESS(status)) {
|
||||
setCharacters(fEndWordSet);
|
||||
}
|
||||
|
||||
// Compact for caching.
|
||||
fMarkSet.compact();
|
||||
|
@ -830,13 +829,13 @@ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod
|
|||
{
|
||||
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
|
||||
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
|
||||
fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
|
||||
UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
|
||||
if (U_SUCCESS(status)) {
|
||||
setCharacters(fKhmerWordSet);
|
||||
setCharacters(khmerWordSet);
|
||||
}
|
||||
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.add(0x0020);
|
||||
fEndWordSet = fKhmerWordSet;
|
||||
fEndWordSet = khmerWordSet;
|
||||
fBeginWordSet.add(0x1780, 0x17B3);
|
||||
//fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
|
||||
//fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
|
||||
|
@ -1050,24 +1049,19 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
|
|||
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
|
||||
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
|
||||
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
|
||||
// Korean dictionary only includes Hangul syllables
|
||||
fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
|
||||
fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
|
||||
fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
|
||||
fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
|
||||
nfkcNorm2 = Normalizer2::getNFKCInstance(status);
|
||||
// Korean dictionary only includes Hangul syllables
|
||||
fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
|
||||
fHangulWordSet.compact();
|
||||
|
||||
if (U_SUCCESS(status)) {
|
||||
// handle Korean and Japanese/Chinese using different dictionaries
|
||||
if (type == kKorean) {
|
||||
// handle Korean and Japanese/Chinese using different dictionaries
|
||||
if (type == kKorean) {
|
||||
if (U_SUCCESS(status)) {
|
||||
setCharacters(fHangulWordSet);
|
||||
} else { //Chinese and Japanese
|
||||
UnicodeSet cjSet;
|
||||
cjSet.addAll(fHanWordSet);
|
||||
cjSet.addAll(fKatakanaWordSet);
|
||||
cjSet.addAll(fHiraganaWordSet);
|
||||
cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
}
|
||||
} else { //Chinese and Japanese
|
||||
UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
|
||||
if (U_SUCCESS(status)) {
|
||||
setCharacters(cjSet);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -127,7 +127,6 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
|
|||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fThaiWordSet;
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fSuffixSet;
|
||||
|
@ -186,7 +185,6 @@ class LaoBreakEngine : public DictionaryBreakEngine {
|
|||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fLaoWordSet;
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fMarkSet;
|
||||
|
@ -244,7 +242,6 @@ class BurmeseBreakEngine : public DictionaryBreakEngine {
|
|||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fBurmeseWordSet;
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fMarkSet;
|
||||
|
@ -302,7 +299,6 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
|
|||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fKhmerWordSet;
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fMarkSet;
|
||||
|
@ -366,9 +362,6 @@ class CjkBreakEngine : public DictionaryBreakEngine {
|
|||
* @internal
|
||||
*/
|
||||
UnicodeSet fHangulWordSet;
|
||||
UnicodeSet fHanWordSet;
|
||||
UnicodeSet fKatakanaWordSet;
|
||||
UnicodeSet fHiraganaWordSet;
|
||||
|
||||
DictionaryMatcher *fDictionary;
|
||||
const Normalizer2 *nfkcNorm2;
|
||||
|
|
|
@ -31,24 +31,16 @@ public class BurmeseBreakEngine extends DictionaryBreakEngine {
|
|||
private static final byte BURMESE_MIN_WORD = 2;
|
||||
|
||||
private DictionaryMatcher fDictionary;
|
||||
private static UnicodeSet fBurmeseWordSet;
|
||||
private static UnicodeSet fEndWordSet;
|
||||
private static UnicodeSet fBeginWordSet;
|
||||
private static UnicodeSet fMarkSet;
|
||||
private UnicodeSet fEndWordSet;
|
||||
private UnicodeSet fBeginWordSet;
|
||||
private UnicodeSet fMarkSet;
|
||||
|
||||
static {
|
||||
public BurmeseBreakEngine() throws IOException {
|
||||
// Initialize UnicodeSets
|
||||
fBurmeseWordSet = new UnicodeSet();
|
||||
fMarkSet = new UnicodeSet();
|
||||
fBeginWordSet = new UnicodeSet();
|
||||
|
||||
fBurmeseWordSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]]");
|
||||
fBurmeseWordSet.compact();
|
||||
|
||||
fMarkSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");
|
||||
fBeginWordSet = new UnicodeSet(0x1000, 0x102A); // basic consonants and independent vowels
|
||||
fEndWordSet = new UnicodeSet("[[:Mymr:]&[:LineBreak=SA:]]");
|
||||
fMarkSet = new UnicodeSet("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");
|
||||
fMarkSet.add(0x0020);
|
||||
fEndWordSet = new UnicodeSet(fBurmeseWordSet);
|
||||
fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
|
||||
|
||||
// Compact for caching
|
||||
fMarkSet.compact();
|
||||
|
@ -56,14 +48,11 @@ public class BurmeseBreakEngine extends DictionaryBreakEngine {
|
|||
fBeginWordSet.compact();
|
||||
|
||||
// Freeze the static UnicodeSet
|
||||
fBurmeseWordSet.freeze();
|
||||
fMarkSet.freeze();
|
||||
fEndWordSet.freeze();
|
||||
fBeginWordSet.freeze();
|
||||
}
|
||||
|
||||
public BurmeseBreakEngine() throws IOException {
|
||||
setCharacters(fBurmeseWordSet);
|
||||
setCharacters(fEndWordSet);
|
||||
// Initialize dictionary
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Mymr");
|
||||
}
|
||||
|
|
|
@ -20,36 +20,18 @@ import com.ibm.icu.text.Normalizer;
|
|||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
public class CjkBreakEngine extends DictionaryBreakEngine {
|
||||
private static final UnicodeSet fHangulWordSet = new UnicodeSet();
|
||||
private static final UnicodeSet fHanWordSet = new UnicodeSet();
|
||||
private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
|
||||
private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
|
||||
static {
|
||||
fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
|
||||
fHanWordSet.applyPattern("[:Han:]");
|
||||
fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
|
||||
fHiraganaWordSet.applyPattern("[:Hiragana:]");
|
||||
|
||||
// freeze them all
|
||||
fHangulWordSet.freeze();
|
||||
fHanWordSet.freeze();
|
||||
fKatakanaWordSet.freeze();
|
||||
fHiraganaWordSet.freeze();
|
||||
}
|
||||
|
||||
private UnicodeSet fHangulWordSet;
|
||||
private DictionaryMatcher fDictionary = null;
|
||||
|
||||
public CjkBreakEngine(boolean korean) throws IOException {
|
||||
fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]");
|
||||
fHangulWordSet.freeze();
|
||||
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Hira");
|
||||
if (korean) {
|
||||
setCharacters(fHangulWordSet);
|
||||
} else { //Chinese and Japanese
|
||||
UnicodeSet cjSet = new UnicodeSet();
|
||||
cjSet.addAll(fHanWordSet);
|
||||
cjSet.addAll(fKatakanaWordSet);
|
||||
cjSet.addAll(fHiraganaWordSet);
|
||||
cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]");
|
||||
setCharacters(cjSet);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,24 +33,20 @@ public class KhmerBreakEngine extends DictionaryBreakEngine {
|
|||
|
||||
|
||||
private DictionaryMatcher fDictionary;
|
||||
private static UnicodeSet fKhmerWordSet;
|
||||
private static UnicodeSet fEndWordSet;
|
||||
private static UnicodeSet fBeginWordSet;
|
||||
private static UnicodeSet fMarkSet;
|
||||
private UnicodeSet fEndWordSet;
|
||||
private UnicodeSet fBeginWordSet;
|
||||
private UnicodeSet fMarkSet;
|
||||
|
||||
static {
|
||||
public KhmerBreakEngine() throws IOException {
|
||||
// Initialize UnicodeSets
|
||||
fKhmerWordSet = new UnicodeSet();
|
||||
fMarkSet = new UnicodeSet();
|
||||
fBeginWordSet = new UnicodeSet();
|
||||
|
||||
fKhmerWordSet.applyPattern("[[:Khmer:]&[:LineBreak=SA:]]");
|
||||
fKhmerWordSet.compact();
|
||||
|
||||
fMarkSet.applyPattern("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]");
|
||||
UnicodeSet khmerWordSet = new UnicodeSet("[[:Khmer:]&[:LineBreak=SA:]]");
|
||||
fMarkSet = new UnicodeSet("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]");
|
||||
fMarkSet.add(0x0020);
|
||||
fEndWordSet = new UnicodeSet(fKhmerWordSet);
|
||||
fBeginWordSet.add(0x1780, 0x17B3);
|
||||
fBeginWordSet = new UnicodeSet(0x1780, 0x17B3);
|
||||
|
||||
khmerWordSet.compact();
|
||||
|
||||
fEndWordSet = new UnicodeSet(khmerWordSet);
|
||||
fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
|
||||
|
||||
// Compact for caching
|
||||
|
@ -59,14 +55,12 @@ public class KhmerBreakEngine extends DictionaryBreakEngine {
|
|||
fBeginWordSet.compact();
|
||||
|
||||
// Freeze the static UnicodeSet
|
||||
fKhmerWordSet.freeze();
|
||||
khmerWordSet.freeze();
|
||||
fMarkSet.freeze();
|
||||
fEndWordSet.freeze();
|
||||
fBeginWordSet.freeze();
|
||||
}
|
||||
|
||||
public KhmerBreakEngine() throws IOException {
|
||||
setCharacters(fKhmerWordSet);
|
||||
setCharacters(khmerWordSet);
|
||||
// Initialize dictionary
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Khmr");
|
||||
}
|
||||
|
|
|
@ -30,27 +30,24 @@ public class LaoBreakEngine extends DictionaryBreakEngine {
|
|||
private static final byte LAO_MIN_WORD = 2;
|
||||
|
||||
private DictionaryMatcher fDictionary;
|
||||
private static UnicodeSet fLaoWordSet;
|
||||
private static UnicodeSet fEndWordSet;
|
||||
private static UnicodeSet fBeginWordSet;
|
||||
private static UnicodeSet fMarkSet;
|
||||
private UnicodeSet fEndWordSet;
|
||||
private UnicodeSet fBeginWordSet;
|
||||
private UnicodeSet fMarkSet;
|
||||
|
||||
static {
|
||||
public LaoBreakEngine() throws IOException {
|
||||
// Initialize UnicodeSets
|
||||
fLaoWordSet = new UnicodeSet();
|
||||
fMarkSet = new UnicodeSet();
|
||||
fBeginWordSet = new UnicodeSet();
|
||||
|
||||
fLaoWordSet.applyPattern("[[:Laoo:]&[:LineBreak=SA:]]");
|
||||
fLaoWordSet.compact();
|
||||
|
||||
fMarkSet.applyPattern("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]");
|
||||
UnicodeSet laoWordSet = new UnicodeSet("[[:Laoo:]&[:LineBreak=SA:]]");
|
||||
fMarkSet = new UnicodeSet("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]");
|
||||
fMarkSet.add(0x0020);
|
||||
fEndWordSet = new UnicodeSet(fLaoWordSet);
|
||||
fBeginWordSet = new UnicodeSet(
|
||||
0x0E81, 0x0EAE, // basic consonants (including holes for corresponding Thai characters)
|
||||
0x0EC0, 0x0EC4, // prefix vowels
|
||||
0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
|
||||
|
||||
laoWordSet.compact();
|
||||
|
||||
fEndWordSet = new UnicodeSet(laoWordSet);
|
||||
fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
|
||||
fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
|
||||
fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
|
||||
fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels
|
||||
|
||||
// Compact for caching
|
||||
fMarkSet.compact();
|
||||
|
@ -58,14 +55,12 @@ public class LaoBreakEngine extends DictionaryBreakEngine {
|
|||
fBeginWordSet.compact();
|
||||
|
||||
// Freeze the static UnicodeSet
|
||||
fLaoWordSet.freeze();
|
||||
laoWordSet.freeze();
|
||||
fMarkSet.freeze();
|
||||
fEndWordSet.freeze();
|
||||
fBeginWordSet.freeze();
|
||||
}
|
||||
|
||||
public LaoBreakEngine() throws IOException {
|
||||
setCharacters(fLaoWordSet);
|
||||
setCharacters(laoWordSet);
|
||||
// Initialize dictionary
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Laoo");
|
||||
}
|
||||
|
|
|
@ -36,32 +36,28 @@ public class ThaiBreakEngine extends DictionaryBreakEngine {
|
|||
private static final byte THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
|
||||
|
||||
private DictionaryMatcher fDictionary;
|
||||
private static UnicodeSet fThaiWordSet;
|
||||
private static UnicodeSet fEndWordSet;
|
||||
private static UnicodeSet fBeginWordSet;
|
||||
private static UnicodeSet fSuffixSet;
|
||||
private static UnicodeSet fMarkSet;
|
||||
private UnicodeSet fEndWordSet;
|
||||
private UnicodeSet fBeginWordSet;
|
||||
private UnicodeSet fSuffixSet;
|
||||
private UnicodeSet fMarkSet;
|
||||
|
||||
static {
|
||||
public ThaiBreakEngine() throws IOException {
|
||||
// Initialize UnicodeSets
|
||||
fThaiWordSet = new UnicodeSet();
|
||||
fMarkSet = new UnicodeSet();
|
||||
fBeginWordSet = new UnicodeSet();
|
||||
fSuffixSet = new UnicodeSet();
|
||||
|
||||
fThaiWordSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]]");
|
||||
fThaiWordSet.compact();
|
||||
|
||||
fMarkSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
|
||||
UnicodeSet thaiWordSet = new UnicodeSet("[[:Thai:]&[:LineBreak=SA:]]");
|
||||
fMarkSet = new UnicodeSet("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
|
||||
fMarkSet.add(0x0020);
|
||||
fEndWordSet = new UnicodeSet(fThaiWordSet);
|
||||
fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
|
||||
fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
|
||||
fBeginWordSet.add(0x0E01, 0x0E2E); //KO KAI through HO NOKHUK
|
||||
fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
|
||||
fBeginWordSet = new UnicodeSet(0x0E01, 0x0E2E, //KO KAI through HO NOKHUK
|
||||
0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
|
||||
fSuffixSet = new UnicodeSet();
|
||||
fSuffixSet.add(THAI_PAIYANNOI);
|
||||
fSuffixSet.add(THAI_MAIYAMOK);
|
||||
|
||||
thaiWordSet.compact();
|
||||
|
||||
fEndWordSet = new UnicodeSet(thaiWordSet);
|
||||
fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
|
||||
fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
|
||||
|
||||
// Compact for caching
|
||||
fMarkSet.compact();
|
||||
fEndWordSet.compact();
|
||||
|
@ -69,15 +65,13 @@ public class ThaiBreakEngine extends DictionaryBreakEngine {
|
|||
fSuffixSet.compact();
|
||||
|
||||
// Freeze the static UnicodeSet
|
||||
fThaiWordSet.freeze();
|
||||
thaiWordSet.freeze();
|
||||
fMarkSet.freeze();
|
||||
fEndWordSet.freeze();
|
||||
fBeginWordSet.freeze();
|
||||
fSuffixSet.freeze();
|
||||
}
|
||||
|
||||
public ThaiBreakEngine() throws IOException {
|
||||
setCharacters(fThaiWordSet);
|
||||
setCharacters(thaiWordSet);
|
||||
// Initialize dictionary
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Thai");
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue