ICU-21847 Move UnicodeSet to stack in constructor

See #1941
This commit is contained in:
Frank Tang 2021-12-01 23:39:35 +00:00 committed by Frank Yung-Fong Tang
parent 57d41c92a1
commit 704415402a
7 changed files with 88 additions and 147 deletions

View file

@ -199,13 +199,13 @@ ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
setCharacters(fThaiWordSet);
setCharacters(thaiWordSet);
}
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.add(0x0020);
fEndWordSet = fThaiWordSet;
fEndWordSet = thaiWordSet;
fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
@ -441,13 +441,13 @@ LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
setCharacters(fLaoWordSet);
setCharacters(laoWordSet);
}
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.add(0x0020);
fEndWordSet = fLaoWordSet;
fEndWordSet = laoWordSet;
fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
@ -637,14 +637,13 @@ BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
setCharacters(fBurmeseWordSet);
}
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.add(0x0020);
fEndWordSet = fBurmeseWordSet;
fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status);
fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.add(0x0020);
if (U_SUCCESS(status)) {
setCharacters(fEndWordSet);
}
// Compact for caching.
fMarkSet.compact();
@ -830,13 +829,13 @@ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
setCharacters(fKhmerWordSet);
setCharacters(khmerWordSet);
}
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.add(0x0020);
fEndWordSet = fKhmerWordSet;
fEndWordSet = khmerWordSet;
fBeginWordSet.add(0x1780, 0x17B3);
//fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
//fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
@ -1050,24 +1049,19 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
// Korean dictionary only includes Hangul syllables
fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
nfkcNorm2 = Normalizer2::getNFKCInstance(status);
// Korean dictionary only includes Hangul syllables
fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
fHangulWordSet.compact();
if (U_SUCCESS(status)) {
// handle Korean and Japanese/Chinese using different dictionaries
if (type == kKorean) {
// handle Korean and Japanese/Chinese using different dictionaries
if (type == kKorean) {
if (U_SUCCESS(status)) {
setCharacters(fHangulWordSet);
} else { //Chinese and Japanese
UnicodeSet cjSet;
cjSet.addAll(fHanWordSet);
cjSet.addAll(fKatakanaWordSet);
cjSet.addAll(fHiraganaWordSet);
cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
}
} else { //Chinese and Japanese
UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
if (U_SUCCESS(status)) {
setCharacters(cjSet);
}
}

View file

@ -127,7 +127,6 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
* @internal
*/
UnicodeSet fThaiWordSet;
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fSuffixSet;
@ -186,7 +185,6 @@ class LaoBreakEngine : public DictionaryBreakEngine {
* @internal
*/
UnicodeSet fLaoWordSet;
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fMarkSet;
@ -244,7 +242,6 @@ class BurmeseBreakEngine : public DictionaryBreakEngine {
* @internal
*/
UnicodeSet fBurmeseWordSet;
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fMarkSet;
@ -302,7 +299,6 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
* @internal
*/
UnicodeSet fKhmerWordSet;
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fMarkSet;
@ -366,9 +362,6 @@ class CjkBreakEngine : public DictionaryBreakEngine {
* @internal
*/
UnicodeSet fHangulWordSet;
UnicodeSet fHanWordSet;
UnicodeSet fKatakanaWordSet;
UnicodeSet fHiraganaWordSet;
DictionaryMatcher *fDictionary;
const Normalizer2 *nfkcNorm2;

View file

@ -31,24 +31,16 @@ public class BurmeseBreakEngine extends DictionaryBreakEngine {
private static final byte BURMESE_MIN_WORD = 2;
private DictionaryMatcher fDictionary;
private static UnicodeSet fBurmeseWordSet;
private static UnicodeSet fEndWordSet;
private static UnicodeSet fBeginWordSet;
private static UnicodeSet fMarkSet;
private UnicodeSet fEndWordSet;
private UnicodeSet fBeginWordSet;
private UnicodeSet fMarkSet;
static {
public BurmeseBreakEngine() throws IOException {
// Initialize UnicodeSets
fBurmeseWordSet = new UnicodeSet();
fMarkSet = new UnicodeSet();
fBeginWordSet = new UnicodeSet();
fBurmeseWordSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]]");
fBurmeseWordSet.compact();
fMarkSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");
fBeginWordSet = new UnicodeSet(0x1000, 0x102A); // basic consonants and independent vowels
fEndWordSet = new UnicodeSet("[[:Mymr:]&[:LineBreak=SA:]]");
fMarkSet = new UnicodeSet("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");
fMarkSet.add(0x0020);
fEndWordSet = new UnicodeSet(fBurmeseWordSet);
fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
// Compact for caching
fMarkSet.compact();
@ -56,14 +48,11 @@ public class BurmeseBreakEngine extends DictionaryBreakEngine {
fBeginWordSet.compact();
// Freeze the UnicodeSets so they cannot be modified afterwards
fBurmeseWordSet.freeze();
fMarkSet.freeze();
fEndWordSet.freeze();
fBeginWordSet.freeze();
}
public BurmeseBreakEngine() throws IOException {
setCharacters(fBurmeseWordSet);
setCharacters(fEndWordSet);
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Mymr");
}

View file

@ -20,36 +20,18 @@ import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UnicodeSet;
public class CjkBreakEngine extends DictionaryBreakEngine {
private static final UnicodeSet fHangulWordSet = new UnicodeSet();
private static final UnicodeSet fHanWordSet = new UnicodeSet();
private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
static {
fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
fHanWordSet.applyPattern("[:Han:]");
fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
fHiraganaWordSet.applyPattern("[:Hiragana:]");
// freeze them all
fHangulWordSet.freeze();
fHanWordSet.freeze();
fKatakanaWordSet.freeze();
fHiraganaWordSet.freeze();
}
private UnicodeSet fHangulWordSet;
private DictionaryMatcher fDictionary = null;
public CjkBreakEngine(boolean korean) throws IOException {
fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]");
fHangulWordSet.freeze();
fDictionary = DictionaryData.loadDictionaryFor("Hira");
if (korean) {
setCharacters(fHangulWordSet);
} else { //Chinese and Japanese
UnicodeSet cjSet = new UnicodeSet();
cjSet.addAll(fHanWordSet);
cjSet.addAll(fKatakanaWordSet);
cjSet.addAll(fHiraganaWordSet);
cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]");
setCharacters(cjSet);
}
}

View file

@ -33,24 +33,20 @@ public class KhmerBreakEngine extends DictionaryBreakEngine {
private DictionaryMatcher fDictionary;
private static UnicodeSet fKhmerWordSet;
private static UnicodeSet fEndWordSet;
private static UnicodeSet fBeginWordSet;
private static UnicodeSet fMarkSet;
private UnicodeSet fEndWordSet;
private UnicodeSet fBeginWordSet;
private UnicodeSet fMarkSet;
static {
public KhmerBreakEngine() throws IOException {
// Initialize UnicodeSets
fKhmerWordSet = new UnicodeSet();
fMarkSet = new UnicodeSet();
fBeginWordSet = new UnicodeSet();
fKhmerWordSet.applyPattern("[[:Khmer:]&[:LineBreak=SA:]]");
fKhmerWordSet.compact();
fMarkSet.applyPattern("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]");
UnicodeSet khmerWordSet = new UnicodeSet("[[:Khmer:]&[:LineBreak=SA:]]");
fMarkSet = new UnicodeSet("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]");
fMarkSet.add(0x0020);
fEndWordSet = new UnicodeSet(fKhmerWordSet);
fBeginWordSet.add(0x1780, 0x17B3);
fBeginWordSet = new UnicodeSet(0x1780, 0x17B3);
khmerWordSet.compact();
fEndWordSet = new UnicodeSet(khmerWordSet);
fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
// Compact for caching
@ -59,14 +55,12 @@ public class KhmerBreakEngine extends DictionaryBreakEngine {
fBeginWordSet.compact();
// Freeze the UnicodeSets so they cannot be modified afterwards
fKhmerWordSet.freeze();
khmerWordSet.freeze();
fMarkSet.freeze();
fEndWordSet.freeze();
fBeginWordSet.freeze();
}
public KhmerBreakEngine() throws IOException {
setCharacters(fKhmerWordSet);
setCharacters(khmerWordSet);
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Khmr");
}

View file

@ -30,27 +30,24 @@ public class LaoBreakEngine extends DictionaryBreakEngine {
private static final byte LAO_MIN_WORD = 2;
private DictionaryMatcher fDictionary;
private static UnicodeSet fLaoWordSet;
private static UnicodeSet fEndWordSet;
private static UnicodeSet fBeginWordSet;
private static UnicodeSet fMarkSet;
private UnicodeSet fEndWordSet;
private UnicodeSet fBeginWordSet;
private UnicodeSet fMarkSet;
static {
public LaoBreakEngine() throws IOException {
// Initialize UnicodeSets
fLaoWordSet = new UnicodeSet();
fMarkSet = new UnicodeSet();
fBeginWordSet = new UnicodeSet();
fLaoWordSet.applyPattern("[[:Laoo:]&[:LineBreak=SA:]]");
fLaoWordSet.compact();
fMarkSet.applyPattern("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]");
UnicodeSet laoWordSet = new UnicodeSet("[[:Laoo:]&[:LineBreak=SA:]]");
fMarkSet = new UnicodeSet("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]");
fMarkSet.add(0x0020);
fEndWordSet = new UnicodeSet(fLaoWordSet);
fBeginWordSet = new UnicodeSet(
0x0E81, 0x0EAE, // basic consonants (including holes for corresponding Thai characters)
0x0EC0, 0x0EC4, // prefix vowels
0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
laoWordSet.compact();
fEndWordSet = new UnicodeSet(laoWordSet);
fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels
// Compact for caching
fMarkSet.compact();
@ -58,14 +55,12 @@ public class LaoBreakEngine extends DictionaryBreakEngine {
fBeginWordSet.compact();
// Freeze the UnicodeSets so they cannot be modified afterwards
fLaoWordSet.freeze();
laoWordSet.freeze();
fMarkSet.freeze();
fEndWordSet.freeze();
fBeginWordSet.freeze();
}
public LaoBreakEngine() throws IOException {
setCharacters(fLaoWordSet);
setCharacters(laoWordSet);
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Laoo");
}

View file

@ -36,32 +36,28 @@ public class ThaiBreakEngine extends DictionaryBreakEngine {
private static final byte THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
private DictionaryMatcher fDictionary;
private static UnicodeSet fThaiWordSet;
private static UnicodeSet fEndWordSet;
private static UnicodeSet fBeginWordSet;
private static UnicodeSet fSuffixSet;
private static UnicodeSet fMarkSet;
private UnicodeSet fEndWordSet;
private UnicodeSet fBeginWordSet;
private UnicodeSet fSuffixSet;
private UnicodeSet fMarkSet;
static {
public ThaiBreakEngine() throws IOException {
// Initialize UnicodeSets
fThaiWordSet = new UnicodeSet();
fMarkSet = new UnicodeSet();
fBeginWordSet = new UnicodeSet();
fSuffixSet = new UnicodeSet();
fThaiWordSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]]");
fThaiWordSet.compact();
fMarkSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
UnicodeSet thaiWordSet = new UnicodeSet("[[:Thai:]&[:LineBreak=SA:]]");
fMarkSet = new UnicodeSet("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
fMarkSet.add(0x0020);
fEndWordSet = new UnicodeSet(fThaiWordSet);
fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
fBeginWordSet.add(0x0E01, 0x0E2E); //KO KAI through HO NOKHUK
fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
fBeginWordSet = new UnicodeSet(0x0E01, 0x0E2E, //KO KAI through HO NOKHUK
0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
fSuffixSet = new UnicodeSet();
fSuffixSet.add(THAI_PAIYANNOI);
fSuffixSet.add(THAI_MAIYAMOK);
thaiWordSet.compact();
fEndWordSet = new UnicodeSet(thaiWordSet);
fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
// Compact for caching
fMarkSet.compact();
fEndWordSet.compact();
@ -69,15 +65,13 @@ public class ThaiBreakEngine extends DictionaryBreakEngine {
fSuffixSet.compact();
// Freeze the UnicodeSets so they cannot be modified afterwards
fThaiWordSet.freeze();
thaiWordSet.freeze();
fMarkSet.freeze();
fEndWordSet.freeze();
fBeginWordSet.freeze();
fSuffixSet.freeze();
}
public ThaiBreakEngine() throws IOException {
setCharacters(fThaiWordSet);
setCharacters(thaiWordSet);
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Thai");
}