From ed2c14b42532469d77427fd9acd8dfa60327e0ac Mon Sep 17 00:00:00 2001 From: Maxime Serrano Date: Thu, 16 Aug 2012 23:16:04 +0000 Subject: [PATCH] ICU-9353 merge dbbi-tries work into the trunk X-SVN-Rev: 32185 --- icu4j/build.xml | 4 +- .../com/ibm/icu/impl/CharacterIteration.java | 126 +++ .../src/com/ibm/icu/text/BreakIterator.java | 5 + .../ibm/icu/text/BreakIteratorFactory.java | 69 +- .../ibm/icu/text/BytesDictionaryMatcher.java | 83 ++ .../ibm/icu/text/CharsDictionaryMatcher.java | 61 ++ .../src/com/ibm/icu/text/CjkBreakEngine.java | 218 +++++ .../text/DictionaryBasedBreakIterator.java | 565 ------------ .../ibm/icu/text/DictionaryBreakEngine.java | 69 ++ .../src/com/ibm/icu/text/DictionaryData.java | 90 ++ .../com/ibm/icu/text/DictionaryMatcher.java | 40 + .../com/ibm/icu/text/LanguageBreakEngine.java | 40 + .../ibm/icu/text/RuleBasedBreakIterator.java | 811 ++++++++++-------- ...reakIterator.java => ThaiBreakEngine.java} | 131 +-- .../ibm/icu/text/UnhandledBreakEngine.java | 46 + icu4j/main/shared/data/icudata.jar | 4 +- icu4j/main/shared/data/icutzdata.jar | 2 +- icu4j/main/shared/data/testdata.jar | 4 +- .../icu/dev/test/rbbi/BreakIteratorTest.java | 58 +- .../com/ibm/icu/dev/test/rbbi/RBBITest.java | 12 +- .../icu/dev/test/rbbi/RBBITestExtended.java | 2 +- .../ibm/icu/dev/test/rbbi/RBBITestMonkey.java | 10 +- .../ibm/icu/dev/test/rbbi/SimpleBITest.java | 24 +- .../src/com/ibm/icu/dev/test/rbbi/rbbitst.txt | 140 ++- .../dev/test/util/ICUResourceBundleTest.java | 12 - .../ibm/icu/dev/test/util/ULocaleTest.java | 17 - 26 files changed, 1372 insertions(+), 1271 deletions(-) create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterIteration.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/text/BytesDictionaryMatcher.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/text/CharsDictionaryMatcher.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java delete mode 100644 
icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBasedBreakIterator.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBreakEngine.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryData.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryMatcher.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/text/LanguageBreakEngine.java rename icu4j/main/classes/core/src/com/ibm/icu/text/{ThaiBreakIterator.java => ThaiBreakEngine.java} (75%) create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java diff --git a/icu4j/build.xml b/icu4j/build.xml index 4a9d5848882..13f3d55a727 100644 --- a/icu4j/build.xml +++ b/icu4j/build.xml @@ -1583,7 +1583,7 @@ - + @@ -1676,7 +1676,7 @@ - + diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterIteration.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterIteration.java new file mode 100644 index 00000000000..6bd34906705 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterIteration.java @@ -0,0 +1,126 @@ +/* + ******************************************************************************* + * Copyright (C) 2012, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ +package com.ibm.icu.impl; + +import java.text.CharacterIterator; + +import com.ibm.icu.text.UTF16; + +public final class CharacterIteration { + // disallow instantiation + private CharacterIteration() { } + + // 32 bit Char value returned from when an iterator has run out of range. + // Positive value so fast case (not end, not surrogate) can be checked + // with a single test. + public static int DONE32 = 0x7fffffff; + + /** + * Move the iterator forward to the next code point, and return that code point, + * leaving the iterator positioned at char returned. 
+ * For Supplementary chars, the iterator is left positioned at the lead surrogate. + * @param ci The character iterator + * @return The next code point. + */ + public static int next32(CharacterIterator ci) { + // If the current position is at a surrogate pair, move to the trail surrogate + // which leaves it in positon for underlying iterator's next() to work. + int c= ci.current(); + if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE && c<=UTF16.LEAD_SURROGATE_MAX_VALUE) { + c = ci.next(); + if (cUTF16.TRAIL_SURROGATE_MAX_VALUE) { + c = ci.previous(); + } + } + + // For BMP chars, this next() is the real deal. + c = ci.next(); + + // If we might have a lead surrogate, we need to peak ahead to get the trail + // even though we don't want to really be positioned there. + if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { + c = nextTrail32(ci, c); + } + + if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != DONE32) { + // We got a supplementary char. Back the iterator up to the postion + // of the lead surrogate. + ci.previous(); + } + return c; + } + + + // Out-of-line portion of the in-line Next32 code. + // The call site does an initial ci.next() and calls this function + // if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE. + // NOTE: we leave the underlying char iterator positioned in the + // middle of a surroage pair. ci.next() will work correctly + // from there, but the ci.getIndex() will be wrong, and needs + // adjustment. 
+ public static int nextTrail32(CharacterIterator ci, int lead) { + int retVal = lead; + if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { + char cTrail = ci.next(); + if (UTF16.isTrailSurrogate(cTrail)) { + retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) + + (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE) + + UTF16.SUPPLEMENTARY_MIN_VALUE; + } else { + ci.previous(); + } + } else { + if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) { + retVal = DONE32; + } + } + return retVal; + } + + public static int previous32(CharacterIterator ci) { + if (ci.getIndex() <= ci.getBeginIndex()) { + return DONE32; + } + char trail = ci.previous(); + int retVal = trail; + if (UTF16.isTrailSurrogate(trail) && ci.getIndex()>ci.getBeginIndex()) { + char lead = ci.previous(); + if (UTF16.isLeadSurrogate(lead)) { + retVal = (((int)lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) + + ((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) + + UTF16.SUPPLEMENTARY_MIN_VALUE; + } else { + ci.next(); + } + } + return retVal; + } + + public static int current32(CharacterIterator ci) { + char lead = ci.current(); + int retVal = lead; + if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) { + return retVal; + } + if (UTF16.isLeadSurrogate(lead)) { + int trail = (int)ci.next(); + ci.previous(); + if (UTF16.isTrailSurrogate((char)trail)) { + retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) + + (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) + + UTF16.SUPPLEMENTARY_MIN_VALUE; + } + } else { + if (lead == CharacterIterator.DONE) { + if (ci.getIndex() >= ci.getEndIndex()) { + retVal = DONE32; + } + } + } + return retVal; + } +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIterator.java index aaaf0b1071b..8cf5bf42b01 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIterator.java @@ -732,6 +732,11 @@ s */ BreakIteratorCache cache 
= new BreakIteratorCache(where, result); iterCache[kind] = new SoftReference(cache); + if (result instanceof RuleBasedBreakIterator) { + RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator)result; + rbbi.setBreakType(kind); + } + return result; } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java index ea089f34874..92f815a4f9a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 2002-2010, International Business Machines Corporation and * + * Copyright (C) 2002-2012, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ @@ -90,28 +90,20 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim * pre-compiled break rules. The resource bundle name is "boundaries". * The value for each key will be the rules to be used for the * specified locale - "word" -> "word_th" for Thai, for example. - * DICTIONARY_POSSIBLE indexes in the same way, and indicates whether a - * dictionary is a possibility for that type of break. This is just - * an optimization to avoid a resource lookup where no dictionary is - * ever possible. 
*/ private static final String[] KIND_NAMES = { "grapheme", "word", "line", "sentence", "title" - }; - private static final boolean[] DICTIONARY_POSSIBLE = { - false, true, true, false, false }; private static BreakIterator createBreakInstance(ULocale locale, int kind) { - BreakIterator iter = null; - ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME, locale); + RuleBasedBreakIterator iter = null; + ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME, locale); // - // Get the binary rules. These are needed for both normal RulesBasedBreakIterators - // and for Dictionary iterators. - // + // Get the binary rules. + // InputStream ruleStream = null; try { String typeKey = KIND_NAMES[kind]; @@ -122,51 +114,22 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim catch (Exception e) { throw new MissingResourceException(e.toString(),"",""); } - - // - // Check whether a dictionary exists, and create a DBBI iterator is - // one does. - // - if (DICTIONARY_POSSIBLE[kind]) { - // This type of break iterator could potentially use a dictionary. - // - try { - if (locale.getLanguage().equals("th")){ - // If the language is Thai, load the thai compact trie dictionary. - String dictType = "Thai"; - String dictFileName = rb.getStringWithFallback("dictionaries/" + dictType); - dictFileName = ICUResourceBundle.ICU_BUNDLE +ICUResourceBundle.ICU_BRKITR_NAME+ "/" + dictFileName; - InputStream is = ICUData.getStream(dictFileName); - iter = new ThaiBreakIterator(ruleStream, is); - } - } catch (MissingResourceException e) { - // Couldn't find a dictionary. - // This is normal, and will occur whenever creating a word or line - // break iterator for a locale that does not have a BreakDictionaryData - // resource - meaning for all but Thai. - // Fall through to creating a normal RulebasedBreakIterator. 
- } catch (IOException e) { - Assert.fail(e); - } - } - if (iter == null) { - // - // Create a normal RuleBasedBreakIterator. - // We have determined that this is not supposed to be a dictionary iterator. - // - try { - iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(ruleStream); - } - catch (IOException e) { - // Shouldn't be possible to get here. - // If it happens, the compiled rules are probably corrupted in some way. - Assert.fail(e); - } + // + // Create a normal RuleBasedBreakIterator. + // + try { + iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(ruleStream); + } + catch (IOException e) { + // Shouldn't be possible to get here. + // If it happens, the compiled rules are probably corrupted in some way. + Assert.fail(e); } // TODO: Determine valid and actual locale correctly. ULocale uloc = ULocale.forLocale(rb.getLocale()); iter.setLocale(uloc, uloc); + iter.setBreakType(kind); return iter; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/BytesDictionaryMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/text/BytesDictionaryMatcher.java new file mode 100644 index 00000000000..67a0a926d71 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BytesDictionaryMatcher.java @@ -0,0 +1,83 @@ +/* + ******************************************************************************* + * Copyright (C) 2012, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + */ +package com.ibm.icu.text; + +import java.text.CharacterIterator; + +import com.ibm.icu.impl.Assert; +import com.ibm.icu.util.BytesTrie; +import com.ibm.icu.util.BytesTrie.Result; + +class BytesDictionaryMatcher extends DictionaryMatcher { + private final byte[] characters; + private final int transform; + + public BytesDictionaryMatcher(byte[] chars, int transform) { + characters = chars; + Assert.assrt((transform & DictionaryData.TRANSFORM_TYPE_MASK) == DictionaryData.TRANSFORM_TYPE_OFFSET); + // while there is only one transform type so far, save the entire transform constant so that + // if we add any others, we need only change code in transform() and the assert above rather + // than adding a "transform type" variable + this.transform = transform; + } + + private int transform(int c) { + if (c == 0x200D) { + return 0xFF; + } else if (c == 0x200C) { + return 0xFE; + } + + int delta = c - (transform & DictionaryData.TRANSFORM_OFFSET_MASK); + if (delta < 0 || 0xFD < delta) { + return -1; + } + return delta; + } + + public int matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values) { + UCharacterIterator text = UCharacterIterator.getInstance(text_); + BytesTrie bt = new BytesTrie(characters, 0); + int c = text.nextCodePoint(); + Result result = bt.first(transform(c)); + // TODO: should numChars count Character.charCount() ? 
+ int numChars = 1; + int count = 0; + for (;;) { + if (result.hasValue()) { + if (count < limit) { + if (values != null) { + values[count] = bt.getValue(); + } + lengths[count] = numChars; + count++; + } + if (result == Result.FINAL_VALUE) { + break; + } + } else if (result == Result.NO_MATCH) { + break; + } + + if (numChars >= maxLength) { + break; + } + + c = text.nextCodePoint(); + ++numChars; + result = bt.next(transform(c)); + } + count_[0] = count; + return numChars; + } + + public int getType() { + return DictionaryData.TRIE_TYPE_BYTES; + } +} + + diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsDictionaryMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsDictionaryMatcher.java new file mode 100644 index 00000000000..5f87100f729 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsDictionaryMatcher.java @@ -0,0 +1,61 @@ +/* + ******************************************************************************* + * Copyright (C) 2012, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ +package com.ibm.icu.text; + +import java.text.CharacterIterator; + +import com.ibm.icu.util.BytesTrie.Result; +import com.ibm.icu.util.CharsTrie; + +class CharsDictionaryMatcher extends DictionaryMatcher { + private CharSequence characters; + + public CharsDictionaryMatcher(CharSequence chars) { + characters = chars; + } + + public int matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values) { + UCharacterIterator text = UCharacterIterator.getInstance(text_); + CharsTrie uct = new CharsTrie(characters, 0); + int c = text.nextCodePoint(); + Result result = uct.firstForCodePoint(c); + // TODO: should numChars count Character.charCount? 
+ int numChars = 1; + int count = 0; + for (;;) { + if (result.hasValue()) { + if (count < limit) { + if (values != null) { + values[count] = uct.getValue(); + } + lengths[count] = numChars; + count++; + } + + if (result == Result.FINAL_VALUE) { + break; + } + } else if (result == Result.NO_MATCH) { + break; + } + + if (numChars >= maxLength) { + break; + } + c = text.nextCodePoint(); + ++numChars; + result = uct.nextForCodePoint(c); + } + count_[0] = count; + return numChars; + } + + public int getType() { + return DictionaryData.TRIE_TYPE_UCHARS; + } +} + diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java new file mode 100644 index 00000000000..7f399714c09 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java @@ -0,0 +1,218 @@ +/* + ******************************************************************************* + * Copyright (C) 2012, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + */ +package com.ibm.icu.text; + +import java.io.IOException; +import java.text.CharacterIterator; +import java.util.Stack; + +import com.ibm.icu.impl.Assert; + +import static com.ibm.icu.impl.CharacterIteration.*; + +public class CjkBreakEngine implements LanguageBreakEngine { + private static final UnicodeSet fHangulWordSet = new UnicodeSet(); + private static final UnicodeSet fHanWordSet = new UnicodeSet(); + private static final UnicodeSet fKatakanaWordSet = new UnicodeSet(); + private static final UnicodeSet fHiraganaWordSet = new UnicodeSet(); + static { + fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]"); + fHanWordSet.applyPattern("[:Han:]"); + fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]"); + fHiraganaWordSet.applyPattern("[:Hiragana:]"); + + // freeze them all + fHangulWordSet.freeze(); + fHanWordSet.freeze(); + fKatakanaWordSet.freeze(); + fHiraganaWordSet.freeze(); + } + + private final UnicodeSet fWordSet; + private DictionaryMatcher fDictionary = null; + + public CjkBreakEngine(boolean korean) throws IOException { + fDictionary = DictionaryData.loadDictionaryFor("Hira"); + if (korean) { + fWordSet = fHangulWordSet; + } else { + fWordSet = new UnicodeSet(); + fWordSet.addAll(fHanWordSet); + fWordSet.addAll(fKatakanaWordSet); + fWordSet.addAll(fHiraganaWordSet); + fWordSet.add("\\uff70\\u30fc"); + } + } + + public boolean handles(int c, int breakType) { + return (breakType == BreakIterator.KIND_WORD) && + (fWordSet.contains(c)); + } + + private static final int kMaxKatakanaLength = 8; + private static final int kMaxKatakanaGroupLength = 20; + private static final int maxSnlp = 255; + private static final int kint32max = Integer.MAX_VALUE; + private static int getKatakanaCost(int wordlength) { + int katakanaCost[] = new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 }; + return (wordlength > kMaxKatakanaLength) ? 
8192 : katakanaCost[wordlength]; + } + + private static boolean isKatakana(int value) { + return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) || + (value >= 0xFF66 && value <= 0xFF9F); + } + + public int findBreaks(CharacterIterator inText, int startPos, int endPos, + boolean reverse, int breakType, Stack foundBreaks) { + if (startPos >= endPos) { + return 0; + } + + inText.setIndex(startPos); + + int inputLength = endPos - startPos; + int[] charPositions = new int[inputLength + 1]; + StringBuffer s = new StringBuffer(""); + inText.setIndex(startPos); + while (inText.getIndex() < endPos) { + s.append(inText.current()); + inText.next(); + } + String prenormstr = s.toString(); + boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES || + Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0); + CharacterIterator text = inText; + int numChars = 0; + if (isNormalized) { + int index = 0; + charPositions[0] = 0; + while (index < prenormstr.length()) { + int codepoint = prenormstr.codePointAt(index); + index += Character.charCount(codepoint); + numChars++; + charPositions[numChars] = index; + } + } else { + String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC); + text = new java.text.StringCharacterIterator(normStr); + charPositions = new int[normStr.length() + 1]; + Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0); + int index = 0; + charPositions[0] = 0; + while (index < normalizer.endIndex()) { + normalizer.next(); + numChars++; + index = normalizer.getIndex(); + charPositions[numChars] = index; + } + } + + // From here on out, do the algorithm. Note that our indices + // refer to indices within the normalized string. 
+ int[] bestSnlp = new int[numChars + 1]; + bestSnlp[0] = 0; + for (int i = 1; i <= numChars; i++) { + bestSnlp[i] = kint32max; + } + + int[] prev = new int[numChars + 1]; + for (int i = 0; i <= numChars; i++) { + prev[i] = -1; + } + + final int maxWordSize = 20; + int values[] = new int[numChars]; + int lengths[] = new int[numChars]; + // dynamic programming to find the best segmentation + boolean is_prev_katakana = false; + for (int i = 0; i < numChars; i++) { + text.setIndex(i); + if (bestSnlp[i] == kint32max) { + continue; + } + + int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i); + int[] count_ = new int[1]; + fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values); + int count = count_[0]; + + // if there are no single character matches found in the dictionary + // starting with this character, treat character as a 1-character word + // with the highest value possible (i.e. the least likely to occur). + // Exclude Korean characters from this treatment, as they should be + // left together by default. + if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) { + values[count] = maxSnlp; + lengths[count] = 1; + count++; + } + + for (int j = 0; j < count; j++) { + int newSnlp = bestSnlp[i] + values[j]; + if (newSnlp < bestSnlp[lengths[j] + i]) { + bestSnlp[lengths[j] + i] = newSnlp; + prev[lengths[j] + i] = i; + } + } + + // In Japanese, single-character Katakana words are pretty rare. + // So we apply the following heuristic to Katakana: any continuous + // run of Katakana characters is considered a candidate word with + // a default cost specified in the katakanaCost table according + // to its length. 
+ text.setIndex(i); + boolean is_katakana = isKatakana(current32(text)); + if (!is_prev_katakana && is_katakana) { + int j = i + 1; + next32(text); + while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) { + next32(text); + ++j; + } + + if ((j - i) < kMaxKatakanaGroupLength) { + int newSnlp = bestSnlp[i] + getKatakanaCost(j - i); + if (newSnlp < bestSnlp[j]) { + bestSnlp[j] = newSnlp; + prev[j] = i; + } + } + } + is_prev_katakana = is_katakana; + } + + int t_boundary[] = new int[numChars + 1]; + int numBreaks = 0; + if (bestSnlp[numChars] == kint32max) { + t_boundary[numBreaks] = numChars; + numBreaks++; + } else { + for (int i = numChars; i > 0; i = prev[i]) { + t_boundary[numBreaks] = i; + numBreaks++; + } + Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0); + } + + if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) { + t_boundary[numBreaks++] = 0; + } + + for (int i = numBreaks - 1; i >= 0; i--) { + int pos = charPositions[t_boundary[i]] + startPos; + if (!(foundBreaks.contains(pos) || pos == startPos)) + foundBreaks.push(charPositions[t_boundary[i]] + startPos); + } + + if (!foundBreaks.empty() && foundBreaks.peek() == endPos) + foundBreaks.pop(); + if (!foundBreaks.empty()) + inText.setIndex(foundBreaks.peek()); + return 0; + } +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBasedBreakIterator.java deleted file mode 100644 index 0e1cca28dce..00000000000 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBasedBreakIterator.java +++ /dev/null @@ -1,565 +0,0 @@ -/* - ******************************************************************************* - * Copyright (C) 1996-2010, International Business Machines Corporation and * - * others. All Rights Reserved. 
* - ******************************************************************************* - */ - -package com.ibm.icu.text; - -import java.io.IOException; -import java.io.InputStream; -import java.text.CharacterIterator; -import java.util.ArrayList; -import java.util.List; -import java.util.Stack; - -import com.ibm.icu.impl.Assert; - - -/** - * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary - * to further subdivide ranges of text beyond what is possible using just the - * state-table-based algorithm. This is necessary, for example, to handle - * word and line breaking in Thai, which doesn't use spaces between words. The - * state-table-based algorithm used by RuleBasedBreakIterator_Old is used to divide - * up text as far as possible, and then contiguous ranges of letters are - * repeatedly compared against a list of known words (i.e., the dictionary) - * to divide them up into words. - * - * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator_Old, - * but adds one more special substitution name: _dictionary_. This substitution - * name is used to identify characters in words in the dictionary. The idea is that - * if the iterator passes over a chunk of text that includes two or more characters - * in a row that are included in _dictionary_, it goes back through that range and - * derives additional break positions (if possible) using the dictionary. - * - * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary - * file. It uses Class.getResource() to locate the dictionary file. The - * dictionary file is in a serialized binary format. We have a very primitive (and - * slow) BuildDictionaryFile utility for creating dictionary files, but aren't - * currently making it public. Contact us for help. - * - * @stable ICU 2.0 - */ -public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator { - - /** - * Keeps track of if we are using the compact trie dictionary. 
- */ - private boolean usingCTDictionary = false; - /** - * a list of known words that is used to divide up contiguous ranges of letters, - * stored in a compressed, indexed, format that offers fast access - */ - private BreakDictionary dictionary; - - /* - * a list of flags indicating which character categories are contained in - * the dictionary file (this is used to determine which ranges of characters - * to apply the dictionary to) - */ - //private boolean[] categoryFlags; - - - /** - * when a range of characters is divided up using the dictionary, the break - * positions that are discovered are stored here, preventing us from having - * to use either the dictionary or the state table again until the iterator - * leaves this range of text - */ - int[] cachedBreakPositions; - - /** - * if cachedBreakPositions is not null, this indicates which item in the - * cache the current iteration position refers to - */ - int positionInCache; - - /** - * Special variable name for characters in words in dictionary - */ - - /** - * Construct a DictionarBasedBreakIterator from precompiled rules. Use by ThaiBreakEngine - * uses the BreakCTDictionary. - * @param compiledRules an input stream containing the binary (flattened) compiled rules. - * @internal - * @deprecated This API is ICU internal only. - */ - protected DictionaryBasedBreakIterator(InputStream compiledRules) throws IOException { - fRData = RBBIDataWrapper.get(compiledRules); // Init the RBBI part of this iterator. - dictionary = null; - usingCTDictionary = true; - } - /** - * Constructs a DictionaryBasedBreakIterator. - * @param rules Same as the rules parameter on RuleBasedBreakIterator, - * except for the special meaning of "_dictionary_". This parameter is just - * passed through to RuleBasedBreakIterator constructor. 
- * @param dictionaryStream the stream containing the dictionary data - * @stable ICU 2.0 - */ - public DictionaryBasedBreakIterator(String rules, - InputStream dictionaryStream) throws IOException { - super(rules); - dictionary = new BreakDictionary(dictionaryStream); - } - - - /** - * Construct a DictionarBasedBreakIterator from precompiled rules. - * @param compiledRules an input stream containing the binary (flattened) compiled rules. - * @param dictionaryStream an input stream containing the dictionary data - * @internal - * @deprecated This API is ICU internal only. - */ - public DictionaryBasedBreakIterator(InputStream compiledRules, - InputStream dictionaryStream) throws IOException { - fRData = RBBIDataWrapper.get(compiledRules); // Init the RBBI part of this iterator. - dictionary = new BreakDictionary(dictionaryStream); - } - - - /** @stable ICU 2.0 */ - public void setText(CharacterIterator newText) { - super.setText(newText); - cachedBreakPositions = null; - fDictionaryCharCount = 0; - positionInCache = 0; - } - - /** - * Sets the current iteration position to the beginning of the text. - * (i.e., the CharacterIterator's starting offset). - * @return The offset of the beginning of the text. - * @stable ICU 2.0 - */ - public int first() { - cachedBreakPositions = null; - fDictionaryCharCount = 0; - positionInCache = 0; - return super.first(); - } - - /** - * Sets the current iteration position to the end of the text. - * (i.e., the CharacterIterator's ending offset). - * @return The text's past-the-end offset. - * @stable ICU 2.0 - */ - public int last() { - cachedBreakPositions = null; - fDictionaryCharCount = 0; - positionInCache = 0; - return super.last(); - } - - /** - * Advances the iterator one step backwards. 
- * @return The position of the last boundary position before the - * current iteration position - * @stable ICU 2.0 - */ - public int previous() { - CharacterIterator text = getText(); - - // if we have cached break positions and we're still in the range - // covered by them, just move one step backward in the cache - if (cachedBreakPositions != null && positionInCache > 0) { - --positionInCache; - text.setIndex(cachedBreakPositions[positionInCache]); - return cachedBreakPositions[positionInCache]; - } - - // otherwise, dump the cache and use the inherited previous() method to move - // backward. This may fill up the cache with new break positions, in which - // case we have to mark our position in the cache. If it doesn't, use next() - // to move forward until we hit or pass the current position. This *will* fill - // the cache. - else { - cachedBreakPositions = null; - int offset = current(); - int result = super.previous(); - - if (cachedBreakPositions != null) { - positionInCache = cachedBreakPositions.length - 2; - return result; - } - - while (result < offset) { - int nextResult = next(); - - if (nextResult >= offset) { - break; - } - - result = nextResult; - } - - if (cachedBreakPositions != null) { - positionInCache = cachedBreakPositions.length - 2; - } - - if (result != BreakIterator.DONE) { - text.setIndex(result); - } - - return result; - } - } - - /** - * Sets the current iteration position to the last boundary position - * before the specified position. 
- * @param offset The position to begin searching from - * @return The position of the last boundary before "offset" - * @stable ICU 2.0 - */ - public int preceding(int offset) { - CharacterIterator text = getText(); - checkOffset(offset, text); - - // if we have no cached break positions, or "offset" is outside the - // range covered by the cache, we can just call the inherited routine - // (which will eventually call other routines in this class that may - // refresh the cache) - if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] || - offset > cachedBreakPositions[cachedBreakPositions.length - 1]) { - cachedBreakPositions = null; - return super.preceding(offset); - } - - // on the other hand, if "offset" is within the range covered by the cache, - // then all we have to do is search the cache for the last break position - // before "offset" - else { - positionInCache = 0; - while (positionInCache < cachedBreakPositions.length - && offset > cachedBreakPositions[positionInCache]) - ++positionInCache; - --positionInCache; - text.setIndex(cachedBreakPositions[positionInCache]); - return text.getIndex(); - } - } - - /** - * Sets the current iteration position to the first boundary position after - * the specified position. - * @param offset The position to begin searching forward from - * @return The position of the first boundary after "offset" - * @stable ICU 2.0 - */ - public int following(int offset) { - CharacterIterator text = getText(); - checkOffset(offset, text); - - // if we have no cached break positions, or if "offset" is outside the - // range covered by the cache, then dump the cache and call our - // inherited following() method. This will call other methods in this - // class that may refresh the cache. 
- if (cachedBreakPositions == null || offset < cachedBreakPositions[0] || - offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) { - cachedBreakPositions = null; - return super.following(offset); - } - - // on the other hand, if "offset" is within the range covered by the - // cache, then just search the cache for the first break position - // after "offset" - else { - positionInCache = 0; - while (positionInCache < cachedBreakPositions.length - && offset >= cachedBreakPositions[positionInCache]) - ++positionInCache; - text.setIndex(cachedBreakPositions[positionInCache]); - return text.getIndex(); - } - } - - - /** - * Return the status tag from the break rule that determined the most recently - * returned break position. - * - * TODO: not supported with dictionary based break iterators. - * - * @return the status from the break rule that determined the most recently - * returned break position. - * @draft ICU 3.0 - * @provisional This API might change or be removed in a future release. - */ - public int getRuleStatus() { - return 0; - } - - - /** - * Get the status (tag) values from the break rule(s) that determined the most - * recently returned break position. The values appear in the rule source - * within brackets, {123}, for example. The default status value for rules - * that do not explicitly provide one is zero. - *

- * TODO: not supported for dictionary based break iterator. - * - * @param fillInArray an array to be filled in with the status values. - * @return The number of rule status values from rules that determined - * the most recent boundary returned by the break iterator. - * In the event that the array is too small, the return value - * is the total number of status values that were available, - * not the reduced number that were actually returned. - * @draft ICU 3.0 - * @provisional This API might change or be removed in a future release. - */ - public int getRuleStatusVec(int[] fillInArray) { - if (fillInArray != null && fillInArray.length>=1) { - fillInArray[0] = 0; - } - return 1; - } - /** - * This is the implementation function for next(). - * @internal - * @deprecated This API is ICU internal only. - */ - protected int handleNext() { - CharacterIterator text = getText(); - - // if there are no cached break positions, or if we've just moved - // off the end of the range covered by the cache, we have to dump - // and possibly regenerate the cache - if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) { - - // start by using the inherited handleNext() to find a tentative return - // value. dictionaryCharCount tells us how many dictionary characters - // we passed over on our way to the tentative return value - int startPos = text.getIndex(); - fDictionaryCharCount = 0; - int result = super.handleNext(); - - // if we passed over more than one dictionary character, then we use - // divideUpDictionaryRange() to regenerate the cached break positions - // for the new range. 
- if (!usingCTDictionary && fDictionaryCharCount > 1 && result - startPos > 1) { - divideUpDictionaryRange(startPos, result); - } - - // otherwise, the value we got back from the inherited fuction - // is our return value, and we can dump the cache - else { - cachedBreakPositions = null; - return result; - } - } - - // if the cache of break positions has been regenerated (or existed all - // along), then just advance to the next break position in the cache - // and return it - if (cachedBreakPositions != null) { - ++positionInCache; - text.setIndex(cachedBreakPositions[positionInCache]); - return cachedBreakPositions[positionInCache]; - } - ///CLOVER:OFF - Assert.assrt(false); - return -9999; // SHOULD NEVER GET HERE! - ///CLOVER:ON - } - - /** - * This is the function that actually implements the dictionary-based - * algorithm. Given the endpoints of a range of text, it uses the - * dictionary to determine the positions of any boundaries in this - * range. It stores all the boundary positions it discovers in - * cachedBreakPositions so that we only have to do this work once - * for each time we enter the range. - */ - @SuppressWarnings("unchecked") - private void divideUpDictionaryRange(int startPos, int endPos) { - CharacterIterator text = getText(); - - // the range we're dividing may begin or end with non-dictionary characters - // (i.e., for line breaking, we may have leading or trailing punctuation - // that needs to be kept with the word). Seek from the beginning of the - // range to the first dictionary character - text.setIndex(startPos); - int c = CICurrent32(text); - while (isDictionaryChar(c) == false) { - c = CINext32(text); - } - - //System.out.println("\nDividing up range from " + (text.getIndex() + 1) + " to " + endPos); - - // initialize. We maintain two stacks: currentBreakPositions contains - // the list of break positions that will be returned if we successfully - // finish traversing the whole range now. 
possibleBreakPositions lists - // all other possible word ends we've passed along the way. (Whenever - // we reach an error [a sequence of characters that can't begin any word - // in the dictionary], we back up, possibly delete some breaks from - // currentBreakPositions, move a break from possibleBreakPositions - // to currentBreakPositions, and start over from there. This process - // continues in this way until we either successfully make it all the way - // across the range, or exhaust all of our combinations of break - // positions.) - Stack currentBreakPositions = new Stack(); - Stack possibleBreakPositions = new Stack(); - List wrongBreakPositions = new ArrayList(); - - // the dictionary is implemented as a trie, which is treated as a state - // machine. -1 represents the end of a legal word. Every word in the - // dictionary is represented by a path from the root node to -1. A path - // that ends in state 0 is an illegal combination of characters. - int state = 0; - - // these two variables are used for error handling. We keep track of the - // farthest we've gotten through the range being divided, and the combination - // of breaks that got us that far. If we use up all possible break - // combinations, the text contains an error or a word that's not in the - // dictionary. In this case, we "bless" the break positions that got us the - // farthest as real break positions, and then start over from scratch with - // the character where the error occurred. - int farthestEndPoint = text.getIndex(); - Stack bestBreakPositions = null; - - // initialize (we always exit the loop with a break statement) - c = CICurrent32(text); - while (true) { -//System.out.print("c = " + Integer.toString(c, 16) + ", pos = " + text.getIndex()); - - // if we can transition to state "-1" from our current state, we're - // on the last character of a legal word. 
Push that position onto - // the possible-break-positions stack - if (dictionary.at(state, 0) == -1) { - possibleBreakPositions.push(Integer.valueOf(text.getIndex())); - } - - // look up the new state to transition to in the dictionary - // There will be no supplementaries here because the Thai dictionary - // does not include any. This code is going away soon, not worth - // fixing. - state = (dictionary.at(state, (char)c)) & 0xFFFF; // TODO: fix supplementaries -//System.out.print(", state = " + state); - - // if the character we're sitting on causes us to transition to - // the "end of word" state, then it was a non-dictionary character - // and we've successfully traversed the whole range. Drop out - // of the loop. - if (state == /*-1*/ 0xFFFF) { - currentBreakPositions.push(Integer.valueOf(text.getIndex())); - break; - } - - // if the character we're sitting on causes us to transition to - // the error state, or if we've gone off the end of the range - // without transitioning to the "end of word" state, we've hit - // an error... - else if (state == 0 || text.getIndex() >= endPos) { - - // if this is the farthest we've gotten, take note of it in - // case there's an error in the text - if (text.getIndex() > farthestEndPoint) { - farthestEndPoint = text.getIndex(); - bestBreakPositions = (Stack)(currentBreakPositions.clone()); - } - - // wrongBreakPositions is a list of all break positions we've tried starting - // that didn't allow us to traverse all the way through the text. Every time - // we pop a break position off of currentBreakPositions, we put it into - // wrongBreakPositions to avoid trying it again later. If we make it to this - // spot, we're either going to back up to a break in possibleBreakPositions - // and try starting over from there, or we've exhausted all possible break - // positions and are going to do the fallback procedure. 
This loop prevents - // us from messing with anything in possibleBreakPositions that didn't work as - // a starting point the last time we tried it (this is to prevent a bunch of - // repetitive checks from slowing down some extreme cases) - // variable not used Integer newStartingSpot = null; - while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains( - possibleBreakPositions.peek())) { - possibleBreakPositions.pop(); - } - - // if we've used up all possible break-position combinations, there's - // an error or an unknown word in the text. In this case, we start - // over, treating the farthest character we've reached as the beginning - // of the range, and "blessing" the break positions that got us that - // far as real break positions - if (possibleBreakPositions.isEmpty()) { - if (bestBreakPositions != null) { - currentBreakPositions = bestBreakPositions; - if (farthestEndPoint < endPos) { - text.setIndex(farthestEndPoint + 1); - } - else { - break; - } - } - else { - if ((currentBreakPositions.size() == 0 - || currentBreakPositions.peek().intValue() != text.getIndex()) - && text.getIndex() != startPos) { - currentBreakPositions.push(Integer.valueOf(text.getIndex())); - } - CINext32(text); - currentBreakPositions.push(Integer.valueOf(text.getIndex())); - } - } - - // if we still have more break positions we can try, then promote the - // last break in possibleBreakPositions into currentBreakPositions, - // and get rid of all entries in currentBreakPositions that come after - // it. 
Then back up to that position and start over from there (i.e., - // treat that position as the beginning of a new word) - else { - Integer temp = possibleBreakPositions.pop(); - Integer temp2 = null; - while (!currentBreakPositions.isEmpty() && temp.intValue() < - currentBreakPositions.peek().intValue()) { - temp2 = currentBreakPositions.pop(); - wrongBreakPositions.add(temp2); - } - currentBreakPositions.push(temp); - text.setIndex(currentBreakPositions.peek().intValue()); - } - - // re-sync "c" for the next go-round, and drop out of the loop if - // we've made it off the end of the range - c = CICurrent32(text); - state = 0; - if (text.getIndex() >= endPos) { - break; - } - } - - // if we didn't hit any exceptional conditions on this last iteration, - // just advance to the next character and loop - else { - c = CINext32(text); - } -//System.out.print(", possibleBreakPositions = { "); for (int i = 0; i < possibleBreakPositions.size(); i++) System.out.print(possibleBreakPositions.elementAt(i) + " "); System.out.print("}"); -//System.out.print(", currentBreakPositions = { "); for (int i = 0; i < currentBreakPositions.size(); i++) System.out.print(currentBreakPositions.elementAt(i) + " "); System.out.println("}"); - } - - // dump the last break position in the list, and replace it with the actual - // end of the range (which may be the same character, or may be further on - // because the range actually ended with non-dictionary characters we want to - // keep with the word) - if (!currentBreakPositions.isEmpty()) { - currentBreakPositions.pop(); - } - currentBreakPositions.push(Integer.valueOf(endPos)); - - // create a regular array to hold the break positions and copy - // the break positions from the stack to the array (in addition, - // our starting position goes into this array as a break position). - // This array becomes the cache of break positions used by next() - // and previous(), so this is where we actually refresh the cache. 
- cachedBreakPositions = new int[currentBreakPositions.size() + 1]; - cachedBreakPositions[0] = startPos; - - for (int i = 0; i < currentBreakPositions.size(); i++) { - cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue(); - } - positionInCache = 0; - } -} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBreakEngine.java new file mode 100644 index 00000000000..8b3447953a2 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBreakEngine.java @@ -0,0 +1,69 @@ +/* + ******************************************************************************* + * Copyright (C) 2012, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ +package com.ibm.icu.text; + +import java.text.CharacterIterator; +import java.util.Stack; + +abstract class DictionaryBreakEngine implements LanguageBreakEngine { + protected UnicodeSet fSet = new UnicodeSet(); + private final int fTypes; + + /** + * @param breakTypes A mask of the break iterators that can use this engine. + * For example, (1 << KIND_WORD) | (1 << KIND_LINE) could be used by + * word iterators and line iterators, but not any other kind. 
+ */ + public DictionaryBreakEngine(int breakTypes) { + // TODO: consider using a java.util.BitSet with nbits <= 32 + fTypes = breakTypes; + } + + public boolean handles(int c, int breakType) { + return (breakType >= 0 && breakType < 32) && // breakType is in range + ((1 << breakType) & fTypes) != 0 && // this type can use us + fSet.contains(c); // we recognize the character + } + + public int findBreaks(CharacterIterator text_, int startPos, int endPos, + boolean reverse, int breakType, Stack foundBreaks) { + if (breakType < 0 || breakType >= 32 || + ((1 << breakType) & fTypes) == 0) { + return 0; + } + + int result = 0; + UCharacterIterator text = UCharacterIterator.getInstance(text_); + int start = text.getIndex(); + int current, rangeStart, rangeEnd; + int c = text.current(); + if (reverse) { + boolean isDict = fSet.contains(c); + while ((current = text.getIndex()) > startPos && isDict) { + c = text.previous(); + isDict = fSet.contains(c); + } + rangeStart = (current < startPos) ? startPos : + current + (isDict ? 0 : 1); + rangeEnd = start + 1; + } else { + while ((current = text.getIndex()) < endPos && fSet.contains(c)) { + c = text.next(); + } + rangeStart = start; + rangeEnd = current; + } + + result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); + text.setIndex(current); + + return result; + } + + protected abstract int divideUpDictionaryRange(UCharacterIterator text, + int rangeStart, int rangeEnd, Stack foundBreaks); +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryData.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryData.java new file mode 100644 index 00000000000..f24970e3cb6 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryData.java @@ -0,0 +1,90 @@ +/* + ******************************************************************************* + * Copyright (C) 2012, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + */ +package com.ibm.icu.text; + +import java.io.DataInputStream; +import java.io.IOException; +import java.io.InputStream; + +import com.ibm.icu.impl.Assert; +import com.ibm.icu.impl.ICUBinary; +import com.ibm.icu.impl.ICUData; +import com.ibm.icu.impl.ICUResourceBundle; +import com.ibm.icu.util.UResourceBundle; + +final class DictionaryData { + // disallow instantiation + private DictionaryData() { } + + public static final int TRIE_TYPE_BYTES = 0; + public static final int TRIE_TYPE_UCHARS = 1; + public static final int TRIE_TYPE_MASK = 7; + public static final int TRIE_HAS_VALUES = 8; + public static final int TRANSFORM_NONE = 0; + public static final int TRANSFORM_TYPE_OFFSET = 0x1000000; + public static final int TRANSFORM_TYPE_MASK = 0x7f000000; + public static final int TRANSFORM_OFFSET_MASK = 0x1fffff; + + public static final int IX_STRING_TRIE_OFFSET = 0; + public static final int IX_RESERVED1_OFFSET = 1; + public static final int IX_RESERVED2_OFFSET = 2; + public static final int IX_TOTAL_SIZE = 3; + public static final int IX_TRIE_TYPE = 4; + public static final int IX_TRANSFORM = 5; + public static final int IX_RESERVED6 = 6; + public static final int IX_RESERVED7 = 7; + public static final int IX_COUNT = 8; + + private static final byte DATA_FORMAT_ID[] = { (byte) 0x44, (byte) 0x69, + (byte) 0x63, (byte) 0x74 }; + + public static DictionaryMatcher loadDictionaryFor(String dictType) throws IOException { + ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME); + String dictFileName = rb.getStringWithFallback("dictionaries/" + dictType); + dictFileName = ICUResourceBundle.ICU_BUNDLE +ICUResourceBundle.ICU_BRKITR_NAME+ "/" + dictFileName; + InputStream is = ICUData.getStream(dictFileName); + ICUBinary.readHeader(is, DATA_FORMAT_ID, null); + DataInputStream s = new DataInputStream(is); + int[] indexes = new 
int[IX_COUNT]; + // TODO: read indexes[IX_STRING_TRIE_OFFSET] first, then read a variable-length indexes[] + for (int i = 0; i < IX_COUNT; i++) { + indexes[i] = s.readInt(); + } + int offset = indexes[IX_STRING_TRIE_OFFSET]; + Assert.assrt(offset >= (4 * IX_COUNT)); + if (offset > (4 * IX_COUNT)) { + int diff = offset - (4 * IX_COUNT); + s.skipBytes(diff); + } + int trieType = indexes[IX_TRIE_TYPE] & TRIE_TYPE_MASK; + int totalSize = indexes[IX_TOTAL_SIZE] - offset; + DictionaryMatcher m = null; + if (trieType == TRIE_TYPE_BYTES) { + int transform = indexes[IX_TRANSFORM]; + byte[] data = new byte[totalSize]; + int i; + for (i = 0; i < data.length; i++) { + data[i] = s.readByte(); + } + Assert.assrt(i == totalSize); + m = new BytesDictionaryMatcher(data, transform); + } else if (trieType == TRIE_TYPE_UCHARS) { + Assert.assrt(totalSize % 2 == 0); + int num = totalSize / 2; + char[] data = new char[totalSize / 2]; + for (int i = 0; i < num; i++) { + data[i] = s.readChar(); + } + m = new CharsDictionaryMatcher(new String(data)); + } else { + m = null; + } + s.close(); + is.close(); + return m; + } +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryMatcher.java new file mode 100644 index 00000000000..c46f003af47 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryMatcher.java @@ -0,0 +1,40 @@ +/* + ******************************************************************************* + * Copyright (C) 2012, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ +package com.ibm.icu.text; + +import java.text.CharacterIterator; + +/** + * The DictionaryMatcher interface is used to allow arbitrary "types" of + * back-end data structures to be used with the break iteration code. 
+ */ +abstract class DictionaryMatcher { + /** + * Find dictionary words that match the text. + * + * @param text A CharacterIterator representing the text. The iterator is + * left after the longest prefix match in the dictionary. + * @param maxLength The maximum number of code units to match. + * @param lengths An array that is filled with the lengths of words that matched. + * @param count Filled with the number of elements output in lengths. + * @param limit The maximum amount of words to output. Must be less than or equal to lengths.length. + * @param values Filled with the weight values associated with the various words. + * @return The number of characters in text that were matched. + */ + public abstract int matches(CharacterIterator text, int maxLength, int[] lengths, + int[] count, int limit, int[] values); + + public int matches(CharacterIterator text, int maxLength, int[] lengths, + int[] count, int limit) { + return matches(text, maxLength, lengths, count, limit, null); + } + + /** + * @return the kind of dictionary that this matcher is using + */ + public abstract int getType(); +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/LanguageBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/text/LanguageBreakEngine.java new file mode 100644 index 00000000000..583fe8cb161 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/LanguageBreakEngine.java @@ -0,0 +1,40 @@ +/* + ******************************************************************************* + * Copyright (C) 2012, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ +package com.ibm.icu.text; + +import java.text.CharacterIterator; +import java.util.Stack; + +/** + * The LanguageBreakEngine interface is to be used to implement any + * language-specific logic for break iteration. 
+ */ +interface LanguageBreakEngine { + /** + * @param c A Unicode codepoint value + * @param breakType The kind of break iterator that is wanting to make use + * of this engine - character, word, line, sentence + * @return true if the engine can handle this character, false otherwise + */ + public boolean handles(int c, int breakType); + + /** + * Implements the actual breaking logic. + * @param text The text to break over + * @param startPos The index of the beginning of our range + * @param endPos The index of the possible end of our range. It is possible, + * however, that our range ends earlier + * @param reverse true iff we are iterating backwards (in a call to + * previous(), for example) + * @param breakType The kind of break iterator that is wanting to make use + * of this engine - character, word, line, sentence + * @param foundBreaks A Stack that the breaks found will be added to + * @return the number of words found + */ + public int findBreaks(CharacterIterator text, int startPos, int endPos, + boolean reverse, int breakType, Stack foundBreaks); +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java index 59c74e28425..8623fbbe95f 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -12,10 +12,18 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.text.CharacterIterator; +import java.util.Collections; +import java.util.Set; +import java.util.Stack; +import java.util.concurrent.ConcurrentHashMap; import com.ibm.icu.impl.Assert; import com.ibm.icu.impl.ICUDebug; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UScript; +import static com.ibm.icu.impl.CharacterIteration.*; /** * Rule Based Break Iterator @@ -24,8 +32,6 @@ import 
com.ibm.icu.impl.ICUDebug; * @stable ICU 2.0 */ public class RuleBasedBreakIterator extends BreakIterator { - - //======================================================================= // Constructors & Factories //======================================================================= @@ -57,14 +63,6 @@ public class RuleBasedBreakIterator extends BreakIterator { This.fRData = RBBIDataWrapper.get(is); return This; } - - /*private RuleBasedBreakIterator(RuleBasedBreakIterator other) { - // TODO: check types. - fRData = other.fRData; - if (fText != null) { - fText = (CharacterIterator)(other.fText.clone()); - } - }*/ /** * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. @@ -89,12 +87,11 @@ public class RuleBasedBreakIterator extends BreakIterator { ///CLOVER:ON } } - - + //======================================================================= // Boilerplate //======================================================================= - + /** * Clones this iterator. * @return A newly-constructed RuleBasedBreakIterator with the same @@ -124,7 +121,7 @@ public class RuleBasedBreakIterator extends BreakIterator { } try { RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; - if (fRData != other.fRData && (fRData == null || other.fRData == null)) {System.out.println("GOT HERE"); + if (fRData != other.fRData && (fRData == null || other.fRData == null)) { return false; } if (fRData != null && other.fRData != null && @@ -167,7 +164,6 @@ public class RuleBasedBreakIterator extends BreakIterator { return fRData.fRuleSource.hashCode(); } - /** * Tag value for "words" that do not fit into any of other categories. * Includes spaces and most punctuation. 
@@ -240,9 +236,6 @@ public class RuleBasedBreakIterator extends BreakIterator { */ public static final int WORD_IDEO_LIMIT = 500; - - - private static final int START_STATE = 1; // The state number of the starting state private static final int STOP_STATE = 0; // The state-transition value indicating "stop" @@ -283,9 +276,8 @@ public class RuleBasedBreakIterator extends BreakIterator { * for updating it is live. Dictionary Based break iterators (a subclass * of us) access this field directly. * @internal - * @deprecated This API is ICU internal only. */ - protected int fDictionaryCharCount; + private int fDictionaryCharCount; /** * Debugging flag. Trace operation of state machine when true. @@ -294,6 +286,44 @@ public class RuleBasedBreakIterator extends BreakIterator { */ public static boolean fTrace; + /** + * What kind of break iterator this is. Set to KIND_LINE by default, + * since this produces sensible output. + */ + private int fBreakType = KIND_LINE; + + /** + * The "default" break engine - just skips over ranges of dictionary words, + * producing no breaks. Should only be used if characters need to be handled + * by a dictionary but we have no dictionary implementation for them. + */ + private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine(); + + /** + * when a range of characters is divided up using the dictionary, the break + * positions that are discovered are stored here, preventing us from having + * to use either the dictionary or the state table again until the iterator + * leaves this range of text + */ + private int[] fCachedBreakPositions; + + /** + * if fCachedBreakPositions is not null, this indicates which item in the + * cache the current iteration position refers to + */ + private int fPositionInCache; + + /** + * Whether or not we should be using the dictionary. Set to true by + * default - only set to false if we get an empty string as input or + * if our "kind" is not KIND_WORD or KIND_LINE. 
+ * + * If this is set to false, no dictionary handling is done. + */ + private boolean fUseDictionary = true; + + private final Set fBreakEngines = Collections.newSetFromMap(new ConcurrentHashMap()); + /* * ICU debug argument name for RBBI */ @@ -314,7 +344,7 @@ public class RuleBasedBreakIterator extends BreakIterator { private void init() { fLastStatusIndexValid = true; fDictionaryCharCount = 0; - + fBreakEngines.add(fUnhandledBreakEngine); if (debugInitDone == false) { fTrace = ICUDebug.enabled(RBBI_DEBUG_ARG) @@ -353,6 +383,9 @@ public class RuleBasedBreakIterator extends BreakIterator { * @stable ICU 2.0 */ public int first() { + fCachedBreakPositions = null; + fDictionaryCharCount = 0; + fPositionInCache = 0; fLastRuleStatusIndex = 0; fLastStatusIndexValid = true; if (fText == null) { @@ -362,7 +395,6 @@ public class RuleBasedBreakIterator extends BreakIterator { return fText.getIndex(); } - /** * Sets the current iteration position to the end of the text. * (i.e., the CharacterIterator's ending offset). @@ -370,26 +402,26 @@ public class RuleBasedBreakIterator extends BreakIterator { * @stable ICU 2.0 */ public int last() { + fCachedBreakPositions = null; + fDictionaryCharCount = 0; + fPositionInCache = 0; + if (fText == null) { fLastRuleStatusIndex = 0; fLastStatusIndexValid = true; return BreakIterator.DONE; } - // I'm not sure why, but t.last() returns the offset of the last character, + // t.last() returns the offset of the last character, // rather than the past-the-end offset - // - // (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ... - // will work correctly.) - - + // so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ... + // will work correctly. fLastStatusIndexValid = false; int pos = fText.getEndIndex(); fText.setIndex(pos); return pos; } - /** * Advances the iterator either forward or backward the specified number of steps. * Negative values move backward, and positive values move forward. 
This is @@ -413,7 +445,6 @@ public class RuleBasedBreakIterator extends BreakIterator { return result; } - /** * Advances the iterator to the next boundary position. * @return The position of the first boundary after this one. @@ -430,6 +461,72 @@ public class RuleBasedBreakIterator extends BreakIterator { * @stable ICU 2.0 */ public int previous() { + CharacterIterator text = getText(); + + fLastStatusIndexValid = false; + + // if we have cached break positions and we're still in the range + // covered by them, just move one step backward in the cache + if (fCachedBreakPositions != null && fPositionInCache > 0) { + --fPositionInCache; + text.setIndex(fCachedBreakPositions[fPositionInCache]); + return fCachedBreakPositions[fPositionInCache]; + } + + // otherwise, dump the cache and use the inherited previous() method to move + // backward. This may fill up the cache with new break positions, in which + // case we have to mark our position in the cache. If it doesn't, use next() + // to move forward until we hit or pass the current position. This *will* fill + // the cache. 
+ else { + // TODO: Try to reuse the array rather than reallocating it all the time + fCachedBreakPositions = null; + + int offset = current(); + int result = rulesPrevious(); + if (result == BreakIterator.DONE) { + return result; + } + + if (fDictionaryCharCount == 0) { + return result; + } + + if (fCachedBreakPositions != null) { + fPositionInCache = fCachedBreakPositions.length - 2; + return result; + } + + while (result < offset) { + int nextResult = handleNext(); + if (nextResult >= offset) { + break; + } + + result = nextResult; + } + + if (fCachedBreakPositions != null) { + for (fPositionInCache = 0; fPositionInCache < fCachedBreakPositions.length; fPositionInCache++) { + if (fCachedBreakPositions[fPositionInCache] >= offset) { + fPositionInCache--; + break; + } + } + } + + // prepare for the user asking for our status + // our status will have been marked as valid by the next() + // calls but isn't at the right place, so mark it as invalid + // and recompute it when the user asks + fLastStatusIndexValid = false; + text.setIndex(result); + + return result; + } + } + + private int rulesPrevious() { // if we're already sitting at the beginning of the text, return DONE if (fText == null || current() == fText.getBeginIndex()) { fLastRuleStatusIndex = 0; @@ -450,7 +547,7 @@ public class RuleBasedBreakIterator extends BreakIterator { int start = current(); - CIPrevious32(fText); + previous32(fText); int lastResult = handlePrevious(fRData.fRTable); if (lastResult == BreakIterator.DONE) { lastResult = fText.getBeginIndex(); @@ -488,6 +585,7 @@ public class RuleBasedBreakIterator extends BreakIterator { fLastStatusIndexValid = breakTagValid; return lastResult; } + /** * Sets the iterator to refer to the first boundary position following * the specified position. 
@@ -496,6 +594,32 @@ public class RuleBasedBreakIterator extends BreakIterator { * @stable ICU 2.0 */ public int following(int offset) { + CharacterIterator text = getText(); + + // if we have no cached break positions, or if "offset" is outside the + // range covered by the cache, then dump the cache and call our + // inherited following() method. This will call other methods in this + // class that may refresh the cache. + if (fCachedBreakPositions == null || offset < fCachedBreakPositions[0] || + offset >= fCachedBreakPositions[fCachedBreakPositions.length - 1]) { + fCachedBreakPositions = null; + return rulesFollowing(offset); + } + + // on the other hand, if "offset" is within the range covered by the + // cache, then just search the cache for the first break position + // after "offset" + else { + fPositionInCache = 0; + while (fPositionInCache < fCachedBreakPositions.length + && offset >= fCachedBreakPositions[fPositionInCache]) + ++fPositionInCache; + text.setIndex(fCachedBreakPositions[fPositionInCache]); + return text.getIndex(); + } + } + + private int rulesFollowing(int offset) { // if the offset passed in is already past the end of the text, // just return DONE; if it's before the beginning, return the // text's starting offset @@ -522,7 +646,7 @@ public class RuleBasedBreakIterator extends BreakIterator { // move forward one codepoint to prepare for moving back to a // safe point. // this handles offset being between a supplementary character - CINext32(fText); + next32(fText); // handlePrevious will move most of the time to < 1 boundary away handlePrevious(fRData.fSRTable); result = next(); @@ -535,7 +659,7 @@ public class RuleBasedBreakIterator extends BreakIterator { // No Safe point reverse table, but there is a safe pt forward table. 
// fText.setIndex(offset); - CIPrevious32(fText); + previous32(fText); // handle next will give result >= offset handleNext(fRData.fSFTable); // previous will give result 0 or 1 boundary away from offset, @@ -584,6 +708,33 @@ public class RuleBasedBreakIterator extends BreakIterator { * @stable ICU 2.0 */ public int preceding(int offset) { + CharacterIterator text = getText(); + + // if we have no cached break positions, or "offset" is outside the + // range covered by the cache, we can just call the inherited routine + // (which will eventually call other routines in this class that may + // refresh the cache) + if (fCachedBreakPositions == null || offset <= fCachedBreakPositions[0] || + offset > fCachedBreakPositions[fCachedBreakPositions.length - 1]) { + fCachedBreakPositions = null; + return rulesPreceding(offset); + } + + // on the other hand, if "offset" is within the range covered by the cache, + // then all we have to do is search the cache for the last break position + // before "offset" + else { + fPositionInCache = 0; + while (fPositionInCache < fCachedBreakPositions.length + && offset > fCachedBreakPositions[fPositionInCache]) + ++fPositionInCache; + --fPositionInCache; + text.setIndex(fCachedBreakPositions[fPositionInCache]); + return text.getIndex(); + } + } + + private int rulesPreceding(int offset) { // if the offset passed in is already past the end of the text, // just return DONE; if it's before the beginning, return the @@ -608,7 +759,7 @@ public class RuleBasedBreakIterator extends BreakIterator { // move backwards one codepoint to prepare for moving forwards to a // safe point. 
// this handles offset being between a supplementary character - CIPrevious32(fText); + previous32(fText); handleNext(fRData.fSFTable); result = previous(); while (result >= offset) { @@ -619,7 +770,7 @@ public class RuleBasedBreakIterator extends BreakIterator { if (fRData.fSRTable != null) { // backup plan if forward safe table is not available fText.setIndex(offset); - CINext32(fText); + next32(fText); // handle previous will give result <= offset handlePrevious(fRData.fSRTable); @@ -657,162 +808,158 @@ public class RuleBasedBreakIterator extends BreakIterator { } -/** - * Returns true if the specfied position is a boundary position. As a side - * effect, leaves the iterator pointing to the first boundary position at - * or after "offset". - * @param offset the offset to check. - * @return True if "offset" is a boundary position. - * @stable ICU 2.0 - */ -public boolean isBoundary(int offset) { - checkOffset(offset, fText); - - // the beginning index of the iterator is always a boundary position by definition - if (offset == fText.getBeginIndex()) { - first(); // For side effects on current position, tag values. - return true; - } + /** + * Returns true if the specified position is a boundary position. As a side + * effect, leaves the iterator pointing to the first boundary position at + * or after "offset". + * @param offset the offset to check. + * @return True if "offset" is a boundary position. + * @stable ICU 2.0 + */ + public boolean isBoundary(int offset) { + checkOffset(offset, fText); - if (offset == fText.getEndIndex()) { - last(); // For side effects on current position, tag values. - return true; - } - - // otherwise, we can use following() on the position before the specified - // one and return true if the position we get back is the one the user - // specified - - // return following(offset - 1) == offset; - // TODO: check whether it is safe to revert to the simpler offset-1 code - // The safe rules may take care of unpaired surrogates ok. 
- fText.setIndex(offset); - CIPrevious32(fText); - int pos = fText.getIndex(); - boolean result = following(pos) == offset; - return result; -} - -/** - * Returns the current iteration position. - * @return The current iteration position. - * @stable ICU 2.0 - */ -public int current() { - return (fText != null) ? fText.getIndex() : BreakIterator.DONE; - } - - - -private void makeRuleStatusValid() { - if (fLastStatusIndexValid == false) { - // No cached status is available. - if (fText == null || current() == fText.getBeginIndex()) { - // At start of text, or there is no text. Status is always zero. - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = true; - } else { - // Not at start of text. Find status the tedious way. - int pa = current(); - previous(); - int pb = next(); - Assert.assrt (pa == pb); + // the beginning index of the iterator is always a boundary position by definition + if (offset == fText.getBeginIndex()) { + first(); // For side effects on current position, tag values. + return true; } - Assert.assrt(fLastStatusIndexValid == true); - Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length); + + if (offset == fText.getEndIndex()) { + last(); // For side effects on current position, tag values. + return true; + } + + // otherwise, we can use following() on the position before the specified + // one and return true if the position we get back is the one the user + // specified + + // return following(offset - 1) == offset; + // TODO: check whether it is safe to revert to the simpler offset-1 code + // The safe rules may take care of unpaired surrogates ok. + fText.setIndex(offset); + previous32(fText); + int pos = fText.getIndex(); + boolean result = following(pos) == offset; + return result; } -} + /** + * Returns the current iteration position. + * @return The current iteration position. + * @stable ICU 2.0 + */ + public int current() { + return (fText != null) ? 
fText.getIndex() : BreakIterator.DONE; + } -/** - * Return the status tag from the break rule that determined the most recently - * returned break position. The values appear in the rule source - * within brackets, {123}, for example. For rules that do not specify a - * status, a default value of 0 is returned. If more than one rule applies, - * the numerically largest of the possible status values is returned. - *

- * Of the standard types of ICU break iterators, only the word break - * iterator provides status values. The values are defined in - * class RuleBasedBreakIterator, and allow distinguishing between words - * that contain alphabetic letters, "words" that appear to be numbers, - * punctuation and spaces, words containing ideographic characters, and - * more. Call getRuleStatus after obtaining a boundary - * position from next(), previous(), or - * any other break iterator functions that returns a boundary position. - *

- * @return the status from the break rule that determined the most recently - * returned break position. - * - * @draft ICU 3.0 - * @provisional This is a draft API and might change in a future release of ICU. - */ - -public int getRuleStatus() { - makeRuleStatusValid(); - // Status records have this form: - // Count N <-- fLastRuleStatusIndex points here. - // Status val 0 - // Status val 1 - // ... - // Status val N-1 <-- the value we need to return - // The status values are sorted in ascending order. - // This function returns the last (largest) of the array of status values. - int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex]; - int tagVal = fRData.fStatusTable[idx]; - - return tagVal; -} - - - -/** - * Get the status (tag) values from the break rule(s) that determined the most - * recently returned break position. The values appear in the rule source - * within brackets, {123}, for example. The default status value for rules - * that do not explicitly provide one is zero. - *

- * The status values used by the standard ICU break rules are defined - * as public constants in class RuleBasedBreakIterator. - *

- * If the size of the output array is insufficient to hold the data, - * the output will be truncated to the available length. No exception - * will be thrown. - * - * @param fillInArray an array to be filled in with the status values. - * @return The number of rule status values from rules that determined - * the most recent boundary returned by the break iterator. - * In the event that the array is too small, the return value - * is the total number of status values that were available, - * not the reduced number that were actually returned. - * @draft ICU 3.0 - * @provisional This is a draft API and might change in a future release of ICU. - */ -public int getRuleStatusVec(int[] fillInArray) { - makeRuleStatusValid(); - int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex]; - if (fillInArray != null) { - int numToCopy = Math.min(numStatusVals, fillInArray.length); - for (int i=0; i= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length); } } - return numStatusVals; - } + /** + * Return the status tag from the break rule that determined the most recently + * returned break position. The values appear in the rule source + * within brackets, {123}, for example. For rules that do not specify a + * status, a default value of 0 is returned. If more than one rule applies, + * the numerically largest of the possible status values is returned. + *

+ * Of the standard types of ICU break iterators, only the word break + * iterator provides status values. The values are defined in + * class RuleBasedBreakIterator, and allow distinguishing between words + * that contain alphabetic letters, "words" that appear to be numbers, + * punctuation and spaces, words containing ideographic characters, and + * more. Call {@code getRuleStatus} after obtaining a boundary + * position from {@code next()}, {@code previous()}, or + * any other break iterator function that returns a boundary position. + *

+ * @return the status from the break rule that determined the most recently + * returned break position. + * + * @draft ICU 3.0 + * @provisional This is a draft API and might change in a future release of ICU. + */ -/** - * Return a CharacterIterator over the text being analyzed. This version - * of this method returns the actual CharacterIterator we're using internally. - * Changing the state of this iterator can have undefined consequences. If - * you need to change it, clone it first. - * @return An iterator over the text being analyzed. - * @stable ICU 2.0 - */ + public int getRuleStatus() { + makeRuleStatusValid(); + // Status records have this form: + // Count N <-- fLastRuleStatusIndex points here. + // Status val 0 + // Status val 1 + // ... + // Status val N-1 <-- the value we need to return + // The status values are sorted in ascending order. + // This function returns the last (largest) of the array of status values. + int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex]; + int tagVal = fRData.fStatusTable[idx]; + return tagVal; + } + + /** + * Get the status (tag) values from the break rule(s) that determined the most + * recently returned break position. The values appear in the rule source + * within brackets, {123}, for example. The default status value for rules + * that do not explicitly provide one is zero. + *

+ * The status values used by the standard ICU break rules are defined + * as public constants in class RuleBasedBreakIterator. + *

+ * If the size of the output array is insufficient to hold the data, + * the output will be truncated to the available length. No exception + * will be thrown. + * + * @param fillInArray an array to be filled in with the status values. + * @return The number of rule status values from rules that determined + * the most recent boundary returned by the break iterator. + * In the event that the array is too small, the return value + * is the total number of status values that were available, + * not the reduced number that were actually returned. + * @draft ICU 3.0 + * @provisional This is a draft API and might change in a future release of ICU. + */ + public int getRuleStatusVec(int[] fillInArray) { + makeRuleStatusValid(); + int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex]; + if (fillInArray != null) { + int numToCopy = Math.min(numStatusVals, fillInArray.length); + for (int i=0; i= UTF16.LEAD_SURROGATE_MIN_VALUE && c<=UTF16.LEAD_SURROGATE_MAX_VALUE) { - c = ci.next(); - if (cUTF16.TRAIL_SURROGATE_MAX_VALUE) { - c = ci.previous(); + protected LanguageBreakEngine getEngineFor(int c) { + if (c == DONE32 || !fUseDictionary) { + return null; + } + + for (LanguageBreakEngine candidate : fBreakEngines) { + if (candidate.handles(c, fBreakType)) { + return candidate; } } - // For BMP chars, this next() is the real deal. - c = ci.next(); - - // If we might have a lead surrogate, we need to peak ahead to get the trail - // even though we don't want to really be positioned there. - if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { - c = CINextTrail32(ci, c); + // if we don't have an existing engine, build one. 
+ int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); + LanguageBreakEngine eng = null; + try { + switch (script) { + case UScript.THAI: + eng = new ThaiBreakEngine(); + break; + case UScript.KATAKANA: + case UScript.HIRAGANA: + case UScript.HAN: + if (getBreakType() == KIND_WORD) + eng = new CjkBreakEngine(false); + break; + case UScript.HANGUL: + if (getBreakType() == KIND_WORD) + eng = new CjkBreakEngine(true); + break; + default: + fUnhandledBreakEngine.handleChar(c, getBreakType()); + eng = fUnhandledBreakEngine; + break; + } + } catch (IOException e) { + eng = null; } - - if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) { - // We got a supplementary char. Back the iterator up to the postion - // of the lead surrogate. - ci.previous(); - } - return c; - } - - // Out-of-line portion of the in-line Next32 code. - // The call site does an initial ci.next() and calls this function - // if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE. - // NOTE: we leave the underlying char iterator positioned in the - // middle of a surroage pair. ci.next() will work correctly - // from there, but the ci.getIndex() will be wrong, and needs - // adjustment. 
- private static int CINextTrail32(CharacterIterator ci, int lead) { - int retVal = lead; - if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { - char cTrail = ci.next(); - if (UTF16.isTrailSurrogate(cTrail)) { - retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) + - (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE) + - UTF16.SUPPLEMENTARY_MIN_VALUE; - } else { - ci.previous(); - } - } else { - if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) { - retVal = CI_DONE32; - } + if (eng != null) { + fBreakEngines.add(eng); } - return retVal; + return eng; } - - private static int CIPrevious32(CharacterIterator ci) { - if (ci.getIndex() <= ci.getBeginIndex()) { - return CI_DONE32; - } - char trail = ci.previous(); - int retVal = trail; - if (UTF16.isTrailSurrogate(trail) && ci.getIndex()>ci.getBeginIndex()) { - char lead = ci.previous(); - if (UTF16.isLeadSurrogate(lead)) { - retVal = (((int)lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) + - ((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) + - UTF16.SUPPLEMENTARY_MIN_VALUE; - } else { - ci.next(); - } - } - return retVal; - } - - static int CICurrent32(CharacterIterator ci) { - char lead = ci.current(); - int retVal = lead; - if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) { - return retVal; - } - if (UTF16.isLeadSurrogate(lead)) { - int trail = (int)ci.next(); - ci.previous(); - if (UTF16.isTrailSurrogate((char)trail)) { - retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) + - (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) + - UTF16.SUPPLEMENTARY_MIN_VALUE; - } - } else { - if (lead == CharacterIterator.DONE) { - if (ci.getIndex() >= ci.getEndIndex()) { - retVal = CI_DONE32; - } - } - } - return retVal; - } - //----------------------------------------------------------------------------------- // // handleNext(void) All forward iteration vectors through this function. - // NOTE: This function is overridden by the dictionary base break iterator. 
- // User level API functions go to the dbbi implementation - // when the break iterator type is dbbi. - // The DBBI implementation sometimes explicitly calls back to here, - // its inherited handleNext(). // //----------------------------------------------------------------------------------- int handleNext() { - return handleNext(fRData.fFTable); + CharacterIterator text = getText(); + + // if there are no cached break positions, or if we've just moved + // off the end of the range covered by the cache, we have to dump + // and possibly regenerate the cache + int startPos = text.getIndex(); + if (fCachedBreakPositions == null || fPositionInCache == fCachedBreakPositions.length - 1) { + // start by using the rules handleNext() to find a tentative return + // value. dictionaryCharCount tells us how many dictionary characters + // we passed over on our way to the tentative return value + fDictionaryCharCount = 0; + int result = handleNext(fRData.fFTable); + + // if we passed over more than one dictionary character, then we use + // divideUpDictionaryRange() to regenerate the cached break positions + // for the new range. + if (fDictionaryCharCount > 1 && result - startPos > 1) { + text.setIndex(startPos); + LanguageBreakEngine e = getEngineFor(current32(text)); + if (e != null) { + // we have an engine! 
use it to produce breaks + Stack breaks = new Stack(); + e.findBreaks(text, startPos, result, false, getBreakType(), breaks); + + fCachedBreakPositions = new int[breaks.size() + 2]; + fCachedBreakPositions[0] = startPos; + for (int i = 0; i < breaks.size(); i++) { + fCachedBreakPositions[i + 1] = breaks.elementAt(i).intValue(); + } + fCachedBreakPositions[breaks.size() + 1] = result; + + fPositionInCache = 0; + } else { + // we don't have an engine; just use the rules + text.setIndex(result); + return result; + } + } + else { + // otherwise, the value we got back from the inherited function + // is our return value, and we can dump the cache + fCachedBreakPositions = null; + return result; + } + } + + // if the cache of break positions has been regenerated (or existed all + // along), then just advance to the next break position in the cache + // and return it + if (fCachedBreakPositions != null) { + ++fPositionInCache; + text.setIndex(fCachedBreakPositions[fPositionInCache]); + return fCachedBreakPositions[fPositionInCache]; + } + + ///CLOVER:OFF + Assert.assrt(false); + return -9999; // WE SHOULD NEVER GET HERE! + ///CLOVER:ON } /** @@ -1006,8 +1173,8 @@ public int getRuleStatusVec(int[] fillInArray) { result = initialPosition; c = fText.current(); if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { - c = CINextTrail32(fText, c); - if (c == CI_DONE32) { + c = nextTrail32(fText, c); + if (c == DONE32) { fLastRuleStatusIndex = 0; return BreakIterator.DONE; } @@ -1023,10 +1190,9 @@ public int getRuleStatusVec(int[] fillInArray) { mode = RBBI_START; } - // loop until we reach the end of the text or transition to state 0 while (state != STOP_STATE) { - if (c == CI_DONE32) { + if (c == DONE32) { // Reached end of input string. if (mode == RBBI_END) { // We have already run the loop one last time with the @@ -1046,7 +1212,7 @@ public int getRuleStatusVec(int[] fillInArray) { // Ran off end, no match found. 
// move forward one fText.setIndex(initialPosition); - CINext32(fText); + next32(fText); } break; } @@ -1094,7 +1260,7 @@ public int getRuleStatusVec(int[] fillInArray) { if (mode == RBBI_RUN) { c = (int)fText.next(); if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { - c = CINextTrail32(fText, c); + c = nextTrail32(fText, c); } } else { if (mode == RBBI_START) { @@ -1105,7 +1271,7 @@ public int getRuleStatusVec(int[] fillInArray) { if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { // Match found, common case result = fText.getIndex(); - if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) { + if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != DONE32) { // The iterator has been left in the middle of a surrogate pair. // We want the start of it. result--; @@ -1134,7 +1300,7 @@ public int getRuleStatusVec(int[] fillInArray) { } lookaheadResult = fText.getIndex(); - if (c>=UTF16.SUPPLEMENTARY_MIN_VALUE && c!=CI_DONE32) { + if (c>=UTF16.SUPPLEMENTARY_MIN_VALUE && c!=DONE32) { // The iterator has been left in the middle of a surrogate pair. // We want the beginning of it. lookaheadResult--; @@ -1160,7 +1326,7 @@ public int getRuleStatusVec(int[] fillInArray) { // at least one character.) 
if (result == initialPosition) { result = fText.setIndex(initialPosition); - CINext32(fText); + next32(fText); result = fText.getIndex(); } @@ -1174,8 +1340,6 @@ public int getRuleStatusVec(int[] fillInArray) { return result; } - - private int handlePrevious(short stateTable[]) { if (fText == null || stateTable == null) { return 0; @@ -1203,7 +1367,7 @@ public int getRuleStatusVec(int[] fillInArray) { // set up the starting char initialPosition = fText.getIndex(); result = initialPosition; - c = CIPrevious32(fText); + c = previous32(fText); // Set up the initial state for the state machine state = START_STATE; @@ -1223,7 +1387,7 @@ public int getRuleStatusVec(int[] fillInArray) { // mainLoop: for (;;) { innerBlock: { - if (c == CI_DONE32) { + if (c == DONE32) { // Reached end of input string. if (mode == RBBI_END || fRData.fHeader.fVersion == 1) { // Either this is the old (ICU 3.2 and earlier) format data which @@ -1240,7 +1404,7 @@ public int getRuleStatusVec(int[] fillInArray) { // Ran off start, no match found. // Move one position (towards the start, since we are doing previous.) fText.setIndex(initialPosition); - CIPrevious32(fText); + previous32(fText); } break mainLoop; } @@ -1297,7 +1461,7 @@ public int getRuleStatusVec(int[] fillInArray) { // time. result = lookaheadResult; lookaheadStatus = 0; - // TODO: make a standalone hard break in a rule work. + // TODO: make a stand-alone hard break in a rule work. if (lookAheadHardBreak) { break mainLoop; @@ -1340,7 +1504,7 @@ public int getRuleStatusVec(int[] fillInArray) { // then move iterator position backwards one character // if (mode == RBBI_RUN) { - c = CIPrevious32(fText); + c = previous32(fText); } else { if (mode == RBBI_START) { mode = RBBI_RUN; @@ -1357,7 +1521,7 @@ public int getRuleStatusVec(int[] fillInArray) { // at least one character.) 
if (result == initialPosition) { result = fText.setIndex(initialPosition); - CIPrevious32(fText); + previous32(fText); result = fText.getIndex(); } @@ -1368,38 +1532,5 @@ public int getRuleStatusVec(int[] fillInArray) { return result; } - - - - - - //------------------------------------------------------------------------------- - - // - - // isDictionaryChar Return true if the category lookup for this char - - // indicates that it is in the set of dictionary lookup - - // chars. - - // - - // This function is intended for use by dictionary based - - // break iterators. - - // - - //------------------------------------------------------------------------------- - - boolean isDictionaryChar(int c) { - - short category = (short) fRData.fTrie.getCodePointValue(c); - - return (category & 0x4000) != 0; - - } - } -//eof + diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakEngine.java similarity index 75% rename from icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakIterator.java rename to icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakEngine.java index 9b9d83dc3cc..f543bdd266f 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakEngine.java @@ -1,20 +1,20 @@ /* ******************************************************************************* - * Copyright (C) 1996-2011, International Business Machines Corporation and * + * Copyright (C) 2012, International Business Machines Corporation and * * others. All Rights Reserved. 
* ******************************************************************************* */ package com.ibm.icu.text; import java.io.IOException; -import java.io.InputStream; import java.text.CharacterIterator; import java.util.Stack; -import com.ibm.icu.impl.Assert; - -class ThaiBreakIterator extends DictionaryBasedBreakIterator { +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UScript; +public class ThaiBreakEngine implements LanguageBreakEngine { /* Helper class for improving readability of the Thai word break * algorithm. */ @@ -25,7 +25,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator { //list of word candidate lengths, in increasing length order private int lengths[]; private int count[]; // Count of candidates - private int prefix; // The longeset match with a dictionary word + private int prefix; // The longest match with a dictionary word private int offset; // Offset in the text of these candidates private int mark; // The preferred candidate's offset private int current; // The candidate we're currently looking at @@ -38,7 +38,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator { } // Fill the list of candidates if needed, select the longest, and return the number found - public int candidates(CharacterIterator fIter, BreakCTDictionary dict, int rangeEnd) { + public int candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd) { int start = fIter.getIndex(); if (start != offset) { offset = start; @@ -62,7 +62,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator { return lengths[mark]; } - // Backup from the current candidate to the next shorter one; rreturn true if that exists + // Backup from the current candidate to the next shorter one; return true if that exists // and point the text after it public boolean backUp(CharacterIterator fIter) { if (current > 0) { @@ -82,14 +82,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator { mark = 
current; } } - - private static UnicodeSet fThaiWordSet; - private static UnicodeSet fEndWordSet; - private static UnicodeSet fBeginWordSet; - private static UnicodeSet fSuffixSet; - private static UnicodeSet fMarkSet; - private BreakCTDictionary fDictionary; - + // Constants for ThaiBreakIterator // How many words in a row are "good enough"? private static final byte THAI_LOOKAHEAD = 3; @@ -104,9 +97,14 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator { private static final char THAI_MAIYAMOK = 0x0E46; // Minimum word size private static final byte THAI_MIN_WORD = 2; - // Minimum number of characters for two words - //private final int THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2; - + + private DictionaryMatcher fDictionary; + private static UnicodeSet fThaiWordSet; + private static UnicodeSet fEndWordSet; + private static UnicodeSet fBeginWordSet; + private static UnicodeSet fSuffixSet; + private static UnicodeSet fMarkSet; + static { // Initialize UnicodeSets fThaiWordSet = new UnicodeSet(); @@ -141,73 +139,28 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator { fBeginWordSet.freeze(); fSuffixSet.freeze(); } - - public ThaiBreakIterator(InputStream ruleStream, InputStream dictionaryStream) throws IOException { - super(ruleStream); - // Initialize diciontary - fDictionary = new BreakCTDictionary(dictionaryStream); + + public ThaiBreakEngine() throws IOException { + // Initialize dictionary + fDictionary = DictionaryData.loadDictionaryFor("Thai"); } - /** - * This is the implementation function for next(). - */ - protected int handleNext() { - CharacterIterator text = getText(); - - // if there are no cached break positions, or if we've just moved - // off the end of the range covered by the cache, we have to dump - // and possibly regenerate the cache - if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) { - - // start by using the inherited handleNext() to find a tentative return - // value. 
dictionaryCharCount tells us how many dictionary characters - // we passed over on our way to the tentative return value - int startPos = text.getIndex(); - fDictionaryCharCount = 0; - int result = super.handleNext(); - - // if we passed over more than one dictionary character, then we use - // divideUpDictionaryRange() to regenerate the cached break positions - // for the new range - if (fDictionaryCharCount > 1 && result - startPos > 1) { - divideUpDictionaryRange(startPos, result); - } - - // otherwise, the value we got back from the inherited fuction - // is our return value, and we can dump the cache - else { - cachedBreakPositions = null; - return result; - } + public boolean handles(int c, int breakType) { + if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) { + int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); + return (script == UScript.THAI); } - // if the cache of break positions has been regenerated (or existed all - // along), then just advance to the next break position in the cache - // and return it - if (cachedBreakPositions != null) { - ++positionInCache; - text.setIndex(cachedBreakPositions[positionInCache]); - return cachedBreakPositions[positionInCache]; - } - Assert.assrt(false); - return -9999; // SHOULD NEVER GET HERE! + return false; } - /** - * Divide up a range of known dictionary characters. 
- * - * @param rangeStart The start of the range of dictionary characters - * @param rangeEnd The end of the range of dictionary characters - * @return The number of breaks found - */ - private int divideUpDictionaryRange(int rangeStart, int rangeEnd) { + public int findBreaks(CharacterIterator fIter, int rangeStart, int rangeEnd, boolean reverse, int breakType, + Stack foundBreaks) { if ((rangeEnd - rangeStart) < THAI_MIN_WORD) { - return 0; // Not enough chacters for word + return 0; // Not enough characters for word } - CharacterIterator fIter = getText(); int wordsFound = 0; int wordLength; int current; - Stack foundBreaks = new Stack(); PossibleWord words[] = new PossibleWord[THAI_LOOKAHEAD]; for (int i = 0; i < THAI_LOOKAHEAD; i++) { words[i] = new PossibleWord(); @@ -228,7 +181,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator { wordsFound += 1; } - // If there was more than one, see which one can take use forward the most words + // If there was more than one, see which one can take us forward the most words else if (candidates > 1) { boolean foundBest = false; // If we're already at the end of the range, we're done @@ -259,9 +212,10 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator { } } while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter) && !foundBest); } - /* foundBest: */wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter); + wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter); wordsFound += 1; } + // We come here after having either found a word or not. We look ahead to the // next word. If it's not a dictionary word, we will combine it with the word we // just found (if there is one), but only if the preceding word does not exceed @@ -291,8 +245,8 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator { // two characters after uc were not 0x0E4C THANTHAKHAT before // checking the dictionary. 
That is just a performance filter, // but it's not clear it's faster than checking the trie - int candidate = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd); - fIter.setIndex(current+wordLength+chars); + int candidate = words[(wordsFound + 1) %THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd); + fIter.setIndex(current + wordLength + chars); if (candidate > 0) { break; } @@ -300,7 +254,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator { pc = uc; } - // Bump the word cound if there wasn't already one + // Bump the word count if there wasn't already one if (wordLength <= 0) { wordsFound += 1; } @@ -351,13 +305,13 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator { } } } else { - fIter.setIndex(current+wordLength); + fIter.setIndex(current + wordLength); } } // Did we find a word on this iteration? If so, push it on the break stack if (wordLength > 0) { - foundBreaks.push(Integer.valueOf(current+wordLength)); + foundBreaks.push(Integer.valueOf(current + wordLength)); } } @@ -367,16 +321,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator { wordsFound -= 1; } - // Store the break points in cachedBreakPositions. 
- cachedBreakPositions = new int[foundBreaks.size() + 2]; - cachedBreakPositions[0] = rangeStart; - int i; - for (i = 0; i < foundBreaks.size(); i++) { - cachedBreakPositions[i + 1] = foundBreaks.elementAt(i).intValue(); - } - cachedBreakPositions[i + 1] = rangeEnd; - positionInCache = 0; - return wordsFound; } + } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java new file mode 100644 index 00000000000..1b71385c777 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java @@ -0,0 +1,46 @@ +/* + ******************************************************************************* + * Copyright (C) 2012, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ +package com.ibm.icu.text; + +import java.text.CharacterIterator; +import java.util.Stack; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; + +import static com.ibm.icu.impl.CharacterIteration.*; + +public final class UnhandledBreakEngine implements LanguageBreakEngine { + // TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen. + // in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one. 
+ private final UnicodeSet[] fHandled = new UnicodeSet[BreakIterator.KIND_TITLE + 1]; + public UnhandledBreakEngine() { + for (int i = 0; i < fHandled.length; i++) { + fHandled[i] = new UnicodeSet(); + } + } + + public boolean handles(int c, int breakType) { + return (breakType >= 0 && breakType < fHandled.length) && + (fHandled[breakType].contains(c)); + } + + public int findBreaks(CharacterIterator text, int startPos, int endPos, + boolean reverse, int breakType, Stack foundBreaks) { + text.setIndex(endPos); + return 0; + } + + public synchronized void handleChar(int c, int breakType) { + if (breakType >= 0 && breakType < fHandled.length && c != DONE32) { + if (!fHandled[breakType].contains(c)) { + int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); + fHandled[breakType].applyIntPropertyValue(UProperty.SCRIPT, script); + } + } + } +} diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 17d9aaff301..0390542a48d 100755 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a243a8584459d751b33c922f2fbfaea27200721a1a27661b5fa2ec96bb5fc6e2 -size 7929565 +oid sha256:23641fd85dfa40f916a7a5b47a6dc8ebd591862a9fe2d62ddcd46b7f1a862d36 +size 9286396 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index 3c62711b6dc..9d36598002e 100755 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc6ebf5e136b448a03a7e74463c67d96217cc9f9d3feed4d2aa7f74dc5e25e63 +oid sha256:e951e7a3cc20e7126326db97e92ce533db611fde39c201795680246fde86c8e0 size 97666 diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar index 53a388344dd..0da592c7935 100755 --- a/icu4j/main/shared/data/testdata.jar +++ b/icu4j/main/shared/data/testdata.jar @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:2029b2752b52d544749fffea9b2574ddfd19ea278cf5f26243efd98bd3f15313 -size 719725 +oid sha256:54eeee6d7834231edb7d2d9bd3174d3c4347c737f556bc6b25915bb6860b6fe2 +size 719912 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java index 67f8b5b6077..52a955fa38c 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java @@ -1,16 +1,11 @@ /* ******************************************************************************* - * Copyright (C) 1996-2010, International Business Machines Corporation and * + * Copyright (C) 1996-2012, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.test.rbbi; -import java.io.DataInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; import java.text.StringCharacterIterator; import java.util.ArrayList; import java.util.List; @@ -18,7 +13,6 @@ import java.util.Locale; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.text.BreakIterator; -import com.ibm.icu.text.DictionaryBasedBreakIterator; public class BreakIteratorTest extends TestFmwk { @@ -849,52 +843,4 @@ public class BreakIteratorTest extends TestFmwk errln("ERR: Failed to create an instance type: " + type + " / locale: " + loc + " / exception: " + e.getMessage()); } } - - /* - * Tests the constructors public DictionaryBasedBreakIterator(String rules, ... public - * DictionaryBasedBreakIterator(InputStream compiledRules, ... - */ - public void TestDictionaryBasedBreakIterator() throws IOException { - // The following class allows the testing of the constructor - // public DictionaryBasedBreakIterator(String rules, ... 
- class TestDictionaryBasedBreakIterator extends DictionaryBasedBreakIterator { - public TestDictionaryBasedBreakIterator(InputStream is) throws IOException { - super("", is); - } - } - try { - @SuppressWarnings("unused") - TestDictionaryBasedBreakIterator td = new TestDictionaryBasedBreakIterator(null); - errln("DictionaryBasedBreakIterator constructor is suppose to return an " - + "exception for an empty string."); - } catch (Exception e) { - } - - try { - File file = File.createTempFile("dummy", ""); - FileInputStream fis = new FileInputStream(file); - DataInputStream dis = new DataInputStream(fis); - @SuppressWarnings("unused") - TestDictionaryBasedBreakIterator td = new TestDictionaryBasedBreakIterator(dis); - errln("DictionaryBasedBreakIterator constructor is suppose to return an " - + "exception for a temporary file with EOF."); - } catch (Exception e) { - } - - // The following class allows the testing of the constructor - // public DictionaryBasedBreakIterator(InputStream compiledRules, ... 
- class TestDictionaryBasedBreakIterator1 extends DictionaryBasedBreakIterator { - public TestDictionaryBasedBreakIterator1() throws IOException { - super((InputStream) null, (InputStream) null); - } - - } - try { - @SuppressWarnings("unused") - TestDictionaryBasedBreakIterator1 td1 = new TestDictionaryBasedBreakIterator1(); - errln("DictionaryBasedBreakIterator constructor is suppose to return an " - + "exception for an null input stream."); - } catch (Exception e) { - } - } -} \ No newline at end of file +} diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java index b93248eaf1d..c491f40eef1 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 1996-2011, International Business Machines Corporation and + * Copyright (C) 1996-2012, International Business Machines Corporation and * others. All Rights Reserved. 
******************************************************************************* */ @@ -20,7 +20,6 @@ import java.util.List; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.text.BreakIterator; -import com.ibm.icu.text.DictionaryBasedBreakIterator; import com.ibm.icu.text.RuleBasedBreakIterator; import com.ibm.icu.util.ULocale; @@ -584,7 +583,7 @@ public class RBBITest extends TestFmwk { errln("Incorrect following position."); } int []fillInArray = new int[2]; - if (((DictionaryBasedBreakIterator)brk).getRuleStatusVec(fillInArray) != 1 || fillInArray[0] != 0) { + if (((RuleBasedBreakIterator)brk).getRuleStatusVec(fillInArray) != 1 || fillInArray[0] != 0) { errln("Error: Since getRuleStatusVec is not supported in DictionaryBasedBreakIterator, it should return 1 and fillInArray[0] == 0."); } } @@ -663,11 +662,6 @@ public class RBBITest extends TestFmwk { final String posxWordText = "Can't have breaks in xx:yy or struct.field for CS-types."; final int[] posxWordTOffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 }; final int[] posxWordROffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 }; - // KIND_WORD "ja" - final String jaWordText = "\u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF" + - "\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002"; - final int[] jaWordTOffsets = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 }; - final int[] jaWordROffsets = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 }; // KIND_SENTENCE "el" final String elSentText = "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. " + "\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? 
\u03A3"; @@ -688,8 +682,6 @@ public class RBBITest extends TestFmwk { final TBItem[] tests = { new TBItem( BreakIterator.KIND_WORD, new ULocale("en_US_POSIX"), posxWordText, posxWordTOffsets ), new TBItem( BreakIterator.KIND_WORD, ULocale.ROOT, posxWordText, posxWordROffsets ), - new TBItem( BreakIterator.KIND_WORD, new ULocale("ja"), jaWordText, jaWordTOffsets ), - new TBItem( BreakIterator.KIND_WORD, ULocale.ROOT, jaWordText, jaWordROffsets ), new TBItem( BreakIterator.KIND_SENTENCE, new ULocale("el"), elSentText, elSentTOffsets ), new TBItem( BreakIterator.KIND_SENTENCE, ULocale.ROOT, elSentText, elSentROffsets ), new TBItem( BreakIterator.KIND_CHARACTER, new ULocale("th"), thCharText, thCharTOffsets ), diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java index 4ac00e98799..1b4983dc22a 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java @@ -51,7 +51,6 @@ static class TestParams { public void TestExtended() { - TestParams tp = new TestParams(); @@ -434,6 +433,7 @@ void executeTest(TestParams t) { } } + // // Run the iterator backwards, verify that the same breaks are found. // diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index 1931a58b9ea..3fbc34cfd29 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 2003-2011 International Business Machines Corporation and + * Copyright (C) 2003-2012 International Business Machines Corporation and * others. All Rights Reserved. 
******************************************************************************* */ @@ -264,15 +264,19 @@ public class RBBITestMonkey extends TestFmwk { UnicodeSet fExtendSet; UnicodeSet fExtendNumLetSet; UnicodeSet fOtherSet; + + UnicodeSet fDictionaryCjkSet; RBBIWordMonkey() { fCharProperty = UProperty.WORD_BREAK; + fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]"); fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]"); fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]"); fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]"); fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]"); + fALetterSet.removeAll(fDictionaryCjkSet); fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]"); fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]"); fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]"); @@ -297,13 +301,14 @@ public class RBBITestMonkey extends TestFmwk { fOtherSet.removeAll(fExtendNumLetSet); // Inhibit dictionary characters from being tested at all. fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]")); + fOtherSet.removeAll(fDictionaryCjkSet); fSets = new ArrayList(); fSets.add(fCRSet); fSets.add(fLFSet); fSets.add(fNewlineSet); fSets.add(fALetterSet); - fSets.add(fKatakanaSet); + //fSets.add(fKatakanaSet); // TODO: work out how to test katakana fSets.add(fMidLetterSet); fSets.add(fMidNumLetSet); fSets.add(fMidNumSet); @@ -1484,7 +1489,6 @@ public class RBBITestMonkey extends TestFmwk { /** * return the index of the next code point in the input text. 
* @param i the preceding index - * @return */ static int nextCP(StringBuffer s, int i) { if (i == -1) { diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/SimpleBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/SimpleBITest.java index c659e6a572f..fa1c15e2436 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/SimpleBITest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/SimpleBITest.java @@ -1,19 +1,15 @@ /* ******************************************************************************* - * Copyright (C) 1996-2006, International Business Machines Corporation and * + * Copyright (C) 1996-2012, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.test.rbbi; -import java.io.IOException; -import java.io.InputStream; import java.util.ListResourceBundle; -import java.util.MissingResourceException; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.text.BreakIterator; -import com.ibm.icu.text.DictionaryBasedBreakIterator; import com.ibm.icu.text.RuleBasedBreakIterator; // TODO: {dlf} this test currently doesn't test anything! 
@@ -160,30 +156,12 @@ public class SimpleBITest extends TestFmwk{ "Character", "Word", "Line", "Sentence" }; String rulesName = kindNames[kind] + "BreakRules"; - String dictionaryName = kindNames[kind] + "BreakDictionary"; String[] classNames = bundle.getStringArray("BreakIteratorClasses"); String rules = bundle.getString(rulesName); if (classNames[kind].equals("RuleBasedBreakIterator")) { iter = new RuleBasedBreakIterator(rules); } - else if (classNames[kind].equals("DictionaryBasedBreakIterator")) { - try { - String dictionaryPath = bundle.getString(dictionaryName); - InputStream dictionary = bundle.getClass().getResourceAsStream(dictionaryPath); - System.out.println("looking for " + dictionaryPath + " from " + bundle.getClass() + " returned " + dictionary); - iter = new DictionaryBasedBreakIterator(rules, dictionary); - } - catch(IOException e) { - e.printStackTrace(); - errln(e.getMessage()); - System.out.println(e); // debug - } - catch(MissingResourceException e) { - errln(e.getMessage()); - System.out.println(e); // debug - } - } if (iter == null) { errln("could not create iterator"); } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 1d00e528285..5a28c275238 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -33,9 +33,8 @@ # Temp debugging tests - -•Hello, •World.• +•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb• ######################################################################################## # @@ -171,7 +170,14 @@ •abc<200>\U0001D800•def<200>\U0001D3FF• • # Hiragana & Katakana stay together, but separates from each other and Latin. 
-•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#• +# *** what to do about theoretical combos of chars? i.e. hiragana + accent +#•abc<200>\N{HIRAGANA LETTER SMALL A}<400>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<400>\N{HIRAGANA ITERATION MARK}<400>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<400>def<200>#• + +# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth +•芽キャベツ<400>芽キャベツ<400> + +# Testing of word boundary for dictionary word containing both kanji and kana +•中だるみ<400>蔵王の森<400>ウ離島<400> # Words with interior formatting characters •def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> • @@ -179,7 +185,6 @@ # to test for bug #4097779 •aa\N{COMBINING GRAVE ACCENT}a<200> • - # to test for bug #4098467 # What follows is a string of Korean characters (I found it in the Yellow Pages # ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed @@ -188,9 +193,14 @@ # precomposed syllables... •\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> • -•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> • +# more Korean tests (Jamo not tested here, not counted as dictionary characters) +# Disable them now because we don't include a Korean dictionary. 
+#•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200> +#•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e• +#•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> • + +•\u06c9<200>\uc799<200>\ufffa• -•\u06c9\uc799\ufffa<200> # # Try some words from other scripts. @@ -507,8 +517,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal •\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c• # conjoining jamo... -# TODO: rules update needed -#•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c• +•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c• # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd •\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f• @@ -572,17 +581,17 @@ What is the proper use of the abbreviation pp.? 
•Yes, I am definatelly 12" tal # Test data originally from the test code source file # // @suwit -- Thai sample data from GVT Guideline # -#•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\ -#\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\ -#\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\ -#\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200> -# -## Test data originally from http://bugs.icu-project.org/trac/search?q=r30327 -#•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200> -# -#•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<200>\ -#\u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\ -#\u0E43\u0E19<200>\u0E16\u0E49\u0E33<200> +•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\ +\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\ +\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\ +\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200> + +# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327 +•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200> + +•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<200>\ +\u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\ +\u0E43\u0E19<200>\u0E16\u0E49\u0E33<200> •0E01\u0E39\u0020•\u0E01\u0E34\u0E19•\u0E01\u0E38\u0E49\u0E07\ @@ -619,22 +628,22 @@ What is the proper use of the abbreviation pp.? 
•Yes, I am definatelly 12" tal # @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters start # -#•\u0E1B\u0E35•\ -#\u0E1E\u0E38\u0E17\u0E18\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\ -#2545 •\ -#\u0E40\u0E1B\u0E47\u0E19•\ -#\u0E1B\u0E35•\ -#\u0E09\u0E25\u0E2D\u0E07•\ -#\u0E04\u0E23\u0E1A•\ -#\u0E23\u0E2D\u0E1A •\ -#\"\u0E52\u0E52\u0E50 •\ -#\u0E1b\u0E35\" •\ -#\u0E02\u0E2d\u0E07•\ -#\u0E01\u0E23\u0E38\u0E07•\ -#\u0E23\u0E31\u0E15\u0E19\u0E42\u0E01\u0E2A\u0E34\u0E19\u0E17\u0E23\u0E4C •\ -#(\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F•\ -#\u0E2B\u0E23\u0E37\u0E2D •\ -#Bangkok)• +•\u0E1B\u0E35•\ +\u0E1E\u0E38\u0E17\u0E18•\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\ +2545 •\ +\u0E40\u0E1B\u0E47\u0E19•\ +\u0E1B\u0E35•\ +\u0E09\u0E25\u0E2D\u0E07•\ +\u0E04\u0E23\u0E1A•\ +\u0E23\u0E2D\u0E1A •\ +\"\u0E52\u0E52\u0E50 •\ +\u0E1b\u0E35\" •\ +\u0E02\u0E2d\u0E07•\ +\u0E01\u0E23\u0E38\u0E07•\ +\u0E23\u0E31\u0E15\u0E19•\u0E42\u0E01•\u0E2A\u0E34•\u0E19\u0E17\u0E23\u0E4C •\ +(\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F\ +\u0E2B\u0E23\u0E37\u0E2D •\ +Bangkok)• # Data originally from RBBITest::TestMaiyamok() # The Thai maiyamok character is a shorthand symbol that means "repeat the previous @@ -652,58 +661,6 @@ What is the proper use of the abbreviation pp.? 
•Yes, I am definatelly 12" tal \u0e22\u0e07•\ \u0e43\u0e2b\u0e21\u0e48• - - -########################################################################################## -# -# Khmer Tests -# -########################################################################################## - -# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327 -# from the file testdata/wordsegments.txt - - - -#•តើ<200>លោក<200>មក<200>ពី<200>ប្រទេស<200>ណា<200> -#•សណ្ដូក<200>ក<200>បណ្ដែត<200>ខ្លួន<200> -#•ពណ៌ស<200>ម្ដេច<200>ថា<200>ខ្មៅ<200> -##ប្រយោគ|ពី|របៀប|រួបរួម|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200> -#•ប្រយោគ<200>ពី<200>របៀប<200>ដែល<200>និង<200>ភាព<200>ផ្សេងគ្នា<200>ដែល<200>អាច<200>ចូល<200> -##ប្រយោគ|ពី|របៀប|ជា|មួយ|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200> -#•សូម<200>ចំណាយពេល<200>បន្តិច<200>ដើម្បី<200>អធិស្ឋាន<200>អរព្រះគុណ<200>ដល់<200>ព្រះអង្គ<200> -#•ការ<200>ថោកទាប<200>បរិប្បូណ៌<200>ដោយ<200> -#•ប្រើប្រាស់<200>ស្អាត<200>ទាំង<200>ចិត្ត<200>សិស្ស<200>នោះ<200> -#•បើ<200>អ្នក<200>ប្រព្រឺត្ត<200>អំពើអាក្រក់<200>មុខ<200>ជា<200>មាន<200> -#•ប្រដាប់<200>ប្រដា<200>រ<200>រៀនសូត្រ<200>បន្ទប់<200>រៀន<200> -#•ដើរតួ<200>មនុស្សគ<200>ឥត<200>បញ្ចេញ<200>យោបល់<200>សោះ<200>ឡើយ<200> -#•មិន<200>អាច<200>ឲ្យ<200>យើង<200>ធ្វើ<200>កសិកម្ម<200>បាន<200>ឡើយ<200> -#•បន្ត<200>សេចក្ត<200>ទៅទៀត<200> -#•ក្រុម<200>ប៉ូលិស<200>បណ្តាក់<200>គ្នា<200> -#•គ្មាន<200>សុខ<200>សំរាន្ត<200>ដង<200>ណា<200> -#•បាន<200>សុខភាព<200>បរិប្បូណ៌<200> -#•ជា<200>មេចោរ<200>ខ្ញុំ<200>នឹង<200>ស្លាប់<200>ទៅវិញ<200>ជា<200>មេចោរ<200> -#•ឯ<200>ការ<200>វាយ<200>ផ្ចាល<200>ដែល<200>នាំ<200> -#•គេ<200>ដឹក<200>ទៅ<200>សំឡាប់<200> -##អ្នក|ដែល|ជា|មន្ត្រី|ធំ|លើ|គាត់|ទេ<200> -#•យក<200>ទៅ<200>សម្លាប់ចោល<200>ស្ងាត់<200> -#•ត្រូវ<200>បាន<200>គេ<200>សម្លាប់<200> -#•នៅក្នុង<200>ស្រុក<200>ខ្ល<200>ងហ្ស៊ុន<200> - - -# -# Jitterbug 3671 Test Case -# -#•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200> - -# -# Trac ticket 5595 Test Case -#•บท<200>ที่๑พายุ<200>ไซโคลน<200>โด<200>โรธี<200>อาศัย<200>อยู่<200>ท่ามกลาง<200>\ 
-#ทุ่งใหญ่<200>ใน<200>แคนซัส<200>กับ<200>ลุง<200>เฮ<200>นรี<200>ชาวไร่<200>และ<200>ป้า<200>เอ็ม<200>\ -#ภรรยา<200>ชาวไร่<200>บ้าน<200>ของ<200>พวก<200>เขา<200>หลัง<200>เล็ก<200>เพราะ<200>ไม้<200>\ -#สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>ทาง<200>หลาย<200>\ -#ไมล์<200> - #################################################################################### # # Tailored (locale specific) breaking. @@ -714,7 +671,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal -•\u3041•\u3043•\u3045•\u31f1• +•\u3041\u3043\u3045\u31f1• •\u3041\u3043\u3045\u31f1• @@ -722,19 +679,20 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal # The following data was originally in RBBITest::TestJapaneseWordBreak() -•\u4ECA\u65E5<400>\u306F\u3044\u3044<300>\u5929\u6C17<400>\u3067\u3059\u306D<300>\u3002•\u000D\u000A• +•\u4ECA\u65E5<400>\u306F<400>\u3044\u3044<400>\u5929\u6C17<400>\u3067\u3059<400>\u306D<400>\u3002•\u000D\u000A• # UBreakIteratorType UBRK_WORD, Locale "ja" # Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). # \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002 +# modified to work with dbbi code - should verify -•私達<400>に<300>一〇〇〇<400>の<300>コンピュータ<300>がある<300>。<0>奈々<400>は<300>ワード<300>である<300>。• +•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈<400>々<400>は<400>ワ<400>ー<400>ドで<400>あ<400>る<400>。• -•私<400>達<400>に<300>一<400>〇<400>〇<400>〇<400>の<300>コンピュータ<300>が<300>あ<300>る<300>。<0>奈<400>々<200>は<300>ワード<300>で<300>あ<300>る<300>。• +•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈<400>々<400>は<400>ワ<400>ー<400>ドで<400>あ<400>る<400>。• # UBreakIteratorType UBRK_SENTENCE, Locale "el" # Add break after Greek question mark (cldrbug #2069). 
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ICUResourceBundleTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ICUResourceBundleTest.java index 71032c057bd..530fc04182b 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ICUResourceBundleTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ICUResourceBundleTest.java @@ -474,18 +474,6 @@ public final class ICUResourceBundleTest extends TestFmwk { errln("Did not get the expected output for referencingalias"); } } - { - rb = (UResourceBundle)UResourceBundle.getBundleInstance("com/ibm/icu/dev/data/testdata","testaliases",testLoader); - sub = rb.get("boundaries"); - String word = sub.getString("word"); - - if(word.equals("word_ja.brk")){ - logln("Got the expected output for boundaries/word"); - }else{ - errln("Did not get the expected type for boundaries/word"); - } - - } { UResourceBundle rb1 = (UResourceBundle)UResourceBundle.getBundleInstance("com/ibm/icu/dev/data/testdata","testaliases",testLoader); if(rb1!=rb){ diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java index 72d5519a7b1..d641229c6f6 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java @@ -104,23 +104,6 @@ public class ULocaleTest extends TestFmwk { } */ - public void TestBreakIterator() { - checkService("ja_JP_OSAKA", new ServiceFacade() { - public Object create(ULocale req) { - return BreakIterator.getWordInstance(req); - } - }, null, new Registrar() { - public Object register(ULocale loc, Object prototype) { - return BreakIterator.registerInstance( - (BreakIterator) prototype, - loc, BreakIterator.KIND_WORD); - } - public boolean unregister(Object key) { - return BreakIterator.unregister(key); - } - }); - } - public void TestDateFormat() { checkService("de_CH_ZURICH", new ServiceFacade() { 
public Object create(ULocale req) {