ICU-11999 BreakIterator, UnhandledBreakEngine consuming too many characters.

X-SVN-Rev: 38669
This commit is contained in:
Andy Heninger 2016-04-29 21:30:27 +00:00
parent 1f2813e7fa
commit b5be040dd0
2 changed files with 26 additions and 2 deletions

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines Corporation and *
* Copyright (C) 2016, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -10,6 +10,7 @@ import static com.ibm.icu.impl.CharacterIteration.DONE32;
import java.text.CharacterIterator;
import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
@ -30,7 +31,20 @@ final class UnhandledBreakEngine implements LanguageBreakEngine {
/**
 * Moves {@code text} past a run of characters that belong to the set registered
 * for {@code breakType}, without recording any break positions.
 *
 * <p>Scanning starts at the iterator's current index (the caller is presumed to
 * have positioned it on the first character of the run) and stops at the first
 * character that is not in {@code fHandled[breakType]}, or at the range limit,
 * whichever comes first. If {@code breakType} is not a valid index into
 * {@code fHandled}, the iterator is left where it is.
 *
 * @param text        iterator over the text being analyzed; its index is advanced
 *                    (or moved backwards when {@code reverse} is true)
 * @param startPos    lower limit of the range; a reverse scan does not go below it
 * @param endPos      upper limit of the range; a forward scan does not go past it
 * @param reverse     {@code true} to scan backwards toward {@code startPos},
 *                    {@code false} to scan forwards toward {@code endPos}
 * @param breakType   index selecting which entry of {@code fHandled} to test against
 * @param foundBreaks unused by this engine; nothing is added to it
 * @return always 0, since this engine never produces breaks
 */
public int findBreaks(CharacterIterator text, int startPos, int endPos,
        boolean reverse, int breakType, DictionaryBreakEngine.DequeI foundBreaks) {
    // Deliberately do NOT reposition the iterator to endPos before scanning.
    // Doing so makes the forward loop below a no-op and reports the whole range
    // as consumed, swallowing characters that other engines should handle
    // (ICU-11999).
    if (breakType >= 0 && breakType < fHandled.length) {
        int c = CharacterIteration.current32(text);
        if (reverse) {
            while (text.getIndex() > startPos && fHandled[breakType].contains(c)) {
                CharacterIteration.previous32(text);
                c = CharacterIteration.current32(text);
            }
        } else {
            while (text.getIndex() < endPos && fHandled[breakType].contains(c)) {
                CharacterIteration.next32(text);
                c = CharacterIteration.current32(text);
            }
        }
    }
    return 0;
}

View file

@ -274,6 +274,16 @@
<word>
<data>•ジョージア<400> •</data>
# Ticket #11999
# The Unhandled Break Engine was consuming all characters, not just the unhandled ones.
# \U00011700 is AHOM LETTER KA. There is no dictionary for AHOM, triggering the unhandled engine,
# which then incorrectly also consumed the following Japanese text. (ICU4J only)
<word>
<locale en>
<data>•ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
<data>•\U00011700<200>ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
########################################################################################
#
#