ICU-11996 CJKBreakEngine divideUpDictionaryRange failing to divide long Japanese sentences.

X-SVN-Rev: 38675
This commit is contained in:
Andy Heninger 2016-04-29 23:09:35 +00:00
parent 3947687fcc
commit 456857b494
2 changed files with 9 additions and 2 deletions

View file

@@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines Corporation and *
* Copyright (C) 2012-2016, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@@ -158,6 +158,7 @@ class CjkBreakEngine extends DictionaryBreakEngine {
// with the highest value possible (i.e. the least likely to occur).
// Exclude Korean characters from this treatment, as they should be
// left together by default.
text.setIndex(i); // fDictionary.matches() advances the text position; undo that.
if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
values[count] = maxSnlp;
lengths[count] = 1;
@@ -177,7 +178,6 @@ class CjkBreakEngine extends DictionaryBreakEngine {
// run of Katakana characters is considered a candidate word with
// a default cost specified in the katakanaCost table according
// to its length.
text.setIndex(i);
boolean is_katakana = isKatakana(current32(text));
if (!is_prev_katakana && is_katakana) {
int j = i + 1;

View file

@@ -274,6 +274,13 @@
<word>
<data>•ジョージア<400> •</data>
# Ticket #11996
<locale en>
<word>
<data>•栃木<400>県<400>足利<400>市<400>で<400>の<400>撮影<400>が<400>公開<400></data>
<data>•栃木<400>県<400>足利<400>市<400>で<400>の<400>撮影<400>が<400>公開<400>さ<400>れ<400>た<400></data>
# Ticket #11999
# Unhandled Break Engine was consuming all characters, not just unhandled.
# \U00011700 is AHOM LETTER KA. There is no dictionary for AHOM, triggering the unhandled engine,