mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-11996 CJKBreakEngine divideUpDictionaryRange failing to divide long Japanese sentences.
X-SVN-Rev: 38675
This commit is contained in:
parent
3947687fcc
commit
456857b494
2 changed files with 9 additions and 2 deletions
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012-2014, International Business Machines Corporation and *
|
||||
* Copyright (C) 2012-2016, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -158,6 +158,7 @@ class CjkBreakEngine extends DictionaryBreakEngine {
|
|||
// with the highest value possible (i.e. the least likely to occur).
|
||||
// Exclude Korean characters from this treatment, as they should be
|
||||
// left together by default.
|
||||
text.setIndex(i); // fDictionary.matches() advances the text position; undo that.
|
||||
if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
|
||||
values[count] = maxSnlp;
|
||||
lengths[count] = 1;
|
||||
|
@ -177,7 +178,6 @@ class CjkBreakEngine extends DictionaryBreakEngine {
|
|||
// run of Katakana characters is considered a candidate word with
|
||||
// a default cost specified in the katakanaCost table according
|
||||
// to its length.
|
||||
text.setIndex(i);
|
||||
boolean is_katakana = isKatakana(current32(text));
|
||||
if (!is_prev_katakana && is_katakana) {
|
||||
int j = i + 1;
|
||||
|
|
|
@ -274,6 +274,13 @@
|
|||
<word>
|
||||
<data>•ジョージア<400> •</data>
|
||||
|
||||
# Ticket #11996
|
||||
<locale en>
|
||||
<word>
|
||||
<data>•栃木<400>県<400>足利<400>市<400>で<400>の<400>撮影<400>が<400>公開<400></data>
|
||||
<data>•栃木<400>県<400>足利<400>市<400>で<400>の<400>撮影<400>が<400>公開<400>さ<400>れ<400>た<400></data>
|
||||
|
||||
|
||||
# Ticket #11999
|
||||
# The unhandled break engine was consuming all characters, not just the unhandled ones.
|
||||
# \U00011700 is AHOM LETTER KA. There is no dictionary for AHOM, triggering the unhandled engine,
|
||||
|
|
Loading…
Add table
Reference in a new issue