mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-13060 Documentation and parsing whitespace/bidi tweaks.
X-SVN-Rev: 39951
This commit is contained in:
parent
dd47bbc596
commit
08dcc7e82e
4 changed files with 127 additions and 23 deletions
|
@ -11,6 +11,7 @@ import java.util.Iterator;
|
|||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import com.ibm.icu.impl.PatternProps;
|
||||
import com.ibm.icu.impl.StandardPlural;
|
||||
import com.ibm.icu.impl.TextTrieMap;
|
||||
import com.ibm.icu.impl.number.formatters.BigDecimalMultiplier;
|
||||
|
@ -1129,7 +1130,7 @@ public class Parse {
|
|||
}
|
||||
acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
|
||||
if (state.length > 0 && mode == ParseMode.FAST) break;
|
||||
acceptBidi(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
|
||||
acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
|
||||
if (state.length > 0 && mode == ParseMode.FAST) break;
|
||||
acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
|
||||
if (state.length > 0 && mode == ParseMode.FAST) break;
|
||||
|
@ -1155,7 +1156,7 @@ public class Parse {
|
|||
// We encountered a decimal point
|
||||
acceptFractionDigit(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
|
||||
if (state.length > 0 && mode == ParseMode.FAST) break;
|
||||
acceptBidi(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
|
||||
acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
|
||||
if (state.length > 0 && mode == ParseMode.FAST) break;
|
||||
acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
|
||||
if (state.length > 0 && mode == ParseMode.FAST) break;
|
||||
|
@ -1184,7 +1185,7 @@ public class Parse {
|
|||
break;
|
||||
|
||||
case AFTER_EXPONENT_DIGIT:
|
||||
acceptBidi(cp, StateName.AFTER_EXPONENT_DIGIT, state, item);
|
||||
acceptBidi(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
|
||||
acceptPadding(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
|
||||
acceptExponentDigit(cp, StateName.AFTER_EXPONENT_DIGIT, state, item);
|
||||
if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) {
|
||||
|
@ -1257,17 +1258,17 @@ public class Parse {
|
|||
break;
|
||||
|
||||
case INSIDE_STRING:
|
||||
acceptStringOffset(cp, state, item);
|
||||
long added0 = acceptStringOffset(cp, state, item);
|
||||
// Accept arbitrary bidi in the middle of strings.
|
||||
if (state.length == 0 && UNISET_BIDI.contains(cp)) {
|
||||
if (added0 == 0L && UNISET_BIDI.contains(cp)) {
|
||||
state.getNext().copyFrom(item, item.name, cp);
|
||||
}
|
||||
break;
|
||||
|
||||
case INSIDE_AFFIX_PATTERN:
|
||||
long added = acceptAffixPatternOffset(cp, state, item);
|
||||
// Accept arbitrary bidi and whitespace (if lenient) in the middle of affixes.
|
||||
if (added == 0L && isIgnorable(cp, state)) {
|
||||
long added1 = acceptAffixPatternOffset(cp, state, item);
|
||||
// Accept arbitrary bidi and whitespace (if lenient) in the middle of affix patterns.
|
||||
if (added1 == 0L && isIgnorable(cp, state)) {
|
||||
state.getNext().copyFrom(item, item.name, cp);
|
||||
}
|
||||
break;
|
||||
|
@ -1758,8 +1759,8 @@ public class Parse {
|
|||
if (!next.sawSuffix && holder.s.isEmpty()) next.score += 5;
|
||||
}
|
||||
|
||||
private static void acceptStringOffset(int cp, ParserState state, StateItem item) {
|
||||
acceptString(
|
||||
private static long acceptStringOffset(int cp, ParserState state, StateItem item) {
|
||||
return acceptString(
|
||||
cp,
|
||||
item.returnTo1,
|
||||
item.returnTo2,
|
||||
|
@ -1813,20 +1814,20 @@ public class Parse {
|
|||
referenceCp = Character.codePointAt(str, offset);
|
||||
count = Character.charCount(referenceCp);
|
||||
equals = codePointEquals(cp, referenceCp, state);
|
||||
if (!UNISET_BIDI.contains(cp)) break;
|
||||
if (!UNISET_BIDI.contains(referenceCp)) break;
|
||||
}
|
||||
|
||||
if (equals) {
|
||||
// Matches first code point of the string
|
||||
StateItem next = state.getNext().copyFrom(item, null, cp);
|
||||
|
||||
// Skip over ignorable code points in the middle of the string.
|
||||
// Skip over bidi code points in the middle or end of the string.
|
||||
// They will be accepted in the main loop.
|
||||
offset += count;
|
||||
for (; offset < str.length(); offset += count) {
|
||||
referenceCp = Character.codePointAt(str, offset);
|
||||
count = Character.charCount(referenceCp);
|
||||
if (!UNISET_BIDI.contains(cp)) break;
|
||||
if (!UNISET_BIDI.contains(referenceCp)) break;
|
||||
}
|
||||
|
||||
if (offset < str.length()) {
|
||||
|
@ -1879,7 +1880,7 @@ public class Parse {
|
|||
tag = AffixPatternUtils.nextToken(tag, str);
|
||||
typeOrCp = AffixPatternUtils.getTypeOrCp(tag);
|
||||
hasNext = AffixPatternUtils.hasNext(tag, str);
|
||||
if (typeOrCp < 0 || !isIgnorable(typeOrCp, state)) break;
|
||||
if (typeOrCp < 0 || !isPatternIgnorable(typeOrCp, state)) break;
|
||||
}
|
||||
|
||||
// Convert from the returned tag to a code point, string, or currency to check
|
||||
|
@ -1926,18 +1927,16 @@ public class Parse {
|
|||
while (hasNext) {
|
||||
long futureTag = AffixPatternUtils.nextToken(tag, str);
|
||||
int futureTypeOrCp = AffixPatternUtils.getTypeOrCp(futureTag);
|
||||
if (futureTypeOrCp < 0 || !isIgnorable(futureTypeOrCp, state)) break;
|
||||
if (futureTypeOrCp < 0 || !isPatternIgnorable(futureTypeOrCp, state)) break;
|
||||
tag = futureTag;
|
||||
typeOrCp = futureTypeOrCp;
|
||||
hasNext = AffixPatternUtils.hasNext(tag, str);
|
||||
}
|
||||
|
||||
long added = 0L;
|
||||
if (resolvedCp >= 0) {
|
||||
if (resolvedCp >= 0 && codePointEquals(cp, resolvedCp, state)) {
|
||||
// Code point
|
||||
if (!codePointEquals(cp, resolvedCp, state)) return 0L;
|
||||
StateItem next = state.getNext().copyFrom(item, null, cp);
|
||||
|
||||
if (hasNext) {
|
||||
// Additional tokens in affix string.
|
||||
next.name = StateName.INSIDE_AFFIX_PATTERN;
|
||||
|
@ -1948,6 +1947,7 @@ public class Parse {
|
|||
next.trailingCount = 0;
|
||||
next.returnTo1 = null;
|
||||
}
|
||||
next.score += 1; // reward for consuming code point from pattern
|
||||
added |= 1L << state.lastInsertedIndex();
|
||||
}
|
||||
if (resolvedMinusSign) {
|
||||
|
@ -2180,11 +2180,26 @@ public class Parse {
|
|||
*
|
||||
* @param cp The code point to test. Returns false if cp is negative.
|
||||
* @param state The current {@link ParserState}, used for determining strict mode.
|
||||
* @return true if cp is bidi or whitespace in lenient mode; false otherwise.
|
||||
* @return true if cp is ignorable; false otherwise.
|
||||
*/
|
||||
private static boolean isIgnorable(int cp, ParserState state) {
|
||||
if (cp < 0) return false;
|
||||
if (UNISET_BIDI.contains(cp)) return true;
|
||||
return state.mode == ParseMode.LENIENT && UNISET_WHITESPACE.contains(cp);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether the given code point is "ignorable" in pattern syntax. This includes all
|
||||
* characters that are normally ignorable in {@link #isIgnorable} plus characters having the
|
||||
* Pattern_White_Space property.
|
||||
*
|
||||
* @param cp The code point to test. Returns false if cp is negative.
|
||||
* @param state The current {@link ParserState}, used for determining strict mode.
|
||||
* @return true if cp is ignorable; false otherwise.
|
||||
*/
|
||||
private static boolean isPatternIgnorable(int cp, ParserState state) {
|
||||
if (cp < 0) return false;
|
||||
if (state.mode == ParseMode.LENIENT && PatternProps.isWhiteSpace(cp)) return true;
|
||||
return isIgnorable(cp, state);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -198,6 +198,15 @@ import com.ibm.icu.util.ULocale.Category;
|
|||
* and leaves the parse position unchanged. The convenience method {@link #parse(String)} indicates
|
||||
* parse failure by throwing a {@link java.text.ParseException}.
|
||||
*
|
||||
* <p>Under the hood, a state table parsing engine is used. To debug a parsing failure during
|
||||
* development, use the following pattern to print details about the state table transitions:
|
||||
*
|
||||
* <pre>
|
||||
* com.ibm.icu.impl.number.Parse.DEBUGGING = true;
|
||||
* df.parse("123.45", ppos);
|
||||
* com.ibm.icu.impl.number.Parse.DEBUGGING = false;
|
||||
* </pre>
|
||||
*
|
||||
* <h3>Thread Safety and Best Practices</h3>
|
||||
*
|
||||
* <p>Starting with ICU 59, instance of DecimalFormat are thread-safe.
|
||||
|
|
|
@ -1439,3 +1439,62 @@ parse output breaks
|
|||
// K and J return null
|
||||
9 999 9999 JK
|
||||
|
||||
test parse ignorables
|
||||
set locale ar
|
||||
// Note: Prefixes contain RLMs, as do some of the test cases.
|
||||
set pattern x ab0c df
|
||||
set negativePrefix y gh
|
||||
set negativeSuffix i jk
|
||||
begin
|
||||
parse output breaks
|
||||
x ab56c df 56
|
||||
x ab56c df 56 K
|
||||
x ab56c df 56 K
|
||||
x ab56c df 56 JK
|
||||
x ab56c df 56 K
|
||||
x ab56 56 JK
|
||||
x a b56 56 JK
|
||||
56cdf 56 JK
|
||||
56c df 56 JK
|
||||
56cd f 56 JK
|
||||
56cdf 56 JK
|
||||
56cdf 56 JK
|
||||
56c df 56 JK
|
||||
56c df 56 JK
|
||||
y gh56i jk -56
|
||||
y gh56i jk -56 KS
|
||||
y gh56i jk -56 K
|
||||
y gh56i jk -56 JK
|
||||
y gh56i jk -56 K
|
||||
y gh56 -56 JK
|
||||
y g h56 -56 JKS
|
||||
// S stops parsing at the 'i' for these and returns -56
|
||||
56ijk -56 JK
|
||||
56i jk -56 JK
|
||||
56ij k -56 JK
|
||||
56ijk -56 JK
|
||||
56ijk -56 JK
|
||||
56i jk -56 JK
|
||||
56i jk -56 JK
|
||||
// S gets 56 (accepts ' ' gs grouping); J and K get null
|
||||
5 6 5 JKS
|
||||
56 5 JK
|
||||
|
||||
test parse spaces in grouping
|
||||
// This test gives the ideal behavior of these cases, which
|
||||
// none of the implementations currently support.
|
||||
set locale en
|
||||
set pattern #,##0
|
||||
begin
|
||||
parse output breaks
|
||||
// J and S get "12" here
|
||||
1 2 1 JS
|
||||
1 23 1 JS
|
||||
// K gets 1 here; doesn't pick up the grouping separator
|
||||
1 234 1234 K
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -4850,13 +4850,16 @@ public class NumberFormatTest extends TestFmwk {
|
|||
@Test
|
||||
public void Test11839() {
|
||||
DecimalFormatSymbols dfs = new DecimalFormatSymbols(ULocale.ENGLISH);
|
||||
dfs.setMinusSign('∸');
|
||||
dfs.setPlusSign('∔'); // ∔ U+2214 DOT PLUS
|
||||
dfs.setMinusSignString("a∸");
|
||||
dfs.setPlusSignString("b∔"); // ∔ U+2214 DOT PLUS
|
||||
DecimalFormat df = new DecimalFormat("0.00+;0.00-", dfs);
|
||||
String result = df.format(-1.234);
|
||||
assertEquals("Locale-specific minus sign should be used", "1.23∸", result);
|
||||
assertEquals("Locale-specific minus sign should be used", "1.23a∸", result);
|
||||
result = df.format(1.234);
|
||||
assertEquals("Locale-specific plus sign should be used", "1.23∔", result);
|
||||
assertEquals("Locale-specific plus sign should be used", "1.23b∔", result);
|
||||
// Test round-trip with parse
|
||||
expect2(df, -456, "456.00a∸");
|
||||
expect2(df, 456, "456.00b∔");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -5171,6 +5174,24 @@ public class NumberFormatTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseIgnorables() {
|
||||
// Also see the test case "test parse ignorables" in numberformattestspecification.txt
|
||||
DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance();
|
||||
dfs.setPercentString("\u200E%\u200E");
|
||||
DecimalFormat df = new DecimalFormat("0%;-0a", dfs);
|
||||
ParsePosition ppos = new ParsePosition(0);
|
||||
Number result = df.parse("42\u200E%\u200E ", ppos);
|
||||
assertEquals("Should parse as percentage", new BigDecimal("0.42"), result);
|
||||
// NOTE: This behavior is specified only in 59. It is probably okay if it changes in the future.
|
||||
assertEquals("Should not consume the trailing bidi even though it is in the symbol", 4, ppos.getIndex());
|
||||
ppos.setIndex(0);
|
||||
result = df.parse("-42a\u200E ", ppos);
|
||||
assertEquals("Should parse as percent", new BigDecimal("-0.42"), result);
|
||||
// NOTE: This behavior is specified only in 59. It is probably okay if it changes in the future.
|
||||
assertEquals("Should not consume the trailing bidi or whitespace", 4, ppos.getIndex());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSignificantDigitsMode() {
|
||||
String[][] allExpected = {
|
||||
|
|
Loading…
Add table
Reference in a new issue