ICU-10161 J - Ignore LRM/RLM/ALM in text and affixes when comparing;

add internal support for minus/plus symbol strings to DecimalFormatSymbols;
pre-integrate some CLDR 24 number symbol/pattern changes so tests match them.

X-SVN-Rev: 34212
This commit is contained in:
Peter Edberg 2013-09-06 02:06:05 +00:00
parent 9254716caa
commit ec72ffe189
5 changed files with 282 additions and 37 deletions

View file

@ -2760,6 +2760,35 @@ public class DecimalFormat extends NumberFormat {
}
/**
 * Tells whether a code point is one of the bidi marks that are ignored when
 * comparing text and affixes: U+200E (LRM), U+200F (RLM) or U+061C (ALM).
 *
 * @param c the code point to test
 * @return true if c is LRM, RLM or ALM; false for anything else
 */
private static boolean isBidiMark(int c) {
    switch (c) {
    case 0x200E: // LEFT-TO-RIGHT MARK
    case 0x200F: // RIGHT-TO-LEFT MARK
    case 0x061C: // ARABIC LETTER MARK
        return true;
    default:
        return false;
    }
}
/**
 * Returns the affix with every bidi mark (LRM, RLM, ALM) removed.
 *
 * The affix may have any length, and an affix that consists solely of bidi
 * marks yields the empty string. When the affix contains no bidi mark at all,
 * the same String instance is returned and nothing is allocated.
 *
 * @param affix the affix to trim; must not be null
 * @return the affix without bidi marks
 */
private static String trimMarksFromAffix(String affix) {
    final int affixLen = affix.length();

    // Fast path: locate the first bidi mark. Most affixes have none, and in
    // that case the original string can be handed back unchanged.
    int firstMark = 0;
    while (firstMark < affixLen && !isBidiMark(affix.charAt(firstMark))) {
        firstMark++;
    }
    if (firstMark == affixLen) {
        return affix;
    }

    // At least one mark is present, so the result is at most affixLen - 1 long.
    StringBuilder trimmed = new StringBuilder(affixLen - 1);
    trimmed.append(affix, 0, firstMark);
    for (int affixPos = firstMark + 1; affixPos < affixLen; affixPos++) {
        char c = affix.charAt(affixPos);
        if (!isBidiMark(c)) {
            trimmed.append(c);
        }
    }
    return trimmed.toString();
}
/**
* Return the length matched by the given affix, or -1 if none. Runs of white space in
* the affix, match runs of white space in the input. Pattern white space and input
@ -2772,8 +2801,12 @@ public class DecimalFormat extends NumberFormat {
*/
private static int compareSimpleAffix(String affix, String input, int pos) {
int start = pos;
for (int i = 0; i < affix.length();) {
int c = UTF16.charAt(affix, i);
// Affixes here might consist of sign, currency symbol and related spacing, etc.
// For more efficiency we should keep lazily-created trimmed affixes around in
// instance variables instead of trimming each time they are used (the next step).
String trimmedAffix = trimMarksFromAffix(affix);
for (int i = 0; i < trimmedAffix.length();) {
int c = UTF16.charAt(trimmedAffix, i);
int len = UTF16.getCharCount(c);
if (PatternProps.isWhiteSpace(c)) {
// We may have a pattern like: \u200F and input text like: \u200F Note
@ -2781,22 +2814,29 @@ public class DecimalFormat extends NumberFormat {
// UWhiteSpace. So we have to first do a direct match of the run of RULE
// whitespace in the pattern, then match any extra characters.
boolean literalMatch = false;
while (pos < input.length() && UTF16.charAt(input, pos) == c) {
literalMatch = true;
i += len;
pos += len;
if (i == affix.length()) {
break;
}
c = UTF16.charAt(affix, i);
len = UTF16.getCharCount(c);
if (!PatternProps.isWhiteSpace(c)) {
while (pos < input.length()) {
int ic = UTF16.charAt(input, pos);
if (ic == c) {
literalMatch = true;
i += len;
pos += len;
if (i == trimmedAffix.length()) {
break;
}
c = UTF16.charAt(trimmedAffix, i);
len = UTF16.getCharCount(c);
if (!PatternProps.isWhiteSpace(c)) {
break;
}
} else if (isBidiMark(ic)) {
pos++; // just skip over this input text
} else {
break;
}
}
// Advance over run in affix
i = skipPatternWhiteSpace(affix, i);
// Advance over run in trimmedAffix
i = skipPatternWhiteSpace(trimmedAffix, i);
// Advance over run in input text. Must see at least one white space char
// in input, unless we've already matched some characters literally.
@ -2807,13 +2847,23 @@ public class DecimalFormat extends NumberFormat {
}
// If we skip UWhiteSpace in the input text, we need to skip it in the
// pattern. Otherwise, the previous lines may have skipped over text
// (such as U+00A0) that is also in the affix.
i = skipUWhiteSpace(affix, i);
// (such as U+00A0) that is also in the trimmedAffix.
i = skipUWhiteSpace(trimmedAffix, i);
} else {
if (pos < input.length() && equalWithSignCompatibility(UTF16.charAt(input, pos), c)) {
i += len;
pos += len;
} else {
boolean match = false;
while (pos < input.length()) {
int ic = UTF16.charAt(input, pos);
if (!match && equalWithSignCompatibility(ic, c)) {
i += len;
pos += len;
match = true;
} else if (isBidiMark(ic)) {
pos++; // just skip over this input text
} else {
break;
}
}
if (!match) {
return -1;
}
}
@ -2855,7 +2905,21 @@ public class DecimalFormat extends NumberFormat {
return pos;
}
/**
/**
 * Advances past any bidi marks (LRM, RLM, ALM) that start at the given position.
 *
 * @param text the text being scanned
 * @param pos the index at which to start looking
 * @return the index of the first code point at or after pos that is not a bidi
 *         mark; pos itself when no bidi mark is found there or when pos is at
 *         or beyond the end of text
 */
private static int skipBidiMarks(String text, int pos) {
    final int limit = text.length();
    int index = pos;
    while (index < limit) {
        final int codePoint = UTF16.charAt(text, index);
        if (isBidiMark(codePoint)) {
            index += UTF16.getCharCount(codePoint);
        } else {
            return index;
        }
    }
    return index;
}
/**
* Returns the length matched by the given affix, or -1 if none.
*
* @param affixPat pattern string
@ -2972,9 +3036,10 @@ public class DecimalFormat extends NumberFormat {
* white space in text.
*/
static final int match(String text, int pos, int ch) {
if (pos >= text.length()) {
if (pos < 0 || pos >= text.length()) {
return -1;
}
pos = skipBidiMarks(text, pos);
if (PatternProps.isWhiteSpace(ch)) {
// Advance over run of white space in input text
// Must see at least one white space char in input
@ -2985,7 +3050,11 @@ public class DecimalFormat extends NumberFormat {
}
return pos;
}
return (pos >= 0 && UTF16.charAt(text, pos) == ch) ? (pos + UTF16.getCharCount(ch)) : -1;
if (pos >= text.length() || UTF16.charAt(text, pos) != ch) {
return -1;
}
pos = skipBidiMarks(text, pos + UTF16.getCharCount(ch));
return pos;
}
/**
@ -4031,8 +4100,9 @@ public class DecimalFormat extends NumberFormat {
c = symbols.getPerMill();
break;
case PATTERN_MINUS:
c = symbols.getMinusSign();
break;
String minusString = symbols.getMinusString();
buffer.append(minusString);
continue;
}
buffer.append(c);
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2012, International Business Machines Corporation and *
* Copyright (C) 1996-2013, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -403,6 +403,16 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
return minusSign;
}
/**
 * Returns the minus sign in its string form. Unlike the single-character
 * minus sign, the string form can hold more than one character.
 *
 * @return the minus sign string
 * @internal
 * @deprecated This API is ICU internal only.
 */
public String getMinusString() {
    return this.minusString;
}
/**
* Sets the character used to represent minus sign. If no explicit
* negative format is specified, one is formed by prefixing
@ -412,6 +422,9 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
*/
public void setMinusSign(char minusSign) {
    // Keep the string form in step with the single-character form: after this
    // call the minus string is exactly the one character that was supplied.
    this.minusString = String.valueOf(minusSign);
    this.minusSign = minusSign;
}
/**
@ -567,6 +580,16 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
return plusSign;
}
/**
 * Returns the plus sign in its string form. Unlike the single-character
 * plus sign, the string form can hold more than one character.
 *
 * @return the plus sign string
 * @internal
 * @deprecated This API is ICU internal only.
 */
public String getPlusString() {
    return this.plusString;
}
/**
* {@icu} Sets the localized plus sign.
* @param plus the plus sign, used in localized patterns and formatted
@ -578,6 +601,9 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
*/
public void setPlusSign(char plus) {
    // Keep the string form in step with the single-character form: after this
    // call the plus string is exactly the one character that was supplied.
    this.plusString = String.valueOf(plus);
    this.plusSign = plus;
}
/**
@ -763,6 +789,7 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
perMill == other.perMill &&
digit == other.digit &&
minusSign == other.minusSign &&
minusString.equals(other.minusString) &&
patternSeparator == other.patternSeparator &&
infinity.equals(other.infinity) &&
NaN.equals(other.NaN) &&
@ -770,6 +797,7 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
intlCurrencySymbol.equals(other.intlCurrencySymbol) &&
padEscape == other.padEscape &&
plusSign == other.plusSign &&
plusString.equals(other.plusString) &&
exponentSeparator.equals(other.exponentSeparator) &&
monetarySeparator == other.monetarySeparator &&
monetaryGroupingSeparator == other.monetaryGroupingSeparator);
@ -786,6 +814,13 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
return result;
}
/**
 * Tells whether a char is one of the bidi marks U+200E (LRM), U+200F (RLM)
 * or U+061C (ALM).
 *
 * @param c the char to test
 * @return true if c is LRM, RLM or ALM; false for anything else
 */
private static boolean isBidiMark(char c) {
    // The three marks, in the order LRM, RLM, ALM.
    final String bidiMarks = "\u200E\u200F\u061C";
    return bidiMarks.indexOf(c) >= 0;
}
/**
* Initializes the symbols from the LocaleElements resource bundle.
* Note: The organization of LocaleElements badly needs to be
@ -874,8 +909,10 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
groupingSeparator = numberElements[1].charAt(0);
patternSeparator = numberElements[2].charAt(0);
percent = numberElements[3].charAt(0);
minusSign = numberElements[4].charAt(0);
plusSign =numberElements[5].charAt(0);
minusString = numberElements[4];
minusSign = (minusString.length() > 1 && isBidiMark(minusString.charAt(0)))? minusString.charAt(1): minusString.charAt(0);
plusString = numberElements[5];
plusSign = (plusString.length() > 1 && isBidiMark(plusString.charAt(0)))? plusString.charAt(1): plusString.charAt(0);
exponentSeparator = numberElements[6];
perMill = numberElements[7].charAt(0);
infinity = numberElements[8];
@ -1000,6 +1037,17 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
}
initSpacingInfo(CurrencyData.CurrencySpacingInfo.DEFAULT);
}
if (serialVersionOnStream < 7) {
// Set minusString,plusString from minusSign,plusSign
if (minusString == null) {
char[] minusArray = { minusSign };
minusString = new String(minusArray);
}
if (plusString == null) {
char[] plusArray = { plusSign };
plusString = new String(plusArray);
}
}
serialVersionOnStream = currentSerialVersion;
// recreate
@ -1172,10 +1220,18 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
/**
* The requested ULocale. We keep the old locale for serialization compatibility.
* @since IDU 3.2
* @since ICU 3.2
*/
private ULocale ulocale;
/**
* String versions of some number symbols.
* @serial
* @since ICU 52
*/
private String minusString = null;
private String plusString = null;
// Proclaim JDK 1.1 FCS compatibility
private static final long serialVersionUID = 5772796243397350300L;
@ -1189,7 +1245,8 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
// - 4 for ICU 3.2, which includes the ULocale field
// - 5 for ICU 3.6, which includes the monetaryGroupingSeparator field
// - 6 for ICU 4.2, which includes the currencySpc* fields
private static final int currentSerialVersion = 6;
// - 7 for ICU 52, which includes the minusString and plusString fields
private static final int currentSerialVersion = 7;
/**
* Describes the version of <code>DecimalFormatSymbols</code> present on the stream.
@ -1205,7 +1262,8 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
* <li><b>4</b>: Version for ICU 3.2, which adds ulocale.
* <li><b>5</b>: Version for ICU 3.6, which adds monetaryGroupingSeparator.
* <li><b>6</b>: Version for ICU 4.2, which adds currencySpcBeforeSym and
* currencySpcAfterSym.
* currencySpcAfterSym.
* <li><b>7</b>: Version for ICU 52, which adds minusString and plusString.
* </ul>
* When streaming out a <code>DecimalFormatSymbols</code>, the most recent format
* (corresponding to the highest allowable <code>serialVersionOnStream</code>)

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0281eb436d3f76c50252cc66bbe357ba00aeb6db06839224cfefa18d386f3338
size 10966706
oid sha256:22577e214f576563ba1289192cb1d3d8684bb5273192958d1c0782cb4797b69b
size 10966724

View file

@ -270,7 +270,7 @@ public class CompactDecimalFormatTest extends TestFmwk {
NumberFormat cdf =
CompactDecimalFormat.getInstance(
ULocale.forLanguageTag("ar"), CompactStyle.LONG);
assertEquals("Arabic Long", "\u0665\u066B\u0663- \u0623\u0644\u0641", cdf.format(-5300));
assertEquals("Arabic Long", "\u200F-\u0665\u066B\u0663 \u0623\u0644\u0641", cdf.format(-5300));
}
public void TestCsShort() {

View file

@ -510,7 +510,7 @@ public class NumberFormatTest extends com.ibm.icu.dev.test.TestFmwk {
// test locale without currency information
{"root", "-1.23", "USD", "-US$ 1.23", "-USD 1.23", "-1.23 USD"},
{"root@numbers=latn", "-1.23", "USD", "-US$ 1.23", "-USD 1.23", "-1.23 USD"}, // ensure that the root locale is still used with modifiers
{"root@numbers=arab", "-1.23", "USD", "-US$ ١٫٢٣", "-USD ١٫٢٣", "-١٫٢٣ USD"}, // ensure that the root locale is still used with modifiers
{"root@numbers=arab", "-1.23", "USD", "\u200F-US$ ١٫٢٣", "\u200F-USD ١٫٢٣", "\u200F-١٫٢٣ USD"}, // ensure that the root locale is still used with modifiers
// test choice format
{"es_AR", "1", "INR", "₹1,00", "INR1,00", "1,00 rupia india"},
{"ar_EG", "1", "USD", "US$ ١٫٠٠", "USD ١٫٠٠", "١٫٠٠ دولار أمريكي"},
@ -1400,7 +1400,7 @@ public class NumberFormatTest extends com.ibm.icu.dev.test.TestFmwk {
new TestNumberingSystemItem( "en_US@numbers=hebr", 5678.0, true, "\u05D4\u05F3\u05EA\u05E8\u05E2\u05F4\u05D7" ),
new TestNumberingSystemItem( "en_US@numbers=arabext", 1234.567, false, "\u06F1\u066c\u06F2\u06F3\u06F4\u066b\u06F5\u06F6\u06F7" ),
new TestNumberingSystemItem( "de_DE@numbers=foobar", 1234.567, false, "1.234,567" ),
new TestNumberingSystemItem( "ar_EG", 1234.567, false, "\u0661\u0662\u0663\u0664\u066b\u0665\u0666\u0667" ),
new TestNumberingSystemItem( "ar_EG", 1234.567, false, "\u0661\u066C\u0662\u0663\u0664\u066b\u0665\u0666\u0667" ),
new TestNumberingSystemItem( "th_TH@numbers=traditional", 1234.567, false, "\u0E51,\u0E52\u0E53\u0E54.\u0E55\u0E56\u0E57" ), // fall back to native per TR35
new TestNumberingSystemItem( "ar_MA", 1234.567, false, "1.234,567" ),
new TestNumberingSystemItem( "en_US@numbers=hanidec", 1234.567, false, "\u4e00,\u4e8c\u4e09\u56db.\u4e94\u516d\u4e03" ),
@ -3313,7 +3313,7 @@ public class NumberFormatTest extends com.ibm.icu.dev.test.TestFmwk {
}
}
public void TestCustomCurrecySignAndSeparator() {
public void TestCustomCurrencySignAndSeparator() {
DecimalFormatSymbols custom = new DecimalFormatSymbols(ULocale.US);
custom.setCurrencySymbol("*");
@ -3325,4 +3325,121 @@ public class NumberFormatTest extends com.ibm.icu.dev.test.TestFmwk {
final String numstr = "* 1^234:56";
expect2(fmt, 1234.56, numstr);
}
public void TestParseSignsAndMarks() {
    // One parse case: the locale to parse in, whether parsing is lenient,
    // the text to parse, and the value the parse must produce.
    class SignsAndMarksItem {
        final String locale;
        final boolean lenient;
        final String numString;
        final double value;

        SignsAndMarksItem(String loc, boolean lnt, String numStr, double val) {
            this.locale = loc;
            this.lenient = lnt;
            this.numString = numStr;
            this.value = val;
        }
    }

    final SignsAndMarksItem[] items = {
        // Cases marked *** put white space between the sign and the digits.
        // ICU4J lenient parsing does not accept arbitrary white space, but it can
        // treat some white space as a grouping separator, so those cases rely on
        // isGroupingUsed() being set for the locale, which in turn depends on
        // grouping separators being present in the locale's (and numbering
        // system's) decimalFormat pattern.
        //
        // locale lenient numString value
        new SignsAndMarksItem("en", false, "12", 12 ),
        new SignsAndMarksItem("en", true, "12", 12 ),
        new SignsAndMarksItem("en", false, "-23", -23 ),
        new SignsAndMarksItem("en", true, "-23", -23 ),
        new SignsAndMarksItem("en", true, "- 23", -23 ), // ***
        new SignsAndMarksItem("en", false, "\u200E-23", -23 ),
        new SignsAndMarksItem("en", true, "\u200E-23", -23 ),
        new SignsAndMarksItem("en", true, "\u200E- 23", -23 ), // ***
        new SignsAndMarksItem("en@numbers=arab", false, "\u0663\u0664", 34 ),
        new SignsAndMarksItem("en@numbers=arab", true, "\u0663\u0664", 34 ),
        new SignsAndMarksItem("en@numbers=arab", false, "-\u0664\u0665", -45 ),
        new SignsAndMarksItem("en@numbers=arab", true, "-\u0664\u0665", -45 ),
        new SignsAndMarksItem("en@numbers=arab", true, "- \u0664\u0665", -45 ), // ***
        new SignsAndMarksItem("en@numbers=arab", false, "\u200F-\u0664\u0665", -45 ),
        new SignsAndMarksItem("en@numbers=arab", true, "\u200F-\u0664\u0665", -45 ),
        new SignsAndMarksItem("en@numbers=arab", true, "\u200F- \u0664\u0665", -45 ), // ***
        new SignsAndMarksItem("en@numbers=arabext", false, "\u06F5\u06F6", 56 ),
        new SignsAndMarksItem("en@numbers=arabext", true, "\u06F5\u06F6", 56 ),
        new SignsAndMarksItem("en@numbers=arabext", false, "-\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("en@numbers=arabext", true, "-\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("en@numbers=arabext", true, "- \u06F6\u06F7", -67 ), // ***
        new SignsAndMarksItem("en@numbers=arabext", false, "\u200E-\u200E\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("en@numbers=arabext", true, "\u200E-\u200E\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("en@numbers=arabext", true, "\u200E-\u200E \u06F6\u06F7", -67 ), // ***
        new SignsAndMarksItem("he", false, "12", 12 ),
        new SignsAndMarksItem("he", true, "12", 12 ),
        new SignsAndMarksItem("he", false, "-23", -23 ),
        new SignsAndMarksItem("he", true, "-23", -23 ),
        new SignsAndMarksItem("he", true, "- 23", -23 ), // ***
        new SignsAndMarksItem("he", false, "\u200E-23", -23 ),
        new SignsAndMarksItem("he", true, "\u200E-23", -23 ),
        new SignsAndMarksItem("he", true, "\u200E- 23", -23 ), // ***
        new SignsAndMarksItem("ar", false, "\u0663\u0664", 34 ),
        new SignsAndMarksItem("ar", true, "\u0663\u0664", 34 ),
        new SignsAndMarksItem("ar", false, "-\u0664\u0665", -45 ),
        new SignsAndMarksItem("ar", true, "-\u0664\u0665", -45 ),
        new SignsAndMarksItem("ar", true, "- \u0664\u0665", -45 ), // ***
        new SignsAndMarksItem("ar", false, "\u200F-\u0664\u0665", -45 ),
        new SignsAndMarksItem("ar", true, "\u200F-\u0664\u0665", -45 ),
        new SignsAndMarksItem("ar", true, "\u200F- \u0664\u0665", -45 ), // ***
        new SignsAndMarksItem("ar_MA", false, "12", 12 ),
        new SignsAndMarksItem("ar_MA", true, "12", 12 ),
        new SignsAndMarksItem("ar_MA", false, "-23", -23 ),
        new SignsAndMarksItem("ar_MA", true, "-23", -23 ),
        new SignsAndMarksItem("ar_MA", true, "- 23", -23 ), // ***
        new SignsAndMarksItem("ar_MA", false, "\u200E-23", -23 ),
        new SignsAndMarksItem("ar_MA", true, "\u200E-23", -23 ),
        new SignsAndMarksItem("ar_MA", true, "\u200E- 23", -23 ), // ***
        new SignsAndMarksItem("fa", false, "\u06F5\u06F6", 56 ),
        new SignsAndMarksItem("fa", true, "\u06F5\u06F6", 56 ),
        new SignsAndMarksItem("fa", false, "\u2212\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("fa", true, "\u2212\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("fa", true, "\u2212 \u06F6\u06F7", -67 ), // ***
        new SignsAndMarksItem("fa", false, "\u200E\u2212\u200E\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("fa", true, "\u200E\u2212\u200E\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("fa", true, "\u200E\u2212\u200E \u06F6\u06F7", -67 ), // ***
        new SignsAndMarksItem("ps", false, "\u06F5\u06F6", 56 ),
        new SignsAndMarksItem("ps", true, "\u06F5\u06F6", 56 ),
        new SignsAndMarksItem("ps", false, "-\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("ps", true, "-\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("ps", true, "- \u06F6\u06F7", -67 ), // ***
        new SignsAndMarksItem("ps", false, "\u200E-\u200E\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("ps", true, "\u200E-\u200E\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("ps", true, "\u200E-\u200E \u06F6\u06F7", -67 ), // ***
        new SignsAndMarksItem("ps", false, "-\u200E\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("ps", true, "-\u200E\u06F6\u06F7", -67 ),
        new SignsAndMarksItem("ps", true, "-\u200E \u06F6\u06F7", -67 ), // ***
    };

    for (SignsAndMarksItem item : items) {
        NumberFormat numfmt = NumberFormat.getInstance(new ULocale(item.locale));
        if (numfmt == null) {
            errln("FAIL: NumberFormat.getInstance for locale " + item.locale);
            continue;
        }

        numfmt.setParseStrict(!item.lenient);
        ParsePosition ppos = new ParsePosition(0);
        Number num = numfmt.parse(item.numString, ppos);

        // The parse must both produce a number and consume the whole input.
        if (num == null || ppos.getIndex() != item.numString.length()) {
            errln("FAIL: locale " + item.locale + ", lenient " + item.lenient + ", parse of \"" + item.numString + "\" gives position " + ppos.getIndex());
            continue;
        }

        double parsedValue = num.doubleValue();
        if (parsedValue != item.value) {
            errln("FAIL: locale " + item.locale + ", lenient " + item.lenient + ", parse of \"" + item.numString + "\" gives value " + parsedValue);
        }
    }
}
}