ICU-13513 Integrating some of Andy's feedback. Moving code unit vs. code point logic out of the matchers and into the main loop. Moving case folding logic from run-time in StringSegment to build-time. Refactoring to replace booleans with flags at build-time. Further optimizing lead-code-point creation time by adding more fallbacks to the static cache.

X-SVN-Rev: 40784
This commit is contained in:
Shane Carr 2018-01-18 10:50:36 +00:00
parent a0ed703c5e
commit d0a75c667b
22 changed files with 494 additions and 340 deletions

View file

@ -90,11 +90,11 @@ public class TextTrieMap<V> {
}
public void find(CharSequence text, ResultHandler<V> handler) {
find(text, 0, handler, new Output());
find(text, 0, handler, null);
}
public void find(CharSequence text, int offset, ResultHandler<V> handler) {
find(text, offset, handler, new Output());
find(text, offset, handler, null);
}
private void find(CharSequence text, int offset, ResultHandler<V> handler, Output output) {
@ -116,8 +116,8 @@ public class TextTrieMap<V> {
}
}
public void putLeadChars(UnicodeSet output) {
_root.putLeadChars(output);
/**
 * Collects lead code points into {@code output} by delegating to the root node's
 * {@code putLeadCodePoints}.
 *
 * @param output the set that the root node adds to
 */
public void putLeadCodePoints(UnicodeSet output) {
_root.putLeadCodePoints(output);
}
/**
@ -363,7 +363,9 @@ public class TextTrieMap<V> {
return null;
}
if (!chitr.hasNext()) {
output.partialMatch = true;
if (output != null) {
output.partialMatch = true;
}
return null;
}
Node match = null;
@ -382,12 +384,24 @@ public class TextTrieMap<V> {
return match;
}
public void putLeadChars(UnicodeSet output) {
public void putLeadCodePoints(UnicodeSet output) {
if (_children == null) {
return;
}
for (Node child : _children) {
output.add(child._text[0]);
char c0 = child._text[0];
if (!UCharacter.isHighSurrogate(c0)) {
output.add(c0);
} else if (child.charCount() >= 2) {
output.add(Character.codePointAt(child._text, 0));
} else if (child._children != null) {
// Construct all possible code points from grandchildren.
for (Node grandchild : child._children) {
char c1 = grandchild._text[0];
int cp = Character.toCodePoint(c0, c1);
output.add(cp);
}
}
}
}
@ -465,7 +479,9 @@ public class TextTrieMap<V> {
int idx = 1;
while (idx < _text.length) {
if(!chitr.hasNext()) {
output.partialMatch = true;
if (output != null) {
output.partialMatch = true;
}
matched = false;
break;
}

View file

@ -44,7 +44,7 @@ public class AffixMatcher implements NumberParseMatcher {
AffixPatternProvider patternInfo,
NumberParserImpl output,
IgnorablesMatcher ignorables,
boolean includeUnpaired) {
int parseFlags) {
// Lazy-initialize the StringBuilder.
StringBuilder sb = null;
@ -53,9 +53,11 @@ public class AffixMatcher implements NumberParseMatcher {
ArrayList<AffixMatcher> matchers = new ArrayList<AffixMatcher>(6);
sb = getCleanAffix(patternInfo, AffixPatternProvider.FLAG_POS_PREFIX, ignorables.getSet(), sb);
String posPrefix = toStringOrEmpty(sb);
String posPrefix = ParsingUtils.maybeFold(toStringOrEmpty(sb), parseFlags);
sb = getCleanAffix(patternInfo, AffixPatternProvider.FLAG_POS_SUFFIX, ignorables.getSet(), sb);
String posSuffix = toStringOrEmpty(sb);
String posSuffix = ParsingUtils.maybeFold(toStringOrEmpty(sb), parseFlags);
boolean includeUnpaired = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES);
if (!posPrefix.isEmpty() || !posSuffix.isEmpty()) {
matchers.add(getInstance(posPrefix, posSuffix, 0));
@ -67,9 +69,9 @@ public class AffixMatcher implements NumberParseMatcher {
if (patternInfo.hasNegativeSubpattern()) {
sb = getCleanAffix(patternInfo, AffixPatternProvider.FLAG_NEG_PREFIX, ignorables.getSet(), sb);
String negPrefix = toStringOrEmpty(sb);
String negPrefix = ParsingUtils.maybeFold(toStringOrEmpty(sb), parseFlags);
sb = getCleanAffix(patternInfo, AffixPatternProvider.FLAG_NEG_SUFFIX, ignorables.getSet(), sb);
String negSuffix = toStringOrEmpty(sb);
String negSuffix = ParsingUtils.maybeFold(toStringOrEmpty(sb), parseFlags);
if (negPrefix.equals(posPrefix) && negSuffix.equals(posSuffix)) {
// No-op: favor the positive AffixMatcher
@ -115,6 +117,8 @@ public class AffixMatcher implements NumberParseMatcher {
}
private AffixMatcher(String prefix, String suffix, int flags) {
assert prefix != null;
assert suffix != null;
this.prefix = prefix;
this.suffix = suffix;
this.flags = flags;
@ -157,11 +161,11 @@ public class AffixMatcher implements NumberParseMatcher {
}
@Override
public UnicodeSet getLeadChars(boolean ignoreCase) {
UnicodeSet leadChars = new UnicodeSet();
ParsingUtils.putLeadingChar(prefix, leadChars, ignoreCase);
ParsingUtils.putLeadingChar(suffix, leadChars, ignoreCase);
return leadChars.freeze();
public UnicodeSet getLeadCodePoints() {
    // Start from an empty set and let ParsingUtils.putLeadCodePoint contribute for the prefix
    // first, then for the suffix.
    UnicodeSet result = new UnicodeSet();
    ParsingUtils.putLeadCodePoint(prefix, result);
    ParsingUtils.putLeadCodePoint(suffix, result);
    // The set is frozen before it is handed back to the caller.
    return result.freeze();
}
@Override

View file

@ -15,14 +15,16 @@ public class CurrencyMatcher implements NumberParseMatcher {
private final String currency1;
private final String currency2;
public static NumberParseMatcher getInstance(Currency currency, ULocale loc) {
return new CurrencyMatcher(currency, loc);
/**
 * Creates a matcher from the currency's subtype, its symbol in {@code loc}, and its currency
 * code. The symbol and the code are each passed through {@code ParsingUtils.maybeFold} with
 * {@code setupFlags} before being stored.
 *
 * @param currency the currency whose subtype, symbol and code are read
 * @param loc the locale used to look up the currency symbol
 * @param setupFlags flags forwarded to {@code ParsingUtils.maybeFold}
 * @return a new {@code CurrencyMatcher}
 */
public static NumberParseMatcher getInstance(Currency currency, ULocale loc, int setupFlags) {
    String isoCode = currency.getSubtype();
    String symbol = ParsingUtils.maybeFold(currency.getSymbol(loc), setupFlags);
    String code = ParsingUtils.maybeFold(currency.getCurrencyCode(), setupFlags);
    return new CurrencyMatcher(isoCode, symbol, code);
}
private CurrencyMatcher(Currency currency, ULocale loc) {
isoCode = currency.getSubtype();
currency1 = currency.getSymbol(loc);
currency2 = currency.getCurrencyCode();
/**
 * Stores the three strings as given; no processing is done here.
 *
 * @param isoCode value assigned to the {@code isoCode} field
 * @param currency1 value assigned to the {@code currency1} field
 * @param currency2 value assigned to the {@code currency2} field
 */
private CurrencyMatcher(String isoCode, String currency1, String currency2) {
this.isoCode = isoCode;
this.currency1 = currency1;
this.currency2 = currency2;
}
@Override
@ -49,11 +51,11 @@ public class CurrencyMatcher implements NumberParseMatcher {
}
@Override
public UnicodeSet getLeadChars(boolean ignoreCase) {
UnicodeSet leadChars = new UnicodeSet();
ParsingUtils.putLeadingChar(currency1, leadChars, ignoreCase);
ParsingUtils.putLeadingChar(currency2, leadChars, ignoreCase);
return leadChars.freeze();
public UnicodeSet getLeadCodePoints() {
    // Start from an empty set and let ParsingUtils.putLeadCodePoint contribute for each of
    // the two currency strings, in field order.
    UnicodeSet result = new UnicodeSet();
    ParsingUtils.putLeadCodePoint(currency1, result);
    ParsingUtils.putLeadCodePoint(currency2, result);
    // The set is frozen before it is handed back to the caller.
    return result.freeze();
}
@Override

View file

@ -25,6 +25,8 @@ public class CurrencyTrieMatcher implements NumberParseMatcher {
}
/**
 * Initializes the two tries by calling {@code Currency.getParsingTrie} for {@code locale}, once
 * with {@code Currency.LONG_NAME} and once with {@code Currency.SYMBOL_NAME}.
 *
 * @param locale the locale passed to {@code Currency.getParsingTrie}
 */
private CurrencyTrieMatcher(ULocale locale) {
// TODO: Currency trie does not currently have an option for case folding. It defaults to use
// case folding on long-names but not symbols.
longNameTrie = Currency.getParsingTrie(locale, Currency.LONG_NAME);
symbolTrie = Currency.getParsingTrie(locale, Currency.SYMBOL_NAME);
}
@ -49,11 +51,11 @@ public class CurrencyTrieMatcher implements NumberParseMatcher {
}
@Override
public UnicodeSet getLeadChars(boolean ignoreCase) {
UnicodeSet leadChars = new UnicodeSet();
longNameTrie.putLeadChars(leadChars);
symbolTrie.putLeadChars(leadChars);
return leadChars.freeze();
public UnicodeSet getLeadCodePoints() {
    // Both tries write their lead code points into the same set: long names first, then
    // symbols.
    UnicodeSet result = new UnicodeSet();
    longNameTrie.putLeadCodePoints(result);
    symbolTrie.putLeadCodePoints(result);
    // The set is frozen before it is handed back to the caller.
    return result.freeze();
}
@Override

View file

@ -5,6 +5,7 @@ package com.ibm.icu.impl.number.parse;
import com.ibm.icu.impl.number.DecimalQuantity_DualStorageBCD;
import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.number.Grouper;
import com.ibm.icu.text.DecimalFormatSymbols;
import com.ibm.icu.text.UnicodeSet;
@ -14,48 +15,57 @@ import com.ibm.icu.text.UnicodeSet;
*/
public class DecimalMatcher implements NumberParseMatcher {
public boolean requireGroupingMatch = false;
public boolean decimalEnabled = true;
public boolean groupingEnabled = true;
public int grouping1 = 3;
public int grouping2 = 3;
public boolean integerOnly = false;
public boolean isScientific = false;
private final boolean requireGroupingMatch;
private final boolean groupingDisabled;
private final int grouping1;
private final int grouping2;
private final boolean integerOnly;
private final boolean isScientific;
private UnicodeSet groupingUniSet = null;
private UnicodeSet decimalUniSet = null;
private UnicodeSet separatorSet = null;
private UnicodeSet separatorLeadChars = null;
private String[] digitStrings = null;
private boolean frozen;
// Assumption: these sets all consist of single code points. If this assumption needs to be broken,
// fix getLeadCodePoints() as well as matching logic. Be careful of the performance impact.
private final UnicodeSet groupingUniSet;
private final UnicodeSet decimalUniSet;
private final UnicodeSet separatorSet;
private final UnicodeSet leadSet;
private final String[] digitStrings;
public DecimalMatcher() {
frozen = false;
/**
 * Factory for {@code DecimalMatcher}. Currently always constructs a fresh instance from the
 * three arguments; no caching is performed.
 *
 * @param symbols forwarded unchanged to the private constructor
 * @param grouper forwarded unchanged to the private constructor
 * @param parseFlags forwarded unchanged to the private constructor, which tests
 *        {@code ParsingUtils.PARSE_FLAG_*} bits against it
 * @return a new {@code DecimalMatcher}
 */
public static DecimalMatcher getInstance(
DecimalFormatSymbols symbols,
Grouper grouper,
int parseFlags) {
// TODO: Cache popular instances?
return new DecimalMatcher(symbols, grouper, parseFlags);
}
public void freeze(DecimalFormatSymbols symbols, boolean monetarySeparators, boolean isStrict) {
assert !frozen;
frozen = true;
String groupingSeparator = monetarySeparators ? symbols.getMonetaryGroupingSeparatorString()
: symbols.getGroupingSeparatorString();
String decimalSeparator = monetarySeparators ? symbols.getMonetaryDecimalSeparatorString()
: symbols.getDecimalSeparatorString();
private DecimalMatcher(DecimalFormatSymbols symbols, Grouper grouper, int parseFlags) {
Key groupingKey, decimalKey;
String groupingSeparator, decimalSeparator;
if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_MONETARY_SEPARATORS)) {
groupingSeparator = symbols.getMonetaryGroupingSeparatorString();
decimalSeparator = symbols.getMonetaryDecimalSeparatorString();
} else {
groupingSeparator = symbols.getGroupingSeparatorString();
decimalSeparator = symbols.getDecimalSeparatorString();
}
// Attempt to find values in the static cache
if (isStrict) {
decimalKey = UnicodeSetStaticCache.chooseFrom(decimalSeparator, Key.STRICT_COMMA, Key.STRICT_PERIOD);
if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_STRICT_SEPARATORS)) {
decimalKey = UnicodeSetStaticCache
.chooseFrom(decimalSeparator, Key.STRICT_COMMA, Key.STRICT_PERIOD);
if (decimalKey == Key.STRICT_COMMA) {
// Decimal is comma; grouping should be period or custom
groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator, Key.STRICT_PERIOD_OR_OTHER);
groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
Key.STRICT_PERIOD_OR_OTHER);
} else if (decimalKey == Key.STRICT_PERIOD) {
// Decimal is period; grouping should be comma or custom
groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator, Key.STRICT_COMMA_OR_OTHER);
groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
Key.STRICT_COMMA_OR_OTHER);
} else {
// Decimal is custom; grouping can be either comma or period or custom
groupingKey = UnicodeSetStaticCache
.chooseFrom(groupingSeparator, Key.STRICT_COMMA_OR_OTHER, Key.STRICT_PERIOD_OR_OTHER);
groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
Key.STRICT_COMMA_OR_OTHER,
Key.STRICT_PERIOD_OR_OTHER);
}
} else {
decimalKey = UnicodeSetStaticCache.chooseFrom(decimalSeparator, Key.COMMA, Key.PERIOD);
@ -73,35 +83,46 @@ public class DecimalMatcher implements NumberParseMatcher {
}
// Get the sets from the static cache if they were found
UnicodeSet _groupingUniSet = null, _decimalUniSet = null, _separatorSet = null, _leadSet = null;
if (groupingKey != null && decimalKey != null) {
groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
_groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
_decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
Key separatorKey = UnicodeSetStaticCache.unionOf(groupingKey, decimalKey);
if (separatorKey != null) {
separatorSet = UnicodeSetStaticCache.get(separatorKey);
separatorLeadChars = UnicodeSetStaticCache.getLeadChars(separatorKey);
_separatorSet = UnicodeSetStaticCache.get(separatorKey);
Key leadKey = UnicodeSetStaticCache.unionOf(Key.DIGITS, separatorKey);
if (leadKey != null) {
_leadSet = UnicodeSetStaticCache.get(leadKey);
}
}
} else if (groupingKey != null) {
groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
_groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
} else if (decimalKey != null) {
decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
_decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
}
// Resolve fallbacks if we don't have sets from the static cache
if (groupingUniSet == null) {
groupingUniSet = new UnicodeSet().add(groupingSeparator).freeze();
}
if (decimalUniSet == null) {
decimalUniSet = new UnicodeSet().add(decimalSeparator).freeze();
}
if (separatorSet == null) {
separatorSet = new UnicodeSet().addAll(groupingUniSet).addAll(decimalUniSet).freeze();
}
// Finish resolving fallbacks
groupingUniSet = _groupingUniSet != null ? _groupingUniSet
: new UnicodeSet().add(groupingSeparator.codePointAt(0)).freeze();
decimalUniSet = _decimalUniSet != null ? _decimalUniSet
: new UnicodeSet().add(decimalSeparator.codePointAt(0)).freeze();
separatorSet = _separatorSet != null ? _separatorSet
: new UnicodeSet().addAll(groupingUniSet).addAll(decimalUniSet).freeze();
leadSet = _leadSet; // null if not available
int cpZero = symbols.getCodePointZero();
if (cpZero == -1 || !UCharacter.isDigit(cpZero) || UCharacter.digit(cpZero) != 0) {
digitStrings = symbols.getDigitStrings();
digitStrings = symbols.getDigitStringsLocal();
} else {
digitStrings = null;
}
requireGroupingMatch = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE);
groupingDisabled = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_GROUPING_DISABLED);
grouping1 = grouper.getPrimary();
grouping2 = grouper.getSecondary();
integerOnly = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_INTEGER_ONLY);
isScientific = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_DECIMAL_SCIENTIFIC);
}
@Override
@ -110,7 +131,6 @@ public class DecimalMatcher implements NumberParseMatcher {
}
public boolean match(StringSegment segment, ParsedNumber result, boolean negativeExponent) {
assert frozen;
if (result.seenNumber() && !isScientific) {
// A number has already been consumed.
return false;
@ -177,16 +197,18 @@ public class DecimalMatcher implements NumberParseMatcher {
if (separator == -1) {
// First separator; could be either grouping or decimal.
separator = cp;
if (groupingEnabled && requireGroupingMatch && groupingUniSet.contains(cp)
if (!groupingDisabled
&& requireGroupingMatch
&& groupingUniSet.contains(cp)
&& (currGroup == 0 || currGroup > grouping2)) {
break;
}
} else if (groupingEnabled && separator == cp && groupingUniSet.contains(cp)) {
} else if (!groupingDisabled && separator == cp && groupingUniSet.contains(cp)) {
// Second or later grouping separator.
if (requireGroupingMatch && currGroup != grouping2) {
break;
}
} else if (groupingEnabled && separator != cp && decimalUniSet.contains(cp)) {
} else if (!groupingDisabled && separator != cp && decimalUniSet.contains(cp)) {
// Decimal separator after a grouping separator.
if (requireGroupingMatch && currGroup != grouping1) {
break;
@ -234,13 +256,15 @@ public class DecimalMatcher implements NumberParseMatcher {
result.quantity.truncate();
segment.setOffset(lastSeparatorOffset);
}
} else if (separator != -1 && !groupingEnabled) {
} else if (separator != -1 && groupingDisabled) {
// The final separator was a grouping separator, but we aren't accepting grouping.
// Reset the offset to immediately before that grouping separator.
result.quantity.adjustMagnitude(-currGroup);
result.quantity.truncate();
segment.setOffset(lastSeparatorOffset);
} else if (separator != -1 && requireGroupingMatch && groupingUniSet.contains(separator)
} else if (separator != -1
&& requireGroupingMatch
&& groupingUniSet.contains(separator)
&& currGroup != grouping1) {
// The final separator was a grouping separator, and we have a mismatched grouping size.
// Reset the offset to the beginning of the number.
@ -252,24 +276,25 @@ public class DecimalMatcher implements NumberParseMatcher {
// segment.setOffset(initialOffset);
}
return segment.length() == 0 || hasPartialPrefix || segment.isLeadingSurrogate();
return segment.length() == 0 || hasPartialPrefix;
}
@Override
public UnicodeSet getLeadChars(boolean ignoreCase) {
UnicodeSet leadChars = new UnicodeSet();
leadChars.addAll(UnicodeSetStaticCache.getLeadChars(Key.DIGITS));
public UnicodeSet getLeadCodePoints() {
if (digitStrings == null && leadSet != null) {
return leadSet;
}
UnicodeSet leadCodePoints = new UnicodeSet();
// Assumption: the sets are all single code points.
leadCodePoints.addAll(UnicodeSetStaticCache.get(Key.DIGITS));
leadCodePoints.addAll(separatorSet);
if (digitStrings != null) {
for (int i = 0; i < digitStrings.length; i++) {
ParsingUtils.putLeadingChar(digitStrings[i], leadChars, ignoreCase);
ParsingUtils.putLeadCodePoint(digitStrings[i], leadCodePoints);
}
}
if (separatorLeadChars != null) {
leadChars.addAll(separatorLeadChars);
} else {
ParsingUtils.putLeadSurrogates(separatorSet, leadChars);
}
return leadChars.freeze();
return leadCodePoints.freeze();
}
@Override

View file

@ -26,13 +26,13 @@ public class IgnorablesMatcher extends RangeMatcher {
}
@Override
public UnicodeSet getLeadChars(boolean ignoreCase) {
public UnicodeSet getLeadCodePoints() {
if (this == DEFAULT) {
return UnicodeSetStaticCache.getLeadChars(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES);
return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES);
} else if (this == STRICT) {
return UnicodeSetStaticCache.getLeadChars(UnicodeSetStaticCache.Key.STRICT_IGNORABLES);
return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.STRICT_IGNORABLES);
} else {
return super.getLeadChars(ignoreCase);
return super.getLeadCodePoints();
}
}

View file

@ -1,15 +0,0 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl.number.parse;
/**
 * Static helper for code point validation.
 *
 * @author sffc
 */
public class MatcherUtils {
    /**
     * Tells whether {@code cp} is accepted by {@link Character#isValidCodePoint(int)} and is not
     * a surrogate code unit value.
     *
     * @param cp the candidate code point
     * @return {@code true} if {@code cp} is a valid, non-surrogate code point
     */
    public static boolean isValidCodePoint(int cp) {
        if (!Character.isValidCodePoint(cp)) {
            return false;
        }
        if (Character.isSupplementaryCodePoint(cp)) {
            // Supplementary code points lie above the surrogate range.
            return true;
        }
        // cp is within the BMP at this point, so the narrowing cast loses nothing.
        return !Character.isSurrogate((char) cp);
    }
}

View file

@ -2,6 +2,7 @@
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl.number.parse;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.DecimalFormatSymbols;
import com.ibm.icu.text.UnicodeSet;
@ -11,23 +12,34 @@ import com.ibm.icu.text.UnicodeSet;
*/
public class NanMatcher extends SymbolMatcher {
private static final NanMatcher DEFAULT = new NanMatcher();
private static final NanMatcher DEFAULT = new NanMatcher("NaN");
private static final NanMatcher DEFAULT_FOLDED = new NanMatcher(UCharacter.foldCase("NaN", true));
public static NanMatcher getInstance(DecimalFormatSymbols symbols) {
String symbolString = symbols.getNaN();
public static NanMatcher getInstance(DecimalFormatSymbols symbols, int parseFlags) {
String symbolString = ParsingUtils.maybeFold(symbols.getNaN(), parseFlags);
if (DEFAULT.string.equals(symbolString)) {
return DEFAULT;
} else if (DEFAULT_FOLDED.string.equals(symbolString)) {
return DEFAULT_FOLDED;
} else {
return new NanMatcher(symbolString);
}
}
private NanMatcher(String symbolString) {
super(symbolString, DEFAULT.uniSet);
super(symbolString, UnicodeSet.EMPTY);
}
private NanMatcher() {
super("NaN", UnicodeSet.EMPTY);
@Override
public UnicodeSet getLeadCodePoints() {
    // The two shared singletons are answered from UnicodeSetStaticCache; any other instance
    // defers to the superclass implementation.
    if (this == DEFAULT) {
        return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.CAPITAL_N);
    }
    if (this == DEFAULT_FOLDED) {
        return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.FOLDED_N);
    }
    return super.getLeadCodePoints();
}
@Override

View file

@ -29,7 +29,7 @@ public interface NumberParseMatcher {
* this matcher unless a segment begins with a char in this set. To make this matcher always run, return
* {@link UnicodeSet#ALL_CODE_POINTS}.
*/
public UnicodeSet getLeadChars(boolean ignoreCase);
public UnicodeSet getLeadCodePoints();
/**
* Method called at the end of a parse, after all matchers have failed to consume any more chars. Allows a matcher

View file

@ -14,8 +14,10 @@ import com.ibm.icu.impl.number.CustomSymbolCurrency;
import com.ibm.icu.impl.number.DecimalFormatProperties;
import com.ibm.icu.impl.number.Parse.ParseMode;
import com.ibm.icu.impl.number.PatternStringParser;
import com.ibm.icu.impl.number.PatternStringParser.ParsedPatternInfo;
import com.ibm.icu.impl.number.PropertiesAffixPatternProvider;
import com.ibm.icu.impl.number.RoundingUtils;
import com.ibm.icu.number.Grouper;
import com.ibm.icu.text.DecimalFormatSymbols;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.Currency;
@ -33,23 +35,26 @@ public class NumberParserImpl {
public static NumberParserImpl createParserFromPattern(String pattern, boolean strictGrouping) {
// Temporary frontend for testing.
NumberParserImpl parser = new NumberParserImpl(true, true);
int parseFlags = ParsingUtils.PARSE_FLAG_IGNORE_CASE
| ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
if (strictGrouping) {
parseFlags |= ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE;
}
NumberParserImpl parser = new NumberParserImpl(parseFlags, true);
ULocale locale = new ULocale("en_IN");
DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(locale);
IgnorablesMatcher ignorables = IgnorablesMatcher.DEFAULT;
AffixPatternProvider patternInfo = PatternStringParser.parseToPatternInfo(pattern);
AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, true);
ParsedPatternInfo patternInfo = PatternStringParser.parseToPatternInfo(pattern);
AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, parseFlags);
Grouper grouper = Grouper.defaults().withLocaleData(patternInfo);
parser.addMatcher(ignorables);
DecimalMatcher decimalMatcher = new DecimalMatcher();
decimalMatcher.requireGroupingMatch = strictGrouping;
decimalMatcher.grouping1 = 3;
decimalMatcher.grouping2 = 2;
decimalMatcher.freeze(symbols, false, false);
parser.addMatcher(decimalMatcher);
parser.addMatcher(DecimalMatcher.getInstance(symbols, grouper, parseFlags));
parser.addMatcher(MinusSignMatcher.getInstance(symbols));
parser.addMatcher(new ScientificMatcher(symbols));
parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags));
parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
parser.addMatcher(new RequireNumberMatcher());
@ -90,7 +95,8 @@ public class NumberParserImpl {
currency = Currency.getInstance(result.currencyCode);
} else {
assert 0 != (result.flags & ParsedNumber.FLAG_HAS_DEFAULT_CURRENCY);
currency = CustomSymbolCurrency.resolve(properties.getCurrency(), symbols.getULocale(), symbols);
currency = CustomSymbolCurrency
.resolve(properties.getCurrency(), symbols.getULocale(), symbols);
}
return new CurrencyAmount(result.getNumber(), currency);
} else {
@ -110,23 +116,44 @@ public class NumberParserImpl {
DecimalFormatSymbols symbols,
boolean parseCurrency,
boolean optimize) {
NumberParserImpl parser = new NumberParserImpl(!properties.getParseCaseSensitive(), optimize);
ULocale locale = symbols.getULocale();
AffixPatternProvider patternInfo = new PropertiesAffixPatternProvider(properties);
Currency currency = CustomSymbolCurrency.resolve(properties.getCurrency(), locale, symbols);
boolean isStrict = properties.getParseMode() == ParseMode.STRICT;
boolean decimalSeparatorRequired = properties.getDecimalPatternMatchRequired()
? (properties.getDecimalSeparatorAlwaysShown()
|| properties.getMaximumFractionDigits() != 0)
: false;
Grouper grouper = Grouper.defaults().withProperties(properties);
int parseFlags = 0;
if (!properties.getParseCaseSensitive()) {
parseFlags |= ParsingUtils.PARSE_FLAG_IGNORE_CASE;
}
if (properties.getParseIntegerOnly()) {
parseFlags |= ParsingUtils.PARSE_FLAG_INTEGER_ONLY;
}
if (isStrict) {
parseFlags |= ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE;
} else {
parseFlags |= ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
}
if (grouper.getPrimary() == -1) {
parseFlags |= ParsingUtils.PARSE_FLAG_GROUPING_DISABLED;
}
if (parseCurrency || patternInfo.hasCurrencySign()) {
parseFlags |= ParsingUtils.PARSE_FLAG_MONETARY_SEPARATORS;
}
IgnorablesMatcher ignorables = isStrict ? IgnorablesMatcher.STRICT : IgnorablesMatcher.DEFAULT;
boolean decimalSeparatorRequired = properties.getDecimalPatternMatchRequired()
? (properties.getDecimalSeparatorAlwaysShown() || properties.getMaximumFractionDigits() != 0)
: false;
NumberParserImpl parser = new NumberParserImpl(parseFlags, optimize);
//////////////////////
/// AFFIX MATCHERS ///
//////////////////////
// Set up a pattern modifier with mostly defaults to generate AffixMatchers.
AffixPatternProvider patternInfo = new PropertiesAffixPatternProvider(properties);
AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, !isStrict);
AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, parseFlags);
////////////////////////
/// CURRENCY MATCHER ///
@ -134,18 +161,20 @@ public class NumberParserImpl {
if (parseCurrency || patternInfo.hasCurrencySign()) {
parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
parser.addMatcher(CurrencyMatcher.getInstance(currency, locale));
parser.addMatcher(CurrencyMatcher.getInstance(currency, locale, parseFlags));
}
///////////////////////////////
/// OTHER STANDARD MATCHERS ///
///////////////////////////////
if (!isStrict || patternInfo.containsSymbolType(AffixUtils.TYPE_PLUS_SIGN) || properties.getSignAlwaysShown()) {
if (!isStrict
|| patternInfo.containsSymbolType(AffixUtils.TYPE_PLUS_SIGN)
|| properties.getSignAlwaysShown()) {
parser.addMatcher(PlusSignMatcher.getInstance(symbols));
}
parser.addMatcher(MinusSignMatcher.getInstance(symbols));
parser.addMatcher(NanMatcher.getInstance(symbols));
parser.addMatcher(NanMatcher.getInstance(symbols, parseFlags));
parser.addMatcher(PercentMatcher.getInstance(symbols));
parser.addMatcher(PermilleMatcher.getInstance(symbols));
parser.addMatcher(InfinityMatcher.getInstance(symbols));
@ -154,17 +183,9 @@ public class NumberParserImpl {
parser.addMatcher(new PaddingMatcher(padString));
}
parser.addMatcher(ignorables);
DecimalMatcher decimalMatcher = new DecimalMatcher();
decimalMatcher.requireGroupingMatch = isStrict;
decimalMatcher.groupingEnabled = properties.getGroupingSize() > 0;
decimalMatcher.decimalEnabled = properties.getDecimalPatternMatchRequired() ? decimalSeparatorRequired : true;
decimalMatcher.grouping1 = properties.getGroupingSize();
decimalMatcher.grouping2 = properties.getSecondaryGroupingSize();
decimalMatcher.integerOnly = properties.getParseIntegerOnly();
decimalMatcher.freeze(symbols, parseCurrency || patternInfo.hasCurrencySign(), isStrict);
parser.addMatcher(decimalMatcher);
parser.addMatcher(DecimalMatcher.getInstance(symbols, grouper, parseFlags));
if (!properties.getParseNoExponent()) {
parser.addMatcher(new ScientificMatcher(symbols));
parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags));
}
//////////////////
@ -195,9 +216,9 @@ public class NumberParserImpl {
return parser;
}
private final boolean ignoreCase;
private final int parseFlags;
private final List<NumberParseMatcher> matchers;
private final List<UnicodeSet> leadCharses;
private final List<UnicodeSet> leadCodePointses;
private Comparator<ParsedNumber> comparator;
private boolean frozen;
@ -205,43 +226,44 @@ public class NumberParserImpl {
* Creates a new, empty parser.
*
* @param ignoreCase
* If true, perform case-folding. This parameter needs to go into the constructor because its value is
* used during the construction of the matcher chain.
* If true, perform case-folding. This parameter needs to go into the constructor because
* its value is used during the construction of the matcher chain.
* @param optimize
* If true, compute "lead chars" UnicodeSets for the matchers. This reduces parsing runtime but increases
* construction runtime. If the parser is going to be used only once or twice, set this to false; if it
* is going to be used hundreds of times, set it to true.
* If true, compute "lead chars" UnicodeSets for the matchers. This reduces parsing
* runtime but increases construction runtime. If the parser is going to be used only once
* or twice, set this to false; if it is going to be used hundreds of times, set it to
* true.
*/
public NumberParserImpl(boolean ignoreCase, boolean optimize) {
public NumberParserImpl(int parseFlags, boolean optimize) {
matchers = new ArrayList<NumberParseMatcher>();
if (optimize) {
leadCharses = new ArrayList<UnicodeSet>();
leadCodePointses = new ArrayList<UnicodeSet>();
} else {
leadCharses = null;
leadCodePointses = null;
}
comparator = ParsedNumber.COMPARATOR; // default value
this.ignoreCase = ignoreCase;
this.parseFlags = parseFlags;
frozen = false;
}
public void addMatcher(NumberParseMatcher matcher) {
assert !frozen;
this.matchers.add(matcher);
if (leadCharses != null) {
UnicodeSet leadChars = matcher.getLeadChars(ignoreCase);
assert leadChars.isFrozen();
this.leadCharses.add(leadChars);
if (leadCodePointses != null) {
UnicodeSet leadCodePoints = matcher.getLeadCodePoints();
assert leadCodePoints.isFrozen();
this.leadCodePointses.add(leadCodePoints);
}
}
public void addMatchers(Collection<? extends NumberParseMatcher> matchers) {
assert !frozen;
this.matchers.addAll(matchers);
if (leadCharses != null) {
if (leadCodePointses != null) {
for (NumberParseMatcher matcher : matchers) {
UnicodeSet leadChars = matcher.getLeadChars(ignoreCase);
assert leadChars.isFrozen();
this.leadCharses.add(leadChars);
UnicodeSet leadCodePoints = matcher.getLeadCodePoints();
assert leadCodePoints.isFrozen();
this.leadCodePointses.add(leadCodePoints);
}
}
}
@ -263,8 +285,8 @@ public class NumberParserImpl {
* Primary entrypoint to parsing code path.
*
* @param input
* The string to parse. This is a String, not CharSequence, to enforce assumptions about immutability
* (CharSequences are not guaranteed to be immutable).
* The string to parse. This is a String, not CharSequence, to enforce assumptions about
* immutability (CharSequences are not guaranteed to be immutable).
* @param start
* The index into the string at which to start parsing.
* @param greedy
@ -274,7 +296,7 @@ public class NumberParserImpl {
*/
public void parse(String input, int start, boolean greedy, ParsedNumber result) {
assert frozen;
StringSegment segment = new StringSegment(input, ignoreCase);
StringSegment segment = new StringSegment(ParsingUtils.maybeFold(input, parseFlags));
segment.adjustOffset(start);
if (greedy) {
parseGreedyRecursive(segment, result);
@ -293,10 +315,9 @@ public class NumberParserImpl {
}
int initialOffset = segment.getOffset();
char leadChar = leadCharses == null ? 0
: ignoreCase ? ParsingUtils.getCaseFoldedLeadingChar(segment) : segment.charAt(0);
int leadCp = segment.getCodePoint();
for (int i = 0; i < matchers.size(); i++) {
if (leadCharses != null && !leadCharses.get(i).contains(leadChar)) {
if (leadCodePointses != null && !leadCodePointses.get(i).contains(leadCp)) {
continue;
}
NumberParseMatcher matcher = matchers.get(i);
@ -304,7 +325,8 @@ public class NumberParserImpl {
if (segment.getOffset() != initialOffset) {
// In a greedy parse, recurse on only the first match.
parseGreedyRecursive(segment, result);
// The following line resets the offset so that the StringSegment says the same across the function
// The following line resets the offset so that the StringSegment stays the same across
// the function
// call boundary. Since we recurse only once, this line is not strictly necessary.
segment.setOffset(initialOffset);
return;
@ -329,10 +351,11 @@ public class NumberParserImpl {
for (int i = 0; i < matchers.size(); i++) {
NumberParseMatcher matcher = matchers.get(i);
// In a non-greedy parse, we attempt all possible matches and pick the best.
for (int charsToConsume = 1; charsToConsume <= segment.length(); charsToConsume++) {
candidate.copyFrom(initial);
for (int charsToConsume = 0; charsToConsume < segment.length();) {
charsToConsume += Character.charCount(Character.codePointAt(segment, charsToConsume));
// Run the matcher on a segment of the current length.
candidate.copyFrom(initial);
segment.setLength(charsToConsume);
boolean maybeMore = matcher.match(segment, candidate);
segment.resetLength();

View file

@ -3,7 +3,6 @@
package com.ibm.icu.impl.number.parse;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSet.EntryRange;
@ -12,50 +11,40 @@ import com.ibm.icu.text.UnicodeSet.EntryRange;
*/
public class ParsingUtils {
/**
* Adds all chars and lead surrogates from input into output.
*/
public static void putLeadSurrogates(UnicodeSet input, UnicodeSet output) {
if (input.isEmpty()) {
return;
}
public static final int PARSE_FLAG_IGNORE_CASE = 0x0001;
public static final int PARSE_FLAG_MONETARY_SEPARATORS = 0x0002;
public static final int PARSE_FLAG_STRICT_SEPARATORS = 0x0004;
public static final int PARSE_FLAG_STRICT_GROUPING_SIZE = 0x0008;
public static final int PARSE_FLAG_INTEGER_ONLY = 0x0010;
public static final int PARSE_FLAG_GROUPING_DISABLED = 0x0020;
public static final int PARSE_FLAG_DECIMAL_SCIENTIFIC = 0x0040;
public static final int PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES = 0x0080;
public static void putLeadCodePoints(UnicodeSet input, UnicodeSet output) {
for (EntryRange range : input.ranges()) {
if (range.codepointEnd <= 0xFFFF) {
// All BMP chars
output.add(range.codepoint, range.codepointEnd);
} else {
// Need to get the lead surrogates
// TODO: Make this more efficient?
if (range.codepoint <= 0xFFFF) {
output.add(range.codepoint, 0xFFFF);
}
for (int cp = Math.max(0x10000, range.codepoint); cp <= range.codepointEnd; cp++) {
output.add(UTF16.getLeadSurrogate(cp));
}
}
output.add(range.codepoint, range.codepointEnd);
}
for (String str : input.strings()) {
output.add(str.codePointAt(0));
}
}
/**
 * Adds the first code point of the given string to the output set. An empty string contributes
 * nothing.
 *
 * @param input
 *            The string whose leading code point should be recorded.
 * @param output
 *            The set that receives the code point.
 */
public static void putLeadCodePoint(String input, UnicodeSet output) {
    if (input.isEmpty()) {
        return;
    }
    int leadCp = input.codePointAt(0);
    output.add(leadCp);
}
private static final UnicodeSet LETTERS = new UnicodeSet("[:letter:]").freeze();
/**
* Adds the first char of the given string to leadChars, performing case-folding if necessary.
* Case-folds the string if IGNORE_CASE flag is set; otherwise, returns the same string.
*/
public static void putLeadingChar(String str, UnicodeSet leadChars, boolean ignoreCase) {
if (str.isEmpty()) {
return;
}
if (ignoreCase) {
leadChars.add(getCaseFoldedLeadingChar(str));
public static String maybeFold(String input, int parseFlags) {
if (0 != (parseFlags & PARSE_FLAG_IGNORE_CASE) && LETTERS.containsSome(input)) {
return UCharacter.foldCase(input, true);
} else {
leadChars.add(str.charAt(0));
}
}
public static char getCaseFoldedLeadingChar(CharSequence str) {
int cp = UCharacter.foldCase(Character.codePointAt(str, 0), true);
if (cp <= 0xFFFF) {
return (char) cp;
} else {
return UTF16.getLeadSurrogate(cp);
return input;
}
}

View file

@ -35,7 +35,7 @@ public abstract class RangeMatcher implements NumberParseMatcher {
}
// If we get here, the code point didn't match the uniSet.
return segment.isLeadingSurrogate();
return false;
}
// If we get here, we consumed the entire string segment.
@ -43,10 +43,10 @@ public abstract class RangeMatcher implements NumberParseMatcher {
}
@Override
public UnicodeSet getLeadChars(boolean ignoreCase) {
UnicodeSet leadChars = new UnicodeSet();
ParsingUtils.putLeadSurrogates(uniSet, leadChars);
return leadChars.freeze();
public UnicodeSet getLeadCodePoints() {
    // Gather the lead code points of uniSet into a fresh set and hand back the frozen result.
    UnicodeSet collected = new UnicodeSet();
    ParsingUtils.putLeadCodePoints(uniSet, collected);
    return collected.freeze();
}
@Override

View file

@ -2,6 +2,7 @@
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl.number.parse;
import com.ibm.icu.number.Grouper;
import com.ibm.icu.text.DecimalFormatSymbols;
import com.ibm.icu.text.UnicodeSet;
@ -14,13 +15,19 @@ public class ScientificMatcher implements NumberParseMatcher {
private final String exponentSeparatorString;
private final DecimalMatcher exponentMatcher;
public ScientificMatcher(DecimalFormatSymbols symbols) {
exponentSeparatorString = symbols.getExponentSeparator();
exponentMatcher = new DecimalMatcher();
exponentMatcher.isScientific = true;
exponentMatcher.groupingEnabled = false;
exponentMatcher.decimalEnabled = false;
exponentMatcher.freeze(symbols, false, false);
// Factory entry point; the constructor below is private, so instances are obtained here.
// Currently every call creates a new ScientificMatcher.
public static ScientificMatcher getInstance(
DecimalFormatSymbols symbols,
Grouper grouper,
int parseFlags) {
// TODO: Static-initialize most common instances?
return new ScientificMatcher(symbols, grouper, parseFlags);
}
private ScientificMatcher(DecimalFormatSymbols symbols, Grouper grouper, int parseFlags) {
exponentSeparatorString = ParsingUtils.maybeFold(symbols.getExponentSeparator(), parseFlags);
exponentMatcher = DecimalMatcher.getInstance(symbols,
grouper,
ParsingUtils.PARSE_FLAG_DECIMAL_SCIENTIFIC | ParsingUtils.PARSE_FLAG_INTEGER_ONLY);
}
@Override
@ -76,10 +83,15 @@ public class ScientificMatcher implements NumberParseMatcher {
}
@Override
public UnicodeSet getLeadChars(boolean ignoreCase) {
UnicodeSet leadChars = new UnicodeSet();
ParsingUtils.putLeadingChar(exponentSeparatorString, leadChars, ignoreCase);
return leadChars.freeze();
public UnicodeSet getLeadCodePoints() {
    // An empty exponent separator has no lead code point. String.codePointAt(0) would throw
    // StringIndexOutOfBoundsException on it, so report an empty set instead.
    if (exponentSeparatorString.isEmpty()) {
        return UnicodeSet.EMPTY;
    }
    int cp = exponentSeparatorString.codePointAt(0);
    // Reuse the statically cached singleton sets for the two common separators.
    if (cp == 'E') {
        return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.CAPITAL_E);
    } else if (cp == 'e') {
        return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.FOLDED_E);
    } else {
        // Any other separator: build a one-element frozen set on demand.
        return new UnicodeSet().add(cp).freeze();
    }
}
@Override

View file

@ -2,11 +2,9 @@
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl.number.parse;
import com.ibm.icu.lang.UCharacter;
/**
* A mutable class allowing for a String with a variable offset and length. The charAt, length, and subSequence methods
* all operate relative to the fixed offset into the String.
* A mutable class allowing for a String with a variable offset and length. The charAt, length, and
* subSequence methods all operate relative to the fixed offset into the String.
*
* @author sffc
*/
@ -14,13 +12,11 @@ public class StringSegment implements CharSequence {
private final String str;
private int start;
private int end;
private final boolean ignoreCase;
public StringSegment(String str, boolean ignoreCase) {
public StringSegment(String str) {
this.str = str;
this.start = 0;
this.end = str.length();
this.ignoreCase = ignoreCase;
}
public int getOffset() {
@ -66,7 +62,8 @@ public class StringSegment implements CharSequence {
}
/**
* Returns the first code point in the string segment, or -1 if the string starts with an invalid code point.
* Returns the first code point in the string segment, or -1 if the string starts with an invalid
* code point.
*/
public int getCodePoint() {
assert start < end;
@ -81,36 +78,17 @@ public class StringSegment implements CharSequence {
}
/**
* Returns whether the segment is one char in length, and that the char is a leading surrogate.
*/
public boolean isLeadingSurrogate() {
return (end - start == 1) && Character.isHighSurrogate(str.charAt(start));
}
/**
* Returns the length of the prefix shared by this StringSegment and the given CharSequence. For example, if this
* string segment is "aab", and the char sequence is "aac", this method returns 2, since the first 2 characters are
* the same.
* Returns the length of the prefix shared by this StringSegment and the given CharSequence. For
* example, if this string segment is "aab", and the char sequence is "aac", this method returns 2,
* since the first 2 characters are the same.
*/
public int getCommonPrefixLength(CharSequence other) {
int offset = 0;
for (; offset < Math.min(length(), other.length());) {
if (ignoreCase) {
// NOTE: Character.codePointAt() returns the leading surrogate if it is the only char left in the
// string. UCharacter.foldCase() will simply return the same integer since it is not a valid code point.
int cp1 = Character.codePointAt(this, offset);
int cp2 = Character.codePointAt(other, offset);
if (cp1 != cp2 && UCharacter.foldCase(cp1, true) != UCharacter.foldCase(cp2, true)) {
break;
}
offset += Character.charCount(cp1);
} else {
// Case folding is not necessary. Use a slightly faster code path comparing chars with chars.
if (charAt(offset) != other.charAt(offset)) {
break;
}
offset++;
if (charAt(offset) != other.charAt(offset)) {
break;
}
offset++;
}
return offset;
}

View file

@ -11,7 +11,6 @@ import com.ibm.icu.text.UnicodeSet;
public abstract class SymbolMatcher implements NumberParseMatcher {
protected final String string;
protected final UnicodeSet uniSet;
protected final UnicodeSet leadChars;
// TODO: Implement this class using only UnicodeSet and not String?
// How to deal with case folding?
@ -19,13 +18,11 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
protected SymbolMatcher(String symbolString, UnicodeSet symbolUniSet) {
string = symbolString;
uniSet = symbolUniSet;
leadChars = null;
}
protected SymbolMatcher(UnicodeSetStaticCache.Key key) {
string = "";
uniSet = UnicodeSetStaticCache.get(key);
leadChars = UnicodeSetStaticCache.getLeadChars(key);
}
@Override
@ -43,7 +40,7 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
}
if (string.isEmpty()) {
return segment.isLeadingSurrogate();
return false;
}
int overlap = segment.getCommonPrefixLength(string);
if (overlap == string.length()) {
@ -51,19 +48,20 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
accept(segment, result);
return false;
}
return overlap == segment.length() || segment.isLeadingSurrogate();
return overlap == segment.length();
}
@Override
public UnicodeSet getLeadChars(boolean ignoreCase) {
if (leadChars != null) {
return leadChars;
public UnicodeSet getLeadCodePoints() {
if (string == null || string.isEmpty()) {
// Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints.
return uniSet;
}
UnicodeSet leadChars = new UnicodeSet();
ParsingUtils.putLeadSurrogates(uniSet, leadChars);
ParsingUtils.putLeadingChar(string, leadChars, ignoreCase);
return leadChars.freeze();
UnicodeSet leadCodePoints = new UnicodeSet();
ParsingUtils.putLeadCodePoints(uniSet, leadCodePoints);
ParsingUtils.putLeadCodePoint(string, leadCodePoints);
return leadCodePoints.freeze();
}
@Override

View file

@ -8,8 +8,14 @@ import java.util.Map;
import com.ibm.icu.text.UnicodeSet;
/**
* @author sffc
* This class statically initializes UnicodeSets useful for number parsing. Microbenchmarks show this to
* bring a very sizeable performance boost.
*
* IMPORTANT ASSUMPTION: All of the sets contain code points (no strings) and they are all case-folded.
* If this assumption were ever broken, logic in classes such as SymbolMatcher would need to be updated
* in order to return well-formed sets upon calls to getLeadCodePoints().
*
* @author sffc
*/
public class UnicodeSetStaticCache {
public static enum Key {
@ -42,19 +48,26 @@ public class UnicodeSetStaticCache {
// Other
DIGITS,
CAPITAL_N,
FOLDED_N,
CAPITAL_E,
FOLDED_E,
// Combined Separators with Digits (for lead code points)
DIGITS_OR_COMMA_OR_OTHER,
DIGITS_OR_PERIOD_OR_OTHER,
DIGITS_OR_COMMA_OR_PERIOD_OR_OTHER,
DIGITS_OR_STRICT_COMMA_OR_OTHER,
DIGITS_OR_STRICT_PERIOD_OR_OTHER,
DIGITS_OR_STRICT_COMMA_OR_PERIOD_OR_OTHER,
};
private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<Key, UnicodeSet>(Key.class);
private static final Map<Key, UnicodeSet> leadCharsSets = new EnumMap<Key, UnicodeSet>(Key.class);
/**
 * Looks up the set stored under the given key in the static unicodeSets map.
 *
 * @param key
 *            The key to look up.
 * @return The mapped UnicodeSet, or null if the map has no entry for the key.
 */
public static UnicodeSet get(Key key) {
return unicodeSets.get(key);
}
public static UnicodeSet getLeadChars(Key key) {
return leadCharsSets.get(key);
}
/**
 * Returns key1 if the set registered under key1 contains the given string; otherwise null.
 */
public static Key chooseFrom(String str, Key key1) {
    if (get(key1).contains(str)) {
        return key1;
    }
    return null;
}
@ -107,6 +120,23 @@ public class UnicodeSetStaticCache {
// Strict 1'234.567
return Key.STRICT_PERIOD_OR_OTHER;
} else if (key1 == Key.COMMA_OR_OTHER && key2 == Key.DIGITS) {
return Key.DIGITS_OR_COMMA_OR_OTHER;
} else if (key1 == Key.PERIOD_OR_OTHER && key2 == Key.DIGITS) {
return Key.DIGITS_OR_PERIOD_OR_OTHER;
} else if (key1 == Key.COMMA_OR_PERIOD_OR_OTHER && key2 == Key.DIGITS) {
return Key.DIGITS_OR_COMMA_OR_PERIOD_OR_OTHER;
} else if (key1 == Key.STRICT_COMMA_OR_OTHER && key2 == Key.DIGITS) {
return Key.DIGITS_OR_STRICT_COMMA_OR_OTHER;
} else if (key1 == Key.STRICT_PERIOD_OR_OTHER && key2 == Key.DIGITS) {
return Key.DIGITS_OR_STRICT_PERIOD_OR_OTHER;
} else if (key1 == Key.STRICT_COMMA_OR_PERIOD_OR_OTHER && key2 == Key.DIGITS) {
return Key.DIGITS_OR_STRICT_COMMA_OR_PERIOD_OR_OTHER;
}
return null;
@ -143,8 +173,10 @@ public class UnicodeSetStaticCache {
unicodeSets.put(Key.PERIOD_OR_OTHER, computeUnion(Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
unicodeSets.put(Key.COMMA_OR_PERIOD_OR_OTHER,
computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
unicodeSets.put(Key.STRICT_COMMA_OR_OTHER, computeUnion(Key.STRICT_COMMA, Key.OTHER_GROUPING_SEPARATORS));
unicodeSets.put(Key.STRICT_PERIOD_OR_OTHER, computeUnion(Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
unicodeSets.put(Key.STRICT_COMMA_OR_OTHER,
computeUnion(Key.STRICT_COMMA, Key.OTHER_GROUPING_SEPARATORS));
unicodeSets.put(Key.STRICT_PERIOD_OR_OTHER,
computeUnion(Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
unicodeSets.put(Key.STRICT_COMMA_OR_PERIOD_OR_OTHER,
computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
@ -157,11 +189,20 @@ public class UnicodeSetStaticCache {
unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze());
unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
unicodeSets.put(Key.CAPITAL_N, new UnicodeSet("[N]").freeze());
unicodeSets.put(Key.FOLDED_N, new UnicodeSet("[n]").freeze());
unicodeSets.put(Key.CAPITAL_E, new UnicodeSet("[E]").freeze());
unicodeSets.put(Key.FOLDED_E, new UnicodeSet("[e]").freeze());
for (Key key : Key.values()) {
UnicodeSet leadChars = new UnicodeSet();
ParsingUtils.putLeadSurrogates(get(key), leadChars);
leadCharsSets.put(key, leadChars.freeze());
}
unicodeSets.put(Key.DIGITS_OR_COMMA_OR_OTHER, computeUnion(Key.DIGITS, Key.COMMA_OR_OTHER));
unicodeSets.put(Key.DIGITS_OR_PERIOD_OR_OTHER, computeUnion(Key.DIGITS, Key.PERIOD_OR_OTHER));
unicodeSets.put(Key.DIGITS_OR_COMMA_OR_PERIOD_OR_OTHER,
computeUnion(Key.DIGITS, Key.COMMA_OR_PERIOD_OR_OTHER));
unicodeSets.put(Key.DIGITS_OR_STRICT_COMMA_OR_OTHER,
computeUnion(Key.DIGITS, Key.STRICT_COMMA_OR_OTHER));
unicodeSets.put(Key.DIGITS_OR_STRICT_PERIOD_OR_OTHER,
computeUnion(Key.DIGITS, Key.STRICT_PERIOD_OR_OTHER));
unicodeSets.put(Key.DIGITS_OR_STRICT_COMMA_OR_PERIOD_OR_OTHER,
computeUnion(Key.DIGITS, Key.STRICT_COMMA_OR_PERIOD_OR_OTHER));
}
}

View file

@ -15,7 +15,7 @@ public abstract class ValidationMatcher implements NumberParseMatcher {
}
@Override
public UnicodeSet getLeadChars(boolean ignoreCase) {
public UnicodeSet getLeadCodePoints() {
// This matcher reports no lead code points: it always returns the UnicodeSet.EMPTY constant.
return UnicodeSet.EMPTY;
}

View file

@ -2,6 +2,7 @@
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.number;
import com.ibm.icu.impl.number.DecimalFormatProperties;
import com.ibm.icu.impl.number.DecimalQuantity;
import com.ibm.icu.impl.number.PatternStringParser.ParsedPatternInfo;
@ -84,7 +85,30 @@ public class Grouper {
}
}
Grouper withLocaleData(ParsedPatternInfo patternInfo) {
/**
 * Resolves this Grouper against the grouping settings of the given properties. A Grouper whose
 * primary grouping size is anything other than -2 is returned unchanged.
 *
 * @param properties
 *            Supplies the grouping size, secondary grouping size, and minimum grouping digits.
 * @return This instance, or the instance obtained from getInstance for the resolved sizes.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public Grouper withProperties(DecimalFormatProperties properties) {
    // NOTE(review): -2 presumably marks a Grouper whose sizes are still unresolved - confirm.
    if (this.grouping1 != -2) {
        return this;
    }
    byte primary = (byte) properties.getGroupingSize();
    byte secondary = (byte) properties.getSecondaryGroupingSize();
    int minGrouping = properties.getMinimumGroupingDigits();
    // A non-positive primary size falls back to the secondary size, or to -1 if that is
    // non-positive as well.
    if (primary <= 0) {
        primary = secondary > 0 ? secondary : -1;
    }
    // A non-positive secondary size mirrors the (already resolved) primary size.
    if (secondary <= 0) {
        secondary = primary;
    }
    // TODO: Is it important to handle minGrouping > 2?
    return getInstance(primary, secondary, minGrouping == 2);
}
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
public Grouper withLocaleData(ParsedPatternInfo patternInfo) {
if (grouping1 != -2) {
return this;
}
@ -112,4 +136,22 @@ public class Grouper {
&& (position % grouping2) == 0
&& value.getUpperDisplayMagnitude() - grouping1 + 1 >= (min2 ? 2 : 1);
}
/**
 * Returns the value of this Grouper's grouping1 field (the primary grouping size).
 *
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public byte getPrimary() {
    return this.grouping1;
}
/**
 * Returns the value of this Grouper's grouping2 field (the secondary grouping size).
 *
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public byte getSecondary() {
    return this.grouping2;
}
}

View file

@ -193,14 +193,7 @@ final class NumberPropertyMapper {
// GROUPING STRATEGY //
///////////////////////
int grouping1 = properties.getGroupingSize();
int grouping2 = properties.getSecondaryGroupingSize();
int minGrouping = properties.getMinimumGroupingDigits();
assert grouping1 >= -2; // value of -2 means to forward no grouping information
grouping1 = grouping1 > 0 ? grouping1 : grouping2 > 0 ? grouping2 : grouping1;
grouping2 = grouping2 > 0 ? grouping2 : grouping1;
// TODO: Is it important to handle minGrouping > 2?
macros.grouper = Grouper.getInstance((byte) grouping1, (byte) grouping2, minGrouping == 2);
macros.grouper = Grouper.defaults().withProperties(properties);
/////////////
// PADDING //

View file

@ -35,14 +35,14 @@ public class NumberParserTest {
{ 3, "𝟱𝟭𝟰𝟮𝟯x", "0", 10, 51423. },
{ 3, " 𝟱𝟭𝟰𝟮𝟯", "0", 11, 51423. },
{ 3, "𝟱𝟭𝟰𝟮𝟯 ", "0", 10, 51423. },
{ 7, "𝟱𝟭,𝟰𝟮𝟯", "0", 11, 51423. },
{ 7, "𝟳,𝟴𝟵,𝟱𝟭,𝟰𝟮𝟯", "0", 19, 78951423. },
{ 4, "𝟳𝟴,𝟵𝟱𝟭,𝟰𝟮𝟯", "0", 11, 78951. },
{ 7, "𝟳𝟴,𝟵𝟱𝟭.𝟰𝟮𝟯", "0", 18, 78951.423 },
{ 7, "𝟳𝟴,𝟬𝟬𝟬", "0", 11, 78000. },
{ 7, "𝟳𝟴,𝟬𝟬𝟬.𝟬𝟬𝟬", "0", 18, 78000. },
{ 7, "𝟳𝟴,𝟬𝟬𝟬.𝟬𝟮𝟯", "0", 18, 78000.023 },
{ 7, "𝟳𝟴.𝟬𝟬𝟬.𝟬𝟮𝟯", "0", 11, 78. },
{ 7, "𝟱𝟭,𝟰𝟮𝟯", "#,##,##0", 11, 51423. },
{ 7, "𝟳,𝟴𝟵,𝟱𝟭,𝟰𝟮𝟯", "#,##,##0", 19, 78951423. },
{ 4, "𝟳𝟴,𝟵𝟱𝟭,𝟰𝟮𝟯", "#,##,##0", 11, 78951. },
{ 7, "𝟳𝟴,𝟵𝟱𝟭.𝟰𝟮𝟯", "#,##,##0", 18, 78951.423 },
{ 7, "𝟳𝟴,𝟬𝟬𝟬", "#,##,##0", 11, 78000. },
{ 7, "𝟳𝟴,𝟬𝟬𝟬.𝟬𝟬𝟬", "#,##,##0", 18, 78000. },
{ 7, "𝟳𝟴,𝟬𝟬𝟬.𝟬𝟮𝟯", "#,##,##0", 18, 78000.023 },
{ 7, "𝟳𝟴.𝟬𝟬𝟬.𝟬𝟮𝟯", "#,##,##0", 11, 78. },
{ 3, "-𝟱𝟭𝟰𝟮𝟯", "0", 11, -51423. },
{ 3, "-𝟱𝟭𝟰𝟮𝟯-", "0", 11, -51423. },
{ 3, "a51423US dollars", "a0¤¤¤", 16, 51423. },
@ -68,9 +68,11 @@ public class NumberParserTest {
{ 3, "𝟱.𝟭𝟰𝟮E𝟯", "0", 12, 5142. },
{ 3, "𝟱.𝟭𝟰𝟮E-𝟯", "0", 13, 0.005142 },
{ 3, "𝟱.𝟭𝟰𝟮e-𝟯", "0", 13, 0.005142 },
{ 3, "5,142.50 Canadian dollars", "0", 25, 5142.5 },
{ 7, "5,142.50 Canadian dollars", "#,##,##0", 25, 5142.5 },
// { 3, "a$ b5", "a ¤ b0", 6, 5.0 }, // TODO: Does not work
{ 7, ".00", "0", 3, 0.0 },
{ 3, "📺1.23", "📺0;📻0", 6, 1.23 },
{ 3, "📻1.23", "📺0;📻0", 6, -1.23 },
{ 3, ".00", "0", 3, 0.0 },
{ 3, "0", "0", 1, 0.0 } };
for (Object[] cas : cases) {

View file

@ -3,8 +3,6 @@
package com.ibm.icu.dev.test.number;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import org.junit.Test;
@ -19,7 +17,7 @@ public class StringSegmentTest {
@Test
public void testOffset() {
StringSegment segment = new StringSegment(SAMPLE_STRING, false);
StringSegment segment = new StringSegment(SAMPLE_STRING);
assertEquals(0, segment.getOffset());
segment.adjustOffset(3);
assertEquals(3, segment.getOffset());
@ -31,7 +29,7 @@ public class StringSegmentTest {
@Test
public void testLength() {
StringSegment segment = new StringSegment(SAMPLE_STRING, false);
StringSegment segment = new StringSegment(SAMPLE_STRING);
assertEquals(11, segment.length());
segment.adjustOffset(3);
assertEquals(8, segment.length());
@ -45,7 +43,7 @@ public class StringSegmentTest {
@Test
public void testCharAt() {
StringSegment segment = new StringSegment(SAMPLE_STRING, false);
StringSegment segment = new StringSegment(SAMPLE_STRING);
assertCharSequenceEquals(SAMPLE_STRING, segment);
segment.adjustOffset(3);
assertCharSequenceEquals("radio 📻", segment);
@ -55,7 +53,7 @@ public class StringSegmentTest {
@Test
public void testGetCodePoint() {
StringSegment segment = new StringSegment(SAMPLE_STRING, false);
StringSegment segment = new StringSegment(SAMPLE_STRING);
assertEquals(0x1F4FB, segment.getCodePoint());
segment.setLength(1);
assertEquals(-1, segment.getCodePoint());
@ -66,20 +64,9 @@ public class StringSegmentTest {
assertEquals(0x20, segment.getCodePoint());
}
@Test
public void testIsLeadingSurrogate() {
StringSegment segment = new StringSegment(SAMPLE_STRING, false);
assertFalse(segment.isLeadingSurrogate());
segment.setLength(1);
assertTrue(segment.isLeadingSurrogate());
segment.adjustOffset(1);
segment.setLength(1);
assertFalse(segment.isLeadingSurrogate()); // trail, not lead
}
@Test
public void testCommonPrefixLength() {
StringSegment segment = new StringSegment(SAMPLE_STRING, false);
StringSegment segment = new StringSegment(SAMPLE_STRING);
assertEquals(11, segment.getCommonPrefixLength(SAMPLE_STRING));
assertEquals(4, segment.getCommonPrefixLength("📻 r"));
assertEquals(3, segment.getCommonPrefixLength("📻 x"));
@ -101,15 +88,6 @@ public class StringSegmentTest {
assertEquals(0, segment.getCommonPrefixLength("foo"));
}
@Test
public void testIgnoreCase() {
StringSegment segment = new StringSegment(SAMPLE_STRING, true);
assertEquals(11, segment.getCommonPrefixLength(SAMPLE_STRING));
assertEquals(0, segment.getCommonPrefixLength("x"));
segment.setOffset(3);
assertEquals(5, segment.getCommonPrefixLength("RAdiO"));
}
private static void assertCharSequenceEquals(CharSequence a, CharSequence b) {
assertEquals(a.length(), b.length());
for (int i = 0; i < a.length(); i++) {

View file

@ -17,6 +17,7 @@ import org.junit.runners.JUnit4;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.TextTrieMap;
import com.ibm.icu.text.UnicodeSet;
@RunWith(JUnit4.class)
public class TextTrieMapTest extends TestFmwk {
@ -33,6 +34,7 @@ public class TextTrieMapTest extends TestFmwk {
private static final Integer SUP2 = new Integer(9);
private static final Integer SUP3 = new Integer(10);
private static final Integer SUP4 = new Integer(11);
private static final Integer SUP5 = new Integer(12);
private static final Integer FOO = new Integer(-1);
private static final Integer BAR = new Integer(-2);
@ -63,6 +65,9 @@ public class TextTrieMapTest extends TestFmwk {
{"L📺1", SUP2}, // L, 0xD83D, 0xDCFA, 1
{"L📻", SUP3}, // L, 0xD83D, 0xDCFB
{"L🃏", SUP4}, // L, 0xD83C, 0xDCCF
{"📺", SUP5}, // 0xD83D, 0xDCFA
{"📻", SUP5}, // 0xD83D, 0xDCFB
{"🃏", SUP5}, // 0xD83C, 0xDCCF
};
private static final Object[][] TESTCASES = {
@ -174,6 +179,30 @@ public class TextTrieMapTest extends TestFmwk {
checkParse(map, test, expecteds, true);
}
logln("Test for partial match");
for (Object[] cas : TESTDATA) {
String str = (String) cas[0];
for (int i = 0; i < str.length() - 1; i++) {
TextTrieMap.Output output = new TextTrieMap.Output();
map.get(str.substring(0, i), 0, output);
assertTrue("Partial string means partial match", output.partialMatch);
}
String bad = str + "x";
TextTrieMap.Output output = new TextTrieMap.Output();
map.get(bad, 0, output);
assertFalse("No partial match on bad string", output.partialMatch);
}
TextTrieMap.Output output = new TextTrieMap.Output();
map.get("Sunday", 0, output);
assertFalse("No partial match on string with no continuation", output.partialMatch);
logln("Test for LeadCodePoints");
// Note: The 📺 and 📻 have the same lead surrogate
UnicodeSet expectedLeadCodePoints = new UnicodeSet("[SMTWFL📺📻🃏]");
UnicodeSet actualLeadCodePoints = new UnicodeSet();
map.putLeadCodePoints(actualLeadCodePoints);
assertEquals("leadCodePoints", expectedLeadCodePoints, actualLeadCodePoints);
// Add duplicated entry
map.put("Sunday", FOO);
// Add duplicated entry with different casing
@ -217,6 +246,29 @@ public class TextTrieMapTest extends TestFmwk {
checkParse(map, test, expecteds, false);
}
logln("Test for partial match");
for (Object[] cas : TESTDATA) {
String str = (String) cas[0];
for (int i = 0; i < str.length() - 1; i++) {
TextTrieMap.Output output = new TextTrieMap.Output();
map.get(str.substring(0, i), 0, output);
assertTrue("Partial string means partial match", output.partialMatch);
}
String bad = str + "x";
TextTrieMap.Output output = new TextTrieMap.Output();
map.get(bad, 0, output);
assertFalse("No partial match on bad string", output.partialMatch);
}
TextTrieMap.Output output = new TextTrieMap.Output();
map.get("Sunday", 0, output);
assertFalse("No partial match on string with no continuation", output.partialMatch);
logln("Test for LeadCodePoints");
UnicodeSet expectedLeadCodePoints = new UnicodeSet("[smtwfl📺📻🃏]");
UnicodeSet actualLeadCodePoints = new UnicodeSet();
map.putLeadCodePoints(actualLeadCodePoints);
assertEquals("leadCodePoints", expectedLeadCodePoints, actualLeadCodePoints);
// Add duplicated entry
map.put("Sunday", FOO);
// Add duplicated entry with different casing