ICU-13574 Adding currency names matcher to ICU4C.

X-SVN-Rev: 40889
This commit is contained in:
Shane Carr 2018-02-10 02:59:49 +00:00
parent e91ff603de
commit 852897ba2c
11 changed files with 255 additions and 41 deletions

View file

@ -16,6 +16,8 @@
#include "unicode/ures.h"
#include "unicode/ustring.h"
#include "unicode/parsepos.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "ustr_imp.h"
#include "charstr.h"
#include "cmemory.h"
@ -1287,17 +1289,28 @@ static void
linearSearch(const CurrencyNameStruct* currencyNames,
int32_t begin, int32_t end,
const UChar* text, int32_t textLen,
int32_t *partialMatchLen,
int32_t *maxMatchLen, int32_t* maxMatchIndex) {
// Scans currencyNames[begin..end] (both bounds inclusive) against text.
//
// currencyNames   array of candidate names
// begin, end      first and last index to examine
// text, textLen   the input to match and its length in UChars
// partialMatchLen in/out: raised to the longest prefix of text found to agree
//                 with a candidate; never lowered here
// maxMatchLen     in/out: length of the longest candidate fully contained
//                 at the start of text
// maxMatchIndex   out: index of that longest fully matching candidate
//
// Snapshot of the caller-supplied prefix length; the partial-match loop
// below starts comparing at this offset for every candidate.
// NOTE(review): characters before this offset are not re-compared here —
// presumably the caller has already established that they match; confirm.
int32_t initialPartialMatchLen = *partialMatchLen;
for (int32_t index = begin; index <= end; ++index) {
int32_t len = currencyNames[index].currencyNameLen;
// Full match: strictly longer than the best so far, fits in the text,
// and equal to the first len UChars of text.
if (len > *maxMatchLen && len <= textLen &&
uprv_memcmp(currencyNames[index].currencyName, text, len * sizeof(UChar)) == 0) {
*partialMatchLen = MAX(*partialMatchLen, len);
*maxMatchIndex = index;
*maxMatchLen = len;
#ifdef UCURR_DEBUG
printf("maxMatchIndex = %d, maxMatchLen = %d\n",
*maxMatchIndex, *maxMatchLen);
#endif
} else {
// Check for partial matches.
// Walks forward until the first differing UChar or until the shorter of
// the candidate and the text is exhausted, extending the prefix length.
for (int32_t i=initialPartialMatchLen; i<MIN(len, textLen); i++) {
if (currencyNames[index].currencyName[i] != text[i]) {
break;
}
*partialMatchLen = MAX(*partialMatchLen, i + 1);
}
}
}
}
@ -1314,7 +1327,8 @@ linearSearch(const CurrencyNameStruct* currencyNames,
static void
searchCurrencyName(const CurrencyNameStruct* currencyNames,
int32_t total_currency_count,
const UChar* text, int32_t textLen,
const UChar* text, int32_t textLen,
int32_t *partialMatchLen,
int32_t* maxMatchLen, int32_t* maxMatchIndex) {
*maxMatchIndex = -1;
*maxMatchLen = 0;
@ -1344,6 +1358,7 @@ searchCurrencyName(const CurrencyNameStruct* currencyNames,
if (binarySearchBegin == -1) { // did not find the range
break;
}
*partialMatchLen = MAX(*partialMatchLen, index + 1);
if (matchIndex != -1) {
// find an exact match for text from text[0] to text[index]
// in currencyNames array.
@ -1354,6 +1369,7 @@ searchCurrencyName(const CurrencyNameStruct* currencyNames,
// linear search if within threshold.
linearSearch(currencyNames, binarySearchBegin, binarySearchEnd,
text, textLen,
partialMatchLen,
maxMatchLen, maxMatchIndex);
break;
}
@ -1422,19 +1438,13 @@ currency_cache_cleanup(void) {
}
U_CAPI void
uprv_parseCurrency(const char* locale,
const icu::UnicodeString& text,
icu::ParsePosition& pos,
int8_t type,
UChar* result,
UErrorCode& ec)
{
U_NAMESPACE_USE
if (U_FAILURE(ec)) {
return;
}
/**
* Loads the currency name data from the cache, or from resource bundles if necessary.
* The refCount is automatically incremented. It is the caller's responsibility
* to decrement it when done!
*/
static CurrencyNameCacheEntry*
getCacheEntry(const char* locale, UErrorCode& ec) {
int32_t total_currency_name_count = 0;
CurrencyNameStruct* currencyNames = NULL;
@ -1455,17 +1465,13 @@ uprv_parseCurrency(const char* locale,
}
if (found != -1) {
cacheEntry = currCache[found];
currencyNames = cacheEntry->currencyNames;
total_currency_name_count = cacheEntry->totalCurrencyNameCount;
currencySymbols = cacheEntry->currencySymbols;
total_currency_symbol_count = cacheEntry->totalCurrencySymbolCount;
++(cacheEntry->refCount);
}
umtx_unlock(&gCurrencyCacheMutex);
if (found == -1) {
collectCurrencyNames(locale, &currencyNames, &total_currency_name_count, &currencySymbols, &total_currency_symbol_count, ec);
if (U_FAILURE(ec)) {
return;
return NULL;
}
umtx_lock(&gCurrencyCacheMutex);
// check again.
@ -1505,15 +1511,45 @@ uprv_parseCurrency(const char* locale,
deleteCurrencyNames(currencyNames, total_currency_name_count);
deleteCurrencyNames(currencySymbols, total_currency_symbol_count);
cacheEntry = currCache[found];
currencyNames = cacheEntry->currencyNames;
total_currency_name_count = cacheEntry->totalCurrencyNameCount;
currencySymbols = cacheEntry->currencySymbols;
total_currency_symbol_count = cacheEntry->totalCurrencySymbolCount;
++(cacheEntry->refCount);
}
umtx_unlock(&gCurrencyCacheMutex);
}
return cacheEntry;
}
/**
 * Gives back one reference to a cache entry.
 * While holding gCurrencyCacheMutex, the entry's refCount is decremented and,
 * if it has dropped to zero, the entry is destroyed via deleteCacheEntry().
 */
static void releaseCacheEntry(CurrencyNameCacheEntry* cacheEntry) {
    umtx_lock(&gCurrencyCacheMutex);
    const bool wasLastReference = (--(cacheEntry->refCount) == 0);
    if (wasLastReference) {
        // Nobody references the entry any more: remove it.
        deleteCacheEntry(cacheEntry);
    }
    umtx_unlock(&gCurrencyCacheMutex);
}
U_CAPI void
uprv_parseCurrency(const char* locale,
const icu::UnicodeString& text,
icu::ParsePosition& pos,
int8_t type,
int32_t* partialMatchLen,
UChar* result,
UErrorCode& ec) {
U_NAMESPACE_USE
if (U_FAILURE(ec)) {
return;
}
CurrencyNameCacheEntry* cacheEntry = getCacheEntry(locale, ec);
if (U_FAILURE(ec)) {
return;
}
int32_t total_currency_name_count = cacheEntry->totalCurrencyNameCount;
CurrencyNameStruct* currencyNames = cacheEntry->currencyNames;
int32_t total_currency_symbol_count = cacheEntry->totalCurrencySymbolCount;
CurrencyNameStruct* currencySymbols = cacheEntry->currencySymbols;
int32_t start = pos.getIndex();
UChar inputText[MAX_CURRENCY_NAME_LEN];
@ -1523,11 +1559,14 @@ uprv_parseCurrency(const char* locale,
UErrorCode ec1 = U_ZERO_ERROR;
textLen = u_strToUpper(upperText, MAX_CURRENCY_NAME_LEN, inputText, textLen, locale, &ec1);
// Make sure partialMatchLen is initialized
*partialMatchLen = 0;
int32_t max = 0;
int32_t matchIndex = -1;
// case-insensitive comparison against currency names
searchCurrencyName(currencyNames, total_currency_name_count,
upperText, textLen, &max, &matchIndex);
upperText, textLen, partialMatchLen, &max, &matchIndex);
#ifdef UCURR_DEBUG
printf("search in names, max = %d, matchIndex = %d\n", max, matchIndex);
@ -1538,7 +1577,8 @@ uprv_parseCurrency(const char* locale,
if (type != UCURR_LONG_NAME) { // not name only
// case sensitive comparison against currency symbols and ISO code.
searchCurrencyName(currencySymbols, total_currency_symbol_count,
inputText, textLen,
inputText, textLen,
partialMatchLen,
&maxInSymbol, &matchIndexInSymbol);
}
@ -1555,15 +1595,38 @@ uprv_parseCurrency(const char* locale,
} else if (maxInSymbol >= max && matchIndexInSymbol != -1) {
u_charsToUChars(currencySymbols[matchIndexInSymbol].IsoCode, result, 4);
pos.setIndex(start + maxInSymbol);
}
}
// decrease reference count
umtx_lock(&gCurrencyCacheMutex);
--(cacheEntry->refCount);
if (cacheEntry->refCount == 0) { // remove
deleteCacheEntry(cacheEntry);
releaseCacheEntry(cacheEntry);
}
/**
* Adds the first code point of every currency symbol and every currency name
* held in the locale's cache entry to the given set.
*
* @param locale locale whose currency name data is obtained through getCacheEntry()
* @param result set that receives the lead code points; this function only adds to it
* @param ec     error code; the function returns without touching result if it
*               already indicates failure, or if getCacheEntry() reports a failure
*/
void uprv_currencyLeads(const char* locale, icu::UnicodeSet& result, UErrorCode& ec) {
U_NAMESPACE_USE
if (U_FAILURE(ec)) {
return;
}
umtx_unlock(&gCurrencyCacheMutex);
CurrencyNameCacheEntry* cacheEntry = getCacheEntry(locale, ec);
if (U_FAILURE(ec)) {
return;
}
// Lead code point of each currency symbol.
// NOTE(review): U16_GET reads the code unit at index 0, so this assumes
// currencyNameLen > 0 for every entry — confirm.
for (int32_t i=0; i<cacheEntry->totalCurrencySymbolCount; i++) {
const CurrencyNameStruct& info = cacheEntry->currencySymbols[i];
UChar32 cp;
U16_GET(info.currencyName, 0, 0, info.currencyNameLen, cp);
result.add(cp);
}
// Lead code point of each currency display name.
for (int32_t i=0; i<cacheEntry->totalCurrencyNameCount; i++) {
const CurrencyNameStruct& info = cacheEntry->currencyNames[i];
UChar32 cp;
U16_GET(info.currencyName, 0, 0, info.currencyNameLen, cp);
result.add(cp);
}
// decrease reference count
releaseCacheEntry(cacheEntry);
}

View file

@ -13,6 +13,7 @@
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/parsepos.h"
#include "unicode/uniset.h"
/**
* Internal method. Given a currency ISO code and a locale, return
@ -36,6 +37,8 @@ uprv_getStaticCurrencyName(const UChar* iso, const char* loc,
* match, then the display name is preferred, unless its length
* is less than 3.
*
* The parameters must not be NULL.
*
* @param locale the locale of the display names to match
* @param text the text to parse
* @param pos input-output position; on input, the position within
@ -43,6 +46,8 @@ uprv_getStaticCurrencyName(const UChar* iso, const char* loc,
* on output, the position after the last matched character. If
* the parse fails, the position is unchanged upon output.
* @param type currency type to parse against, LONG_NAME only or not
* @param partialMatchLen The length of the longest matching prefix;
* this may be nonzero even if no full currency was matched.
* @return the ISO 4217 code, as a string, of the best match, or
* null if there is no match
*
@ -53,9 +58,21 @@ uprv_parseCurrency(const char* locale,
const icu::UnicodeString& text,
icu::ParsePosition& pos,
int8_t type,
int32_t* partialMatchLen,
UChar* result,
UErrorCode& ec);
/**
* Puts all possible first-characters of a currency into the
* specified UnicodeSet.
*
* The first code point of each currency symbol and each currency
* display name known for the locale is added to the set.
*
* @param locale the locale of the display names of interest
* @param result the UnicodeSet to which to add the starting characters
* @param ec input-output error code; if it indicates a failure on input,
* or if loading the currency names fails, nothing is added to result
*/
void uprv_currencyLeads(const char* locale, icu::UnicodeSet& result, UErrorCode& ec);
#endif /* #ifndef _UCURR_IMP_H_ */
//eof

View file

@ -109,7 +109,8 @@ number_integerwidth.o number_longnames.o number_modifiers.o number_notation.o \
number_padding.o number_patternmodifier.o number_patternstring.o \
number_rounding.o number_scientific.o number_stringbuilder.o \
numparse_stringsegment.o numparse_unisets.o numparse_parsednumber.o \
numparse_impl.o numparse_symbols.o numparse_decimal.o numparse_scientific.o
numparse_impl.o numparse_symbols.o numparse_decimal.o numparse_scientific.o \
numparse_currency.o
## Header files to install

View file

@ -2171,10 +2171,11 @@ int32_t DecimalFormat::compareComplexAffix(const UnicodeString& affixPat,
// determine our locale.
const char* loc = fCurrencyPluralInfo->getLocale().getName();
ParsePosition ppos(pos);
int32_t currMatchLen = 0;
UChar curr[4];
UErrorCode ec = U_ZERO_ERROR;
// Delegate parse of display name => ISO code to Currency
uprv_parseCurrency(loc, text, ppos, type, curr, ec);
uprv_parseCurrency(loc, text, ppos, type, &currMatchLen, curr, ec);
// If parse succeeds, populate currency[0]
if (U_SUCCESS(ec) && ppos.getIndex() != pos) {

View file

@ -0,0 +1,67 @@
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include "unicode/utypes.h"
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
#include "numparse_types.h"
#include "numparse_currency.h"
#include "ucurrimp.h"
#include "unicode/errorcode.h"
using namespace icu;
using namespace icu::numparse;
using namespace icu::numparse::impl;
// Builds a matcher for the given locale by storing locale.getName() in fLocaleName.
// status is handed to the CharString constructor (presumably so that a failed copy
// is reported through it — confirm against charstr.h).
CurrencyNamesMatcher::CurrencyNamesMatcher(const Locale& locale, UErrorCode& status)
: fLocaleName(locale.getName(), -1, status) {}
/**
 * Tries to consume a currency symbol, ISO code or display name at the start of segment.
 *
 * @param segment the remaining input; its offset is advanced past the currency on a full match
 * @param result  receives the ISO code in result.currencyCode and the consumed-character count
 * @param status  error code passed through to uprv_parseCurrency()
 * @return true if the longest matching prefix reported by uprv_parseCurrency() spans the
 *         whole segment as it was on entry; false otherwise, or if a currency code was
 *         already present in result
 */
bool CurrencyNamesMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
    // Only one currency per parsed number: bail out if one is already recorded.
    const bool currencyAlreadySet = (result.currencyCode[0] != 0);
    if (currencyAlreadySet) {
        return false;
    }

    // NOTE: This requires a new UnicodeString to be allocated, instead of using the StringSegment.
    // This should be fixed with #13584.
    UnicodeString remainingText = segment.toUnicodeString();

    // UCURR_SYMBOL_NAME checks for both UCURR_SYMBOL_NAME and UCURR_LONG_NAME.
    ParsePosition parsePos(0);
    int32_t longestPrefixLen = 0;
    uprv_parseCurrency(fLocaleName.data(), remainingText, parsePos, UCURR_SYMBOL_NAME,
                       &longestPrefixLen, result.currencyCode, status);

    // Evaluated before the segment offset is adjusted below.
    const bool wholeSegmentIsPrefix = (segment.length() == longestPrefixLen);

    const int32_t consumed = parsePos.getIndex();
    if (consumed != 0 && U_SUCCESS(status)) {
        // Complete match; uprv_parseCurrency() has already written the
        // currency code into the ParsedNumber.
        segment.adjustOffset(consumed);
        result.setCharsConsumed(segment);
    }

    return wholeSegmentIsPrefix;
}
/**
 * Creates a new frozen UnicodeSet holding the lead code points of all currency
 * strings for this matcher's locale, closed over case mappings.
 *
 * The set is allocated with new on every call. The error status produced by
 * uprv_currencyLeads() is kept in a local ErrorCode and is not inspected.
 */
const UnicodeSet* CurrencyNamesMatcher::getLeadCodePoints() const {
    ErrorCode localStatus;
    UnicodeSet* leads = new UnicodeSet();
    uprv_currencyLeads(fLocaleName.data(), *leads, localStatus);
    // Always apply case mapping closure for currencies, then make the set immutable.
    leads->closeOver(USET_ADD_CASE_MAPPINGS).freeze();
    return leads;
}
#endif /* #if !UCONFIG_NO_FORMATTING */

View file

@ -0,0 +1,47 @@
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include "unicode/utypes.h"
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
#ifndef __NUMPARSE_CURRENCY_H__
#define __NUMPARSE_CURRENCY_H__
#include "numparse_types.h"
#include "charstr.h"
U_NAMESPACE_BEGIN namespace numparse {
namespace impl {
/**
* Matches currencies according to all available strings in locale data.
*
* The implementation of this class is different between J and C. See #13584 for a follow-up.
*
* @author sffc
*/
class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory {
public:
CurrencyNamesMatcher() = default; // WARNING: Leaves the object in an unusable state
// Creates a matcher for the currency strings of the given locale.
// Only the locale's name is retained (see fLocaleName).
CurrencyNamesMatcher(const Locale& locale, UErrorCode& status);
// Attempts to consume a currency at the start of segment, storing the
// ISO code in result.currencyCode on a full match.
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
// Returns a newly allocated, frozen set of the code points with which a
// currency string for this locale can start.
const UnicodeSet* getLeadCodePoints() const override;
private:
// We could use Locale instead of CharString here, but
// Locale has a non-trivial default constructor.
CharString fLocaleName;
};
} // namespace impl
} // namespace numparse
U_NAMESPACE_END
#endif //__NUMPARSE_CURRENCY_H__
#endif /* #if !UCONFIG_NO_FORMATTING */

View file

@ -58,7 +58,7 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString&
parser->addMatcher(parser->fLocalMatchers.infinity = {symbols});
parser->addMatcher(parser->fLocalMatchers.padding = {u"@"});
parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper});
// parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
parser->addMatcher(parser->fLocalMatchers.currencyNames = {locale, status});
// parser.addMatcher(new RequireNumberMatcher());
parser->freeze();
@ -91,12 +91,26 @@ void NumberParserImpl::addMatcher(const NumberParseMatcher& matcher) {
fMatchers[fNumMatchers] = &matcher;
if (fComputeLeads) {
fLeads[fNumMatchers] = matcher.getLeadCodePoints();
addLeadCodePointsForMatcher(matcher);
}
fNumMatchers++;
}
void NumberParserImpl::addLeadCodePointsForMatcher(const NumberParseMatcher& matcher) {
const UnicodeSet* leadCodePoints = matcher.getLeadCodePoints();
// TODO: Avoid the clone operation here.
if (0 != (fParseFlags & PARSE_FLAG_IGNORE_CASE)) {
UnicodeSet* copy = static_cast<UnicodeSet*>(leadCodePoints->cloneAsThawed());
delete leadCodePoints;
copy->closeOver(USET_ADD_CASE_MAPPINGS);
copy->freeze();
fLeads[fNumMatchers] = copy;
} else {
fLeads[fNumMatchers] = leadCodePoints;
}
}
// Marks this parser as frozen by setting fFrozen.
// NOTE(review): presumably no matchers may be added after this call — confirm
// against the rest of NumberParserImpl.
void NumberParserImpl::freeze() {
fFrozen = true;
}

View file

@ -12,6 +12,7 @@
#include "numparse_symbols.h"
#include "numparse_scientific.h"
#include "unicode/uniset.h"
#include "numparse_currency.h"
U_NAMESPACE_BEGIN namespace numparse {
namespace impl {
@ -43,7 +44,7 @@ class NumberParserImpl {
bool fComputeLeads;
bool fFrozen = false;
// WARNING: All of these matchers start in an uninitialized state.
// WARNING: All of these matchers start in an undefined state (default-constructed).
// You must use an assignment operator on them before using.
struct {
IgnorablesMatcher ignorables;
@ -56,10 +57,13 @@ class NumberParserImpl {
PlusSignMatcher plusSign;
DecimalMatcher decimal;
ScientificMatcher scientific;
CurrencyNamesMatcher currencyNames;
} fLocalMatchers;
NumberParserImpl(parse_flags_t parseFlags, bool computeLeads);
void addLeadCodePointsForMatcher(const NumberParseMatcher& matcher);
void parseGreedyRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const;
void parseLongestRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const;

View file

@ -23,7 +23,7 @@ void ParsedNumber::clear() {
flags = 0;
prefix.setToBogus();
suffix.setToBogus();
currencyCode.setToBogus();
currencyCode[0] = 0;
}
void ParsedNumber::setCharsConsumed(const StringSegment& segment) {

View file

@ -93,7 +93,7 @@ class ParsedNumber {
/**
* The currency that got consumed.
*/
UnicodeString currencyCode;
UChar currencyCode[4];
ParsedNumber();

View file

@ -67,8 +67,8 @@ void NumberParserTest::testBasic() {
{3, u"@@@123@@ ", u"0", 6, 123.}, // TODO: Should padding be strong instead of weak?
// {3, u"a51423US dollars", u"a0¤¤¤", 16, 51423.},
// {3, u"a 51423 US dollars", u"a0¤¤¤", 18, 51423.},
// {3, u"514.23 USD", u"¤0", 10, 514.23},
// {3, u"514.23 GBP", u"¤0", 10, 514.23},
{3, u"514.23 USD", u"¤0", 10, 514.23},
{3, u"514.23 GBP", u"¤0", 10, 514.23},
// {3, u"a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 14, 51423.},
// {3, u"-a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.},
// {3, u"a -𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.},
@ -88,7 +88,7 @@ void NumberParserTest::testBasic() {
{3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.},
{3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142},
{3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142},
// {7, u"5,142.50 Canadian dollars", u"#,##,##0 ¤¤¤", 25, 5142.5},
{7, u"5,142.50 Canadian dollars", u"#,##,##0 ¤¤¤", 25, 5142.5},
// {3, u"a$ b5", u"a ¤ b0", 5, 5.0},
// {3, u"📺1.23", u"📺0;📻0", 6, 1.23},
// {3, u"📻1.23", u"📺0;📻0", 6, -1.23},