ICU-97 Faster performance for parsing

X-SVN-Rev: 2769
This commit is contained in:
George Rhoten 2000-10-24 16:09:39 +00:00
parent 2277b8fb2d
commit c4183c1638
7 changed files with 220 additions and 102 deletions

View file

@ -38,10 +38,12 @@
const char DigitList::kZero = '0';
const char DigitList::LONG_MIN_REP[] = "2147483648"; // ignore negative sign
/* Only for 32 bit numbers. Ignore the negative sign. */
static const char LONG_MIN_REP[] = "2147483648";
/* Ignore the NULL at the end */
int32_t DigitList::LONG_MIN_REP_LENGTH = sizeof(DigitList::LONG_MIN_REP) - 1;
enum {
LONG_MIN_REP_LENGTH = sizeof(LONG_MIN_REP) - 1 //Ignore the NULL at the end
};
// -------------------------------------
// default constructor
@ -62,6 +64,8 @@ DigitList::~DigitList()
/**
 * Copy constructor.
 * Prepares this object's own buffer first (leading '.' in slot 0, fDigits
 * aimed at slot 1) and then lets the assignment operator copy from other.
 */
DigitList::DigitList(const DigitList &other)
{
    fDecimalDigits[0] = '.';
    fDigits = &fDecimalDigits[1];   // digits start right after the '.'
    operator=(other);
}
@ -97,21 +101,62 @@ DigitList::operator==(const DigitList& that) const
/**
* Resets the list to its empty state: writes the leading '.' into
* fDecimalDigits, points fDigits at the character after it, and sets
* fDecimalAt and fCount to 0.
*/
void
DigitList::clear()
{
*fDecimalDigits = '.';
fDigits = fDecimalDigits + 1; // skip the decimal
fDecimalAt = 0;
fCount = 0;
// NOTE(review): the live loop below and the commented-out copy after it are
// the same statement; this text looks like the old and new sides of a diff
// shown together -- confirm which one is current.
for (int32_t i=0; i<MAX_DIGITS; ++i)
fDigits[i] = kZero;
// This isn't needed because fCount = 0;
// for (int32_t i=0; i<MAX_DIGITS; ++i)
// fDigits[i] = kZero;
}
// -------------------------------------
/**
 * Formats a signed 32-bit integer as base 10 text with an explicit sign and
 * NUL terminates it, e.g. 12 -> "+12", -7 -> "-7", 0 -> "+0".
 *
 * The result is only written when it fits completely; a too-small buffer is
 * left untouched instead of receiving a truncated or overlong string.
 *
 * @param number    The number to format.
 * @param outputStr The buffer to write to.
 * @param outputLen The capacity of outputStr in chars, including the
 *                  terminating NUL.
 * @return The number of digits written (the sign and the terminating NUL are
 *         not counted), or 0 if outputStr is null or cannot hold the sign,
 *         all digits and the NUL.
 */
int32_t
DigitList::formatBase10(int32_t number, char *outputStr, int32_t outputLen)
{
    // A 32-bit value never needs more than 10 decimal digits ("2147483648").
    enum { kMaxInt32Digits = 10 };

    char reversed[kMaxInt32Digits];     // least significant digit first
    int32_t digitCount = 0;
    const char sign = (number < 0) ? '-' : '+';

    if (number < 0) {
        // Take the lowest digit while the value is still negative and flip
        // the sign through the division. Negating the value directly would
        // overflow for the most negative 32-bit number.
        reversed[digitCount++] = (char)(-(number % 10) + '0');
        number /= -10;
    }
    while (number != 0) {
        reversed[digitCount++] = (char)(number % 10 + '0');
        number /= 10;
    }
    if (digitCount == 0) {
        reversed[digitCount++] = '0';   // zero is written as "+0"
    }

    // Room is needed for the sign, every digit and the terminating NUL.
    if (outputStr == 0 || outputLen < digitCount + 2) {
        return 0;
    }

    *(outputStr++) = sign;
    for (int32_t i = digitCount - 1; i >= 0; --i) {
        *(outputStr++) = reversed[i];   // most significant digit first
    }
    *outputStr = 0;   // NUL terminate.

    return digitCount;
}
/**
* Currently, getDouble() depends on atof() to do its conversion.
*
* WARNING!!
* This is an extremely costly function. ~2/3 of the conversion time
* This is an extremely costly function. ~1/2 of the conversion time
* can be linked to this function.
*/
double
@ -120,24 +165,25 @@ DigitList::getDouble() const
if (fCount == 0)
return 0.0;
// For the string "." + fDigits + "e" + fDecimalAt.
char buffer[MAX_DIGITS+32];
*buffer = '.';
strncpy(buffer+1, fDigits, fCount);
sprintf(buffer+fCount+1, "e%d", fDecimalAt);
return atof(buffer);
*(fDigits+fCount) = 'e'; // add an e after the digits.
formatBase10(fDecimalAt,
fDigits + fCount + 1, // skip the 'e'
MAX_DEC_DIGITS - fCount - 2); // skip the 'e' and '.'
return atof(fDecimalDigits);
}
// -------------------------------------
/**
* Make sure that fitsIntoLong() is called before calling this function.
*/
int32_t DigitList::getLong()
{
if (fCount == fDecimalAt) {
fDigits[fCount] = 0; // NULL terminate
/* This conversion is bad on 64-bit platforms when we want to
be able to return a 64-bit number [grhoten]
*/
// This conversion is bad on 64-bit platforms when we want to
// be able to return a 64-bit number [grhoten]
return (int32_t)atol(fDigits);
}
else {

View file

@ -91,6 +91,7 @@ public:
/**
* Utility routine to get the value of the digit list
* Make sure that fitsIntoLong() is called before calling this function.
* Returns 0 if zero length.
*/
int32_t getLong(void);
@ -133,6 +134,15 @@ public:
*/
static const char kZero;
private:
// Sizing constants for the digit storage; fDecimalDigits is declared with
// MAX_DEC_DIGITS + 1 chars.
enum {
// Number of digits kept, taken from DBL_DIG.
MAX_DIGITS = DBL_DIG,
// Room set aside for the text of fDecimalAt that follows the 'e'
// (see the sum below).
MAX_EXPONENT = 30,
// "." + fDigits + "e" + fDecimalAt
MAX_DEC_DIGITS = DBL_DIG + 2 + MAX_EXPONENT
};
public:
/**
* These data members are intentionally public and can be set directly.
@ -157,13 +167,16 @@ public:
*/
int32_t fDecimalAt;
int32_t fCount;
private:
enum { MAX_DIGITS = DBL_DIG };
public:
char fDigits[MAX_DIGITS + 1];
char *fDigits;
private:
char fDecimalDigits[MAX_DEC_DIGITS + 1]; // +1 for NULL
// static char LONG_MIN_REP[LONG_DIGITS];
// static const char LONG_MIN_REP[];
// static int32_t LONG_MIN_REP_LENGTH;
/**
* Round the representation to the given number of digits.
* @param maximumDigits The maximum number of digits to be shown.
@ -178,9 +191,15 @@ private:
UBool shouldRoundUp(int32_t maximumDigits);
// static char LONG_MIN_REP[LONG_DIGITS];
static const char LONG_MIN_REP[];
static int32_t LONG_MIN_REP_LENGTH;
/**
* Formats a number into a base 10 string representation with a leading
* '+' or '-' sign, and NUL terminates it.
* @param number The number to format
* @param outputStr The string to output to
* @param outputLen The maximum number of characters to put into outputStr
* (including the terminating NUL).
* @return the number of digits written; the sign and the terminating NUL
* are not counted. Returns 0 when outputLen is too small.
*/
static int32_t formatBase10(int32_t number, char *outputStr, int32_t outputLen);
};
// -------------------------------------

View file

@ -83,7 +83,7 @@ const UChar DecimalFormat::kPatternPadEscape = 0x002A /*'*'*/;
const UChar DecimalFormat::kCurrencySign = 0x00A4;
const UChar DecimalFormat::kQuote = 0x0027 /*'\''*/;
const int8_t DecimalFormat::fgMaxDigit = 9;
//const int8_t DecimalFormat::fgMaxDigit = 9;
const int32_t DecimalFormat::kDoubleIntegerDigits = 309;
const int32_t DecimalFormat::kDoubleFractionDigits = 340;
@ -821,7 +821,8 @@ DecimalFormat::subformat(UnicodeString& result,
} else if (fExponentSignAlwaysShown) {
result += fSymbols->getPlusSign();
}
if (negativeExponent) exponent = -exponent;
if (negativeExponent)
exponent = -exponent;
DecimalFormat* non_const = (DecimalFormat*)this;
non_const->fDigitList->set(exponent);
for (i=fDigitList->fDecimalAt; i<fMinExponentDigits; ++i)
@ -1004,10 +1005,6 @@ DecimalFormat::parse(const UnicodeString& text,
NumberFormat::parse(text, result, status);
}
const int32_t DecimalFormat::fgStatusInfinite = 0;
const int32_t DecimalFormat::fgStatusPositive = 1;
const int32_t DecimalFormat::fgStatusLength = 2;
void
DecimalFormat::parse(const UnicodeString& text,
Formattable& result,
@ -1024,7 +1021,7 @@ DecimalFormat::parse(const UnicodeString& text,
}
// special case NaN
// If the text is composed of the representation of NaN, returns NaN.
// If the text at the parse position is the representation of NaN, skip past it and return NaN.
int32_t nanLen = fSymbols->compareNaN(text, parsePosition.getIndex());
if (nanLen) {
parsePosition.setIndex(parsePosition.getIndex() + nanLen);
@ -1086,6 +1083,12 @@ DecimalFormat::parse(const UnicodeString& text,
result.setDouble(status[fgStatusPositive] ? a : -a);
}
/*
This is an old implementation that was preparing for 64-bit numbers in ICU.
It is very slow, and 64-bit numbers are not ANSI-C compatible. This code
is here if we change our minds.
*/
/**
* Parse the given text into a number. The text is parsed beginning at
* parsePosition, until an unparseable character is seen.
@ -1102,7 +1105,7 @@ UBool DecimalFormat::subparse(const UnicodeString& text, ParsePosition& parsePos
DigitList& digits, UBool* status) const
{
int32_t position = parsePosition.getIndex();
int32_t oldStart = parsePosition.getIndex();
int32_t oldStart = position;
// check for positivePrefix; take longest
UBool gotPositive = text.compare(position,fPositivePrefix.length(),fPositivePrefix,0,
@ -1152,12 +1155,13 @@ UBool DecimalFormat::subparse(const UnicodeString& text, ParsePosition& parsePos
int32_t backup = -1;
UChar ch;
int32_t digit;
int32_t textLength = text.length(); // One less pointer to follow
// We have to track digitCount ourselves, because digits.fCount will
// pin when the maximum allowable digits is reached.
int32_t digitCount = 0;
for (; position < text.length(); ++position)
for (; position < textLength; ++position)
{
ch = text[(UTextOffset)position];
@ -1180,18 +1184,18 @@ UBool DecimalFormat::subparse(const UnicodeString& text, ParsePosition& parsePos
if (digit > 0 && digit <= 9)
{
// Cancel out backup setting (see grouping handler below)
backup = -1;
sawDigit = TRUE;
// output a regular non-zero digit.
++digitCount;
digits.append((char)(digit + '0'));
// Cancel out backup setting (see grouping handler below)
backup = -1;
}
else if (digit == 0)
{
// Cancel out backup setting (see grouping handler below)
backup = -1; // Do this BEFORE continue statement below!!!
backup = -1;
sawDigit = TRUE;
// Check for leading zeros
@ -1221,19 +1225,18 @@ UBool DecimalFormat::subparse(const UnicodeString& text, ParsePosition& parsePos
{
// If we're only parsing integers, or if we ALREADY saw the
// decimal, then don't parse this one.
//if (isParseIntegerOnly() || sawDecimal)
// break;
digits.fDecimalAt = digitCount; // Not digits.fCount!
sawDecimal = TRUE;
}
else if (ch == exponentChar /* && sawDigit correct? */)
else if (ch == exponentChar) // error code is set below if !sawDigit
{
// Parse sign, if present
UBool negExp = FALSE;
int32_t pos = position + 1; // position + exponentSep.length();
DigitList exponentDigits;
if (pos < text.length())
if (pos < textLength)
{
ch = text[(UTextOffset) pos];
if (ch == fSymbols->getPlusSign())
@ -1247,7 +1250,7 @@ UBool DecimalFormat::subparse(const UnicodeString& text, ParsePosition& parsePos
}
}
while (pos < text.length()) {
while (pos < textLength) {
ch = text[(UTextOffset)pos];
digit = ch - zero;
@ -1303,12 +1306,12 @@ UBool DecimalFormat::subparse(const UnicodeString& text, ParsePosition& parsePos
}
// check for positiveSuffix
if (gotPositive)
if (gotPositive && fPositiveSuffix.length() > 0)
{
gotPositive = text.compare(position,fPositiveSuffix.length(),fPositiveSuffix,0,
fPositiveSuffix.length()) == 0;
}
if (gotNegative)
if (gotNegative && fNegativeSuffix.length() > 0)
{
gotNegative = text.compare(position,fNegativeSuffix.length(),fNegativeSuffix,0,
fNegativeSuffix.length()) == 0;
@ -1325,6 +1328,11 @@ UBool DecimalFormat::subparse(const UnicodeString& text, ParsePosition& parsePos
{
gotPositive = FALSE;
}
else
{
gotPositive = TRUE; // Make them equal to each other.
gotNegative = TRUE;
}
}
// fail if neither or both
@ -1348,6 +1356,7 @@ UBool DecimalFormat::subparse(const UnicodeString& text, ParsePosition& parsePos
return TRUE;
}
//------------------------------------------------------------------------------
// Gets the pointer to the localized decimal format symbols
@ -1473,27 +1482,15 @@ int32_t DecimalFormat::getMultiplier() const
//------------------------------------------------------------------------------
// Sets the multiplier of the number pattern.
// Deprecated!! use the other one
void
DecimalFormat::setMultiplier(int32_t newValue)
{
    // A multiplier of 0 makes no sense, but it cannot be refused here:
    // staying compatible with ICU4J rules out setting an error code.
    // Code that reads fMultiplier should therefore ignore it when it is 0.
    // [grhoten]
    this->fMultiplier = newValue;
}
//------------------------------------------------------------------------------
// Sets the multiplier of the number pattern.
void
DecimalFormat::setMultiplier(int32_t newValue, UErrorCode *err)
{
    // Zero is rejected: report it and leave the current multiplier alone.
    if (newValue == 0) {
        *err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    fMultiplier = newValue;
}
/**
* Get the rounding increment.
* @return A positive rounding increment, or 0.0 if rounding
@ -2592,6 +2589,8 @@ DecimalFormat::applyPattern(const UnicodeString& pattern,
if (fRoundingIncrement != NULL) {
*fRoundingIncrement = roundingInc;
} else {
// Todo: fix this after time testing.
// Copy constructor calls operator=.
fRoundingIncrement = new DigitList(roundingInc);
}
fRoundingDouble = fRoundingIncrement->getDouble();

View file

@ -38,10 +38,12 @@
const char DigitList::kZero = '0';
const char DigitList::LONG_MIN_REP[] = "2147483648"; // ignore negative sign
/* Only for 32 bit numbers. Ignore the negative sign. */
static const char LONG_MIN_REP[] = "2147483648";
/* Ignore the NULL at the end */
int32_t DigitList::LONG_MIN_REP_LENGTH = sizeof(DigitList::LONG_MIN_REP) - 1;
enum {
LONG_MIN_REP_LENGTH = sizeof(LONG_MIN_REP) - 1 //Ignore the NULL at the end
};
// -------------------------------------
// default constructor
@ -62,6 +64,8 @@ DigitList::~DigitList()
/**
 * Copy constructor.
 * Prepares this object's own buffer first (leading '.' in slot 0, fDigits
 * aimed at slot 1) and then lets the assignment operator copy from other.
 */
DigitList::DigitList(const DigitList &other)
{
    fDecimalDigits[0] = '.';
    fDigits = &fDecimalDigits[1];   // digits start right after the '.'
    operator=(other);
}
@ -97,21 +101,62 @@ DigitList::operator==(const DigitList& that) const
/**
* Resets the list to its empty state: writes the leading '.' into
* fDecimalDigits, points fDigits at the character after it, and sets
* fDecimalAt and fCount to 0.
*/
void
DigitList::clear()
{
*fDecimalDigits = '.';
fDigits = fDecimalDigits + 1; // skip the decimal
fDecimalAt = 0;
fCount = 0;
// NOTE(review): the live loop below and the commented-out copy after it are
// the same statement; this text looks like the old and new sides of a diff
// shown together -- confirm which one is current.
for (int32_t i=0; i<MAX_DIGITS; ++i)
fDigits[i] = kZero;
// This isn't needed because fCount = 0;
// for (int32_t i=0; i<MAX_DIGITS; ++i)
// fDigits[i] = kZero;
}
// -------------------------------------
/**
 * Formats a signed 32-bit integer as base 10 text with an explicit sign and
 * NUL terminates it, e.g. 12 -> "+12", -7 -> "-7", 0 -> "+0".
 *
 * The result is only written when it fits completely; a too-small buffer is
 * left untouched instead of receiving a truncated or overlong string.
 *
 * @param number    The number to format.
 * @param outputStr The buffer to write to.
 * @param outputLen The capacity of outputStr in chars, including the
 *                  terminating NUL.
 * @return The number of digits written (the sign and the terminating NUL are
 *         not counted), or 0 if outputStr is null or cannot hold the sign,
 *         all digits and the NUL.
 */
int32_t
DigitList::formatBase10(int32_t number, char *outputStr, int32_t outputLen)
{
    // A 32-bit value never needs more than 10 decimal digits ("2147483648").
    enum { kMaxInt32Digits = 10 };

    char reversed[kMaxInt32Digits];     // least significant digit first
    int32_t digitCount = 0;
    const char sign = (number < 0) ? '-' : '+';

    if (number < 0) {
        // Take the lowest digit while the value is still negative and flip
        // the sign through the division. Negating the value directly would
        // overflow for the most negative 32-bit number.
        reversed[digitCount++] = (char)(-(number % 10) + '0');
        number /= -10;
    }
    while (number != 0) {
        reversed[digitCount++] = (char)(number % 10 + '0');
        number /= 10;
    }
    if (digitCount == 0) {
        reversed[digitCount++] = '0';   // zero is written as "+0"
    }

    // Room is needed for the sign, every digit and the terminating NUL.
    if (outputStr == 0 || outputLen < digitCount + 2) {
        return 0;
    }

    *(outputStr++) = sign;
    for (int32_t i = digitCount - 1; i >= 0; --i) {
        *(outputStr++) = reversed[i];   // most significant digit first
    }
    *outputStr = 0;   // NUL terminate.

    return digitCount;
}
/**
* Currently, getDouble() depends on atof() to do its conversion.
*
* WARNING!!
* This is an extremely costly function. ~2/3 of the conversion time
* This is an extremely costly function. ~1/2 of the conversion time
* can be linked to this function.
*/
double
@ -120,24 +165,25 @@ DigitList::getDouble() const
if (fCount == 0)
return 0.0;
// For the string "." + fDigits + "e" + fDecimalAt.
char buffer[MAX_DIGITS+32];
*buffer = '.';
strncpy(buffer+1, fDigits, fCount);
sprintf(buffer+fCount+1, "e%d", fDecimalAt);
return atof(buffer);
*(fDigits+fCount) = 'e'; // add an e after the digits.
formatBase10(fDecimalAt,
fDigits + fCount + 1, // skip the 'e'
MAX_DEC_DIGITS - fCount - 2); // skip the 'e' and '.'
return atof(fDecimalDigits);
}
// -------------------------------------
/**
* Make sure that fitsIntoLong() is called before calling this function.
*/
int32_t DigitList::getLong()
{
if (fCount == fDecimalAt) {
fDigits[fCount] = 0; // NULL terminate
/* This conversion is bad on 64-bit platforms when we want to
be able to return a 64-bit number [grhoten]
*/
// This conversion is bad on 64-bit platforms when we want to
// be able to return a 64-bit number [grhoten]
return (int32_t)atol(fDigits);
}
else {

View file

@ -91,6 +91,7 @@ public:
/**
* Utility routine to get the value of the digit list
* Make sure that fitsIntoLong() is called before calling this function.
* Returns 0 if zero length.
*/
int32_t getLong(void);
@ -133,6 +134,15 @@ public:
*/
static const char kZero;
private:
// Sizing constants for the digit storage; fDecimalDigits is declared with
// MAX_DEC_DIGITS + 1 chars.
enum {
// Number of digits kept, taken from DBL_DIG.
MAX_DIGITS = DBL_DIG,
// Room set aside for the text of fDecimalAt that follows the 'e'
// (see the sum below).
MAX_EXPONENT = 30,
// "." + fDigits + "e" + fDecimalAt
MAX_DEC_DIGITS = DBL_DIG + 2 + MAX_EXPONENT
};
public:
/**
* These data members are intentionally public and can be set directly.
@ -157,13 +167,16 @@ public:
*/
int32_t fDecimalAt;
int32_t fCount;
private:
enum { MAX_DIGITS = DBL_DIG };
public:
char fDigits[MAX_DIGITS + 1];
char *fDigits;
private:
char fDecimalDigits[MAX_DEC_DIGITS + 1]; // +1 for NULL
// static char LONG_MIN_REP[LONG_DIGITS];
// static const char LONG_MIN_REP[];
// static int32_t LONG_MIN_REP_LENGTH;
/**
* Round the representation to the given number of digits.
* @param maximumDigits The maximum number of digits to be shown.
@ -178,9 +191,15 @@ private:
UBool shouldRoundUp(int32_t maximumDigits);
// static char LONG_MIN_REP[LONG_DIGITS];
static const char LONG_MIN_REP[];
static int32_t LONG_MIN_REP_LENGTH;
/**
* Formats a number into a base 10 string representation with a leading
* '+' or '-' sign, and NUL terminates it.
* @param number The number to format
* @param outputStr The string to output to
* @param outputLen The maximum number of characters to put into outputStr
* (including the terminating NUL).
* @return the number of digits written; the sign and the terminating NUL
* are not counted. Returns 0 when outputLen is too small.
*/
static int32_t formatBase10(int32_t number, char *outputStr, int32_t outputLen);
};
// -------------------------------------

View file

@ -476,23 +476,9 @@ public:
* For a permill, set the suffixes to have "\u2031" and the multiplier to be 1000.
*
* Examples: with 100, 1.23 -> "123", and "123" -> 1.23
* @deprecated Use the other setMultiplier, remove after Apr. 12, 2001
*/
virtual void setMultiplier(int32_t newValue);
/**
* Set the multiplier for use in percent, permill, etc.
* For a percentage, set the suffixes to have "%" and the multiplier to be 100.
* (For Arabic, use arabic percent symbol).
* For a permill, set the suffixes to have "\u2031" and the multiplier to be 1000.
*
* Examples: with 100, 1.23 -> "123", and "123" -> 1.23
* @param err Will return U_ILLEGAL_ARGUMENT_ERROR when newValue is 0 or
* some other invalid number.
* @draft
*/
virtual void setMultiplier(int32_t newValue, UErrorCode *err);
/**
* Get the rounding increment.
* @return A positive rounding increment, or 0.0 if rounding
@ -572,7 +558,7 @@ public:
* @see #setPadCharacter
* @see #getPadPosition
* @see #setPadPosition
* @deprecated remove after 2000-dec-31. See UNumberFormatSymbol and unum_getSymbol() for a replacement.
* @deprecated remove after 2000-dec-31. See the other getPadCharacter() function
*/
inline UChar getPadCharacter(void);
@ -600,7 +586,7 @@ public:
* @see #getPadCharacter
* @see #getPadPosition
* @see #setPadPosition
* @deprecated remove after 2000-dec-31. See UNumberFormatSymbol and unum_getSymbol() for a replacement.
* @deprecated remove after 2000-dec-31. See the other setPadCharacter() function
*/
inline void setPadCharacter(UChar padChar);
@ -992,9 +978,11 @@ private:
UBool isNegative,
UBool isInteger) const;
static const int32_t fgStatusInfinite;
static const int32_t fgStatusPositive;
static const int32_t fgStatusLength;
// Index names for the status flag array used while parsing
// (parse() reads status[fgStatusPositive]).
// NOTE(review): because the identifier follows the closing brace,
// StatusFlags is declared as a member variable of this unnamed enum type,
// not as a type name -- confirm that is intended.
enum {
fgStatusInfinite,
fgStatusPositive,
fgStatusLength // Leave last in list.
} StatusFlags;
/**
* Parse the given text into a number. The text is parsed beginning at
@ -1038,7 +1026,7 @@ private:
/**
* Constants.
*/
static const int8_t fgMaxDigit; // The largest digit, in this case 9
//static const int8_t fgMaxDigit; // The largest digit, in this case 9
/*transient*/ DigitList* fDigitList;
@ -1055,7 +1043,7 @@ private:
int32_t fGroupingSize2;
UBool fDecimalSeparatorAlwaysShown;
/*transient*/ UBool fIsCurrencyFormat;
DecimalFormatSymbols* fSymbols;
/* @deprecated */ DecimalFormatSymbols* fSymbols;
UBool fUseExponentialNotation;
int8_t fMinExponentDigits;

View file

@ -225,9 +225,10 @@ unum_parseDouble( const UNumberFormat* fmt,
int32_t *parsePos /* 0 = start */,
UErrorCode *status)
{
if(U_FAILURE(*status)) return 0;
if(U_FAILURE(*status))
return 0;
int32_t len = (textLength == -1 ? u_strlen(text) : textLength);
int32_t len = (textLength < 0 ? u_strlen(text) : textLength);
const UnicodeString src((UChar*)text, len, len);
ParsePosition pp;
Formattable res;