diff --git a/icu4c/readme.html b/icu4c/readme.html index d238cd49edd..ccc6f08a7a8 100644 --- a/icu4c/readme.html +++ b/icu4c/readme.html @@ -3,8 +3,7 @@ - - ReadMe for ICU 4.7.1 (4.8M1) + ReadMe for ICU 4.8 ICU download page.

+

MessageFormat Changes

+

MessageFormat and related classes (choice/plural/select) have been reimplemented, + with several improvements and some incompatible changes. + See the ICU 4.8 download page for details.

+

How To Download the Source Code

diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index 4391e608443..2fb2365383b 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -84,7 +84,7 @@ ucnv.o ucnv_bld.o ucnv_cnv.o ucnv_io.o ucnv_cb.o ucnv_err.o ucnvlat1.o \ ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \ ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_set.o ucnv_ct.o \ uresbund.o ures_cnv.o uresdata.o resbund.o resbund_cnv.o \ -ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o loclikely.o locresdata.o \ +messagepattern.o ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o loclikely.o locresdata.o \ bytestream.o stringpiece.o \ stringtriebuilder.o bytestriebuilder.o \ bytestrie.o bytestrieiterator.o \ @@ -93,7 +93,7 @@ appendable.o ustr_cnv.o unistr_cnv.o unistr.o unistr_case.o unistr_props.o \ utf_impl.o ustring.o ustrcase.o ucasemap.o cstring.o ustrfmt.o ustrtrns.o ustr_wcs.o utext.o \ normalizer2impl.o normalizer2.o filterednormalizer2.o normlzr.o unorm.o unormcmp.o unorm_it.o \ chariter.o schriter.o uchriter.o uiter.o \ -uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \ +patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \ uscript.o usc_impl.o unames.o \ utrie.o utrie2.o utrie2_builder.o bmpset.o unisetspan.o uset_props.o uniset_props.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \ uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o triedict.o \ diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj index 3ef2ed1d4bb..ddc00c799dd 100644 --- a/icu4c/source/common/common.vcxproj +++ b/icu4c/source/common/common.vcxproj @@ -374,6 +374,7 @@ + @@ -408,6 +409,7 @@ + @@ -1193,6 +1195,8 @@ + + @@ -1432,6 +1436,20 @@ + + copy "%(FullPath)" ..\..\include\unicode + + 
..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode + + ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode + + ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode + + ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + copy "%(FullPath)" ..\..\include\unicode diff --git a/icu4c/source/common/messageimpl.h b/icu4c/source/common/messageimpl.h new file mode 100644 index 00000000000..9af400cd27d --- /dev/null +++ b/icu4c/source/common/messageimpl.h @@ -0,0 +1,63 @@ +/* +******************************************************************************* +* Copyright (C) 2011, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: messageimpl.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011apr04 +* created by: Markus W. Scherer +*/ + +#ifndef __MESSAGEIMPL_H__ +#define __MESSAGEIMPL_H__ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_FORMATTING + +#include "unicode/messagepattern.h" + +U_NAMESPACE_BEGIN + +/** + * Helper functions for use of MessagePattern. + * In Java, these are package-private methods in MessagePattern itself. + * In C++, they are declared here and implemented in messagepattern.cpp. + */ +class U_COMMON_API MessageImpl { +public: + /** + * @return TRUE if getApostropheMode()==UMSGPAT_APOS_DOUBLE_REQUIRED + */ + static UBool jdkAposMode(const MessagePattern &msgPattern) { + return msgPattern.getApostropheMode()==UMSGPAT_APOS_DOUBLE_REQUIRED; + } + + /** + * Appends the s[start, limit[ substring to sb, but with only half of the apostrophes + * according to JDK pattern behavior. 
+ */ + static void appendReducedApostrophes(const UnicodeString &s, int32_t start, int32_t limit, + UnicodeString &sb); + + /** + * Appends the sub-message to the result string. + * Omits SKIP_SYNTAX and appends whole arguments using appendReducedApostrophes(). + */ + static UnicodeString &appendSubMessageWithoutSkipSyntax(const MessagePattern &msgPattern, + int32_t msgStart, + UnicodeString &result); + +private: + MessageImpl(); // no constructor: all static methods +}; + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_FORMATTING + +#endif // __MESSAGEIMPL_H__ diff --git a/icu4c/source/common/messagepattern.cpp b/icu4c/source/common/messagepattern.cpp new file mode 100644 index 00000000000..6e94fcba2b8 --- /dev/null +++ b/icu4c/source/common/messagepattern.cpp @@ -0,0 +1,1208 @@ +/* +******************************************************************************* +* Copyright (C) 2011, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: messagepattern.cpp +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011mar14 +* created by: Markus W. 
Scherer +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_FORMATTING + +#include "unicode/messagepattern.h" +#include "unicode/unistr.h" +#include "cmemory.h" +#include "cstring.h" +#include "messageimpl.h" +#include "patternprops.h" +#include "putilimp.h" +#include "uassert.h" + +U_NAMESPACE_BEGIN + +// Unicode character/code point constants ---------------------------------- *** + +static const UChar u_pound=0x23; +static const UChar u_apos=0x27; +static const UChar u_plus=0x2B; +static const UChar u_comma=0x2C; +static const UChar u_minus=0x2D; +static const UChar u_dot=0x2E; +static const UChar u_colon=0x3A; +static const UChar u_lessThan=0x3C; +static const UChar u_equal=0x3D; +static const UChar u_A=0x41; +static const UChar u_C=0x43; +static const UChar u_E=0x45; +static const UChar u_H=0x48; +static const UChar u_I=0x49; +static const UChar u_L=0x4C; +static const UChar u_O=0x4F; +static const UChar u_P=0x50; +static const UChar u_R=0x52; +static const UChar u_S=0x53; +static const UChar u_T=0x54; +static const UChar u_U=0x55; +static const UChar u_Z=0x5A; +static const UChar u_a=0x61; +static const UChar u_c=0x63; +static const UChar u_e=0x65; +static const UChar u_f=0x66; +static const UChar u_h=0x68; +static const UChar u_i=0x69; +static const UChar u_l=0x6C; +static const UChar u_o=0x6F; +static const UChar u_p=0x70; +static const UChar u_r=0x72; +static const UChar u_s=0x73; +static const UChar u_t=0x74; +static const UChar u_u=0x75; +static const UChar u_z=0x7A; +static const UChar u_leftCurlyBrace=0x7B; +static const UChar u_pipe=0x7C; +static const UChar u_rightCurlyBrace=0x7D; +static const UChar u_lessOrEqual=0x2264; // U+2264 is <= + +static const UChar kOffsetColon[]={ // "offset:" + u_o, u_f, u_f, u_s, u_e, u_t, u_colon +}; + +static const UChar kOther[]={ // "other" + u_o, u_t, u_h, u_e, u_r +}; + +// MessagePatternList ------------------------------------------------------ *** + +template +class MessagePatternList { +public: + 
MessagePatternList() {} + void copyFrom(const MessagePatternList &other, + int32_t length, + UErrorCode &errorCode); + UBool ensureCapacityForOneMore(int32_t oldLength, UErrorCode &errorCode); + UBool memEquals(const MessagePatternList &other, int32_t length) const { + return 0==uprv_memcmp(a.getAlias(), other.a.getAlias(), length*sizeof(T)); + } + + MaybeStackArray a; +}; + +template +void +MessagePatternList::copyFrom( + const MessagePatternList &other, + int32_t length, + UErrorCode &errorCode) { + if(U_SUCCESS(errorCode) && length>0) { + if(length>a.getCapacity() && NULL==a.resize(length)) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_memcpy(a.getAlias(), other.a.getAlias(), length*sizeof(T)); + } +} + +template +UBool +MessagePatternList::ensureCapacityForOneMore(int32_t oldLength, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return FALSE; + } + if(a.getCapacity()>oldLength || a.resize(2*oldLength, oldLength)!=NULL) { + return TRUE; + } + errorCode=U_MEMORY_ALLOCATION_ERROR; + return FALSE; +} + +// MessagePatternList specializations -------------------------------------- *** + +class MessagePatternDoubleList : public MessagePatternList { +}; + +class MessagePatternPartsList : public MessagePatternList { +}; + +// MessagePattern constructors etc. 
---------------------------------------- *** + +MessagePattern::MessagePattern(UErrorCode &errorCode) + : aposMode(UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE), + partsList(NULL), parts(NULL), partsLength(0), + numericValuesList(NULL), numericValues(NULL), numericValuesLength(0), + hasArgNames(FALSE), hasArgNumbers(FALSE), needsAutoQuoting(FALSE) { + init(errorCode); +} + +MessagePattern::MessagePattern(UMessagePatternApostropheMode mode, UErrorCode &errorCode) + : aposMode(mode), + partsList(NULL), parts(NULL), partsLength(0), + numericValuesList(NULL), numericValues(NULL), numericValuesLength(0), + hasArgNames(FALSE), hasArgNumbers(FALSE), needsAutoQuoting(FALSE) { + init(errorCode); +} + +MessagePattern::MessagePattern(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode) + : aposMode(UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE), + partsList(NULL), parts(NULL), partsLength(0), + numericValuesList(NULL), numericValues(NULL), numericValuesLength(0), + hasArgNames(FALSE), hasArgNumbers(FALSE), needsAutoQuoting(FALSE) { + if(init(errorCode)) { + parse(pattern, parseError, errorCode); + } +} + +UBool +MessagePattern::init(UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return FALSE; + } + partsList=new MessagePatternPartsList(); + if(partsList==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + parts=partsList->a.getAlias(); + return TRUE; +} + +MessagePattern::MessagePattern(const MessagePattern &other) + : aposMode(other.aposMode), msg(other.msg), + partsList(NULL), parts(NULL), partsLength(0), + numericValuesList(NULL), numericValues(NULL), numericValuesLength(0), + hasArgNames(other.hasArgNames), hasArgNumbers(other.hasArgNumbers), + needsAutoQuoting(other.needsAutoQuoting) { + UErrorCode errorCode=U_ZERO_ERROR; + if(!copyStorage(other, errorCode)) { + clear(); + } +} + +MessagePattern & +MessagePattern::operator=(const MessagePattern &other) { + if(this==&other) { + return *this; + } + aposMode=other.aposMode; + 
msg=other.msg; + hasArgNames=other.hasArgNames; + hasArgNumbers=other.hasArgNumbers; + needsAutoQuoting=other.needsAutoQuoting; + UErrorCode errorCode=U_ZERO_ERROR; + if(!copyStorage(other, errorCode)) { + clear(); + } + return *this; +} + +UBool +MessagePattern::copyStorage(const MessagePattern &other, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return FALSE; + } + parts=NULL; + partsLength=0; + numericValues=NULL; + numericValuesLength=0; + if(partsList==NULL) { + partsList=new MessagePatternPartsList(); + if(partsList==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + parts=partsList->a.getAlias(); + } + if(other.partsLength>0) { + partsList->copyFrom(*other.partsList, other.partsLength, errorCode); + if(U_FAILURE(errorCode)) { + return FALSE; + } + parts=partsList->a.getAlias(); + partsLength=other.partsLength; + } + if(other.numericValuesLength>0) { + if(numericValuesList==NULL) { + numericValuesList=new MessagePatternDoubleList(); + if(numericValuesList==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + numericValues=numericValuesList->a.getAlias(); + } + numericValuesList->copyFrom( + *other.numericValuesList, other.numericValuesLength, errorCode); + if(U_FAILURE(errorCode)) { + return FALSE; + } + numericValues=numericValuesList->a.getAlias(); + numericValuesLength=other.numericValuesLength; + } + return TRUE; +} + +MessagePattern::~MessagePattern() { + delete partsList; + delete numericValuesList; +} + +// MessagePattern API ------------------------------------------------------ *** + +MessagePattern & +MessagePattern::parse(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode) { + preParse(pattern, parseError, errorCode); + parseMessage(0, 0, 0, UMSGPAT_ARG_TYPE_NONE, parseError, errorCode); + postParse(); + return *this; +} + +MessagePattern & +MessagePattern::parseChoiceStyle(const UnicodeString &pattern, + UParseError *parseError, UErrorCode &errorCode) { + preParse(pattern, 
parseError, errorCode); + parseChoiceStyle(0, 0, parseError, errorCode); + postParse(); + return *this; +} + +MessagePattern & +MessagePattern::parsePluralStyle(const UnicodeString &pattern, + UParseError *parseError, UErrorCode &errorCode) { + preParse(pattern, parseError, errorCode); + parsePluralOrSelectStyle(UMSGPAT_ARG_TYPE_PLURAL, 0, 0, parseError, errorCode); + postParse(); + return *this; +} + +MessagePattern & +MessagePattern::parseSelectStyle(const UnicodeString &pattern, + UParseError *parseError, UErrorCode &errorCode) { + preParse(pattern, parseError, errorCode); + parsePluralOrSelectStyle(UMSGPAT_ARG_TYPE_SELECT, 0, 0, parseError, errorCode); + postParse(); + return *this; +} + +void +MessagePattern::clear() { + // Mostly the same as preParse(). + msg.remove(); + hasArgNames=hasArgNumbers=FALSE; + needsAutoQuoting=FALSE; + partsLength=0; + numericValuesLength=0; +} + +UBool +MessagePattern::operator==(const MessagePattern &other) const { + if(this==&other) { + return TRUE; + } + return + aposMode==other.aposMode && + msg==other.msg && + // parts.equals(o.parts) + partsLength==other.partsLength && + (partsLength==0 || partsList->memEquals(*other.partsList, partsLength)); + // No need to compare numericValues if msg and parts are the same. +} + +int32_t +MessagePattern::hashCode() const { + int32_t hash=(aposMode*37+msg.hashCode())*37+partsLength; + for(int32_t i=0; i0;) { + const Part &part=getPart(--i); + if(part.getType()==UMSGPAT_PART_TYPE_INSERT_CHAR) { + modified.insert(part.index, part.value); + } + } + return modified; +} + +double +MessagePattern::getNumericValue(const Part &part) const { + UMessagePatternPartType type=part.type; + if(type==UMSGPAT_PART_TYPE_ARG_INT) { + return part.value; + } else if(type==UMSGPAT_PART_TYPE_ARG_DOUBLE) { + return numericValues[part.value]; + } else { + return UMSGPAT_NO_NUMERIC_VALUE; + } +} + +/** + * Returns the "offset:" value of a PluralFormat argument, or 0 if none is specified. 
+ * @param pluralStart the index of the first PluralFormat argument style part. (0..countParts()-1) + * @return the "offset:" value. + * @draft ICU 4.8 + */ +double +MessagePattern::getPluralOffset(int32_t pluralStart) const { + const Part &part=getPart(pluralStart); + if(Part::hasNumericValue(part.type)) { + return getNumericValue(part); + } else { + return 0; + } +} + +// MessagePattern::Part ---------------------------------------------------- *** + +UBool +MessagePattern::Part::operator==(const Part &other) const { + if(this==&other) { + return TRUE; + } + return + type==other.type && + index==other.index && + length==other.length && + value==other.value && + limitPartIndex==other.limitPartIndex; +} + +// MessagePattern parser --------------------------------------------------- *** + +void +MessagePattern::preParse(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return; + } + if(parseError!=NULL) { + parseError->line=0; + parseError->offset=0; + parseError->preContext[0]=0; + parseError->postContext[0]=0; + } + msg=pattern; + hasArgNames=hasArgNumbers=FALSE; + needsAutoQuoting=FALSE; + partsLength=0; + numericValuesLength=0; +} + +void +MessagePattern::postParse() { + if(partsList!=NULL) { + parts=partsList->a.getAlias(); + } + if(numericValuesList!=NULL) { + numericValues=numericValuesList->a.getAlias(); + } +} + +int32_t +MessagePattern::parseMessage(int32_t index, int32_t msgStartLength, + int32_t nestingLevel, UMessagePatternArgType parentType, + UParseError *parseError, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return 0; + } + if(nestingLevel>Part::MAX_VALUE) { + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + int32_t msgStart=partsLength; + addPart(UMSGPAT_PART_TYPE_MSG_START, index, msgStartLength, nestingLevel, errorCode); + index+=msgStartLength; + for(;;) { // while(index=msg.length()) { + break; + } + UChar c=msg.charAt(index++); + if(c==u_apos) { + 
if(index==msg.length()) { + // The apostrophe is the last character in the pattern. + // Add a Part for auto-quoting. + addPart(UMSGPAT_PART_TYPE_INSERT_CHAR, index, 0, + u_apos, errorCode); // value=char to be inserted + needsAutoQuoting=TRUE; + } else { + c=msg.charAt(index); + if(c==u_apos) { + // double apostrophe, skip the second one + addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, index++, 1, 0, errorCode); + } else if( + aposMode==UMSGPAT_APOS_DOUBLE_REQUIRED || + c==u_leftCurlyBrace || c==u_rightCurlyBrace || + (parentType==UMSGPAT_ARG_TYPE_CHOICE && c==u_pipe) || + (parentType==UMSGPAT_ARG_TYPE_PLURAL && c==u_pound) + ) { + // skip the quote-starting apostrophe + addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, index-1, 1, 0, errorCode); + // find the end of the quoted literal text + for(;;) { + index=msg.indexOf(u_apos, index+1); + if(index>=0) { + if(/*(index+1)0 && c==u_rightCurlyBrace) || + (parentType==UMSGPAT_ARG_TYPE_CHOICE && c==u_pipe)) { + // Finish the message before the terminator. + // In a choice style, report the "}" substring only for the following ARG_LIMIT, + // not for this MSG_LIMIT. + int32_t limitLength=(parentType==UMSGPAT_ARG_TYPE_CHOICE && c==u_rightCurlyBrace) ? 0 : 1; + addLimitPart(msgStart, UMSGPAT_PART_TYPE_MSG_LIMIT, index-1, limitLength, + nestingLevel, errorCode); + if(parentType==UMSGPAT_ARG_TYPE_CHOICE) { + // Let the choice style parser see the '}' or '|'. + return index-1; + } else { + // continue parsing after the '}' + return index; + } + } // else: c is part of literal text + } + if(nestingLevel>0 && !inTopLevelChoiceMessage(nestingLevel, parentType)) { + setParseError(parseError, 0); // Unmatched '{' braces in message. 
+ errorCode=U_UNMATCHED_BRACES; + return 0; + } + addLimitPart(msgStart, UMSGPAT_PART_TYPE_MSG_LIMIT, index, 0, nestingLevel, errorCode); + return index; +} + +int32_t +MessagePattern::parseArg(int32_t index, int32_t argStartLength, int32_t nestingLevel, + UParseError *parseError, UErrorCode &errorCode) { + int32_t argStart=partsLength; + UMessagePatternArgType argType=UMSGPAT_ARG_TYPE_NONE; + addPart(UMSGPAT_PART_TYPE_ARG_START, index, argStartLength, argType, errorCode); + if(U_FAILURE(errorCode)) { + return 0; + } + int32_t nameIndex=index=skipWhiteSpace(index+argStartLength); + if(index==msg.length()) { + setParseError(parseError, 0); // Unmatched '{' braces in message. + errorCode=U_UNMATCHED_BRACES; + return 0; + } + // parse argument name or number + index=skipIdentifier(index); + int32_t number=parseArgNumber(nameIndex, index); + if(number>=0) { + int32_t length=index-nameIndex; + if(length>Part::MAX_LENGTH || number>Part::MAX_VALUE) { + setParseError(parseError, nameIndex); // Argument number too large. + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + hasArgNumbers=TRUE; + addPart(UMSGPAT_PART_TYPE_ARG_NUMBER, nameIndex, length, number, errorCode); + } else if(number==UMSGPAT_ARG_NAME_NOT_NUMBER) { + int32_t length=index-nameIndex; + if(length>Part::MAX_LENGTH) { + setParseError(parseError, nameIndex); // Argument name too long. + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + hasArgNames=TRUE; + addPart(UMSGPAT_PART_TYPE_ARG_NAME, nameIndex, length, 0, errorCode); + } else { // number<-1 (ARG_NAME_NOT_VALID) + setParseError(parseError, nameIndex); // Bad argument syntax. + errorCode=U_PATTERN_SYNTAX_ERROR; + return 0; + } + index=skipWhiteSpace(index); + if(index==msg.length()) { + setParseError(parseError, 0); // Unmatched '{' braces in message. 
+ errorCode=U_UNMATCHED_BRACES; + return 0; + } + UChar c=msg.charAt(index); + if(c==u_rightCurlyBrace) { + // all done + } else if(c!=u_comma) { + setParseError(parseError, nameIndex); // Bad argument syntax. + errorCode=U_PATTERN_SYNTAX_ERROR; + return 0; + } else /* ',' */ { + // parse argument type: case-sensitive a-zA-Z + int32_t typeIndex=index=skipWhiteSpace(index+1); + while(indexPart::MAX_LENGTH) { + setParseError(parseError, nameIndex); // Argument type name too long. + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + argType=UMSGPAT_ARG_TYPE_SIMPLE; + if(length==6) { + // case-insensitive comparisons for complex-type names + if(isChoice(typeIndex)) { + argType=UMSGPAT_ARG_TYPE_CHOICE; + } else if(isPlural(typeIndex)) { + argType=UMSGPAT_ARG_TYPE_PLURAL; + } else if(isSelect(typeIndex)) { + argType=UMSGPAT_ARG_TYPE_SELECT; + } + } + // change the ARG_START type from NONE to argType + partsList->a[argStart].value=(int16_t)argType; + if(argType==UMSGPAT_ARG_TYPE_SIMPLE) { + addPart(UMSGPAT_PART_TYPE_ARG_TYPE, typeIndex, length, 0, errorCode); + } + // look for an argument style (pattern) + if(c==u_rightCurlyBrace) { + if(argType!=UMSGPAT_ARG_TYPE_SIMPLE) { + setParseError(parseError, nameIndex); // No style field for complex argument. + errorCode=U_PATTERN_SYNTAX_ERROR; + return 0; + } + } else /* ',' */ { + ++index; + if(argType==UMSGPAT_ARG_TYPE_SIMPLE) { + index=parseSimpleStyle(index, parseError, errorCode); + } else if(argType==UMSGPAT_ARG_TYPE_CHOICE) { + index=parseChoiceStyle(index, nestingLevel, parseError, errorCode); + } else { + index=parsePluralOrSelectStyle(argType, index, nestingLevel, parseError, errorCode); + } + } + } + // Argument parsing stopped on the '}'. 
+ addLimitPart(argStart, UMSGPAT_PART_TYPE_ARG_LIMIT, index, 1, argType, errorCode); + return index+1; +} + +int32_t +MessagePattern::parseSimpleStyle(int32_t index, UParseError *parseError, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return 0; + } + int32_t start=index; + int32_t nestedBraces=0; + while(index0) { + --nestedBraces; + } else { + int32_t length=--index-start; + if(length>Part::MAX_LENGTH) { + setParseError(parseError, start); // Argument style text too long. + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + addPart(UMSGPAT_PART_TYPE_ARG_STYLE, start, length, 0, errorCode); + return index; + } + } // c is part of literal text + } + setParseError(parseError, 0); // Unmatched '{' braces in message. + errorCode=U_UNMATCHED_BRACES; + return 0; +} + +int32_t +MessagePattern::parseChoiceStyle(int32_t index, int32_t nestingLevel, + UParseError *parseError, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return 0; + } + int32_t start=index; + index=skipWhiteSpace(index); + if(index==msg.length() || msg.charAt(index)==u_rightCurlyBrace) { + setParseError(parseError, 0); // Missing choice argument pattern. + errorCode=U_PATTERN_SYNTAX_ERROR; + return 0; + } + for(;;) { + // The choice argument style contains |-separated (number, separator, message) triples. + // Parse the number. + int32_t numberIndex=index; + index=skipDouble(index); + int32_t length=index-numberIndex; + if(length==0) { + setParseError(parseError, start); // Bad choice pattern syntax. + errorCode=U_PATTERN_SYNTAX_ERROR; + return 0; + } + if(length>Part::MAX_LENGTH) { + setParseError(parseError, numberIndex); // Choice number too long. + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + parseDouble(numberIndex, index, TRUE, parseError, errorCode); // adds ARG_INT or ARG_DOUBLE + if(U_FAILURE(errorCode)) { + return 0; + } + // Parse the separator. 
+ index=skipWhiteSpace(index); + if(index==msg.length()) { + setParseError(parseError, start); // Bad choice pattern syntax. + errorCode=U_PATTERN_SYNTAX_ERROR; + return 0; + } + UChar c=msg.charAt(index); + if(!(c==u_pound || c==u_lessThan || c==u_lessOrEqual)) { // U+2264 is <= + setParseError(parseError, start); // Expected choice separator (#<\u2264) instead of c. + errorCode=U_PATTERN_SYNTAX_ERROR; + return 0; + } + addPart(UMSGPAT_PART_TYPE_ARG_SELECTOR, index, 1, 0, errorCode); + // Parse the message fragment. + index=parseMessage(++index, 0, nestingLevel+1, UMSGPAT_ARG_TYPE_CHOICE, parseError, errorCode); + if(U_FAILURE(errorCode)) { + return 0; + } + // parseMessage(..., CHOICE) returns the index of the terminator, or msg.length(). + if(index==msg.length()) { + return index; + } + if(msg.charAt(index)==u_rightCurlyBrace) { + if(!inMessageFormatPattern(nestingLevel)) { + setParseError(parseError, start); // Bad choice pattern syntax. + errorCode=U_PATTERN_SYNTAX_ERROR; + return 0; + } + return index; + } // else the terminator is '|' + index=skipWhiteSpace(index+1); + } +} + +int32_t +MessagePattern::parsePluralOrSelectStyle(UMessagePatternArgType argType, + int32_t index, int32_t nestingLevel, + UParseError *parseError, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return 0; + } + int32_t start=index; + UBool isEmpty=TRUE; + UBool hasOther=FALSE; + for(;;) { + // First, collect the selector looking for a small set of terminators. + // It would be a little faster to consider the syntax of each possible + // token right here, but that makes the code too complicated. + index=skipWhiteSpace(index); + UBool eos=index==msg.length(); + if(eos || msg.charAt(index)==u_rightCurlyBrace) { + if(eos==inMessageFormatPattern(nestingLevel)) { + setParseError(parseError, start); // Bad plural/select pattern syntax. 
+ errorCode=U_PATTERN_SYNTAX_ERROR; + return 0; + } + if(!hasOther) { + setParseError(parseError, 0); // Missing 'other' keyword in plural/select pattern. + errorCode=U_DEFAULT_KEYWORD_MISSING; + return 0; + } + return index; + } + int32_t selectorIndex=index; + if(argType==UMSGPAT_ARG_TYPE_PLURAL && msg.charAt(selectorIndex)==u_equal) { + // explicit-value plural selector: =double + index=skipDouble(index+1); + int32_t length=index-selectorIndex; + if(length==1) { + setParseError(parseError, start); // Bad plural/select pattern syntax. + errorCode=U_PATTERN_SYNTAX_ERROR; + return 0; + } + if(length>Part::MAX_LENGTH) { + setParseError(parseError, selectorIndex); // Argument selector too long. + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + addPart(UMSGPAT_PART_TYPE_ARG_SELECTOR, selectorIndex, length, 0, errorCode); + parseDouble(selectorIndex+1, index, FALSE, + parseError, errorCode); // adds ARG_INT or ARG_DOUBLE + } else { + index=skipIdentifier(index); + int32_t length=index-selectorIndex; + if(length==0) { + setParseError(parseError, start); // Bad plural/select pattern syntax. + errorCode=U_PATTERN_SYNTAX_ERROR; + return 0; + } + // Note: The ':' in "offset:" is just beyond the skipIdentifier() range. + if( argType==UMSGPAT_ARG_TYPE_PLURAL && length==6 && indexPart::MAX_LENGTH) { + setParseError(parseError, valueIndex); // Plural offset value too long. + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + parseDouble(valueIndex, index, FALSE, + parseError, errorCode); // adds ARG_INT or ARG_DOUBLE + if(U_FAILURE(errorCode)) { + return 0; + } + isEmpty=FALSE; + continue; // no message fragment after the offset + } else { + // normal selector word + if(length>Part::MAX_LENGTH) { + setParseError(parseError, selectorIndex); // Argument selector too long. 
+ errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + addPart(UMSGPAT_PART_TYPE_ARG_SELECTOR, selectorIndex, length, 0, errorCode); + if(0==msg.compare(selectorIndex, length, kOther, 0, 5)) { + hasOther=TRUE; + } + } + } + if(U_FAILURE(errorCode)) { + return 0; + } + + // parse the message fragment following the selector + index=skipWhiteSpace(index); + if(index==msg.length() || msg.charAt(index)!=u_leftCurlyBrace) { + setParseError(parseError, selectorIndex); // No message fragment after plural/select selector. + errorCode=U_PATTERN_SYNTAX_ERROR; + return 0; + } + index=parseMessage(index, 1, nestingLevel+1, argType, parseError, errorCode); + if(U_FAILURE(errorCode)) { + return 0; + } + isEmpty=FALSE; + } +} + +int32_t +MessagePattern::parseArgNumber(const UnicodeString &s, int32_t start, int32_t limit) { + // If the identifier contains only ASCII digits, then it is an argument _number_ + // and must not have leading zeros (except "0" itself). + // Otherwise it is an argument _name_. + if(start>=limit) { + return UMSGPAT_ARG_NAME_NOT_VALID; + } + int32_t number; + // Defer numeric errors until we know there are only digits. + UBool badNumber; + UChar c=s.charAt(start++); + if(c==0x30) { + if(start==limit) { + return 0; + } else { + number=0; + badNumber=TRUE; // leading zero + } + } else if(0x31<=c && c<=0x39) { + number=c-0x30; + badNumber=FALSE; + } else { + return UMSGPAT_ARG_NAME_NOT_NUMBER; + } + while(start=INT32_MAX/10) { + badNumber=TRUE; // overflow + } + number=number*10+(c-0x30); + } else { + return UMSGPAT_ARG_NAME_NOT_NUMBER; + } + } + // There are only ASCII digits. 
+ if(badNumber) { + return UMSGPAT_ARG_NAME_NOT_VALID; + } else { + return number; + } +} + +void +MessagePattern::parseDouble(int32_t start, int32_t limit, UBool allowInfinity, + UParseError *parseError, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return; + } + U_ASSERT(start(Part::MAX_VALUE+isNegative)) { + break; // not a small-enough integer + } + if(index==limit) { + addPart(UMSGPAT_PART_TYPE_ARG_INT, start, limit-start, + isNegative!=0 ? -value : value, errorCode); + return; + } + c=msg.charAt(index++); + } + // Let Double.parseDouble() throw a NumberFormatException. + char numberChars[128]; + int32_t capacity=(int32_t)sizeof(numberChars); + int32_t length=limit-start; + if(length>=capacity) { + break; // number too long + } + msg.extract(start, length, numberChars, capacity, US_INV); + if((int32_t)uprv_strlen(numberChars)0x39 && c!=u_e && c!=u_E && c!=0x221e)) { + break; + } + ++index; + } + return index; +} + +UBool +MessagePattern::isArgTypeChar(UChar32 c) { + return (u_a<=c && c<=u_z) || (u_A<=c && c<=u_Z); +} + +UBool +MessagePattern::isChoice(int32_t index) { + UChar c; + return + ((c=msg.charAt(index++))==u_c || c==u_C) && + ((c=msg.charAt(index++))==u_h || c==u_H) && + ((c=msg.charAt(index++))==u_o || c==u_O) && + ((c=msg.charAt(index++))==u_i || c==u_I) && + ((c=msg.charAt(index++))==u_c || c==u_C) && + ((c=msg.charAt(index))==u_e || c==u_E); +} + +UBool +MessagePattern::isPlural(int32_t index) { + UChar c; + return + ((c=msg.charAt(index++))==u_p || c==u_P) && + ((c=msg.charAt(index++))==u_l || c==u_L) && + ((c=msg.charAt(index++))==u_u || c==u_U) && + ((c=msg.charAt(index++))==u_r || c==u_R) && + ((c=msg.charAt(index++))==u_a || c==u_A) && + ((c=msg.charAt(index))==u_l || c==u_L); +} + +UBool +MessagePattern::isSelect(int32_t index) { + UChar c; + return + ((c=msg.charAt(index++))==u_s || c==u_S) && + ((c=msg.charAt(index++))==u_e || c==u_E) && + ((c=msg.charAt(index++))==u_l || c==u_L) && + ((c=msg.charAt(index++))==u_e || c==u_E) && + 
((c=msg.charAt(index++))==u_c || c==u_C) && + ((c=msg.charAt(index))==u_t || c==u_T); +} + +UBool +MessagePattern::inMessageFormatPattern(int32_t nestingLevel) { + return nestingLevel>0 || partsList->a[0].type==UMSGPAT_PART_TYPE_MSG_START; +} + +UBool +MessagePattern::inTopLevelChoiceMessage(int32_t nestingLevel, UMessagePatternArgType parentType) { + return + nestingLevel==1 && + parentType==UMSGPAT_ARG_TYPE_CHOICE && + partsList->a[0].type!=UMSGPAT_PART_TYPE_MSG_START; +} + +void +MessagePattern::addPart(UMessagePatternPartType type, int32_t index, int32_t length, + int32_t value, UErrorCode &errorCode) { + if(partsList->ensureCapacityForOneMore(partsLength, errorCode)) { + Part &part=partsList->a[partsLength++]; + part.type=type; + part.index=index; + part.length=(uint16_t)length; + part.value=(int16_t)value; + part.limitPartIndex=0; + } +} + +void +MessagePattern::addLimitPart(int32_t start, + UMessagePatternPartType type, int32_t index, int32_t length, + int32_t value, UErrorCode &errorCode) { + partsList->a[start].limitPartIndex=partsLength; + addPart(type, index, length, value, errorCode); +} + +void +MessagePattern::addArgDoublePart(double numericValue, int32_t start, int32_t length, + UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return; + } + int32_t numericIndex=numericValuesLength; + if(numericValuesList==NULL) { + numericValuesList=new MessagePatternDoubleList(); + if(numericValuesList==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return; + } + } else if(!numericValuesList->ensureCapacityForOneMore(numericValuesLength, errorCode)) { + return; + } else { + if(numericIndex>Part::MAX_VALUE) { + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return; + } + } + numericValuesList->a[numericValuesLength++]=numericValue; + addPart(UMSGPAT_PART_TYPE_ARG_DOUBLE, start, length, numericIndex, errorCode); +} + +void +MessagePattern::setParseError(UParseError *parseError, int32_t index) { + if(parseError==NULL) { + return; + } + parseError->offset=index; + + // 
Set preContext to some of msg before index. + // Avoid splitting a surrogate pair. + int32_t length=index; + if(length>=U_PARSE_CONTEXT_LEN) { + length=U_PARSE_CONTEXT_LEN-1; + if(length>0 && U16_IS_TRAIL(msg[index-length])) { + --length; + } + } + msg.extract(index-length, length, parseError->preContext); + parseError->preContext[length]=0; + + // Set postContext to some of msg starting at index. + length=msg.length()-index; + if(length>=U_PARSE_CONTEXT_LEN) { + length=U_PARSE_CONTEXT_LEN-1; + if(length>0 && U16_IS_LEAD(msg[index+length-1])) { + --length; + } + } + msg.extract(index, length, parseError->postContext); + parseError->postContext[length]=0; +} + +UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(MessagePattern) + +// MessageImpl ------------------------------------------------------------- *** + +void +MessageImpl::appendReducedApostrophes(const UnicodeString &s, int32_t start, int32_t limit, + UnicodeString &sb) { + int32_t doubleApos=-1; + for(;;) { + int32_t i=s.indexOf(u_apos, start); + if(i<0 || i>=limit) { + sb.append(s, start, limit-start); + break; + } + if(i==doubleApos) { + // Double apostrophe at start-1 and start==i, append one. + sb.append(u_apos); + ++start; + doubleApos=-1; + } else { + // Append text between apostrophes and skip this one. + sb.append(s, start, i-start); + doubleApos=start=i+1; + } + } +} + +// Ported from second half of ICU4J SelectFormat.format(String). 
+UnicodeString & +MessageImpl::appendSubMessageWithoutSkipSyntax(const MessagePattern &msgPattern, + int32_t msgStart, + UnicodeString &result) { + const UnicodeString &msgString=msgPattern.getPatternString(); + int32_t prevIndex=msgPattern.getPart(msgStart).getLimit(); + for(int32_t i=msgStart;;) { + const MessagePattern::Part &part=msgPattern.getPart(++i); + UMessagePatternPartType type=part.getType(); + int32_t index=part.getIndex(); + if(type==UMSGPAT_PART_TYPE_MSG_LIMIT) { + return result.append(msgString, prevIndex, index-prevIndex); + } else if(type==UMSGPAT_PART_TYPE_SKIP_SYNTAX) { + result.append(msgString, prevIndex, index-prevIndex); + prevIndex=part.getLimit(); + } else if(type==UMSGPAT_PART_TYPE_ARG_START) { + result.append(msgString, prevIndex, index-prevIndex); + prevIndex=index; + i=msgPattern.getLimitPartIndex(i); + index=msgPattern.getPart(i).getLimit(); + appendReducedApostrophes(msgString, prevIndex, index, result); + prevIndex=index; + } + } +} + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_FORMATTING diff --git a/icu4c/source/common/patternprops.cpp b/icu4c/source/common/patternprops.cpp new file mode 100644 index 00000000000..b2c52499866 --- /dev/null +++ b/icu4c/source/common/patternprops.cpp @@ -0,0 +1,218 @@ +/* +******************************************************************************* +* Copyright (C) 2011, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: patternprops.cpp +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011mar13 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" +#include "patternprops.h" + +U_NAMESPACE_BEGIN + +/* + * One byte per Latin-1 character. + * Bit 0 is set if either Pattern property is true, + * bit 1 if Pattern_Syntax is true, + * bit 2 if Pattern_White_Space is true. 
+ * That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5. + */ +static const uint8_t latin1[256]={ + // WS: 9..D + 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // WS: 20 Syntax: 21..2F + 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + // Syntax: 3A..40 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Syntax: 5B..5E + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, + // Syntax: 60 + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Syntax: 7B..7E + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, + // WS: 85 + 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Syntax: A1..A7, A9, AB, AC, AE + 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0, + // Syntax: B0, B1, B6, BB, BF + 3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Syntax: D7 + 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Syntax: F7 + 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* + * One byte per 32 characters from U+2000..U+303F indexing into + * a small table of 32-bit data words. + * The first two data words are all-zeros and all-ones. + */ +static const uint8_t index2000[130]={ + 2, 3, 4, 0, 0, 0, 0, 0, // 20xx + 0, 0, 0, 0, 5, 1, 1, 1, // 21xx + 1, 1, 1, 1, 1, 1, 1, 1, // 22xx + 1, 1, 1, 1, 1, 1, 1, 1, // 23xx + 1, 1, 1, 0, 0, 0, 0, 0, // 24xx + 1, 1, 1, 1, 1, 1, 1, 1, // 25xx + 1, 1, 1, 1, 1, 1, 1, 1, // 26xx + 1, 1, 1, 6, 7, 1, 1, 1, // 27xx + 1, 1, 1, 1, 1, 1, 1, 1, // 28xx + 1, 1, 1, 1, 1, 1, 1, 1, // 29xx + 1, 1, 1, 1, 1, 1, 1, 1, // 2Axx + 1, 1, 1, 1, 1, 1, 1, 1, // 2Bxx + 0, 0, 0, 0, 0, 0, 0, 0, // 2Cxx + 0, 0, 0, 0, 0, 0, 0, 0, // 2Dxx + 1, 1, 1, 1, 0, 0, 0, 0, // 2Exx + 0, 0, 0, 0, 0, 0, 0, 0, // 2Fxx + 8, 9 // 3000..303F +}; + +/* + * One 32-bit integer per 32 characters. 
Ranges of all-false and all-true + * are mapped to the first two values, other ranges map to appropriate bit patterns. + */ +static const uint32_t syntax2000[]={ + 0, + 0xffffffff, + 0xffff0000, // 2: 2010..201F + 0x7fff00ff, // 3: 2020..2027, 2030..203E + 0x7feffffe, // 4: 2041..2053, 2055..205E + 0xffff0000, // 5: 2190..219F + 0x003fffff, // 6: 2760..2775 + 0xfff00000, // 7: 2794..279F + 0xffffff0e, // 8: 3001..3003, 3008..301F + 0x00010001 // 9: 3020, 3030 +}; + +/* + * Same as syntax2000, but with additional bits set for the + * Pattern_White_Space characters 200E 200F 2028 2029. + */ +static const uint32_t syntaxOrWhiteSpace2000[]={ + 0, + 0xffffffff, + 0xffffc000, // 2: 200E..201F + 0x7fff03ff, // 3: 2020..2029, 2030..203E + 0x7feffffe, // 4: 2041..2053, 2055..205E + 0xffff0000, // 5: 2190..219F + 0x003fffff, // 6: 2760..2775 + 0xfff00000, // 7: 2794..279F + 0xffffff0e, // 8: 3001..3003, 3008..301F + 0x00010001 // 9: 3020, 3030 +}; + +UBool +PatternProps::isSyntax(UChar32 c) { + if(c<0) { + return FALSE; + } else if(c<=0xff) { + return (UBool)(latin1[c]>>1)&1; + } else if(c<0x2010) { + return FALSE; + } else if(c<=0x3030) { + uint32_t bits=syntax2000[index2000[(c-0x2000)>>5]]; + return (UBool)((bits>>(c&0x1f))&1); + } else if(0xfd3e<=c && c<=0xfe46) { + return c<=0xfd3f || 0xfe45<=c; + } else { + return FALSE; + } +} + +UBool +PatternProps::isSyntaxOrWhiteSpace(UChar32 c) { + if(c<0) { + return FALSE; + } else if(c<=0xff) { + return (UBool)(latin1[c]&1); + } else if(c<0x200e) { + return FALSE; + } else if(c<=0x3030) { + uint32_t bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]]; + return (UBool)((bits>>(c&0x1f))&1); + } else if(0xfd3e<=c && c<=0xfe46) { + return c<=0xfd3f || 0xfe45<=c; + } else { + return FALSE; + } +} + +UBool +PatternProps::isWhiteSpace(UChar32 c) { + if(c<0) { + return FALSE; + } else if(c<=0xff) { + return (UBool)(latin1[c]>>2)&1; + } else if(0x200e<=c && c<=0x2029) { + return c<=0x200f || 0x2028<=c; + } else { + return FALSE; + } +} 
+ +const UChar * +PatternProps::skipWhiteSpace(const UChar *s, int32_t length) { + while(length>0 && isWhiteSpace(*s)) { + ++s; + --length; + } + return s; +} + +const UChar * +PatternProps::trimWhiteSpace(const UChar *s, int32_t &length) { + if(length<=0 || (!isWhiteSpace(s[0]) && !isWhiteSpace(s[length-1]))) { + return s; + } + int32_t start=0; + int32_t limit=length; + while(start0 && !isSyntaxOrWhiteSpace(*s)) { + ++s; + --length; + } + return s; +} + +U_NAMESPACE_END diff --git a/icu4c/source/common/patternprops.h b/icu4c/source/common/patternprops.h new file mode 100644 index 00000000000..0ceab510a1c --- /dev/null +++ b/icu4c/source/common/patternprops.h @@ -0,0 +1,89 @@ +/* +******************************************************************************* +* Copyright (C) 2011, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: patternprops.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011mar13 +* created by: Markus W. Scherer +*/ + +#ifndef __PATTERNPROPS_H__ +#define __PATTERNPROPS_H__ + +#include "unicode/utypes.h" + +U_NAMESPACE_BEGIN + +/** + * Implements the immutable Unicode properties Pattern_Syntax and Pattern_White_Space. + * Hardcodes these properties, does not load data, does not depend on other ICU classes. + *

+ * Note: Both properties include ASCII as well as non-ASCII, non-Latin-1 code points, + * and both properties only include BMP code points (no supplementary ones). + * Pattern_Syntax includes some unassigned code points. + *

+ * [:Pattern_White_Space:] = + * [\u0009-\u000D\ \u0085\u200E\u200F\u2028\u2029] + *

+ * [:Pattern_Syntax:] = + * [!-/\:-@\[-\^`\{-~\u00A1-\u00A7\u00A9\u00AB\u00AC\u00AE + * \u00B0\u00B1\u00B6\u00BB\u00BF\u00D7\u00F7 + * \u2010-\u2027\u2030-\u203E\u2041-\u2053\u2055-\u205E + * \u2190-\u245F\u2500-\u2775\u2794-\u2BFF\u2E00-\u2E7F + * \u3001-\u3003\u3008-\u3020\u3030\uFD3E\uFD3F\uFE45\uFE46] + * @author mscherer + */ +class U_COMMON_API PatternProps { +public: + /** + * @return TRUE if c is a Pattern_Syntax code point. + */ + static UBool isSyntax(UChar32 c); + + /** + * @return TRUE if c is a Pattern_Syntax or Pattern_White_Space code point. + */ + static UBool isSyntaxOrWhiteSpace(UChar32 c); + + /** + * @return TRUE if c is a Pattern_White_Space character. + */ + static UBool isWhiteSpace(UChar32 c); + + /** + * Skips over Pattern_White_Space starting at s. + * @return The smallest pointer at or after s with a non-white space character. + */ + static const UChar *skipWhiteSpace(const UChar *s, int32_t length); + + /** + * @return s except with leading and trailing Pattern_White_Space removed and length adjusted. + */ + static const UChar *trimWhiteSpace(const UChar *s, int32_t &length); + + /** + * Tests whether the string contains a "pattern identifier", that is, + * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters. + * @return TRUE if there are no Pattern_White_Space or Pattern_Syntax characters in s. + */ + static UBool isIdentifier(const UChar *s, int32_t length); + + /** + * Skips over a "pattern identifier" starting at index s. + * @return The smallest pointer at or after s with + * a Pattern_White_Space or Pattern_Syntax character. 
+ */ + static const UChar *skipIdentifier(const UChar *s, int32_t length); + +private: + PatternProps(); // no constructor: all static methods +}; + +U_NAMESPACE_END + +#endif // __PATTERNPROPS_H__ diff --git a/icu4c/source/common/unicode/messagepattern.h b/icu4c/source/common/unicode/messagepattern.h new file mode 100644 index 00000000000..296e35fef49 --- /dev/null +++ b/icu4c/source/common/unicode/messagepattern.h @@ -0,0 +1,918 @@ +/* +******************************************************************************* +* Copyright (C) 2011, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: messagepattern.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011mar14 +* created by: Markus W. Scherer +*/ + +#ifndef __MESSAGEPATTERN_H__ +#define __MESSAGEPATTERN_H__ + +/** + * \file + * \brief C++ API: MessagePattern class: Parses and represents ICU MessageFormat patterns. + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_FORMATTING + +#include "unicode/parseerr.h" +#include "unicode/unistr.h" + +/** + * Mode for when an apostrophe starts quoted literal text for MessageFormat output. + * The default is DOUBLE_OPTIONAL unless overridden via uconfig.h + * (UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE). + *

+ * A pair of adjacent apostrophes always results in a single apostrophe in the output, + * even when the pair is between two single, text-quoting apostrophes. + *

+ * The following table shows examples of desired MessageFormat.format() output + * with the pattern strings that yield that output. + *

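+ * <table border>
+ *   <tr>
+ *     <th>Desired output</th>
+ *     <th>DOUBLE_OPTIONAL</th>
+ *     <th>DOUBLE_REQUIRED</th>
+ *   </tr>
+ *   <tr>
+ *     <td>I see {many}</td>
+ *     <td>I see '{many}'</td>
+ *     <td>(same)</td>
+ *   </tr>
+ *   <tr>
+ *     <td>I said {'Wow!'}</td>
+ *     <td>I said '{''Wow!''}'</td>
+ *     <td>(same)</td>
+ *   </tr>
+ *   <tr>
+ *     <td>I don't know</td>
+ *     <td>I don't know OR<br> I don''t know</td>
+ *     <td>I don''t know</td>
+ *   </tr>
+ * </table>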
+ * @draft ICU 4.8 + * @see UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE + */ +enum UMessagePatternApostropheMode { + /** + * A literal apostrophe is represented by + * either a single or a double apostrophe pattern character. + * Within a MessageFormat pattern, a single apostrophe only starts quoted literal text + * if it immediately precedes a curly brace {}, + * or a pipe symbol | if inside a choice format, + * or a pound symbol # if inside a plural format. + *

+ * This is the default behavior starting with ICU 4.8. + * @draft ICU 4.8 + */ + UMSGPAT_APOS_DOUBLE_OPTIONAL, + /** + * A literal apostrophe must be represented by + * a double apostrophe pattern character. + * A single apostrophe always starts quoted literal text. + *

+ * This is the behavior of ICU 4.6 and earlier, and of the JDK. + * @draft ICU 4.8 + */ + UMSGPAT_APOS_DOUBLE_REQUIRED +}; +typedef enum UMessagePatternApostropheMode UMessagePatternApostropheMode; + +/** + * MessagePattern::Part type constants. + * @draft ICU 4.8 + */ +enum UMessagePatternPartType { + /** + * Start of a message pattern (main or nested). + * The length is 0 for the top-level message + * and for a choice argument sub-message, otherwise 1 for the '{'. + * The value indicates the nesting level, starting with 0 for the main message. + *

+ * There is always a later MSG_LIMIT part. + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_MSG_START, + /** + * End of a message pattern (main or nested). + * The length is 0 for the top-level message and + * the last sub-message of a choice argument, + * otherwise 1 for the '}' or (in a choice argument style) the '|'. + * The value indicates the nesting level, starting with 0 for the main message. + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_MSG_LIMIT, + /** + * Indicates a substring of the pattern string which is to be skipped when formatting. + * For example, an apostrophe that begins or ends quoted text + * would be indicated with such a part. + * The value is undefined and currently always 0. + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_SKIP_SYNTAX, + /** + * Indicates that a syntax character needs to be inserted for auto-quoting. + * The length is 0. + * The value is the character code of the insertion character. (U+0027=APOSTROPHE) + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_INSERT_CHAR, + /** + * Indicates a syntactic (non-escaped) # symbol in a plural variant. + * When formatting, replace this part's substring with the + * (value-offset) for the plural argument value. + * The value is undefined and currently always 0. + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_REPLACE_NUMBER, + /** + * Start of an argument. + * The length is 1 for the '{'. + * The value is the ordinal value of the ArgType. Use getArgType(). + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_ARG_START, + /** + * End of an argument. + * The length is 1 for the '}'. + * The value is the ordinal value of the ArgType. Use getArgType(). + *

+ * The matching ARG_START part is followed by either an ARG_NUMBER or ARG_NAME, + * followed by optional argument sub-parts (see UMessagePatternArgType constants) + * and finally this ARG_LIMIT part. + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_ARG_LIMIT, + /** + * The argument number, provided by the value. + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_ARG_NUMBER, + /** + * The argument name. + * The value is undefined and currently always 0. + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_ARG_NAME, + /** + * The argument type. + * The value is undefined and currently always 0. + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_ARG_TYPE, + /** + * The argument style text. + * The value is undefined and currently always 0. + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_ARG_STYLE, + /** + * A selector substring in a "complex" argument style. + * The value is undefined and currently always 0. + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_ARG_SELECTOR, + /** + * An integer value, for example the offset or an explicit selector value + * in a PluralFormat style. + * The part value is the integer value. + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_ARG_INT, + /** + * A numeric value, for example the offset or an explicit selector value + * in a PluralFormat style. + * The part value is an index into an internal array of numeric values; + * use getNumericValue(). + * @draft ICU 4.8 + */ + UMSGPAT_PART_TYPE_ARG_DOUBLE +}; +typedef enum UMessagePatternPartType UMessagePatternPartType; + +/** + * Argument type constants. + * Returned by Part.getArgType() for ARG_START and ARG_LIMIT parts. + * + * Messages nested inside an argument are each delimited by MSG_START and MSG_LIMIT, + * with a nesting level one greater than the surrounding message. + * @draft ICU 4.8 + */ +enum UMessagePatternArgType { + /** + * The argument has no specified type. + * @draft ICU 4.8 + */ + UMSGPAT_ARG_TYPE_NONE, + /** + * The argument has a "simple" type which is provided by the ARG_TYPE part. + * An ARG_STYLE part might follow that. 
+ * @draft ICU 4.8 + */ + UMSGPAT_ARG_TYPE_SIMPLE, + /** + * The argument is a ChoiceFormat with one or more + * ((ARG_INT | ARG_DOUBLE), ARG_SELECTOR, message) tuples. + * @draft ICU 4.8 + */ + UMSGPAT_ARG_TYPE_CHOICE, + /** + * The argument is a PluralFormat with an optional ARG_INT or ARG_DOUBLE offset + * (e.g., offset:1) + * and one or more (ARG_SELECTOR [explicit-value] message) tuples. + * If the selector has an explicit value (e.g., =2), then + * that value is provided by the ARG_INT or ARG_DOUBLE part preceding the message. + * Otherwise the message immediately follows the ARG_SELECTOR. + * @draft ICU 4.8 + */ + UMSGPAT_ARG_TYPE_PLURAL, + /** + * The argument is a SelectFormat with one or more (ARG_SELECTOR, message) pairs. + * @draft ICU 4.8 + */ + UMSGPAT_ARG_TYPE_SELECT +}; +typedef enum UMessagePatternArgType UMessagePatternArgType; + +enum { + /** + * Return value from MessagePattern::validateArgumentName() for when + * the string is a valid "pattern identifier" but not a number. + * @draft ICU 4.8 + */ + UMSGPAT_ARG_NAME_NOT_NUMBER=-1, + + /** + * Return value from MessagePattern::validateArgumentName() for when + * the string is invalid. + * It might not be a valid "pattern identifier", + * or it might have only ASCII digits but there is a leading zero or the number is too large. + * @draft ICU 4.8 + */ + UMSGPAT_ARG_NAME_NOT_VALID=-2 +}; + +/** + * Special value that is returned by getNumericValue(Part) when no + * numeric value is defined for a part. + * @see MessagePattern::getNumericValue() + * @draft ICU 4.8 + */ +#define UMSGPAT_NO_NUMERIC_VALUE ((double)(-123456789)) + +U_NAMESPACE_BEGIN + +class MessagePatternDoubleList; +class MessagePatternPartsList; + +/** + * Parses and represents ICU MessageFormat patterns. + * Also handles patterns for ChoiceFormat, PluralFormat and SelectFormat. + * Used in the implementations of those classes as well as in tools + * for message validation, translation and format conversion. + *

+ * The parser handles all syntax relevant for identifying message arguments. + * This includes "complex" arguments whose style strings contain + * nested MessageFormat pattern substrings. + * For "simple" arguments (with no nested MessageFormat pattern substrings), + * the argument style is not parsed any further. + *

+ * The parser handles named and numbered message arguments and allows both in one message. + *

+ * Once a pattern has been parsed successfully, iterate through the parsed data + * with countParts(), getPart() and related methods. + *

+ * The data logically represents a parse tree, but is stored and accessed + * as a list of "parts" for fast and simple parsing and to minimize object allocations. + * Arguments and nested messages are best handled via recursion. + * For every _START "part", MessagePattern.getLimitPartIndex() efficiently returns + * the index of the corresponding _LIMIT "part". + *

+ * List of "parts": + *

+ * message = MSG_START (SKIP_SYNTAX | INSERT_CHAR | REPLACE_NUMBER | argument)* MSG_LIMIT
+ * argument = noneArg | simpleArg | complexArg
+ * complexArg = choiceArg | pluralArg | selectArg
+ *
+ * noneArg = ARG_START.NONE (ARG_NAME | ARG_NUMBER) ARG_LIMIT.NONE
+ * simpleArg = ARG_START.SIMPLE (ARG_NAME | ARG_NUMBER) ARG_TYPE [ARG_STYLE] ARG_LIMIT.SIMPLE
+ * choiceArg = ARG_START.CHOICE (ARG_NAME | ARG_NUMBER) choiceStyle ARG_LIMIT.CHOICE
+ * pluralArg = ARG_START.PLURAL (ARG_NAME | ARG_NUMBER) pluralStyle ARG_LIMIT.PLURAL
+ * selectArg = ARG_START.SELECT (ARG_NAME | ARG_NUMBER) selectStyle ARG_LIMIT.SELECT
+ *
+ * choiceStyle = ((ARG_INT | ARG_DOUBLE) ARG_SELECTOR message)+
+ * pluralStyle = [ARG_INT | ARG_DOUBLE] (ARG_SELECTOR [ARG_INT | ARG_DOUBLE] message)+
+ * selectStyle = (ARG_SELECTOR message)+
+ * 
+ *
    + *
  • Literal output text is not represented directly by "parts" but accessed + * between parts of a message, from one part's getLimit() to the next part's getIndex(). + *
  • ARG_START.CHOICE stands for an ARG_START Part with ArgType CHOICE. + *
  • In the choiceStyle, the ARG_SELECTOR has the '<', the '#' or + * the less-than-or-equal-to sign (U+2264). + *
  • In the pluralStyle, the first, optional numeric Part has the "offset:" value. + * The optional numeric Part between each (ARG_SELECTOR, message) pair + * is the value of an explicit-number selector like "=2", + * otherwise the selector is a non-numeric identifier. + *
  • The REPLACE_NUMBER Part can occur only in an immediate sub-message of the pluralStyle. + *

    + * This class is not intended for public subclassing. + * + * @draft ICU 4.8 + */ +class U_COMMON_API MessagePattern : public UObject { +public: + /** + * Constructs an empty MessagePattern with default UMessagePatternApostropheMode. + * @param errorCode Standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @draft ICU 4.8 + */ + MessagePattern(UErrorCode &errorCode); + + /** + * Constructs an empty MessagePattern. + * @param mode Explicit UMessagePatternApostropheMode. + * @param errorCode Standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @draft ICU 4.8 + */ + MessagePattern(UMessagePatternApostropheMode mode, UErrorCode &errorCode); + + /** + * Constructs a MessagePattern with default UMessagePatternApostropheMode and + * parses the MessageFormat pattern string. + * @param pattern a MessageFormat pattern string + * @param parseError Struct to receive information on the position + * of an error within the pattern. + * Can be NULL. + * @param errorCode Standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * TODO: turn @throws into UErrorCode specifics? + * @throws IllegalArgumentException for syntax errors in the pattern string + * @throws IndexOutOfBoundsException if certain limits are exceeded + * (e.g., argument number too high, argument name too long, etc.) 
+ * @throws NumberFormatException if a number could not be parsed + * @draft ICU 4.8 + */ + MessagePattern(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode); + + /** + * Copy constructor. + * @param other Object to copy. + * @draft ICU 4.8 + */ + MessagePattern(const MessagePattern &other); + + /** + * Assignment operator. + * @param other Object to copy. + * @return *this=other + * @draft ICU 4.8 + */ + MessagePattern &operator=(const MessagePattern &other); + + /** + * Destructor. + * @draft ICU 4.8 + */ + virtual ~MessagePattern(); + + /** + * Parses a MessageFormat pattern string. + * @param pattern a MessageFormat pattern string + * @param parseError Struct to receive information on the position + * of an error within the pattern. + * Can be NULL. + * @param errorCode Standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return *this + * @throws IllegalArgumentException for syntax errors in the pattern string + * @throws IndexOutOfBoundsException if certain limits are exceeded + * (e.g., argument number too high, argument name too long, etc.) + * @throws NumberFormatException if a number could not be parsed + * @draft ICU 4.8 + */ + MessagePattern &parse(const UnicodeString &pattern, + UParseError *parseError, UErrorCode &errorCode); + + /** + * Parses a ChoiceFormat pattern string. + * @param pattern a ChoiceFormat pattern string + * @param parseError Struct to receive information on the position + * of an error within the pattern. + * Can be NULL. + * @param errorCode Standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) 
+ * @return *this + * @throws IllegalArgumentException for syntax errors in the pattern string + * @throws IndexOutOfBoundsException if certain limits are exceeded + * (e.g., argument number too high, argument name too long, etc.) + * @throws NumberFormatException if a number could not be parsed + * @draft ICU 4.8 + */ + MessagePattern &parseChoiceStyle(const UnicodeString &pattern, + UParseError *parseError, UErrorCode &errorCode); + + /** + * Parses a PluralFormat pattern string. + * @param pattern a PluralFormat pattern string + * @param parseError Struct to receive information on the position + * of an error within the pattern. + * Can be NULL. + * @param errorCode Standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return *this + * @throws IllegalArgumentException for syntax errors in the pattern string + * @throws IndexOutOfBoundsException if certain limits are exceeded + * (e.g., argument number too high, argument name too long, etc.) + * @throws NumberFormatException if a number could not be parsed + * @draft ICU 4.8 + */ + MessagePattern &parsePluralStyle(const UnicodeString &pattern, + UParseError *parseError, UErrorCode &errorCode); + + /** + * Parses a SelectFormat pattern string. + * @param pattern a SelectFormat pattern string + * @param parseError Struct to receive information on the position + * of an error within the pattern. + * Can be NULL. + * @param errorCode Standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) 
+ * @return *this + * @throws IllegalArgumentException for syntax errors in the pattern string + * @throws IndexOutOfBoundsException if certain limits are exceeded + * (e.g., argument number too high, argument name too long, etc.) + * @throws NumberFormatException if a number could not be parsed + * @draft ICU 4.8 + */ + MessagePattern &parseSelectStyle(const UnicodeString &pattern, + UParseError *parseError, UErrorCode &errorCode); + + /** + * Clears this MessagePattern. + * countParts() will return 0. + * @draft ICU 4.8 + */ + void clear(); + + /** + * Clears this MessagePattern and sets the UMessagePatternApostropheMode. + * countParts() will return 0. + * @param mode The new UMessagePatternApostropheMode. + * @draft ICU 4.8 + * @provisional This API might change or be removed in a future release. + */ + void clearPatternAndSetApostropheMode(UMessagePatternApostropheMode mode) { + clear(); + aposMode=mode; + } + + /** + * @param other another object to compare with. + * @return TRUE if this object is equivalent to the other one. + * @draft ICU 4.8 + */ + UBool operator==(const MessagePattern &other) const; + + /** + * @param other another object to compare with. + * @return FALSE if this object is equivalent to the other one. + * @draft ICU 4.8 + */ + inline UBool operator!=(const MessagePattern &other) const { + return !operator==(other); + } + + /** + * @return A hash code for this object. + * @draft ICU 4.8 + */ + int32_t hashCode() const; + + /** + * @return this instance's UMessagePatternApostropheMode. + * @draft ICU 4.8 + */ + UMessagePatternApostropheMode getApostropheMode() const { + return aposMode; + } + + // Java has package-private jdkAposMode() here. + // In C++, this is declared in the MessageImpl class. + + /** + * @return the parsed pattern string (null if none was parsed). + * @draft ICU 4.8 + */ + const UnicodeString &getPatternString() const { + return msg; + } + + /** + * Does the parsed pattern have named arguments like {first_name}? 
+ * @return TRUE if the parsed pattern has at least one named argument. + * @draft ICU 4.8 + */ + UBool hasNamedArguments() const { + return hasArgNames; + } + + /** + * Does the parsed pattern have numbered arguments like {2}? + * @return TRUE if the parsed pattern has at least one numbered argument. + * @draft ICU 4.8 + */ + UBool hasNumberedArguments() const { + return hasArgNumbers; + } + + /** + * Validates and parses an argument name or argument number string. + * An argument name must be a "pattern identifier", that is, it must contain + * no Unicode Pattern_Syntax or Pattern_White_Space characters. + * If it only contains ASCII digits, then it must be a small integer with no leading zero. + * @param name Input string. + * @return >=0 if the name is a valid number, + * UMSGPAT_ARG_NAME_NOT_NUMBER (-1) if it is a "pattern identifier" but not all ASCII digits, + * UMSGPAT_ARG_NAME_NOT_VALID (-2) if it is neither. + * @draft ICU 4.8 + */ + static int32_t validateArgumentName(const UnicodeString &name); + + /** + * Returns a version of the parsed pattern string where each ASCII apostrophe + * is doubled (escaped) if it is not already, and if it is not interpreted as quoting syntax. + *

    + * For example, this turns "I don't '{know}' {gender,select,female{h''er}other{h'im}}." + * into "I don''t '{know}' {gender,select,female{h''er}other{h''im}}." + * @return the deep-auto-quoted version of the parsed pattern string. + * @see MessageFormat.autoQuoteApostrophe() + * @draft ICU 4.8 + */ + UnicodeString autoQuoteApostropheDeep() const; + + class Part; + + /** + * Returns the number of "parts" created by parsing the pattern string. + * Returns 0 if no pattern has been parsed or clear() was called. + * @return the number of pattern parts. + * @draft ICU 4.8 + */ + int32_t countParts() const { + return partsLength; + } + + /** + * Gets the i-th pattern "part". + * @param i The index of the Part data. (0..countParts()-1) + * @return the i-th pattern "part". + * @draft ICU 4.8 + */ + const Part &getPart(int32_t i) const { + return parts[i]; + } + + /** + * Returns the UMessagePatternPartType of the i-th pattern "part". + * Convenience method for getPart(i).getType(). + * @param i The index of the Part data. (0..countParts()-1) + * @return The UMessagePatternPartType of the i-th Part. + * @draft ICU 4.8 + */ + UMessagePatternPartType getPartType(int32_t i) const { + return getPart(i).type; + } + + /** + * Returns the pattern index of the specified pattern "part". + * Convenience method for getPart(partIndex).getIndex(). + * @param partIndex The index of the Part data. (0..countParts()-1) + * @return The pattern index of this Part. + * @draft ICU 4.8 + */ + int32_t getPatternIndex(int32_t partIndex) const { + return getPart(partIndex).index; + } + + /** + * Returns the substring of the pattern string indicated by the Part. + * Convenience method for getPatternString().substring(part.getIndex(), part.getLimit()). + * @param part a part of this MessagePattern. + * @return the substring associated with part. 
+ * @draft ICU 4.8 + */ + UnicodeString getSubstring(const Part &part) const { + return msg.tempSubString(part.index, part.length); + } + + /** + * Compares the part's substring with the input string s. + * @param part a part of this MessagePattern. + * @param s a string. + * @return TRUE if getSubstring(part).equals(s). + * @draft ICU 4.8 + */ + UBool partSubstringMatches(const Part &part, const UnicodeString &s) const { + return 0==msg.compare(part.index, part.length, s); + } + + /** + * Returns the numeric value associated with an ARG_INT or ARG_DOUBLE. + * @param part a part of this MessagePattern. + * @return the part's numeric value, or UMSGPAT_NO_NUMERIC_VALUE if this is not a numeric part. + * @draft ICU 4.8 + */ + double getNumericValue(const Part &part) const; + + /** + * Returns the "offset:" value of a PluralFormat argument, or 0 if none is specified. + * @param pluralStart the index of the first PluralFormat argument style part. (0..countParts()-1) + * @return the "offset:" value. + * @draft ICU 4.8 + */ + double getPluralOffset(int32_t pluralStart) const; + + /** + * Returns the index of the ARG|MSG_LIMIT part corresponding to the ARG|MSG_START at start. + * @param start The index of some Part data (0..countParts()-1); + * this Part should be of Type ARG_START or MSG_START. + * @return The first i>start where getPart(i).getType()==ARG|MSG_LIMIT at the same nesting level, + * or start itself if getPartType(start)!=ARG|MSG_START. 
+ * @draft ICU 4.8 + */ + int32_t getLimitPartIndex(int32_t start) const { + int32_t limit=getPart(start).limitPartIndex; + if(limit parts=new ArrayList(); + MessagePatternPartsList *partsList; + Part *parts; + int32_t partsLength; + // ArrayList numericValues; + MessagePatternDoubleList *numericValuesList; + double *numericValues; + int32_t numericValuesLength; + UBool hasArgNames; + UBool hasArgNumbers; + UBool needsAutoQuoting; +}; + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_FORMATTING + +#endif // __MESSAGEPATTERN_H__ diff --git a/icu4c/source/common/unicode/uconfig.h b/icu4c/source/common/unicode/uconfig.h index 6521eed472d..f7546bfc877 100644 --- a/icu4c/source/common/unicode/uconfig.h +++ b/icu4c/source/common/unicode/uconfig.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2002-2009, International Business Machines +* Copyright (C) 2002-2011, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: uconfig.h @@ -176,6 +176,17 @@ # define UCONFIG_NO_IDNA 0 #endif +/** + * \def UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE + * Determines the default UMessagePatternApostropheMode. + * See the documentation for that enum. 
+ * + * @draft ICU 4.8 + */ +#ifndef UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE +# define UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE UMSGPAT_APOS_DOUBLE_OPTIONAL +#endif + /* i18n library switches ---------------------------------------------------- */ /** diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index c5fe0ef15bd..aba0619f085 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -1,6 +1,6 @@ /* *************************************************************************** -* Copyright (C) 1999-2010, International Business Machines Corporation +* Copyright (C) 1999-2011, International Business Machines Corporation * and others. All Rights Reserved. *************************************************************************** * Date Name Description @@ -575,8 +575,8 @@ public: /** * Modifies this set to represent the set specified by the given - * pattern, optionally ignoring white space. See the class - * description for the syntax of the pattern language. + * pattern, ignoring Unicode Pattern_White_Space characters. + * See the class description for the syntax of the pattern language. * A frozen set will not be modified. * @param pattern a string specifying what characters are in the set * @param status returns U_ILLEGAL_ARGUMENT_ERROR if the pattern @@ -590,8 +590,8 @@ public: /** * Modifies this set to represent the set specified by the given - * pattern, optionally ignoring white space. See the class - * description for the syntax of the pattern language. + * pattern, optionally ignoring Unicode Pattern_White_Space characters. + * See the class description for the syntax of the pattern language. * A frozen set will not be modified. * @param pattern a string specifying what characters are in the set * @param options bitmask for options to apply to the pattern. 
@@ -1540,8 +1540,8 @@ private: * \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P" * \\N{name} - white space not allowed within "\\N" * - * Other than the above restrictions, white space is ignored. Case - * is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading + * Other than the above restrictions, Unicode Pattern_White_Space characters are ignored. + * Case is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading * and trailing space is deleted, and internal runs of whitespace * are collapsed to a single space. * diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp index 4ff3f6b2717..ed50e1758c2 100644 --- a/icu4c/source/common/uniset.cpp +++ b/icu4c/source/common/uniset.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1999-2009, International Business Machines +* Copyright (C) 1999-2011, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description @@ -15,6 +15,7 @@ #include "ruleiter.h" #include "cmemory.h" #include "cstring.h" +#include "patternprops.h" #include "uhash.h" #include "util.h" #include "uvector.h" @@ -1926,7 +1927,7 @@ escapeUnprintable) { break; default: // Escape whitespace - if (uprv_isRuleWhiteSpace(c)) { + if (PatternProps::isWhiteSpace(c)) { buf.append(BACKSLASH); } break; diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 6f82dfb7dc8..95ed2640aa4 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2010, International Business Machines +* Copyright (C) 1999-2011, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -399,20 +399,6 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, // Public API //---------------------------------------------------------------- -/** - * Modifies this set to represent the set specified by the given - * pattern, optionally ignoring white space. See the class - * description for the syntax of the pattern language. - * @param pattern a string specifying what characters are in the set - * @param ignoreSpaces if true, all spaces in the - * pattern are ignored. Spaces are those characters for which - * uprv_isRuleWhiteSpace() is true. - * Characters preceded by '\\' are escaped, losing any special - * meaning they otherwise have. Spaces may be included by - * escaping them. - * @exception IllegalArgumentException if the pattern - * contains a syntax error. - */ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, UErrorCode& status) { return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); diff --git a/icu4c/source/i18n/choicfmt.cpp b/icu4c/source/i18n/choicfmt.cpp index ae5387273e3..58b440a7e4c 100644 --- a/icu4c/source/i18n/choicfmt.cpp +++ b/icu4c/source/i18n/choicfmt.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 1997-2009, International Business Machines Corporation and * +* Copyright (C) 1997-2011, International Business Machines Corporation and * * others. All Rights Reserved. 
* ******************************************************************************* * @@ -32,7 +32,9 @@ #include "unicode/locid.h" #include "cpputils.h" #include "cstring.h" +#include "messageimpl.h" #include "putilimp.h" +#include "uassert.h" #include #include @@ -54,6 +56,9 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ChoiceFormat) #define VERTICAL_BAR ((UChar)0x007C) /*|*/ #define MINUS ((UChar)0x002D) /*-*/ +static const UChar LEFT_CURLY_BRACE = 0x7B; /*{*/ +static const UChar RIGHT_CURLY_BRACE = 0x7D; /*}*/ + #ifdef INFINITY #undef INFINITY #endif @@ -69,10 +74,8 @@ static const UChar gNegativeInfinity[] = {MINUS, INFINITY, 0}; ChoiceFormat::ChoiceFormat(const UnicodeString& newPattern, UErrorCode& status) -: fChoiceLimits(0), - fClosures(0), - fChoiceFormats(0), - fCount(0) +: constructorErrorCode(status), + msgPattern(status) { applyPattern(newPattern, status); } @@ -84,12 +87,10 @@ ChoiceFormat::ChoiceFormat(const UnicodeString& newPattern, ChoiceFormat::ChoiceFormat(const double* limits, const UnicodeString* formats, int32_t cnt ) -: fChoiceLimits(0), - fClosures(0), - fChoiceFormats(0), - fCount(0) +: constructorErrorCode(U_ZERO_ERROR), + msgPattern(constructorErrorCode) { - setChoices(limits, formats, cnt ); + setChoices(limits, NULL, formats, cnt, constructorErrorCode); } // ------------------------------------- @@ -98,12 +99,10 @@ ChoiceFormat::ChoiceFormat(const double* limits, const UBool* closures, const UnicodeString* formats, int32_t cnt ) -: fChoiceLimits(0), - fClosures(0), - fChoiceFormats(0), - fCount(0) +: constructorErrorCode(U_ZERO_ERROR), + msgPattern(constructorErrorCode) { - setChoices(limits, closures, formats, cnt ); + setChoices(limits, closures, formats, cnt, constructorErrorCode); } // ------------------------------------- @@ -111,11 +110,9 @@ ChoiceFormat::ChoiceFormat(const double* limits, ChoiceFormat::ChoiceFormat(const ChoiceFormat& that) : NumberFormat(that), - fChoiceLimits(0), - fClosures(0), - fChoiceFormats(0) + 
constructorErrorCode(that.constructorErrorCode), + msgPattern(that.msgPattern) { - *this = that; } // ------------------------------------- @@ -126,10 +123,8 @@ ChoiceFormat::ChoiceFormat(const ChoiceFormat& that) ChoiceFormat::ChoiceFormat(const UnicodeString& newPattern, UParseError& parseError, UErrorCode& status) -: fChoiceLimits(0), - fClosures(0), - fChoiceFormats(0), - fCount(0) +: constructorErrorCode(status), + msgPattern(status) { applyPattern(newPattern,parseError, status); } @@ -141,16 +136,7 @@ ChoiceFormat::operator==(const Format& that) const if (this == &that) return TRUE; if (!NumberFormat::operator==(that)) return FALSE; ChoiceFormat& thatAlias = (ChoiceFormat&)that; - if (fCount != thatAlias.fCount) return FALSE; - // Checks the limits, the corresponding format string and LE or LT flags. - // LE means less than and equal to, LT means less than. - for (int32_t i = 0; i < fCount; i++) { - if ((fChoiceLimits[i] != thatAlias.fChoiceLimits[i]) || - (fClosures[i] != thatAlias.fClosures[i]) || - (fChoiceFormats[i] != thatAlias.fChoiceFormats[i])) - return FALSE; - } - return TRUE; + return msgPattern == thatAlias.msgPattern; } // ------------------------------------- @@ -161,37 +147,8 @@ ChoiceFormat::operator=(const ChoiceFormat& that) { if (this != &that) { NumberFormat::operator=(that); - fCount = that.fCount; - uprv_free(fChoiceLimits); - fChoiceLimits = NULL; - uprv_free(fClosures); - fClosures = NULL; - delete [] fChoiceFormats; - fChoiceFormats = NULL; - - fChoiceLimits = (double*) uprv_malloc( sizeof(double) * fCount); - fClosures = (UBool*) uprv_malloc( sizeof(UBool) * fCount); - fChoiceFormats = new UnicodeString[fCount]; - - // check for memory allocation error - if (!fChoiceLimits || !fClosures || !fChoiceFormats) { - if (fChoiceLimits) { - uprv_free(fChoiceLimits); - fChoiceLimits = NULL; - } - if (fClosures) { - uprv_free(fClosures); - fClosures = NULL; - } - if (fChoiceFormats) { - delete[] fChoiceFormats; - fChoiceFormats = NULL; - } - } 
else { - uprv_arrayCopy(that.fChoiceLimits, fChoiceLimits, fCount); - uprv_arrayCopy(that.fClosures, fClosures, fCount); - uprv_arrayCopy(that.fChoiceFormats, fChoiceFormats, fCount); - } + constructorErrorCode = that.constructorErrorCode; + msgPattern = that.msgPattern; } return *this; } @@ -200,32 +157,12 @@ ChoiceFormat::operator=(const ChoiceFormat& that) ChoiceFormat::~ChoiceFormat() { - uprv_free(fChoiceLimits); - fChoiceLimits = NULL; - uprv_free(fClosures); - fClosures = NULL; - delete [] fChoiceFormats; - fChoiceFormats = NULL; - fCount = 0; -} - -/** - * Convert a string to a double value - */ -double -ChoiceFormat::stod(const UnicodeString& string) -{ - char source[256]; - char* end; - - string.extract(0, string.length(), source, (int32_t)sizeof(source), US_INV); /* invariant codepage */ - return uprv_strtod(source,&end); } // ------------------------------------- /** - * Convert a double value to a string without the overhead of ICU. + * Convert a double value to a string without the overhead of NumberFormat. */ UnicodeString& ChoiceFormat::dtos(double value, @@ -286,8 +223,8 @@ void ChoiceFormat::applyPattern(const UnicodeString& pattern, UErrorCode& status) { - UParseError parseError; - applyPattern(pattern, parseError, status); + msgPattern.parseChoiceStyle(pattern, NULL, status); + constructorErrorCode = status; } // ------------------------------------- @@ -298,217 +235,16 @@ ChoiceFormat::applyPattern(const UnicodeString& pattern, UParseError& parseError, UErrorCode& status) { - if (U_FAILURE(status)) - { - return; - } - - // Clear error struct - parseError.offset = -1; - parseError.preContext[0] = parseError.postContext[0] = (UChar)0; - - // Perform 2 passes. The first computes the number of limits in - // this pattern (fCount), which is 1 more than the number of - // literal VERTICAL_BAR characters. - int32_t count = 1; - int32_t i; - for (i=0; i 0 && limit <= newLimits[k-1]) { - // Each limit must be strictly > than the previous - // limit. 
One exception: Two subsequent limits may be - // == if the first closure is FALSE and the second - // closure is TRUE. This places the limit value in - // the second interval. - if (!(limit == newLimits[k-1] && - !newClosures[k-1] && - newClosures[k])) { - goto error; - } - } - - buf.truncate(0); - } else if (c == VERTICAL_BAR) { - if (inNumber) { - goto error; - } - inNumber = TRUE; - - newFormats[k] = buf; - ++k; - buf.truncate(0); - } else { - buf += c; - } - } - - if (k != (count-1) || inNumber || inQuote) { - goto error; - } - newFormats[k] = buf; - - // Don't modify this object until the parse succeeds - uprv_free(fChoiceLimits); - uprv_free(fClosures); - delete[] fChoiceFormats; - fCount = count; - fChoiceLimits = newLimits; - fClosures = newClosures; - fChoiceFormats = newFormats; - return; - -error: - status = U_ILLEGAL_ARGUMENT_ERROR; - syntaxError(pattern,i,parseError); - uprv_free(newLimits); - uprv_free(newClosures); - delete[] newFormats; - return; - + msgPattern.parseChoiceStyle(pattern, &parseError, status); + constructorErrorCode = status; } // ------------------------------------- -// Reconstruct the original input pattern. +// Returns the input pattern string. UnicodeString& ChoiceFormat::toPattern(UnicodeString& result) const { - result.remove(); - for (int32_t i = 0; i < fCount; ++i) { - if (i != 0) { - result += VERTICAL_BAR; - } - UnicodeString buf; - if (uprv_isPositiveInfinity(fChoiceLimits[i])) { - result += INFINITY; - } else if (uprv_isNegativeInfinity(fChoiceLimits[i])) { - result += MINUS; - result += INFINITY; - } else { - result += dtos(fChoiceLimits[i], buf); - } - if (fClosures[i]) { - result += LESS_THAN; - } else { - result += LESS_EQUAL; - } - // Append fChoiceFormats[i], using quotes if there are special - // characters. Single quotes themselves must be escaped in - // either case. 
- const UnicodeString& text = fChoiceFormats[i]; - UBool needQuote = text.indexOf(LESS_THAN) >= 0 - || text.indexOf(LESS_EQUAL) >= 0 - || text.indexOf(LESS_EQUAL2) >= 0 - || text.indexOf(VERTICAL_BAR) >= 0; - if (needQuote) { - result += SINGLE_QUOTE; - } - if (text.indexOf(SINGLE_QUOTE) < 0) { - result += text; - } - else { - for (int32_t j = 0; j < text.length(); ++j) { - UChar c = text[j]; - result += c; - if (c == SINGLE_QUOTE) { - result += c; - } - } - } - if (needQuote) { - result += SINGLE_QUOTE; - } - } - - return result; + return result = msgPattern.getPatternString(); } // ------------------------------------- @@ -518,7 +254,8 @@ ChoiceFormat::setChoices( const double* limits, const UnicodeString* formats, int32_t cnt ) { - setChoices(limits, 0, formats, cnt); + UErrorCode errorCode = U_ZERO_ERROR; + setChoices(limits, NULL, formats, cnt, errorCode); } // ------------------------------------- @@ -529,54 +266,76 @@ ChoiceFormat::setChoices( const double* limits, const UnicodeString* formats, int32_t cnt ) { - if(limits == 0 || formats == 0) - return; + UErrorCode errorCode = U_ZERO_ERROR; + setChoices(limits, closures, formats, cnt, errorCode); +} - if (fChoiceLimits) { - uprv_free(fChoiceLimits); - } - if (fClosures) { - uprv_free(fClosures); - } - if (fChoiceFormats) { - delete [] fChoiceFormats; - } - - // Note that the old arrays are deleted and this owns - // the created array. 
- fCount = cnt; - fChoiceLimits = (double*) uprv_malloc( sizeof(double) * fCount); - fClosures = (UBool*) uprv_malloc( sizeof(UBool) * fCount); - fChoiceFormats = new UnicodeString[fCount]; - - //check for memory allocation error - if (!fChoiceLimits || !fClosures || !fChoiceFormats) { - if (fChoiceLimits) { - uprv_free(fChoiceLimits); - fChoiceLimits = NULL; - } - if (fClosures) { - uprv_free(fClosures); - fClosures = NULL; - } - if (fChoiceFormats) { - delete[] fChoiceFormats; - fChoiceFormats = NULL; - } +void +ChoiceFormat::setChoices(const double* limits, + const UBool* closures, + const UnicodeString* formats, + int32_t count, + UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return; } - - uprv_arrayCopy(limits, fChoiceLimits, fCount); - uprv_arrayCopy(formats, fChoiceFormats, fCount); - - if (closures != 0) { - uprv_arrayCopy(closures, fClosures, fCount); - } else { - int32_t i; - for (i=0; i '|' + // |' -> '|''' + // |'' -> '|''''' etc. + result.append(SINGLE_QUOTE).append(c).append(SINGLE_QUOTE); + continue; // Skip the append(c) at the end of the loop body. + } else if (c == LEFT_CURLY_BRACE) { + ++nestingLevel; + } else if (c == RIGHT_CURLY_BRACE && nestingLevel > 0) { + --nestingLevel; + } + result.append(c); } } + // Apply the reconstructed pattern. 
+ applyPattern(result, errorCode); } // ------------------------------------- @@ -585,8 +344,8 @@ ChoiceFormat::setChoices( const double* limits, const double* ChoiceFormat::getLimits(int32_t& cnt) const { - cnt = fCount; - return fChoiceLimits; + cnt = 0; + return NULL; } // ------------------------------------- @@ -595,8 +354,8 @@ ChoiceFormat::getLimits(int32_t& cnt) const const UBool* ChoiceFormat::getClosures(int32_t& cnt) const { - cnt = fCount; - return fClosures; + cnt = 0; + return NULL; } // ------------------------------------- @@ -605,8 +364,8 @@ ChoiceFormat::getClosures(int32_t& cnt) const const UnicodeString* ChoiceFormat::getFormats(int32_t& cnt) const { - cnt = fCount; - return fChoiceFormats; + cnt = 0; + return NULL; } // ------------------------------------- @@ -623,9 +382,8 @@ ChoiceFormat::format(int64_t number, } // ------------------------------------- -// Formats a long number, it's actually formatted as -// a double. The returned format string may differ -// from the input number because of this. +// Formats an int32_t number, it's actually formatted as +// a double. UnicodeString& ChoiceFormat::format(int32_t number, @@ -643,26 +401,63 @@ ChoiceFormat::format(double number, UnicodeString& appendTo, FieldPosition& /*pos*/) const { - // find the number - int32_t i; - for (i = 0; i < fCount; ++i) { - if (fClosures[i]) { - if (!(number > fChoiceLimits[i])) { - // same as number <= fChoiceLimits, except catches NaN - break; - } - } else if (!(number >= fChoiceLimits[i])) { - // same as number < fChoiceLimits, except catches NaN + if (msgPattern.countParts() == 0) { + // No pattern was applied, or it failed. + return appendTo; + } + // Get the appropriate sub-message. 
+ int32_t msgStart = findSubMessage(msgPattern, 0, number); + if (!MessageImpl::jdkAposMode(msgPattern)) { + int32_t patternStart = msgPattern.getPart(msgStart).getLimit(); + int32_t msgLimit = msgPattern.getLimitPartIndex(msgStart); + appendTo.append(msgPattern.getPatternString(), + patternStart, + msgPattern.getPatternIndex(msgLimit) - patternStart); + return appendTo; + } + // JDK compatibility mode: Remove SKIP_SYNTAX. + return MessageImpl::appendSubMessageWithoutSkipSyntax(msgPattern, msgStart, appendTo); +} + +int32_t +ChoiceFormat::findSubMessage(const MessagePattern &pattern, int32_t partIndex, double number) { + int32_t count = pattern.countParts(); + int32_t msgStart; + // Iterate over (ARG_INT|DOUBLE, ARG_SELECTOR, message) tuples + // until ARG_LIMIT or end of choice-only pattern. + // Ignore the first number and selector and start the loop on the first message. + partIndex += 2; + for (;;) { + // Skip but remember the current sub-message. + msgStart = partIndex; + partIndex = pattern.getLimitPartIndex(partIndex); + if (++partIndex >= count) { + // Reached the end of the choice-only pattern. + // Return with the last sub-message. + break; + } + const MessagePattern::Part &part = pattern.getPart(partIndex++); + UMessagePatternPartType type = part.getType(); + if (type == UMSGPAT_PART_TYPE_ARG_LIMIT) { + // Reached the end of the ChoiceFormat style. + // Return with the last sub-message. + break; + } + // part is an ARG_INT or ARG_DOUBLE + U_ASSERT(MessagePattern::Part::hasNumericValue(type)); + double boundary = pattern.getNumericValue(part); + // Fetch the ARG_SELECTOR character. + int32_t selectorIndex = pattern.getPatternIndex(partIndex++); + UChar boundaryChar = pattern.getPatternString().charAt(selectorIndex); + if (boundaryChar == LESS_THAN ? !(number > boundary) : !(number >= boundary)) { + // The number is in the interval between the previous boundary and the current one. + // Return with the sub-message between them. 
+ // The !(a>b) and !(a>=b) comparisons are equivalent to + // (a<=b) and (a furthest) { - furthest = status.getIndex(); + int32_t count = pattern.countParts(); + while (partIndex < count && pattern.getPartType(partIndex) != UMSGPAT_PART_TYPE_ARG_LIMIT) { + tempNumber = pattern.getNumericValue(pattern.getPart(partIndex)); + partIndex += 2; // skip the numeric part and ignore the ARG_SELECTOR + int32_t msgLimit = pattern.getLimitPartIndex(partIndex); + int32_t len = matchStringUntilLimitPart(pattern, partIndex, msgLimit, source, start); + if (len >= 0) { + int32_t newIndex = start + len; + if (newIndex > furthest) { + furthest = newIndex; bestNumber = tempNumber; - if (furthest == text.length()) + if (furthest == source.length()) { break; + } } } + partIndex = msgLimit + 1; } - status.setIndex(furthest); - if (status.getIndex() == start) { - status.setErrorIndex(furthest); + if (furthest == start) { + pos.setErrorIndex(start); + } else { + pos.setIndex(furthest); + } + return bestNumber; +} + +int32_t +ChoiceFormat::matchStringUntilLimitPart( + const MessagePattern &pattern, int32_t partIndex, int32_t limitPartIndex, + const UnicodeString &source, int32_t sourceOffset) { + int32_t matchingSourceLength = 0; + const UnicodeString &msgString = pattern.getPatternString(); + int32_t prevIndex = pattern.getPart(partIndex).getLimit(); + for (;;) { + const MessagePattern::Part &part = pattern.getPart(++partIndex); + if (partIndex == limitPartIndex || part.getType() == UMSGPAT_PART_TYPE_SKIP_SYNTAX) { + int32_t index = part.getIndex(); + int32_t length = index - prevIndex; + if (length != 0 && 0 != source.compare(sourceOffset, length, msgString, prevIndex, length)) { + return -1; // mismatch + } + matchingSourceLength += length; + if (partIndex == limitPartIndex) { + return matchingSourceLength; + } + prevIndex = part.getLimit(); // SKIP_SYNTAX + } } - result.setDouble(bestNumber); } // ------------------------------------- diff --git a/icu4c/source/i18n/msgfmt.cpp 
b/icu4c/source/i18n/msgfmt.cpp index b529246683e..73ecc14e8fe 100644 --- a/icu4c/source/i18n/msgfmt.cpp +++ b/icu4c/source/i18n/msgfmt.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2010, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************** * @@ -23,41 +23,39 @@ #if !UCONFIG_NO_FORMATTING -#include "unicode/msgfmt.h" -#include "unicode/decimfmt.h" -#include "unicode/datefmt.h" -#include "unicode/smpdtfmt.h" +#include "unicode/appendable.h" #include "unicode/choicfmt.h" +#include "unicode/datefmt.h" +#include "unicode/decimfmt.h" +#include "unicode/localpointer.h" +#include "unicode/msgfmt.h" #include "unicode/plurfmt.h" -#include "unicode/selfmt.h" -#include "unicode/ustring.h" -#include "unicode/ucnv_err.h" -#include "unicode/uchar.h" -#include "unicode/umsg.h" #include "unicode/rbnf.h" +#include "unicode/selfmt.h" +#include "unicode/smpdtfmt.h" +#include "unicode/umsg.h" +#include "unicode/ustring.h" #include "cmemory.h" +#include "patternprops.h" +#include "messageimpl.h" #include "msgfmt_impl.h" -#include "util.h" #include "uassert.h" #include "ustrfmt.h" +#include "util.h" #include "uvector.h" // ***************************************************************************** // class MessageFormat // ***************************************************************************** -#define COMMA ((UChar)0x002C) #define SINGLE_QUOTE ((UChar)0x0027) +#define COMMA ((UChar)0x002C) #define LEFT_CURLY_BRACE ((UChar)0x007B) #define RIGHT_CURLY_BRACE ((UChar)0x007D) //--------------------------------------- // static data -static const UChar ID_EMPTY[] = { - 0 /* empty string, used for default so that null can mark end of list */ -}; - static const UChar ID_NUMBER[] = { 0x6E, 0x75, 0x6D, 0x62, 0x65, 0x72, 0 /* "number" */ }; 
@@ -67,9 +65,6 @@ static const UChar ID_DATE[] = { static const UChar ID_TIME[] = { 0x74, 0x69, 0x6D, 0x65, 0 /* "time" */ }; -static const UChar ID_CHOICE[] = { - 0x63, 0x68, 0x6F, 0x69, 0x63, 0x65, 0 /* "choice" */ -}; static const UChar ID_SPELLOUT[] = { 0x73, 0x70, 0x65, 0x6c, 0x6c, 0x6f, 0x75, 0x74, 0 /* "spellout" */ }; @@ -79,28 +74,21 @@ static const UChar ID_ORDINAL[] = { static const UChar ID_DURATION[] = { 0x64, 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0 /* "duration" */ }; -static const UChar ID_PLURAL[] = { - 0x70, 0x6c, 0x75, 0x72, 0x61, 0x6c, 0 /* "plural" */ -}; -static const UChar ID_SELECT[] = { - 0x73, 0x65, 0x6C, 0x65, 0x63, 0x74, 0 /* "select" */ -}; // MessageFormat Type List Number, Date, Time or Choice static const UChar * const TYPE_IDS[] = { - ID_EMPTY, ID_NUMBER, ID_DATE, ID_TIME, - ID_CHOICE, ID_SPELLOUT, ID_ORDINAL, ID_DURATION, - ID_PLURAL, - ID_SELECT, NULL, }; +static const UChar ID_EMPTY[] = { + 0 /* empty string, used for default so that null can mark end of list */ +}; static const UChar ID_CURRENCY[] = { 0x63, 0x75, 0x72, 0x72, 0x65, 0x6E, 0x63, 0x79, 0 /* "currency" */ }; @@ -153,37 +141,31 @@ static const U_NAMESPACE_QUALIFIER DateFormat::EStyle DATE_STYLES[] = { static const int32_t DEFAULT_INITIAL_CAPACITY = 10; +static const UChar NULL_STRING[] = { + 0x6E, 0x75, 0x6C, 0x6C, 0 // "null" +}; + +static const UChar OTHER_STRING[] = { + 0x6F, 0x74, 0x68, 0x65, 0x72, 0 // "other" +}; + +U_CDECL_BEGIN +static UBool U_CALLCONV equalFormatsForHash(const UHashTok key1, + const UHashTok key2) { + return U_NAMESPACE_QUALIFIER MessageFormat::equalFormats(key1.pointer, key2.pointer); +} + +U_CDECL_END + U_NAMESPACE_BEGIN // ------------------------------------- UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MessageFormat) +UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(MessageFormat::DummyFormat) UOBJECT_DEFINE_RTTI_IMPLEMENTATION(FormatNameEnumeration) //-------------------------------------------------------------------- -/** - * Convert a string to an 
unsigned decimal, ignoring rule whitespace. - * @return a non-negative number if successful, or a negative number - * upon failure. - */ -static int32_t stou(const UnicodeString& string) { - int32_t n = 0; - int32_t count = 0; - UChar32 c; - for (int32_t i=0; i 10) { - return -1; - } - n = 10*n + d; - } - return n; -} - /** * Convert an integer value to a string and append the result to * the given UnicodeString. @@ -195,123 +177,82 @@ static UnicodeString& itos(int32_t i, UnicodeString& appendTo) { return appendTo; } -/* - * A structure representing one subformat of this MessageFormat. - * Each subformat has a Format object, an offset into the plain - * pattern text fPattern, and an argument number. The argument - * number corresponds to the array of arguments to be formatted. - * @internal - */ -class MessageFormat::Subformat : public UMemory { + +// AppendableWrapper: encapsulates the result of formatting, keeping track +// of the string and its length. +class AppendableWrapper : public UMemory { public: - /** - * @internal - */ - Format* format; // formatter - /** - * @internal - */ - int32_t offset; // offset into fPattern - /** - * @internal - */ - // TODO (claireho) or save the number to argName and use itos to convert to number.=> we need this number - int32_t argNum; // 0-based argument number - /** - * @internal - */ - UnicodeString* argName; // argument name or number - - /** - * Clone that.format and assign it to this.format - * Do NOT delete this.format - * @internal - */ - Subformat& operator=(const Subformat& that) { - if (this != &that) { - format = that.format ? that.format->clone() : NULL; - offset = that.offset; - argNum = that.argNum; - argName = (that.argNum==-1) ? 
new UnicodeString(*that.argName): NULL; + AppendableWrapper(Appendable& appendable) : app(appendable), len(0) { + } + void append(const UnicodeString& s) { + app.appendString(s.getBuffer(), s.length()); + len += s.length(); + } + void append(const UChar* s, const int32_t sLength) { + app.appendString(s, sLength); + len += sLength; + } + void append(const UnicodeString& s, int32_t start, int32_t length) { + append(s.tempSubString(start, length)); + } + void formatAndAppend(const Format* formatter, const Formattable& arg, UErrorCode& ec) { + UnicodeString s; + formatter->format(arg, s, ec); + if (U_SUCCESS(ec)) { + append(s); } - return *this; } - - /** - * @internal - */ - UBool operator==(const Subformat& that) const { - // Do cheap comparisons first - return offset == that.offset && - argNum == that.argNum && - ((argName == that.argName) || - (*argName == *that.argName)) && - ((format == that.format) || // handles NULL - (*format == *that.format)); - } - - /** - * @internal - */ - UBool operator!=(const Subformat& that) const { - return !operator==(that); + int32_t length() { + return len; } +private: + Appendable& app; + int32_t len; }; + // ------------------------------------- // Creates a MessageFormat instance based on the pattern. 
MessageFormat::MessageFormat(const UnicodeString& pattern, UErrorCode& success) : fLocale(Locale::getDefault()), // Uses the default locale + msgPattern(success), formatAliases(NULL), formatAliasesCapacity(0), - idStart(UCHAR_ID_START), - idContinue(UCHAR_ID_CONTINUE), - subformats(NULL), - subformatCount(0), - subformatCapacity(0), argTypes(NULL), argTypeCount(0), argTypeCapacity(0), - isArgNumeric(TRUE), + hasArgTypeConflicts(FALSE), defaultNumberFormat(NULL), - defaultDateFormat(NULL) + defaultDateFormat(NULL), + cachedFormatters(NULL), + customFormatArgStarts(NULL), + pluralProvider(&fLocale) { - if (!allocateSubformats(DEFAULT_INITIAL_CAPACITY) || - !allocateArgTypes(DEFAULT_INITIAL_CAPACITY)) { - success = U_MEMORY_ALLOCATION_ERROR; - return; - } - applyPattern(pattern, success); setLocaleIDs(fLocale.getName(), fLocale.getName()); + applyPattern(pattern, success); } MessageFormat::MessageFormat(const UnicodeString& pattern, const Locale& newLocale, UErrorCode& success) : fLocale(newLocale), + msgPattern(success), formatAliases(NULL), formatAliasesCapacity(0), - idStart(UCHAR_ID_START), - idContinue(UCHAR_ID_CONTINUE), - subformats(NULL), - subformatCount(0), - subformatCapacity(0), argTypes(NULL), argTypeCount(0), argTypeCapacity(0), - isArgNumeric(TRUE), + hasArgTypeConflicts(FALSE), defaultNumberFormat(NULL), - defaultDateFormat(NULL) + defaultDateFormat(NULL), + cachedFormatters(NULL), + customFormatArgStarts(NULL), + pluralProvider(&fLocale) { - if (!allocateSubformats(DEFAULT_INITIAL_CAPACITY) || - !allocateArgTypes(DEFAULT_INITIAL_CAPACITY)) { - success = U_MEMORY_ALLOCATION_ERROR; - return; - } - applyPattern(pattern, success); setLocaleIDs(fLocale.getName(), fLocale.getName()); + applyPattern(pattern, success); } MessageFormat::MessageFormat(const UnicodeString& pattern, @@ -319,65 +260,55 @@ MessageFormat::MessageFormat(const UnicodeString& pattern, UParseError& parseError, UErrorCode& success) : fLocale(newLocale), + msgPattern(success), 
formatAliases(NULL), formatAliasesCapacity(0), - idStart(UCHAR_ID_START), - idContinue(UCHAR_ID_CONTINUE), - subformats(NULL), - subformatCount(0), - subformatCapacity(0), argTypes(NULL), argTypeCount(0), argTypeCapacity(0), - isArgNumeric(TRUE), + hasArgTypeConflicts(FALSE), defaultNumberFormat(NULL), - defaultDateFormat(NULL) + defaultDateFormat(NULL), + cachedFormatters(NULL), + customFormatArgStarts(NULL), + pluralProvider(&fLocale) { - if (!allocateSubformats(DEFAULT_INITIAL_CAPACITY) || - !allocateArgTypes(DEFAULT_INITIAL_CAPACITY)) { - success = U_MEMORY_ALLOCATION_ERROR; - return; - } - applyPattern(pattern, parseError, success); setLocaleIDs(fLocale.getName(), fLocale.getName()); + applyPattern(pattern, parseError, success); } MessageFormat::MessageFormat(const MessageFormat& that) -: Format(that), +: + Format(that), + fLocale(that.fLocale), + msgPattern(that.msgPattern), formatAliases(NULL), formatAliasesCapacity(0), - idStart(UCHAR_ID_START), - idContinue(UCHAR_ID_CONTINUE), - subformats(NULL), - subformatCount(0), - subformatCapacity(0), argTypes(NULL), argTypeCount(0), argTypeCapacity(0), - isArgNumeric(TRUE), + hasArgTypeConflicts(that.hasArgTypeConflicts), defaultNumberFormat(NULL), - defaultDateFormat(NULL) + defaultDateFormat(NULL), + cachedFormatters(NULL), + customFormatArgStarts(NULL), + pluralProvider(&fLocale) { - *this = that; + // This will take care of creating the hash tables (since they are NULL). 
+ UErrorCode ec = U_ZERO_ERROR; + copyObjects(that, ec); + if (U_FAILURE(ec)) { + resetPattern(); + } } MessageFormat::~MessageFormat() { - int32_t idx; - for (idx = 0; idx < subformatCount; idx++) { - delete subformats[idx].format; - delete subformats[idx].argName; - } - uprv_free(subformats); - subformats = NULL; - subformatCount = subformatCapacity = 0; + uhash_close(cachedFormatters); + uhash_close(customFormatArgStarts); uprv_free(argTypes); - argTypes = NULL; - argTypeCount = argTypeCapacity = 0; - uprv_free(formatAliases); - delete defaultNumberFormat; delete defaultDateFormat; } @@ -385,37 +316,6 @@ MessageFormat::~MessageFormat() //-------------------------------------------------------------------- // Variable-size array management -/** - * Allocate subformats[] to at least the given capacity and return - * TRUE if successful. If not, leave subformats[] unchanged. - * - * If subformats is NULL, allocate it. If it is not NULL, enlarge it - * if necessary to be at least as large as specified. - */ -UBool MessageFormat::allocateSubformats(int32_t capacity) { - if (subformats == NULL) { - subformats = (Subformat*) uprv_malloc(sizeof(*subformats) * capacity); - subformatCapacity = capacity; - subformatCount = 0; - if (subformats == NULL) { - subformatCapacity = 0; - return FALSE; - } - } else if (subformatCapacity < capacity) { - if (capacity < 2*subformatCapacity) { - capacity = 2*subformatCapacity; - } - Subformat* a = (Subformat*) - uprv_realloc(subformats, sizeof(*subformats) * capacity); - if (a == NULL) { - return FALSE; // request failed - } - subformats = a; - subformatCapacity = capacity; - } - return TRUE; -} - /** * Allocate argTypes[] to at least the given capacity and return * TRUE if successful. If not, leave argTypes[] unchanged. @@ -423,33 +323,26 @@ UBool MessageFormat::allocateSubformats(int32_t capacity) { * If argTypes is NULL, allocate it. If it is not NULL, enlarge it * if necessary to be at least as large as specified. 
*/ -UBool MessageFormat::allocateArgTypes(int32_t capacity) { - if (argTypes == NULL) { - argTypes = (Formattable::Type*) uprv_malloc(sizeof(*argTypes) * capacity); - argTypeCount = 0; - argTypeCapacity = capacity; - if (argTypes == NULL) { - argTypeCapacity = 0; - return FALSE; - } - for (int32_t i=0; i= capacity) { + return TRUE; + } + if (capacity < DEFAULT_INITIAL_CAPACITY) { + capacity = DEFAULT_INITIAL_CAPACITY; + } else if (capacity < 2*argTypeCapacity) { + capacity = 2*argTypeCapacity; + } + Formattable::Type* a = (Formattable::Type*) + uprv_realloc(argTypes, sizeof(*argTypes) * capacity); + if (a == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + argTypes = a; + argTypeCapacity = capacity; return TRUE; } @@ -459,33 +352,19 @@ UBool MessageFormat::allocateArgTypes(int32_t capacity) { const MessageFormat& MessageFormat::operator=(const MessageFormat& that) { - // Reallocate the arrays BEFORE changing this object - if (this != &that && - allocateSubformats(that.subformatCount) && - allocateArgTypes(that.argTypeCount)) { - + if (this != &that) { // Calls the super class for assignment first. 
Format::operator=(that); - fPattern = that.fPattern; setLocale(that.fLocale); - isArgNumeric = that.isArgNumeric; - int32_t j; - for (j=0; jkey.integer != rhs_cur->key.integer) { + return FALSE; + } + const Format* format = (const Format*)uhash_iget(cachedFormatters, cur->key.integer); + const Format* rhs_format = (const Format*)uhash_iget(that.cachedFormatters, rhs_cur->key.integer); + if (*format != *rhs_format) { return FALSE; } } - return TRUE; } @@ -535,9 +433,10 @@ MessageFormat::setLocale(const Locale& theLocale) defaultNumberFormat = NULL; delete defaultDateFormat; defaultDateFormat = NULL; + fLocale = theLocale; + setLocaleIDs(fLocale.getName(), fLocale.getName()); + pluralProvider.reset(&fLocale); } - fLocale = theLocale; - setLocaleIDs(fLocale.getName(), fLocale.getName()); } // ------------------------------------- @@ -549,9 +448,6 @@ MessageFormat::getLocale() const return fLocale; } - - - void MessageFormat::applyPattern(const UnicodeString& newPattern, UErrorCode& status) @@ -572,324 +468,149 @@ MessageFormat::applyPattern(const UnicodeString& pattern, if(U_FAILURE(ec)) { return; } - // The pattern is broken up into segments. Each time a subformat - // is encountered, 4 segments are recorded. 
For example, consider - // the pattern: - // "There {0,choice,0.0#are no files|1.0#is one file|1.0(fmt)) != NULL) { - UErrorCode ec = U_ZERO_ERROR; - NumberFormat& formatAlias = *decfmt; - NumberFormat *defaultTemplate = NumberFormat::createInstance(fLocale, ec); - NumberFormat *currencyTemplate = NumberFormat::createCurrencyInstance(fLocale, ec); - NumberFormat *percentTemplate = NumberFormat::createPercentInstance(fLocale, ec); - NumberFormat *integerTemplate = createIntegerFormat(fLocale, ec); - - appendTo += COMMA; - appendTo += ID_NUMBER; - if (formatAlias != *defaultTemplate) { - appendTo += COMMA; - if (formatAlias == *currencyTemplate) { - appendTo += ID_CURRENCY; - } - else if (formatAlias == *percentTemplate) { - appendTo += ID_PERCENT; - } - else if (formatAlias == *integerTemplate) { - appendTo += ID_INTEGER; - } - else { - UnicodeString buffer; - appendTo += decfmt->toPattern(buffer); - } - } - - delete defaultTemplate; - delete currencyTemplate; - delete percentTemplate; - delete integerTemplate; - } - else if ((sdtfmt = dynamic_cast(fmt)) != NULL) { - DateFormat& formatAlias = *sdtfmt; - DateFormat *defaultDateTemplate = DateFormat::createDateInstance(DateFormat::kDefault, fLocale); - DateFormat *shortDateTemplate = DateFormat::createDateInstance(DateFormat::kShort, fLocale); - DateFormat *longDateTemplate = DateFormat::createDateInstance(DateFormat::kLong, fLocale); - DateFormat *fullDateTemplate = DateFormat::createDateInstance(DateFormat::kFull, fLocale); - DateFormat *defaultTimeTemplate = DateFormat::createTimeInstance(DateFormat::kDefault, fLocale); - DateFormat *shortTimeTemplate = DateFormat::createTimeInstance(DateFormat::kShort, fLocale); - DateFormat *longTimeTemplate = DateFormat::createTimeInstance(DateFormat::kLong, fLocale); - DateFormat *fullTimeTemplate = DateFormat::createTimeInstance(DateFormat::kFull, fLocale); - - - appendTo += COMMA; - if (formatAlias == *defaultDateTemplate) { - appendTo += ID_DATE; - } - else if (formatAlias 
== *shortDateTemplate) { - appendTo += ID_DATE; - appendTo += COMMA; - appendTo += ID_SHORT; - } - else if (formatAlias == *defaultDateTemplate) { - appendTo += ID_DATE; - appendTo += COMMA; - appendTo += ID_MEDIUM; - } - else if (formatAlias == *longDateTemplate) { - appendTo += ID_DATE; - appendTo += COMMA; - appendTo += ID_LONG; - } - else if (formatAlias == *fullDateTemplate) { - appendTo += ID_DATE; - appendTo += COMMA; - appendTo += ID_FULL; - } - else if (formatAlias == *defaultTimeTemplate) { - appendTo += ID_TIME; - } - else if (formatAlias == *shortTimeTemplate) { - appendTo += ID_TIME; - appendTo += COMMA; - appendTo += ID_SHORT; - } - else if (formatAlias == *defaultTimeTemplate) { - appendTo += ID_TIME; - appendTo += COMMA; - appendTo += ID_MEDIUM; - } - else if (formatAlias == *longTimeTemplate) { - appendTo += ID_TIME; - appendTo += COMMA; - appendTo += ID_LONG; - } - else if (formatAlias == *fullTimeTemplate) { - appendTo += ID_TIME; - appendTo += COMMA; - appendTo += ID_FULL; - } - else { - UnicodeString buffer; - appendTo += ID_DATE; - appendTo += COMMA; - appendTo += sdtfmt->toPattern(buffer); - } - - delete defaultDateTemplate; - delete shortDateTemplate; - delete longDateTemplate; - delete fullDateTemplate; - delete defaultTimeTemplate; - delete shortTimeTemplate; - delete longTimeTemplate; - delete fullTimeTemplate; - // {sfb} there should be a more efficient way to do this! 
- } - else if ((chcfmt = dynamic_cast(fmt)) != NULL) { - UnicodeString buffer; - appendTo += COMMA; - appendTo += ID_CHOICE; - appendTo += COMMA; - appendTo += ((ChoiceFormat*)fmt)->toPattern(buffer); - } - else if ((plfmt = dynamic_cast(fmt)) != NULL) { - UnicodeString buffer; - appendTo += plfmt->toPattern(buffer); - } - else if ((selfmt = dynamic_cast(fmt)) != NULL) { - UnicodeString buffer; - appendTo += ((SelectFormat*)fmt)->toPattern(buffer); - } - else { - //appendTo += ", unknown"; - } - appendTo += RIGHT_CURLY_BRACE; + if ((customFormatArgStarts != NULL && 0 != uhash_count(customFormatArgStarts)) || + 0 == msgPattern.countParts() + ) { + appendTo.setToBogus(); + return appendTo; + } + return appendTo.append(msgPattern.getPatternString()); +} + +int32_t MessageFormat::nextTopLevelArgStart(int32_t partIndex) const { + if (partIndex != 0) { + partIndex = msgPattern.getLimitPartIndex(partIndex); + } + for (;;) { + UMessagePatternPartType type = msgPattern.getPartType(++partIndex); + if (type == UMSGPAT_PART_TYPE_ARG_START) { + return partIndex; + } + if (type == UMSGPAT_PART_TYPE_MSG_LIMIT) { + return -1; + } + } +} + +void MessageFormat::setArgStartFormat(int32_t argStart, + Format* formatter, + UErrorCode& status) { + if (U_FAILURE(status)) { + delete formatter; + } + if (cachedFormatters == NULL) { + cachedFormatters=uhash_open(uhash_hashLong, uhash_compareLong, + equalFormatsForHash, &status); + if (U_FAILURE(status)) { + delete formatter; + return; + } + uhash_setValueDeleter(cachedFormatters, uhash_deleteUObject); + } + if (formatter == NULL) { + formatter = new DummyFormat(); + } + uhash_iput(cachedFormatters, argStart, formatter, &status); +} + + +bool MessageFormat::argNameMatches(int32_t partIndex, const UnicodeString& argName, int32_t argNumber) { + const MessagePattern::Part& part = msgPattern.getPart(partIndex); + return part.getType() == UMSGPAT_PART_TYPE_ARG_NAME ? 
+ msgPattern.partSubstringMatches(part, argName) : + part.getValue() == argNumber; // ARG_NUMBER +} + +// Sets a custom formatter for a MessagePattern ARG_START part index. +// "Custom" formatters are provided by the user via setFormat() or similar APIs. +void MessageFormat::setCustomArgStartFormat(int32_t argStart, + Format* formatter, + UErrorCode& status) { + setArgStartFormat(argStart, formatter, status); + if (customFormatArgStarts == NULL) { + customFormatArgStarts=uhash_open(uhash_hashLong, uhash_compareLong, + NULL, &status); + } + uhash_iputi(customFormatArgStarts, argStart, 1, &status); +} + +Format* MessageFormat::getCachedFormatter(int32_t argumentNumber) const { + if (cachedFormatters == NULL) { + return NULL; + } + void* ptr = uhash_iget(cachedFormatters, argumentNumber); + if (ptr != NULL && dynamic_cast((Format*)ptr) == NULL) { + return (Format*) ptr; + } else { + // Not cached, or a DummyFormat representing setFormat(NULL). + return NULL; } - copyAndFixQuotes(fPattern, lastOffset, fPattern.length(), appendTo); - return appendTo; } // ------------------------------------- // Adopts the new formats array and updates the array count. // This MessageFormat instance owns the new formats. - void MessageFormat::adoptFormats(Format** newFormats, int32_t count) { if (newFormats == NULL || count < 0) { return; } - - int32_t i; - if (allocateSubformats(count)) { - for (i=0; i= 0;) { + setCustomArgStartFormat(partIndex, newFormats[formatNumber], status); + ++formatNumber; + } + // Delete those that didn't get used (if any). + for (; formatNumber < count; ++formatNumber) { + delete newFormats[formatNumber]; } - // TODO: What about the .offset and .argNum fields? 
} // ------------------------------------- @@ -902,21 +623,31 @@ MessageFormat::setFormats(const Format** newFormats, if (newFormats == NULL || count < 0) { return; } - - if (allocateSubformats(count)) { - int32_t i; - for (i=0; iclone() : NULL; - } - subformatCount = count; + // Throw away any cached formatters. + if (cachedFormatters != NULL) { + uhash_removeAll(cachedFormatters); + } + if (customFormatArgStarts != NULL) { + uhash_removeAll(customFormatArgStarts); } - // TODO: What about the .offset and .arg fields? + UErrorCode status = U_ZERO_ERROR; + int32_t formatNumber = 0; + for (int32_t partIndex = 0; + formatNumber < count && U_SUCCESS(status) && (partIndex = nextTopLevelArgStart(partIndex)) >= 0;) { + Format* newFormat = NULL; + if (newFormats[formatNumber] != NULL) { + newFormat = newFormats[formatNumber]->clone(); + if (newFormat == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } + } + setCustomArgStartFormat(partIndex, newFormat, status); + ++formatNumber; + } + if (U_FAILURE(status)) { + resetPattern(); + } } // ------------------------------------- @@ -925,11 +656,17 @@ MessageFormat::setFormats(const Format** newFormats, void MessageFormat::adoptFormat(int32_t n, Format *newFormat) { - if (n < 0 || n >= subformatCount) { - delete newFormat; - } else { - delete subformats[n].format; - subformats[n].format = newFormat; + LocalPointer p(newFormat); + if (n >= 0) { + int32_t formatNumber = 0; + for (int32_t partIndex = 0; (partIndex = nextTopLevelArgStart(partIndex)) >= 0;) { + if (n == formatNumber) { + UErrorCode status = U_ZERO_ERROR; + setCustomArgStartFormat(partIndex, p.orphan(), status); + return; + } + ++formatNumber; + } } } @@ -940,24 +677,32 @@ void MessageFormat::adoptFormat(const UnicodeString& formatName, Format* formatToAdopt, UErrorCode& status) { - if (isArgNumeric ) { - int32_t argumentNumber = stou(formatName); - if (argumentNumber<0) { - status = U_ARGUMENT_TYPE_MISMATCH; - return; - } - adoptFormat(argumentNumber, formatToAdopt); 
+ LocalPointer p(formatToAdopt); + if (U_FAILURE(status)) { return; } - for (int32_t i=0; i= 0 && U_SUCCESS(status); + ) { + if (argNameMatches(partIndex + 1, formatName, argNumber)) { + Format* f; + if (p.isValid()) { + f = p.orphan(); + } else if (formatToAdopt == NULL) { + f = NULL; } else { - subformats[i].format = formatToAdopt; + f = formatToAdopt->clone(); + if (f == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } } + setCustomArgStartFormat(partIndex, f, status); } } } @@ -965,16 +710,22 @@ MessageFormat::adoptFormat(const UnicodeString& formatName, // ------------------------------------- // Set a single format. // Do nothing if the variable is not less than the array count. - void MessageFormat::setFormat(int32_t n, const Format& newFormat) { - if (n >= 0 && n < subformatCount) { - delete subformats[n].format; - if (&newFormat == NULL) { - // This should never happen -- but we'll be nice if it does - subformats[n].format = NULL; - } else { - subformats[n].format = newFormat.clone(); + + if (n >= 0) { + int32_t formatNumber = 0; + for (int32_t partIndex = 0; + (partIndex = nextTopLevelArgStart(partIndex)) >= 0;) { + if (n == formatNumber) { + Format* new_format = newFormat.clone(); + if (new_format) { + UErrorCode status = U_ZERO_ERROR; + setCustomArgStartFormat(partIndex, new_format, status); + } + return; + } + ++formatNumber; } } } @@ -984,27 +735,16 @@ MessageFormat::setFormat(int32_t n, const Format& newFormat) { // Do nothing if the variable is not less than the array count. 
Format * MessageFormat::getFormat(const UnicodeString& formatName, UErrorCode& status) { + if (U_FAILURE(status) || cachedFormatters == NULL) return NULL; - if (U_FAILURE(status)) return NULL; - - if (isArgNumeric ) { - int32_t argumentNumber = stou(formatName); - if (argumentNumber<0) { - status = U_ARGUMENT_TYPE_MISMATCH; - return NULL; - } - if (argumentNumber < 0 || argumentNumber >= subformatCount) { - return subformats[argumentNumber].format; - } - else { - return NULL; - } + int32_t argNumber = MessagePattern::validateArgumentName(formatName); + if (argNumber < UMSGPAT_ARG_NAME_NOT_NUMBER) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; } - - for (int32_t i=0; i= 0;) { + if (argNameMatches(partIndex + 1, formatName, argNumber)) { + return getCachedFormatter(partIndex); } } return NULL; @@ -1017,28 +757,33 @@ void MessageFormat::setFormat(const UnicodeString& formatName, const Format& newFormat, UErrorCode& status) { - if (isArgNumeric) { - status = U_ARGUMENT_TYPE_MISMATCH; + if (U_FAILURE(status)) return; + + int32_t argNumber = MessagePattern::validateArgumentName(formatName); + if (argNumber < UMSGPAT_ARG_NAME_NOT_NUMBER) { + status = U_ILLEGAL_ARGUMENT_ERROR; return; } - for (int32_t i=0; i= 0 && U_SUCCESS(status); + ) { + if (argNameMatches(partIndex + 1, formatName, argNumber)) { if (&newFormat == NULL) { - // This should never happen -- but we'll be nice if it does - subformats[i].format = NULL; + setCustomArgStartFormat(partIndex, NULL, status); } else { - subformats[i].format = newFormat.clone(); + Format* new_format = newFormat.clone(); + if (new_format == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + setCustomArgStartFormat(partIndex, new_format, status); } - break; } } } // ------------------------------------- // Gets the format array. - const Format** MessageFormat::getFormats(int32_t& cnt) const { @@ -1047,48 +792,59 @@ MessageFormat::getFormats(int32_t& cnt) const // method on this object. 
We construct and resize an array // on demand that contains aliases to the subformats[i].format // pointers. - MessageFormat* t = (MessageFormat*) this; + MessageFormat* t = const_cast (this); cnt = 0; if (formatAliases == NULL) { - t->formatAliasesCapacity = (subformatCount<10) ? 10 : subformatCount; + t->formatAliasesCapacity = (argTypeCount<10) ? 10 : argTypeCount; Format** a = (Format**) uprv_malloc(sizeof(Format*) * formatAliasesCapacity); if (a == NULL) { + t->formatAliasesCapacity = 0; return NULL; } t->formatAliases = a; - } else if (subformatCount > formatAliasesCapacity) { + } else if (argTypeCount > formatAliasesCapacity) { Format** a = (Format**) - uprv_realloc(formatAliases, sizeof(Format*) * subformatCount); + uprv_realloc(formatAliases, sizeof(Format*) * argTypeCount); if (a == NULL) { + t->formatAliasesCapacity = 0; return NULL; } t->formatAliases = a; - t->formatAliasesCapacity = subformatCount; + t->formatAliasesCapacity = argTypeCount; } - for (int32_t i=0; iformatAliases[i] = subformats[i].format; + + for (int32_t partIndex = 0; (partIndex = nextTopLevelArgStart(partIndex)) >= 0;) { + t->formatAliases[cnt++] = getCachedFormatter(partIndex); } - cnt = subformatCount; + return (const Format**)formatAliases; } +UnicodeString MessageFormat::getArgName(int32_t partIndex) { + const MessagePattern::Part& part = msgPattern.getPart(partIndex); + if (part.getType() == UMSGPAT_PART_TYPE_ARG_NAME) { + return msgPattern.getSubstring(part); + } else { + UnicodeString temp; + return itos(part.getValue(), temp); + } +} + StringEnumeration* MessageFormat::getFormatNames(UErrorCode& status) { if (U_FAILURE(status)) return NULL; - if (isArgNumeric) { - status = U_ARGUMENT_TYPE_MISMATCH; - return NULL; - } UVector *fFormatNames = new UVector(status); if (U_FAILURE(status)) { status = U_MEMORY_ALLOCATION_ERROR; return NULL; } - for (int32_t i=0; iaddElement(new UnicodeString(*subformats[i].argName), status); + fFormatNames->setDeleter(uhash_deleteUObject); + + for 
(int32_t partIndex = 0; (partIndex = nextTopLevelArgStart(partIndex)) >= 0;) { + fFormatNames->addElement(new UnicodeString(getArgName(partIndex)), status); } StringEnumeration* nameEnumerator = new FormatNameEnumeration(fFormatNames, status); @@ -1106,10 +862,7 @@ MessageFormat::format(const Formattable* source, FieldPosition& ignore, UErrorCode& success) const { - if (U_FAILURE(success)) - return appendTo; - - return format(source, cnt, appendTo, ignore, 0, success); + return format(source, NULL, cnt, appendTo, &ignore, success); } // ------------------------------------- @@ -1125,9 +878,7 @@ MessageFormat::format( const UnicodeString& pattern, UErrorCode& success) { MessageFormat temp(pattern, success); - FieldPosition ignore(0); - temp.format(arguments, cnt, appendTo, ignore, success); - return appendTo; + return temp.format(arguments, NULL, cnt, appendTo, NULL, success); } // ------------------------------------- @@ -1141,171 +892,472 @@ MessageFormat::format(const Formattable& source, FieldPosition& ignore, UErrorCode& success) const { - int32_t cnt; - if (U_FAILURE(success)) return appendTo; if (source.getType() != Formattable::kArray) { success = U_ILLEGAL_ARGUMENT_ERROR; return appendTo; } + int32_t cnt; const Formattable* tmpPtr = source.getArray(cnt); - - return format(tmpPtr, cnt, appendTo, ignore, 0, success); + return format(tmpPtr, NULL, cnt, appendTo, &ignore, success); } - UnicodeString& MessageFormat::format(const UnicodeString* argumentNames, const Formattable* arguments, int32_t count, UnicodeString& appendTo, UErrorCode& success) const { - FieldPosition ignore(0); - return format(arguments, argumentNames, count, appendTo, ignore, 0, success); + return format(arguments, argumentNames, count, appendTo, NULL, success); } -UnicodeString& -MessageFormat::format(const Formattable* arguments, - int32_t cnt, - UnicodeString& appendTo, - FieldPosition& status, - int32_t recursionProtection, - UErrorCode& success) const -{ - return format(arguments, 
NULL, cnt, appendTo, status, recursionProtection, success); +// Does linear search to find the match for an ArgName. +const Formattable* MessageFormat::getArgFromListByName(const Formattable* arguments, + const UnicodeString *argumentNames, + int32_t cnt, UnicodeString& name) const { + for (int32_t i = 0; i < cnt; ++i) { + if (0 == argumentNames[i].compare(name)) { + return arguments + i; + } + } + return NULL; } -// ------------------------------------- -// Formats the arguments Formattable array and copy into the appendTo buffer. -// Ignore the FieldPosition result for error checking. UnicodeString& MessageFormat::format(const Formattable* arguments, const UnicodeString *argumentNames, int32_t cnt, UnicodeString& appendTo, - FieldPosition& status, - int32_t recursionProtection, - UErrorCode& success) const -{ - int32_t lastOffset = 0; - int32_t argumentNumber=0; - if (cnt < 0 || (cnt && arguments == NULL)) { - success = U_ILLEGAL_ARGUMENT_ERROR; + FieldPosition* pos, + UErrorCode& status) const { + if (U_FAILURE(status)) { return appendTo; } - if ( !isArgNumeric && argumentNames== NULL ) { - success = U_ILLEGAL_ARGUMENT_ERROR; - return appendTo; - } - - const Formattable *obj=NULL; - for (int32_t i=0; i= cnt) { - appendTo += LEFT_CURLY_BRACE; - itos(argumentNumber, appendTo); - appendTo += RIGHT_CURLY_BRACE; - continue; - } - obj = arguments+argumentNumber; - } - else { - for (int32_t j=0; jgetType(); - - // Recursively calling the format process only if the current - // format argument refers to either of the following: - // a ChoiceFormat object, a PluralFormat object, a SelectFormat object. - Format* fmt = subformats[i].format; - if (fmt != NULL) { - UnicodeString argNum; - fmt->format(*obj, argNum, success); - - // Needs to reprocess the ChoiceFormat and PluralFormat and SelectFormat option by using the - // MessageFormat pattern application. 
- if ((dynamic_cast(fmt) != NULL || - dynamic_cast(fmt) != NULL || - dynamic_cast(fmt) != NULL) && - argNum.indexOf(LEFT_CURLY_BRACE) >= 0 - ) { - MessageFormat temp(argNum, fLocale, success); - // TODO: Implement recursion protection - if ( isArgNumeric ) { - temp.format(arguments, NULL, cnt, appendTo, status, recursionProtection, success); - } - else { - temp.format(arguments, argumentNames, cnt, appendTo, status, recursionProtection, success); - } - if (U_FAILURE(success)) { - return appendTo; - } - } - else { - appendTo += argNum; - } - } - // If the obj data type is a number, use a NumberFormat instance. - else if ((type == Formattable::kDouble) || - (type == Formattable::kLong) || - (type == Formattable::kInt64)) { - - const NumberFormat* nf = getDefaultNumberFormat(success); - if (nf == NULL) { - return appendTo; - } - if (type == Formattable::kDouble) { - nf->format(obj->getDouble(), appendTo); - } else if (type == Formattable::kLong) { - nf->format(obj->getLong(), appendTo); - } else { - nf->format(obj->getInt64(), appendTo); - } - } - // If the obj data type is a Date instance, use a DateFormat instance. - else if (type == Formattable::kDate) { - const DateFormat* df = getDefaultDateFormat(success); - if (df == NULL) { - return appendTo; - } - df->format(obj->getDate(), appendTo); - } - else if (type == Formattable::kString) { - appendTo += obj->getString(); - } - else { - success = U_ILLEGAL_ARGUMENT_ERROR; - return appendTo; - } - } - // Appends the rest of the pattern characters after the real last offset. - appendTo.append(fPattern, lastOffset, 0x7fffffff); + UnicodeStringAppendable usapp(appendTo); + AppendableWrapper app(usapp); + format(0, 0.0, arguments, argumentNames, cnt, app, pos, status); return appendTo; } +// if argumentNames is NULL, this means arguments is a numeric array. +// arguments can not be NULL. 
+void MessageFormat::format(int32_t msgStart, double pluralNumber, + const Formattable* arguments, + const UnicodeString *argumentNames, + int32_t cnt, + AppendableWrapper& appendTo, + FieldPosition* ignore, + UErrorCode& success) const { + if (U_FAILURE(success)) { + return; + } + const UnicodeString& msgString = msgPattern.getPatternString(); + int32_t prevIndex = msgPattern.getPart(msgStart).getLimit(); + for (int32_t i = msgStart + 1; U_SUCCESS(success) ; ++i) { + const MessagePattern::Part* part = &msgPattern.getPart(i); + const UMessagePatternPartType type = part->getType(); + int32_t index = part->getIndex(); + appendTo.append(msgString, prevIndex, index - prevIndex); + if (type == UMSGPAT_PART_TYPE_MSG_LIMIT) { + return; + } + prevIndex = part->getLimit(); + if (type == UMSGPAT_PART_TYPE_REPLACE_NUMBER) { + const NumberFormat* nf = getDefaultNumberFormat(success); + appendTo.formatAndAppend(nf, Formattable(pluralNumber), success); + continue; + } + if (type != UMSGPAT_PART_TYPE_ARG_START) { + continue; + } + int32_t argLimit = msgPattern.getLimitPartIndex(i); + UMessagePatternArgType argType = part->getArgType(); + part = &msgPattern.getPart(++i); + const Formattable* arg; + UnicodeString noArg; + if (argumentNames == NULL) { + int32_t argNumber = part->getValue(); // ARG_NUMBER + if (0 <= argNumber && argNumber < cnt) { + arg = arguments + argNumber; + } else { + arg = NULL; + noArg.append(LEFT_CURLY_BRACE); + itos(argNumber, noArg); + noArg.append(RIGHT_CURLY_BRACE); + } + } else { + UnicodeString key; + if (part->getType() == UMSGPAT_PART_TYPE_ARG_NAME) { + key = msgPattern.getSubstring(*part); + } else /* UMSGPAT_PART_TYPE_ARG_NUMBER */ { + itos(part->getValue(), key); + } + arg = getArgFromListByName(arguments, argumentNames, cnt, key); + if (arg == NULL) { + noArg.append(LEFT_CURLY_BRACE); + noArg.append(key); + noArg.append(RIGHT_CURLY_BRACE); + } + } + ++i; + int32_t prevDestLength = appendTo.length(); + const Format* formatter = NULL; + if 
(!noArg.isEmpty()) { + appendTo.append(noArg); + } else if (arg == NULL) { + appendTo.append(NULL_STRING, 4); + } else if ((formatter = getCachedFormatter(i -2))) { + // Handles all ArgType.SIMPLE, and formatters from setFormat() and its siblings. + if (dynamic_cast(formatter) || + dynamic_cast(formatter) || + dynamic_cast(formatter)) { + // We only handle nested formats here if they were provided via + // setFormat() or its siblings. Otherwise they are not cached and instead + // handled below according to argType. + UnicodeString subMsgString; + formatter->format(*arg, subMsgString, success); + if (subMsgString.indexOf(LEFT_CURLY_BRACE) >= 0 || + (subMsgString.indexOf(SINGLE_QUOTE) >= 0 && !MessageImpl::jdkAposMode(msgPattern)) + ) { + MessageFormat subMsgFormat(subMsgString, fLocale, success); + subMsgFormat.format(0, 0, arguments, argumentNames, cnt, appendTo, ignore, success); + } else { + appendTo.append(subMsgString); + } + } else { + appendTo.formatAndAppend(formatter, *arg, success); + } + } else if (argType == UMSGPAT_ARG_TYPE_NONE || (cachedFormatters && uhash_iget(cachedFormatters, i - 2))) { + // We arrive here if getCachedFormatter returned NULL, but there was actually an element in the hash table. + // This can only happen if the hash table contained a DummyFormat, so the if statement above is a check + // for the hash table containind DummyFormat. 
+ if (arg->isNumeric()) { + const NumberFormat* nf = getDefaultNumberFormat(success); + appendTo.formatAndAppend(nf, *arg, success); + } else if (arg->getType() == Formattable::kDate) { + const DateFormat* df = getDefaultDateFormat(success); + appendTo.formatAndAppend(df, *arg, success); + } else { + appendTo.append(arg->getString(success)); + } + } else if (argType == UMSGPAT_ARG_TYPE_CHOICE) { + if (!arg->isNumeric()) { + success = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + // We must use the Formattable::getDouble() variant with the UErrorCode parameter + // because only this one converts non-double numeric types to double. + const double number = arg->getDouble(success); + int32_t subMsgStart = ChoiceFormat::findSubMessage(msgPattern, i, number); + formatComplexSubMessage(subMsgStart, 0, arguments, argumentNames, + cnt, appendTo, success); + } else if (argType == UMSGPAT_ARG_TYPE_PLURAL) { + if (!arg->isNumeric()) { + success = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + // We must use the Formattable::getDouble() variant with the UErrorCode parameter + // because only this one converts non-double numeric types to double. + double number = arg->getDouble(success); + int32_t subMsgStart = PluralFormat::findSubMessage(msgPattern, i, pluralProvider, number, + success); + double offset = msgPattern.getPluralOffset(subMsgStart); + formatComplexSubMessage(subMsgStart, number-offset, arguments, argumentNames, + cnt, appendTo, success); + } else if (argType == UMSGPAT_ARG_TYPE_SELECT) { + int32_t subMsgStart = SelectFormat::findSubMessage(msgPattern, i, arg->getString(success), success); + formatComplexSubMessage(subMsgStart, 0, arguments, argumentNames, + cnt, appendTo, success); + } else { + // This should never happen. 
+ success = U_INTERNAL_PROGRAM_ERROR; + return; + } + ignore = updateMetaData(appendTo, prevDestLength, ignore, arg); + prevIndex = msgPattern.getPart(argLimit).getLimit(); + i = argLimit; + } +} + + +void MessageFormat::formatComplexSubMessage(int32_t msgStart, + double pluralNumber, + const Formattable* arguments, + const UnicodeString *argumentNames, + int32_t cnt, + AppendableWrapper& appendTo, + UErrorCode& success) const { + if (U_FAILURE(success)) { + return; + } + + if (!MessageImpl::jdkAposMode(msgPattern)) { + format(msgStart, pluralNumber, arguments, argumentNames, cnt, appendTo, NULL, success); + return; + } + + // JDK compatibility mode: (see JDK MessageFormat.format() API docs) + // - remove SKIP_SYNTAX; that is, remove half of the apostrophes + // - if the result string contains an open curly brace '{' then + // instantiate a temporary MessageFormat object and format again; + // otherwise just append the result string + const UnicodeString& msgString = msgPattern.getPatternString(); + UnicodeString sb; + int32_t prevIndex = msgPattern.getPart(msgStart).getLimit(); + for (int32_t i = msgStart;;) { + const MessagePattern::Part& part = msgPattern.getPart(++i); + const UMessagePatternPartType type = part.getType(); + int32_t index = part.getIndex(); + if (type == UMSGPAT_PART_TYPE_MSG_LIMIT) { + sb.append(msgString, prevIndex, index - prevIndex); + break; + } else if (type == UMSGPAT_PART_TYPE_REPLACE_NUMBER || type == UMSGPAT_PART_TYPE_SKIP_SYNTAX) { + sb.append(msgString, prevIndex, index - prevIndex); + if (type == UMSGPAT_PART_TYPE_REPLACE_NUMBER) { + const NumberFormat* nf = getDefaultNumberFormat(success); + sb.append(nf->format(pluralNumber, sb, success)); + } + prevIndex = part.getLimit(); + } else if (type == UMSGPAT_PART_TYPE_ARG_START) { + sb.append(msgString, prevIndex, index - prevIndex); + prevIndex = index; + i = msgPattern.getLimitPartIndex(i); + index = msgPattern.getPart(i).getLimit(); + MessageImpl::appendReducedApostrophes(msgString, 
prevIndex, index, sb); + prevIndex = index; + } + } + if (sb.indexOf(LEFT_CURLY_BRACE) >= 0) { + MessageFormat subMsgFormat(UnicodeString(), fLocale, success); + subMsgFormat.applyPattern(sb, UMSGPAT_APOS_DOUBLE_REQUIRED, NULL, success); + subMsgFormat.format(0, 0, arguments, argumentNames, cnt, appendTo, NULL, success); + } else { + appendTo.append(sb); + } +} + + +UnicodeString MessageFormat::getLiteralStringUntilNextArgument(int32_t from) const { + const UnicodeString& msgString=msgPattern.getPatternString(); + int32_t prevIndex=msgPattern.getPart(from).getLimit(); + UnicodeString b; + for (int32_t i = from + 1; ; ++i) { + const MessagePattern::Part& part = msgPattern.getPart(i); + const UMessagePatternPartType type=part.getType(); + int32_t index=part.getIndex(); + b.append(msgString, prevIndex, index - prevIndex); + if(type==UMSGPAT_PART_TYPE_ARG_START || type==UMSGPAT_PART_TYPE_MSG_LIMIT) { + return b; + } + // Unexpected Part "part" in parsed message. + U_ASSERT(type==UMSGPAT_PART_TYPE_SKIP_SYNTAX || type==UMSGPAT_PART_TYPE_INSERT_CHAR); + prevIndex=part.getLimit(); + } +} + + +FieldPosition* MessageFormat::updateMetaData(AppendableWrapper& /*dest*/, int32_t /*prevLength*/, + FieldPosition* /*fp*/, const Formattable* /*argId*/) const { + // Unlike in Java, there are no field attributes defined for MessageFormat. Do nothing. + return NULL; + /* + if (fp != NULL && Field.ARGUMENT.equals(fp.getFieldAttribute())) { + fp->setBeginIndex(prevLength); + fp->setEndIndex(dest.get_length()); + return NULL; + } + return fp; + */ +} + +void MessageFormat::copyObjects(const MessageFormat& that, UErrorCode& ec) { + // Deep copy pointer fields. + // We need not copy the formatAliases because they are re-filled + // in each getFormats() call. + // The defaultNumberFormat, defaultDateFormat and pluralProvider.rules + // also get created on demand. 
+ argTypeCount = that.argTypeCount; + if (argTypeCount > 0) { + if (!allocateArgTypes(argTypeCount, ec)) { + return; + } + uprv_memcpy(argTypes, that.argTypes, argTypeCount * sizeof(argTypes[0])); + } + if (cachedFormatters != NULL) { + uhash_removeAll(cachedFormatters); + } + if (customFormatArgStarts != NULL) { + uhash_removeAll(customFormatArgStarts); + } + if (that.cachedFormatters) { + if (cachedFormatters == NULL) { + cachedFormatters=uhash_open(uhash_hashLong, uhash_compareLong, + equalFormatsForHash, &ec); + if (U_FAILURE(ec)) { + return; + } + uhash_setValueDeleter(cachedFormatters, uhash_deleteUObject); + } + + const int32_t count = uhash_count(that.cachedFormatters); + int32_t pos, idx; + for (idx = 0, pos = -1; idx < count && U_SUCCESS(ec); ++idx) { + const UHashElement* cur = uhash_nextElement(that.cachedFormatters, &pos); + Format* newFormat = ((Format*)(cur->value.pointer))->clone(); + if (newFormat) { + uhash_iput(cachedFormatters, cur->key.integer, newFormat, &ec); + } else { + ec = U_MEMORY_ALLOCATION_ERROR; + return; + } + } + } + if (that.customFormatArgStarts) { + if (customFormatArgStarts == NULL) { + customFormatArgStarts=uhash_open(uhash_hashLong, uhash_compareLong, + NULL, &ec); + } + const int32_t count = uhash_count(that.customFormatArgStarts); + int32_t pos, idx; + for (idx = 0, pos = -1; idx < count && U_SUCCESS(ec); ++idx) { + const UHashElement* cur = uhash_nextElement(that.customFormatArgStarts, &pos); + uhash_iputi(customFormatArgStarts, cur->key.integer, cur->value.integer, &ec); + } + } +} + + +Formattable* +MessageFormat::parse(int32_t msgStart, + const UnicodeString& source, + ParsePosition& pos, + int32_t& count, + UErrorCode& ec) const { + count = 0; + if (U_FAILURE(ec)) { + pos.setErrorIndex(pos.getIndex()); + return NULL; + } + // parse() does not work with named arguments. 
+ if (msgPattern.hasNamedArguments()) { + ec = U_ARGUMENT_TYPE_MISMATCH; + pos.setErrorIndex(pos.getIndex()); + return NULL; + } + LocalArray resultArray(new Formattable[argTypeCount ? argTypeCount : 1]); + const UnicodeString& msgString=msgPattern.getPatternString(); + int32_t prevIndex=msgPattern.getPart(msgStart).getLimit(); + int32_t sourceOffset = pos.getIndex(); + ParsePosition tempStatus(0); + + for(int32_t i=msgStart+1; ; ++i) { + UBool haveArgResult = FALSE; + const MessagePattern::Part* part=&msgPattern.getPart(i); + const UMessagePatternPartType type=part->getType(); + int32_t index=part->getIndex(); + // Make sure the literal string matches. + int32_t len = index - prevIndex; + if (len == 0 || (0 == msgString.compare(prevIndex, len, source, sourceOffset, len))) { + sourceOffset += len; + prevIndex += len; + } else { + pos.setErrorIndex(sourceOffset); + return NULL; // leave index as is to signal error + } + if(type==UMSGPAT_PART_TYPE_MSG_LIMIT) { + // Things went well! Done. + pos.setIndex(sourceOffset); + return resultArray.orphan(); + } + if(type==UMSGPAT_PART_TYPE_SKIP_SYNTAX || type==UMSGPAT_PART_TYPE_INSERT_CHAR) { + prevIndex=part->getLimit(); + continue; + } + // We do not support parsing Plural formats. (No REPLACE_NUMBER here.) + // Unexpected Part "part" in parsed message. + U_ASSERT(type==UMSGPAT_PART_TYPE_ARG_START); + int32_t argLimit=msgPattern.getLimitPartIndex(i); + + UMessagePatternArgType argType=part->getArgType(); + part=&msgPattern.getPart(++i); + int32_t argNumber = part->getValue(); // ARG_NUMBER + UnicodeString key; + ++i; + const Format* formatter = NULL; + Formattable& argResult = resultArray[argNumber]; + + if(cachedFormatters!=NULL && (formatter = getCachedFormatter(i - 2))!=NULL) { + // Just parse using the formatter. 
+ tempStatus.setIndex(sourceOffset); + formatter->parseObject(source, argResult, tempStatus); + if (tempStatus.getIndex() == sourceOffset) { + pos.setErrorIndex(sourceOffset); + return NULL; // leave index as is to signal error + } + sourceOffset = tempStatus.getIndex(); + haveArgResult = TRUE; + } else if( + argType==UMSGPAT_ARG_TYPE_NONE || (cachedFormatters && uhash_iget(cachedFormatters, i -2))) { + // We arrive here if getCachedFormatter returned NULL, but there was actually an element in the hash table. + // This can only happen if the hash table contained a DummyFormat, so the if statement above is a check + // for the hash table containind DummyFormat. + + // Match as a string. + // if at end, use longest possible match + // otherwise uses first match to intervening string + // does NOT recursively try all possibilities + UnicodeString stringAfterArgument = getLiteralStringUntilNextArgument(argLimit); + int32_t next; + if (!stringAfterArgument.isEmpty()) { + next = source.indexOf(stringAfterArgument, sourceOffset); + } else { + next = source.length(); + } + if (next < 0) { + pos.setErrorIndex(sourceOffset); + return NULL; // leave index as is to signal error + } else { + UnicodeString strValue(source.tempSubString(sourceOffset, next - sourceOffset)); + UnicodeString compValue; + compValue.append(LEFT_CURLY_BRACE); + itos(argNumber, compValue); + compValue.append(RIGHT_CURLY_BRACE); + if (0 != strValue.compare(compValue)) { + argResult.setString(strValue); + haveArgResult = TRUE; + } + sourceOffset = next; + } + } else if(argType==UMSGPAT_ARG_TYPE_CHOICE) { + tempStatus.setIndex(sourceOffset); + double choiceResult = ChoiceFormat::parseArgument(msgPattern, i, source, tempStatus); + if (tempStatus.getIndex() == sourceOffset) { + pos.setErrorIndex(sourceOffset); + return NULL; // leave index as is to signal error + } + argResult.setDouble(choiceResult); + haveArgResult = TRUE; + sourceOffset = tempStatus.getIndex(); + } else if(argType==UMSGPAT_ARG_TYPE_PLURAL 
|| argType==UMSGPAT_ARG_TYPE_SELECT) { + // Parsing not supported. + ec = U_UNSUPPORTED_ERROR; + return NULL; + } else { + // This should never happen. + ec = U_INTERNAL_PROGRAM_ERROR; + return NULL; + } + if (haveArgResult && count <= argNumber) { + count = argNumber + 1; + } + prevIndex=msgPattern.getPart(argLimit).getLimit(); + i=argLimit; + } +} // ------------------------------------- // Parses the source pattern and returns the Formattable objects array, // the array count and the ending parse position. The caller of this method @@ -1314,106 +1366,9 @@ MessageFormat::format(const Formattable* arguments, Formattable* MessageFormat::parse(const UnicodeString& source, ParsePosition& pos, - int32_t& count) const -{ - // Allocate at least one element. Allocating an array of length - // zero causes problems on some platforms (e.g. Win32). - Formattable *resultArray = new Formattable[argTypeCount ? argTypeCount : 1]; - int32_t patternOffset = 0; - int32_t sourceOffset = pos.getIndex(); - ParsePosition tempPos(0); - count = 0; // {sfb} reset to zero - int32_t len; - // If resultArray could not be created, exit out. - // Avoid crossing initialization of variables above. 
- if (resultArray == NULL) { - goto PARSE_ERROR; - } - for (int32_t i = 0; i < subformatCount; ++i) { - // match up to format - len = subformats[i].offset - patternOffset; - if (len == 0 || - fPattern.compare(patternOffset, len, source, sourceOffset, len) == 0) { - sourceOffset += len; - patternOffset += len; - } - else { - goto PARSE_ERROR; - } - - // now use format - Format* fmt = subformats[i].format; - int32_t argNum = subformats[i].argNum; - if (fmt == NULL) { // string format - // if at end, use longest possible match - // otherwise uses first match to intervening string - // does NOT recursively try all possibilities - int32_t tempLength = (i+1= tempLength) { - next = source.length(); - } - else { - UnicodeString buffer; - fPattern.extract(patternOffset,tempLength - patternOffset, buffer); - next = source.indexOf(buffer, sourceOffset); - } - - if (next < 0) { - goto PARSE_ERROR; - } - else { - UnicodeString buffer; - source.extract(sourceOffset,next - sourceOffset, buffer); - UnicodeString strValue = buffer; - UnicodeString temp(LEFT_CURLY_BRACE); - // {sfb} check this later - if (isArgNumeric) { - itos(argNum, temp); - } - else { - temp+=(*subformats[i].argName); - } - temp += RIGHT_CURLY_BRACE; - if (strValue != temp) { - source.extract(sourceOffset,next - sourceOffset, buffer); - resultArray[argNum].setString(buffer); - // {sfb} not sure about this - if ((argNum + 1) > count) { - count = argNum + 1; - } - } - sourceOffset = next; - } - } - else { - tempPos.setIndex(sourceOffset); - fmt->parseObject(source, resultArray[argNum], tempPos); - if (tempPos.getIndex() == sourceOffset) { - goto PARSE_ERROR; - } - - if ((argNum + 1) > count) { - count = argNum + 1; - } - sourceOffset = tempPos.getIndex(); // update - } - } - len = fPattern.length() - patternOffset; - if (len == 0 || - fPattern.compare(patternOffset, len, source, sourceOffset, len) == 0) { - pos.setIndex(sourceOffset + len); - return resultArray; - } - // else fall through... 
- - PARSE_ERROR: - pos.setErrorIndex(sourceOffset); - delete [] resultArray; - count = 0; - return NULL; // leave index as is to signal error + int32_t& count) const { + UErrorCode ec = U_ZERO_ERROR; + return parse(0, source, pos, count, ec); } // ------------------------------------- @@ -1426,7 +1381,7 @@ MessageFormat::parse(const UnicodeString& source, int32_t& cnt, UErrorCode& success) const { - if (!isArgNumeric ) { + if (msgPattern.hasNamedArguments()) { success = U_ARGUMENT_TYPE_MISMATCH; return NULL; } @@ -1458,23 +1413,23 @@ MessageFormat::parseObject( const UnicodeString& source, UnicodeString MessageFormat::autoQuoteApostrophe(const UnicodeString& pattern, UErrorCode& status) { - UnicodeString result; - if (U_SUCCESS(status)) { - int32_t plen = pattern.length(); - const UChar* pat = pattern.getBuffer(); - int32_t blen = plen * 2 + 1; // space for null termination, convenience - UChar* buf = result.getBuffer(blen); - if (buf == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - } else { - int32_t len = umsg_autoQuoteApostrophe(pat, plen, buf, blen, &status); - result.releaseBuffer(U_SUCCESS(status) ? len : 0); + UnicodeString result; + if (U_SUCCESS(status)) { + int32_t plen = pattern.length(); + const UChar* pat = pattern.getBuffer(); + int32_t blen = plen * 2 + 1; // space for null termination, convenience + UChar* buf = result.getBuffer(blen); + if (buf == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } else { + int32_t len = umsg_autoQuoteApostrophe(pat, plen, buf, blen, &status); + result.releaseBuffer(U_SUCCESS(status) ? 
len : 0); + } } - } - if (U_FAILURE(status)) { - result.setToBogus(); - } - return result; + if (U_FAILURE(status)) { + result.setToBogus(); + } + return result; } // ------------------------------------- @@ -1490,64 +1445,116 @@ static Format* makeRBNF(URBNFRuleSetTag tag, const Locale& locale, const Unicode return fmt; } -/** - * Reads the segments[] array (see applyPattern()) and parses the - * segments[1..3] into a Format* object. Stores the format object in - * the subformats[] array. Updates the argTypes[] array type - * information for the corresponding argument. - * - * @param formatNumber index into subformats[] for this format - * @param segments array of strings with the parsed pattern segments - * @param parseError parse error data (output param) - * @param ec error code - */ -void -MessageFormat::makeFormat(int32_t formatNumber, - UnicodeString* segments, - UParseError& parseError, - UErrorCode& ec) { - if (U_FAILURE(ec)) { +void MessageFormat::cacheExplicitFormats(UErrorCode& status) { + if (U_FAILURE(status)) { return; } - // Parse the argument number - int32_t argumentNumber = stou(segments[1]); // always unlocalized! - UnicodeString argumentName; - if (argumentNumber < 0) { - if ( (isArgNumeric==TRUE) && (formatNumber !=0) ) { - ec = U_INVALID_FORMAT_ERROR; - return; - } - isArgNumeric = FALSE; - argumentNumber=formatNumber; + if (cachedFormatters != NULL) { + uhash_removeAll(cachedFormatters); } - if (!isArgNumeric) { - if ( !isLegalArgName(segments[1]) ) { - ec = U_INVALID_FORMAT_ERROR; - return; - } - argumentName = segments[1]; + if (customFormatArgStarts != NULL) { + uhash_removeAll(customFormatArgStarts); } - // Parse the format, recording the argument type and creating a - // new Format object (except for string arguments). - Formattable::Type argType; - Format *fmt = NULL; + // The last two "parts" can at most be ARG_LIMIT and MSG_LIMIT + // which we need not examine. 
+ int32_t limit = msgPattern.countParts() - 2; + argTypeCount = 0; + // We also need not look at the first two "parts" + // (at most MSG_START and ARG_START) in this loop. + // We determine the argTypeCount first so that we can allocateArgTypes + // so that the next loop can set argTypes[argNumber]. + // (This is for the C API which needs the argTypes to read its va_arg list.) + for (int32_t i = 2; i < limit && U_SUCCESS(status); ++i) { + const MessagePattern::Part& part = msgPattern.getPart(i); + if (part.getType() == UMSGPAT_PART_TYPE_ARG_NUMBER) { + const int argNumber = part.getValue(); + if (argNumber >= argTypeCount) { + argTypeCount = argNumber + 1; + } + } + } + if (!allocateArgTypes(argTypeCount, status)) { + return; + } + // Set all argTypes to kObject, as a "none" value, for lack of any better value. + // We never use kObject for real arguments. + for (int32_t i = 0; i < argTypeCount; ++i) { + argTypes[i] = Formattable::kObject; + } + hasArgTypeConflicts = FALSE; + + // This loop starts at part index 1 because we do need to examine + // ARG_START parts. (But we can ignore the MSG_START.) 
+ for (int32_t i = 1; i < limit && U_SUCCESS(status); ++i) { + const MessagePattern::Part* part = &msgPattern.getPart(i); + if (part->getType() != UMSGPAT_PART_TYPE_ARG_START) { + continue; + } + UMessagePatternArgType argType = part->getArgType(); + + int32_t argNumber = -1; + part = &msgPattern.getPart(i + 1); + if (part->getType() == UMSGPAT_PART_TYPE_ARG_NUMBER) { + argNumber = part->getValue(); + } + Formattable::Type formattableType; + + switch (argType) { + case UMSGPAT_ARG_TYPE_NONE: + formattableType = Formattable::kString; + break; + case UMSGPAT_ARG_TYPE_SIMPLE: { + int32_t index = i; + i += 2; + UnicodeString explicitType = msgPattern.getSubstring(msgPattern.getPart(i++)); + UnicodeString style; + if ((part = &msgPattern.getPart(i))->getType() == UMSGPAT_PART_TYPE_ARG_STYLE) { + style = msgPattern.getSubstring(*part); + ++i; + } + UParseError parseError; + Format* formatter = createAppropriateFormat(explicitType, style, formattableType, parseError, status); + setArgStartFormat(index, formatter, status); + break; + } + case UMSGPAT_ARG_TYPE_CHOICE: + case UMSGPAT_ARG_TYPE_PLURAL: + formattableType = Formattable::kDouble; + break; + case UMSGPAT_ARG_TYPE_SELECT: + formattableType = Formattable::kString; + break; + default: + status = U_INTERNAL_PROGRAM_ERROR; // Should be unreachable. 
+ formattableType = Formattable::kString; + break; + } + if (argNumber != -1) { + if (argTypes[argNumber] != Formattable::kObject && argTypes[argNumber] != formattableType) { + hasArgTypeConflicts = TRUE; + } + argTypes[argNumber] = formattableType; + } + } +} + + +Format* MessageFormat::createAppropriateFormat(UnicodeString& type, UnicodeString& style, + Formattable::Type& formattableType, UParseError& parseError, + UErrorCode& ec) { + if (U_FAILURE(ec)) { + return NULL; + } + Format* fmt = NULL; int32_t typeID, styleID; - DateFormat::EStyle style; - UnicodeString unquotedPattern, quotedPattern; - UBool inQuote = FALSE; + DateFormat::EStyle date_style; - switch (typeID = findKeyword(segments[2], TYPE_IDS)) { - - case 0: // string - argType = Formattable::kString; - break; - - case 1: // number - argType = Formattable::kDouble; - - switch (findKeyword(segments[3], NUMBER_STYLE_IDS)) { + switch (typeID = findKeyword(type, TYPE_IDS)) { + case 0: // number + formattableType = Formattable::kDouble; + switch (findKeyword(style, NUMBER_STYLE_IDS)) { case 0: // default fmt = NumberFormat::createInstance(fLocale, ec); break; @@ -1558,7 +1565,7 @@ MessageFormat::makeFormat(int32_t formatNumber, fmt = NumberFormat::createPercentInstance(fLocale, ec); break; case 3: // integer - argType = Formattable::kLong; + formattableType = Formattable::kLong; fmt = createIntegerFormat(fLocale, ec); break; default: // pattern @@ -1566,131 +1573,70 @@ MessageFormat::makeFormat(int32_t formatNumber, if (fmt) { DecimalFormat* decfmt = dynamic_cast(fmt); if (decfmt != NULL) { - decfmt->applyPattern(segments[3],parseError,ec); + decfmt->applyPattern(style,parseError,ec); } } break; } break; - case 2: // date - case 3: // time - argType = Formattable::kDate; - styleID = findKeyword(segments[3], DATE_STYLE_IDS); - style = (styleID >= 0) ? 
DATE_STYLES[styleID] : DateFormat::kDefault; + case 1: // date + case 2: // time + formattableType = Formattable::kDate; + styleID = findKeyword(style, DATE_STYLE_IDS); + date_style = (styleID >= 0) ? DATE_STYLES[styleID] : DateFormat::kDefault; - if (typeID == 2) { - fmt = DateFormat::createDateInstance(style, fLocale); + if (typeID == 1) { + fmt = DateFormat::createDateInstance(date_style, fLocale); } else { - fmt = DateFormat::createTimeInstance(style, fLocale); + fmt = DateFormat::createTimeInstance(date_style, fLocale); } if (styleID < 0 && fmt != NULL) { SimpleDateFormat* sdtfmt = dynamic_cast(fmt); if (sdtfmt != NULL) { - sdtfmt->applyPattern(segments[3]); + sdtfmt->applyPattern(style); } } break; - case 4: // choice - argType = Formattable::kDouble; - - fmt = new ChoiceFormat(segments[3], parseError, ec); + case 3: // spellout + formattableType = Formattable::kDouble; + fmt = makeRBNF(URBNF_SPELLOUT, fLocale, style, ec); break; - - case 5: // spellout - argType = Formattable::kDouble; - fmt = makeRBNF(URBNF_SPELLOUT, fLocale, segments[3], ec); + case 4: // ordinal + formattableType = Formattable::kDouble; + fmt = makeRBNF(URBNF_ORDINAL, fLocale, style, ec); break; - case 6: // ordinal - argType = Formattable::kDouble; - fmt = makeRBNF(URBNF_ORDINAL, fLocale, segments[3], ec); - break; - case 7: // duration - argType = Formattable::kDouble; - fmt = makeRBNF(URBNF_DURATION, fLocale, segments[3], ec); - break; - case 8: // plural - case 9: // Select - if(typeID == 8) - argType = Formattable::kDouble; - else - argType = Formattable::kString; - quotedPattern = segments[3]; - for (int32_t i = 0; i < quotedPattern.length(); ++i) { - UChar ch = quotedPattern.charAt(i); - if (ch == SINGLE_QUOTE) { - if (i+1 < quotedPattern.length() && quotedPattern.charAt(i+1)==SINGLE_QUOTE) { - unquotedPattern+=ch; - ++i; - } - else { - inQuote = !inQuote; - } - } - else { - unquotedPattern += ch; - } - } - if(typeID == 8) - fmt = new PluralFormat(fLocale, unquotedPattern, ec); - 
else - fmt = new SelectFormat(unquotedPattern, ec); + case 5: // duration + formattableType = Formattable::kDouble; + fmt = makeRBNF(URBNF_DURATION, fLocale, style, ec); break; default: - argType = Formattable::kString; + formattableType = Formattable::kString; ec = U_ILLEGAL_ARGUMENT_ERROR; break; } - if (fmt==NULL && argType!=Formattable::kString && U_SUCCESS(ec)) { - ec = U_MEMORY_ALLOCATION_ERROR; - } - - if (!allocateSubformats(formatNumber+1) || - !allocateArgTypes(argumentNumber+1)) { - ec = U_MEMORY_ALLOCATION_ERROR; - } - - if (U_FAILURE(ec)) { - delete fmt; - return; - } - - // Parse succeeded; record results in our arrays - subformats[formatNumber].format = fmt; - subformats[formatNumber].offset = segments[0].length(); - if (isArgNumeric) { - subformats[formatNumber].argName = NULL; - subformats[formatNumber].argNum = argumentNumber; - } - else { - subformats[formatNumber].argName = new UnicodeString(argumentName); - subformats[formatNumber].argNum = -1; - } - subformatCount = formatNumber+1; - - // Careful here: argumentNumber may in general arrive out of - // sequence, e.g., "There was {2} on {0,date} (see {1,number})." - argTypes[argumentNumber] = argType; - if (argumentNumber+1 > argTypeCount) { - argTypeCount = argumentNumber+1; - } + return fmt; } -// ------------------------------------- + +//------------------------------------- // Finds the string, s, in the string array, list. int32_t MessageFormat::findKeyword(const UnicodeString& s, const UChar * const *list) { - if (s.length() == 0) + if (s.isEmpty()) { return 0; // default + } - UnicodeString buffer = s; + int32_t length = s.length(); + const UChar *ps = PatternProps::trimWhiteSpace(s.getBuffer(), length); + UnicodeString buffer(FALSE, ps, length); // Trims the space characters and turns all characters // in s to lower case. 
- buffer.trim().toLower(""); + buffer.toLower(""); for (int32_t i = 0; list[i]; ++i) { if (!buffer.compare(list[i], u_strlen(list[i]))) { return i; @@ -1699,48 +1645,6 @@ int32_t MessageFormat::findKeyword(const UnicodeString& s, return -1; } -// ------------------------------------- -// Checks the range of the source text to quote the special -// characters, { and ' and copy to target buffer. - -void -MessageFormat::copyAndFixQuotes(const UnicodeString& source, - int32_t start, - int32_t end, - UnicodeString& appendTo) -{ - UBool gotLB = FALSE; - - for (int32_t i = start; i < end; ++i) { - UChar ch = source[i]; - if (ch == LEFT_CURLY_BRACE) { - appendTo += SINGLE_QUOTE; - appendTo += LEFT_CURLY_BRACE; - appendTo += SINGLE_QUOTE; - gotLB = TRUE; - } - else if (ch == RIGHT_CURLY_BRACE) { - if(gotLB) { - appendTo += RIGHT_CURLY_BRACE; - gotLB = FALSE; - } - else { - // orig code. - appendTo += SINGLE_QUOTE; - appendTo += RIGHT_CURLY_BRACE; - appendTo += SINGLE_QUOTE; - } - } - else if (ch == SINGLE_QUOTE) { - appendTo += SINGLE_QUOTE; - appendTo += SINGLE_QUOTE; - } - else { - appendTo += ch; - } - } -} - /** * Convenience method that ought to be in NumberFormat */ @@ -1798,27 +1702,43 @@ const DateFormat* MessageFormat::getDefaultDateFormat(UErrorCode& ec) const { UBool MessageFormat::usesNamedArguments() const { - return !isArgNumeric; -} - -UBool -MessageFormat::isLegalArgName(const UnicodeString& argName) const { - if(!u_hasBinaryProperty(argName.charAt(0), idStart)) { - return FALSE; - } - for (int32_t i=1; isize(); + return (fFormatNames==NULL) ? 
0 : fFormatNames->size(); } FormatNameEnumeration::~FormatNameEnumeration() { - UnicodeString *s; - for (int32_t i=0; isize(); ++i) { - if ((s=(UnicodeString *)fFormatNames->elementAt(i))!=NULL) { - delete s; - } - } delete fFormatNames; } + + +MessageFormat::PluralSelectorProvider::PluralSelectorProvider(const Locale* loc) + : locale(loc), rules(NULL) { +} + +MessageFormat::PluralSelectorProvider::~PluralSelectorProvider() { + // We own the rules but not the locale. + delete rules; +} + +UnicodeString MessageFormat::PluralSelectorProvider::select(double number, UErrorCode& ec) const { + if (U_FAILURE(ec)) { + return UnicodeString(FALSE, OTHER_STRING, 5); + } + MessageFormat::PluralSelectorProvider* t = const_cast(this); + if(rules == NULL) { + t->rules = PluralRules::forLocale(*locale, ec); + if (U_FAILURE(ec)) { + return UnicodeString(FALSE, OTHER_STRING, 5); + } + } + return rules->select(number); +} + +void MessageFormat::PluralSelectorProvider::reset(const Locale* loc) { + locale = loc; + delete rules; + rules = NULL; +} + + U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/plurfmt.cpp b/icu4c/source/i18n/plurfmt.cpp index 0464cf57c66..980eb01a76a 100644 --- a/icu4c/source/i18n/plurfmt.cpp +++ b/icu4c/source/i18n/plurfmt.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 2009, International Business Machines Corporation and +* Copyright (C) 2009-2011, International Business Machines Corporation and * others. All Rights Reserved. 
******************************************************************************* * @@ -12,238 +12,161 @@ ******************************************************************************* */ - -#include "unicode/utypes.h" +#include "unicode/messagepattern.h" #include "unicode/plurfmt.h" #include "unicode/plurrule.h" +#include "unicode/utypes.h" +#include "cmemory.h" +#include "messageimpl.h" #include "plurrule_impl.h" +#include "uassert.h" +#include "uhash.h" #if !UCONFIG_NO_FORMATTING U_NAMESPACE_BEGIN -U_CDECL_BEGIN -static void U_CALLCONV -deleteHashStrings(void *obj) { - delete (UnicodeString *)obj; -} -U_CDECL_END +static const UChar OTHER_STRING[] = { + 0x6F, 0x74, 0x68, 0x65, 0x72, 0 // "other" +}; UOBJECT_DEFINE_RTTI_IMPLEMENTATION(PluralFormat) -#define MAX_KEYWORD_SIZE 30 - -PluralFormat::PluralFormat(UErrorCode& status) { - init(NULL, Locale::getDefault(), status); +PluralFormat::PluralFormat(UErrorCode& status) + : locale(Locale::getDefault()), + msgPattern(status), + numberFormat(NULL), + offset(0) { + init(NULL, status); } -PluralFormat::PluralFormat(const Locale& loc, UErrorCode& status) { - init(NULL, loc, status); +PluralFormat::PluralFormat(const Locale& loc, UErrorCode& status) + : locale(loc), + msgPattern(status), + numberFormat(NULL), + offset(0) { + init(NULL, status); } -PluralFormat::PluralFormat(const PluralRules& rules, UErrorCode& status) { - init(&rules, Locale::getDefault(), status); +PluralFormat::PluralFormat(const PluralRules& rules, UErrorCode& status) + : locale(Locale::getDefault()), + msgPattern(status), + numberFormat(NULL), + offset(0) { + init(&rules, status); } -PluralFormat::PluralFormat(const Locale& loc, const PluralRules& rules, UErrorCode& status) { - init(&rules, loc, status); +PluralFormat::PluralFormat(const Locale& loc, + const PluralRules& rules, + UErrorCode& status) + : locale(loc), + msgPattern(status), + numberFormat(NULL), + offset(0) { + init(&rules, status); } -PluralFormat::PluralFormat(const UnicodeString& 
pat, UErrorCode& status) { - init(NULL, Locale::getDefault(), status); +PluralFormat::PluralFormat(const UnicodeString& pat, + UErrorCode& status) + : locale(Locale::getDefault()), + msgPattern(status), + numberFormat(NULL), + offset(0) { + init(NULL, status); applyPattern(pat, status); } -PluralFormat::PluralFormat(const Locale& loc, const UnicodeString& pat, UErrorCode& status) { - init(NULL, loc, status); +PluralFormat::PluralFormat(const Locale& loc, + const UnicodeString& pat, + UErrorCode& status) + : locale(loc), + msgPattern(status), + numberFormat(NULL), + offset(0) { + init(NULL, status); applyPattern(pat, status); } -PluralFormat::PluralFormat(const PluralRules& rules, const UnicodeString& pat, UErrorCode& status) { - init(&rules, Locale::getDefault(), status); +PluralFormat::PluralFormat(const PluralRules& rules, + const UnicodeString& pat, + UErrorCode& status) + : locale(Locale::getDefault()), + msgPattern(status), + numberFormat(NULL), + offset(0) { + init(&rules, status); applyPattern(pat, status); } -PluralFormat::PluralFormat(const Locale& loc, const PluralRules& rules, const UnicodeString& pat, UErrorCode& status) { - init(&rules, loc, status); +PluralFormat::PluralFormat(const Locale& loc, + const PluralRules& rules, + const UnicodeString& pat, + UErrorCode& status) + : locale(loc), + msgPattern(status), + numberFormat(NULL), + offset(0) { + init(&rules, status); applyPattern(pat, status); } -PluralFormat::PluralFormat(const PluralFormat& other) : Format(other) { +PluralFormat::PluralFormat(const PluralFormat& other) + : Format(other), + locale(other.locale), + msgPattern(other.msgPattern), + numberFormat(NULL), + offset(other.offset) { + copyObjects(other); +} + +void +PluralFormat::copyObjects(const PluralFormat& other) { UErrorCode status = U_ZERO_ERROR; - locale = other.locale; - pluralRules = other.pluralRules->clone(); - pattern = other.pattern; - copyHashtable(other.fParsedValuesHash, status); - if (U_FAILURE(status)) { - delete 
pluralRules; - pluralRules = NULL; - return; + if (other.numberFormat == NULL) { + numberFormat = NumberFormat::createInstance(locale, status); + } else { + numberFormat = (NumberFormat*)other.numberFormat->clone(); } - numberFormat=NumberFormat::createInstance(locale, status); - if (U_FAILURE(status)) { - delete pluralRules; - pluralRules = NULL; - delete fParsedValuesHash; - fParsedValuesHash = NULL; - return; + if (other.pluralRulesWrapper.pluralRules == NULL) { + pluralRulesWrapper.pluralRules = PluralRules::forLocale(locale, status); + } else { + pluralRulesWrapper.pluralRules = other.pluralRulesWrapper.pluralRules->clone(); } - replacedNumberFormat=other.replacedNumberFormat; } + PluralFormat::~PluralFormat() { - delete pluralRules; - delete fParsedValuesHash; delete numberFormat; } void -PluralFormat::init(const PluralRules* rules, const Locale& curLocale, UErrorCode& status) { +PluralFormat::init(const PluralRules* rules, UErrorCode& status) { if (U_FAILURE(status)) { return; } - locale = curLocale; - if ( rules==NULL) { - pluralRules = PluralRules::forLocale(locale, status); - if (U_FAILURE(status)) { + + if (rules==NULL) { + pluralRulesWrapper.pluralRules = PluralRules::forLocale(locale, status); + } else { + pluralRulesWrapper.pluralRules = rules->clone(); + if (pluralRulesWrapper.pluralRules == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; return; } } - else { - pluralRules = rules->clone(); - } - fParsedValuesHash=NULL; - pattern.remove(); - numberFormat= NumberFormat::createInstance(curLocale, status); - if (U_FAILURE(status)) { - delete pluralRules; - pluralRules = NULL; - return; - } - replacedNumberFormat=NULL; + + numberFormat= NumberFormat::createInstance(locale, status); } void PluralFormat::applyPattern(const UnicodeString& newPattern, UErrorCode& status) { + msgPattern.parsePluralStyle(newPattern, NULL, status); if (U_FAILURE(status)) { + msgPattern.clear(); + offset = 0; return; } - this->pattern = newPattern; - UnicodeString token; - int32_t 
braceCount=0; - fmtToken type; - UBool spaceIncluded=FALSE; - - if (fParsedValuesHash==NULL) { - fParsedValuesHash = new Hashtable(TRUE, status); - if (U_FAILURE(status)) { - return; - } - fParsedValuesHash->setValueDeleter(deleteHashStrings); - } - - UBool getKeyword=TRUE; - UnicodeString hashKeyword; - UnicodeString *hashPattern; - - for (int32_t i=0; iget(token)!= NULL) { - status = U_DUPLICATE_KEYWORD; - return; - } - if (token.length()==0) { - status = U_PATTERN_SYNTAX_ERROR; - return; - } - if (!pluralRules->isKeyword(token)) { - status = U_UNDEFINED_KEYWORD; - return; - } - hashKeyword = token; - getKeyword = FALSE; - token.remove(); - } - else { - if (braceCount==0) { - status = U_UNEXPECTED_TOKEN; - return; - } - else { - token += ch; - } - } - braceCount++; - spaceIncluded = FALSE; - break; - case tRightBrace: - if ( getKeyword ) { - status = U_UNEXPECTED_TOKEN; - return; - } - else { - hashPattern = new UnicodeString(token); - fParsedValuesHash->put(hashKeyword, hashPattern, status); - if (U_FAILURE(status)) { - return; - } - braceCount--; - if ( braceCount==0 ) { - getKeyword=TRUE; - hashKeyword.remove(); - hashPattern=NULL; - token.remove(); - } - else { - token += ch; - } - } - spaceIncluded = FALSE; - break; - case tLetter: - case tNumberSign: - if (spaceIncluded) { - status = U_PATTERN_SYNTAX_ERROR; - return; - } - default: - token+=ch; - break; - } - } - if ( checkSufficientDefinition() ) { - return; - } - else { - status = U_DEFAULT_KEYWORD_MISSING; - return; - } + offset = msgPattern.getPluralOffset(0); } UnicodeString& @@ -253,20 +176,10 @@ PluralFormat::format(const Formattable& obj, UErrorCode& status) const { if (U_FAILURE(status)) return appendTo; - int32_t number; - - switch (obj.getType()) - { - case Formattable::kDouble: - return format((int32_t)obj.getDouble(), appendTo, pos, status); - break; - case Formattable::kLong: - number = (int32_t)obj.getLong(); - return format(number, appendTo, pos, status); - break; - case Formattable::kInt64: 
- return format((int32_t)obj.getInt64(), appendTo, pos, status); - default: + + if (obj.isNumeric()) { + return format(obj.getDouble(), appendTo, pos, status); + } else { status = U_ILLEGAL_ARGUMENT_ERROR; return appendTo; } @@ -274,30 +187,22 @@ PluralFormat::format(const Formattable& obj, UnicodeString PluralFormat::format(int32_t number, UErrorCode& status) const { - if (U_FAILURE(status)) { - return UnicodeString(); - } FieldPosition fpos(0); UnicodeString result; - return format(number, result, fpos, status); } UnicodeString PluralFormat::format(double number, UErrorCode& status) const { - if (U_FAILURE(status)) { - return UnicodeString(); - } FieldPosition fpos(0); UnicodeString result; - return format(number, result, fpos, status); } UnicodeString& PluralFormat::format(int32_t number, - UnicodeString& appendTo, + UnicodeString& appendTo, FieldPosition& pos, UErrorCode& status) const { return format((double)number, appendTo, pos, status); @@ -305,101 +210,82 @@ PluralFormat::format(int32_t number, UnicodeString& PluralFormat::format(double number, - UnicodeString& appendTo, + UnicodeString& appendTo, FieldPosition& pos, - UErrorCode& /*status*/) const { - - if (fParsedValuesHash==NULL) { - if ( replacedNumberFormat== NULL ) { - return numberFormat->format(number, appendTo, pos); - } - else { - replacedNumberFormat->format(number, appendTo, pos); + UErrorCode& status) const { + if (U_FAILURE(status)) { + return appendTo; + } + if (msgPattern.countParts() == 0) { + return numberFormat->format(number, appendTo, pos); + } + // Get the appropriate sub-message. + int32_t partIndex = findSubMessage(msgPattern, 0, pluralRulesWrapper, number, status); + // Replace syntactic # signs in the top level of this sub-message + // (not in nested arguments) with the formatted number-offset. 
+ const UnicodeString& pattern = msgPattern.getPatternString(); + number -= offset; + int32_t prevIndex = msgPattern.getPart(partIndex).getLimit(); + for (;;) { + const MessagePattern::Part& part = msgPattern.getPart(++partIndex); + const UMessagePatternPartType type = part.getType(); + int32_t index = part.getIndex(); + if (type == UMSGPAT_PART_TYPE_MSG_LIMIT) { + return appendTo.append(pattern, prevIndex, index - prevIndex); + } else if ((type == UMSGPAT_PART_TYPE_REPLACE_NUMBER) || + (type == UMSGPAT_PART_TYPE_SKIP_SYNTAX && MessageImpl::jdkAposMode(msgPattern))) { + appendTo.append(pattern, prevIndex, index - prevIndex); + if (type == UMSGPAT_PART_TYPE_REPLACE_NUMBER) { + numberFormat->format(number, appendTo); + } + prevIndex = part.getLimit(); + } else if (type == UMSGPAT_PART_TYPE_ARG_START) { + appendTo.append(pattern, prevIndex, index - prevIndex); + prevIndex = index; + partIndex = msgPattern.getLimitPartIndex(partIndex); + index = msgPattern.getPart(partIndex).getLimit(); + MessageImpl::appendReducedApostrophes(pattern, prevIndex, index, appendTo); + prevIndex = index; } } - UnicodeString selectedRule = pluralRules->select(number); - UnicodeString *selectedPattern = (UnicodeString *)fParsedValuesHash->get(selectedRule); - if (selectedPattern==NULL) { - selectedPattern = (UnicodeString *)fParsedValuesHash->get(pluralRules->getKeywordOther()); - } - appendTo = insertFormattedNumber(number, *selectedPattern, appendTo, pos); - - return appendTo; } UnicodeString& PluralFormat::toPattern(UnicodeString& appendTo) { - appendTo+= pattern; + if (0 == msgPattern.countParts()) { + appendTo.setToBogus(); + } else { + appendTo.append(msgPattern.getPatternString()); + } return appendTo; } -UBool -PluralFormat::inRange(UChar ch, fmtToken& type) { - if ((ch>=CAP_A) && (ch<=CAP_Z)) { - // we assume all characters are in lower case already. 
- return FALSE; - } - if ((ch>=LOW_A) && (ch<=LOW_Z)) { - type = tLetter; - return TRUE; - } - switch (ch) { - case LEFTBRACE: - type = tLeftBrace; - return TRUE; - case SPACE: - type = tSpace; - return TRUE; - case RIGHTBRACE: - type = tRightBrace; - return TRUE; - case NUMBER_SIGN: - type = tNumberSign; - return TRUE; - default : - type = none; - return FALSE; - } -} - -UBool -PluralFormat::checkSufficientDefinition() { - // Check that at least the default rule is defined. - if (fParsedValuesHash==NULL) return FALSE; - if (fParsedValuesHash->get(pluralRules->getKeywordOther()) == NULL) { - return FALSE; - } - else { - return TRUE; - } -} - void PluralFormat::setLocale(const Locale& loc, UErrorCode& status) { if (U_FAILURE(status)) { return; } - if (pluralRules!=NULL) { - delete pluralRules; - pluralRules=NULL; - } - if (fParsedValuesHash!= NULL) { - delete fParsedValuesHash; - fParsedValuesHash = NULL; - } - if (numberFormat!=NULL) { - delete numberFormat; - numberFormat = NULL; - replacedNumberFormat=NULL; - } - init(NULL, loc, status); + locale = loc; + msgPattern.clear(); + delete numberFormat; + offset = 0; + numberFormat = NULL; + pluralRulesWrapper.reset(); + init(NULL, status); } void -PluralFormat::setNumberFormat(const NumberFormat* format, UErrorCode& /*status*/) { - // TODO: The copy constructor and assignment op of NumberFormat class are protected. - // create a pointer as the workaround. 
- replacedNumberFormat = (NumberFormat *)format; +PluralFormat::setNumberFormat(const NumberFormat* format, UErrorCode& status) { + if (U_FAILURE(status)) { + return; + } + NumberFormat* nf = (NumberFormat*)format->clone(); + if (nf != NULL) { + delete numberFormat; + numberFormat = nf; + } else { + status = U_MEMORY_ALLOCATION_ERROR; + } } Format* @@ -408,34 +294,14 @@ PluralFormat::clone() const return new PluralFormat(*this); } + PluralFormat& PluralFormat::operator=(const PluralFormat& other) { if (this != &other) { - UErrorCode status = U_ZERO_ERROR; - delete pluralRules; - delete fParsedValuesHash; - delete numberFormat; locale = other.locale; - pluralRules = other.pluralRules->clone(); - pattern = other.pattern; - copyHashtable(other.fParsedValuesHash, status); - if (U_FAILURE(status)) { - delete pluralRules; - pluralRules = NULL; - fParsedValuesHash = NULL; - numberFormat = NULL; - return *this; - } - numberFormat=NumberFormat::createInstance(locale, status); - if (U_FAILURE(status)) { - delete pluralRules; - delete fParsedValuesHash; - pluralRules = NULL; - fParsedValuesHash = NULL; - numberFormat = NULL; - return *this; - } - replacedNumberFormat=other.replacedNumberFormat; + msgPattern = other.msgPattern; + offset = other.offset; + copyObjects(other); } return *this; @@ -443,13 +309,21 @@ PluralFormat::operator=(const PluralFormat& other) { UBool PluralFormat::operator==(const Format& other) const { - // This protected comparison operator should only be called by subclasses - // which have confirmed that the other object being compared against is - // an instance of a sublcass of PluralFormat. THIS IS IMPORTANT. 
- // Format::operator== guarantees that this cast is safe - PluralFormat* fmt = (PluralFormat*)&other; - return ((*pluralRules == *(fmt->pluralRules)) && - (*numberFormat == *(fmt->numberFormat))); + if (this == &other) { + return TRUE; + } + if (!Format::operator==(other)) { + return FALSE; + } + const PluralFormat& o = (const PluralFormat&)other; + return + locale == o.locale && + msgPattern == o.msgPattern && // implies same offset + (numberFormat == NULL) == (o.numberFormat == NULL) && + (numberFormat == NULL || *numberFormat == *o.numberFormat) && + (pluralRulesWrapper.pluralRules == NULL) == (o.pluralRulesWrapper.pluralRules == NULL) && + (pluralRulesWrapper.pluralRules == NULL || + *pluralRulesWrapper.pluralRules == *o.pluralRulesWrapper.pluralRules); } UBool @@ -460,72 +334,112 @@ PluralFormat::operator!=(const Format& other) const { void PluralFormat::parseObject(const UnicodeString& /*source*/, Formattable& /*result*/, - ParsePosition& /*pos*/) const + ParsePosition& pos) const { - // TODO: not yet supported in icu4j and icu4c + // Parsing not supported. + pos.setErrorIndex(pos.getIndex()); } -UnicodeString -PluralFormat::insertFormattedNumber(double number, - UnicodeString& message, - UnicodeString& appendTo, - FieldPosition& pos) const { - UnicodeString result; - int32_t braceStack=0; - int32_t startIndex=0; - - if (message.length()==0) { - return result; +int32_t PluralFormat::findSubMessage(const MessagePattern& pattern, int32_t partIndex, + const PluralSelector& selector, double number, UErrorCode& ec) { + if (U_FAILURE(ec)) { + return 0; } - appendTo = numberFormat->format(number, appendTo, pos); - for(int32_t i=0; igetType())) { + offset=pattern.getNumericValue(*part); + ++partIndex; + } else { + offset=0; + } + // The keyword is empty until we need to match against non-explicit, not-"other" value. + // Then we get the keyword from the selector. 
+ // (In other words, we never call the selector if we match against an explicit value, + // or if the only non-explicit keyword is "other".) + UnicodeString keyword; + UnicodeString other(FALSE, OTHER_STRING, 5); + // When we find a match, we set msgStart>0 and also set this boolean to true + // to avoid matching the keyword again (duplicates are allowed) + // while we continue to look for an explicit-value match. + UBool haveKeywordMatch=FALSE; + // msgStart is 0 until we find any appropriate sub-message. + // We remember the first "other" sub-message if we have not seen any + // appropriate sub-message before. + // We remember the first matching-keyword sub-message if we have not seen + // one of those before. + // (The parser allows [does not check for] duplicate keywords. + // We just have to make sure to take the first one.) + // We avoid matching the keyword twice by also setting haveKeywordMatch=true + // at the first keyword match. + // We keep going until we find an explicit-value match or reach the end of the plural style. + int32_t msgStart=0; + // Iterate over (ARG_SELECTOR [ARG_INT|ARG_DOUBLE] message) tuples + // until ARG_LIMIT or end of plural-only pattern. 
+ do { + part=&pattern.getPart(partIndex++); + const UMessagePatternPartType type = part->getType(); + if(type==UMSGPAT_PART_TYPE_ARG_LIMIT) { break; - case RIGHTBRACE: - --braceStack; - break; - case NUMBER_SIGN: - if (braceStack==0) { - result += UnicodeString(message, startIndex, i); - result += appendTo; - startIndex = i + 1; + } + U_ASSERT (type==UMSGPAT_PART_TYPE_ARG_SELECTOR); + // part is an ARG_SELECTOR followed by an optional explicit value, and then a message + if(MessagePattern::Part::hasNumericValue(pattern.getPartType(partIndex))) { + // explicit value like "=2" + part=&pattern.getPart(partIndex++); + if(number==pattern.getNumericValue(*part)) { + // matches explicit value + return partIndex; + } + } else if(!haveKeywordMatch) { + // plural keyword like "few" or "other" + // Compare "other" first and call the selector if this is not "other". + if(pattern.partSubstringMatches(*part, other)) { + if(msgStart==0) { + msgStart=partIndex; + if(0 == keyword.compare(other)) { + // This is the first "other" sub-message, + // and the selected keyword is also "other". + // Do not match "other" again. + haveKeywordMatch=TRUE; + } + } + } else { + if(keyword.isEmpty()) { + keyword=selector.select(number-offset, ec); + if(msgStart!=0 && (0 == keyword.compare(other))) { + // We have already seen an "other" sub-message. + // Do not match "other" again. + haveKeywordMatch=TRUE; + continue; + } + } + if(pattern.partSubstringMatches(*part, keyword)) { + // keyword matches + msgStart=partIndex; + // Do not match this keyword again. 
+ haveKeywordMatch=TRUE; + } } - break; } - } - if ( startIndex < message.length() ) { - result += UnicodeString(message, startIndex, message.length()-startIndex); - } - appendTo = result; - return result; + partIndex=pattern.getLimitPartIndex(partIndex); + } while(++partIndexsetValueDeleter(deleteHashStrings); - int32_t pos = -1; - const UHashElement* elem = NULL; - // walk through the hash table and create a deep clone - while((elem = other->nextElement(pos))!= NULL){ - const UHashTok otherKeyTok = elem->key; - UnicodeString* otherKey = (UnicodeString*)otherKeyTok.pointer; - const UHashTok otherKeyToVal = elem->value; - UnicodeString* otherValue = (UnicodeString*)otherKeyToVal.pointer; - fParsedValuesHash->put(*otherKey, new UnicodeString(*otherValue), status); - if(U_FAILURE(status)){ - return; - } - } +PluralFormat::PluralSelectorAdapter::~PluralSelectorAdapter() { + delete pluralRules; +} + +UnicodeString PluralFormat::PluralSelectorAdapter::select(double number, + UErrorCode& /*ec*/) const { + return pluralRules->select(number); +} + +void PluralFormat::PluralSelectorAdapter::reset() { + delete pluralRules; + pluralRules = NULL; } diff --git a/icu4c/source/i18n/plurrule.cpp b/icu4c/source/i18n/plurrule.cpp index 275900f1ebf..5a265a79c4b 100644 --- a/icu4c/source/i18n/plurrule.cpp +++ b/icu4c/source/i18n/plurrule.cpp @@ -13,7 +13,6 @@ */ -#include "unicode/uniset.h" #include "unicode/utypes.h" #include "unicode/ures.h" #include "unicode/plurrule.h" @@ -21,6 +20,7 @@ #include "cstring.h" #include "hash.h" #include "mutex.h" +#include "patternprops.h" #include "plurrule_impl.h" #include "putilimp.h" #include "ucln_in.h" @@ -1159,16 +1159,9 @@ RuleChain::isKeyword(const UnicodeString& keywordParam) const { RuleParser::RuleParser() { - UErrorCode err=U_ZERO_ERROR; - const UnicodeString idStart=UNICODE_STRING_SIMPLE("[[a-z]]"); - const UnicodeString idContinue=UNICODE_STRING_SIMPLE("[[a-z][A-Z][_][0-9]]"); - idStartFilter = new UnicodeSet(idStart, err); - 
idContinueFilter = new UnicodeSet(idContinue, err); } RuleParser::~RuleParser() { - delete idStartFilter; - delete idContinueFilter; } void @@ -1413,21 +1406,7 @@ RuleParser::getKeyType(const UnicodeString& token, tokenType& keyType, UErrorCod UBool RuleParser::isValidKeyword(const UnicodeString& token) { - if ( token.length()==0 ) { - return FALSE; - } - if ( idStartFilter->contains(token.charAt(0) )==TRUE ) { - int32_t i; - for (i=1; i< token.length(); i++) { - if (idContinueFilter->contains(token.charAt(i))== FALSE) { - return FALSE; - } - } - return TRUE; - } - else { - return FALSE; - } + return PatternProps::isIdentifier(token.getBuffer(), token.length()); } PluralKeywordEnumeration::PluralKeywordEnumeration(RuleChain *header, UErrorCode& status) : diff --git a/icu4c/source/i18n/plurrule_impl.h b/icu4c/source/i18n/plurrule_impl.h index aa23a37e983..351b2256cb7 100644 --- a/icu4c/source/i18n/plurrule_impl.h +++ b/icu4c/source/i18n/plurrule_impl.h @@ -13,10 +13,7 @@ #ifndef PLURRULE_IMPLE #define PLURRULE_IMPLE -/** - * \file - * \brief C++ API: Defines rules for mapping positive long values onto a small set of keywords. - */ +// Internal definitions for the PluralRules implementation. 
#if !UCONFIG_NO_FORMATTING @@ -89,8 +86,6 @@ U_NAMESPACE_BEGIN #define PLURAL_RANGE_HIGH 0x7fffffff; -class UnicodeSet; - typedef enum PluralKey { pZero, pOne, @@ -138,9 +133,6 @@ public: tokenType& type, UErrorCode &status); void checkSyntax(tokenType prevType, tokenType curType, UErrorCode &status); private: - UnicodeSet *idStartFilter; - UnicodeSet *idContinueFilter; - void getKeyType(const UnicodeString& token, tokenType& type, UErrorCode &status); UBool inRange(UChar ch, tokenType& type); UBool isValidKeyword(const UnicodeString& token); diff --git a/icu4c/source/i18n/selfmt.cpp b/icu4c/source/i18n/selfmt.cpp index 090b6439dce..0945f036c77 100755 --- a/icu4c/source/i18n/selfmt.cpp +++ b/icu4c/source/i18n/selfmt.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2010, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. * Copyright (C) 2010 , Yahoo! Inc. 
******************************************************************** @@ -16,76 +16,41 @@ #include <typeinfo> // for 'typeid' to work -#include "unicode/utypes.h" -#include "unicode/ustring.h" -#include "unicode/ucnv_err.h" -#include "unicode/uchar.h" -#include "unicode/umsg.h" +#include "unicode/messagepattern.h" #include "unicode/rbnf.h" +#include "unicode/selfmt.h" +#include "unicode/uchar.h" +#include "unicode/ucnv_err.h" +#include "unicode/umsg.h" +#include "unicode/ustring.h" +#include "unicode/utypes.h" #include "cmemory.h" -#include "util.h" +#include "messageimpl.h" +#include "patternprops.h" +#include "selfmtimpl.h" #include "uassert.h" #include "ustrfmt.h" +#include "util.h" #include "uvector.h" -#include "unicode/selfmt.h" -#include "selfmtimpl.h" - #if !UCONFIG_NO_FORMATTING U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SelectFormat) -#define MAX_KEYWORD_SIZE 30 static const UChar SELECT_KEYWORD_OTHER[] = {LOW_O, LOW_T, LOW_H, LOW_E, LOW_R, 0}; -SelectFormat::SelectFormat(const UnicodeString& pat, UErrorCode& status) : parsedValuesHash(NULL) { - if (U_FAILURE(status)) { - return; - } - initHashTable(status); +SelectFormat::SelectFormat(const UnicodeString& pat, + UErrorCode& status) : msgPattern(status) { applyPattern(pat, status); } -SelectFormat::SelectFormat(const SelectFormat& other) : Format(other), parsedValuesHash(NULL) { - UErrorCode status = U_ZERO_ERROR; - pattern = other.pattern; - copyHashtable(other.parsedValuesHash, status); +SelectFormat::SelectFormat(const SelectFormat& other) : Format(other), + msgPattern(other.msgPattern) { } SelectFormat::~SelectFormat() { - cleanHashTable(); -} - -void SelectFormat::initHashTable(UErrorCode &status) { - if (U_FAILURE(status)) { - return; - } - // has inited - if (parsedValuesHash != NULL) { - return; - } - - parsedValuesHash = new Hashtable(TRUE, status); - if (U_FAILURE(status)) { - cleanHashTable(); - return; - } else { - if (parsedValuesHash == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - 
} - } - // to use hashtable->equals(), must set Value Compartor. - parsedValuesHash->setValueComparator(uhash_compareCaselessUnicodeString); -} - -void SelectFormat::cleanHashTable() { - if (parsedValuesHash != NULL) { - delete parsedValuesHash; - parsedValuesHash = NULL; - } } void @@ -94,164 +59,10 @@ SelectFormat::applyPattern(const UnicodeString& newPattern, UErrorCode& status) return; } - pattern = newPattern; - enum State{ startState, keywordState, pastKeywordState, phraseState}; - - //Initialization - UnicodeString keyword ; - UnicodeString phrase ; - UnicodeString* ptrPhrase ; - int32_t braceCount = 0; - - if (parsedValuesHash == NULL) { - initHashTable(status); - if (U_FAILURE(status)) { - return; - } + msgPattern.parseSelectStyle(newPattern, NULL, status); + if (U_FAILURE(status)) { + msgPattern.clear(); } - parsedValuesHash->removeAll(); - parsedValuesHash->setValueDeleter(uhash_deleteUnicodeString); - - //Process the state machine - State state = startState; - for (int32_t i = 0; i < pattern.length(); ++i) { - //Get the character and check its type - UChar ch = pattern.charAt(i); - CharacterClass type = classifyCharacter(ch); - - //Allow any character in phrase but nowhere else - if ( type == tOther ) { - if ( state == phraseState ){ - phrase += ch; - continue; - }else { - status = U_PATTERN_SYNTAX_ERROR; - cleanHashTable(); - return; - } - } - - //Process the state machine - switch (state) { - //At the start of pattern - case startState: - switch (type) { - case tSpace: - break; - case tStartKeyword: - state = keywordState; - keyword += ch; - break; - //If anything else is encountered, it's a syntax error - default: - status = U_PATTERN_SYNTAX_ERROR; - cleanHashTable(); - return; - }//end of switch(type) - break; - - //Handle the keyword state - case keywordState: - switch (type) { - case tSpace: - state = pastKeywordState; - break; - case tStartKeyword: - case tContinueKeyword: - keyword += ch; - break; - case tLeftBrace: - state = phraseState; - 
break; - //If anything else is encountered, it's a syntax error - default: - status = U_PATTERN_SYNTAX_ERROR; - cleanHashTable(); - return; - }//end of switch(type) - break; - - //Handle the pastkeyword state - case pastKeywordState: - switch (type) { - case tSpace: - break; - case tLeftBrace: - state = phraseState; - break; - //If anything else is encountered, it's a syntax error - default: - status = U_PATTERN_SYNTAX_ERROR; - cleanHashTable(); - return; - }//end of switch(type) - break; - - //Handle the phrase state - case phraseState: - switch (type) { - case tLeftBrace: - braceCount++; - phrase += ch; - break; - case tRightBrace: - //Matching keyword, phrase pair found - if (braceCount == 0){ - //Check validity of keyword - if (parsedValuesHash->get(keyword) != NULL) { - status = U_DUPLICATE_KEYWORD; - cleanHashTable(); - return; - } - if (keyword.length() == 0) { - status = U_PATTERN_SYNTAX_ERROR; - cleanHashTable(); - return; - } - - //Store the keyword, phrase pair in hashTable - ptrPhrase = new UnicodeString(phrase); - parsedValuesHash->put( keyword, ptrPhrase, status); - - //Reinitialize - keyword.remove(); - phrase.remove(); - ptrPhrase = NULL; - state = startState; - } - - if (braceCount > 0){ - braceCount-- ; - phrase += ch; - } - break; - default: - phrase += ch; - }//end of switch(type) - break; - - //Handle the default case of switch(state) - default: - status = U_PATTERN_SYNTAX_ERROR; - cleanHashTable(); - return; - - }//end of switch(state) - } - - //Check if the state machine is back to startState - if ( state != startState){ - status = U_PATTERN_SYNTAX_ERROR; - cleanHashTable(); - return; - } - - //Check if "other" keyword is present - if ( !checkSufficientDefinition() ) { - status = U_DEFAULT_KEYWORD_MISSING; - cleanHashTable(); - } - return; } UnicodeString& @@ -260,14 +71,13 @@ SelectFormat::format(const Formattable& obj, FieldPosition& pos, UErrorCode& status) const { - switch (obj.getType()) - { - case Formattable::kString: - return 
format(obj.getString(), appendTo, pos, status); - default: - if( U_SUCCESS(status) ){ - status = U_ILLEGAL_ARGUMENT_ERROR; - } + if (U_FAILURE(status)) { + return appendTo; + } + if (obj.getType() == Formattable::kString) { + return format(obj.getString(status), appendTo, pos, status); + } else { + status = U_ILLEGAL_ARGUMENT_ERROR; return appendTo; } } @@ -277,85 +87,66 @@ SelectFormat::format(const UnicodeString& keyword, UnicodeString& appendTo, FieldPosition& /*pos */, UErrorCode& status) const { - - if (U_FAILURE(status)) return appendTo; - - if (parsedValuesHash == NULL) { - status = U_INVALID_FORMAT_ERROR; + if (U_FAILURE(status)) { return appendTo; } - - //Check for the validity of the keyword - if ( !checkValidKeyword(keyword) ){ - status = U_ILLEGAL_ARGUMENT_ERROR; + // Check for the validity of the keyword + if (!PatternProps::isIdentifier(keyword.getBuffer(), keyword.length())) { + status = U_ILLEGAL_ARGUMENT_ERROR; // Invalid formatting argument. + } + if (msgPattern.countParts() == 0) { + status = U_INVALID_STATE_ERROR; return appendTo; } - - UnicodeString *selectedPattern = (UnicodeString *)parsedValuesHash->get(keyword); - if (selectedPattern == NULL) { - selectedPattern = (UnicodeString *)parsedValuesHash->get(SELECT_KEYWORD_OTHER); + int32_t msgStart = findSubMessage(msgPattern, 0, keyword, status); + if (!MessageImpl::jdkAposMode(msgPattern)) { + int32_t patternStart = msgPattern.getPart(msgStart).getLimit(); + int32_t msgLimit = msgPattern.getLimitPartIndex(msgStart); + appendTo.append(msgPattern.getPatternString(), + patternStart, + msgPattern.getPatternIndex(msgLimit) - patternStart); + return appendTo; } - - return appendTo += *selectedPattern; + // JDK compatibility mode: Remove SKIP_SYNTAX. 
+ return MessageImpl::appendSubMessageWithoutSkipSyntax(msgPattern, msgStart, appendTo); } UnicodeString& SelectFormat::toPattern(UnicodeString& appendTo) { - return appendTo += pattern; + if (0 == msgPattern.countParts()) { + appendTo.setToBogus(); + } else { + appendTo.append(msgPattern.getPatternString()); + } + return appendTo; } -SelectFormat::CharacterClass -SelectFormat::classifyCharacter(UChar ch) const{ - if ((ch >= CAP_A) && (ch <= CAP_Z)) { - return tStartKeyword; - } - if ((ch >= LOW_A) && (ch <= LOW_Z)) { - return tStartKeyword; - } - if ((ch >= U_ZERO) && (ch <= U_NINE)) { - return tContinueKeyword; - } - if ( uprv_isRuleWhiteSpace(ch) ){ - return tSpace; - } - switch (ch) { - case LEFTBRACE: - return tLeftBrace; - case RIGHTBRACE: - return tRightBrace; - case HYPHEN: - case LOWLINE: - return tContinueKeyword; - default : - return tOther; - } -} -UBool -SelectFormat::checkSufficientDefinition() { - // Check that at least the default rule is defined. - return (parsedValuesHash != NULL && - parsedValuesHash->get(SELECT_KEYWORD_OTHER) != NULL) ; -} - -UBool -SelectFormat::checkValidKeyword(const UnicodeString& argKeyword ) const{ - int32_t len = argKeyword.length(); - if (len < 1){ - return FALSE; +int32_t SelectFormat::findSubMessage(const MessagePattern& pattern, int32_t partIndex, + const UnicodeString& keyword, UErrorCode& ec) { + if (U_FAILURE(ec)) { + return 0; } - CharacterClass type = classifyCharacter(argKeyword.charAt(0)); - if( type != tStartKeyword ){ - return FALSE; - } - - for (int32_t i = 0; i < argKeyword.length(); ++i) { - type = classifyCharacter(argKeyword.charAt(i)); - if( type != tStartKeyword && type != tContinueKeyword ){ - return FALSE; + UnicodeString other(FALSE, SELECT_KEYWORD_OTHER, 5); + int32_t count = pattern.countParts(); + int32_t msgStart=0; + // Iterate over (ARG_SELECTOR, message) pairs until ARG_LIMIT or end of select-only pattern. 
+ do { + const MessagePattern::Part& part=pattern.getPart(partIndex++); + const UMessagePatternPartType type=part.getType(); + if(type==UMSGPAT_PART_TYPE_ARG_LIMIT) { + break; } - } - return TRUE; + // part is an ARG_SELECTOR followed by a message + if(pattern.partSubstringMatches(part, keyword)) { + // keyword matches + return partIndex; + } else if(msgStart==0 && pattern.partSubstringMatches(part, other)) { + msgStart=partIndex; + } + partIndex=pattern.getLimitPartIndex(partIndex); + } while(++partIndexparsedValuesHash; - if ( parsedValuesHash == NULL && hashOther == NULL) - return TRUE; - if ( parsedValuesHash == NULL || hashOther == NULL) + if (!Format::operator==(other)) { return FALSE; - return parsedValuesHash->equals(*hashOther); + } + const SelectFormat& o = (const SelectFormat&)other; + return msgPattern == o.msgPattern; } UBool @@ -400,46 +184,10 @@ SelectFormat::parseObject(const UnicodeString& /*source*/, Formattable& /*result*/, ParsePosition& pos) const { - // TODO: not yet supported in icu4j and icu4c + // Parsing not supported. 
pos.setErrorIndex(pos.getIndex()); } -void -SelectFormat::copyHashtable(Hashtable *other, UErrorCode& status) { - if (U_FAILURE(status)) { - return; - } - if (other == NULL) { - cleanHashTable(); - return; - } - if (parsedValuesHash == NULL) { - initHashTable(status); - if (U_FAILURE(status)) { - return; - } - } - - parsedValuesHash->removeAll(); - parsedValuesHash->setValueDeleter(uhash_deleteUnicodeString); - - int32_t pos = -1; - const UHashElement* elem = NULL; - - // walk through the hash table and create a deep clone - while ((elem = other->nextElement(pos)) != NULL){ - const UHashTok otherKeyTok = elem->key; - UnicodeString* otherKey = (UnicodeString*)otherKeyTok.pointer; - const UHashTok otherKeyToVal = elem->value; - UnicodeString* otherValue = (UnicodeString*)otherKeyToVal.pointer; - parsedValuesHash->put(*otherKey, new UnicodeString(*otherValue), status); - if (U_FAILURE(status)){ - cleanHashTable(); - return; - } - } -} - U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/selfmtimpl.h b/icu4c/source/i18n/selfmtimpl.h index 208a6599a2e..dea814a8795 100755 --- a/icu4c/source/i18n/selfmtimpl.h +++ b/icu4c/source/i18n/selfmtimpl.h @@ -1,10 +1,10 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2010, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. * Copyright (C) 2010 , Yahoo! Inc. ******************************************************************** - * File SELECTFMT_IMPL.H + * File selfmtimpl.h * * Date Name Description * 11/11/09 kirtig Finished first cut of implementation. @@ -14,11 +14,6 @@ #ifndef SELFMTIMPL #define SELFMTIMPL -/** - * \file - * \brief C++ API: Defines rules for mapping positive long values onto a small set of keywords. 
- */ - #if !UCONFIG_NO_FORMATTING #include "unicode/format.h" diff --git a/icu4c/source/i18n/umsg.cpp b/icu4c/source/i18n/umsg.cpp index 8a58ee0442c..6c8a09c2648 100644 --- a/icu4c/source/i18n/umsg.cpp +++ b/icu4c/source/i18n/umsg.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2006, International Business Machines +* Copyright (C) 1999-2011, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -32,6 +32,27 @@ #include "uassert.h" #include "ustr_imp.h" +U_NAMESPACE_BEGIN +/** + * This class isolates our access to private internal methods of + * MessageFormat. It is never instantiated; it exists only for C++ + * access management. + */ +class MessageFormatAdapter { +public: + static const Formattable::Type* getArgTypeList(const MessageFormat& m, + int32_t& count); + static UBool hasArgTypeConflicts(const MessageFormat& m) { + return m.hasArgTypeConflicts; + } +}; +const Formattable::Type* +MessageFormatAdapter::getArgTypeList(const MessageFormat& m, + int32_t& count) { + return m.getArgTypeList(count); +} +U_NAMESPACE_END + U_NAMESPACE_USE U_CAPI int32_t @@ -217,25 +238,23 @@ umsg_open( const UChar *pattern, } UParseError tErr; - if(parseError==NULL) { parseError = &tErr; } - - UMessageFormat* retVal = 0; int32_t len = (patternLength == -1 ? u_strlen(pattern) : patternLength); - - UnicodeString patString((patternLength == -1 ? 
TRUE:FALSE), pattern,len); + UnicodeString patString(patternLength == -1, pattern, len); - retVal = (UMessageFormat*) new MessageFormat(patString,Locale(locale),*parseError,*status); - - if(retVal == 0) { + MessageFormat* retVal = new MessageFormat(patString,Locale(locale),*parseError,*status); + if(retVal == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; - return 0; + return NULL; } - return retVal; + if (U_SUCCESS(*status) && MessageFormatAdapter::hasArgTypeConflicts(*retVal)) { + *status = U_ARGUMENT_TYPE_MISMATCH; + } + return (UMessageFormat*)retVal; } U_CAPI void U_EXPORT2 @@ -366,24 +385,6 @@ umsg_format( const UMessageFormat *fmt, return actLen; } -U_NAMESPACE_BEGIN -/** - * This class isolates our access to private internal methods of - * MessageFormat. It is never instantiated; it exists only for C++ - * access management. - */ -class MessageFormatAdapter { -public: - static const Formattable::Type* getArgTypeList(const MessageFormat& m, - int32_t& count); -}; -const Formattable::Type* -MessageFormatAdapter::getArgTypeList(const MessageFormat& m, - int32_t& count) { - return m.getArgTypeList(count); -} -U_NAMESPACE_END - U_CAPI int32_t U_EXPORT2 umsg_vformat( const UMessageFormat *fmt, UChar *result, @@ -456,11 +457,13 @@ umsg_vformat( const UMessageFormat *fmt, break; case Formattable::kObject: + default: // This will never happen because MessageFormat doesn't // support kObject. When MessageFormat is changed to // understand MeasureFormats, modify this code to do the // right thing. 
[alan] U_ASSERT(FALSE); + *status=U_ILLEGAL_ARGUMENT_ERROR; break; } } diff --git a/icu4c/source/i18n/unicode/choicfmt.h b/icu4c/source/i18n/unicode/choicfmt.h index 921b0a984ee..663d45aa5b5 100644 --- a/icu4c/source/i18n/unicode/choicfmt.h +++ b/icu4c/source/i18n/unicode/choicfmt.h @@ -1,6 +1,6 @@ /* ******************************************************************************** -* Copyright (C) 1997-2010, International Business Machines +* Copyright (C) 1997-2011, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************** * @@ -31,147 +31,91 @@ #if !UCONFIG_NO_FORMATTING -#include "unicode/unistr.h" -#include "unicode/numfmt.h" #include "unicode/fieldpos.h" #include "unicode/format.h" +#include "unicode/messagepattern.h" +#include "unicode/numfmt.h" +#include "unicode/unistr.h" U_NAMESPACE_BEGIN class MessageFormat; /** - * ChoiceFormat converts between ranges of numeric values - * and string names for those ranges. A ChoiceFormat splits - * the real number line -Inf to +Inf into two + * ChoiceFormat converts between ranges of numeric values and strings for those ranges. + * The strings must conform to the MessageFormat pattern syntax. + * + *

    ChoiceFormat is probably not what you need. + * Please use MessageFormat + * with plural arguments for proper plural selection, + * and select arguments for simple selection among a fixed set of choices!

    + * + *

    A ChoiceFormat splits + * the real number line \htmlonly-∞ to + * +∞\endhtmlonly into two * or more contiguous ranges. Each range is mapped to a - * string. ChoiceFormat is generally used in a - * MessageFormat for displaying grammatically correct - * plurals such as "There are 2 files."

    + * string.

    + * + *

    ChoiceFormat was originally intended + * for displaying grammatically correct + * plurals such as "There is one file." vs. "There are 2 files." + * However, plural rules for many languages + * are too complex for the capabilities of ChoiceFormat, + * and its requirement of specifying the precise rules for each message + * is unmanageable for translators.

    * *

    There are two methods of defining a ChoiceFormat; both * are equivalent. The first is by using a string pattern. This is the * preferred method in most cases. The second method is through direct - * specification of the arrays that make up the + * specification of the arrays that logically make up the * ChoiceFormat.

    * - *

    Patterns

    + *

    Note: Typically, choice formatting is done (if done at all) via MessageFormat + * with a choice argument type, + * rather than using a stand-alone ChoiceFormat.

    * - *

    In most cases, the preferred way to define a - * ChoiceFormat is with a pattern. Here is an example of a - * ChoiceFormat pattern:

    + *
    Patterns and Their Interpretation
    * - * \htmlonly
        0≤are no files|1≤is one file|1<are many files
    \endhtmlonly + *

    The pattern string defines the range boundaries and the strings for each number range. + * Syntax: + *

    + * choiceStyle = number separator message ('|' number separator message)*
    + * number = normal_number | ['-'] \htmlonly∞\endhtmlonly (U+221E, infinity)
    + * normal_number = double value (unlocalized ASCII string)
    + * separator = less_than | less_than_or_equal
    + * less_than = '<'
    + * less_than_or_equal = '#' | \htmlonly≤\endhtmlonly (U+2264)
    + * message: see {@link MessageFormat}
    + * 
    + * Pattern_White_Space between syntax elements is ignored, except + * around each range's sub-message.

    * - *

    or equivalently,

    + *

    Each numeric sub-range extends from the current range's number + * to the next range's number. + * The number itself is included in its range if a less_than_or_equal sign is used, + * and excluded from its range (and instead included in the previous range) + * if a less_than sign is used.

    * - * \htmlonly
        0#are no files|1#is one file|1<are many files
    \endhtmlonly + *

    When a ChoiceFormat is constructed from + * arrays of numbers, closure flags and strings, + * they are interpreted just like + * the sequence of (number separator string) in an equivalent pattern string. + * closure[i]==TRUE corresponds to a less_than separator sign. + * The equivalent pattern string will be constructed automatically.

    * - *

    The pattern consists of a number or range specifiers - * separated by vertical bars '|' (U+007C). There is no - * vertical bar after the last range. Each range specifier is of the - * form:

    + *

    During formatting, a number is mapped to the first range + * where the number is not greater than the range's upper limit. + * That range's message string is returned. A NaN maps to the very first range.

    * - * \htmlonly
    Number Separator String
    \endhtmlonly + *

    During parsing, a range is selected for the longest match of + * any range's message. That range's number is returned, ignoring the separator/closure. + * Only a simple string match is performed, without parsing of arguments that + * might be specified in the message strings.

    * - *

    Number is a floating point number that can be parsed by a - * default NumberFormat for the US locale. It gives the - * lower limit of this range. The lower limit is either inclusive or - * exclusive, depending on the separator. The upper limit is - * given by the lower limit of the next range. The Unicode infinity - * sign \htmlonly∞ \endhtmlonly (U+221E) is recognized for positive infinity. It may be preceded by - * '-' (U+002D) to indicate negative infinity.

    + *

    Note that the first range's number is ignored in formatting + * but may be returned from parsing.

    * - *

    String is the format string for this range, with special - * characters enclosed in single quotes ('The # - * sign'). Single quotes themselves are indicated by two single - * quotes in a row ('o''clock').

    - * - *

    Separator is one of the following single characters: - * - *

      - *
    • \htmlonly'≤' \endhtmlonly (U+2264) or '#' (U+0023) - * indicates that the lower limit given by Number is - * inclusive. (The two characters are equivalent to ChoiceFormat.) - * This means that the limit value Number belongs to this - * range. Another way of saying this is that the corresponding - * closure is FALSE.
    • - * - *
    • '<' (U+003C) indicates that the lower limit given by - * Number is exclusive. This means that the value - * Number belongs to the prior range.
    • Another way of - * saying this is that the corresponding closure is - * TRUE. - *
    - * - *

    See below for more information about closures.

    - * - *

    Arrays

    - * - *

    A ChoiceFormat defining n intervals - * (n >= 2) is specified by three arrays of - * n items: - * - *

      - *
    • double limits[] gives the start of each - * interval. This must be a non-decreasing list of values, none of - * which may be NaN.
    • - *
    • UBool closures[] determines whether each limit - * value is contained in the interval below it or in the interval - * above it. If closures[i] is FALSE, then - * limits[i] is a member of interval - * i. Otherwise it is a member of interval - * i+1. If no closures array is specified, this is - * equivalent to having all closures be FALSE. Closures - * allow one to specify half-open, open, or closed intervals.
    • - *
    • UnicodeString formats[] gives the string label - * associated with each interval.
    • - *
    - * - *

    Formatting and Parsing

    - * - *

    During formatting, a number is converted to a - * string. ChoiceFormat accomplishes this by mapping the - * number to an interval using the following rule. Given a number - * X and and index value j in the range - * 0..n-1, where n is the number of ranges:

    - * - * \htmlonly
    \endhtmlonlyX matches j if and only if - * limit[j] <= X < limit[j+1] - * \htmlonly
    \endhtmlonly - * - *

    (This assumes that all closures are FALSE. If some - * closures are TRUE then the relations must be changed to - * <= or < as appropriate.) If there is - * no match, then either the first or last index is used, depending on - * whether the number is too low or too high. Once a number is mapped to - * an interval j, the string formats[j] is - * output.

    - * - *

    During parsing, a string is converted to a - * number. ChoiceFormat finds the element - * formats[j] equal to the string, and returns - * limits[j] as the parsed value.

    - * - *

    Notes

    - * - *

    The first limit value does not define a range boundary. For - * example, in the pattern \htmlonly"1.0#a|2.0#b"\endhtmlonly, the - * intervals are [-Inf, 2.0) and [2.0, +Inf]. It appears that the first - * interval should be [1.0, 2.0). However, since all values that are too - * small are mapped to range zero, the first interval is effectively - * [-Inf, 2.0). However, the first limit value is used during - * formatting. In this example, parse("a") returns - * 1.0.

    - * - *

    There are no gaps between intervals and the entire number line is - * covered. A ChoiceFormat maps all possible - * double values to a finite set of intervals.

    - * - *

    The non-number NaN is mapped to interval zero during - * formatting.

    - * - *

    Examples

    + *
    Examples
    * *

    Here is an example of two arrays that map the number * 1..7 to the English day of the week abbreviations @@ -183,13 +127,15 @@ class MessageFormat; * *

    Here is an example that maps the ranges [-Inf, 1), [1, 1], and (1, * +Inf] to three strings. That is, the number line is split into three - * ranges: x < 1.0, x = 1.0, and x > 1.0.

    + * ranges: x < 1.0, x = 1.0, and x > 1.0. + * (The round parentheses in the notation above indicate an exclusive boundary, + * like the turned bracket in European notation: [-Inf, 1) == [-Inf, 1[ )

    * *
        {0, 1, 1},
      *     {FALSE, FALSE, TRUE},
      *     {"no files", "one file", "many files"}
    * - *

    Here is a simple example that shows formatting and parsing:

    + *

    Here is an example that shows formatting and parsing:

    * * \code * #include @@ -215,43 +161,6 @@ class MessageFormat; * } * \endcode * - *

    Here is a more complex example using a ChoiceFormat - * constructed from a pattern together with a - * MessageFormat.

    - * - * \code - * #include - * #include - * #include - * #include - * - * int main(int argc, char *argv[]) { - * UErrorCode status = U_ZERO_ERROR; - * double filelimits[] = {0,1,2}; - * UnicodeString filepart[] = - * {"are no files","is one file","are {0} files"}; - * ChoiceFormat* fileform = new ChoiceFormat(filelimits, filepart, 3 ); - * Format* testFormats[] = - * {fileform, NULL, NumberFormat::createInstance(status)}; - * MessageFormat pattform("There {0} on {1}", status ); - * pattform.adoptFormats(testFormats, 3); - * Formattable testArgs[] = {0L, "Disk A"}; - * FieldPosition fp(0); - * UnicodeString str; - * char buf[256]; - * for (int32_t i = 0; i < 4; ++i) { - * Formattable fInt(i); - * testArgs[0] = fInt; - * pattform.format(testArgs, 2, str, fp, status ); - * str.extract(0, str.length(), buf, ""); - * str.truncate(0); - * cout << "Output for i=" << i << " : " << buf << endl; - * } - * cout << endl; - * return 0; - * } - * \endcode - * *

    User subclasses are not supported. While clients may write * subclasses, such code will not necessarily work and will not be * guaranteed to work stably from release to release. @@ -259,8 +168,7 @@ class MessageFormat; class U_I18N_API ChoiceFormat: public NumberFormat { public: /** - * Construct a new ChoiceFormat with the limits and the corresponding formats - * based on the pattern. + * Constructs a new ChoiceFormat from the pattern string. * * @param pattern Pattern used to construct object. * @param status Output param to receive success code. If the @@ -272,32 +180,31 @@ public: /** - * Construct a new ChoiceFormat with the given limits and formats. Copy - * the limits and formats instead of adopting them. + * Constructs a new ChoiceFormat with the given limits and message strings. + * All closure flags default to FALSE, + * equivalent to less_than_or_equal separators. + * + * Copies the limits and formats instead of adopting them. * * @param limits Array of limit values. * @param formats Array of formats. * @param count Size of 'limits' and 'formats' arrays. * @stable ICU 2.0 */ - ChoiceFormat(const double* limits, const UnicodeString* formats, int32_t count ); /** - * Construct a new ChoiceFormat with the given limits and formats. - * Copy the limits and formats (instead of adopting them). By - * default, each limit in the array specifies the inclusive lower - * bound of its range, and the exclusive upper bound of the previous - * range. However, if the isLimitOpen element corresponding to a - * limit is TRUE, then the limit is the exclusive lower bound of its - * range, and the inclusive upper bound of the previous range. + * Constructs a new ChoiceFormat with the given limits, closure flags and message strings. + * + * Copies the limits and formats instead of adopting them. + * * @param limits Array of limit values * @param closures Array of booleans specifying whether each * element of 'limits' is open or closed. 
If FALSE, then the - * corresponding limit is a member of the range above it. If TRUE, - * then the limit belongs to the range below it. + * corresponding limit number is a member of its range. + * If TRUE, then the limit number belongs to the previous range. * @param formats Array of formats * @param count Size of 'limits', 'closures', and 'formats' arrays * @stable ICU 2.4 @@ -330,8 +237,8 @@ public: virtual ~ChoiceFormat(); /** - * Clone this Format object polymorphically. The caller owns the - * result and should delete it when done. + * Clones this Format object. The caller owns the + * result and must delete it when done. * * @return a copy of this object * @stable ICU 2.0 @@ -339,7 +246,7 @@ public: virtual Format* clone(void) const; /** - * Return true if the given Format objects are semantically equal. + * Returns true if the given Format objects are semantically equal. * Objects of different subclasses are considered unequal. * * @param other ChoiceFormat object to be compared @@ -362,7 +269,7 @@ public: /** * Sets the pattern. * @param pattern The pattern to be applied. - * @param parseError Struct to recieve information on position + * @param parseError Struct to receive information on position * of error if an error is encountered * @param status Output param set to success/failure code on * exit. If the pattern is invalid, this will be @@ -375,7 +282,7 @@ public: /** * Gets the pattern. * - * @param pattern Output param which will recieve the pattern + * @param pattern Output param which will receive the pattern * Previous contents are deleted. * @return A reference to 'pattern' * @stable ICU 2.0 @@ -383,7 +290,8 @@ public: virtual UnicodeString& toPattern(UnicodeString &pattern) const; /** - * Set the choices to be used in formatting. + * Sets the choices to be used in formatting. + * For details see the constructor with the same parameter list. 
* * @param limitsToCopy Contains the top value that you want * parsed with that format,and should be in @@ -399,8 +307,9 @@ public: int32_t count ); /** - * Set the choices to be used in formatting. See class description - * for documenatation of the limits, closures, and formats arrays. + * Sets the choices to be used in formatting. + * For details see the constructor with the same parameter list. + * * @param limits Array of limits * @param closures Array of limit booleans * @param formats Array of format string @@ -413,30 +322,32 @@ public: int32_t count); /** - * Get the limits passed in the constructor. + * Returns NULL and 0. + * Before ICU 4.8, this used to return the choice limits array. * - * @param count The size of the limits arrays - * @return the limits. - * @stable ICU 2.0 + * @param count Will be set to 0. + * @return NULL + * @deprecated ICU 4.8 Use the MessagePattern class to analyze a ChoiceFormat pattern. */ virtual const double* getLimits(int32_t& count) const; /** - * Get the limit booleans passed in the constructor. The caller - * must not delete the result. + * Returns NULL and 0. + * Before ICU 4.8, this used to return the limit booleans array. * - * @param count The size of the arrays - * @return the closures - * @stable ICU 2.4 + * @param count Will be set to 0. + * @return NULL + * @deprecated ICU 4.8 Use the MessagePattern class to analyze a ChoiceFormat pattern. */ virtual const UBool* getClosures(int32_t& count) const; /** - * Get the formats passed in the constructor. + * Returns NULL and 0. + * Before ICU 4.8, this used to return the array of choice strings. * - * @param count The size of the arrays - * @return the formats. - * @stable ICU 2.0 + * @param count Will be set to 0. + * @return NULL + * @deprecated ICU 4.8 Use the MessagePattern class to analyze a ChoiceFormat pattern. 
*/ virtual const UnicodeString* getFormats(int32_t& count) const; @@ -444,7 +355,7 @@ public: using NumberFormat::format; /** - * Format a double or long number using this object's choices. + * Formats a double number using this object's choices. * * @param number The value to be formatted. * @param appendTo Output parameter to receive result. @@ -458,7 +369,7 @@ public: UnicodeString& appendTo, FieldPosition& pos) const; /** - * Format a int_32t number using this object's choices. + * Formats an int32_t number using this object's choices. * * @param number The value to be formatted. * @param appendTo Output parameter to receive result. @@ -473,7 +384,7 @@ public: FieldPosition& pos) const; /** - * Format an int64_t number using this object's choices. + * Formats an int64_t number using this object's choices. * * @param number The value to be formatted. * @param appendTo Output parameter to receive result. @@ -488,7 +399,7 @@ public: FieldPosition& pos) const; /** - * Format an array of objects using this object's choices. + * Formats an array of objects using this object's choices. * * @param objs The array of objects to be formatted. * @param cnt The size of objs. @@ -507,7 +418,7 @@ public: FieldPosition& pos, UErrorCode& success) const; /** - * Format an object using this object's choices. + * Formats an object using this object's choices. * * * @param obj The object to be formatted. @@ -542,7 +453,7 @@ public: /** * Redeclared NumberFormat method. - * Format a double number. These methods call the NumberFormat + * Formats a double number. These methods call the NumberFormat * pure virtual format() methods with the default FieldPosition. * * @param number The value to be formatted. @@ -556,7 +467,7 @@ public: /** * Redeclared NumberFormat method. - * Format a long number. These methods call the NumberFormat + * Formats an int32_t number. These methods call the NumberFormat * pure virtual format() methods with the default FieldPosition. 
* * @param number The value to be formatted. @@ -569,13 +480,10 @@ public: UnicodeString& appendTo) const; /** - * Return a long if possible (e.g. within range LONG_MAX, - * LONG_MAX], and with no decimals), otherwise a double. If - * IntegerOnly is set, will stop at a decimal point (or equivalent; - * e.g. for rational numbers "1 2/3", will stop after the 1). - *

    - * If no object can be parsed, parsePosition is unchanged, and NULL is - * returned. + * Looks for the longest match of any message string on the input text and, + * if there is a match, sets the result object to the corresponding range's number. + * + * If no string matches, then the parsePosition is unchanged. * * @param text The text to be parsed. * @param result Formattable to be set to the parse result. @@ -583,7 +491,6 @@ public: * @param parsePosition The position to start parsing at on input. * On output, moved to after the last successfully * parse character. On parse failure, does not change. - * @see NumberFormat::isParseIntegerOnly * @stable ICU 2.0 */ virtual void parse(const UnicodeString& text, @@ -591,32 +498,23 @@ public: ParsePosition& parsePosition) const; /** - * Return a long if possible (e.g. within range LONG_MAX, - * LONG_MAX], and with no decimals), otherwise a double. If - * IntegerOnly is set, will stop at a decimal point (or equivalent; - * e.g. for rational numbers "1 2/3", will stop after the 1). - *

    - * If no object can be parsed, parsePosition is unchanged, and NULL is - * returned. - * - * @param text The text to be parsed. - * @param result Formattable to be set to the parse result. - * If parse fails, return contents are undefined. - * @param status Output param with the formatted string. - * @see NumberFormat::isParseIntegerOnly - * @stable ICU 2.0 - */ + * Looks for the longest match of any message string on the input text and, + * if there is a match, sets the result object to the corresponding range's number. + * + * If no string matches, then the UErrorCode is set to U_INVALID_FORMAT_ERROR. + * + * @param text The text to be parsed. + * @param result Formattable to be set to the parse result. + * If parse fails, return contents are undefined. + * @param status Output param with the formatted string. + * @stable ICU 2.0 + */ virtual void parse(const UnicodeString& text, Formattable& result, UErrorCode& status) const; - -public: /** - * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. - * This method is to implement a simple version of RTTI, since not all - * C++ compilers support genuine RTTI. Polymorphic operator==() and - * clone() methods call this method. + * Returns a unique class ID POLYMORPHICALLY. Part of ICU's "poor man's RTTI". * * @return The class ID for this object. All objects of a * given class have the same class ID. Objects of @@ -626,7 +524,7 @@ public: virtual UClassID getDynamicClassID(void) const; /** - * Return the class ID for this class. This is useful only for + * Returns the class ID for this class. This is useful only for * comparing to a return value from getDynamicClassID(). For example: *

          * .       Base* polymorphic_pointer = createPolymorphicObject();
    @@ -639,22 +537,9 @@ public:
         static UClassID U_EXPORT2 getStaticClassID(void);
     
     private:
    -    // static cache management (thread-safe)
    -  //  static NumberFormat* getNumberFormat(UErrorCode &status); // call this function to 'check out' a numberformat from the cache.
    -  //  static void          releaseNumberFormat(NumberFormat *adopt); // call this function to 'return' the number format to the cache.
    -
         /**
    -     * Converts a string to a double value using a default NumberFormat object
    -     * which is static (shared by all ChoiceFormat instances).
    -     * @param string the string to be converted with.
    -     * @return the converted double number.
    -     */
    -    static double stod(const UnicodeString& string);
    -
    -    /**
    -     * Converts a double value to a string using a default NumberFormat object
    -     * which is static (shared by all ChoiceFormat instances).
    -     * @param value the double number to be converted with.
    +     * Converts a double value to a string.
    +     * @param value the double number to be converted.
          * @param string the result string.
          * @return the converted string.
          */
    @@ -667,7 +552,7 @@ private:
          * based on the pattern.
          *
          * @param newPattern   Pattern used to construct object.
    -     * @param parseError   Struct to recieve information on position
    +     * @param parseError   Struct to receive information on position
          *                     of error if an error is encountered.
          * @param status       Output param to receive success code.  If the
          *                     pattern cannot be parsed, set to failure code.
    @@ -678,7 +563,59 @@ private:
                      UErrorCode& status);
     
         friend class MessageFormat;
    +
    +    virtual void setChoices(const double* limits,
    +                            const UBool* closures,
    +                            const UnicodeString* formats,
    +                            int32_t count,
    +                            UErrorCode &errorCode);
    +
         /**
    +     * Finds the ChoiceFormat sub-message for the given number.
    +     * @param pattern A MessagePattern.
    +     * @param partIndex the index of the first ChoiceFormat argument style part.
    +     * @param number a number to be mapped to one of the ChoiceFormat argument's intervals
    +     * @return the sub-message start part index.
    +     */
    +    static int32_t findSubMessage(const MessagePattern &pattern, int32_t partIndex, double number);
    +
    +    static double parseArgument(
    +            const MessagePattern &pattern, int32_t partIndex,
    +            const UnicodeString &source, ParsePosition &pos);
    +
    +    /**
    +     * Matches the pattern string from the end of the partIndex to
    +     * the beginning of the limitPartIndex,
    +     * including all syntax except SKIP_SYNTAX,
    +     * against the source string starting at sourceOffset.
    +     * If they match, returns the length of the source string match.
    +     * Otherwise returns -1.
    +     */
    +    static int32_t matchStringUntilLimitPart(
    +            const MessagePattern &pattern, int32_t partIndex, int32_t limitPartIndex,
    +            const UnicodeString &source, int32_t sourceOffset);
    +
    +    /**
    +     * Some of the ChoiceFormat constructors do not have a UErrorCode parameter.
    +     * We need _some_ way to provide one for the MessagePattern constructor.
    +     * Alternatively, the MessagePattern could be a pointer field, but that is
    +     * not nice either.
    +     */
    +    UErrorCode constructorErrorCode;
    +
    +    /**
    +     * The MessagePattern which contains the parsed structure of the pattern string.
    +     *
    +     * Starting with ICU 4.8, the MessagePattern contains a sequence of
    +     * numeric/selector/message parts corresponding to the parsed pattern.
    +     * For details see the MessagePattern class API docs.
    +     */
    +    MessagePattern msgPattern;
    +
    +    /**
    +     * Docs & fields from before ICU 4.8, before MessagePattern was used.
    +     * Commented out, and left only for explanation of semantics.
    +     * --------
          * Each ChoiceFormat divides the range -Inf..+Inf into fCount
          * intervals.  The intervals are:
          *
    @@ -713,12 +650,11 @@ private:
          *
          * Because of the nature of interval 0, fClosures[0] has no
          * effect.
    -
          */
    -    double*         fChoiceLimits;
    -    UBool*          fClosures;
    -    UnicodeString*  fChoiceFormats;
    -    int32_t         fCount;
    +    // double*         fChoiceLimits;
    +    // UBool*          fClosures;
    +    // UnicodeString*  fChoiceFormats;
    +    // int32_t         fCount;
     };
     
     inline UnicodeString&
    diff --git a/icu4c/source/i18n/unicode/msgfmt.h b/icu4c/source/i18n/unicode/msgfmt.h
    index 58c74f1cb1b..bc9b07cccc9 100644
    --- a/icu4c/source/i18n/unicode/msgfmt.h
    +++ b/icu4c/source/i18n/unicode/msgfmt.h
    @@ -1,5 +1,5 @@
     /*
    -* Copyright (C) 2007-2010, International Business Machines Corporation and
    +* Copyright (C) 2007-2011, International Business Machines Corporation and
     * others. All Rights Reserved.
     ********************************************************************************
     *
    @@ -28,106 +28,213 @@
     
     #include "unicode/format.h"
     #include "unicode/locid.h"
    +#include "unicode/messagepattern.h"
     #include "unicode/parseerr.h"
    -#include "unicode/uchar.h"
    +#include "unicode/plurfmt.h"
    +#include "unicode/plurrule.h"
    +
    +U_CDECL_BEGIN
    +// Forward declaration.
    +struct UHashtable;
    +typedef struct UHashtable UHashtable;
    +U_CDECL_END
     
     U_NAMESPACE_BEGIN
     
    -class NumberFormat;
    +class AppendableWrapper;
     class DateFormat;
    +class NumberFormat;
     
     /**
    + * 

    MessageFormat prepares strings for display to users, + * with optional arguments (variables/placeholders). + * The arguments can occur in any order, which is necessary for translation + * into languages with different grammars. * - * MessageFormat produces concatenated messages in a language-neutral - * way. Use this whenever concatenating strings that are displayed to - * end users. + *

    A MessageFormat is constructed from a pattern string + * with arguments in {curly braces} which will be replaced by formatted values. * - *

    A MessageFormat contains an array of subformats arranged - * within a template string. Together, the subformats and - * template string determine how the MessageFormat will operate during - * formatting and parsing. + *

    MessageFormat differs from the other Format + * classes in that you create a MessageFormat object with one + * of its constructors (not with a createInstance style factory + * method). Factory methods aren't necessary because MessageFormat + * itself doesn't implement locale-specific behavior. Any locale-specific + * behavior is defined by the pattern that you provide and the + * subformats used for inserted arguments. * - *

    Typically, both the subformats and the template string are - * specified at once in a pattern. By using different - * patterns for different locales, messages may be localized. + *

    Arguments can be named (using identifiers) or numbered (using small ASCII-digit integers). + * Some of the API methods work only with argument numbers and throw an exception + * if the pattern has named arguments (see {@link #usesNamedArguments()}). * - *

    When formatting, MessageFormat takes an array of arguments - * and produces a user-readable string. Each argument is a - * Formattable object; they may be passed in in an array, or as a - * single Formattable object which itself contains an array. Each - * argument is matched up with its corresponding subformat, which then - * formats it into a string. The resulting strings are then assembled - * within the string template of the MessageFormat to produce the - * final output string. + *

    An argument might not specify any format type. In this case, + * a Number value is formatted with a default (for the locale) NumberFormat, + * a Date value is formatted with a default (for the locale) DateFormat, + * and for any other value its toString() value is used. * - *

    Note: - * In ICU 4.0 MessageFormat supports named arguments. If a named argument - * is used, all arguments must be named. Names start with a character in - * UCHAR_ID_START and continue with characters in - * UCHARID_CONTINUE, in particular they do not start with a digit. - * If named arguments are used, {@link #usesNamedArguments()} will return true. + *

    An argument might specify a "simple" type for which the specified + * Format object is created, cached and used. * - *

    The other new methods supporting named arguments are - * {@link #getFormatNames(UErrorCode& status)}, - * {@link #getFormat(const UnicodeString& formatName, UErrorCode& status)} - * {@link #setFormat(const UnicodeString& formatName, const Format& format, UErrorCode& status)}, - * {@link #adoptFormat(const UnicodeString& formatName, Format* formatToAdopt, UErrorCode& status)}, - * {@link #format(const UnicodeString* argumentNames, const Formattable* arguments, - * int32_t count, UnicodeString& appendTo,UErrorCode& status)}. - * These methods are all compatible with patterns that do not used named arguments-- - * in these cases the keys in the input or output use UnicodeStrings - * that name the argument indices, e.g. "0", "1", "2"... etc. + *

    An argument might have a "complex" type with nested MessageFormat sub-patterns. + * During formatting, one of these sub-messages is selected according to the argument value + * and recursively formatted. * - *

    If this format uses named arguments, certain methods that take or - * return arrays do not perform any action, since it is not possible to - * identify positions in an array using a name. Of these methods, - * UErrorCode is set to U_ILLEGAL_ARGUMENT_ERROR by format, and to - * U_ARGUMENT_TYPE_MISMATCH by parse. - * These methods are - * {@link #adoptFormats(Format** formatsToAdopt, int32_t count)}, - * {@link #setFormats(const Format** newFormats,int32_t count)}, - * {@link #adoptFormat(int32_t n, Format *newFormat)}, - * {@link #setFormat(int32_t n, Format& newFormat)}, - * {@link #format(const Formattable* source, int32_t count, UnicodeString& appendTo, FieldPosition& ignore, UErrorCode& success)}, - * {@link #format(const UnicodeString& pattern,const Formattable* arguments,int32_t cnt,UnicodeString& appendTo,UErrorCode& success)}, - * {@link #format(const Formattable& source, UnicodeString& appendTo, FieldPosition& ignore, UErrorCode& success)}, - * {@link #format(const Formattable* arguments, int32_t cnt, UnicodeString& appendTo, FieldPosition& status, int32_t recursionProtection,UErrorCode& success)}, - * {@link #parse(const UnicodeString& source, ParsePosition& pos, int32_t& count)}, - * {@link #parse(const UnicodeString& source, int32_t& cnt, UErrorCode& status)} + *

    After construction, a custom Format object can be set for + * a top-level argument, overriding the default formatting and parsing behavior + * for that argument. + * However, custom formatting can be achieved more simply by writing + * a typeless argument in the pattern string + * and supplying it with a preformatted string value. * - *

    - * During parsing, an input string is matched against the string - * template of the MessageFormat to produce an array of Formattable - * objects. Plain text of the template string is matched directly - * against input text. At each position in the template string where - * a subformat is located, the subformat is called to parse the - * corresponding segment of input text to produce an output argument. - * In this way, an array of arguments is created which together - * constitute the parse result. - *

    - * Parsing may fail or produce unexpected results in a number of - * circumstances. - *

      - *
    • If one of the arguments does not occur in the pattern, it - * will be returned as a default Formattable. - *
    • If the format of an argument loses information, such as with - * a choice format where a large number formats to "many", then the - * parse may not correspond to the originally formatted argument. - *
    • MessageFormat does not handle ChoiceFormat recursion during - * parsing; such parses will fail. - *
    • Parsing will not always find a match (or the correct match) if - * some part of the parse is ambiguous. For example, if the pattern - * "{1},{2}" is used with the string arguments {"a,b", "c"}, it will - * format as "a,b,c". When the result is parsed, it will return {"a", - * "b,c"}. - *
    • If a single argument is formatted more than once in the string, - * then the rightmost subformat in the pattern string will produce the - * parse result; prior subformats with the same argument index will - * have no effect. - *
    - * Here are some examples of usage: - *

    + *

    When formatting, MessageFormat takes a collection of argument values + * and writes an output string. + * The argument values may be passed as an array + * (when the pattern contains only numbered arguments) + * or as an array of names and an array of arguments (which works for both named + * and numbered arguments). + * + *

    Each argument is matched with one of the input values by array index or argument name + * and formatted according to its pattern specification + * (or using a custom Format object if one was set). + * A numbered pattern argument is matched with an argument name that contains that number + * as an ASCII-decimal-digit string (without leading zero). + * + *

    Patterns and Their Interpretation

    + * + * MessageFormat uses patterns of the following form: + *
    + * message = messageText (argument messageText)*
    + * argument = noneArg | simpleArg | complexArg
    + * complexArg = choiceArg | pluralArg | selectArg
    + *
    + * noneArg = '{' argNameOrNumber '}'
    + * simpleArg = '{' argNameOrNumber ',' argType [',' argStyle] '}'
    + * choiceArg = '{' argNameOrNumber ',' "choice" ',' choiceStyle '}'
    + * pluralArg = '{' argNameOrNumber ',' "plural" ',' pluralStyle '}'
    + * selectArg = '{' argNameOrNumber ',' "select" ',' selectStyle '}'
    + *
    + * choiceStyle: see {@link ChoiceFormat}
    + * pluralStyle: see {@link PluralFormat}
    + * selectStyle: see {@link SelectFormat}
    + *
    + * argNameOrNumber = argName | argNumber
    + * argName = [^[[:Pattern_Syntax:][:Pattern_White_Space:]]]+
    + * argNumber = '0' | ('1'..'9' ('0'..'9')*)
    + *
    + * argType = "number" | "date" | "time" | "spellout" | "ordinal" | "duration"
    + * argStyle = "short" | "medium" | "long" | "full" | "integer" | "currency" | "percent" | argStyleText
    + * 
    + * + *
      + *
    • messageText can contain quoted literal strings including syntax characters. + * A quoted literal string begins with an ASCII apostrophe and a syntax character + * (usually a {curly brace}) and continues until the next single apostrophe. + * A double ASCII apostrophe inside or outside of a quoted string represents + * one literal apostrophe. + *
    • Quotable syntax characters are the {curly braces} in all messageText parts, + * plus the '#' sign in a messageText immediately inside a pluralStyle, + * and the '|' symbol in a messageText immediately inside a choiceStyle. + *
    • See also {@link UMessagePatternApostropheMode} + *
    • In argStyleText, every single ASCII apostrophe begins and ends quoted literal text, + * and unquoted {curly braces} must occur in matched pairs. + *
    + * + *

    Recommendation: Use the real apostrophe (single quote) character + * \htmlonly’\endhtmlonly (U+2019) for + * human-readable text, and use the ASCII apostrophe ' (U+0027) + * only in program syntax, like quoting in MessageFormat. + * See the annotations for U+0027 Apostrophe in The Unicode Standard. + * + *

    The argType and argStyle values are used to create + * a Format instance for the format element. The following + * table shows how the values map to Format instances. Combinations not + * shown in the table are illegal. Any argStyleText must + * be a valid pattern string for the Format subclass used. + * + *

    + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
    argType + * argStyle + * resulting Format object + *
    (none) + * null + *
    number + * (none) + * NumberFormat.createInstance(getLocale(), status) + *
    integer + * NumberFormat.createInstance(getLocale(), kNumberStyle, status) + *
    currency + * NumberFormat.createCurrencyInstance(getLocale(), status) + *
    percent + * NumberFormat.createPercentInstance(getLocale(), status) + *
    argStyleText + * new DecimalFormat(argStyleText, new DecimalFormatSymbols(getLocale(), status), status) + *
    date + * (none) + * DateFormat.createDateInstance(kDefault, getLocale(), status) + *
    short + * DateFormat.createDateInstance(kShort, getLocale(), status) + *
    medium + * DateFormat.createDateInstance(kDefault, getLocale(), status) + *
    long + * DateFormat.createDateInstance(kLong, getLocale(), status) + *
    full + * DateFormat.createDateInstance(kFull, getLocale(), status) + *
    argStyleText + * new SimpleDateFormat(argStyleText, getLocale(), status) + *
    time + * (none) + * DateFormat.createTimeInstance(kDefault, getLocale(), status) + *
    short + * DateFormat.createTimeInstance(kShort, getLocale(), status) + *
    medium + * DateFormat.createTimeInstance(kDefault, getLocale(), status) + *
    long + * DateFormat.createTimeInstance(kLong, getLocale(), status) + *
    full + * DateFormat.createTimeInstance(kFull, getLocale(), status) + *
    argStyleText + * new SimpleDateFormat(argStyleText, getLocale(), status) + *
    spellout + * argStyleText (optional) + * new RuleBasedNumberFormat(URBNF_SPELLOUT, getLocale(), status) + *
        .setDefaultRuleset(argStyleText, status);
    + *
    ordinal + * argStyleText (optional) + * new RuleBasedNumberFormat(URBNF_ORDINAL, getLocale(), status) + *
        .setDefaultRuleset(argStyleText, status);
    + *
    duration + * argStyleText (optional) + * new RuleBasedNumberFormat(URBNF_DURATION, getLocale(), status) + *
        .setDefaultRuleset(argStyleText, status);
    + *
    + *

    + * + *

    Usage Information

    + * + *

    Here are some examples of usage: * Example 1: + * *

      * \code
      *     UErrorCode success = U_ZERO_ERROR;
    @@ -148,10 +255,12 @@ class DateFormat;
      *     //             in the Force on planet 7.
      * \endcode
      * 
    + * * Typically, the message format will come from resources, and the * arguments will be dynamically set at runtime. - *

    - * Example 2: + * + *

    Example 2: + * *

      *  \code
      *     success = U_ZERO_ERROR;
    @@ -171,122 +280,40 @@ class DateFormat;
      *  \endcode
      *  
    * - * The pattern is of the following form. Legend: - *
    - * \code
    - *       {optional item}
    - *       (group that may be repeated)*
    - * \endcode
    - *  
    - * Do not confuse optional items with items inside quoted braces, such - * as this: "{". Quoted braces are literals. - *
    - *  \code
    - *       messageFormatPattern := string ( "{" messageFormatElement "}" string )*
      *
    - *       messageFormatElement := argumentIndex | argumentName { "," elementFormat }
    - *
    - *       elementFormat := "time" { "," datetimeStyle }
    - *                      | "date" { "," datetimeStyle }
    - *                      | "number" { "," numberStyle }
    - *                      | "choice" "," choiceStyle
    - *                      | "spellout" { "," spelloutStyle }
    - *                      | "ordinal" { "," spelloutStyle }
    - *                      | "duration" { "," spelloutStyle }
    - *                      | "plural" "," pluralStyle
    - *                      | "select" "," selectStyle
    - *
    - *       datetimeStyle := "short"
    - *                      | "medium"
    - *                      | "long"
    - *                      | "full"
    - *                      | dateFormatPattern
    - *
    - *       numberStyle :=   "currency"
    - *                      | "percent"
    - *                      | "integer"
    - *                      | numberFormatPattern
    - *
    - *       choiceStyle :=   choiceFormatPattern
    - *
    - *       pluralStyle :=   pluralFormatPattern
    - *
    - *       selectStyle :=   selectFormatPattern
    - *
    - *       spelloutStyle := ruleSetName
    - * \endcode
    - * 
    - * If there is no elementFormat, then the argument must be a string, - * which is substituted. If there is no dateTimeStyle or numberStyle, - * then the default format is used (e.g. NumberFormat::createInstance(), - * DateFormat::createTimeInstance(DateFormat::kDefault, ...) or - * DateFormat::createDateInstance(DateFormat::kDefault, ...). For - * a RuleBasedNumberFormat, if there is no ruleSetName, the default - * rule set is used. For a ChoiceFormat or PluralFormat or SelectFormat, the pattern - * must always be specified, since there is no default. - *

    - * In strings, single quotes can be used to quote syntax characters. - * A literal single quote is represented by '', both within and outside - * of single-quoted segments. Inside a - * messageFormatElement, quotes are not removed. For example, - * {1,number,$'#',##} will produce a number format with the pound-sign - * quoted, with a result such as: "$#31,45". - *

    - * If a pattern is used, then unquoted braces in the pattern, if any, - * must match: that is, "ab {0} de" and "ab '}' de" are ok, but "ab - * {0'}' de" and "ab } de" are not. - *

    - *

    Warning:
    The rules for using quotes within message - * format patterns unfortunately have shown to be somewhat confusing. - * In particular, it isn't always obvious to localizers whether single - * quotes need to be doubled or not. Make sure to inform localizers about - * the rules, and tell them (for example, by using comments in resource - * bundle source files) which strings will be processed by MessageFormat. - * Note that localizers may need to use single quotes in translated - * strings where the original version doesn't have them. - *
    Note also that the simplest way to avoid the problem is to - * use the real apostrophe (single quote) character U+2019 (') for - * human-readable text, and to use the ASCII apostrophe (U+0027 ' ) - * only in program syntax, like quoting in MessageFormat. - * See the annotations for U+0027 Apostrophe in The Unicode Standard.

    - *
    - *

    - * The argumentIndex is a non-negative integer, which corresponds to the - * index of the arguments presented in an array to be formatted. The - * first argument has argumentIndex 0. - *

    - * It is acceptable to have unused arguments in the array. With missing - * arguments, or arguments that are not of the right class for the - * specified format, a failing UErrorCode result is set. - *

    - * Creating internationalized messages that include plural forms, you - * can use a PluralFormat: + *

    For messages that include plural forms, you can use a plural argument: *

      * \code
    - *  UErrorCode err = U_ZERO_ERROR;
    - *  UnicodeString t1("{0, plural, one{C''est # fichier} other{Ce sont # fichiers}} dans la liste.");
    - *  MessageFormat* msgFmt = new MessageFormat(t1, Locale("fr"), err);
    - *  if (U_FAILURE(err)) {
    - *      return err;
    - *  }
    - *
    - *  Formattable args1[] = {(int32_t)0};
    - *  Formattable args2[] = {(int32_t)3};
    - *  FieldPosition ignore(FieldPosition::DONT_CARE);
    + *  success = U_ZERO_ERROR;
    + *  MessageFormat msgFmt(
    + *       "{num_files, plural, "
    + *       "=0{There are no files on disk \"{disk_name}\".}"
    + *       "=1{There is one file on disk \"{disk_name}\".}"
    + *       "other{There are # files on disk \"{disk_name}\".}}",
    + *      Locale("en"),
    + *      success);
    + *  FieldPosition fpos = 0;
    + *  Formattable testArgs[] = {0L, "MyDisk"};
    + *  UnicodeString testArgsNames[] = {"num_files", "disk_name"};
      *  UnicodeString result;
    - *  msgFmt->format(args1, 1, result, ignore, status);
    - *  cout << result << endl;
    - *  result.remove();
    - *  msgFmt->format(args2, 1, result, ignore, status);
    - *  cout << result << endl;
    - *
    - *  // output, with different args
    - *  // output: C'est 0,0 fichier dans la liste.
    - *  // output: Ce sont 3 fichiers dans la liste."
    + *  cout << msgFmt.format(testArgs, testArgsNames, 2, result, fpos, 0, success);
    + *  testArgs[0] = 3L;
    + *  cout << msgFmt.format(testArgs, testArgsNames, 2, result, fpos, 0, success);
      * \endcode
    + * output:
    + * There are no files on disk "MyDisk".
    + * There are 3 files on disk "MyDisk".
      * 
    - * Please check PluralFormat and PluralRules for details. - *

    + * See {@link PluralFormat} and {@link PluralRules} for details. + * + *

    Synchronization

    + * + *

    MessageFormats are not synchronized. + * It is recommended to create separate format instances for each thread. + * If multiple threads access a format concurrently, it must be synchronized + * externally. + * + * @stable ICU 2.0 */ class U_I18N_API MessageFormat : public Format { public: @@ -331,8 +358,8 @@ public: * Constructs a new MessageFormat using the given pattern and locale. * @param pattern Pattern used to construct object. * @param newLocale The locale to use for formatting dates and numbers. - * @param parseError Struct to recieve information on position - * of error within the pattern. + * @param parseError Struct to receive information on the position + * of an error within the pattern. * @param status Input/output error code. If the * pattern cannot be parsed, set to failure code. * @stable ICU 2.0 @@ -376,15 +403,14 @@ public: virtual UBool operator==(const Format& other) const; /** - * Sets the locale. This locale is used for fetching default number or date - * format information. + * Sets the locale to be used for creating argument Format objects. * @param theLocale the new locale value to be set. * @stable ICU 2.0 */ virtual void setLocale(const Locale& theLocale); /** - * Gets the locale. This locale is used for fetching default number or date + * Gets the locale used for creating argument Format objects. * format information. * @return the locale of the object. * @stable ICU 2.0 @@ -405,8 +431,8 @@ public: * Applies the given pattern string to this message format. * * @param pattern The pattern to be applied. - * @param parseError Struct to recieve information on position - * of error within pattern. + * @param parseError Struct to receive information on the position + * of an error within the pattern. * @param status Input/output error code. If the * pattern cannot be parsed, set to failure code. 
* @stable ICU 2.0 @@ -415,6 +441,37 @@ public: UParseError& parseError, UErrorCode& status); + /** + * Sets the UMessagePatternApostropheMode and the pattern used by this message format. + * Parses the pattern and caches Format objects for simple argument types. + * Patterns and their interpretation are specified in the + * class description. + *

    + * This method is best used only once on a given object to avoid confusion about the mode, + * and after constructing the object with an empty pattern string to minimize overhead. + * + * @param pattern The pattern to be applied. + * @param aposMode The new apostrophe mode. + * @param parseError Struct to receive information on the position + * of an error within the pattern. + * Can be NULL. + * @param status Input/output error code. If the + * pattern cannot be parsed, set to failure code. + * @draft ICU 4.8 + */ + virtual void applyPattern(const UnicodeString& pattern, + UMessagePatternApostropheMode aposMode, + UParseError* parseError, + UErrorCode& status); + + /** + * @return this instance's UMessagePatternApostropheMode. + * @draft ICU 4.8 + */ + UMessagePatternApostropheMode getApostropheMode() const { + return msgPattern.getApostropheMode(); + } + /** * Returns a pattern that can be used to recreate this object. * @@ -490,7 +547,7 @@ public: /** * Gets format names. This function returns formatNames in StringEnumerations * which can be used with getFormat() and setFormat() to export formattable - * array from current MessageFormat to another. It is caller's resposibility + * array from current MessageFormat to another. It is the caller's responsibility * to delete the returned formatNames. * @param status output param set to success/failure code. * @stable ICU 4.0 @@ -747,6 +804,7 @@ public: static UnicodeString autoQuoteApostrophe(const UnicodeString& pattern, UErrorCode& status); + /** * Returns true if this MessageFormat uses named arguments, * and false otherwise. See class description. @@ -795,33 +853,44 @@ public: */ static UClassID U_EXPORT2 getStaticClassID(void); + /** + * Compares two Format objects. This is used for constructing the hash + * tables. + * + * @param left pointer to a Format object. Must not be NULL. + * @param right pointer to a Format object. Must not be NULL. 
+ * + * @return whether the two objects are the same + * @internal + */ + static UBool equalFormats(const void* left, const void* right); + private: Locale fLocale; - UnicodeString fPattern; + MessagePattern msgPattern; Format** formatAliases; // see getFormats int32_t formatAliasesCapacity; - UProperty idStart; - UProperty idContinue; MessageFormat(); // default constructor not implemented - /* - * A structure representing one subformat of this MessageFormat. - * Each subformat has a Format object, an offset into the plain - * pattern text fPattern, and an argument number. The argument - * number corresponds to the array of arguments to be formatted. - * @internal - */ - class Subformat; + /** + * This provider helps defer instantiation of a PluralRules object + * until we actually need to select a keyword. + * For example, if the number matches an explicit-value selector like "=1" + * we do not need any PluralRules. + */ + class PluralSelectorProvider : public PluralFormat::PluralSelector { + public: + PluralSelectorProvider(const Locale* loc); + virtual ~PluralSelectorProvider(); + virtual UnicodeString select(double number, UErrorCode& ec) const; - /** - * A MessageFormat contains an array of subformats. This array - * needs to grow dynamically if the MessageFormat is modified. - */ - Subformat* subformats; - int32_t subformatCount; - int32_t subformatCapacity; + void reset(const Locale* loc); + private: + const Locale* locale; + PluralRules* rules; + }; /** * A MessageFormat formats an array of arguments. Each argument @@ -836,14 +905,14 @@ private: int32_t argTypeCapacity; /** - * Is true iff all argument names are non-negative numbers. - * - */ - UBool isArgNumeric; + * TRUE if there are different argTypes for the same argument. + * This only matters when the MessageFormat is used in the plain C (umsg_xxx) API + * where the pattern argTypes determine how the va_arg list is read. 
+ */ + UBool hasArgTypeConflicts; // Variable-size array management - UBool allocateSubformats(int32_t capacity); - UBool allocateArgTypes(int32_t capacity); + UBool allocateArgTypes(int32_t capacity, UErrorCode& status); /** * Default Format objects used when no format is specified and a @@ -855,6 +924,11 @@ private: NumberFormat* defaultNumberFormat; DateFormat* defaultDateFormat; + UHashtable* cachedFormatters; + UHashtable* customFormatArgStarts; + + PluralSelectorProvider pluralProvider; + /** * Method to retrieve default formats (or NULL on failure). * These are semantically const, but may modify *this. @@ -872,57 +946,93 @@ private: const UChar * const *list); /** - * Formats the array of arguments and copies the result into the - * result buffer, updates the field position. - * - * @param arguments The formattable objects array. - * @param cnt The array count. - * @param appendTo Output parameter to receive result. - * Result is appended to existing contents. - * @param status Field position status. - * @param recursionProtection - * Initially zero. Bits 0..9 are used to indicate - * that a parameter has already been seen, to - * avoid recursion. Currently unused. - * @param success The error code status. - * @return Reference to 'appendTo' parameter. + * Thin wrapper around the format(... AppendableWrapper ...) variant. + * Wraps the destination UnicodeString into an AppendableWrapper and + * supplies default values for some other parameters. 
*/ - UnicodeString& format( const Formattable* arguments, - int32_t cnt, - UnicodeString& appendTo, - FieldPosition& status, - int32_t recursionProtection, - UErrorCode& success) const; + UnicodeString& format(const Formattable* arguments, + const UnicodeString *argumentNames, + int32_t cnt, + UnicodeString& appendTo, + FieldPosition* pos, + UErrorCode& status) const; - UnicodeString& format( const Formattable* arguments, - const UnicodeString *argumentNames, - int32_t cnt, - UnicodeString& appendTo, - FieldPosition& status, - int32_t recursionProtection, - UErrorCode& success) const; + /** + * Formats the arguments and writes the result into the + * AppendableWrapper, updates the field position. + * + * @param msgStart Index to msgPattern part to start formatting from. + * @param pluralNumber Zero except when formatting a plural argument sub-message + * where a '#' is replaced by the format string for this number. + * @param arguments The formattable objects array. (Must not be NULL.) + * @param argumentNames NULL if numbered values are used. Otherwise the same + * length as "arguments", and each entry is the name of the + * corresponding argument in "arguments". + * @param cnt The length of arguments (and of argumentNames if that is not NULL). + * @param appendTo Output parameter to receive the result. + * The result string is appended to existing contents. + * @param pos Field position status. + * @param success The error code status. 
+ */ + void format(int32_t msgStart, + double pluralNumber, + const Formattable* arguments, + const UnicodeString *argumentNames, + int32_t cnt, + AppendableWrapper& appendTo, + FieldPosition* pos, + UErrorCode& success) const; - void makeFormat(int32_t offsetNumber, - UnicodeString* segments, - UParseError& parseError, - UErrorCode& success); + UnicodeString getArgName(int32_t partIndex); + + void setArgStartFormat(int32_t argStart, Format* formatter, UErrorCode& status); + + void setCustomArgStartFormat(int32_t argStart, Format* formatter, UErrorCode& status); + + int32_t nextTopLevelArgStart(int32_t partIndex) const; + + bool argNameMatches(int32_t partIndex, const UnicodeString& argName, int32_t argNumber); + + void cacheExplicitFormats(UErrorCode& status); + + Format* createAppropriateFormat(UnicodeString& type, + UnicodeString& style, + Formattable::Type& formattableType, + UParseError& parseError, + UErrorCode& ec); + + const Formattable* getArgFromListByName(const Formattable* arguments, + const UnicodeString *argumentNames, + int32_t cnt, UnicodeString& name) const; + + Formattable* parse(int32_t msgStart, + const UnicodeString& source, + ParsePosition& pos, + int32_t& count, + UErrorCode& ec) const; + + FieldPosition* updateMetaData(AppendableWrapper& dest, int32_t prevLength, + FieldPosition* fp, const Formattable* argId) const; + + Format* getCachedFormatter(int32_t argumentNumber) const; + + UnicodeString getLiteralStringUntilNextArgument(int32_t from) const; + + void copyObjects(const MessageFormat& that, UErrorCode& ec); + + void formatComplexSubMessage(int32_t msgStart, + double pluralNumber, + const Formattable* arguments, + const UnicodeString *argumentNames, + int32_t cnt, + AppendableWrapper& appendTo, + UErrorCode& success) const; /** * Convenience method that ought to be in NumberFormat */ NumberFormat* createIntegerFormat(const Locale& locale, UErrorCode& status) const; - /** - * Checks the range of the source text to quote the special - * 
characters, { and ' and copy to target buffer. - * @param source - * @param start the text offset to start the process of in the source string - * @param end the text offset to end the process of in the source string - * @param appendTo Output parameter to receive result. - * Result is appended to existing contents. - */ - static void copyAndFixQuotes(const UnicodeString& appendTo, int32_t start, int32_t end, UnicodeString& target); - /** * Returns array of argument types in the parsed pattern * for use in C API. Only for the use of umsg_vformat(). Not @@ -937,11 +1047,25 @@ private: } /** - * Returns FALSE if the argument name is not legal. - * @param argName argument name. - * @return TRUE if the argument name is legal, otherwise return FALSE. + * Resets the internal MessagePattern, and other associated caches. */ - UBool isLegalArgName(const UnicodeString& argName) const; + void resetPattern(); + + // A DummyFormatter that we use solely to store a NULL value. UHash does + // not support storing NULL values. 
+ class U_I18N_API DummyFormat : public Format { + public: + virtual UBool operator==(const Format&) const; + virtual Format* clone() const; + virtual UnicodeString& format(const Formattable&, + UnicodeString& appendTo, + FieldPosition&, + UErrorCode& status) const; + virtual void parseObject(const UnicodeString&, + Formattable&, + ParsePosition&) const; + virtual UClassID getDynamicClassID() const; + }; friend class MessageFormatAdapter; // getFormatTypeList() access }; @@ -953,6 +1077,7 @@ MessageFormat::format(const Formattable& obj, return Format::format(obj, appendTo, status); } + U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/unicode/plurfmt.h b/icu4c/source/i18n/unicode/plurfmt.h index 35a6172bbb1..b55fb61fe0b 100644 --- a/icu4c/source/i18n/unicode/plurfmt.h +++ b/icu4c/source/i18n/unicode/plurfmt.h @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 2007-2010, International Business Machines Corporation and +* Copyright (C) 2007-2011, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* * @@ -25,6 +25,7 @@ #if !UCONFIG_NO_FORMATTING +#include "unicode/messagepattern.h" #include "unicode/numfmt.h" #include "unicode/plurrule.h" @@ -37,7 +38,7 @@ class Hashtable; * PluralFormat supports the creation of internationalized * messages with plural inflection. It is based on plural * selection, i.e. the caller specifies messages for each - * plural case that can appear in the users language and the + * plural case that can appear in the user's language and the * PluralFormat selects the appropriate message based on * the number. *

    @@ -51,7 +52,7 @@ class Hashtable; * each message and selects the message whose interval contains a * given number. This can only handle a finite number of * intervals. But in some languages, like Polish, one plural case - * applies to infinitely many intervals (e.g., paucal applies to + * applies to infinitely many intervals (e.g., the plural case applies to * numbers ending with 2, 3, or 4 except those ending with 12, 13, or * 14). Thus ChoiceFormat is not adequate. *

    @@ -62,17 +63,20 @@ class Hashtable; * conditions for a plural case than just a single interval. These plural * rules define both what plural cases exist in a language, and to * which numbers these cases apply. - *

  • It provides predefined plural rules for many locales. Thus, the programmer - * need not worry about the plural cases of a language. On the flip side, - * the localizer does not have to specify the plural cases; he can simply + *
  • It provides predefined plural rules for many languages. Thus, the programmer + * need not worry about the plural cases of a language and + * does not have to define the plural cases; they can simply * use the predefined keywords. The whole plural formatting of messages can * be done using localized patterns from resource bundles. For predefined plural - * rules, see CLDR Language Plural Rules page at + * rules, see the CLDR Language Plural Rules page at * http://unicode.org/repos/cldr-tmp/trunk/diff/supplemental/language_plural_rules.html *
*

*

Usage of PluralFormat

- *

+ *

Note: Typically, plural formatting is done via MessageFormat + * with a plural argument type, + * rather than using a stand-alone PluralFormat. + *

* This discussion assumes that you use PluralFormat with * a predefined set of plural rules. You can create one using one of * the constructors that takes a locale object. To @@ -85,82 +89,46 @@ class Hashtable; *

Patterns and Their Interpretation
*

* The pattern text defines the message output for each plural case of the - * used locale. The pattern is a sequence of - * caseKeyword{message} clauses, separated by white - * space characters. Each clause assigns the message message - * to the plural case identified by caseKeyword. - *

- * There are 6 predefined casekeyword in ICU - 'zero', 'one', 'two', 'few', 'many' and - * 'other'. You always have to define a message text for the default plural case - * "other" which is contained in every rule set. If the plural - * rules of the PluralFormat object do not contain a plural case - * identified by caseKeyword, U_DEFAULT_KEYWORD_MISSING - * will be set to status. - * If you do not specify a message text for a particular plural case, the - * message text of the plural case "other" gets assigned to this - * plural case. If you specify more than one message for the same plural case, - * U_DUPLICATE_KEYWORD will be set to status. - *
- * Spaces between caseKeyword and - * message will be ignored; spaces within - * message will be preserved. - *

- * The message text for a particular plural case may contain other message - * format patterns. PluralFormat preserves these so that you - * can use the strings produced by PluralFormat with other - * formatters. If you are using PluralFormat inside a - * MessageFormat pattern, MessageFormat will - * automatically evaluate the resulting format pattern.
- * Thus, curly braces ({, }) are only allowed - * in message texts to define a nested format pattern.
- * The pound sign (#) will be interpreted as the number placeholder - * in the message text, if it is not contained in curly braces (to preserve - * NumberFormat patterns). PluralFormat will - * replace each of those pound signs by the number passed to the - * format() method. It will be formatted using a - * NumberFormat for the PluralFormat's locale. If you - * need special number formatting, you have to explicitly specify a - * NumberFormat for the PluralFormat to use. - *

- * Example + * specified locale. Syntax: *
- * \code
- * UErrorCode status = U_ZERO_ERROR;
- * MessageFormat* msgFmt = new MessageFormat(UnicodeString("{0, plural,
- *   one{{0, number, C''est #,##0.0#  fichier}} other {Ce sont # fichiers}} dans la liste."),
- *   Locale("fr"), status);
- * if (U_FAILURE(status)) {
- *     return;
- * }
- * Formattable args1[] = {(int32_t)0};
- * Formattable args2[] = {(int32_t)3};
- * FieldPosition ignore(FieldPosition::DONT_CARE);
- * UnicodeString result;
- * msgFmt->format(args1, 1, result, ignore, status);
- * cout << result << endl;
- * result.remove();
- * msgFmt->format(args2, 1, result, ignore, status);
- * cout << result << endl;
- * \endcode
+ * pluralStyle = [offsetValue] (selector '{' message '}')+
+ * offsetValue = "offset:" number
+ * selector = explicitValue | keyword
+ * explicitValue = '=' number  // adjacent, no white space in between
+ * keyword = [^[[:Pattern_Syntax:][:Pattern_White_Space:]]]+
+ * message: see {@link MessageFormat}
  * 
- * Produces the output:
- * C'est 0,0 fichier dans la liste.
- * Ce sont 3 fichiers dans la liste. - *

- * Note:
- * Currently PluralFormat - * does not make use of quotes like MessageFormat. - * If you use plural format strings with MessageFormat and want - * to use a quote sign ', you have to write ''. - * MessageFormat unquotes this pattern and passes the unquoted - * pattern to PluralFormat. It's a bit trickier if you use - * nested formats that do quoting. In the example above, we wanted to insert - * ' in the number format pattern. Since - * NumberFormat supports quotes, we had to insert - * ''. But since MessageFormat unquotes the - * pattern before it gets passed to PluralFormat, we have to - * double these quotes, i.e. write ''''. + * Pattern_White_Space between syntax elements is ignored, except + * between the {curly braces} and their sub-message, + * and between the '=' and the number of an explicitValue. + * + *

    + * There are 6 predefined case keywords in CLDR/ICU - 'zero', 'one', 'two', 'few', 'many' and + * 'other'. You always have to define a message text for the default plural case + * other which is contained in every rule set. + * If you do not specify a message text for a particular plural case, the + * message text of the plural case other gets assigned to this + * plural case. + *

    + * When formatting, the input number is first matched against the explicitValue clauses. + * If there is no exact-number match, then a keyword is selected by calling + * the PluralRules with the input number minus the offset. + * (The offset defaults to 0 if it is omitted from the pattern string.) + * If there is no clause with that keyword, then the "other" clause is returned. + *

    + * An unquoted pound sign (#) in the selected sub-message + * itself (i.e., outside of arguments nested in the sub-message) + * is replaced by the input number minus the offset. + * The number-minus-offset value is formatted using a + * NumberFormat for the PluralFormat's locale. If you + * need special number formatting, you have to use a MessageFormat + * and explicitly specify a NumberFormat argument. + * Note: That argument is formatted without subtracting the offset! + * If you need a custom format and have a non-zero offset, then you need to pass the + * number-minus-offset value as a separate parameter. *

+ * For a usage example, see the {@link MessageFormat} class documentation. + * *

Defining Custom Plural Rules

*

If you need to use PluralFormat with custom rules, you can * create a PluralRules object and pass it to @@ -511,34 +479,63 @@ public: */ virtual UClassID getDynamicClassID() const; -private: - typedef enum fmtToken { - none, - tLetter, - tNumber, - tSpace, - tNumberSign, - tLeftBrace, - tRightBrace - }fmtToken; + private: + + class PluralSelector { + public: + /** + * Given a number, returns the appropriate PluralFormat keyword. + * + * @param number The number to be plural-formatted. + * @param ec Error code. + * @return The selected PluralFormat keyword. + */ + virtual UnicodeString select(double number, UErrorCode& ec) const = 0; + }; + + class PluralSelectorAdapter : public PluralSelector { + public: + PluralSelectorAdapter() : pluralRules(NULL) { + } + + virtual ~PluralSelectorAdapter(); + + virtual UnicodeString select(double number, UErrorCode& /*ec*/) const; + + void reset(); + + PluralRules* pluralRules; + }; Locale locale; - PluralRules* pluralRules; - UnicodeString pattern; - Hashtable *fParsedValuesHash; + MessagePattern msgPattern; NumberFormat* numberFormat; - NumberFormat* replacedNumberFormat; + double offset; + PluralSelectorAdapter pluralRulesWrapper; PluralFormat(); // default constructor not implemented - void init(const PluralRules* rules, const Locale& curlocale, UErrorCode& status); - UBool inRange(UChar ch, fmtToken& type); - UBool checkSufficientDefinition(); - void parsingFailure(); - UnicodeString insertFormattedNumber(double number, - UnicodeString& message, - UnicodeString& appendTo, - FieldPosition& pos) const; - void copyHashtable(Hashtable *other, UErrorCode& status); + void init(const PluralRules* rules, UErrorCode& status); + /** + * Copies dynamically allocated values (pointer fields). + * Others are copied using their copy constructors and assignment operators. + */ + void copyObjects(const PluralFormat& other); + + /** + * Finds the PluralFormat sub-message for the given number, or the "other" sub-message. 
+ * @param pattern A MessagePattern. + * @param partIndex the index of the first PluralFormat argument style part. + * @param selector the PluralSelector for mapping the number (minus offset) to a keyword. + * @param number a number to be matched to one of the PluralFormat argument's explicit values, + * or mapped via the PluralSelector. + * @param ec ICU error code. + * @return the sub-message start part index. + */ + static int32_t findSubMessage( + const MessagePattern& pattern, int32_t partIndex, + const PluralSelector& selector, double number, UErrorCode& ec); + + friend class MessageFormat; }; U_NAMESPACE_END diff --git a/icu4c/source/i18n/unicode/plurrule.h b/icu4c/source/i18n/unicode/plurrule.h index b96bbb57b7c..c0a5dace819 100644 --- a/icu4c/source/i18n/unicode/plurrule.h +++ b/icu4c/source/i18n/unicode/plurrule.h @@ -84,15 +84,18 @@ class PluralKeywordEnumeration; * \endcode *

*

- * The difference between 'in' and 'within' is that 'in' only includes - * integers in the specified range, while 'within' includes all values.

- *

- * Keywords - * could be defined by users or from ICU locale data. There are 6 - * predefined values in ICU - 'zero', 'one', 'two', 'few', 'many' and - * 'other'. Callers need to check the value of keyword returned by - * {@link #select} method. - *

+ * An "identifier" is a sequence of characters that do not have the + * Unicode Pattern_Syntax or Pattern_White_Space properties. + *

+ * The difference between 'in' and 'within' is that 'in' only includes + * integers in the specified range, while 'within' includes all values.

+ *

+ * Keywords + * could be defined by users or from ICU locale data. There are 6 + * predefined values in ICU - 'zero', 'one', 'two', 'few', 'many' and + * 'other'. Callers need to check the value of keyword returned by + * {@link #select} method. + *

* * Examples:
  * UnicodeString keyword = pl->select(number);
diff --git a/icu4c/source/i18n/unicode/selfmt.h b/icu4c/source/i18n/unicode/selfmt.h
index e53f4ce633c..29b78f427be 100755
--- a/icu4c/source/i18n/unicode/selfmt.h
+++ b/icu4c/source/i18n/unicode/selfmt.h
@@ -1,6 +1,6 @@
 /********************************************************************
  * COPYRIGHT:
- * Copyright (c) 1997-2010, International Business Machines Corporation and
+ * Copyright (c) 1997-2011, International Business Machines Corporation and
  * others. All Rights Reserved.
  * Copyright (C) 2010 , Yahoo! Inc.
  ********************************************************************
@@ -16,8 +16,9 @@
 #ifndef SELFMT
 #define SELFMT
 
-#include "unicode/utypes.h"
+#include "unicode/messagepattern.h"
 #include "unicode/numfmt.h"
+#include "unicode/utypes.h"
 
 /**
  * \file
@@ -28,7 +29,7 @@
 
 U_NAMESPACE_BEGIN
 
-class Hashtable;
+class MessageFormat;
 
 /**
   * 

SelectFormat supports the creation of internationalized @@ -40,6 +41,10 @@ class Hashtable; * *

Using SelectFormat for Gender Agreement

* + *

Note: Typically, select formatting is done via MessageFormat + * with a select argument type, + * rather than using a stand-alone SelectFormat.

+ * *

The main use case for the select format is gender based inflection. * When names or nouns are inserted into sentences, their gender can affect pronouns, * verb forms, articles, and adjectives. Special care needs to be @@ -73,6 +78,9 @@ class Hashtable; * but similar in grammatical use. * Some African languages have around 20 noun classes.

* + *

    Note: For the gender of a person in a given sentence, + * we usually need to distinguish only between female, male and other/unknown.

+ * *

To enable localizers to create sentence patterns that take their * language's gender dependencies into consideration, software has to provide * information about the gender associated with a noun or name to @@ -81,8 +89,8 @@ class Hashtable; * *

    *
  • For people, natural gender information should be maintained for each person. - * The keywords "male", "female", "mixed" (for groups of people) - * and "unknown" are used. + * Keywords like "male", "female", "mixed" (for groups of people) + * and "unknown" could be used. * *
  • For nouns, grammatical gender information should be maintained for * each noun and per language, e.g., in resource bundles. @@ -100,6 +108,11 @@ class Hashtable; * *
    {0} went to {2}.
    * + *

    Note: The entire sentence should be included (and partially repeated) + * inside each phrase. Otherwise translators would have to be trained on how to + * move bits of the sentence in and out of the select argument of a message. + * (The examples below do not follow this recommendation!)

    + * *

    The sentence pattern for French, where the gender of the person affects * the form of the participle, uses a select format based on argument 1:

    * @@ -121,39 +134,24 @@ class Hashtable; * *

    Patterns and Their Interpretation

    * - *

    The SelectFormat pattern text defines the phrase output + *

    The SelectFormat pattern string defines the phrase output * for each user-defined keyword. - * The pattern is a sequence of keyword{phrase} - * clauses. - * Each clause assigns the phrase phrase - * to the user-defined keyword.

    + * The pattern is a sequence of (keyword, message) pairs. + * A keyword is a "pattern identifier": [^[[:Pattern_Syntax:][:Pattern_White_Space:]]]+

    * - *

    Keywords must match the pattern [a-zA-Z][a-zA-Z0-9_-]*; keywords - * that don't match this pattern result in the error code - * U_ILLEGAL_CHARACTER. - * You always have to define a phrase for the default keyword + *

    Each message is a MessageFormat pattern string enclosed in {curly braces}.

    + * + *

    You always have to define a phrase for the default keyword * other; this phrase is returned when the keyword * provided to * the format method matches no other keyword. * If a pattern does not provide a phrase for other, the method * it's provided to returns the error U_DEFAULT_KEYWORD_MISSING. - * If a pattern provides more than one phrase for the same keyword, the - * error U_DUPLICATE_KEYWORD is returned. *
    - * Spaces between keyword and - * {phrase} will be ignored; spaces within - * {phrase} will be preserved.

    + * Pattern_White_Space between keywords and messages is ignored. + * Pattern_White_Space within a message is preserved and output.

    * - *

    The phrase for a particular select case may contain other message - * format patterns. SelectFormat preserves these so that you - * can use the strings produced by SelectFormat with other - * formatters. If you are using SelectFormat inside a - * MessageFormat pattern, MessageFormat will - * automatically evaluate the resulting format pattern. - * Thus, curly braces ({, }) are only allowed - * in phrases to define a nested format pattern.

    - * - *

    Example: + *

    Example:
       * \htmlonly
       *
       * UErrorCode status = U_ZERO_ERROR;
    @@ -342,31 +340,22 @@ public:
         virtual UClassID getDynamicClassID() const;
     
     private:
    -    typedef enum classesForSelectFormat{
    -        tStartKeyword,
    -        tContinueKeyword,
    -        tLeftBrace,
    -        tRightBrace,
    -        tSpace,
    -        tOther
    -    }CharacterClass;
    -
    -    UnicodeString pattern;
    -    //Hash to store the keyword, phrase pairs.
    -    Hashtable  *parsedValuesHash;
    +    friend class MessageFormat;
     
         SelectFormat();   // default constructor not implemented.
    -    void initHashTable(UErrorCode &status);
    -    void cleanHashTable();
     
    -    //For the applyPattern , classifies char.s in one of the characterClass.
    -    CharacterClass classifyCharacter(UChar ch) const;
    -    //Checks if the "other" keyword is present in pattern.
    -    UBool checkSufficientDefinition();
    -    //Checks if the keyword passed is valid.
    -    UBool checkValidKeyword(const UnicodeString& argKeyword) const;
    -    void parsingFailure();
    -    void copyHashtable(Hashtable *other, UErrorCode& status);
    +    /**
    +     * Finds the SelectFormat sub-message for the given keyword, or the "other" sub-message.
    +     * @param pattern A MessagePattern.
    +     * @param partIndex the index of the first SelectFormat argument style part.
    +     * @param keyword a keyword to be matched to one of the SelectFormat argument's keywords.
    +     * @param ec Error code.
    +     * @return the sub-message start part index.
    +     */
    +    static int32_t findSubMessage(const MessagePattern& pattern, int32_t partIndex,
    +                                  const UnicodeString& keyword, UErrorCode& ec);
    +
    +    MessagePattern msgPattern;
     };
     
     U_NAMESPACE_END
    diff --git a/icu4c/source/i18n/unicode/umsg.h b/icu4c/source/i18n/unicode/umsg.h
    index 7d08fd3b79b..52fe90e2d6a 100644
    --- a/icu4c/source/i18n/unicode/umsg.h
    +++ b/icu4c/source/i18n/unicode/umsg.h
    @@ -1,6 +1,6 @@
     /********************************************************************
      * COPYRIGHT: 
    - * Copyright (c) 1997-2010, International Business Machines Corporation and
    + * Copyright (c) 1997-2011, International Business Machines Corporation and
      * others. All Rights Reserved.
      * Copyright (C) 2010 , Yahoo! Inc. 
      ********************************************************************
    @@ -13,7 +13,6 @@
      *   Change history:
      *
      *   08/5/2001  Ram         Added C wrappers for C++ API.
    - *                          
      ********************************************************************/
     
     #ifndef UMSG_H
    @@ -27,19 +26,30 @@
     #include "unicode/uloc.h"
     #include "unicode/parseerr.h"
     #include 
    +
     /**
      * \file
      * \brief C API: MessageFormat
      *
    - * <h2>Message Format C API</h2>
    + * <h2>MessageFormat C API</h2>
      *
    - * Provides means to produce concatenated messages in language-neutral way.
    - * Use this for all concatenations that show up to end users.
    - * <P>
    - * Takes a set of objects, formats them, then inserts the formatted
    - * strings into the pattern at the appropriate places.
    - * <P>
    - * Here are some examples of usage:
    + * <p>MessageFormat prepares strings for display to users,
    + * with optional arguments (variables/placeholders).
    + * The arguments can occur in any order, which is necessary for translation
    + * into languages with different grammars.
    + *
    + * <p>The opaque UMessageFormat type is a thin C wrapper around
    + * a C++ MessageFormat. It is constructed from a pattern string
    + * with arguments in {curly braces} which will be replaced by formatted values.
    + *
    + * <p>Currently, the C API supports only numbered arguments.
    + *
    + * <p>For details about the pattern syntax and behavior,
    + * especially about the ASCII apostrophe vs. the
    + * real apostrophe (single quote) character \htmlonly’\endhtmlonly (U+2019),
    + * see the C++ MessageFormat class documentation.
    + *
    + * <p>Here are some examples of C API usage:
      * Example 1:
      * <pre>
      * \code
    @@ -143,102 +153,6 @@
      * }
      * \endcode
      *  </pre>
    - *
    -
    - * The pattern is of the following form. Legend:
    - * <pre>
    - * \code
    - *       {optional item}
    - *       (group that may be repeated)*
    - * \endcode
    - *  </pre>
    - * Do not confuse optional items with items inside quotes braces, such
    - * as this: "{". Quoted braces are literals.
    - * <pre>
    - * \code
    - *       messageFormatPattern := string ( "{" messageFormatElement "}" string )*
    - *
    - *       messageFormatElement := argument { "," elementFormat }
    - *
    - *       elementFormat := "time" { "," datetimeStyle }
    - *                      | "date" { "," datetimeStyle }
    - *                      | "number" { "," numberStyle }
    - *                      | "choice" "," choiceStyle
    - *                      | "select" "," selectStyle
    - *
    - *       datetimeStyle := "short"
    - *                      | "medium"
    - *                      | "long"
    - *                      | "full"
    - *                      | dateFormatPattern
    - *
    - *       numberStyle :=   "currency"
    - *                      | "percent"
    - *                      | "integer"
    - *                      | numberFormatPattern
    - *
    - *       choiceStyle :=   choiceFormatPattern
    - *
    - *       selectStyle :=   selectFormatPattern
    - * \endcode
    - * </pre>
    - * If there is no elementFormat, then the argument must be a string,
    - * which is substituted. If there is no dateTimeStyle or numberStyle,
    - * then the default format is used (e.g. NumberFormat.getInstance(),
    - * DateFormat.getDefaultTime() or DateFormat.getDefaultDate(). For
    - * a ChoiceFormat, the pattern must always be specified, since there
    - * is no default.
    - * <P>
    - * In strings, single quotes can be used to quote the "{" sign if
    - * necessary. A real single quote is represented by ''. Inside a
    - * messageFormatElement, quotes are [not] removed. For example,
    - * {1,number,$'#',##} will produce a number format with the pound-sign
    - * quoted, with a result such as: "$#31,45".
    - * <P>
    - * If a pattern is used, then unquoted braces in the pattern, if any,
    - * must match: that is, "ab {0} de" and "ab '}' de" are ok, but "ab
    - * {0'}' de" and "ab } de" are not.
    - * <p>
    - * <dl><dt><b>Warning:</b><dd>The rules for using quotes within message
    - * format patterns unfortunately have shown to be somewhat confusing.
    - * In particular, it isn't always obvious to localizers whether single
    - * quotes need to be doubled or not. Make sure to inform localizers about
    - * the rules, and tell them (for example, by using comments in resource
    - * bundle source files) which strings will be processed by MessageFormat.
    - * Note that localizers may need to use single quotes in translated
    - * strings where the original version doesn't have them.
    - * <br>Note also that the simplest way to avoid the problem is to
    - * use the real apostrophe (single quote) character U+2019 (') for
    - * human-readable text, and to use the ASCII apostrophe (U+0027 ' )
    - * only in program syntax, like quoting in MessageFormat.
    - * See the annotations for U+0027 Apostrophe in The Unicode Standard.</p>
    - * </dl>
    - * <P>
    - * The argument is a number from 0 to 9, which corresponds to the
    - * arguments presented in an array to be formatted.
    - * <P>
    - * It is ok to have unused arguments in the array. With missing
    - * arguments or arguments that are not of the right class for the
    - * specified format, a failing UErrorCode result is set.
    - * <P>
    -
    - * <P>
    - * [Note:] As we see above, the string produced by a choice Format in
    - * MessageFormat is treated specially; occurances of '{' are used to
    - * indicated subformats.
    - * <P>
    - * [Note:] Formats are numbered by order of variable in the string.
    - * This is [not] the same as the argument numbering!
    - * <pre>
    - * \code
    - *    For example: with "abc{2}def{3}ghi{0}...",
    - *
    - *    format0 affects the first variable {2}
    - *    format1 affects the second variable {3}
    - *    format2 affects the second variable {0}
    - * \endcode
    - * </pre>
    - * and so on. */ /** diff --git a/icu4c/source/test/cintltst/cmsgtst.c b/icu4c/source/test/cintltst/cmsgtst.c index 0d759105daf..9b337322b9c 100644 --- a/icu4c/source/test/cintltst/cmsgtst.c +++ b/icu4c/source/test/cintltst/cmsgtst.c @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2010, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************** * @@ -736,8 +736,14 @@ static void TestMsgFormatChoice(void) str=(UChar*)malloc(sizeof(UChar) * 25); u_uastrcpy(str, "MyDisk"); log_verbose("Testing message format with choice test #6\n:"); - /*There {0,choice,0#are no files|1#is one file|1applyPattern", possibleDataError); tempBuffer.remove(); tempBuffer = messageFormatter->format(params, 1, tempBuffer, pos, status); - if (tempBuffer != "Double ' Quotes 7 test and quoted {1} test plus other {2} stuff.") + if (tempBuffer != "Double ' Quotes 7 test and quoted {1} test plus 'other {2} stuff'.") dataerrln("quote format test (w/ params) failed. - %s", u_errorName(status)); logln("Formatted with params : " + tempBuffer); @@ -911,12 +913,21 @@ void MessageFormatRegressionTest::Test4142938() */ void MessageFormatRegressionTest::TestChoicePatternQuote() { + // ICU 4.8 ChoiceFormat (like PluralFormat & SelectFormat) + // returns the chosen string unmodified, so that it is usable in a MessageFormat. + // We modified the test strings accordingly. + // Note: Without further formatting/trimming/etc., it is not possible + // to get a single apostrophe as the last character of a non-final choice sub-message + // because the single apostrophe before the pipe '|' would start quoted text. + // Normally, ChoiceFormat is used inside a MessageFormat, where a double apostrophe + // can be used in that case and will be formatted as a single one. 
+ // (Better: Use a "real" apostrophe, U+2019.) UnicodeString DATA [] = { // Pattern 0 value 1 value // {sfb} hacked - changed \u2264 to = (copied from Character Map) - (UnicodeString)"0#can''t|1#can", (UnicodeString)"can't", (UnicodeString)"can", - (UnicodeString)"0#'pound(#)=''#'''|1#xyz", (UnicodeString)"pound(#)='#'", (UnicodeString)"xyz", - (UnicodeString)"0#'1<2 | 1=1'|1#''", (UnicodeString)"1<2 | 1=1", (UnicodeString)"'", + "0#can't|1#can", "can't", "can", + "0#pound(#)='#''|1#xyz", "pound(#)='#''", "xyz", + "0#1<2 '| 1=1'|1#'", "1<2 '| 1=1'", "'", }; for (int i=0; i<9; i+=3) { //try { @@ -929,7 +940,7 @@ void MessageFormatRegressionTest::TestChoicePatternQuote() out = cf->format((double)j, out, pos); if (out != DATA[i+1+j]) errln("Fail: Pattern \"" + DATA[i] + "\" x "+j+" -> " + - out + "; want \"" + DATA[i+1+j] + '"'); + out + "; want \"" + DATA[i+1+j] + "\""); } UnicodeString pat; pat = cf->toPattern(pat); @@ -937,9 +948,9 @@ void MessageFormatRegressionTest::TestChoicePatternQuote() ChoiceFormat *cf2 = new ChoiceFormat(pat, status); pat2 = cf2->toPattern(pat2); if (pat != pat2) - errln("Fail: Pattern \"" + DATA[i] + "\" x toPattern -> \"" + pat + '"'); + errln("Fail: Pattern \"" + DATA[i] + "\" x toPattern -> \"" + pat + "\""); else - logln("Ok: Pattern \"" + DATA[i] + "\" x toPattern -> \"" + pat + '"'); + logln("Ok: Pattern \"" + DATA[i] + "\" x toPattern -> \"" + pat + "\""); /*} catch (IllegalArgumentException e) { errln("Fail: Pattern \"" + DATA[i] + "\" -> " + e); @@ -980,12 +991,12 @@ void MessageFormatRegressionTest::TestAPI() { // Test adoptFormat MessageFormat *fmt = new MessageFormat("",status); - format->adoptFormat("",fmt,status); + format->adoptFormat("some_name",fmt,status); // Must at least pass a valid identifier. failure(status, "adoptFormat"); // Test getFormat format->setFormat((int32_t)0,*fmt); - format->getFormat("",status); + format->getFormat("some_other_name",status); // Must at least pass a valid identifier. 
failure(status, "getFormat"); delete format; } diff --git a/icu4c/source/test/intltest/plurfmts.cpp b/icu4c/source/test/intltest/plurfmts.cpp index 30d1e287e3d..e1308582800 100644 --- a/icu4c/source/test/intltest/plurfmts.cpp +++ b/icu4c/source/test/intltest/plurfmts.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2007-2010, International Business Machines Corporation and + * Copyright (c) 2007-2011, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -35,6 +35,8 @@ void PluralFormatTest::runIndexedTest( int32_t index, UBool exec, const char* &n TESTCASE(0, pluralFormatBasicTest); TESTCASE(1, pluralFormatUnitTest); TESTCASE(2, pluralFormatLocaleTest); + TESTCASE(3, pluralFormatExtendedTest); + TESTCASE(4, pluralFormatExtendedParseTest); default: name = ""; break; } @@ -159,26 +161,28 @@ void PluralFormatTest::pluralFormatUnitTest(/*char *par*/) UNICODE_STRING_SIMPLE("odd {# is odd.} other{# is even.}"), UNICODE_STRING_SIMPLE("other{# is odd or even.}"), UNICODE_STRING_SIMPLE("odd{The number {0, number, #.#0} is odd.}other{The number {0, number, #.#0} is even.}"), - UNICODE_STRING_SIMPLE("odd{The number {#} is odd.}other{The number {#} is even.}"), + UNICODE_STRING_SIMPLE("odd{The number {1, number, #} is odd.}other{The number {2, number, #} is even.}"), }; UnicodeString patternOddTestResult[PLURAL_PATTERN_DATA] = { UNICODE_STRING_SIMPLE(" is odd."), UNICODE_STRING_SIMPLE(" is odd or even."), UNICODE_STRING_SIMPLE("The number {0, number, #.#0} is odd."), - UNICODE_STRING_SIMPLE("The number {#} is odd."), + UNICODE_STRING_SIMPLE("The number {1, number, #} is odd."), }; UnicodeString patternEvenTestResult[PLURAL_PATTERN_DATA] = { UNICODE_STRING_SIMPLE(" is even."), UNICODE_STRING_SIMPLE(" is odd or even."), UNICODE_STRING_SIMPLE("The number {0, number, #.#0} is even."), - UNICODE_STRING_SIMPLE("The 
number {#} is even."), + UNICODE_STRING_SIMPLE("The number {2, number, #} is even."), }; UnicodeString checkSyntaxtData[PLURAL_SYNTAX_DATA] = { - UNICODE_STRING_SIMPLE("odd{foo} odd{bar} other{foobar}"), - UNICODE_STRING_SIMPLE("odd{foo} other{bar} other{foobar}"), + // ICU 4.8 does not check for duplicate keywords any more. + //UNICODE_STRING_SIMPLE("odd{foo} odd{bar} other{foobar}"), + //UNICODE_STRING_SIMPLE("odd{foo} other{bar} other{foobar}"), UNICODE_STRING_SIMPLE("odd{foo}"), - UNICODE_STRING_SIMPLE("otto{foo} other{bar}"), - UNICODE_STRING_SIMPLE("1odd{foo} other{bar}"), + // ICU 4.8 does not check for unknown keywords any more. + //UNICODE_STRING_SIMPLE("otto{foo} other{bar}"), + UNICODE_STRING_SIMPLE("*odd{foo} other{bar}"), UNICODE_STRING_SIMPLE("odd{foo},other{bar}"), UNICODE_STRING_SIMPLE("od d{foo} other{bar}"), UNICODE_STRING_SIMPLE("odd{foo}{foobar}other{foo}"), @@ -264,7 +268,7 @@ void PluralFormatTest::pluralFormatUnitTest(/*char *par*/) } numberFormatTest(&pluralFmt, numFmt, 5, 5, NULL, NULL, FALSE, &message); pluralFmt.applyPattern(UNICODE_STRING_SIMPLE("odd__{odd} other{even}"), status); - if (U_SUCCESS(status)) { + if (pluralFmt.format(1, status) != UNICODE_STRING_SIMPLE("even")) { errln("SetLocale should reset rules but did not."); } status = U_ZERO_ERROR; @@ -491,6 +495,73 @@ PluralFormatTest::pluralFormatLocaleTest(/*char *par*/) } } +void +PluralFormatTest::pluralFormatExtendedTest(void) { + const char *targets[] = { + "There are no widgets.", + "There is one widget.", + "There is a bling widget and one other widget.", + "There is a bling widget and 2 other widgets.", + "There is a bling widget and 3 other widgets.", + "Widgets, five (5-1=4) there be.", + "There is a bling widget and 5 other widgets.", + "There is a bling widget and 6 other widgets.", + }; + + const char* fmt = + "offset:1.0 " + "=0 {There are no widgets.} " + "=1.0 {There is one widget.} " + "=5 {Widgets, five (5-1=#) there be.} " + "one {There is a bling widget and one 
other widget.} " + "other {There is a bling widget and # other widgets.}"; + + UErrorCode status = U_ZERO_ERROR; + UnicodeString fmtString(fmt, -1, US_INV); + PluralFormat pf(fmtString, status); + if (U_FAILURE(status)) { + errln("Failed to apply pattern - %s\n", u_errorName(status)); + return; + } + for (int i = 0; i < 7; ++i) { + UnicodeString result = pf.format(i, status); + if (U_FAILURE(status)) { + errln("Failed to format - %s\n", u_errorName(status)); + } + UnicodeString expected(targets[i], -1, US_INV); + if (expected != result) { + UnicodeString message("Expected '", -1, US_INV); + message.append(expected); + message.append(UnicodeString("' but got '", -1, US_INV)); + message.append(result); + message.append("'", -1, US_INV); + errln(message); + return; + } + } +} + +void +PluralFormatTest::pluralFormatExtendedParseTest(void) { + const char *failures[] = { + "offset:1..0 =0 {Foo}", + "offset:1.0 {Foo}", + "=0= {Foo}", + "=0 {Foo} =0.0 {Bar}", + " = {Foo}", + }; + int len = sizeof(failures)/sizeof(failures[0]); + + for (int i = 0; i < len; ++i) { + UErrorCode status = U_ZERO_ERROR; + UnicodeString fmt(failures[i], -1, US_INV); + PluralFormat pf(fmt, status); + if (U_SUCCESS(status)) { + errln("expected failure when parsing '" + fmt + "'"); + } + } +} + void PluralFormatTest::numberFormatTest(PluralFormat* plFmt, NumberFormat *numFmt, diff --git a/icu4c/source/test/intltest/plurfmts.h b/icu4c/source/test/intltest/plurfmts.h index d60170037bc..beea2e74308 100644 --- a/icu4c/source/test/intltest/plurfmts.h +++ b/icu4c/source/test/intltest/plurfmts.h @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2001, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. 
********************************************************************/ @@ -29,6 +29,8 @@ private: void pluralFormatBasicTest(/* char* par */); void pluralFormatUnitTest(/* char* par */); void pluralFormatLocaleTest(/* char* par */); + void pluralFormatExtendedTest(); + void pluralFormatExtendedParseTest(); void numberFormatTest(PluralFormat* plFmt, NumberFormat *numFmt, int32_t start, diff --git a/icu4c/source/test/intltest/selfmts.cpp b/icu4c/source/test/intltest/selfmts.cpp index 46aca86c2de..3c7796d8bdb 100644 --- a/icu4c/source/test/intltest/selfmts.cpp +++ b/icu4c/source/test/intltest/selfmts.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2010, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. * Copyright (C) 2010 , Yahoo! Inc. ********************************************************************/ @@ -12,7 +12,8 @@ #include "selfmts.h" #include "cmemory.h" #include "unicode/selfmt.h" -#include "stdio.h" + +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) #define SIMPLE_PATTERN_STRING "feminine {feminineVerbValue} other{otherVerbValue}" @@ -81,10 +82,8 @@ void SelectFormatTest::selectFormatUnitTest(/*char *par*/) }; UnicodeString checkSyntaxData[SELECT_SYNTAX_DATA] = { - UNICODE_STRING_SIMPLE("odd{foo} odd{bar} other{foobar}"), - UNICODE_STRING_SIMPLE("odd{foo} other{bar} other{foobar}"), UNICODE_STRING_SIMPLE("odd{foo}"), - UNICODE_STRING_SIMPLE("1odd{foo} other{bar}"), + UNICODE_STRING_SIMPLE("*odd{foo} other{bar}"), UNICODE_STRING_SIMPLE("odd{foo},other{bar}"), UNICODE_STRING_SIMPLE("od d{foo} other{bar}"), UNICODE_STRING_SIMPLE("odd{foo}{foobar}other{foo}"), @@ -93,19 +92,6 @@ void SelectFormatTest::selectFormatUnitTest(/*char *par*/) UNICODE_STRING_SIMPLE("odd{fo{o1}other{foo2}}") }; - UErrorCode expErrorCodes[SELECT_SYNTAX_DATA]={ - U_DUPLICATE_KEYWORD, - 
U_DUPLICATE_KEYWORD, - U_DEFAULT_KEYWORD_MISSING, - U_PATTERN_SYNTAX_ERROR, - U_PATTERN_SYNTAX_ERROR, - U_PATTERN_SYNTAX_ERROR, - U_PATTERN_SYNTAX_ERROR, - U_PATTERN_SYNTAX_ERROR, - U_PATTERN_SYNTAX_ERROR, - U_DEFAULT_KEYWORD_MISSING - }; - UErrorCode status = U_ZERO_ERROR; VERBOSE_USTRING(SIMPLE_PATTERN); SelectFormat* selFmt = new SelectFormat( SIMPLE_PATTERN , status); @@ -113,7 +99,7 @@ void SelectFormatTest::selectFormatUnitTest(/*char *par*/) dataerrln("ERROR: SelectFormat Unit Test constructor failed in unit tests.- exitting"); return; } - + // ======= Test SelectFormat pattern syntax. logln("SelectFormat Unit Test : Testing SelectFormat pattern syntax."); for (int32_t i=0; iapplyPattern(checkSyntaxData[i], status); - if( status!= expErrorCodes[i] ){ - errln("\nERROR: Unexpected result - SelectFormat Unit Test failed to detect syntax error with pattern: "+checkSyntaxData[i]+" and expected status="+ u_errorName(expErrorCodes[i]) + " and resulted status="+u_errorName(status)); + if (U_SUCCESS(status)){ + errln("\nERROR: Unexpected result - SelectFormat Unit Test failed to detect syntax error with pattern: "+checkSyntaxData[i]); } } + // ICU 4.8 does not check for duplicate keywords any more. 
+ status = U_ZERO_ERROR; + selFmt->applyPattern("odd{foo} odd{bar} other{foobar}", status); + FieldPosition format_ignore(FieldPosition::DONT_CARE); + UnicodeString format_result; + selFmt->format(UnicodeString("odd"), format_result, format_ignore, status); + assertEquals("should use first occurrence of the 'odd' keyword", "foo", format_result); + format_result.remove(); + selFmt->applyPattern("odd{foo} other{bar} other{foobar}", status); + selFmt->format(UnicodeString("other"), format_result, format_ignore, status); + assertEquals("should use first occurrence of the 'other' keyword", "bar", format_result); + delete selFmt; selFmt = NULL; @@ -166,27 +164,31 @@ void SelectFormatTest::selectFormatUnitTest(/*char *par*/) } //Test with an invalid keyword + // one which contains Pattern_Syntax or Pattern_White_Space. logln("SelectFormat Unit test: Testing format() with keyword method and with invalid keywords..."); status = U_ZERO_ERROR; result.remove(); UnicodeString keywords[] = { - "9Keyword-_", //Starts with a digit - "-Keyword-_", //Starts with a hyphen - "_Keyword-_", //Starts with a underscore - "\\u00E9Keyword-_", //Starts with non-ASCII character - "Key*word-_", //Contains a sepial character not allowed - "*Keyword-_" //Starts with a sepial character not allowed + "9Keyword-_", + "-Keyword-_", + "_Keyword-_", + "\\u00E9Keyword-_", + "Key word-_", + " Keyword-_", + "Key*word-_", + "*Keyword-_" }; delete selFmt; selFmt = NULL; selFmt = new SelectFormat( SIMPLE_PATTERN , status); - for (int32_t i = 0; i< 6; i++ ){ + for (int32_t i = 0; i < LENGTHOF(keywords); i++ ){ status = U_ZERO_ERROR; selFmt->format( keywords[i], result , ignore , status); if (!U_FAILURE(status)) { - errln("ERROR: SelectFormat Unit test failed in format() with keyWord and with an invalid keyword as : "+ keywords[i]); + errln("ERROR: SelectFormat Unit test failed in format() with keyWord and with an invalid keyword as : "+ + keywords[i]+" ("+u_errorName(status)+")"); } } diff --git 
a/icu4c/source/test/intltest/tchcfmt.cpp b/icu4c/source/test/intltest/tchcfmt.cpp index c45ca837fb6..bd50389d4d8 100644 --- a/icu4c/source/test/intltest/tchcfmt.cpp +++ b/icu4c/source/test/intltest/tchcfmt.cpp @@ -1,7 +1,7 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2009, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -80,11 +80,23 @@ TestChoiceFormat::TestSimpleExample( void ) } delete formequal; delete formnew; - + //Testing getLimits() - double *gotLimits=0; int32_t count=0; - gotLimits=(double*)form->getLimits(count); + const double *gotLimits=form->getLimits(count); +#if 1 // ICU 4.8 deprecates and disables the ChoiceFormat getters. + if(count != 0 || gotLimits != NULL) { + errln("getLimits() returns something, should be disabled"); + } + const UnicodeString *gotFormats=form->getFormats(count); + if(count != 0 || gotFormats != NULL) { + errln("getFormats() returns something, should be disabled"); + } + const UBool *gotClosures=form->getClosures(count); + if(count != 0 || gotClosures != NULL) { + errln("getClosures() returns something, should be disabled"); + } +#else if(count != 7){ errln("getLimits didn't update the count correctly\n"); } @@ -93,10 +105,9 @@ TestChoiceFormat::TestSimpleExample( void ) errln((UnicodeString)"getLimits didn't get the limits correctly. 
Expected " + limits[ix] + " Got " + gotLimits[ix]); } } - //Testing getFormat() + //Testing getFormats() count=0; - UnicodeString *gotFormats=0; - gotFormats=(UnicodeString*)form->getFormats(count); + const UnicodeString *gotFormats=form->getFormats(count); if(count != 7){ errln("getFormats didn't update the count correctly\n"); } @@ -105,10 +116,9 @@ TestChoiceFormat::TestSimpleExample( void ) errln((UnicodeString)"getFormats didn't get the Formats correctly. Expected " + monthNames[ix] + " Got " + gotFormats[ix]); } } - - +#endif + delete form; - } void @@ -216,6 +226,7 @@ TestChoiceFormat::TestComplexExample( void ) it_logln("------ additional testing in complex test ------"); it_logln(); // +#if 0 // ICU 4.8 deprecates and disables the ChoiceFormat getters. int32_t retCount; const double* retLimits = fileform->getLimits( retCount ); if ((retCount == 4) && (retLimits) @@ -238,6 +249,7 @@ TestChoiceFormat::TestComplexExample( void ) }else{ it_errln("*** getFormats unexpected result!"); } +#endif UnicodeString checkstr2[] = { "There is no folder on Disk_A", @@ -486,6 +498,7 @@ void TestChoiceFormat::TestClosures(void) { errln("FAIL: fmt1 != fmt2"); } +#if 0 // ICU 4.8 deprecates and disables the ChoiceFormat getters. int32_t i; int32_t count2 = 0; const double *limits2 = fmt2.getLimits(count2); @@ -507,6 +520,7 @@ void TestChoiceFormat::TestClosures(void) { } } } +#endif // Now test both format objects UnicodeString exp[] = { @@ -596,6 +610,7 @@ void TestChoiceFormat::TestPatterns(void) { 1.0, "b", 1.0 + 1e-9, "c"); +#if 0 // ICU 4.8 only checks the pattern syntax, not whether the ranges make sense. // Try an invalid pattern that isolates a single value. 
// [-Inf,1.0) [1.0,1.0) [1.0,+Inf] _testPattern("0.0#a|1.0#b|1.0#c", FALSE, @@ -614,6 +629,7 @@ void TestChoiceFormat::TestPatterns(void) { // [-Inf,2.0) [2.0,1.0) [1.0,+Inf] _testPattern("0.0#a|2.0#b|1.0#c", FALSE, 0, 0, 0, 0, 0, 0); +#endif } void TestChoiceFormat::TestChoiceFormatToPatternOverflow() diff --git a/icu4c/source/test/intltest/tmsgfmt.cpp b/icu4c/source/test/intltest/tmsgfmt.cpp index 3b08e8e66f1..dc7f291f1b4 100644 --- a/icu4c/source/test/intltest/tmsgfmt.cpp +++ b/icu4c/source/test/intltest/tmsgfmt.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2010, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************** * File TMSGFMT.CPP @@ -25,43 +25,46 @@ #include "unicode/msgfmt.h" #include "unicode/numfmt.h" #include "unicode/choicfmt.h" +#include "unicode/messagepattern.h" #include "unicode/selfmt.h" #include "unicode/gregocal.h" #include -#define E_WITH_ACUTE ((char)0x00E9) -static const char E_ACCENTED[]={E_WITH_ACUTE,0}; +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) void TestMessageFormat::runIndexedTest(int32_t index, UBool exec, const char* &name, char* /*par*/) { - switch (index) { - TESTCASE(0,testBug1); - TESTCASE(1,testBug2); - TESTCASE(2,sample); - TESTCASE(3,PatternTest); - TESTCASE(4,testStaticFormat); - TESTCASE(5,testSimpleFormat); - TESTCASE(6,testMsgFormatChoice); - TESTCASE(7,testCopyConstructor); - TESTCASE(8,testAssignment); - TESTCASE(9,testClone); - TESTCASE(10,testEquals); - TESTCASE(11,testNotEquals); - TESTCASE(12,testSetLocale); - TESTCASE(13,testFormat); - TESTCASE(14,testParse); - TESTCASE(15,testAdopt); - TESTCASE(16,testCopyConstructor2); - TESTCASE(17,TestUnlimitedArgsAndSubformats); - TESTCASE(18,TestRBNF); - TESTCASE(19,TestTurkishCasing); - 
TESTCASE(20,testAutoQuoteApostrophe); - TESTCASE(21,testMsgFormatPlural); - TESTCASE(22,testCoverage); - TESTCASE(23,testMsgFormatSelect); - default: name = ""; break; - } + TESTCASE_AUTO_BEGIN; + TESTCASE_AUTO(testBug1); + TESTCASE_AUTO(testBug2); + TESTCASE_AUTO(sample); + TESTCASE_AUTO(PatternTest); + TESTCASE_AUTO(testStaticFormat); + TESTCASE_AUTO(testSimpleFormat); + TESTCASE_AUTO(testMsgFormatChoice); + TESTCASE_AUTO(testCopyConstructor); + TESTCASE_AUTO(testAssignment); + TESTCASE_AUTO(testClone); + TESTCASE_AUTO(testEquals); + TESTCASE_AUTO(testNotEquals); + TESTCASE_AUTO(testSetLocale); + TESTCASE_AUTO(testFormat); + TESTCASE_AUTO(testParse); + TESTCASE_AUTO(testAdopt); + TESTCASE_AUTO(testCopyConstructor2); + TESTCASE_AUTO(TestUnlimitedArgsAndSubformats); + TESTCASE_AUTO(TestRBNF); + TESTCASE_AUTO(TestTurkishCasing); + TESTCASE_AUTO(testAutoQuoteApostrophe); + TESTCASE_AUTO(testMsgFormatPlural); + TESTCASE_AUTO(testMsgFormatSelect); + TESTCASE_AUTO(testApostropheInPluralAndSelect); + TESTCASE_AUTO(TestApostropheMode); + TESTCASE_AUTO(TestCompatibleApostrophe); + TESTCASE_AUTO(testCoverage); + TESTCASE_AUTO(TestTrimArgumentName); + TESTCASE_AUTO_END; } void TestMessageFormat::testBug3() @@ -261,7 +264,9 @@ void TestMessageFormat::PatternTest() "'{1,number,#,##}' {1,number,#,##}", }; - UnicodeString testResultPatterns[] = { + // ICU 4.8 returns the original pattern (testCases), + // rather than toPattern() reconstituting a new, equivalent pattern string (testResultPatterns). 
+ /*UnicodeString testResultPatterns[] = { "Quotes '', '{', a {0} '{'0}", "Quotes '', '{', a {0,number} '{'0}", "'{'1,number,#,##} {1,number,'#'#,##}", @@ -271,12 +276,12 @@ void TestMessageFormat::PatternTest() "'{'1,date,full}, {1,date,full},", "'{'3,date,full}, {3,date,full},", "'{'1,number,#,##} {1,number,#,##}" - }; + };*/ UnicodeString testResultStrings[] = { - "Quotes ', {, a 1 {0}", - "Quotes ', {, a 1 {0}", - "{1,number,#,##} #34,56", + "Quotes ', {, 'a' 1 {0}", + "Quotes ', {, 'a' 1 {0}", + "{1,number,'#',##} #34,56", "There are 3,456 files on Disk at 1/12/70 5:46 AM.", "On Disk, there are 3,456 files, with $1.00.", "{1,number,percent}, 345,600%,", @@ -298,11 +303,17 @@ void TestMessageFormat::PatternTest() logln(((UnicodeString)"MessageFormat for ") + testCases[i] + " creation failed.\n"); continue; } - if (form->toPattern(buffer) != testResultPatterns[i]) { + // ICU 4.8 returns the original pattern (testCases), + // rather than toPattern() reconstituting a new, equivalent pattern string (testResultPatterns). + if (form->toPattern(buffer) != testCases[i]) { + // Note: An alternative test would be to build MessagePattern objects for + // both the input and output patterns and compare them, taking SKIP_SYNTAX etc. + // into account. + // (Too much trouble...) 
errln(UnicodeString("TestMessageFormat::PatternTest failed test #2, i = ") + i); //form->toPattern(buffer); errln(((UnicodeString)" Orig: ") + testCases[i]); - errln(((UnicodeString)" Exp: ") + testResultPatterns[i]); + errln(((UnicodeString)" Exp: ") + testCases[i]); errln(((UnicodeString)" Got: ") + buffer); } @@ -322,7 +333,7 @@ void TestMessageFormat::PatternTest() logln(UnicodeString(" Result: ") + result ); logln(UnicodeString(" Expected: ") + testResultStrings[i] ); } - + //it_out << "Result: " << result); #if 0 @@ -534,7 +545,7 @@ void TestMessageFormat::testMsgFormatPlural(/* char* par */) UnicodeString t2("{argument, plural, one{C''est # fichier} other {Ce sont # fichiers}} dans la liste."); UnicodeString t3("There {0, plural, one{is # zavod}few{are {0, number,###.0} zavoda} other{are # zavodov}} in the directory."); UnicodeString t4("There {argument, plural, one{is # zavod}few{are {argument, number,###.0} zavoda} other{are #zavodov}} in the directory."); - UnicodeString t5("{0, plural, one {{0, number,C''''est #,##0.0# fichier}} other {Ce sont # fichiers}} dans la liste."); + UnicodeString t5("{0, plural, one {{0, number,C''est #,##0.0# fichier}} other {Ce sont # fichiers}} dans la liste."); MessageFormat* mfNum = new MessageFormat(t1, Locale("fr"), err); if (U_FAILURE(err)) { dataerrln("TestMessageFormat::testMsgFormatPlural #1 - argumentIndex - %s", u_errorName(err)); @@ -611,15 +622,32 @@ void TestMessageFormat::testMsgFormatPlural(/* char* par */) errln("TestMessageFormat::test nested PluralFormat with argumentName"); } if ( argNameResult!= UnicodeString("C'est 0,0 fichier dans la liste.")) { - errln(UnicodeString("TestMessageFormat::test nested named PluralFormat.")); + errln(UnicodeString("TestMessageFormat::test nested named PluralFormat: ") + argNameResult); logln(UnicodeString("The unexpected nested named PluralFormat.")); } delete msgFmt; } +void TestMessageFormat::testApostropheInPluralAndSelect() { + UErrorCode errorCode = U_ZERO_ERROR; + 
MessageFormat msgFmt(UNICODE_STRING_SIMPLE( + "abc_{0,plural,other{#'#'#'{'#''}}_def_{1,select,other{sel'}'ect''}}_xyz"), + Locale::getEnglish(), + errorCode); + if (U_FAILURE(errorCode)) { + errln("MessageFormat constructor failed - %s\n", u_errorName(errorCode)); + return; + } + UnicodeString expected = UNICODE_STRING_SIMPLE("abc_3#3{3'_def_sel}ect'_xyz"); + Formattable args[] = { 3, UNICODE_STRING_SIMPLE("x") }; + internalFormat( + &msgFmt, args, 2, expected, + "MessageFormat with apostrophes in plural/select arguments failed:\n"); +} + void TestMessageFormat::internalFormat(MessageFormat* msgFmt , Formattable* args , int32_t numOfArgs , - UnicodeString expected ,char* errMsg) + UnicodeString expected, const char* errMsg) { UnicodeString result; FieldPosition ignore(FieldPosition::DONT_CARE); @@ -1236,7 +1264,12 @@ void TestMessageFormat::testAdopt() } assertEquals("msgCmp.toPattern()", formatStr, msgCmp.toPattern(patCmp.remove())); - assertEquals("msg.toPattern()", formatStr, msg.toPattern(patAct.remove())); + // ICU 4.8 does not support toPattern() when there are custom formats (from setFormat() etc.). + // assertEquals("msg.toPattern()", formatStr, msg.toPattern(patAct.remove())); + msg.toPattern(patCmp.remove()); + if (!patCmp.isBogus()) { + errln("msg.setFormat().toPattern() succeeds."); + } for (i = 0; i < countAct; i++) { a = formatsAct[i]; @@ -1279,7 +1312,8 @@ void TestMessageFormat::testAdopt() delete[] formatsToAdopt; assertEquals("msgCmp.toPattern()", formatStr, msgCmp.toPattern(patCmp.remove())); - assertEquals("msg.toPattern()", formatStr, msg.toPattern(patAct.remove())); + // ICU 4.8 does not support toPattern() when there are custom formats (from setFormat() etc.). 
+ // assertEquals("msg.toPattern()", formatStr, msg.toPattern(patAct.remove())); formatsAct = msg.getFormats(countAct); if (!formatsAct || (countAct <=0) || (countAct != countCmp)) { @@ -1330,7 +1364,8 @@ void TestMessageFormat::testAdopt() delete[] formatsToAdopt; // array itself not needed in this case; assertEquals("msgCmp.toPattern()", formatStr, msgCmp.toPattern(patCmp.remove())); - assertEquals("msg.toPattern()", formatStr, msg.toPattern(patAct.remove())); + // ICU 4.8 does not support toPattern() when there are custom formats (from setFormat() etc.). + // assertEquals("msg.toPattern()", formatStr, msg.toPattern(patAct.remove())); formatsAct = msg.getFormats(countAct); if (!formatsAct || (countAct <=0) || (countAct != countCmp)) { @@ -1519,6 +1554,116 @@ void TestMessageFormat::TestRBNF(void) { delete numFmt; } +UnicodeString TestMessageFormat::GetPatternAndSkipSyntax(const MessagePattern& pattern) { + UnicodeString us(pattern.getPatternString()); + int count = pattern.countParts(); + for (int i = count; i > 0;) { + const MessagePattern::Part& part = pattern.getPart(--i); + if (part.getType() == UMSGPAT_PART_TYPE_SKIP_SYNTAX) { + us.remove(part.getIndex(), part.getLimit() - part.getIndex()); + } + } + return us; +} + +void TestMessageFormat::TestApostropheMode() { + UErrorCode status = U_ZERO_ERROR; + MessagePattern *ado_mp = new MessagePattern(UMSGPAT_APOS_DOUBLE_OPTIONAL, status); + MessagePattern *adr_mp = new MessagePattern(UMSGPAT_APOS_DOUBLE_REQUIRED, status); + if (ado_mp->getApostropheMode() != UMSGPAT_APOS_DOUBLE_OPTIONAL) { + errln("wrong value from ado_mp->getApostropheMode()."); + } + if (adr_mp->getApostropheMode() != UMSGPAT_APOS_DOUBLE_REQUIRED) { + errln("wrong value from adr_mp->getApostropheMode()."); + } + + + UnicodeString tuples[] = { + // Desired output + // DOUBLE_OPTIONAL pattern + // DOUBLE_REQUIRED pattern (empty=same as DOUBLE_OPTIONAL) + "I see {many}", "I see '{many}'", "", + "I said {'Wow!'}", "I said '{''Wow!''}'", "", + "I dont 
know", "I dont know", "I don't know", + "I don't know", "I don't know", "I don''t know", + "I don't know", "I don''t know", "I don''t know" + }; + int32_t tuples_count = LENGTHOF(tuples); + + for (int i = 0; i < tuples_count; i += 3) { + UnicodeString& desired = tuples[i]; + UnicodeString& ado_pattern = tuples[i + 1]; + UErrorCode status = U_ZERO_ERROR; + assertEquals("DOUBLE_OPTIONAL failure", + desired, + GetPatternAndSkipSyntax(ado_mp->parse(ado_pattern, NULL, status))); + UnicodeString& adr_pattern = tuples[i + 2].isEmpty() ? ado_pattern : tuples[i + 2]; + assertEquals("DOUBLE_REQUIRED failure", desired, + GetPatternAndSkipSyntax(adr_mp->parse(adr_pattern, NULL, status))); + } + delete adr_mp; + delete ado_mp; +} + + +// Compare behavior of DOUBLE_OPTIONAL (new default) and DOUBLE_REQUIRED JDK-compatibility mode. +void TestMessageFormat::TestCompatibleApostrophe() { + // Message with choice argument which does not contain another argument. + // The JDK performs only one apostrophe-quoting pass on this pattern. 
+ UnicodeString pattern = "ab{0,choice,0#1'2''3'''4''''.}yz"; + + UErrorCode ec = U_ZERO_ERROR; + MessageFormat compMsg("", Locale::getUS(), ec); + compMsg.applyPattern(pattern, UMSGPAT_APOS_DOUBLE_REQUIRED, NULL, ec); + if (compMsg.getApostropheMode() != UMSGPAT_APOS_DOUBLE_REQUIRED) { + errln("wrong value from compMsg.getApostropheMode()."); + } + + MessageFormat icuMsg("", Locale::getUS(), ec); + icuMsg.applyPattern(pattern, UMSGPAT_APOS_DOUBLE_OPTIONAL, NULL, ec); + if (icuMsg.getApostropheMode() != UMSGPAT_APOS_DOUBLE_OPTIONAL) { + errln("wrong value from icuMsg.getApostropheMode()."); + } + + Formattable zero0[] = { 0 }; + FieldPosition fieldpos(0); + UnicodeString buffer1, buffer2; + assertEquals("incompatible ICU MessageFormat compatibility-apostrophe behavior", + "ab12'3'4''.yz", + compMsg.format(zero0, 1, buffer1, fieldpos, ec)); + assertEquals("unexpected ICU MessageFormat double-apostrophe-optional behavior", + "ab1'2'3''4''.yz", + icuMsg.format(zero0, 1, buffer2, fieldpos, ec)); + + // Message with choice argument which contains a nested simple argument. + // The DOUBLE_REQUIRED version performs two apostrophe-quoting passes. + buffer1.remove(); + buffer2.remove(); + pattern = "ab{0,choice,0#1'2''3'''4''''.{0,number,'#x'}}yz"; + compMsg.applyPattern(pattern, ec); + icuMsg.applyPattern(pattern, ec); + assertEquals("incompatible ICU MessageFormat compatibility-apostrophe behavior", + "ab1234'.0xyz", + compMsg.format(zero0, 1, buffer1, fieldpos, ec)); + assertEquals("unexpected ICU MessageFormat double-apostrophe-optional behavior", + "ab1'2'3''4''.#x0yz", + icuMsg.format(zero0, 1, buffer2, fieldpos, ec)); + + // This part is copied over from Java tests but cannot be properly tested here + // because we do not have a live reference implementation with JDK behavior. + // The JDK ChoiceFormat itself always performs one apostrophe-quoting pass. 
+ /* + ChoiceFormat choice = new ChoiceFormat("0#1'2''3'''4''''."); + assertEquals("unexpected JDK ChoiceFormat apostrophe behavior", + "12'3'4''.", + choice.format(0)); + choice.applyPattern("0#1'2''3'''4''''.{0,number,'#x'}"); + assertEquals("unexpected JDK ChoiceFormat apostrophe behavior", + "12'3'4''.{0,number,#x}", + choice.format(0)); + */ +} + void TestMessageFormat::testAutoQuoteApostrophe(void) { const char* patterns[] = { // pattern, expected pattern "'", "''", @@ -1595,7 +1740,10 @@ void TestMessageFormat::testCoverage(void) { } } - msgfmt->adoptFormat("adopt", &cf, status); + // adoptFormat() takes ownership of the input Format object. + // We need to clone the stack-allocated cf so that we do not attempt to delete cf. + Format *cfClone = cf.clone(); + msgfmt->adoptFormat("adopt", cfClone, status); delete en; delete msgfmt; @@ -1609,18 +1757,38 @@ void TestMessageFormat::testCoverage(void) { errln("FAIL: Unable to detect usage of named arguments."); } + // Starting with ICU 4.8, we support setFormat(name, ...) and getFormatNames() + // on a MessageFormat without named arguments. msgfmt->setFormat("formatName", cf, status); - if (!U_FAILURE(status)) { - errln("FAIL: Should fail to setFormat instead of passing."); + if (U_FAILURE(status)) { + errln("FAIL: Should work to setFormat(name, ...) regardless of pattern."); } status = U_ZERO_ERROR; en = msgfmt->getFormatNames(status); - if (!U_FAILURE(status)) { - errln("FAIL: Should fail to get format names enumeration instead of passing."); + if (U_FAILURE(status)) { + errln("FAIL: Should work to get format names enumeration regardless of pattern."); } delete en; delete msgfmt; } +void TestMessageFormat::TestTrimArgumentName() { + // ICU 4.8 allows and ignores white space around argument names and numbers. 
+ IcuTestErrorCode errorCode(*this, "TestTrimArgumentName"); + MessageFormat m("a { 0 , number , '#,#'#.0 } z", Locale::getEnglish(), errorCode); + Formattable args[1] = { 2 }; + FieldPosition ignore(0); + UnicodeString result; + assertEquals("trim-numbered-arg format() failed", "a #,#2.0 z", + m.format(args, 1, result, ignore, errorCode)); + + m.applyPattern("x { _oOo_ , number , integer } y", errorCode); + UnicodeString argName = UNICODE_STRING_SIMPLE("_oOo_"); + args[0].setLong(3); + result.remove(); + assertEquals("trim-named-arg format() failed", "x 3 y", + m.format(&argName, args, 1, result, errorCode)); +} + #endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/test/intltest/tmsgfmt.h b/icu4c/source/test/intltest/tmsgfmt.h index 9d47c860b83..80b31b4a90f 100644 --- a/icu4c/source/test/intltest/tmsgfmt.h +++ b/icu4c/source/test/intltest/tmsgfmt.h @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2010, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ #ifndef _TESTMESSAGEFORMAT @@ -65,12 +65,14 @@ public: **/ void testMsgFormatSelect(/* char* par */); + void testApostropheInPluralAndSelect(); + /** * Internal method to format a MessageFormat object with passed args **/ void internalFormat(MessageFormat* msgFmt , Formattable* args , int32_t numOfArgs , - UnicodeString expected ,char* errMsg); + UnicodeString expected, const char* errMsg); /** * Internal method to create a MessageFormat object with passed args @@ -89,7 +91,10 @@ public: */ void TestRBNF(); - // + void TestApostropheMode(); + + void TestCompatibleApostrophe(); + /** * ------------ API tests ---------- * These routines test various API functionality. 
@@ -108,11 +113,13 @@ public: void testAdopt(void); void TestTurkishCasing(void); void testAutoQuoteApostrophe(void); + void TestTrimArgumentName(); /* Provide better code coverage */ void testCoverage(void); private: + UnicodeString GetPatternAndSkipSyntax(const MessagePattern& pattern); }; #endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/test/intltest/ucdtest.cpp b/icu4c/source/test/intltest/ucdtest.cpp index 8393b9f447b..0daca0241f1 100644 --- a/icu4c/source/test/intltest/ucdtest.cpp +++ b/icu4c/source/test/intltest/ucdtest.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2010, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -10,6 +10,7 @@ #include "unicode/putil.h" #include "cstring.h" #include "hash.h" +#include "patternprops.h" #include "normalizer2impl.h" #include "uparse.h" #include "ucdtest.h" @@ -50,13 +51,15 @@ UnicodeTest::~UnicodeTest() void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) { - if (exec) logln("TestSuite UnicodeTest: "); - switch (index) { - case 0: name = "TestAdditionalProperties"; if(exec) TestAdditionalProperties(); break; - case 1: name = "TestBinaryValues"; if(exec) TestBinaryValues(); break; - case 2: name = "TestConsistency"; if(exec) TestConsistency(); break; - default: name = ""; break; //needed to end loop + if(exec) { + logln("TestSuite UnicodeTest: "); } + TESTCASE_AUTO_BEGIN; + TESTCASE_AUTO(TestAdditionalProperties); + TESTCASE_AUTO(TestBinaryValues); + TESTCASE_AUTO(TestConsistency); + TESTCASE_AUTO(TestPatternProperties); + TESTCASE_AUTO_END; } //==================================================== @@ -339,7 +342,7 @@ void UnicodeTest::TestConsistency() { IcuTestErrorCode errorCode(*this, "TestConsistency"); 
const Normalizer2 *nfd=Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode); const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode); - if(errorCode.isFailure()) { + if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) { dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n", errorCode.errorName()); errorCode.reset(); @@ -369,3 +372,57 @@ void UnicodeTest::TestConsistency() { } #endif } + +/** + * Test various implementations of Pattern_Syntax & Pattern_White_Space. + */ +void UnicodeTest::TestPatternProperties() { + IcuTestErrorCode errorCode(*this, "TestPatternProperties()"); + UnicodeSet syn_pp; + UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode); + UnicodeSet syn_list(UNICODE_STRING_SIMPLE( + "[!-/\\:-@\\[-\\^`\\{-~" + "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7" + "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775" + "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]"), errorCode); + UnicodeSet ws_pp; + UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode); + UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode); + UnicodeSet syn_ws_pp; + UnicodeSet syn_ws_prop(syn_prop); + syn_ws_prop.addAll(ws_prop); + for(UChar32 c=0; c<=0xffff; ++c) { + if(PatternProps::isSyntax(c)) { + syn_pp.add(c); + } + if(PatternProps::isWhiteSpace(c)) { + ws_pp.add(c); + } + if(PatternProps::isSyntaxOrWhiteSpace(c)) { + syn_ws_pp.add(c); + } + } + compareUSets(syn_pp, syn_prop, + "PatternProps.isSyntax()", "[:Pattern_Syntax:]", TRUE); + compareUSets(syn_pp, syn_list, + "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", TRUE); + compareUSets(ws_pp, ws_prop, + "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", TRUE); + compareUSets(ws_pp, ws_list, + 
"PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", TRUE); + compareUSets(syn_ws_pp, syn_ws_prop, + "PatternProps.isSyntaxOrWhiteSpace()", + "[[:Pattern_Syntax:][:Pattern_White_Space:]]", TRUE); +} + +// So far only minimal port of Java & cucdtst.c compareUSets(). +UBool +UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b, + const char *a_name, const char *b_name, + UBool diffIsError) { + UBool same= a==b; + if(!same && diffIsError) { + errln("Sets are different: %s vs. %s\n", a_name, b_name); + } + return same; +} diff --git a/icu4c/source/test/intltest/ucdtest.h b/icu4c/source/test/intltest/ucdtest.h index 93a096c0030..38c7eca223d 100644 --- a/icu4c/source/test/intltest/ucdtest.h +++ b/icu4c/source/test/intltest/ucdtest.h @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2010, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. 
********************************************************************/ @@ -36,6 +36,7 @@ public: void TestAdditionalProperties(); void TestBinaryValues(); void TestConsistency(); + void TestPatternProperties(); private: @@ -50,5 +51,8 @@ private: UnicodeSet derivedProps[30]; U_NAMESPACE_QUALIFIER Hashtable *unknownPropertyNames; -}; + UBool compareUSets(const UnicodeSet &a, const UnicodeSet &b, + const char *a_name, const char *b_name, + UBool diffIsError); +}; diff --git a/icu4c/source/test/intltest/uobjtest.cpp b/icu4c/source/test/intltest/uobjtest.cpp index ab6708608a7..a75c9a038ca 100644 --- a/icu4c/source/test/intltest/uobjtest.cpp +++ b/icu4c/source/test/intltest/uobjtest.cpp @@ -322,7 +322,6 @@ void UObjectTest::testIDs() { ids_count = 0; UErrorCode status = U_ZERO_ERROR; - static const UChar SMALL_STR[] = {0x51, 0x51, 0x51, 0}; // "QQQ" #if !UCONFIG_NO_TRANSLITERATION || !UCONFIG_NO_FORMATTING UParseError parseError; @@ -364,6 +363,8 @@ void UObjectTest::testIDs() TESTCLASSID_CTOR(DecimalFormatSymbols, (status)); TESTCLASSID_DEFAULT(FieldPosition); TESTCLASSID_DEFAULT(Formattable); + + static const UChar SMALL_STR[] = {0x51, 0x51, 0x51, 0}; // "QQQ" TESTCLASSID_CTOR(CurrencyAmount, (1.0, SMALL_STR, status)); TESTCLASSID_CTOR(CurrencyUnit, (SMALL_STR, status)); TESTCLASSID_NONE_FACTORY(LocaleDisplayNames, LocaleDisplayNames::createInstance("de")); @@ -570,6 +571,7 @@ void UObjectTest::TestMFCCompatibility() { } void UObjectTest::TestCompilerRTTI() { +#if !UCONFIG_NO_FORMATTING UErrorCode errorCode = U_ZERO_ERROR; NumberFormat *nf = NumberFormat::createInstance("de", errorCode); if (U_FAILURE(errorCode)) { @@ -587,6 +589,7 @@ void UObjectTest::TestCompilerRTTI() { errln("typeid(NumberFormat) failed"); } delete nf; +#endif } /* --------------- */ diff --git a/icu4c/source/test/iotest/filetst.c b/icu4c/source/test/iotest/filetst.c index b65dc34e765..c0ddd5ed281 100644 --- a/icu4c/source/test/iotest/filetst.c +++ b/icu4c/source/test/iotest/filetst.c @@ -1,6 +1,6 
@@ /* ********************************************************************** - * Copyright (C) 2004-2010, International Business Machines + * Copyright (C) 2004-2011, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: filetst.c @@ -910,8 +910,8 @@ static void TestCodepage(void) { } static void TestCodepageFlush(void) { -#if UCONFIG_NO_LEGACY_CONVERSION - log_verbose("Skipping, legacy conversion is disabled."); +#if UCONFIG_NO_LEGACY_CONVERSION || UCONFIG_NO_FORMATTING + log_verbose("Skipping, legacy conversion or formatting is disabled."); #else UChar utf16String[] = { 0x39, 0x39, 0x39, 0x20, 0x65E0, 0x6CD6, 0x5728, 0x0000 }; uint8_t inBuf[200];