diff --git a/icu4c/data/de.txt b/icu4c/data/de.txt index 1348beed349..786f208ed2d 100644 --- a/icu4c/data/de.txt +++ b/icu4c/data/de.txt @@ -519,4 +519,58 @@ de { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + + +// * RuleBasedNumberFormat data for German + + // again, I'm not 100% sure of these rules. I think both "hundert" and + // "einhundert" are correct for 100, but I'm not sure which is preferable + // in situations where this framework is likely to be used. Also, is it + // really true that numbers are run together into compound words all the + // time? + + SpelloutRules { + // 1 is "eins" when by itself, but turns into "ein" in most + // combinations + "%alt-ones:\n" + " -x: minus >>;\n" + " x.x: << komma >>;\n" + " null; eins; =%%main=;\n" + "%%main:\n" + // words for numbers from 0 to 12. Notice that the values + // from 13 to 19 can be derived algorithmically, unlike in most + // other languages + " null; ein; zwei; drei; vier; f\u00fcnf; sechs; sieben; acht; neun;\n" + " zehn; elf; zw\u00f6lf; >>zehn;\n" + // rules for the multiples of 10. 
Notice that the ones digit + // goes on the front + " 20: [>>und]zwanzig;\n" + " 30: [>>und]drei\u00dfig;\n" + " 40: [>>und]vierzig;\n" + " 50: [>>und]f\u00fcnfzig;\n" + " 60: [>>und]sechzig;\n" + " 70: [>>und]siebzig;\n" + " 80: [>>und]achtzig;\n" + " 90: [>>und]neunzig;\n" + " 100: hundert[>%alt-ones>];\n" + " 200: <<hundert[>%alt-ones>];\n" + " 1000: tausend[>%alt-ones>];\n" + " 2000: <<tausend[>%alt-ones>];\n" + " 1,000,000: eine Million[ >%alt-ones>];\n" + " 2,000,000: << Millionen[ >%alt-ones>];\n" + " 1,000,000,000: eine Milliarde[ >%alt-ones>];\n" + " 2,000,000,000: << Milliarden[ >%alt-ones>];\n" + " 1,000,000,000,000: eine Billion[ >%alt-ones>];\n" + " 2,000,000,000,000: << Billionen[ >%alt-ones>];\n" + " 1,000,000,000,000,000: =#,##0=;" + "%%lenient-parse:\n" + " &\u0000 << ' ' << '-'\n" + " & ae , \u00e4 & ae , \u00c4\n" + " & oe , \u00f6 & oe , \u00d6\n" + " & ue , \u00fc & ue , \u00dc\n" + } } diff --git a/icu4c/data/el.txt b/icu4c/data/el.txt index 8e02ed5c67a..9f94ea58763 100644 --- a/icu4c/data/el.txt +++ b/icu4c/data/el.txt @@ -116,4 +116,53 @@ el { "Greek",// Script Name "Grek" // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Greek. Again in Greek we have to supply the words +// * for the multiples of 100 because they can't be derived algorithmically. +// * Also, the tens digit changes form when followed by a ones digit: an +// * accent mark disappears from the tens digit and moves to the ones digit. +// * Therefore, instead of using the [] notation, we actually have to use +// * two separate rules for each multiple of 10 to show the two forms of +// * the word. + + // Can someone supply me with information on negatives and decimals? + // I'm also missing the word for zero. Can someone clue me in? 
+ + SpelloutRules { + "zero (incomplete data); \u03ad\u03bd\u03b1; \u03b4\u03cd\u03bf; \u03b4\u03c1\u03af\u03b1; " + "\u03c4\u03ad\u03c3\u03c3\u03b5\u03c1\u03b1; \u03c0\u03ad\u03bd\u03c4\u03b5; " + "\u03ad\u03be\u03b9; \u03b5\u03c0\u03c4\u03ac; \u03bf\u03ba\u03c4\u03ce; " + "\u03b5\u03bd\u03bd\u03ad\u03b1;\n" + "10: \u03b4\u03ad\u03ba\u03b1; " + "\u03ad\u03bd\u03b4\u03b5\u03ba\u03b1; \u03b4\u03ce\u03b4\u03b5\u03ba\u03b1; " + "\u03b4\u03b5\u03ba\u03b1>>;\n" + "20: \u03b5\u03af\u03ba\u03bf\u03c3\u03b9; \u03b5\u03b9\u03ba\u03bf\u03c3\u03b9>>;\n" + "30: \u03c4\u03c1\u03b9\u03ac\u03bd\u03c4\u03b1; \u03c4\u03c1\u03b9\u03b1\u03bd\u03c4\u03b1>>;\n" + "40: \u03c3\u03b1\u03c1\u03ac\u03bd\u03c4\u03b1; \u03c3\u03b1\u03c1\u03b1\u03bd\u03c4\u03b1>>;\n" + "50: \u03c0\u03b5\u03bd\u03ae\u03bd\u03c4\u03b1; \u03c0\u03b5\u03bd\u03b7\u03bd\u03c4\u03b1>>;\n" + "60: \u03b5\u03be\u03ae\u03bd\u03c4\u03b1; \u03b5\u03be\u03b7\u03bd\u03c4\u03b1>>;\n" + "70: \u03b5\u03b2\u03b4\u03bf\u03bc\u03ae\u03bd\u03c4\u03b1; " + "\u03b5\u03b2\u03b4\u03bf\u03bc\u03b7\u03bd\u03c4\u03b1>>;\n" + "80: \u03bf\u03b3\u03b4\u03cc\u03bd\u03c4\u03b1; \u03bf\u03b3\u03b4\u03bf\u03bd\u03c4\u03b1>>;\n" + "90: \u03b5\u03bd\u03bd\u03b5\u03bd\u03ae\u03bd\u03c4\u03b1; " + "\u03b5\u03bd\u03bd\u03b5\u03bd\u03b7\u03bd\u03c4\u03b1>>;\n" + "100: \u03b5\u03ba\u03b1\u03c4\u03cc[\u03bd >>];\n" + "200: \u03b4\u03b9\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "300: \u03c4\u03c1\u03b9\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "400: \u03c4\u03b5\u03c4\u03c1\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "500: \u03c0\u03b5\u03bd\u03c4\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "600: \u03b5\u03be\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "700: \u03b5\u03c0\u03c4\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "800: \u03bf\u03ba\u03c4\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "900: \u03b5\u03bd\u03bd\u03b9\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "1000: \u03c7\u03af\u03bb\u03b9\u03b1[ >>];\n" + "2000: << 
\u03c7\u03af\u03bb\u03b9\u03b1[ >>];\n" + "1,000,000: << \u03b5\u03ba\u03b1\u03c4\u03bf\u03bc\u03bc\u03b9\u03cc\u03c1\u03b9\u03bf[ >>];\n" + "1,000,000,000: << \u03b4\u03b9\u03c3\u03b5\u03ba\u03b1\u03c4\u03bf\u03bc\u03bc\u03b9\u03cc\u03c1\u03b9\u03bf[ >>];\n" + "1,000,000,000,000: =#,##0=" + } } diff --git a/icu4c/data/en.txt b/icu4c/data/en.txt index efac070c26c..23c89fce318 100644 --- a/icu4c/data/en.txt +++ b/icu4c/data/en.txt @@ -233,4 +233,11 @@ en { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// inherited from root + } diff --git a/icu4c/data/en_GB.txt b/icu4c/data/en_GB.txt index b36f0f25f50..80a00dddeee 100644 --- a/icu4c/data/en_GB.txt +++ b/icu4c/data/en_GB.txt @@ -49,4 +49,70 @@ en_GB { "BST", } } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for U.K. English. U.K. English has one significant +// * difference from U.S. English: the names for values of 1,000,000,000 +// * and higher. In American English, each successive "-illion" is 1,000 +// * times greater than the preceding one: 1,000,000,000 is "one billion" +// * and 1,000,000,000,000 is "one trillion." In British English, each +// * successive "-illion" is one million times greater than the one before: +// * "one billion" is 1,000,000,000,000 (or what Americans would call a +// * "trillion"), and "one trillion" is 1,000,000,000,000,000,000. +// * 1,000,000,000 in British English is "one thousand million." (This +// * value is sometimes called a "milliard," but this word seems to have +// * fallen into disuse.) + + // Could someone please correct me if I'm wrong about "milliard" falling + // into disuse, or have missed any other details of how large numbers + // are rendered. 
Also, could someone please provide me with information + // on which other English-speaking countries use which system? Right now, + // I'm assuming that the U.S. system is used in Canada and that all the + // other English-speaking countries follow the British system. Can + // someone out there confirm this? + + SpelloutRules { + "%simplified:\n" + " -x: minus >>;\n" + " x.x: << point >>;\n" + " zero; one; two; three; four; five; six; seven; eight; nine;\n" + " ten; eleven; twelve; thirteen; fourteen; fifteen; sixteen;\n" + " seventeen; eighteen; nineteen;\n" + " 20: twenty[->>];\n" + " 30: thirty[->>];\n" + " 40: forty[->>];\n" + " 50: fifty[->>];\n" + " 60: sixty[->>];\n" + " 70: seventy[->>];\n" + " 80: eighty[->>];\n" + " 90: ninety[->>];\n" + " 100: << hundred[ >>];\n" + " 1000: << thousand[ >>];\n" + " 1,000,000: << million[ >>];\n" + " 1,000,000,000,000: << billion[ >>];\n" + " 1,000,000,000,000,000: =#,##0=;\n" + "%default:\n" + " -x: minus >>;\n" + " x.x: << point >>;\n" + " =%simplified=;\n" + " 100: << hundred[ >%%and>];\n" + " 1000: << thousand[ >%%and>];\n" + " 100,000>>: << thousand[>%%commas>];\n" + " 1,000,000: << million[>%%commas>];\n" + " 1,000,000,000,000: << billion[>%%commas>];\n" + " 1,000,000,000,000,000: =#,##0=;\n" + "%%and:\n" + " and =%default=;\n" + " 100: =%default=;\n" + "%%commas:\n" + " ' and =%default=;\n" + " 100: , =%default=;\n" + " 1000: , <%default< thousand, >%default>;\n" + " 1,000,000: , =%default=;" + "%%lenient-parse:\n" + " & ' ' , ',' ;\n" + } } diff --git a/icu4c/data/eo.txt b/icu4c/data/eo.txt index 0e3a42027d8..82a376fa9fb 100644 --- a/icu4c/data/eo.txt +++ b/icu4c/data/eo.txt @@ -140,4 +140,31 @@ eo { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// data from 'Esperanto-programita 1' courtesy of Markus Scherer + + SpelloutRules { + "-x: 
minus >>;\n" + "x.x: << komo >>;\n" + "nulo; unu; du; tri; kvar; kvin; ses; sep; ok; na\u016d;\n" + "10: dek[ >>];\n" + "20: <<dek[ >>];\n" + "100: cent[ >>];\n" + "200: <<cent[ >>];\n" + "1000: mil[ >>];\n" + "2000: <<mil[ >>];\n" + "10000: dekmil[ >>];\n" + "11000>: << mil[ >>];\n" + "1,000,000: miliono[ >>];\n" + "2,000,000: << milionoj[ >>];\n" + "1,000,000,000: miliardo[ >>];\n" + "2,000,000,000: << miliardoj[ >>];\n" + "1,000,000,000,000: biliono[ >>];\n" + "2,000,000,000,000: << bilionoj[ >>];\n" + "1,000,000,000,000,000: =#,##0=;\n" + } } diff --git a/icu4c/data/es.txt b/icu4c/data/es.txt index c897fa2226f..f3ed4b2eb8a 100644 --- a/icu4c/data/es.txt +++ b/icu4c/data/es.txt @@ -258,4 +258,69 @@ es { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Spanish. The Spanish rules are quite similar to +// * the English rules, but there are some important differences: +// * First, we have to provide separate rules for most of the twenties +// * because the ones digit frequently picks up an accent mark that it +// * doesn't have when standing alone. Second, each multiple of 100 has +// * to be specified separately because the multiplier on 100 very often +// * changes form in the contraction: 500 is "quinientos," not +// * "cincocientos." In addition, the word for 100 is "cien" when +// * standing alone, but changes to "ciento" when followed by more digits. +// * There are also some other differences. + + // The Spanish rules are incomplete. I'm missing information on negative + // numbers and numbers with fractional parts. I also don't have + // information on numbers higher than the millions. 
+ + SpelloutRules { + // negative-number and fraction rules + "-x: menos >>;\n" + "x.x: << punto >>;\n" + // words for values from 0 to 19 + "cero; uno; dos; tres; cuatro; cinco; seis; siete; ocho; nueve;\n" + "diez; once; doce; trece; catorce; quince; diecis\u00e9is;\n" + " diecisiete; dieciocho; diecinueve;\n" + // words for values from 20 to 29 (necessary because the ones digit + // often picks up an accent mark it doesn't have when standing alone) + "veinte; veintiuno; veintid\u00f3s; veintitr\u00e9s; veinticuatro;\n" + " veinticinco; veintis\u00e9is; veintisiete; veintiocho;\n" + " veintinueve;\n" + // words for multiples of 10 (notice that the tens digit is separated + // from the ones digit by the word "y".) + "30: treinta[ y >>];\n" + "40: cuarenta[ y >>];\n" + "50: cincuenta[ y >>];\n" + "60: sesenta[ y >>];\n" + "70: setenta[ y >>];\n" + "80: ochenta[ y >>];\n" + "90: noventa[ y >>];\n" + // 100 by itself is "cien," but 100 followed by something is "ciento" + "100: cien;\n" + "101: ciento >>;\n" + // words for multiples of 100 (must be stated because they're + // rarely simple concatenations) + "200: doscientos[ >>];\n" + "300: trescientos[ >>];\n" + "400: cuatrocientos[ >>];\n" + "500: quinientos[ >>];\n" + "600: seiscientos[ >>];\n" + "700: setecientos[ >>];\n" + "800: ochocientos[ >>];\n" + "900: novecientos[ >>];\n" + // for 1,000, the multiplier on "mil" is omitted: 2,000 is "dos mil," + // but 1,000 is just "mil." 
+ "1000: mil[ >>];\n" + "2000: << mil[ >>];\n" + // 1,000,000 is "un millon," not "uno millon" + "1,000,000: un mill\u00f3n[ >>];\n" + "2,000,000: << mill\u00f3n[ >>];\n" + // overflow rule + "1,000,000,000: =#,##0= (incomplete data);" + } } diff --git a/icu4c/data/fr.txt b/icu4c/data/fr.txt index 9f5991361d8..185e1fa46aa 100644 --- a/icu4c/data/fr.txt +++ b/icu4c/data/fr.txt @@ -190,10 +190,73 @@ fr { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for French. French adds some interesting quirks of its +// * own: 1) The word "et" is interposed between the tens and ones digits, +// * but only if the ones digit is 1: 20 is "vingt," and 22 is "vingt-deux," +// * but 21 is "vingt-et-un." 2) There are no words for 70, 80, or 90. +// * "quatre-vingts" ("four twenties") is used for 80, and values proceed +// * by score from 60 to 99 (e.g., 73 is "soixante-treize" ["sixty-thirteen"]). +// * Numbers from 1,100 to 1,199 are rendered as hundreds rather than +// * thousands: 1,100 is "onze cents" ("eleven hundred"), rather than +// * "mille cent" ("one thousand one hundred") + + SpelloutRules { + // the main rule set + "%main:\n" + " -x: moins >>;\n" + " x.x: << virgule >>;\n" + // words for numbers from 0 to 19 + " z\u00e9ro; un; deux; trois; quatre; cinq; six; sept; huit; neuf;\n" + " dix; onze; douze; treize; quatorze; quinze; seize;\n" + " dix-sept; dix-huit; dix-neuf;\n" + // words for the multiples of 10: %%alt-ones inserts "et" + // when needed + " 20: vingt[->%%alt-ones>];\n" + " 30: trente[->%%alt-ones>];\n" + " 40: quarante[->%%alt-ones>];\n" + " 50: cinquante[->%%alt-ones>];\n" + // rule for 60. The /20 causes this rule's multiplier to be + // 20 rather than 10, allowing us to recurse for all values + // from 60 to 79... 
+ " 60/20: soixante[->%%alt-ones>];\n" + // ...except for 71, which must be special-cased + " 71: soixante et onze;\n" + // at 72, we have to repeat the rule for 60 to get us to 79 + " 72/20: soixante->%%alt-ones>;\n" + // at 80, we state a new rule with the phrase for 80. Since + // it changes form when there's a ones digit, we need a second + // rule at 81. This rule also includes "/20," allowing it to + // be used correctly for all values up to 99 + " 80: quatre-vingts; 81/20: quatre-vingt->>;\n" + // "cent" becomes plural when preceded by a multiplier, and + // the multiplier is omitted from the singular form + " 100: cent[ >>];\n" + " 200: << cents[ >>];\n" + " 1000: mille[ >>];\n" + // values from 1,100 to 1,199 are rendered as "onze cents..." + // instead of "mille cent..." The > after "1100" decreases + // the rule's exponent, causing its multiplier to be 100 instead + // of 1,000. This prevents us from getting "onze cents cent + // vingt-deux" ("eleven hundred one hundred twenty-two"). + " 1100>: onze cents[ >>];\n" + // at 1,200, we go back to formatting in thousands, so we + // repeat the rule for 1,000 + " 1200: mille >>;\n" + // at 2,000, the multiplier is added + " 2000: << mille[ >>];\n" + " 1,000,000: << million[ >>];\n" + " 1,000,000,000: << milliarde[ >>];\n" + " 1,000,000,000,000: << billion[ >>];\n" + " 1,000,000,000,000,000: =#,##0=;\n" + // %%alt-ones is used to insert "et" when the ones digit is 1 + "%%alt-ones:\n" + " ; et-un; =%main=;\n" + "%%lenient-parse:\n" + " &\u0000 << ' ' << ',' << '-';\n" + } } - - - - - - diff --git a/icu4c/data/fr_CH.txt b/icu4c/data/fr_CH.txt index ce7d398b313..e9c896f5b06 100644 --- a/icu4c/data/fr_CH.txt +++ b/icu4c/data/fr_CH.txt @@ -56,4 +56,51 @@ fr_CH { "GMT", } } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Swiss French. 
Swiss French differs from French French +// * in that it does have words for 70, 80, and 90. This rule set shows them, +// * and is simpler as a result. + + // again, I'm missing information on negative numbers and decimals for + // these two rule sets. Also, I'm not 100% sure about Swiss French. Is + // this correct? Is "onze cents" commonly used for 1,100 in both France + // and Switzerland? Can someone fill me in on the rules for the other + // French-speaking countries? I've heard conflicting opinions on which + // version is used in Canada, and I understand there's an alternate set + // of words for 70, 80, and 90 that is used somewhere, but I don't know + // what those words are or where they're used. + + SpelloutRules { + "%main:\n" + " -x: moins >>;\n" + " x.x: << virgule >>;\n" + " z\u00e9ro; un; deux; trois; quatre; cinq; six; sept; huit; neuf;\n" + " dix; onze; douze; treize; quatorze; quinze; seize;\n" + " dix-sept; dix-huit; dix-neuf;\n" + " 20: vingt[->%%alt-ones>];\n" + " 30: trente[->%%alt-ones>];\n" + " 40: quarante[->%%alt-ones>];\n" + " 50: cinquante[->%%alt-ones>];\n" + " 60: soixante[->%%alt-ones>];\n" + // notice new words for 70, 80, and 90 + " 70: septante[->%%alt-ones>];\n" + " 80: octante[->%%alt-ones>];\n" + " 90: nonante[->%%alt-ones>];\n" + " 100: cent[ >>];\n" + " 200: << cents[ >>];\n" + " 1000: mille[ >>];\n" + " 1100>: onze cents[ >>];\n" + " 1200: mille >>;\n" + " 2000: << mille[ >>];\n" + " 1,000,000: << million[ >>];\n" + " 1,000,000,000: << milliarde[ >>];\n" + " 1,000,000,000,000: << billion[ >>];\n" + " 1,000,000,000,000,000: =#,##0=;\n" + "%%alt-ones:\n" + " ; et-un; =%main=;" + } } diff --git a/icu4c/data/he.txt b/icu4c/data/he.txt index f3f38a2b2f5..d622e8a963f 100644 --- a/icu4c/data/he.txt +++ b/icu4c/data/he.txt @@ -102,4 +102,38 @@ he { "Hebrew", // Script Name "Hebr" // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + 
//------------------------------------------------------------ + +// * Spellout rules for Hebrew. Hebrew actually has inflected forms for +// * most of the lower-order numbers. The masculine forms are shown +// * here. + + // This data is woefully incomplete. Can someone fill me in on the + // various inflected forms of the numbers, which seem to be necessary + // to do Hebrew correctly? Can someone supply me with data for values + // from 1,000,000 on up? What about the word for zero? What about + // information on negatives and decimals? + + SpelloutRules { + "zero (incomplete data); \u05d0\u05d4\u05d3; \u05e9\u05d2\u05d9\u05d9\u05dd; \u05e9\u05dc\u05d5\u05e9\u05d4;\n" + "4: \u05d0\u05d3\u05d1\u05e6\u05d4; \u05d7\u05d2\u05d5\u05d9\u05e9\u05d4; \u05e9\u05e9\u05d4;\n" + "7: \u05e9\u05d1\u05e6\u05d4; \u05e9\u05de\u05d5\u05d2\u05d4; \u05ea\u05e9\u05e6\u05d4;\n" + "10: \u05e6\u05e9\u05d3\u05d4[ >>];\n" + "20: \u05e6\u05e9\u05d3\u05d9\u05dd[ >>];\n" + "30: \u05e9\u05dc\u05d5\u05e9\u05d9\u05dd[ >>];\n" + "40: \u05d0\u05d3\u05d1\u05e6\u05d9\u05dd[ >>];\n" + "50: \u05d7\u05de\u05d9\u05e9\u05d9\u05dd[ >>];\n" + "60: \u05e9\u05e9\u05d9\u05dd[ >>];\n" + "70: \u05e9\u05d1\u05e6\u05d9\u05dd[ >>];\n" + "80: \u05e9\u05de\u05d5\u05d2\u05d9\u05dd[ >>];\n" + "90: \u05ea\u05e9\u05e6\u05d9\u05dd[ >>];\n" + "100: \u05de\u05d0\u05d4[ >>];\n" + "200: << \u05de\u05d0\u05d4[ >>];\n" + "1000: \u05d0\u05dc\u05e3[ >>];\n" + "2000: << \u05d0\u05dc\u05e3[ >>];\n" + "1,000,000: =#,##0= (incomplete data);" + } } diff --git a/icu4c/data/it.txt b/icu4c/data/it.txt index 6769b9f6893..a3dcbf7d43e 100644 --- a/icu4c/data/it.txt +++ b/icu4c/data/it.txt @@ -117,4 +117,106 @@ it { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Italian. Like German, most Italian numbers are +// * written as single words. 
What makes these rules complicated is the rule +// * that says that when a word ending in a vowel and a word beginning with +// * a vowel are combined into a compound, the vowel is dropped from the +// * end of the first word: 180 is "centottanta," not "centoottanta." +// * The complexity of this rule set is to produce this behavior. + + // Can someone confirm that I did the vowel-eliding thing right? I'm + // not 100% sure I'm doing it in all the right places, or completely + // correctly. Also, I don't have information for negatives and decimals, + // and I lack words for values from 1,000,000 on up. + + SpelloutRules { + // main rule set. Follows the patterns of the preceding rule sets, + // except that the final vowel is omitted from words ending in + // vowels when they are followed by another word; instead, we have + // separate rule sets that are identical to this one, except that + // all the words that don't begin with a vowel have a vowel tacked + // onto them at the front. A word ending in a vowel calls a + // substitution that will supply that vowel, unless that vowel is to + // be elided. 
+ "%main:\n" + " -x: meno >>;\n" + " x.x: << virgola >>;\n" + " zero; uno; due; tre; quattro; cinque; sei; sette; otto;\n" + " nove;\n" + " dieci; undici; dodici; tredici; quattordici; quindici; sedici;\n" + " diciasette; diciotto; diciannove;\n" + " 20: venti; vent>%%with-i>;\n" + " 30: trenta; trent>%%with-i>;\n" + " 40: quaranta; quarant>%%with-a>;\n" + " 50: cinquanta; cinquant>%%with-a>;\n" + " 60: sessanta; sessant>%%with-a>;\n" + " 70: settanta; settant>%%with-a>;\n" + " 80: ottanta; ottant>%%with-a>;\n" + " 90: novanta; novant>%%with-a>;\n" + " 100: cento; cent[>%%with-o>];\n" + " 200: <%%with-o>];\n" + " 1000: mille; mill[>%%with-i>];\n" + " 2000: <%%with-a>];\n" + " 100,000>>: <>];\n" + " 1,000,000: =#,##0= (incomplete data);\n" + "%%with-a:\n" + " azero; uno; adue; atre; aquattro; acinque; asei; asette; otto;\n" + " anove;\n" + " adieci; undici; adodici; atredici; aquattordici; aquindici; asedici;\n" + " adiciasette; adiciotto; adiciannove;\n" + " 20: aventi; avent>%%with-i>;\n" + " 30: atrenta; atrent>%%with-i>;\n" + " 40: aquaranta; aquarant>%%with-a>;\n" + " 50: acinquanta; acinquant>%%with-a>;\n" + " 60: asessanta; asessant>%%with-a>;\n" + " 70: asettanta; asettant>%%with-a>;\n" + " 80: ottanta; ottant>%%with-a>;\n" + " 90: anovanta; anovant>%%with-a>;\n" + " 100: acento; acent[>%%with-o>];\n" + " 200: <%%with-a%%with-o>];\n" + " 1000: amille; amill[>%%with-i>];\n" + " 2000: <%%with-a%%with-a>];\n" + " 100,000: =%main=;\n" + "%%with-i:\n" + " izero; uno; idue; itre; iquattro; icinque; isei; isette; otto;\n" + " inove;\n" + " idieci; undici; idodici; itredici; iquattordici; iquindici; isedici;\n" + " idiciasette; idiciotto; idiciannove;\n" + " 20: iventi; ivent>%%with-i>;\n" + " 30: itrenta; itrent>%%with-i>;\n" + " 40: iquaranta; iquarant>%%with-a>;\n" + " 50: icinquanta; icinquant>%%with-a>;\n" + " 60: isessanta; isessant>%%with-a>;\n" + " 70: isettanta; isettant>%%with-a>;\n" + " 80: ottanta; ottant>%%with-a>;\n" + " 90: inovanta; 
inovant>%%with-a>;\n" + " 100: icento; icent[>%%with-o>];\n" + " 200: <%%with-i%%with-o>];\n" + " 1000: imille; imill[>%%with-i>];\n" + " 2000: <%%with-i%%with-a>];\n" + " 100,000: =%main=;\n" + "%%with-o:\n" + " ozero; uno; odue; otre; oquattro; ocinque; osei; osette; otto;\n" + " onove;\n" + " odieci; undici; ododici; otredici; oquattordici; oquindici; osedici;\n" + " odiciasette; odiciotto; odiciannove;\n" + " 20: oventi; ovent>%%with-i>;\n" + " 30: otrenta; otrent>%%with-i>;\n" + " 40: oquaranta; oquarant>%%with-a>;\n" + " 50: ocinquanta; ocinquant>%%with-a>;\n" + " 60: osessanta; osessant>%%with-a>;\n" + " 70: osettanta; osettant>%%with-a>;\n" + " 80: ottanta; ottant>%%with-a>;\n" + " 90: onovanta; onovant>%%with-a>;\n" + " 100: ocento; ocent[>%%with-o>];\n" + " 200: <%%with-o%%with-o>];\n" + " 1000: omille; omill[>%%with-i>];\n" + " 2000: <%%with-o%%with-a>];\n" + " 100,000: =%main=;\n" + } } diff --git a/icu4c/data/ja.txt b/icu4c/data/ja.txt index 42ec50a9102..6be89505b73 100644 --- a/icu4c/data/ja.txt +++ b/icu4c/data/ja.txt @@ -859,4 +859,47 @@ ja { "JST", } } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Japanese. In Japanese, there really isn't any +// * distinction between a number written out in digits and a number +// * written out in words: the ideographic characters are both digits +// * and words. This rule set provides two variants: %traditional +// * uses the traditional CJK numerals (which are also used in China +// * and Korea). %financial uses alternate ideographs for many numbers +// * that are harder to alter than the traditional numerals (one could +// * fairly easily change a one to +// * a three just by adding two strokes, for example). This is also done in +// * the other countries using Chinese ideographs, but different ideographs +// * are used in those places. 
+ + // Can someone supply me with the right fraud-proof ideographs for + // Simplified and Traditional Chinese, and for Korean? Can someone + // supply me with information on negatives and decimals? + + SpelloutRules { + "%financial:\n" + " \u96f6; \u58f1; \u5f10; \u53c2; \u56db; \u4f0d; \u516d; \u4e03; \u516b; \u4e5d;\n" + " \u62fe[>>];\n" + " 20: <<\u62fe[>>];\n" + " 100: <<\u767e[>>];\n" + " 1000: <<\u5343[>>];\n" + " 10,000: <<\u4e07[>>];\n" + " 100,000,000: <<\u5104[>>];\n" + " 1,000,000,000,000: <<\u5146[>>];\n" + " 10,000,000,000,000,000: =#,##0=;\n" + "%traditional:\n" + " \u96f6; \u4e00; \u4e8c; \u4e09; \u56db; \u4e94; \u516d; \u4e03; \u516b; \u4e5d;\n" + " \u5341[>>];\n" + " 20: <<\u5341[>>];\n" + " 100: <<\u767e[>>];\n" + " 1000: <<\u5343[>>];\n" + " 10,000: <<\u4e07[>>];\n" + " 100,000,000: <<\u5104[>>];\n" + " 1,000,000,000,000: <<\u5146[>>];\n" + " 10,000,000,000,000,000: =#,##0=;" + } } diff --git a/icu4c/data/nl.txt b/icu4c/data/nl.txt index 9e0a6818784..c6d1e777add 100644 --- a/icu4c/data/nl.txt +++ b/icu4c/data/nl.txt @@ -108,4 +108,32 @@ nl { "Latg", // ISO 15924 Name } + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Dutch + + // can someone supply me with information on negatives and decimals? 
+ + SpelloutRules { + " -x: min >>;\n" + "x.x: << komma >>;\n" + "(zero?); een; twee; drie; vier; vijf; zes; zeven; acht; negen;\n" + "tien; elf; twaalf; dertien; veertien; vijftien; zestien;\n" + "zeventien; achtien; negentien;\n" + "20: [>> en ]twintig;\n" + "30: [>> en ]dertig;\n" + "40: [>> en ]veertig;\n" + "50: [>> en ]vijftig;\n" + "60: [>> en ]zestig;\n" + "70: [>> en ]zeventig;\n" + "80: [>> en ]tachtig;\n" + "90: [>> en ]negentig;\n" + "100: << honderd[ >>];\n" + "1000: << duizend[ >>];\n" + "1,000,000: << miljoen[ >>];\n" + "1,000,000,000: << biljoen[ >>];\n" + "1,000,000,000,000: =#,##0=" + } } diff --git a/icu4c/data/root.txt b/icu4c/data/root.txt index cc0cf0278f7..d5ccbdea702 100644 --- a/icu4c/data/root.txt +++ b/icu4c/data/root.txt @@ -1180,11 +1180,221 @@ root { "Anchorage", } } + + LocaleScript{ "Latin", "Latn", "Latf", "Latg" } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for U.S. English. This rule set has two variants: +// * %simplified is a set of rules showing the simple method of spelling +// * out numbers in English: 289 is formatted as "two hundred eighty-nine". +// * %default uses a more complicated algorithm to format +// * numbers in a more natural way: 289 is formatted as "two hundred AND +// * eighty-nine" and commas are inserted between the thousands groups for +// * values above 100,000. + + SpelloutRules { + // This rule set shows the normal simple formatting rules for English + "%simplified:\n" + // negative number rule. This rule is used to format negative + // numbers. The result of formatting the number's absolute + // value is placed where the >> is. + " -x: minus >>;\n" + // fraction rule. This rule is used for formatting numbers + // with fractional parts. 
The result of formatting the + // number's integral part is substituted for the <<, and + // the result of formatting the number's fractional part + // (one digit at a time, e.g., 0.123 is "zero point one two + // three") replaces the >>. + " x.x: << point >>;\n" + // the rules for the values from 0 to 19 are simply the + // words for those numbers + " zero; one; two; three; four; five; six; seven; eight; nine;\n" + " ten; eleven; twelve; thirteen; fourteen; fifteen; sixteen;\n" + " seventeen; eighteen; nineteen;\n" + // beginning at 20, we use the >> to mark the position where + // the result of formatting the number's ones digit is inserted. Thus, + // we only need a new rule at every multiple of 10. Text in + // brackets is omitted if the value being formatted is an + // even multiple of 10. + " 20: twenty[->>];\n" + " 30: thirty[->>];\n" + " 40: forty[->>];\n" + " 50: fifty[->>];\n" + " 60: sixty[->>];\n" + " 70: seventy[->>];\n" + " 80: eighty[->>];\n" + " 90: ninety[->>];\n" + // beginning at 100, we can use << to mark the position where + // the result of formatting the multiple of 100 is to be + // inserted. Notice also that the meaning of >> has shifted: + // here, it refers to both the ones place and the tens place. + // The meanings of the << and >> tokens depend on the base value + // of the rule. A rule's divisor is (usually) the highest + // power of 10 that is less than or equal to the rule's base + // value. The value being formatted is divided by the rule's + // divisor, and the integral quotient is used to get the text + // for <<, while the remainder is used to produce the text + // for >>. 
Again, text in brackets is omitted if the value + // being formatted is an even multiple of the rule's divisor + // (in this case, an even multiple of 100) + " 100: << hundred[ >>];\n" + // The rules for the higher numbers work the same way as the + // rule for 100: Again, the << and >> tokens depend on the + // rule's divisor, which for all these rules is also the rule's + // base value. To group by thousand, we simply don't have any + // rules between 1,000 and 1,000,000. + " 1000: << thousand[ >>];\n" + " 1,000,000: << million[ >>];\n" + " 1,000,000,000: << billion[ >>];\n" + " 1,000,000,000,000: << trillion[ >>];\n" + // overflow rule. This rule specifies that values of a + // quadrillion or more are shown in numerals rather than words. + // The == token means to format (with new rules) the value + // being formatted by this rule and place the result where + // the == is. The #,##0 inside the == signs is a + // DecimalFormat pattern. It specifies that the value should + // be formatted with a DecimalFormat object, and that it + // should be formatted with no decimal places, at least one + // digit, and a thousands separator. + " 1,000,000,000,000,000: =#,##0=;\n" + + // %default is a more elaborate form of %simplified; It is basically + // the same, except that it introduces "and" before the ones digit + // when appropriate (basically, between the tens and ones digits) and + // separates the thousands groups with commas in values over 100,000. + "%default:\n" + // negative-number and fraction rules. 
These are the same + // as those for %simplified, but have to be stated here too + // because this is an entry point + " -x: minus >>;\n" + " x.x: << point >>;\n" + // just use %simplified for values below 100 + " =%simplified=;\n" + // for values from 100 to 9,999 use %%and to decide whether or + // not to interpose the "and" + " 100: << hundred[ >%%and>];\n" + " 1000: << thousand[ >%%and>];\n" + // for values of 100,000 and up, use %%commas to interpose the + // commas in the right places (and also to interpose the "and") + " 100,000>>: << thousand[>%%commas>];\n" + " 1,000,000: << million[>%%commas>];\n" + " 1,000,000,000: << billion[>%%commas>];\n" + " 1,000,000,000,000: << trillion[>%%commas>];\n" + " 1,000,000,000,000,000: =#,##0=;\n" + // if the value passed to this rule set is greater than 100, don't + // add the "and"; if it's less than 100, add "and" before the last + // digits + "%%and:\n" + " and =%default=;\n" + " 100: =%default=;\n" + // this rule set is used to place the commas + "%%commas:\n" + // for values below 100, add "and" (the apostrophe at the + // beginning is ignored, but causes the space that follows it + // to be significant: this is necessary because the rules + // calling %%commas don't put a space before it) + " ' and =%default=;\n" + // put a comma after the thousands (or whatever preceded the + // hundreds) + " 100: , =%default=;\n" + // put a comma after the millions (or whatever precedes the + // thousands) + " 1000: , <%default< thousand, >%default>;\n" + // and so on... + " 1,000,000: , =%default=;" + // %%lenient-parse isn't really a set of number formatting rules; + // it's a set of collation rules. Lenient-parse mode uses a Collator + // object to compare fragments of the text being parsed to the text + // in the rules, allowing more leeway in the matching text. 
This set + // of rules tells the formatter to ignore commas when parsing (it + // already ignores spaces, which is why we refer to the space; it also + // ignores hyphens, making "twenty one" and "twenty-one" parse + // identically) + "%%lenient-parse:\n" + // " & ' ' , ',' ;\n" + " &\u0000 << ' ' << ',' << '-'; \n" + } + + +// * This rule set adds an English ordinal abbreviation to the end of a +// * number. For example, 2 is formatted as "2nd". Parsing doesn't work with +// * this rule set. To parse, use DecimalFormat on the numeral. + OrdinalRules { + // this rule set formats the numeral and calls %%abbrev to + // supply the abbreviation + "%main:\n" + " =#,##0==%%abbrev=;\n" + // this rule set supplies the abbreviation + "%%abbrev:\n" + // the abbreviations. Everything from 4 to 19 ends in "th" + " th; st; nd; rd; th;\n" + // at 20, we begin repeating the cycle every 10 (13 is "13th", + // but 23 and 33 are "23rd" and "33rd"). We do this by + // ignoring all but the ones digit in selecting the abbreviation + " 20: >>;\n" + // at 100, we repeat the whole cycle by considering only the + // tens and ones digits in picking an abbreviation + " 100: >>;\n" + } + +// * This rule set formats a number of seconds in sexagesimal notation +// * (i.e., hours, minutes, and seconds). %with-words formats it with +// * words (3,740 is "1 hour, 2 minutes, 20 seconds") and %in-numerals +// * formats it entirely in numerals (3,740 is "1:02:20"). 
+ DurationRules { + // main rule set for formatting with words + "%with-words:\n" + // take care of singular and plural forms of "second" + " 0 seconds; 1 second; =0= seconds;\n" + // use %%min to format values of 60 seconds or more + " 60/60: <%%min<[, >>];\n" + // use %%hr to format values of 3,600 seconds or more + // (the ">>>" below causes us to see the number of minutes + // even when there are zero minutes) + " 3600/60: <%%hr<[, >>>];\n" + // this rule set takes care of the singular and plural forms + // of "minute" + "%%min:\n" + " 0 minutes; 1 minute; =0= minutes;\n" + // this rule set takes care of the singular and plural forms + // of "hour" + "%%hr:\n" + " 0 hours; 1 hour; =0= hours;\n" + + // main rule set for formatting in numerals + "%in-numerals:\n" + // values below 60 seconds are shown with "sec." + " =0= sec.;\n" + // higher values are shown with colons: %%min-sec is used for + // values below 3,600 seconds... + " 60: =%%min-sec=;\n" + // ...and %%hr-min-sec is used for values of 3,600 seconds + // and above + " 3600: =%%hr-min-sec=;\n" + // this rule causes values of less than 10 minutes to show without + // a leading zero + "%%min-sec:\n" + " 0: :=00=;\n" + " 60/60: <0<>>;\n" + // this rule set is used for values of 3,600 or more. Minutes are always + // shown, and always shown with two digits + "%%hr-min-sec:\n" + " 0: :=00=;\n" + " 60/60: <00<>>;\n" + " 3600/60: <#,##0<:>>>;\n" + // the lenient-parse rules allow several different characters to be used + // as delimiters between hours, minutes, and seconds + "%%lenient-parse:\n" + " & ':' = '.' 
= ' ' = '-';\n" + } } diff --git a/icu4c/data/ru.txt b/icu4c/data/ru.txt index d29b8fa793c..6b58587b3cd 100644 --- a/icu4c/data/ru.txt +++ b/icu4c/data/ru.txt @@ -122,4 +122,44 @@ ru { "Cyrs" // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Russian. + + // Can someone supply me with information on negatives and decimals? + // How about words for billions and trillions? + + SpelloutRules { + "\u043d\u043e\u043b\u044c; \u043e\u0434\u0438\u043d; \u0434\u0432\u0430; \u0442\u0440\u0438; " + "\u0447\u0435\u0442\u044b\u0440\u0435; \u043f\u044f\u0442; \u0448\u0435\u0441\u0442; " + "\u0441\u0435\u043c\u044c; \u0432\u043e\u0441\u0435\u043c\u044c; \u0434\u0435\u0432\u044f\u0442;\n" + "10: \u0434\u0435\u0441\u044f\u0442; " + "\u043e\u0434\u0438\u043d\u043d\u0430\u0434\u0446\u0430\u0442\u044c;\n" + "\u0434\u0432\u0435\u043d\u043d\u0430\u0434\u0446\u0430\u0442\u044c; " + "\u0442\u0440\u0438\u043d\u0430\u0434\u0446\u0430\u0442\u044c; " + "\u0447\u0435\u0442\u044b\u0440\u043d\u0430\u0434\u0446\u0430\u0442\u044c;\n" + "15: \u043f\u044f\u0442\u043d\u0430\u0434\u0446\u0430\u0442\u044c; " + "\u0448\u0435\u0441\u0442\u043d\u0430\u0434\u0446\u0430\u0442\u044c; " + "\u0441\u0435\u043c\u043d\u0430\u0434\u0446\u0430\u0442\u044c; " + "\u0432\u043e\u0441\u0435\u043c\u043d\u0430\u0434\u0446\u0430\u0442\u044c; " + "\u0434\u0435\u0432\u044f\u0442\u043d\u0430\u0434\u0446\u0430\u0442\u044c;\n" + "20: \u0434\u0432\u0430\u0434\u0446\u0430\u0442\u044c[ >>];\n" + "30: \u0442\u0440\u043b\u0434\u0446\u0430\u0442\u044c[ >>];\n" + "40: \u0441\u043e\u0440\u043e\u043a[ >>];\n" + "50: \u043f\u044f\u0442\u044c\u0434\u0435\u0441\u044f\u0442[ >>];\n" + "60: \u0448\u0435\u0441\u0442\u044c\u0434\u0435\u0441\u044f\u0442[ >>];\n" + "70: \u0441\u0435\u043c\u044c\u0434\u0435\u0441\u044f\u0442[ >>];\n" + "80: 
\u0432\u043e\u0441\u0435\u043c\u044c\u0434\u0435\u0441\u044f\u0442[ >>];\n" + "90: \u0434\u0435\u0432\u044f\u043d\u043e\u0441\u0442\u043e[ >>];\n" + "100: \u0441\u0442\u043e[ >>];\n" + "200: << \u0441\u0442\u043e[ >>];\n" + "1000: \u0442\u044b\u0441\u044f\u0447\u0430[ >>];\n" + "2000: << \u0442\u044b\u0441\u044f\u0447\u0430[ >>];\n" + "1,000,000: \u043c\u0438\u043b\u043b\u0438\u043e\u043d[ >>];\n" + "2,000,000: << \u043c\u0438\u043b\u043b\u0438\u043e\u043d[ >>];\n" + "1,000,000,000: =#,##0=;" + } } diff --git a/icu4c/data/sv.txt b/icu4c/data/sv.txt index 8dfa52272a9..aee882fa95d 100644 --- a/icu4c/data/sv.txt +++ b/icu4c/data/sv.txt @@ -118,4 +118,36 @@ sv { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Swedish. + + // can someone supply me with information on negatives and decimals? + + SpelloutRules { + "noll; ett; tv\u00e5; tre; fyra; fem; sex; sjo; \u00e5tta; nio;\n" + "tio; elva; tolv; tretton; fjorton; femton; sexton; sjutton; arton; nitton;\n" + "20: tjugo[>>];\n" + "30: trettio[>>];\n" + "40: fyrtio[>>];\n" + "50: femtio[>>];\n" + "60: sextio[>>];\n" + "70: sjuttio[>>];\n" + "80: \u00e5ttio[>>];\n" + "90: nittio[>>];\n" + "100: hundra[>>];\n" + "200: <>];\n" + "1000: tusen[ >>];\n" + "2000: << tusen[ >>];\n" + "1,000,000: en miljon[ >>];\n" + "2,000,000: << miljon[ >>];\n" + "1,000,000,000: en miljard[ >>];\n" + "2,000,000,000: << miljard[ >>];\n" + "1,000,000,000,000: en biljon[ >>];\n" + "2,000,000,000,000: << biljon[ >>];\n" + "1,000,000,000,000,000: =#,##0=" + } } diff --git a/icu4c/data/th.txt b/icu4c/data/th.txt index f89fd80c9b3..8324d1a30a9 100644 --- a/icu4c/data/th.txt +++ b/icu4c/data/th.txt @@ -245,4 +245,43 @@ th { "Thai", // Script Name "Thai" // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule 
Based Number Format Support + //------------------------------------------------------------ + +// Spellout rules for Thai. Data from Suwit Srivilairith, IBM Thailand + + SpelloutRules { + "%default:\n" + " -x: \u0e25\u0e1a>>;\n" + " x.x: <<\u0e08\u0e38\u0e14>>>;\n" + " \u0e28\u0e39\u0e19\u0e22\u0e4c; \u0e2b\u0e19\u0e36\u0e48\u0e07; \u0e2a\u0e2d\u0e07; \u0e2a\u0e32\u0e21;\n" + " \u0e2a\u0e35\u0e48; \u0e2b\u0e49\u0e32; \u0e2b\u0e01; \u0e40\u0e08\u0e47\u0e14; \u0e41\u0e1b\u0e14;\n" + " \u0e40\u0e01\u0e49\u0e32; \u0e2a\u0e34\u0e1a; \u0e2a\u0e34\u0e1a\u0e40\u0e2d\u0e47\u0e14;\n" + " \u0e2a\u0e34\u0e1a\u0e2a\u0e2d\u0e07; \u0e2a\u0e34\u0e1a\u0e2a\u0e32\u0e21;\n" + " \u0e2a\u0e34\u0e1a\u0e2a\u0e35\u0e48; \u0e2a\u0e34\u0e1a\u0e2b\u0e49\u0e32;\n" + " \u0e2a\u0e34\u0e1a\u0e2b\u0e01; \u0e2a\u0e34\u0e1a\u0e40\u0e08\u0e47\u0e14;\n" + " \u0e2a\u0e34\u0e1a\u0e41\u0e1b\u0e14; \u0e2a\u0e34\u0e1a\u0e40\u0e01\u0e49\u0e32;\n" + " 20: \u0e22\u0e35\u0e48\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 30: \u0e2a\u0e32\u0e21\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 40: \u0e2a\u0e35\u0e48\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 50: \u0e2b\u0e49\u0e32\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 60: \u0e2b\u0e01\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 70: \u0e40\u0e08\u0e47\u0e14\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 80: \u0e41\u0e1b\u0e14\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 90: \u0e40\u0e01\u0e49\u0e32\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 100: <<\u0e23\u0e49\u0e2d\u0e22[>>];\n" + " 1000: <<\u0e1e\u0e31\u0e19[>>];\n" + " 10000: <<\u0e2b\u0e21\u0e37\u0e48\u0e19[>>];\n" + " 100000: <<\u0e41\u0e2a\u0e19[>>];\n" + " 1,000,000: <<\u0e25\u0e49\u0e32\u0e19[>>];\n" + " 1,000,000,000: <<\u0e1e\u0e31\u0e19\u0e25\u0e49\u0e32\u0e19[>>];\n" + " 1,000,000,000,000: <<\u0e25\u0e49\u0e32\u0e19\u0e25\u0e49\u0e32\u0e19[>>];\n" + " 1,000,000,000,000,000: =#,##0=;\n" + "%%alt-ones:\n" + " \u0e28\u0e39\u0e19\u0e22\u0e4c;\n" + " \u0e40\u0e2d\u0e47\u0e14;\n" + " =%default=;\n"; + } } diff --git 
a/icu4c/source/data/locales/de.txt b/icu4c/source/data/locales/de.txt index 1348beed349..786f208ed2d 100644 --- a/icu4c/source/data/locales/de.txt +++ b/icu4c/source/data/locales/de.txt @@ -519,4 +519,58 @@ de { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + + +// * RuleBasedNumberFormat data for German + + // again, I'm not 100% sure of these rules. I think both "hundert" and + // "einhundert" are correct for 100, but I'm not sure which is preferable + // in situations where this framework is likely to be used. Also, is it + // really true that numbers are run together into compound words all the + // time? + + SpelloutRules { + // 1 is "eins" when by itself, but turns into "ein" in most + // combinations + "%alt-ones:\n" + " -x: minus >>;\n" + " x.x: << komma >>;\n" + " null; eins; =%%main=;\n" + "%%main:\n" + // words for numbers from 0 to 12. Notice that the values + // from 13 to 19 can be derived algorithmically, unlike in most + // other languages + " null; ein; zwei; drei; vier; f\u00fcnf; sechs; sieben; acht; neun;\n" + " zehn; elf; zw\u00f6lf; >>zehn;\n" + // rules for the multiples of 10. 
Notice that the ones digit + // goes on the front + " 20: [>>und]zwanzig;\n" + " 30: [>>und]drei\u00dfig;\n" + " 40: [>>und]vierzig;\n" + " 50: [>>und]f\u00fcnfzig;\n" + " 60: [>>und]sechzig;\n" + " 70: [>>und]siebzig;\n" + " 80: [>>und]achtzig;\n" + " 90: [>>und]neunzig;\n" + " 100: hundert[>%alt-ones>];\n" + " 200: <<hundert[>%alt-ones>];\n" + " 1000: tausend[>%alt-ones>];\n" + " 2000: <<tausend[>%alt-ones>];\n" + " 1,000,000: eine Million[ >%alt-ones>];\n" + " 2,000,000: << Millionen[ >%alt-ones>];\n" + " 1,000,000,000: eine Milliarde[ >%alt-ones>];\n" + " 2,000,000,000: << Milliarden[ >%alt-ones>];\n" + " 1,000,000,000,000: eine Billion[ >%alt-ones>];\n" + " 2,000,000,000,000: << Billionen[ >%alt-ones>];\n" + " 1,000,000,000,000,000: =#,##0=;" + "%%lenient-parse:\n" + " &\u0000 << ' ' << '-'\n" + " & ae , \u00e4 & ae , \u00c4\n" + " & oe , \u00f6 & oe , \u00d6\n" + " & ue , \u00fc & ue , \u00dc\n" + } } diff --git a/icu4c/source/data/locales/el.txt b/icu4c/source/data/locales/el.txt index 8e02ed5c67a..9f94ea58763 100644 --- a/icu4c/source/data/locales/el.txt +++ b/icu4c/source/data/locales/el.txt @@ -116,4 +116,53 @@ el { "Greek",// Script Name "Grek" // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Greek. Again in Greek we have to supply the words +// * for the multiples of 100 because they can't be derived algorithmically. +// * Also, the tens digit changes form when followed by a ones digit: an +// * accent mark disappears from the tens digit and moves to the ones digit. +// * Therefore, instead of using the [] notation, we actually have to use +// * two separate rules for each multiple of 10 to show the two forms of +// * the word. + + // Can someone supply me with information on negatives and decimals? + // I'm also missing the word for zero. Can someone clue me in? 
+ + SpelloutRules { + "zero (incomplete data); \u03ad\u03bd\u03b1; \u03b4\u03cd\u03bf; \u03b4\u03c1\u03af\u03b1; " + "\u03c4\u03ad\u03c3\u03c3\u03b5\u03c1\u03b1; \u03c0\u03ad\u03bd\u03c4\u03b5; " + "\u03ad\u03be\u03b9; \u03b5\u03c0\u03c4\u03ac; \u03bf\u03ba\u03c4\u03ce; " + "\u03b5\u03bd\u03bd\u03ad\u03b1;\n" + "10: \u03b4\u03ad\u03ba\u03b1; " + "\u03ad\u03bd\u03b4\u03b5\u03ba\u03b1; \u03b4\u03ce\u03b4\u03b5\u03ba\u03b1; " + "\u03b4\u03b5\u03ba\u03b1>>;\n" + "20: \u03b5\u03af\u03ba\u03bf\u03c3\u03b9; \u03b5\u03b9\u03ba\u03bf\u03c3\u03b9>>;\n" + "30: \u03c4\u03c1\u03b9\u03ac\u03bd\u03c4\u03b1; \u03c4\u03c1\u03b9\u03b1\u03bd\u03c4\u03b1>>;\n" + "40: \u03c3\u03b1\u03c1\u03ac\u03bd\u03c4\u03b1; \u03c3\u03b1\u03c1\u03b1\u03bd\u03c4\u03b1>>;\n" + "50: \u03c0\u03b5\u03bd\u03ae\u03bd\u03c4\u03b1; \u03c0\u03b5\u03bd\u03b7\u03bd\u03c4\u03b1>>;\n" + "60: \u03b5\u03be\u03ae\u03bd\u03c4\u03b1; \u03b5\u03be\u03b7\u03bd\u03c4\u03b1>>;\n" + "70: \u03b5\u03b2\u03b4\u03bf\u03bc\u03ae\u03bd\u03c4\u03b1; " + "\u03b5\u03b2\u03b4\u03bf\u03bc\u03b7\u03bd\u03c4\u03b1>>;\n" + "80: \u03bf\u03b3\u03b4\u03cc\u03bd\u03c4\u03b1; \u03bf\u03b3\u03b4\u03bf\u03bd\u03c4\u03b1>>;\n" + "90: \u03b5\u03bd\u03bd\u03b5\u03bd\u03ae\u03bd\u03c4\u03b1; " + "\u03b5\u03bd\u03bd\u03b5\u03bd\u03b7\u03bd\u03c4\u03b1>>;\n" + "100: \u03b5\u03ba\u03b1\u03c4\u03cc[\u03bd >>];\n" + "200: \u03b4\u03b9\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "300: \u03c4\u03c1\u03b9\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "400: \u03c4\u03b5\u03c4\u03c1\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "500: \u03c0\u03b5\u03bd\u03c4\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "600: \u03b5\u03be\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "700: \u03b5\u03c0\u03c4\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "800: \u03bf\u03ba\u03c4\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "900: \u03b5\u03bd\u03bd\u03b9\u03b1\u03ba\u03cc\u03c3\u03b9\u03b1[ >>];\n" + "1000: \u03c7\u03af\u03bb\u03b9\u03b1[ >>];\n" + "2000: << 
\u03c7\u03af\u03bb\u03b9\u03b1[ >>];\n" + "1,000,000: << \u03b5\u03ba\u03b1\u03c4\u03bf\u03bc\u03bc\u03b9\u03cc\u03c1\u03b9\u03bf[ >>];\n" + "1,000,000,000: << \u03b4\u03b9\u03c3\u03b5\u03ba\u03b1\u03c4\u03bf\u03bc\u03bc\u03b9\u03cc\u03c1\u03b9\u03bf[ >>];\n" + "1,000,000,000,000: =#,##0=" + } } diff --git a/icu4c/source/data/locales/en.txt b/icu4c/source/data/locales/en.txt index efac070c26c..23c89fce318 100644 --- a/icu4c/source/data/locales/en.txt +++ b/icu4c/source/data/locales/en.txt @@ -233,4 +233,11 @@ en { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// inherited from root + } diff --git a/icu4c/source/data/locales/en_GB.txt b/icu4c/source/data/locales/en_GB.txt index b36f0f25f50..80a00dddeee 100644 --- a/icu4c/source/data/locales/en_GB.txt +++ b/icu4c/source/data/locales/en_GB.txt @@ -49,4 +49,70 @@ en_GB { "BST", } } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for U.K. English. U.K. English has one significant +// * difference from U.S. English: the names for values of 1,000,000,000 +// * and higher. In American English, each successive "-illion" is 1,000 +// * times greater than the preceding one: 1,000,000,000 is "one billion" +// * and 1,000,000,000,000 is "one trillion." In British English, each +// * successive "-illion" is one million times greater than the one before: +// * "one billion" is 1,000,000,000,000 (or what Americans would call a +// * "trillion"), and "one trillion" is 1,000,000,000,000,000,000. +// * 1,000,000,000 in British English is "one thousand million." (This +// * value is sometimes called a "milliard," but this word seems to have +// * fallen into disuse.) 
+ + // Could someone please correct me if I'm wrong about "milliard" falling + // into disuse, or have missed any other details of how large numbers + // are rendered. Also, could someone please provide me with information + // on which other English-speaking countries use which system? Right now, + // I'm assuming that the U.S. system is used in Canada and that all the + // other English-speaking countries follow the British system. Can + // someone out there confirm this? + + SpelloutRules { + "%simplified:\n" + " -x: minus >>;\n" + " x.x: << point >>;\n" + " zero; one; two; three; four; five; six; seven; eight; nine;\n" + " ten; eleven; twelve; thirteen; fourteen; fifteen; sixteen;\n" + " seventeen; eighteen; nineteen;\n" + " 20: twenty[->>];\n" + " 30: thirty[->>];\n" + " 40: forty[->>];\n" + " 50: fifty[->>];\n" + " 60: sixty[->>];\n" + " 70: seventy[->>];\n" + " 80: eighty[->>];\n" + " 90: ninety[->>];\n" + " 100: << hundred[ >>];\n" + " 1000: << thousand[ >>];\n" + " 1,000,000: << million[ >>];\n" + " 1,000,000,000,000: << billion[ >>];\n" + " 1,000,000,000,000,000: =#,##0=;\n" + "%default:\n" + " -x: minus >>;\n" + " x.x: << point >>;\n" + " =%simplified=;\n" + " 100: << hundred[ >%%and>];\n" + " 1000: << thousand[ >%%and>];\n" + " 100,000>>: << thousand[>%%commas>];\n" + " 1,000,000: << million[>%%commas>];\n" + " 1,000,000,000,000: << billion[>%%commas>];\n" + " 1,000,000,000,000,000: =#,##0=;\n" + "%%and:\n" + " and =%default=;\n" + " 100: =%default=;\n" + "%%commas:\n" + " ' and =%default=;\n" + " 100: , =%default=;\n" + " 1000: , <%default< thousand, >%default>;\n" + " 1,000,000: , =%default=;" + "%%lenient-parse:\n" + " & ' ' , ',' ;\n" + } } diff --git a/icu4c/source/data/locales/eo.txt b/icu4c/source/data/locales/eo.txt index 0e3a42027d8..82a376fa9fb 100644 --- a/icu4c/source/data/locales/eo.txt +++ b/icu4c/source/data/locales/eo.txt @@ -140,4 +140,31 @@ eo { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + 
//------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// data from 'Esperanto-programita 1' courtesy of Markus Scherer + + SpelloutRules { + "-x: minus >>;\n" + "x.x: << komo >>;\n" + "nulo; unu; du; tri; kvar; kvin; ses; sep; ok; na\u016d;\n" + "10: dek[ >>];\n" + "20: <<dek[ >>];\n" + "100: cent[ >>];\n" + "200: <<cent[ >>];\n" + "1000: mil[ >>];\n" + "2000: <<mil[ >>];\n" + "10000: dekmil[ >>];\n" + "11000>: << mil[ >>];\n" + "1,000,000: miliono[ >>];\n" + "2,000,000: << milionoj[ >>];\n" + "1,000,000,000: miliardo[ >>];\n" + "2,000,000,000: << miliardoj[ >>];\n" + "1,000,000,000,000: biliono[ >>];\n" + "2,000,000,000,000: << bilionoj[ >>];\n" + "1,000,000,000,000,000: =#,##0=;\n" + } } diff --git a/icu4c/source/data/locales/es.txt b/icu4c/source/data/locales/es.txt index c897fa2226f..f3ed4b2eb8a 100644 --- a/icu4c/source/data/locales/es.txt +++ b/icu4c/source/data/locales/es.txt @@ -258,4 +258,69 @@ es { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Spanish. The Spanish rules are quite similar to +// * the English rules, but there are some important differences: +// * First, we have to provide separate rules for most of the twenties +// * because the ones digit frequently picks up an accent mark that it +// * doesn't have when standing alone. Second, each multiple of 100 has +// * to be specified separately because the multiplier on 100 very often +// * changes form in the contraction: 500 is "quinientos," not +// * "cincocientos." In addition, the word for 100 is "cien" when +// * standing alone, but changes to "ciento" when followed by more digits. +// * There are also some other differences. + + // The Spanish rules are incomplete. 
I'm missing information on negative + // numbers and numbers with fractional parts. I also don't have + // information on numbers higher than the millions. + + SpelloutRules { + // negative-number and fraction rules + "-x: menos >>;\n" + "x.x: << punto >>;\n" + // words for values from 0 to 19 + "cero; uno; dos; tres; cuatro; cinco; seis; siete; ocho; nueve;\n" + "diez; once; doce; trece; catorce; quince; diecis\u00e9is;\n" + " diecisiete; dieciocho; diecinueve;\n" + // words for values from 20 to 29 (necessary because the ones digit + // often picks up an accent mark it doesn't have when standing alone) + "veinte; veintiuno; veintid\u00f3s; veintitr\u00e9s; veinticuatro;\n" + " veinticinco; veintis\u00e9is; veintisiete; veintiocho;\n" + " veintinueve;\n" + // words for multiples of 10 (notice that the tens digit is separated + // from the ones digit by the word "y".) + "30: treinta[ y >>];\n" + "40: cuarenta[ y >>];\n" + "50: cincuenta[ y >>];\n" + "60: sesenta[ y >>];\n" + "70: setenta[ y >>];\n" + "80: ochenta[ y >>];\n" + "90: noventa[ y >>];\n" + // 100 by itself is "cien," but 100 followed by something is "ciento" + "100: cien;\n" + "101: ciento >>;\n" + // words for multiples of 100 (must be stated because they're + // rarely simple concatenations) + "200: doscientos[ >>];\n" + "300: trescientos[ >>];\n" + "400: cuatrocientos[ >>];\n" + "500: quinientos[ >>];\n" + "600: seiscientos[ >>];\n" + "700: setecientos[ >>];\n" + "800: ochocientos[ >>];\n" + "900: novecientos[ >>];\n" + // for 1,000, the multiplier on "mil" is omitted: 2,000 is "dos mil," + // but 1,000 is just "mil." 
+ "1000: mil[ >>];\n" + "2000: << mil[ >>];\n" + // 1,000,000 is "un millon," not "uno millon" + "1,000,000: un mill\u00f3n[ >>];\n" + "2,000,000: << mill\u00f3n[ >>];\n" + // overflow rule + "1,000,000,000: =#,##0= (incomplete data);" + } } diff --git a/icu4c/source/data/locales/fr.txt b/icu4c/source/data/locales/fr.txt index 9f5991361d8..185e1fa46aa 100644 --- a/icu4c/source/data/locales/fr.txt +++ b/icu4c/source/data/locales/fr.txt @@ -190,10 +190,73 @@ fr { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for French. French adds some interesting quirks of its +// * own: 1) The word "et" is interposed between the tens and ones digits, +// * but only if the ones digit is 1: 20 is "vingt," and 22 is "vingt-deux," +// * but 21 is "vingt-et-un." 2) There are no words for 70, 80, or 90. +// * "quatre-vingts" ("four twenties") is used for 80, and values proceed +// * by score from 60 to 99 (e.g., 73 is "soixante-treize" ["sixty-thirteen"]). +// * Numbers from 1,100 to 1,199 are rendered as hundreds rather than +// * thousands: 1,100 is "onze cents" ("eleven hundred"), rather than +// * "mille cent" ("one thousand one hundred") + + SpelloutRules { + // the main rule set + "%main:\n" + " -x: moins >>;\n" + " x.x: << virgule >>;\n" + // words for numbers from 0 to 19 + " z\u00e9ro; un; deux; trois; quatre; cinq; six; sept; huit; neuf;\n" + " dix; onze; douze; treize; quatorze; quinze; seize;\n" + " dix-sept; dix-huit; dix-neuf;\n" + // words for the multiples of 10: %%alt-ones inserts "et" + // when needed + " 20: vingt[->%%alt-ones>];\n" + " 30: trente[->%%alt-ones>];\n" + " 40: quarante[->%%alt-ones>];\n" + " 50: cinquante[->%%alt-ones>];\n" + // rule for 60. 
The /20 causes this rule's multiplier to be + // 20 rather than 10, allowing us to recurse for all values + // from 60 to 79... + " 60/20: soixante[->%%alt-ones>];\n" + // ...except for 71, which must be special-cased + " 71: soixante et onze;\n" + // at 72, we have to repeat the rule for 60 to get us to 79 + " 72/20: soixante->%%alt-ones>;\n" + // at 80, we state a new rule with the phrase for 80. Since + // it changes form when there's a ones digit, we need a second + // rule at 81. This rule also includes "/20," allowing it to + // be used correctly for all values up to 99 + " 80: quatre-vingts; 81/20: quatre-vingt->>;\n" + // "cent" becomes plural when preceded by a multiplier, and + // the multiplier is omitted from the singular form + " 100: cent[ >>];\n" + " 200: << cents[ >>];\n" + " 1000: mille[ >>];\n" + // values from 1,100 to 1,199 are rendered as "onze cents..." + // instead of "mille cent..." The > after "1100" decreases + // the rule's exponent, causing its multiplier to be 100 instead + // of 1,000. This prevents us from getting "onze cents cent + // vingt-deux" ("eleven hundred one hundred twenty-two"). 
+ " 1100>: onze cents[ >>];\n" + // at 1,200, we go back to formatting in thousands, so we + // repeat the rule for 1,000 + " 1200: mille >>;\n" + // at 2,000, the multiplier is added + " 2000: << mille[ >>];\n" + " 1,000,000: << million[ >>];\n" + " 1,000,000,000: << milliarde[ >>];\n" + " 1,000,000,000,000: << billion[ >>];\n" + " 1,000,000,000,000,000: =#,##0=;\n" + // %%alt-ones is used to insert "et" when the ones digit is 1 + "%%alt-ones:\n" + " ; et-un; =%main=;\n" + "%%lenient-parse:\n" + " &\u0000 << ' ' << ',' << '-';\n" + } } - - - - - - diff --git a/icu4c/source/data/locales/fr_CH.txt b/icu4c/source/data/locales/fr_CH.txt index ce7d398b313..e9c896f5b06 100644 --- a/icu4c/source/data/locales/fr_CH.txt +++ b/icu4c/source/data/locales/fr_CH.txt @@ -56,4 +56,51 @@ fr_CH { "GMT", } } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Swiss French. Swiss French differs from French French +// * in that it does have words for 70, 80, and 90. This rule set shows them, +// * and is simpler as a result. + + // again, I'm missing information on negative numbers and decimals for + // these two rule sets. Also, I'm not 100% sure about Swiss French. Is + // this correct? Is "onze cents" commonly used for 1,100 in both France + // and Switzerland? Can someone fill me in on the rules for the other + // French-speaking countries? I've heard conflicting opinions on which + // version is used in Canada, and I understand there's an alternate set + // of words for 70, 80, and 90 that is used somewhere, but I don't know + // what those words are or where they're used. 
+ + SpelloutRules { + "%main:\n" + " -x: moins >>;\n" + " x.x: << virgule >>;\n" + " z\u00e9ro; un; deux; trois; quatre; cinq; six; sept; huit; neuf;\n" + " dix; onze; douze; treize; quatorze; quinze; seize;\n" + " dix-sept; dix-huit; dix-neuf;\n" + " 20: vingt[->%%alt-ones>];\n" + " 30: trente[->%%alt-ones>];\n" + " 40: quarante[->%%alt-ones>];\n" + " 50: cinquante[->%%alt-ones>];\n" + " 60: soixante[->%%alt-ones>];\n" + // notice new words for 70, 80, and 90 + " 70: septante[->%%alt-ones>];\n" + " 80: octante[->%%alt-ones>];\n" + " 90: nonante[->%%alt-ones>];\n" + " 100: cent[ >>];\n" + " 200: << cents[ >>];\n" + " 1000: mille[ >>];\n" + " 1100>: onze cents[ >>];\n" + " 1200: mille >>;\n" + " 2000: << mille[ >>];\n" + " 1,000,000: << million[ >>];\n" + " 1,000,000,000: << milliarde[ >>];\n" + " 1,000,000,000,000: << billion[ >>];\n" + " 1,000,000,000,000,000: =#,##0=;\n" + "%%alt-ones:\n" + " ; et-un; =%main=;" + } } diff --git a/icu4c/source/data/locales/he.txt b/icu4c/source/data/locales/he.txt index f3f38a2b2f5..d622e8a963f 100644 --- a/icu4c/source/data/locales/he.txt +++ b/icu4c/source/data/locales/he.txt @@ -102,4 +102,38 @@ he { "Hebrew", // Script Name "Hebr" // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Hebrew. Hebrew actually has inflected forms for +// * most of the lower-order numbers. The masculine forms are shown +// * here. + + // This data is woefully incomplete. Can someone fill me in on the + // various inflected forms of the numbers, which seem to be necessary + // to do Hebrew correctly? Can someone supply me with data for values + // from 1,000,000 on up? What about the word for zero? What about + // information on negatives and decimals? 
+ + SpelloutRules { + "zero (incomplete data); \u05d0\u05d4\u05d3; \u05e9\u05d2\u05d9\u05d9\u05dd; \u05e9\u05dc\u05d5\u05e9\u05d4;\n" + "4: \u05d0\u05d3\u05d1\u05e6\u05d4; \u05d7\u05d2\u05d5\u05d9\u05e9\u05d4; \u05e9\u05e9\u05d4;\n" + "7: \u05e9\u05d1\u05e6\u05d4; \u05e9\u05de\u05d5\u05d2\u05d4; \u05ea\u05e9\u05e6\u05d4;\n" + "10: \u05e6\u05e9\u05d3\u05d4[ >>];\n" + "20: \u05e6\u05e9\u05d3\u05d9\u05dd[ >>];\n" + "30: \u05e9\u05dc\u05d5\u05e9\u05d9\u05dd[ >>];\n" + "40: \u05d0\u05d3\u05d1\u05e6\u05d9\u05dd[ >>];\n" + "50: \u05d7\u05de\u05d9\u05e9\u05d9\u05dd[ >>];\n" + "60: \u05e9\u05e9\u05d9\u05dd[ >>];\n" + "70: \u05e9\u05d1\u05e6\u05d9\u05dd[ >>];\n" + "80: \u05e9\u05de\u05d5\u05d2\u05d9\u05dd[ >>];\n" + "90: \u05ea\u05e9\u05e6\u05d9\u05dd[ >>];\n" + "100: \u05de\u05d0\u05d4[ >>];\n" + "200: << \u05de\u05d0\u05d4[ >>];\n" + "1000: \u05d0\u05dc\u05e3[ >>];\n" + "2000: << \u05d0\u05dc\u05e3[ >>];\n" + "1,000,000: =#,##0= (incomplete data);" + } } diff --git a/icu4c/source/data/locales/it.txt b/icu4c/source/data/locales/it.txt index 6769b9f6893..a3dcbf7d43e 100644 --- a/icu4c/source/data/locales/it.txt +++ b/icu4c/source/data/locales/it.txt @@ -117,4 +117,106 @@ it { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Italian. Like German, most Italian numbers are +// * written as single words. What makes these rules complicated is the rule +// * that says that when a word ending in a vowel and a word beginning with +// * a vowel are combined into a compound, the vowel is dropped from the +// * end of the first word: 180 is "centottanta," not "centoottanta." +// * The complexity of this rule set is to produce this behavior. + + // Can someone confirm that I did the vowel-eliding thing right? 
I'm + // not 100% sure I'm doing it in all the right places, or completely + // correctly. Also, I don't have information for negatives and decimals, + // and I lack words for values from 1,000,000 on up. + + SpelloutRules { + // main rule set. Follows the patterns of the preceding rule sets, + // except that the final vowel is omitted from words ending in + // vowels when they are followed by another word; instead, we have + // separate rule sets that are identical to this one, except that + // all the words that don't begin with a vowel have a vowel tacked + // onto them at the front. A word ending in a vowel calls a + // substitution that will supply that vowel, unless that vowel is to + // be elided. + "%main:\n" + " -x: meno >>;\n" + " x.x: << virgola >>;\n" + " zero; uno; due; tre; quattro; cinque; sei; sette; otto;\n" + " nove;\n" + " dieci; undici; dodici; tredici; quattordici; quindici; sedici;\n" + " diciasette; diciotto; diciannove;\n" + " 20: venti; vent>%%with-i>;\n" + " 30: trenta; trent>%%with-i>;\n" + " 40: quaranta; quarant>%%with-a>;\n" + " 50: cinquanta; cinquant>%%with-a>;\n" + " 60: sessanta; sessant>%%with-a>;\n" + " 70: settanta; settant>%%with-a>;\n" + " 80: ottanta; ottant>%%with-a>;\n" + " 90: novanta; novant>%%with-a>;\n" + " 100: cento; cent[>%%with-o>];\n" + " 200: <%%with-o>];\n" + " 1000: mille; mill[>%%with-i>];\n" + " 2000: <%%with-a>];\n" + " 100,000>>: <>];\n" + " 1,000,000: =#,##0= (incomplete data);\n" + "%%with-a:\n" + " azero; uno; adue; atre; aquattro; acinque; asei; asette; otto;\n" + " anove;\n" + " adieci; undici; adodici; atredici; aquattordici; aquindici; asedici;\n" + " adiciasette; adiciotto; adiciannove;\n" + " 20: aventi; avent>%%with-i>;\n" + " 30: atrenta; atrent>%%with-i>;\n" + " 40: aquaranta; aquarant>%%with-a>;\n" + " 50: acinquanta; acinquant>%%with-a>;\n" + " 60: asessanta; asessant>%%with-a>;\n" + " 70: asettanta; asettant>%%with-a>;\n" + " 80: ottanta; ottant>%%with-a>;\n" + " 90: anovanta; 
anovant>%%with-a>;\n" + " 100: acento; acent[>%%with-o>];\n" + " 200: <%%with-a%%with-o>];\n" + " 1000: amille; amill[>%%with-i>];\n" + " 2000: <%%with-a%%with-a>];\n" + " 100,000: =%main=;\n" + "%%with-i:\n" + " izero; uno; idue; itre; iquattro; icinque; isei; isette; otto;\n" + " inove;\n" + " idieci; undici; idodici; itredici; iquattordici; iquindici; isedici;\n" + " idiciasette; idiciotto; idiciannove;\n" + " 20: iventi; ivent>%%with-i>;\n" + " 30: itrenta; itrent>%%with-i>;\n" + " 40: iquaranta; iquarant>%%with-a>;\n" + " 50: icinquanta; icinquant>%%with-a>;\n" + " 60: isessanta; isessant>%%with-a>;\n" + " 70: isettanta; isettant>%%with-a>;\n" + " 80: ottanta; ottant>%%with-a>;\n" + " 90: inovanta; inovant>%%with-a>;\n" + " 100: icento; icent[>%%with-o>];\n" + " 200: <%%with-i%%with-o>];\n" + " 1000: imille; imill[>%%with-i>];\n" + " 2000: <%%with-i%%with-a>];\n" + " 100,000: =%main=;\n" + "%%with-o:\n" + " ozero; uno; odue; otre; oquattro; ocinque; osei; osette; otto;\n" + " onove;\n" + " odieci; undici; ododici; otredici; oquattordici; oquindici; osedici;\n" + " odiciasette; odiciotto; odiciannove;\n" + " 20: oventi; ovent>%%with-i>;\n" + " 30: otrenta; otrent>%%with-i>;\n" + " 40: oquaranta; oquarant>%%with-a>;\n" + " 50: ocinquanta; ocinquant>%%with-a>;\n" + " 60: osessanta; osessant>%%with-a>;\n" + " 70: osettanta; osettant>%%with-a>;\n" + " 80: ottanta; ottant>%%with-a>;\n" + " 90: onovanta; onovant>%%with-a>;\n" + " 100: ocento; ocent[>%%with-o>];\n" + " 200: <%%with-o%%with-o>];\n" + " 1000: omille; omill[>%%with-i>];\n" + " 2000: <%%with-o%%with-a>];\n" + " 100,000: =%main=;\n" + } } diff --git a/icu4c/source/data/locales/ja.txt b/icu4c/source/data/locales/ja.txt index 42ec50a9102..6be89505b73 100644 --- a/icu4c/source/data/locales/ja.txt +++ b/icu4c/source/data/locales/ja.txt @@ -859,4 +859,47 @@ ja { "JST", } } + + //------------------------------------------------------------ + // Rule Based Number Format Support + 
//------------------------------------------------------------ + +// * Spellout rules for Japanese. In Japanese, there really isn't any +// * distinction between a number written out in digits and a number +// * written out in words: the ideographic characters are both digits +// * and words. This rule set provides two variants: %traditional +// * uses the traditional CJK numerals (which are also used in China +// * and Korea). %financial uses alternate ideographs for many numbers +// * that are harder to alter than the traditional numerals (one could +// * fairly easily change a one to +// * a three just by adding two strokes, for example). This is also done in +// * the other countries using Chinese idographs, but different ideographs +// * are used in those places. + + // Can someone supply me with the right fraud-proof ideographs for + // Simplified and Traditional Chinese, and for Korean? Can someone + // supply me with information on negatives and decimals? + + SpelloutRules { + "%financial:\n" + " \u96f6; \u58f1; \u5f10; \u53c2; \u56db; \u4f0d; \u516d; \u4e03; \u516b; \u4e5d;\n" + " \u62fe[>>];\n" + " 20: <<\u62fe[>>];\n" + " 100: <<\u767e[>>];\n" + " 1000: <<\u5343[>>];\n" + " 10,000: <<\u4e07[>>];\n" + " 100,000,000: <<\u5104[>>];\n" + " 1,000,000,000,000: <<\u5146[>>];\n" + " 10,000,000,000,000,000: =#,##0=;\n" + "%traditional:\n" + " \u96f6; \u4e00; \u4e8c; \u4e09; \u56db; \u4e94; \u516d; \u4e03; \u516b; \u4e5d;\n" + " \u5341[>>];\n" + " 20: <<\u5341[>>];\n" + " 100: <<\u767e[>>];\n" + " 1000: <<\u5343[>>];\n" + " 10,000: <<\u4e07[>>];\n" + " 100,000,000: <<\u5104[>>];\n" + " 1,000,000,000,000: <<\u5146[>>];\n" + " 10,000,000,000,000,000: =#,##0=;" + } } diff --git a/icu4c/source/data/locales/nl.txt b/icu4c/source/data/locales/nl.txt index 9e0a6818784..c6d1e777add 100644 --- a/icu4c/source/data/locales/nl.txt +++ b/icu4c/source/data/locales/nl.txt @@ -108,4 +108,32 @@ nl { "Latg", // ISO 15924 Name } + 
//------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Dutch + + // can someone supply me with information on negatives and decimals? + + SpelloutRules { + " -x: min >>;\n" + "x.x: << komma >>;\n" + "(zero?); een; twee; drie; vier; vijf; zes; zeven; acht; negen;\n" + "tien; elf; twaalf; dertien; veertien; vijftien; zestien;\n" + "zeventien; achtien; negentien;\n" + "20: [>> en ]twintig;\n" + "30: [>> en ]dertig;\n" + "40: [>> en ]veertig;\n" + "50: [>> en ]vijftig;\n" + "60: [>> en ]zestig;\n" + "70: [>> en ]zeventig;\n" + "80: [>> en ]tachtig;\n" + "90: [>> en ]negentig;\n" + "100: << honderd[ >>];\n" + "1000: << duizend[ >>];\n" + "1,000,000: << miljoen[ >>];\n" + "1,000,000,000: << biljoen[ >>];\n" + "1,000,000,000,000: =#,##0=" + } } diff --git a/icu4c/source/data/locales/root.txt b/icu4c/source/data/locales/root.txt index cc0cf0278f7..d5ccbdea702 100644 --- a/icu4c/source/data/locales/root.txt +++ b/icu4c/source/data/locales/root.txt @@ -1180,11 +1180,221 @@ root { "Anchorage", } } + + LocaleScript{ "Latin", "Latn", "Latf", "Latg" } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for U.S. English. This rule set has two variants: +// * %simplified is a set of rules showing the simple method of spelling +// * out numbers in English: 289 is formatted as "two hundred eighty-nine". +// * %default uses a more complicated algorithm to format +// * numbers in a more natural way: 289 is formatted as "two hundred AND +// * eighty-nine" and commas are inserted between the thousands groups for +// * values above 100,000. + + SpelloutRules { + // This rule set shows the normal simple formatting rules for English + "%simplified:\n" + // negative number rule. 
This rule is used to format negative + // numbers. The result of formatting the number's absolute + // value is placed where the >> is. + " -x: minus >>;\n" + // fraction rule. This rule is used for formatting numbers + // with fractional parts. The result of formatting the + // number's integral part is substituted for the <<, and + // the result of formatting the number's fractional part + // (one digit at a time, e.g., 0.123 is "zero point one two + // three") replaces the >>. + " x.x: << point >>;\n" + // the rules for the values from 0 to 19 are simply the + // words for those numbers + " zero; one; two; three; four; five; six; seven; eight; nine;\n" + " ten; eleven; twelve; thirteen; fourteen; fifteen; sixteen;\n" + " seventeen; eighteen; nineteen;\n" + // beginning at 20, we use the >> to mark the position where + // the result of formatting the number's ones digit is placed. Thus, + // we only need a new rule at every multiple of 10. Text in + // brackets is omitted if the value being formatted is an + // even multiple of 10. + " 20: twenty[->>];\n" + " 30: thirty[->>];\n" + " 40: forty[->>];\n" + " 50: fifty[->>];\n" + " 60: sixty[->>];\n" + " 70: seventy[->>];\n" + " 80: eighty[->>];\n" + " 90: ninety[->>];\n" + // beginning at 100, we can use << to mark the position where + // the result of formatting the multiple of 100 is to be + // inserted. Notice also that the meaning of >> has shifted: + // here, it refers to both the ones place and the tens place. + // The meanings of the << and >> tokens depend on the base value + // of the rule. A rule's divisor is (usually) the highest + // power of 10 that is less than or equal to the rule's base + // value. The value being formatted is divided by the rule's + // divisor, and the integral quotient is used to get the text + // for <<, while the remainder is used to produce the text + // for >>. 
Again, text in brackets is omitted if the value + // being formatted is an even multiple of the rule's divisor + // (in this case, an even multiple of 100) + " 100: << hundred[ >>];\n" + // The rules for the higher numbers work the same way as the + // rule for 100: Again, the << and >> tokens depend on the + // rule's divisor, which for all these rules is also the rule's + // base value. To group by thousand, we simply don't have any + // rules between 1,000 and 1,000,000. + " 1000: << thousand[ >>];\n" + " 1,000,000: << million[ >>];\n" + " 1,000,000,000: << billion[ >>];\n" + " 1,000,000,000,000: << trillion[ >>];\n" + // overflow rule. This rule specifies that values of a + // quadrillion or more are shown in numerals rather than words. + // The == token means to format (with new rules) the value + // being formatted by this rule and place the result where + // the == is. The #,##0 inside the == signs is a + // DecimalFormat pattern. It specifies that the value should + // be formatted with a DecimalFormat object, and that it + // should be formatted with no decimal places, at least one + // digit, and a thousands separator. + " 1,000,000,000,000,000: =#,##0=;\n" + + // %default is a more elaborate form of %simplified; It is basically + // the same, except that it introduces "and" before the ones digit + // when appropriate (basically, between the tens and ones digits) and + // separates the thousands groups with commas in values over 100,000. + "%default:\n" + // negative-number and fraction rules. 
These are the same + // as those for %simplified, but have to be stated here too + // because this is an entry point + " -x: minus >>;\n" + " x.x: << point >>;\n" + // just use %simplified for values below 100 + " =%simplified=;\n" + // for values from 100 to 9,999 use %%and to decide whether or + // not to interpose the "and" + " 100: << hundred[ >%%and>];\n" + " 1000: << thousand[ >%%and>];\n" + // for values of 100,000 and up, use %%commas to interpose the + // commas in the right places (and also to interpose the "and") + " 100,000>>: << thousand[>%%commas>];\n" + " 1,000,000: << million[>%%commas>];\n" + " 1,000,000,000: << billion[>%%commas>];\n" + " 1,000,000,000,000: << trillion[>%%commas>];\n" + " 1,000,000,000,000,000: =#,##0=;\n" + // if the value passed to this rule set is greater than 100, don't + // add the "and"; if it's less than 100, add "and" before the last + // digits + "%%and:\n" + " and =%default=;\n" + " 100: =%default=;\n" + // this rule set is used to place the commas + "%%commas:\n" + // for values below 100, add "and" (the apostrophe at the + // beginning is ignored, but causes the space that follows it + // to be significant: this is necessary because the rules + // calling %%commas don't put a space before it) + " ' and =%default=;\n" + // put a comma after the thousands (or whatever preceded the + // hundreds) + " 100: , =%default=;\n" + // put a comma after the millions (or whatever precedes the + // thousands) + " 1000: , <%default< thousand, >%default>;\n" + // and so on... + " 1,000,000: , =%default=;" + // %%lenient-parse isn't really a set of number formatting rules; + // it's a set of collation rules. Lenient-parse mode uses a Collator + // object to compare fragments of the text being parsed to the text + // in the rules, allowing more leeway in the matching text. 
This set + // of rules tells the formatter to ignore commas when parsing (it + // already ignores spaces, which is why we refer to the space; it also + // ignores hyphens, making "twenty one" and "twenty-one" parse + // identically) + "%%lenient-parse:\n" + // " & ' ' , ',' ;\n" + " &\u0000 << ' ' << ',' << '-'; \n" + } + + +// * This rule set adds an English ordinal abbreviation to the end of a +// * number. For example, 2 is formatted as "2nd". Parsing doesn't work with +// * this rule set. To parse, use DecimalFormat on the numeral. + OrdinalRules { + // this rule set formats the numeral and calls %%abbrev to + // supply the abbreviation + "%main:\n" + " =#,##0==%%abbrev=;\n" + // this rule set supplies the abbreviation + "%%abbrev:\n" + // the abbreviations. Everything from 4 to 19 ends in "th" + " th; st; nd; rd; th;\n" + // at 20, we begin repeating the cycle every 10 (13 is "13th", + // but 23 and 33 are "23rd" and "33rd"). We do this by + // ignoring all but the ones digit in selecting the abbreviation + " 20: >>;\n" + // at 100, we repeat the whole cycle by considering only the + // tens and ones digits in picking an abbreviation + " 100: >>;\n" + } + +// * This rule set formats a number of seconds in sexagesimal notation +// * (i.e., hours, minutes, and seconds). %with-words formats it with +// * words (3,740 is "1 hour, 2 minutes, 20 seconds") and %in-numerals +// * formats it entirely in numerals (3,740 is "1:02:20"). 
+ DurationRules { + // main rule set for formatting with words + "%with-words:\n" + // take care of singular and plural forms of "second" + " 0 seconds; 1 second; =0= seconds;\n" + // use %%min to format values greater than 60 seconds + " 60/60: <%%min<[, >>];\n" + // use %%hr to format values greater than 3,600 seconds + // (the ">>>" below causes us to see the number of minutes + // even when there are zero minutes) + " 3600/60: <%%hr<[, >>>];\n" + // this rule set takes care of the singular and plural forms + // of "minute" + "%%min:\n" + " 0 minutes; 1 minute; =0= minutes;\n" + // this rule set takes care of the singular and plural forms + // of "hour" + "%%hr:\n" + " 0 hours; 1 hour; =0= hours;\n" + + // main rule set for formatting in numerals + "%in-numerals:\n" + // values below 60 seconds are shown with "sec." + " =0= sec.;\n" + // higher values are shown with colons: %%min-sec is used for + // values below 3,600 seconds... + " 60: =%%min-sec=;\n" + // ...and %%hr-min-sec is used for values of 3,600 seconds + // and above + " 3600: =%%hr-min-sec=;\n" + // this rule causes values of less than 10 minutes to show without + // a leading zero + "%%min-sec:\n" + " 0: :=00=;\n" + " 60/60: <0<>>;\n" + // this rule set is used for values of 3,600 or more. Minutes are always + // shown, and always shown with two digits + "%%hr-min-sec:\n" + " 0: :=00=;\n" + " 60/60: <00<>>;\n" + " 3600/60: <#,##0<:>>>;\n" + // the lenient-parse rules allow several different characters to be used + // as delimiters between hours, minutes, and seconds + "%%lenient-parse:\n" + " & ':' = '.' 
= ' ' = '-';\n" + } } diff --git a/icu4c/source/data/locales/ru.txt b/icu4c/source/data/locales/ru.txt index d29b8fa793c..6b58587b3cd 100644 --- a/icu4c/source/data/locales/ru.txt +++ b/icu4c/source/data/locales/ru.txt @@ -122,4 +122,44 @@ ru { "Cyrs" // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Russian. + + // Can someone supply me with information on negatives and decimals? + // How about words for billions and trillions? + + SpelloutRules { + "\u043d\u043e\u043b\u044c; \u043e\u0434\u0438\u043d; \u0434\u0432\u0430; \u0442\u0440\u0438; " + "\u0447\u0435\u0442\u044b\u0440\u0435; \u043f\u044f\u0442; \u0448\u0435\u0441\u0442; " + "\u0441\u0435\u043c\u044c; \u0432\u043e\u0441\u0435\u043c\u044c; \u0434\u0435\u0432\u044f\u0442;\n" + "10: \u0434\u0435\u0441\u044f\u0442; " + "\u043e\u0434\u0438\u043d\u043d\u0430\u0434\u0446\u0430\u0442\u044c;\n" + "\u0434\u0432\u0435\u043d\u043d\u0430\u0434\u0446\u0430\u0442\u044c; " + "\u0442\u0440\u0438\u043d\u0430\u0434\u0446\u0430\u0442\u044c; " + "\u0447\u0435\u0442\u044b\u0440\u043d\u0430\u0434\u0446\u0430\u0442\u044c;\n" + "15: \u043f\u044f\u0442\u043d\u0430\u0434\u0446\u0430\u0442\u044c; " + "\u0448\u0435\u0441\u0442\u043d\u0430\u0434\u0446\u0430\u0442\u044c; " + "\u0441\u0435\u043c\u043d\u0430\u0434\u0446\u0430\u0442\u044c; " + "\u0432\u043e\u0441\u0435\u043c\u043d\u0430\u0434\u0446\u0430\u0442\u044c; " + "\u0434\u0435\u0432\u044f\u0442\u043d\u0430\u0434\u0446\u0430\u0442\u044c;\n" + "20: \u0434\u0432\u0430\u0434\u0446\u0430\u0442\u044c[ >>];\n" + "30: \u0442\u0440\u043b\u0434\u0446\u0430\u0442\u044c[ >>];\n" + "40: \u0441\u043e\u0440\u043e\u043a[ >>];\n" + "50: \u043f\u044f\u0442\u044c\u0434\u0435\u0441\u044f\u0442[ >>];\n" + "60: \u0448\u0435\u0441\u0442\u044c\u0434\u0435\u0441\u044f\u0442[ >>];\n" + "70: 
\u0441\u0435\u043c\u044c\u0434\u0435\u0441\u044f\u0442[ >>];\n" + "80: \u0432\u043e\u0441\u0435\u043c\u044c\u0434\u0435\u0441\u044f\u0442[ >>];\n" + "90: \u0434\u0435\u0432\u044f\u043d\u043e\u0441\u0442\u043e[ >>];\n" + "100: \u0441\u0442\u043e[ >>];\n" + "200: << \u0441\u0442\u043e[ >>];\n" + "1000: \u0442\u044b\u0441\u044f\u0447\u0430[ >>];\n" + "2000: << \u0442\u044b\u0441\u044f\u0447\u0430[ >>];\n" + "1,000,000: \u043c\u0438\u043b\u043b\u0438\u043e\u043d[ >>];\n" + "2,000,000: << \u043c\u0438\u043b\u043b\u0438\u043e\u043d[ >>];\n" + "1,000,000,000: =#,##0=;" + } } diff --git a/icu4c/source/data/locales/sv.txt b/icu4c/source/data/locales/sv.txt index 8dfa52272a9..aee882fa95d 100644 --- a/icu4c/source/data/locales/sv.txt +++ b/icu4c/source/data/locales/sv.txt @@ -118,4 +118,36 @@ sv { "Latf", // ISO 15924 Name "Latg", // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// * Spellout rules for Swedish. + + // can someone supply me with information on negatives and decimals? 
+ + SpelloutRules { + "noll; ett; tv\u00e5; tre; fyra; fem; sex; sjo; \u00e5tta; nio;\n" + "tio; elva; tolv; tretton; fjorton; femton; sexton; sjutton; arton; nitton;\n" + "20: tjugo[>>];\n" + "30: trettio[>>];\n" + "40: fyrtio[>>];\n" + "50: femtio[>>];\n" + "60: sextio[>>];\n" + "70: sjuttio[>>];\n" + "80: \u00e5ttio[>>];\n" + "90: nittio[>>];\n" + "100: hundra[>>];\n" + "200: <<hundra[>>];\n" + "1000: tusen[ >>];\n" + "2000: << tusen[ >>];\n" + "1,000,000: en miljon[ >>];\n" + "2,000,000: << miljon[ >>];\n" + "1,000,000,000: en miljard[ >>];\n" + "2,000,000,000: << miljard[ >>];\n" + "1,000,000,000,000: en biljon[ >>];\n" + "2,000,000,000,000: << biljon[ >>];\n" + "1,000,000,000,000,000: =#,##0=" + } } diff --git a/icu4c/source/data/locales/th.txt b/icu4c/source/data/locales/th.txt index f89fd80c9b3..8324d1a30a9 100644 --- a/icu4c/source/data/locales/th.txt +++ b/icu4c/source/data/locales/th.txt @@ -245,4 +245,43 @@ th { "Thai", // Script Name "Thai" // ISO 15924 Name } + + //------------------------------------------------------------ + // Rule Based Number Format Support + //------------------------------------------------------------ + +// Spellout rules for Thai. 
Data from Suwit Srivilairith, IBM Thailand + + SpelloutRules { + "%default:\n" + " -x: \u0e25\u0e1a>>;\n" + " x.x: <<\u0e08\u0e38\u0e14>>>;\n" + " \u0e28\u0e39\u0e19\u0e22\u0e4c; \u0e2b\u0e19\u0e36\u0e48\u0e07; \u0e2a\u0e2d\u0e07; \u0e2a\u0e32\u0e21;\n" + " \u0e2a\u0e35\u0e48; \u0e2b\u0e49\u0e32; \u0e2b\u0e01; \u0e40\u0e08\u0e47\u0e14; \u0e41\u0e1b\u0e14;\n" + " \u0e40\u0e01\u0e49\u0e32; \u0e2a\u0e34\u0e1a; \u0e2a\u0e34\u0e1a\u0e40\u0e2d\u0e47\u0e14;\n" + " \u0e2a\u0e34\u0e1a\u0e2a\u0e2d\u0e07; \u0e2a\u0e34\u0e1a\u0e2a\u0e32\u0e21;\n" + " \u0e2a\u0e34\u0e1a\u0e2a\u0e35\u0e48; \u0e2a\u0e34\u0e1a\u0e2b\u0e49\u0e32;\n" + " \u0e2a\u0e34\u0e1a\u0e2b\u0e01; \u0e2a\u0e34\u0e1a\u0e40\u0e08\u0e47\u0e14;\n" + " \u0e2a\u0e34\u0e1a\u0e41\u0e1b\u0e14; \u0e2a\u0e34\u0e1a\u0e40\u0e01\u0e49\u0e32;\n" + " 20: \u0e22\u0e35\u0e48\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 30: \u0e2a\u0e32\u0e21\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 40: \u0e2a\u0e35\u0e48\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 50: \u0e2b\u0e49\u0e32\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 60: \u0e2b\u0e01\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 70: \u0e40\u0e08\u0e47\u0e14\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 80: \u0e41\u0e1b\u0e14\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 90: \u0e40\u0e01\u0e49\u0e32\u0e2a\u0e34\u0e1a[>%%alt-ones>];\n" + " 100: <<\u0e23\u0e49\u0e2d\u0e22[>>];\n" + " 1000: <<\u0e1e\u0e31\u0e19[>>];\n" + " 10000: <<\u0e2b\u0e21\u0e37\u0e48\u0e19[>>];\n" + " 100000: <<\u0e41\u0e2a\u0e19[>>];\n" + " 1,000,000: <<\u0e25\u0e49\u0e32\u0e19[>>];\n" + " 1,000,000,000: <<\u0e1e\u0e31\u0e19\u0e25\u0e49\u0e32\u0e19[>>];\n" + " 1,000,000,000,000: <<\u0e25\u0e49\u0e32\u0e19\u0e25\u0e49\u0e32\u0e19[>>];\n" + " 1,000,000,000,000,000: =#,##0=;\n" + "%%alt-ones:\n" + " \u0e28\u0e39\u0e19\u0e22\u0e4c;\n" + " \u0e40\u0e2d\u0e47\u0e14;\n" + " =%default=;\n"; + } } diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in index 40afd19acae..4a2923c1fdc 100644 --- a/icu4c/source/i18n/Makefile.in +++ 
b/icu4c/source/i18n/Makefile.in @@ -70,7 +70,9 @@ uniset.o unifltlg.o unirange.o translit.o utrans.o \ cpdtrans.o hextouni.o rbt.o rbt_data.o rbt_pars.o rbt_rule.o rbt_set.o \ dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o nultrans.o \ remtrans.o titletrn.o tolowtrn.o toupptrn.o xformtrn.o \ -name2uni.o uni2name.o unitohex.o nortrans.o unifilt.o quant.o transreg.o +name2uni.o uni2name.o unitohex.o nortrans.o unifilt.o quant.o transreg.o \ +llong.o nfrs.o nfrule.o nfsubs.o rbnf.o + STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O)) diff --git a/icu4c/source/i18n/i18n.dsp b/icu4c/source/i18n/i18n.dsp index cbcc7a277b0..7de5d7f211b 100644 --- a/icu4c/source/i18n/i18n.dsp +++ b/icu4c/source/i18n/i18n.dsp @@ -166,6 +166,10 @@ SOURCE=.\hextouni.cpp # End Source File # Begin Source File +SOURCE=.\llong.cpp +# End Source File +# Begin Source File + SOURCE=.\msgfmt.cpp # End Source File # Begin Source File @@ -174,6 +178,18 @@ SOURCE=.\name2uni.cpp # End Source File # Begin Source File +SOURCE=.\nfrs.cpp +# End Source File +# Begin Source File + +SOURCE=.\nfrule.cpp +# End Source File +# Begin Source File + +SOURCE=.\nfsubs.cpp +# End Source File +# Begin Source File + SOURCE=.\nortrans.cpp # End Source File # Begin Source File @@ -198,6 +214,10 @@ SOURCE=.\rbbi_tbl.cpp # End Source File # Begin Source File +SOURCE=.\rbnf.cpp +# End Source File +# Begin Source File + SOURCE=.\rbt.cpp # End Source File # Begin Source File @@ -814,6 +834,10 @@ InputPath=.\unicode\hextouni.h # End Source File # Begin Source File +SOURCE=.\llong.h +# End Source File +# Begin Source File + SOURCE=.\unicode\msgfmt.h !IF "$(CFG)" == "i18n - Win32 Release" @@ -860,6 +884,22 @@ InputPath=.\unicode\name2uni.h # End Source File # Begin Source File +SOURCE=.\nfrlist.h +# End Source File +# Begin Source File + +SOURCE=.\nfrs.h +# End Source File +# Begin Source File + +SOURCE=.\nfrule.h +# End Source File +# Begin Source File + +SOURCE=.\nfsubs.h +# End Source File +# Begin Source File + SOURCE=.\unicode\nortrans.h !IF 
"$(CFG)" == "i18n - Win32 Release" @@ -1022,6 +1062,25 @@ SOURCE=.\rbbi_tbl.h # End Source File # Begin Source File +SOURCE=.\unicode\rbnf.h + +!IF "$(CFG)" == "i18n - Win32 Release" + +!ELSEIF "$(CFG)" == "i18n - Win32 Debug" + +# Begin Custom Build +InputPath=.\unicode\rbnf.h + +"..\..\include\unicode\rbnf.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy unicode\rbnf.h ..\..\include\unicode + +# End Custom Build + +!ENDIF + +# End Source File +# Begin Source File + SOURCE=.\unicode\rbt.h !IF "$(CFG)" == "i18n - Win32 Release" diff --git a/icu4c/source/i18n/llong.cpp b/icu4c/source/i18n/llong.cpp new file mode 100644 index 00000000000..efe5d121b37 --- /dev/null +++ b/icu4c/source/i18n/llong.cpp @@ -0,0 +1,301 @@ +#include "llong.h" + +#if 0 +/* + * This should work, I think, but SOLARISCC -xO3 can't handle it. + * Works with SOLARISGCC, SOLARISCC -g, Win32... + * + */ +const llong& llong::kMaxValue = llong(0x7fffffff, 0xffffffff); +const llong& llong::kMinValue = llong(0x80000000, 0x0); +const llong& llong::kMinusOne = llong(0xffffffff, 0xffffffff); +const llong& llong::kZero = llong(0x0, 0x0); +const llong& llong::kOne = llong(0x0, 0x1); +const llong& llong::kTwo = llong(0x0, 0x2); +const llong& llong::kMaxDouble = llong(0x200000, 0x0); +const llong& llong::kMinDouble = -kMaxDouble; +#endif + +static llong kMaxValueObj(0x7fffffff, 0xffffffff); +static llong kMinValueObj(0x80000000, 0x0); +static llong kMinusOneObj(0xffffffff, 0xffffffff); +static llong kZeroObj(0x0, 0x0); +static llong kOneObj(0x0, 0x1); +static llong kTwoObj(0x0, 0x2); +static llong kMaxDoubleObj(0x200000, 0x0); +static llong kMinDoubleObj(-kMaxDoubleObj); + +const llong& llong::kMaxValue = kMaxValueObj; +const llong& llong::kMinValue = kMinValueObj; +const llong& llong::kMinusOne = kMinusOneObj; +const llong& llong::kZero = kZeroObj; +const llong& llong::kOne = kOneObj; +const llong& llong::kTwo = kTwoObj; +const llong& llong::kMaxDouble = kMaxDoubleObj; +const llong& llong::kMinDouble = 
kMinDoubleObj;
+
+// floor(sqrt(2^31)): two operands below this bound multiply without
+// overflowing 31 bits, so the product fits in the low word alone.
+#define SQRT231 46340
+
+const double llong::kD32 = ((double)(0xffffffffu)) + 1;
+const double llong::kDMax = llong_asDouble(kMaxDouble);
+const double llong::kDMin = -kDMax;
+
+// Multiplies this value by rhs in place and returns *this.
+// Small operands are multiplied directly in the low word; everything else
+// goes through shift-and-add on the operand magnitudes, after which the
+// sign of the result is applied.
+llong& llong::operator*=(const llong& rhs)
+{
+    // optimize small positive multiplications
+    if (hi == 0 && rhs.hi == 0 && lo < SQRT231 && rhs.lo < SQRT231) {
+        lo *= rhs.lo;
+    } else {
+        int retry = 0;
+
+        llong a(*this);
+        if (a.isNegative()) {
+            retry = 1;
+            a.negate();
+        }
+
+        llong b(rhs);
+        if (b.isNegative()) {
+            retry = 1;
+            b.negate();
+        }
+
+        llong r;
+        // optimize small negative multiplications
+        if (retry && a.hi == 0 && b.hi == 0 && a.lo < SQRT231 && b.lo < SQRT231) {
+            r.lo = a.lo * b.lo;
+        } else {
+            // the loop below runs once per significant bit of b, so make b
+            // the smaller operand
+            if (a < b) {
+                llong t = a;
+                a = b;
+                b = t;
+            }
+            while (b.notZero()) {
+                if (b.lo & 0x1) {
+                    r += a;
+                }
+                // Shift without sign extension: negate() leaves kMinValue
+                // negative, and a sign-extending shift of a negative value
+                // never reaches zero, so the loop would not terminate.
+                b.ushr(1);
+                a <<= 1;
+            }
+        }
+        if (isNegative() != rhs.isNegative()) {
+            r.negate();
+        }
+        *this = r;
+    }
+    return *this;
+}
+
+// Divides this value by rhs in place (truncating toward zero) and returns
+// *this.  A zero dividend is returned unchanged, even when rhs is zero.
+// Division of a non-zero value by zero does not trap: the result saturates
+// to kMinValue or kMaxValue according to the sign of the dividend.
+// NOTE(review): a dividend equal to kMinValue cannot be negated and so stays
+// negative; it presumably takes the `b > a` branch and yields zero -- confirm
+// against operator> before relying on that edge case.
+llong& llong::operator/=(const llong& rhs)
+{
+    if (isZero()) {
+        return *this;
+    }
+    int32_t sign = 1;
+    llong a(*this);
+    if (a.isNegative()) {
+        sign = -1;
+        a.negate();
+    }
+    llong b(rhs);
+    if (b.isNegative()) {
+        sign = -sign;
+        b.negate();
+    }
+
+    if (b.isZero()) { // should throw div by zero error
+        *this = sign < 0 ? kMinValue : kMaxValue;
+    } else if (a.hi == 0 && b.hi == 0) {
+        // Both magnitudes fit in 32 bits.  Build the quotient through the
+        // uint32_t constructor so that a quotient >= 2^31 is not
+        // reinterpreted as a negative int32_t, then apply the sign.
+        llong q((uint32_t)(a.lo / b.lo));
+        if (sign < 0) {
+            q.negate();
+        }
+        *this = q;
+    } else if (b > a) {
+        *this = kZero;
+    } else if (b == a) {
+        *this = sign;
+    } else {
+        // binary long division: r accumulates the quotient bits, m is the
+        // quotient bit that corresponds to the current alignment of b
+        llong r;
+        llong m((int32_t)1);
+
+        while (ule(b, a)) { // a positive so topmost bit is 0, this will always terminate
+            m <<= 1;
+            b <<= 1;
+        }
+
+        do {
+            m.ushr(1); // don't sign-extend!
+            if (m.isZero()) break;
+
+            b.ushr(1);
+            if (b <= a) {
+                r |= m;
+                a -= b;
+            }
+        } while (a >= rhs);
+
+        if (sign < 0) {
+            r.negate();
+        }
+        *this = r;
+    }
+    return *this;
+}
+
+// digit characters for radixes up to 36: '0'-'9' followed by 'a'-'z'
+static const uint8_t asciiDigits[] = {
+    (char)0x30, (char)0x31, (char)0x32, (char)0x33, (char)0x34, (char)0x35, (char)0x36, (char)0x37,
+    (char)0x38, (char)0x39, (char)0x61, (char)0x62, (char)0x63, (char)0x64, (char)0x65, (char)0x66,
+    (char)0x67, (char)0x68, (char)0x69, (char)0x6a, (char)0x6b, (char)0x6c, (char)0x6d, (char)0x6e,
+    (char)0x6f, (char)0x70, (char)0x71, (char)0x72, (char)0x73, (char)0x74, (char)0x75, (char)0x76,
+    (char)0x77, (char)0x78, (char)0x79, (char)0x7a,
+};
+
+static const UChar kUMinus = (UChar)0x002d;
+static const char kMinus = (char)0x2d;
+
+// Maps a 7-bit character code to its digit value.  An entry of zero means
+// "not a digit"; otherwise the high bit is set and the low seven bits hold
+// the value (0-9 for '0'-'9', 10-35 for letters of either case).  The table
+// has exactly 128 entries, so callers must range-check before indexing.
+static const uint8_t digitInfo[] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0, 0, 0, 0, 0, 0,
+    0, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+    0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0, 0, 0, 0, 0,
+    0, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+    0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0, 0, 0, 0, 0,
+};
+
+// Parses a llong from a NUL-terminated ASCII string.
+//   str   - text to parse; an optional leading '-' is accepted
+//   radix - numeric base, clamped to the range 2..36
+// Parsing stops at the first character that is not a digit of the radix
+// (including the terminating NUL); an empty digit run yields zero.
+llong atoll(const char* str, uint32_t radix)
+{
+    if (radix > 36) {
+        radix = 36;
+    } else if (radix < 2) {
+        radix = 2;
+    }
+    llong lradix(radix);
+
+    int neg = 0;
+    if (*str == kMinus) {
+        ++str;
+        neg = 1;
+    }
+    llong result;
+    uint8_t c;
+    uint8_t b;
+    // Read each byte as unsigned and reject anything outside the 128-entry
+    // table: plain char may be signed, so indexing with it directly could
+    // produce a negative or too-large index.
+    while (((c = (uint8_t)*str++) < 0x80) && (b = digitInfo[c]) && ((b &= 0x7f) < radix)) {
+        result *= lradix;
+        result += (int32_t)b;
+    }
+    if (neg) {
+        result.negate();
+    }
+    return result;
+}
+
+// Parses a llong from a NUL-terminated UChar string.  Same contract as
+// atoll(): optional leading '-', radix clamped to 2..36, parsing stops at
+// the first code unit that is not a digit of the radix.
+llong u_atoll(const UChar* str, uint32_t radix)
+{
+    if (radix > 36) {
+        radix = 36;
+    } else if (radix < 2) {
+        radix = 2;
+    }
+    llong lradix(radix);
+
+    int neg = 0;
+    if (*str == kUMinus) {
+        ++str;
+        neg = 1;
+    }
+    llong result;
+    UChar c;
+    uint8_t b;
+    while (((c = *str++) < 0x0080) && (b = digitInfo[c]) && ((b &= 0x7f) < radix)) {
+        result *= lradix;
+        result += (int32_t)b;
+    }
+    if (neg) {
+        result.negate();
+    }
+    return result;
+}
+
+// Writes val into buf as text.
+//   val   - value to format
+//   buf   - output buffer
+//   len   - capacity of buf in chars
+//   radix - numeric base, clamped to the range 2..36
+//   raw   - if true, store digit values (0..radix-1) instead of ASCII digits
+// Returns the number of chars written, including a leading '-' but not the
+// terminator.  A terminating 0 is appended only if there is room for it.
+// Digits that do not fit in the buffer are dropped.
+uint32_t lltoa(const llong& val, char* buf, uint32_t len, uint32_t radix, UBool raw)
+{
+    if (radix > 36) {
+        radix = 36;
+    } else if (radix < 2) {
+        radix = 2;
+    }
+    llong base(radix);
+
+    char* p = buf;
+    char* first = buf; // position of the first digit, after any sign
+    llong w(val);
+    if (len && w.isNegative()) {
+        w.negate();
+        *p++ = kMinus;
+        --len;
+        first = p;
+    } else if (len && w.isZero()) {
+        // zero has no non-zero digits, so the loop below would emit nothing
+        *p++ = raw ? (char)0 : (char)asciiDigits[0];
+        --len;
+    }
+
+    // digits are produced least-significant first and reversed afterwards
+    while (len && w.notZero()) {
+        llong n = w / base;
+        llong m = n * base;
+        int32_t d = llong_asInt(w-m);
+        *p++ = raw ? (char)d : asciiDigits[d];
+        w = n;
+        --len;
+    }
+    if (len) {
+        *p = 0; // null terminate if room for caller convenience
+    }
+
+    len = (uint32_t)(p - buf);
+    // reverse the digit run in place; guarded so that an empty run never
+    // reads the buffer or forms a pointer before it
+    if (p != first) {
+        for (char* last = p - 1; first < last; ++first, --last) {
+            char c = *last;
+            *last = *first;
+            *first = c;
+        }
+    }
+
+    return len;
+}
+
+// UChar counterpart of lltoa(): same parameters, return value and
+// truncation behavior, writing UChar code units instead of chars.
+uint32_t u_lltoa(const llong& val, UChar* buf, uint32_t len, uint32_t radix, UBool raw)
+{
+    if (radix > 36) {
+        radix = 36;
+    } else if (radix < 2) {
+        radix = 2;
+    }
+    llong base(radix);
+
+    UChar* p = buf;
+    UChar* first = buf; // position of the first digit, after any sign
+    llong w(val);
+    if (len && w.isNegative()) {
+        w.negate();
+        *p++ = kUMinus;
+        --len;
+        first = p;
+    } else if (len && w.isZero()) {
+        // zero has no non-zero digits, so the loop below would emit nothing
+        *p++ = (UChar)(raw ? 0 : asciiDigits[0]);
+        --len;
+    }
+
+    // digits are produced least-significant first and reversed afterwards
+    while (len && w.notZero()) {
+        llong n = w / base;
+        llong m = n * base;
+        int32_t d = llong_asInt(w-m);
+        *p++ = (UChar)(raw ? d : asciiDigits[d]);
+        w = n;
+        --len;
+    }
+    if (len) {
+        *p = 0; // null terminate if room for caller convenience
+    }
+
+    len = (uint32_t)(p - buf);
+    // reverse the digit run in place; guarded so that an empty run never
+    // reads the buffer or forms a pointer before it
+    if (p != first) {
+        for (UChar* last = p - 1; first < last; ++first, --last) {
+            UChar c = *last;
+            *last = *first;
+            *first = c;
+        }
+    }
+
+    return len;
+}
diff --git a/icu4c/source/i18n/llong.h b/icu4c/source/i18n/llong.h
new file mode 100644
index 00000000000..2c54e618834
--- /dev/null
+++ b/icu4c/source/i18n/llong.h
@@ -0,0 +1,312 @@
+// thanks to Mike Cowlishaw
+
+#ifndef LLONG_H
+#define LLONG_H
+
+// debug
+#include <stdio.h>
+
+#include "unicode/utypes.h"
+
+// machine dependent value, need to move
+#define __u_IntBits 32
+
+class llong {
+public:
+    uint32_t lo;
+    int32_t hi;
+private:
+    enum {
+        MASK32 = 0xffffffffu
+    };
+
+    static const double kD32;  // 2^^32 as a double
+    static const double kDMin; // -(2^^53), minimum double with full integer precision
+    static const double kDMax; // 2^^53, maximum double with full integer precision
+
+    // private constructor
+    // should be private, but we can't construct the way we want using SOLARISCC
+    // so make public in order that file statics can access this constructor
+  public:
+    llong(int32_t h, uint32_t l) : lo(l), hi(h) {}
+  private:
+    // convenience, size reduction in inline code
+    llong& nnot() { hi = ~hi; lo = ~lo; return *this; }
+    llong& negate() { hi = ~hi; lo = ~lo; if (!++lo) ++hi; return *this; }
+    llong& abs() { if (hi < 0) negate(); return *this; }
+    UBool notZero() const { return (hi | lo) != 0; }
+    UBool isZero() const { return (hi | lo) == 0; }
+    UBool isNegative() const { return hi < 0; }
+
+public:
+    llong() : lo(0), hi(0) {}
+    llong(const int32_t l) : lo((unsigned)l), hi(l < 0 ? -1 : 0) {}
+    llong(const int16_t l) : lo((unsigned)l), hi(l < 0 ? -1 : 0) {}
+    llong(const int8_t l) : lo((unsigned)l), hi(l < 0 ? 
-1 : 0) {} +#if __u_IntBits == 64 + llong(const int i) : lo(i & MASK32), hi(i >> 32) {} +#endif + llong(uint16_t s) : lo(s), hi(0) {} + llong(uint32_t l) : lo(l), hi(0) {} +#if __u_IntBits == 64 + llong(unsigned int i) : lo(i & MASK32), hi(i >> 32) {} +#endif + llong(double d) { // avoid dependency on bit representation of double + if (uprv_isNaN(d)) { + *this = llong::kZero; + } else if (d < kDMin) { + *this = llong::kMinDouble; + } else if (d > kDMax) { + *this = llong::kMaxDouble; + } else { + int neg = d < 0; + if (neg) d = -d; + d = uprv_floor(d); + hi = (int32_t)uprv_floor(d / kD32); + d -= kD32 * hi; + lo = (uint32_t)d; + if (neg) negate(); + } + } + + llong(const llong& rhs) : lo(rhs.lo), hi(rhs.hi) {} + + // the following cause ambiguities in binary expressions, + // even if we overload all methods on all args! + // so you have to use global functions + // operator const int32_t() const; + // operator const uint32_t() const; + // operator const double() const; + + friend int32_t llong_asInt(const llong& lhs); + friend uint32_t llong_asUInt(const llong& lhs); + friend double llong_asDouble(const llong& lhs); + + llong& operator=(const llong& rhs) { lo = rhs.lo; hi = rhs.hi; return *this; } + + // left shift + llong& operator<<=(int32_t shift) { + shift &= 63; // like java spec + if (shift < 32) { + hi = (signed)(hi << shift | lo >> (32 - shift)); // no sign extension on lo since unsigned + lo <<= shift; + } else { + hi = (signed)(lo << (shift - 32)); + lo = 0; + } + return *this; + } + llong operator<<(int32_t shift) const { llong r(*this); r <<= shift; return r; } + + // right shift with sign extension + llong& operator>>=(int32_t shift) { + shift &= 63; // like java spec + if (shift < 32) { + lo >>= shift; + lo |= (hi << (32 - shift)); + hi = hi >> shift; // note sign extension + } else { + lo = (unsigned)(hi >> (shift - 32)); // note sign extension + hi = hi < 0 ? 
-1 : 0; + } + return *this; + } + llong operator>>(int32_t shift) const { llong r(*this); r >>= shift; return r; } + + // unsigned right shift + friend llong ushr(const llong& lhs, int32_t shift); + + // bit operations + friend llong operator&(const llong& lhs, const llong& rhs); + friend llong operator|(const llong& lhs, const llong& rhs); + friend llong operator^(const llong& lhs, const llong& rhs); + + friend llong operator&(const llong& lhs, const uint32_t rhs); + friend llong operator|(const llong& lhs, const uint32_t rhs); + friend llong operator^(const llong& lhs, const uint32_t rhs); + + llong operator~() const { return llong(~hi, ~lo); } + // is this useful? + // UBool operator!() const { return !(hi | lo); } + + llong& operator&=(const llong& rhs) { hi &= rhs.hi; lo &= rhs.lo; return *this; } + llong& operator|=(const llong& rhs) { hi |= rhs.hi; lo |= rhs.lo; return *this; } + llong& operator^=(const llong& rhs) { hi ^= rhs.hi; lo ^= rhs.lo; return *this; } + + llong& operator&=(const uint32_t rhs) { hi = 0; lo &= rhs; return *this; } + llong& operator|=(const uint32_t rhs) { lo |= rhs; return *this; } + llong& operator^=(const uint32_t rhs) { lo ^= rhs; return *this; } + + // no logical ops since we can't enforce order of evaluation, not much use anyway? 
+
+    // comparison
+    friend UBool operator==(const llong& lhs, const llong& rhs);
+    friend UBool operator!=(const llong& lhs, const llong& rhs);
+    friend UBool operator> (const llong& lhs, const llong& rhs);
+    friend UBool operator< (const llong& lhs, const llong& rhs);
+    friend UBool operator>=(const llong& lhs, const llong& rhs);
+    friend UBool operator<=(const llong& lhs, const llong& rhs);
+
+    // overload comparison to native int to avoid conversion to llong for common comparisons
+    friend UBool operator==(const llong& lhs, const int32_t rhs);
+    friend UBool operator!=(const llong& lhs, const int32_t rhs);
+    friend UBool operator> (const llong& lhs, const int32_t rhs);
+    friend UBool operator< (const llong& lhs, const int32_t rhs);
+    friend UBool operator>=(const llong& lhs, const int32_t rhs);
+    friend UBool operator<=(const llong& lhs, const int32_t rhs);
+
+    // unsigned comparison
+    friend UBool ugt(const llong& lhs, const llong& rhs);
+    friend UBool ult(const llong& lhs, const llong& rhs);
+    friend UBool uge(const llong& lhs, const llong& rhs);
+    friend UBool ule(const llong& lhs, const llong& rhs);
+
+    // prefix inc/dec (carry/borrow into hi when lo wraps)
+    llong& operator++() { if (!++lo) ++hi; return *this; }
+    llong& operator--() { if (!lo--) --hi; return *this; }
+
+    // postfix inc/dec
+    llong operator++(int) { llong r(*this); if (!++lo) ++hi; return r; }
+    llong operator--(int) { llong r(*this); if (!lo--) --hi; return r; }
+
+    // unary minus: ~x + 1, carrying into hi only when the low word wraps to 0
+    llong operator-() const { uint32_t l = ~lo + 1; return llong(l ? ~hi : ~hi + 1, l); }
+
+    // addition and subtraction
+    llong& operator-=(const llong& rhs) { hi -= rhs.hi; if (lo < rhs.lo) --hi; lo -= rhs.lo; return *this; }
+    friend llong operator-(const llong& lhs, const llong& rhs);
+
+    llong& operator+=(const llong& rhs) { return *this -= -rhs; }
+    friend llong operator+(const llong& lhs, const llong& rhs);
+
+    // pluttification and fizzen'
+    llong& operator*=(const llong& rhs);
+    friend llong operator*(const llong& lhs, const llong& rhs);
+
+    llong& operator/=(const llong& rhs);
+    friend llong operator/(const llong& lhs, const llong& rhs);
+
+    llong& operator%=(const llong& rhs) { return operator-=((*this / rhs) * rhs); }
+    friend llong operator%(const llong& lhs, const llong& rhs);
+
+    // power function, positive integral powers only
+    friend llong llong_pow(const llong& lhs, uint32_t n);
+
+    // absolute value
+    friend llong llong_abs(const llong& lhs);
+
+    // simple construction from ASCII and Unicode strings
+    friend llong atoll(const char* str, uint32_t radix = 10);
+    friend llong u_atoll(const UChar* str, uint32_t radix = 10);
+
+    // output as ASCII or Unicode strings or as raw values, preceding '-' if signed
+    friend uint32_t lltoa(const llong& lhs, char* buffer, uint32_t buflen, uint32_t radix = 10, UBool raw = FALSE);
+    friend uint32_t u_lltoa(const llong& lhs, UChar* buffer, uint32_t buflen, uint32_t radix = 10, UBool raw = FALSE);
+
+    // useful public constants - perhaps should not have class statics
+    static const llong& kMaxValue;
+    static const llong& kMinValue;
+    static const llong& kMinusOne;
+    static const llong& kZero;
+    static const llong& kOne;
+    static const llong& kTwo;
+
+private:
+    static const llong& kMaxDouble;
+    static const llong& kMinDouble;
+
+    // right shift without sign extension; the shift count is taken modulo 64
+    llong& ushr(int32_t shift) {
+        // The mask must be decimal 63 (0x3f).  The previous value, 0x63,
+        // discarded bits 2-4 of the count (so a shift of 4 did nothing) and
+        // let bit 6 through, making "shift - 32" reach 32 or more below.
+        shift &= 63;
+        if (shift == 0) {
+            // Nothing to do; also avoids the undefined hi << 32 below.
+            return *this;
+        }
+        if (shift < 32) {
+            lo >>= shift;
+            lo |= (hi << (32 - shift));
+            hi = (signed)(((unsigned)hi) >> shift);
+        } else {
+            lo = (unsigned)(((unsigned)hi) >> (shift - 32));
+            hi = 0;
+        }
+        return *this;
+    }
+
+    // back door for test
+    friend void llong_test();
+};
+
+inline llong operator& (const llong& lhs, const llong& rhs) { return llong(lhs.hi & rhs.hi, lhs.lo & rhs.lo); }
+inline llong operator| (const llong& lhs, const llong& rhs) { return llong(lhs.hi | rhs.hi, lhs.lo | rhs.lo); }
+inline llong operator^ (const llong& lhs, const llong& rhs) { return llong(lhs.hi ^ rhs.hi, lhs.lo ^ rhs.lo); }
+
+// the uint32_t operand is treated as zero-extended
+inline llong operator& (const llong& lhs, const uint32_t rhs) { return llong(0, lhs.lo & rhs); }
+inline llong operator| (const llong& lhs, const uint32_t rhs) { return llong(lhs.hi, lhs.lo | rhs); }
+inline llong operator^ (const llong& lhs, const uint32_t rhs) { return llong(lhs.hi, lhs.lo ^ rhs); }
+
+// signed comparison: hi is compared as signed, lo as unsigned
+inline UBool operator==(const llong& lhs, const llong& rhs) { return lhs.lo == rhs.lo && lhs.hi == rhs.hi; }
+inline UBool operator!=(const llong& lhs, const llong& rhs) { return lhs.lo != rhs.lo || lhs.hi != rhs.hi; }
+inline UBool operator> (const llong& lhs, const llong& rhs) { return lhs.hi == rhs.hi ? lhs.lo > rhs.lo : lhs.hi > rhs.hi; }
+inline UBool operator< (const llong& lhs, const llong& rhs) { return lhs.hi == rhs.hi ? lhs.lo < rhs.lo : lhs.hi < rhs.hi; }
+inline UBool operator>=(const llong& lhs, const llong& rhs) { return lhs.hi == rhs.hi ? lhs.lo >= rhs.lo : lhs.hi >= rhs.hi; }
+inline UBool operator<=(const llong& lhs, const llong& rhs) { return lhs.hi == rhs.hi ? lhs.lo <= rhs.lo : lhs.hi <= rhs.hi; }
+
+// comparison against a native int32_t; rhs is treated as sign-extended (hi == -1 or 0)
+inline UBool operator==(const llong& lhs, const int32_t rhs) { return lhs.lo == (unsigned)rhs && lhs.hi == (rhs < 0 ? -1 : 0); }
+inline UBool operator!=(const llong& lhs, const int32_t rhs) { return lhs.lo != (unsigned)rhs || lhs.hi != (rhs < 0 ? -1 : 0); }
+inline UBool operator> (const llong& lhs, const int32_t rhs) { return rhs < 0 ? (lhs.hi == -1 ? lhs.lo > (unsigned)rhs : lhs.hi > -1)
+                                                                                : (lhs.hi == 0 ? lhs.lo > (unsigned)rhs : lhs.hi > 0); }
+inline UBool operator< (const llong& lhs, const int32_t rhs) { return rhs < 0 ? (lhs.hi == -1 ? lhs.lo < (unsigned)rhs : lhs.hi < -1)
+                                                                                : (lhs.hi == 0 ? lhs.lo < (unsigned)rhs : lhs.hi < 0); }
+inline UBool operator>=(const llong& lhs, const int32_t rhs) { return rhs < 0 ? (lhs.hi == -1 ? lhs.lo >= (unsigned)rhs : lhs.hi > -1)
+                                                                                : (lhs.hi == 0 ? lhs.lo >= (unsigned)rhs : lhs.hi > 0); }
+inline UBool operator<=(const llong& lhs, const int32_t rhs) { return rhs < 0 ? (lhs.hi == -1 ? lhs.lo <= (unsigned)rhs : lhs.hi < -1)
+                                                                                : (lhs.hi == 0 ? lhs.lo <= (unsigned)rhs : lhs.hi < 0); }
+
+// unsigned comparison: both words are compared as unsigned
+inline UBool ugt(const llong& lhs, const llong& rhs) { return lhs.hi == rhs.hi ? lhs.lo > rhs.lo : (unsigned)lhs.hi > (unsigned)rhs.hi; }
+inline UBool ult(const llong& lhs, const llong& rhs) { return lhs.hi == rhs.hi ? lhs.lo < rhs.lo : (unsigned)lhs.hi < (unsigned)rhs.hi; }
+inline UBool uge(const llong& lhs, const llong& rhs) { return lhs.hi == rhs.hi ? lhs.lo >= rhs.lo : (unsigned)lhs.hi >= (unsigned)rhs.hi; }
+inline UBool ule(const llong& lhs, const llong& rhs) { return lhs.hi == rhs.hi ? lhs.lo <= rhs.lo : (unsigned)lhs.hi <= (unsigned)rhs.hi; }
+
+inline llong ushr(const llong& lhs, int32_t shift) { llong r(lhs); r.ushr(shift); return r; }
+
+inline llong operator-(const llong& lhs, const llong& rhs) { return llong(lhs.lo < rhs.lo ? lhs.hi - rhs.hi - 1 : lhs.hi - rhs.hi, lhs.lo - rhs.lo); }
+inline llong operator+(const llong& lhs, const llong& rhs) { return lhs - -rhs; }
+
+inline llong operator*(const llong& lhs, const llong& rhs) { llong r(lhs); r *= rhs; return r; }
+inline llong operator/(const llong& lhs, const llong& rhs) { llong r(lhs); r /= rhs; return r; }
+inline llong operator%(const llong& lhs, const llong& rhs) { llong r(lhs); r %= rhs; return r; }
+
+// narrowing conversions: asInt keeps the low word and forces the sign bit on
+// when the value is negative; asUInt returns the low word unchanged
+inline int32_t llong_asInt(const llong& lhs) { return (int32_t)(lhs.lo | (lhs.hi < 0 ? 0x80000000 : 0)); }
+inline uint32_t llong_asUInt(const llong& lhs) { return lhs.lo; }
+inline double llong_asDouble(const llong& lhs) { return llong::kD32 * lhs.hi + lhs.lo; }
+
+// lhs raised to the n-th power by repeated multiplication (n - 1 multiplies).
+// A zero base yields kZero for every n, including n == 0.
+inline llong llong_pow(const llong& lhs, uint32_t n) {
+    if (lhs.isZero()) {
+        return llong::kZero;
+    } else if (n == 0) {
+        return llong::kOne;
+    } else {
+        llong r(lhs);
+        while (--n > 0) {
+            r *= lhs;
+        }
+        return r;
+    }
+}
+
+inline llong llong_abs(const llong& lhs) { return lhs.isNegative() ? -lhs : lhs; }
+
+// Originally, I thought that overloading on int32 was too complex or too large to get inlined, and
+// since I mainly wanted to optimize comparisons to zero, I overloaded on uint32_t instead
+// since it has a simpler implementation.
+// But this means that llong(-1) != -1 (since the comparison treats the rhs as unsigned, but
+// the constructor does not).  So I am using the signed versions after all.
+
+#if 0
+inline UBool operator==(const llong& lhs, const uint32_t rhs) { return lhs.lo == rhs && lhs.hi == 0; }
+inline UBool operator!=(const llong& lhs, const uint32_t rhs) { return lhs.lo != rhs || lhs.hi != 0; }
+inline UBool operator> (const llong& lhs, const uint32_t rhs) { return lhs.hi == 0 ? lhs.lo > rhs : lhs.hi > 0; }
+inline UBool operator< (const llong& lhs, const uint32_t rhs) { return lhs.hi == 0 ? lhs.lo < rhs : lhs.hi < 0; }
+inline UBool operator>=(const llong& lhs, const uint32_t rhs) { return lhs.hi == 0 ? lhs.lo >= rhs : lhs.hi >= 0; }
+inline UBool operator<=(const llong& lhs, const uint32_t rhs) { return lhs.hi == 0 ? lhs.lo <= rhs : lhs.hi <= 0; }
+#endif
+
+// LLONG_H
+#endif
diff --git a/icu4c/source/i18n/nfrlist.h b/icu4c/source/i18n/nfrlist.h
new file mode 100644
index 00000000000..a17e6bffd1c
--- /dev/null
+++ b/icu4c/source/i18n/nfrlist.h
@@ -0,0 +1,68 @@
+/*
+*******************************************************************************
+* Copyright (C) 1997-2001, International Business Machines Corporation and others. All Rights Reserved.
+*******************************************************************************
+*/
+#ifndef NFRLIST_H
+#define NFRLIST_H
+
+#include "cmemory.h"
+#include "unicode/umachine.h"
+
+#include "nfrule.h"
+
+U_NAMESPACE_BEGIN
+
+// unsafe class for internal use only.  assume memory allocations succeed, indexes are valid.
+// should be a template, but we can't use them
+
+/**
+ * Minimal growable array of owned NFRule pointers.
+ *
+ * The list owns both the backing array and every rule still stored in it:
+ * the destructor deletes each remaining element and frees the array.
+ * Ownership of an element passes back to the caller through remove(), and
+ * ownership of the whole array through release().
+ */
+class NFRuleList {
+protected:
+    NFRule** fStuff;      // backing array, or NULL when the capacity is 0
+    uint32_t fCount;      // number of elements in use
+    uint32_t fCapacity;   // number of slots allocated in fStuff
+public:
+    /** Creates an empty list with room for 'capacity' rules (0 allocates nothing). */
+    NFRuleList(int capacity = 10)
+        : fStuff(capacity ? (NFRule**)uprv_malloc(capacity * sizeof(NFRule*)) : NULL)
+        , fCount(0)
+        , fCapacity(capacity) {}
+    /** Deletes every rule still in the list, then frees the array. */
+    ~NFRuleList() {
+        if (fStuff) {
+            for(uint32_t i = 0; i < fCount; ++i) {
+                delete fStuff[i];
+            }
+            uprv_free(fStuff);
+        }
+    }
+    /** Returns the rule at 'index' without bounds checking; the list keeps ownership. */
+    NFRule* operator[](uint32_t index) const { return fStuff[index]; }
+    /**
+     * Removes and returns the rule at 'index' (not bounds checked), closing
+     * the gap.  The caller becomes the owner of the returned rule.
+     */
+    NFRule* remove(uint32_t index) {
+        NFRule* result = fStuff[index];
+        fCount -= 1;
+        for (uint32_t i = index; i < fCount; ++i) { // assumes small arrays
+            fStuff[i] = fStuff[i+1];
+        }
+        return result;
+    }
+    /** Appends 'thing', growing the array by 10 slots when it is full; the list takes ownership. */
+    void add(NFRule* thing) {
+        if (fCount == fCapacity) {
+            fCapacity += 10;
+            fStuff = (NFRule**)uprv_realloc(fStuff, fCapacity * sizeof(NFRule*)); // assume success
+        }
+        fStuff[fCount++] = thing;
+    }
+    uint32_t size() const { return fCount; }
+    /** Returns the last rule, or NULL when the list is empty. */
+    NFRule* last() const { return fCount > 0 ? fStuff[fCount-1] : NULL; }
+    /**
+     * Hands the NULL-terminated array (and the rules in it) to the caller
+     * and leaves this list empty.
+     */
+    NFRule** release() {
+        add(NULL); // ensure null termination
+        NFRule** result = fStuff;
+        fStuff = NULL;
+        fCount = 0;
+        fCapacity = 0;
+        return result;
+    }
+
+private:
+    // Copying is forbidden: the implicitly generated copy operations would
+    // duplicate fStuff, and both copies would then delete the same rules and
+    // free the same array.  Declared but intentionally not defined.
+    NFRuleList(const NFRuleList& other);
+    NFRuleList& operator=(const NFRuleList& other);
+};
+
+U_NAMESPACE_END
+
+// NFRLIST_H
+#endif
diff --git a/icu4c/source/i18n/nfrs.cpp b/icu4c/source/i18n/nfrs.cpp
new file mode 100644
index 00000000000..dd971beed82
--- /dev/null
+++ b/icu4c/source/i18n/nfrs.cpp
@@ -0,0 +1,659 @@
+/*
+*******************************************************************************
+* Copyright (C) 1997-2001, International Business Machines Corporation and others. All Rights Reserved.
+*******************************************************************************
+*/
+
+// needed for FILE / fprintf / stderr in the RBNF_DEBUG tracing code below
+#include <stdio.h>
+
+#include "nfrs.h"
+#include "nfrule.h"
+#include "nfrlist.h"
+#include "cmemory.h"
+
+U_NAMESPACE_BEGIN
+
+#if 0
+// euclid's algorithm works with doubles
+// note, doubles only get us up to one quadrillion or so, which
+// isn't as much range as we get with longs.  We probably still
+// want either 64-bit math, or BigInteger.
+
+static llong
+util_lcm(llong x, llong y)
+{
+    x.abs();
+    y.abs();
+
+    if (x == 0 || y == 0) {
+        return 0;
+    } else {
+        do {
+            if (x < y) {
+                llong t = x; x = y; y = t;
+            }
+            x -= y * (x/y);
+        } while (x != 0);
+
+        return y;
+    }
+}
+
+#else
+/**
+ * Calculates the least common multiple of x and y.
+ * Returns zero if either argument is zero.
+ */
+static llong
+util_lcm(llong x, llong y)
+{
+    // The binary gcd below cannot cope with zero: with x == y == 0 the
+    // power-of-two loop never terminates, and with only x == 0 the computed
+    // gcd is zero and the final division divides by it.
+    if (x == 0 || y == 0) {
+        return llong::kZero;
+    }
+
+    // binary gcd algorithm from Knuth, "The Art of Computer Programming,"
+    // vol. 2, 1st ed., pp. 298-299
+    llong x1 = x;
+    llong y1 = y;
+
+    // strip (and count) the factors of two common to both values
+    int p2 = 0;
+    while ((x1 & 1) == 0 && (y1 & 1) == 0) {
+        ++p2;
+        x1 >>= 1;
+        y1 >>= 1;
+    }
+
+    llong t;
+    if ((x1 & 1) == 1) {
+        t = -y1;
+    } else {
+        t = x1;
+    }
+
+    while (t != 0) {
+        while ((t & 1) == 0) {
+            t >>= 1;
+        }
+        if (t > 0) {
+            x1 = t;
+        } else {
+            y1 = -t;
+        }
+        t = x1 - y1;
+    }
+
+    llong gcd = x1 << p2;
+
+    // x * y == gcd(x, y) * lcm(x, y)
+    return x / gcd * y;
+}
+#endif
+
+static const UChar gPercent = 0x0025;
+static const UChar gColon = 0x003a;
+static const UChar gSemicolon = 0x003b;
+static const UChar gLineFeed = 0x000a; // U+000A LINE FEED (was 0x0010, which is DLE)
+
+static const UnicodeString gFourSpaces("    ");
+static const UnicodeString gPercentPercent("%%");
+
+/**
+ * Creates a rule set from descriptions[index].  If the description starts
+ * with '%', everything up to the first colon becomes the set's name and is
+ * removed from the description together with the whitespace that follows;
+ * otherwise the set is named "%default".  Sets status to U_PARSE_ERROR when
+ * the name has no terminating colon or the remaining description is empty.
+ * The rules themselves are built later by parseRules().
+ */
+NFRuleSet::NFRuleSet(UnicodeString* descriptions, int32_t index, UErrorCode& status)
+  : name()
+  , rules(0)
+  , negativeNumberRule(NULL)
+  , fIsFractionRuleSet(FALSE)
+  , fIsPublic(FALSE)
+{
+    for (int i = 0; i < 3; ++i) {
+        fractionRules[i] = NULL;
+    }
+
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    UnicodeString& description = descriptions[index]; // !!! make sure index is valid
+
+    // if the description begins with a rule set name (the rule set
+    // name can be omitted in formatter descriptions that consist
+    // of only one rule set), copy it out into our "name" member
+    // and delete it from the description
+    if (description.charAt(0) == gPercent) {
+        UTextOffset pos = description.indexOf(gColon);
+        if (pos == -1) {
+            // throw new IllegalArgumentException("Rule set name doesn't end in colon");
+            status = U_PARSE_ERROR;
+        } else {
+            name.setTo(description, 0, pos);
+            while (pos < description.length() && u_isWhitespace(description.charAt(++pos))) {
+            }
+            description.remove(0, pos);
+        }
+    } else {
+        name.setTo("%default");
+    }
+
+    if (description.length() == 0) {
+        // throw new IllegalArgumentException("Empty rule set description");
+        status = U_PARSE_ERROR;
+    }
+
+    // names that start with "%%" denote private rule sets
+    fIsPublic = name.indexOf(gPercentPercent) != 0;
+
+    // all of the other members of NFRuleSet are initialized
+    // by parseRules()
+}
+
+/**
+ * Builds this set's rules from a semicolon-separated description, assigns
+ * default base values to rules that did not specify one, and moves the
+ * special rules (negative-number, improper-fraction, proper-fraction and
+ * master rule) out of the regular list into their own members.  Sets status
+ * to U_PARSE_ERROR if explicitly numbered rules are out of order.
+ */
+void
+NFRuleSet::parseRules(UnicodeString& description, const RuleBasedNumberFormat* owner, UErrorCode& status)
+{
+    // start by creating a Vector whose elements are Strings containing
+    // the descriptions of the rules (one rule per element).  The rules
+    // are separated by semicolons (there's no escape facility: ALL
+    // semicolons are rule delimiters)
+
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    // dlf - the original code kept a separate description array for no reason,
+    // so I got rid of it.  The loop was too complex so I simplified it.
+
+    UnicodeString currentDescription;
+    UTextOffset oldP = 0;
+    while (oldP < description.length()) {
+        UTextOffset p = description.indexOf(gSemicolon, oldP);
+        if (p == -1) {
+            p = description.length();
+        }
+        currentDescription.setTo(description, oldP, p - oldP);
+        NFRule::makeRules(currentDescription, this, rules.last(), owner, rules, status);
+        oldP = p + 1;
+    }
+
+    // for rules that didn't specify a base value, their base values
+    // were initialized to 0.  Make another pass through the list and
+    // set all those rules' base values.  We also remove any special
+    // rules from the list and put them into their own member variables
+    llong defaultBaseValue = (int32_t)0;
+
+    // (this isn't a for loop because we might be deleting items from
+    // the vector-- we want to make sure we only increment i when
+    // we _didn't_ delete anything from the vector)
+    uint32_t i = 0;
+    while (i < rules.size()) {
+        NFRule* rule = rules[i];
+
+        switch (rule->getType()) {
+            // if the rule's base value is 0, fill in a default
+            // base value (this will be 1 plus the preceding
+            // rule's base value for regular rule sets, and the
+            // same as the preceding rule's base value in fraction
+            // rule sets)
+        case NFRule::kNoBase:
+            rule->setBaseValue(defaultBaseValue);
+            if (!isFractionRuleSet()) {
+                ++defaultBaseValue;
+            }
+            ++i;
+            break;
+
+            // if it's the negative-number rule, copy it into its own
+            // data member and delete it from the list
+            // (an earlier rule of the same kind is deleted rather than leaked)
+        case NFRule::kNegativeNumberRule:
+            delete negativeNumberRule;
+            negativeNumberRule = rules.remove(i);
+            break;
+
+            // if it's the improper fraction rule, copy it into the
+            // correct element of fractionRules
+        case NFRule::kImproperFractionRule:
+            delete fractionRules[0];
+            fractionRules[0] = rules.remove(i);
+            break;
+
+            // if it's the proper fraction rule, copy it into the
+            // correct element of fractionRules
+        case NFRule::kProperFractionRule:
+            delete fractionRules[1];
+            fractionRules[1] = rules.remove(i);
+            break;
+
+            // if it's the master rule, copy it into the
+            // correct element of fractionRules
+        case NFRule::kMasterRule:
+            delete fractionRules[2];
+            fractionRules[2] = rules.remove(i);
+            break;
+
+            // if it's a regular rule that already knows its base value,
+            // check to make sure the rules are in order, and update
+            // the default base value for the next rule
+        default:
+            if (rule->getBaseValue() < defaultBaseValue) {
+                // throw new IllegalArgumentException("Rules are not in order");
+                status = U_PARSE_ERROR;
+                return;
+            }
+            defaultBaseValue = rule->getBaseValue();
+            if (!isFractionRuleSet()) {
+                ++defaultBaseValue;
+            }
+            ++i;
+            break;
+        }
+    }
+}
+
+// the regular rules are owned and deleted by the 'rules' list itself
+NFRuleSet::~NFRuleSet()
+{
+    delete negativeNumberRule;
+    delete fractionRules[0];
+    delete fractionRules[1];
+    delete fractionRules[2];
+}
+
+// TRUE when both pointers are NULL, or both are non-NULL and the rules
+// they point to compare equal
+UBool
+util_equalRules(const NFRule* rule1, const NFRule* rule2)
+{
+    if (rule1) {
+        if (rule2) {
+            return *rule1 == *rule2;
+        }
+    } else if (!rule2) {
+        return TRUE;
+    }
+    return FALSE;
+}
+
+UBool
+NFRuleSet::operator==(const NFRuleSet& rhs) const
+{
+    if (rules.size() == rhs.rules.size() &&
+        fIsFractionRuleSet == rhs.fIsFractionRuleSet &&
+        name == rhs.name &&
+        util_equalRules(negativeNumberRule, rhs.negativeNumberRule) &&
+        util_equalRules(fractionRules[0], rhs.fractionRules[0]) &&
+        util_equalRules(fractionRules[1], rhs.fractionRules[1]) &&
+        util_equalRules(fractionRules[2], rhs.fractionRules[2])) {
+
+        for (uint32_t i = 0; i < rules.size(); ++i) {
+            if (*rules[i] != *rhs.rules[i]) {
+                return FALSE;
+            }
+        }
+        return TRUE;
+    }
+    return FALSE;
+}
+
+/**
+ * Formats an integral value with the applicable rule.  If no rule applies
+ * nothing is appended (there is no way to report the error from here).
+ */
+void
+NFRuleSet::format(llong number, UnicodeString& toAppendTo, int32_t pos) const
+{
+    NFRule *rule = findNormalRule(number);
+    if (rule) {
+        rule->doFormat(number, toAppendTo, pos);
+    }
+}
+
+/**
+ * Formats a floating-point value with the applicable rule.  If no rule
+ * applies nothing is appended (there is no way to report the error from here).
+ */
+void
+NFRuleSet::format(double number, UnicodeString& toAppendTo, int32_t pos) const
+{
+    NFRule *rule = findDoubleRule(number);
+    if (rule) {
+        rule->doFormat(number, toAppendTo, pos);
+    }
+}
+
+/**
+ * Selects the rule used to format a double.  May return NULL when
+ * findNormalRule() or findFractionRuleSetRule() finds no applicable rule.
+ */
+NFRule*
+NFRuleSet::findDoubleRule(double number) const
+{
+    // if this is a fraction rule set, use findFractionRuleSetRule()
+    if (isFractionRuleSet()) {
+        return findFractionRuleSetRule(number);
+    }
+
+    // if the number is negative, return the negative number rule
+    // (if there isn't a negative-number rule, we pretend it's a
+    // positive number)
+    if (number < 0) {
+        if (negativeNumberRule) {
+            return  negativeNumberRule;
+        } else {
+            number = -number;
+        }
+    }
+
+    // if the number isn't an integer, we use one of the fraction rules...
+    if (number != uprv_floor(number)) {
+        // if the number is between 0 and 1, return the proper
+        // fraction rule
+        if (number < 1 && fractionRules[1]) {
+            return fractionRules[1];
+        }
+        // otherwise, return the improper fraction rule
+        else if (fractionRules[0]) {
+            return fractionRules[0];
+        }
+    }
+
+    // if there's a master rule, use it to format the number
+    if (fractionRules[2]) {
+        return fractionRules[2];
+    }
+
+    // and if we haven't yet returned a rule, use findNormalRule()
+    // to find the applicable rule
+    llong r = number + 0.5;
+    return findNormalRule(r);
+}
+
+/**
+ * Selects the rule used to format an integral value: the regular rule with
+ * the largest base value that does not exceed the number, rolled back by
+ * one rule when shouldRollBack() asks for it.
+ * @return The rule to use, or NULL if no regular rule applies (the list is
+ * empty, every base value is greater than the number, or a rollback was
+ * requested from the first rule).
+ */
+NFRule *
+NFRuleSet::findNormalRule(llong number) const
+{
+    // if this is a fraction rule set, use findFractionRuleSetRule()
+    // to find the rule (we should only go into this clause if the
+    // value is 0)
+    if (fIsFractionRuleSet) {
+        return findFractionRuleSetRule(llong_asDouble(number));
+    }
+
+    // if the number is negative, return the negative-number rule
+    // (if there isn't one, pretend the number is positive)
+    if (number < 0) {
+        if (negativeNumberRule) {
+            return negativeNumberRule;
+        } else {
+            number = -number;
+        }
+    }
+
+    // we have to repeat the preceding two checks, even though we
+    // do them in findRule(), because the version of format() that
+    // takes a long bypasses findRule() and goes straight to this
+    // function.  This function does skip the fraction rules since
+    // we know the value is an integer (it also skips the master
+    // rule, since it's considered a fraction rule.  Skipping the
+    // master rule in this function is also how we avoid infinite
+    // recursion)
+
+    // binary-search the rule list for the applicable rule
+    // (a rule is used for all values from its base value to
+    // the next rule's base value)
+    int32_t lo = 0;
+    int32_t hi = rules.size();
+    while (lo < hi) {
+        int32_t mid = (lo + hi) / 2;
+        if (rules[mid]->getBaseValue() == number) {
+            return rules[mid];
+        }
+        else if (rules[mid]->getBaseValue() > number) {
+            hi = mid;
+        }
+        else {
+            lo = mid + 1;
+        }
+    }
+
+    // hi is now the index of the first rule whose base value is greater
+    // than the number; hi == 0 means there is no rule to use, and
+    // rules[hi - 1] would index out of bounds
+    if (hi == 0) {
+        return NULL;
+    }
+    NFRule *result = rules[hi - 1];
+
+    // use shouldRollBack() to see whether we need to invoke the
+    // rollback rule (see shouldRollBack()'s documentation for
+    // an explanation of the rollback rule).  If we do, roll back
+    // one rule and return that one instead of the one we'd normally
+    // return
+    if (result->shouldRollBack(llong_asDouble(number))) {
+        if (hi == 1) {
+            // there is no earlier rule to roll back to
+            return NULL;
+        }
+        result = rules[hi - 2];
+    }
+
+    return result;
+}
+
+/**
+ * If this rule is a fraction rule set, this function is used by
+ * findRule() to select the most appropriate rule for formatting
+ * the number.  Basically, the base value of each rule in the rule
+ * set is treated as the denominator of a fraction.  Whichever
+ * denominator can produce the fraction closest in value to the
+ * number passed in is the result.  If there's a tie, the earlier
+ * one in the list wins.  (If there are two rules in a row with the
+ * same base value, the first one is used when the numerator of the
+ * fraction would be 1, and the second rule is used the rest of the
+ * time.)
+ * @param number The number being formatted (which will always be
+ * a number between 0 and 1)
+ * @return The rule to use to format this number, or NULL if the rule
+ * set has no regular rules
+ */
+NFRule*
+NFRuleSet::findFractionRuleSetRule(double number) const
+{
+    // nothing to choose from; rules[0] below would index out of bounds
+    if (rules.size() == 0) {
+        return NULL;
+    }
+
+    // the obvious way to do this (multiply the value being formatted
+    // by each rule's base value until you get an integral result)
+    // doesn't work because of rounding error.  This method is more
+    // accurate
+
+    // find the least common multiple of the rules' base values
+    // and multiply this by the number being formatted.  This is
+    // all the precision we need, and we can do all of the rest
+    // of the math using integer arithmetic
+    llong leastCommonMultiple = rules[0]->getBaseValue();
+    llong numerator;
+    {
+        for (uint32_t i = 1; i < rules.size(); ++i) {
+            leastCommonMultiple = util_lcm(leastCommonMultiple, rules[i]->getBaseValue());
+        }
+        numerator = number * llong_asDouble(leastCommonMultiple) + 0.5;
+    }
+    // for each rule, do the following...
+    llong tempDifference;
+    llong difference = llong::kMaxValue;
+    int32_t winner = 0;
+    for (uint32_t i = 0; i < rules.size(); ++i) {
+        // "numerator" is the numerator of the fraction if the
+        // denominator is the LCD.  The numerator if the rule's
+        // base value is the denominator is "numerator" times the
+        // base value divided by the LCD.  Here we check to see if
+        // that's an integer, and if not, how close it is to being
+        // an integer.
+        tempDifference = numerator * rules[i]->getBaseValue() % leastCommonMultiple;
+
+
+        // normalize the result of the above calculation: we want
+        // the numerator's distance from the CLOSEST multiple
+        // of the LCD
+        if (leastCommonMultiple - tempDifference < tempDifference) {
+            tempDifference = leastCommonMultiple - tempDifference;
+        }
+
+        // if this is as close as we've come, keep track of how close
+        // that is, and the line number of the rule that did it.  If
+        // we've scored a direct hit, we don't have to look at any more
+        // rules
+        if (tempDifference < difference) {
+            difference = tempDifference;
+            winner = i;
+            if (difference == 0) {
+                break;
+            }
+        }
+    }
+
+    // if we have two successive rules that both have the winning base
+    // value, then the first one (the one we found above) is used if
+    // the numerator of the fraction is 1 and the second one is used if
+    // the numerator of the fraction is anything else (this lets us
+    // do things like "one third"/"two thirds" without having to define
+    // a whole bunch of extra rule sets)
+    if ((unsigned)(winner + 1) < rules.size() &&
+        rules[winner + 1]->getBaseValue() == rules[winner]->getBaseValue()) {
+        double n = llong_asDouble(rules[winner]->getBaseValue()) * number;
+        if (n < 0.5 || n >= 2) {
+            ++winner;
+        }
+    }
+
+    // finally, return the winning rule
+    return rules[winner];
+}
+
+/**
+ * Parses a string.  Matches the string to be parsed against each
+ * of its rules (with a base value less than upperBound) and returns
+ * the value produced by the rule that matched the most characters
+ * in the source string.
+ * @param text The string to parse
+ * @param pos On entry, the position handed to each rule's doParse().
+ * On exit, this object has been updated to point to the
+ * first character position this rule set didn't consume.
+ * @param upperBound Limits the rules that can be allowed to match.
+ * Only rules whose base values are strictly less than upperBound
+ * are considered.
+ * @param result Receives the numerical result of parsing this string.
+ * This will be the matching rule's base value, composed appropriately with
+ * the results of matching any of its substitutions.  It is always set:
+ * if nothing matched the input string at all it holds the long value 0.
+ * @return FALSE if the text is empty, TRUE otherwise.
+ */
+#ifdef RBNF_DEBUG
+static void dumpUS(FILE* f, const UnicodeString& us) {
+  int len = us.length();
+  char* buf = new char[len+1];
+  us.extract(0, len, buf);
+  buf[len] = 0;
+  fprintf(f, "%s", buf);
+  delete[] buf;
+}
+#endif
+
+UBool
+NFRuleSet::parse(const UnicodeString& text, ParsePosition& pos, double upperBound, Formattable& result) const
+{
+    // try matching each rule in the rule set against the text being
+    // parsed.  Whichever one matches the most characters is the one
+    // that determines the value we return.
+
+    result.setLong(0);
+
+    // dump out if there's no text to parse
+    if (text.length() == 0) {
+        return FALSE;
+    }
+
+    ParsePosition highWaterMark;
+    ParsePosition workingPos = pos;
+
+#ifdef RBNF_DEBUG
+    fprintf(stderr, " %x '", this);
+    dumpUS(stderr, name);
+    fprintf(stderr, "' text '");
+    dumpUS(stderr, text);
+    fprintf(stderr, "'\n");
+    fprintf(stderr, " parse negative: %d\n", negativeNumberRule != 0);
+#endif
+
+    // start by trying the negative number rule (if there is one)
+    if (negativeNumberRule) {
+        Formattable tempResult;
+#ifdef RBNF_DEBUG
+        fprintf(stderr, " %x ub: %g\n", negativeNumberRule, upperBound);
+#endif
+        UBool success = negativeNumberRule->doParse(text, workingPos, 0, upperBound, tempResult);
+#ifdef RBNF_DEBUG
+        fprintf(stderr, " success: %d wpi: %d\n", success, workingPos.getIndex());
+#endif
+        if (success && workingPos.getIndex() > highWaterMark.getIndex()) {
+            result = tempResult;
+            highWaterMark = workingPos;
+        }
+        workingPos = pos;
+    }
+#ifdef RBNF_DEBUG
+    fprintf(stderr, " continue fractional with text '");
+    dumpUS(stderr, text);
+    fprintf(stderr, "' hwm: %d\n", highWaterMark.getIndex());
+#endif
+    // then try each of the fraction rules
+    {
+        for (int i = 0; i < 3; i++) {
+            if (fractionRules[i]) {
+                Formattable tempResult;
+                UBool success = fractionRules[i]->doParse(text, workingPos, 0, upperBound, tempResult);
+                if (success && (workingPos.getIndex() > highWaterMark.getIndex())) {
+                    result = tempResult;
+                    highWaterMark = workingPos;
+                }
+                workingPos = pos;
+            }
+        }
+    }
+#ifdef RBNF_DEBUG
+    fprintf(stderr, " continue other with text '");
+    dumpUS(stderr, text);
+    fprintf(stderr, "' hwm: %d\n", highWaterMark.getIndex());
+#endif
+
+    // finally, go through the regular rules one at a time.  We start
+    // at the end of the list because we want to try matching the most
+    // significant rule first (this helps ensure that we parse
+    // "five thousand three hundred six" as
+    // "(five thousand) (three hundred) (six)" rather than
+    // "((five thousand three) hundred) (six)").  Skip rules whose
+    // base values are higher than the upper bound (again, this helps
+    // limit ambiguity by making sure the rules that match a rule's
+    // substitutions are less significant than the rule containing
+    // the substitutions)
+    {
+        llong ub(upperBound);
+#ifdef RBNF_DEBUG
+        {
+            char ubstr[64];
+            lltoa(ub, ubstr, 64);
+            fprintf(stderr, "ub: %g, ll: %s(%x/%x)\n", upperBound, ubstr, ub.hi, ub.lo);
+        }
+#endif
+        for (int32_t i = rules.size(); --i >= 0 && highWaterMark.getIndex() < text.length();) {
+            if ((!fIsFractionRuleSet) && (rules[i]->getBaseValue() >= ub)) {
+                continue;
+            }
+            Formattable tempResult;
+            UBool success = rules[i]->doParse(text, workingPos, fIsFractionRuleSet, upperBound, tempResult);
+            if (success && workingPos.getIndex() > highWaterMark.getIndex()) {
+                result = tempResult;
+                highWaterMark = workingPos;
+            }
+            workingPos = pos;
+        }
+    }
+#ifdef RBNF_DEBUG
+    fprintf(stderr, " exit\n");
+#endif
+    // finally, update the parse position we were passed to point to the
+    // first character we didn't use, and return the result that
+    // corresponds to that string of characters
+    pos = highWaterMark;
+
+    return TRUE;
+}
+
+/**
+ * Appends a textual description of this rule set to result: the name and
+ * a colon on the first line, then one indented line per regular rule,
+ * then the negative-number rule and the three fraction rules that exist.
+ */
+void
+NFRuleSet::appendRules(UnicodeString& result) const
+{
+    // the rule set name goes first...
+    result.append(name);
+    result.append(gColon);
+    result.append(gLineFeed);
+
+    // followed by the regular rules...
+    for (uint32_t i = 0; i < rules.size(); i++) {
+        result.append(gFourSpaces);
+        rules[i]->appendRuleText(result);
+        result.append(gLineFeed);
+    }
+
+    // followed by the special rules (if they exist)
+    if (negativeNumberRule) {
+        result.append(gFourSpaces);
+        negativeNumberRule->appendRuleText(result);
+        result.append(gLineFeed);
+    }
+
+    {
+        for (uint32_t i = 0; i < 3; ++i) {
+            if (fractionRules[i]) {
+                result.append(gFourSpaces);
+                fractionRules[i]->appendRuleText(result);
+                result.append(gLineFeed);
+            }
+        }
+    }
+}
+
+U_NAMESPACE_END
diff --git a/icu4c/source/i18n/nfrs.h b/icu4c/source/i18n/nfrs.h
new file mode 100644
index 00000000000..09d6c0fe910
--- /dev/null
+++ b/icu4c/source/i18n/nfrs.h
@@ -0,0 +1,61 @@
+/*
+*******************************************************************************
+* Copyright (C) 1997-2001, International Business Machines Corporation and others. All Rights Reserved.
+*******************************************************************************
+*/
+
+#ifndef NFRS_H
+#define NFRS_H
+
+#include "unicode/utypes.h"
+#include "unicode/umisc.h"
+
+#include "unicode/rbnf.h"
+#include "nfrlist.h"
+#include "llong.h"
+
+U_NAMESPACE_BEGIN
+
+// A named set of NFRule objects.  Holds a list of regular rules plus up to
+// four special rules (negative-number, improper-fraction, proper-fraction
+// and master rule) and picks the rule used to format or parse a value.
+class NFRuleSet {
+ public:
+  // Extracts the set's name from descriptions[index] (or uses "%default");
+  // the rules themselves are created later by parseRules().
+  NFRuleSet(UnicodeString* descriptions, int32_t index, UErrorCode& status);
+  // Builds the rules from a semicolon-separated description and moves the
+  // special rules into their own members.
+  void parseRules(UnicodeString& rules, const RuleBasedNumberFormat* owner, UErrorCode& status);
+  void makeIntoFractionRuleSet() { fIsFractionRuleSet = TRUE; }
+
+  ~NFRuleSet();
+
+  UBool operator==(const NFRuleSet& rhs) const;
+  UBool operator!=(const NFRuleSet& rhs) const { return !operator==(rhs); }
+
+  // a set is public unless its name starts with "%%"
+  UBool isPublic() const { return fIsPublic; }
+  UBool isFractionRuleSet() const { return fIsFractionRuleSet; }
+
+  void  getName(UnicodeString& result) const { result.setTo(name); }
+  UBool isNamed(const UnicodeString& _name) const { return this->name == _name; }
+
+  // format a value with the rule selected by findNormalRule() / findDoubleRule()
+  void  format(llong number, UnicodeString& toAppendTo, int32_t pos) const;
+  void  format(double number, UnicodeString& toAppendTo, int32_t pos) const;
+
+  // Tries the negative-number rule, the fraction rules and then the regular
+  // rules (those with a base value below upperBound unless this is a
+  // fraction rule set), keeps the result of the one that consumed the most
+  // text, and advances pos past the consumed text.
+  UBool parse(const UnicodeString& text, ParsePosition& pos, double upperBound, Formattable& result) const;
+
+  void appendRules(UnicodeString& result) const; // toString
+
+ private:
+  NFRule * findNormalRule(llong number) const;
+  NFRule * findDoubleRule(double number) const;
+  NFRule * findFractionRuleSetRule(double number) const;
+
+ private:
+  UnicodeString name;
+  NFRuleList rules;              // regular rules; the list owns them
+  NFRule *negativeNumberRule;    // NULL if the set has none; deleted by the destructor
+  NFRule *fractionRules[3];      // [0] improper fraction, [1] proper fraction, [2] master rule; NULL if absent
+  UBool fIsFractionRuleSet;
+  UBool fIsPublic;
+};
+
+U_NAMESPACE_END
+
+// NFRS_H
+#endif
+
diff --git a/icu4c/source/i18n/nfrule.cpp b/icu4c/source/i18n/nfrule.cpp
new file mode 100644
index 00000000000..d6e2ef4183f
--- /dev/null
+++ b/icu4c/source/i18n/nfrule.cpp
@@ -0,0 +1,1377 @@
+/*
+*******************************************************************************
+* Copyright (C) 1997-2001, International Business Machines Corporation and
others. All Rights Reserved. +******************************************************************************* +*/ + +#include "nfrule.h" + +#include "unicode/rbnf.h" +#include "unicode/tblcoll.h" +#include "unicode/coleitr.h" +#include "nfrs.h" +#include "nfrlist.h" +#include "nfsubs.h" + +U_NAMESPACE_BEGIN + +extern const UChar* CSleftBracket; +extern const UChar* CSrightBracket; + +NFRule::NFRule(const RuleBasedNumberFormat* _rbnf) + : baseValue((int32_t)0) + , radix(0) + , exponent(0) + , ruleText() + , sub1(NULL) + , sub2(NULL) + , formatter(_rbnf) +{ +} + +NFRule::~NFRule() +{ + delete sub1; + delete sub2; +} + +static UChar gLeftBracket = 0x005b; +static UChar gRightBracket = 0x005d; +static UChar gColon = 0x003a; +static UChar gZero = 0x0030; +static UChar gNine = 0x0039; +static UChar gSpace = 0x0020; +static UChar gSlash = 0x002f; +static UChar gGreaterThan = 0x003e; +static UChar gComma = 0x002c; +static UChar gDot = 0x002e; +static UChar gTick = 0x0027; +static UChar gMinus = 0x002d; +static UChar gSemicolon = 0x003b; + +static UnicodeString gMinusX("-x"); +static UnicodeString gXDotX("x.x"); +static UnicodeString gXDotZero("x.0"); +static UnicodeString gZeroDotX("0.x"); + +static UnicodeString gLessLess("<<"); +static UnicodeString gLessPercent("<%"); +static UnicodeString gLessHash("<#"); +static UnicodeString gLessZero("<0"); +static UnicodeString gGreaterGreater(">>"); +static UnicodeString gGreaterPercent(">%"); +static UnicodeString gGreaterHash(">#"); +static UnicodeString gGreaterZero(">0"); +static UnicodeString gEqualPercent("=%"); +static UnicodeString gEqualHash("=#"); +static UnicodeString gEqualZero("=0"); +static UnicodeString gEmptyString(""); +static UnicodeString gGreaterGreaterGreater(">>>"); + +static const UnicodeString* tokenStrings[] = { + &gLessLess, &gLessPercent, &gLessHash, &gLessZero, + &gGreaterGreater, &gGreaterPercent,&gGreaterHash, &gGreaterZero, + &gEqualPercent, &gEqualHash, &gEqualZero, NULL +}; + +void 
+NFRule::makeRules(UnicodeString& description, + const NFRuleSet *ruleSet, + const NFRule *predecessor, + const RuleBasedNumberFormat *rbnf, + NFRuleList& rules, + UErrorCode& status) +{ + // we know we're making at least one rule, so go ahead and + // new it up and initialize its basevalue and divisor + // (this also strips the rule descriptor, if any, off the + // descripton string) + NFRule* rule1 = new NFRule(rbnf); + rule1->parseRuleDescriptor(description, status); + + // check the description to see whether there's text enclosed + // in brackets + int32_t brack1 = description.indexOf(gLeftBracket); + int32_t brack2 = description.indexOf(gRightBracket); + + // if the description doesn't contain a matched pair of brackets, + // or if it's of a type that doesn't recognize bracketed text, + // then leave the description alone, initialize the rule's + // rule text and substitutions, and return that rule + if (brack1 == -1 || brack2 == -1 || brack1 > brack2 + || rule1->getType() == kProperFractionRule + || rule1->getType() == kNegativeNumberRule) { + rule1->ruleText = description; + rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); + rules.add(rule1); + } else { + // if the description does contain a matched pair of brackets, + // then it's really shorthand for two rules (with one exception) + NFRule* rule2 = NULL; + UnicodeString sbuf; + + // we'll actually only split the rule into two rules if its + // base value is an even multiple of its divisor (or it's one + // of the special rules) + if ((rule1->baseValue > 0 + && (rule1->baseValue % llong_pow((int32_t)rule1->radix, (int32_t)rule1->exponent)) == 0) + || rule1->getType() == kImproperFractionRule + || rule1->getType() == kMasterRule) { + + // if it passes that test, new up the second rule. 
If the + // rule set both rules will belong to is a fraction rule + // set, they both have the same base value; otherwise, + // increment the original rule's base value ("rule1" actually + // goes SECOND in the rule set's rule list) + rule2 = new NFRule(rbnf); + if (rule1->baseValue >= 0) { + rule2->baseValue = rule1->baseValue; + if (!ruleSet->isFractionRuleSet()) { + ++rule1->baseValue; + } + } + + // if the description began with "x.x" and contains bracketed + // text, it describes both the improper fraction rule and + // the proper fraction rule + else if (rule1->getType() == kImproperFractionRule) { + rule2->setType(kProperFractionRule); + } + + // if the description began with "x.0" and contains bracketed + // text, it describes both the master rule and the + // improper fraction rule + else if (rule1->getType() == kMasterRule) { + rule2->baseValue = rule1->baseValue; + rule1->setType(kImproperFractionRule); + } + + // both rules have the same radix and exponent (i.e., the + // same divisor) + rule2->radix = rule1->radix; + rule2->exponent = rule1->exponent; + + // rule2's rule text omits the stuff in brackets: initalize + // its rule text and substitutions accordingly + sbuf.append(description, 0, brack1); + if (brack2 + 1 < description.length()) { + sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); + } + rule2->ruleText.setTo(sbuf); + rule2->extractSubstitutions(ruleSet, predecessor, rbnf, status); + } + + // rule1's text includes the text in the brackets but omits + // the brackets themselves: initialize _its_ rule text and + // substitutions accordingly + sbuf.setTo(description, 0, brack1); + sbuf.append(description, brack1 + 1, brack2 - brack1 - 1); + if (brack2 + 1 < description.length()) { + sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); + } + rule1->ruleText.setTo(sbuf); + rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); + + // if we only have one rule, return it; if we have two, return 
+ // a two-element array containing them (notice that rule2 goes + // BEFORE rule1 in the list: in all cases, rule2 OMITS the + // material in the brackets and rule1 INCLUDES the material + // in the brackets) + if (rule2 != NULL) { + rules.add(rule2); + } + rules.add(rule1); + } +} + +/** +* This function parses the rule's rule descriptor (i.e., the base +* value and/or other tokens that precede the rule's rule text +* in the description) and sets the rule's base value, radix, and +* exponent according to the descriptor. (If the description doesn't +* include a rule descriptor, then this function sets everything to +* default values and the rule set sets the rule's real base value). +* @param description The rule's description +* @return If "description" included a rule descriptor, this is +* "description" with the descriptor and any trailing whitespace +* stripped off. Otherwise; it's "descriptor" unchangd. +*/ +void +NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status) +{ + // the description consists of a rule descriptor and a rule body, + // separated by a colon. The rule descriptor is optional. If + // it's omitted, just set the base value to 0. + int32_t p = description.indexOf(gColon); + if (p == -1) { + setBaseValue((int32_t)0); + } else { + // copy the descriptor out into its own string and strip it, + // along with any trailing whitespace, out of the original + // description + UnicodeString descriptor; + descriptor.setTo(description, 0, p); + + ++p; + while (p < description.length() && u_isWhitespace(description.charAt(p))) + ++p; + description.removeBetween(0, p); + + // check first to see if the rule descriptor matches the token + // for one of the special rules. 
If it does, set the base + // value to the correct identfier value + if (descriptor == gMinusX) { + setType(kNegativeNumberRule); + } + else if (descriptor == gXDotX) { + setType(kImproperFractionRule); + } + else if (descriptor == gZeroDotX) { + setType(kProperFractionRule); + } + else if (descriptor == gXDotZero) { + setType(kMasterRule); + } + + // if the rule descriptor begins with a digit, it's a descriptor + // for a normal rule + // since we don't have Long.parseLong, and this isn't much work anyway, + // just build up the value as we encounter the digits. + else if (descriptor.charAt(0) >= gZero && descriptor.charAt(0) <= gNine) { + llong val = (int32_t)0; + p = 0; + UChar c = gSpace; + + // begin parsing the descriptor: copy digits + // into "tempValue", skip periods, commas, and spaces, + // stop on a slash or > sign (or at the end of the string), + // and throw an exception on any other character + llong ll_10 = (int32_t)10; + while (p < descriptor.length()) { + c = descriptor.charAt(p); + if (c >= gZero && c <= gNine) { + val = val * ll_10 + (int32_t)(c - gZero); + } + else if (c == gSlash || c == gGreaterThan) { + break; + } + else if (u_isWhitespace(c) || c == gComma || c == gDot) { + } + else { + // throw new IllegalArgumentException("Illegal character in rule descriptor"); + status = U_PARSE_ERROR; + return; + } + ++p; + } + + // we have the base value, so set it + setBaseValue(val); + + // if we stopped the previous loop on a slash, we're + // now parsing the rule's radix. 
Again, accumulate digits + // in tempValue, skip punctuation, stop on a > mark, and + // throw an exception on anything else + if (c == '/') { + val = (int32_t)0; + ++p; + llong ll_10 = (int32_t)10; + while (p < descriptor.length()) { + c = descriptor.charAt(p); + if (c >= gZero && c <= gNine) { + val = val * ll_10 + (int32_t)(c - gZero); + } + else if (c == gGreaterThan) { + break; + } + else if (u_isWhitespace(c) || c == gComma || c == gDot) { + } + else { + // throw new IllegalArgumentException("Illegal character is rule descriptor"); + status = U_PARSE_ERROR; + return; + } + ++p; + } + + // tempValue now contain's the rule's radix. Set it + // accordingly, and recalculate the rule's exponent + radix = (int16_t)llong_asInt(val); + if (radix == 0) { + // throw new IllegalArgumentException("Rule can't have radix of 0"); + status = U_PARSE_ERROR; + } + + exponent = expectedExponent(); + } + + // if we stopped the previous loop on a > sign, then continue + // for as long as we still see > signs. For each one, + // decrement the exponent (unless the exponent is already 0). + // If we see another character before reaching the end of + // the descriptor, that's also a syntax error. + if (c == gGreaterThan) { + while (p < descriptor.length()) { + c = descriptor.charAt(p); + if (c == gGreaterThan && exponent > 0) { + --exponent; + } else { + // throw new IllegalArgumentException("Illegal character in rule descriptor"); + status = U_PARSE_ERROR; + return; + } + ++p; + } + } + } + } + + // finally, if the rule body begins with an apostrophe, strip it off + // (this is generally used to put whitespace at the beginning of + // a rule's rule text) + if (description.length() > 0 && description.charAt(0) == gTick) { + description.removeBetween(0, 1); + } + + // return the description with all the stuff we've just waded through + // stripped off the front. It now contains just the rule body. 
+ // return description; +} + +/** +* Searches the rule's rule text for the substitution tokens, +* creates the substitutions, and removes the substitution tokens +* from the rule's rule text. +* @param owner The rule set containing this rule +* @param predecessor The rule preseding this one in "owners" rule list +* @param ownersOwner The RuleBasedFormat that owns this rule +*/ +void +NFRule::extractSubstitutions(const NFRuleSet* ruleSet, + const NFRule* predecessor, + const RuleBasedNumberFormat* rbnf, + UErrorCode& status) +{ + if (U_SUCCESS(status)) { + sub1 = extractSubstitution(ruleSet, predecessor, rbnf, status); + sub2 = extractSubstitution(ruleSet, predecessor, rbnf, status); + } +} + +/** +* Searches the rule's rule text for the first substitution token, +* creates a substitution based on it, and removes the token from +* the rule's rule text. +* @param owner The rule set containing this rule +* @param predecessor The rule preceding this one in the rule set's +* rule list +* @param ownersOwner The RuleBasedNumberFormat that owns this rule +* @return The newly-created substitution. This is never null; if +* the rule text doesn't contain any substitution tokens, this will +* be a NullSubstitution. 
+*/ +NFSubstitution * +NFRule::extractSubstitution(const NFRuleSet* ruleSet, + const NFRule* predecessor, + const RuleBasedNumberFormat* rbnf, + UErrorCode& status) +{ + NFSubstitution* result = NULL; + + // search the rule's rule text for the first two characters of + // a substitution token + int32_t subStart = indexOfAny(tokenStrings); + int32_t subEnd = subStart; + + // if we didn't find one, create a null substitution positioned + // at the end of the rule text + if (subStart == -1) { + return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, + ruleSet, rbnf, gEmptyString, status); + } + + // special-case the ">>>" token, since searching for the > at the + // end will actually find the > in the middle + if (ruleText.indexOf(gGreaterGreaterGreater) == subStart) { + subEnd = subStart + 2; + + // otherwise the substitution token ends with the same character + // it began with + } else { + subEnd = ruleText.indexOf(ruleText.charAt(subStart), subStart + 1); + } + + // if we don't find the end of the token (i.e., if we're on a single, + // unmatched token character), create a null substitution positioned + // at the end of the rule + if (subEnd == -1) { + return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, + ruleSet, rbnf, gEmptyString, status); + } + + // if we get here, we have a real substitution token (or at least + // some text bounded by substitution token characters). Use + // makeSubstitution() to create the right kind of substitution + UnicodeString subToken; + subToken.setTo(ruleText, subStart, subEnd + 1 - subStart); + result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet, + rbnf, subToken, status); + + // remove the substitution from the rule text + ruleText.removeBetween(subStart, subEnd+1); + + return result; +} + +/** +* Sets the rule's base value, and causes the radix and exponent +* to be recalculated. 
This is used during construction when we +* don't know the rule's base value until after it's been +* constructed. It should be used at any other time. +* @param The new base value for the rule. +*/ +void +NFRule::setBaseValue(llong newBaseValue) +{ + // set the base value + baseValue = newBaseValue; + + // if this isn't a special rule, recalculate the radix and exponent + // (the radix always defaults to 10; if it's supposed to be something + // else, it's cleaned up by the caller and the exponent is + // recalculated again-- the only function that does this is + // NFRule.parseRuleDescriptor() ) + if (baseValue >= 1) { + radix = 10; + exponent = expectedExponent(); + + // this function gets called on a fully-constructed rule whose + // description didn't specify a base value. This means it + // has substitutions, and some substitutions hold on to copies + // of the rule's divisor. Fix their copies of the divisor. + if (sub1 != NULL) { + sub1->setDivisor(radix, exponent); + } + if (sub2 != NULL) { + sub2->setDivisor(radix, exponent); + } + + // if this is a special rule, its radix and exponent are basically + // ignored. Set them to "safe" default values + } else { + radix = 10; + exponent = 0; + } +} + +/** +* This calculates the rule's exponent based on its radix and base +* value. This will be the highest power the radix can be raised to +* and still produce a result less than or equal to the base value. +*/ +int16_t +NFRule::expectedExponent() const +{ + // since the log of 0, or the log base 0 of something, causes an + // error, declare the exponent in these cases to be 0 (we also + // deal with the special-rule identifiers here) + if (radix == 0 || baseValue < 1) { + return 0; + } + + // we get rounding error in some cases-- for example, log 1000 / log 10 + // gives us 1.9999999996 instead of 2. 
The extra logic here is to take + // that into account + int16_t tempResult = (int16_t)(log(llong_asDouble(baseValue)) / log((double)radix)); + llong temp = llong_pow(radix, tempResult + 1); + if (temp <= baseValue) { + tempResult += 1; + } + return tempResult; +} + +/** + * Searches the rule's rule text for any of the specified strings. + * @param strings An array of strings to search the rule's rule + * text for + * @return The index of the first match in the rule's rule text + * (i.e., the first substring in the rule's rule text that matches + * _any_ of the strings in "strings"). If none of the strings in + * "strings" is found in the rule's rule text, returns -1. + */ +int32_t +NFRule::indexOfAny(const UnicodeString* strings[]) const +{ + int result = -1; + for (int i = 0; strings[i]; i++) { + int32_t pos = ruleText.indexOf(*strings[i]); + if (pos != -1 && (result == -1 || pos < result)) { + result = pos; + } + } + return result; +} + +//----------------------------------------------------------------------- +// boilerplate +//----------------------------------------------------------------------- + +/** +* Tests two rules for equality. +* @param that The rule to compare this one against +* @return True is the two rules are functionally equivalent +*/ +UBool +NFRule::operator==(const NFRule& rhs) const +{ + return baseValue == rhs.baseValue + && radix == rhs.radix + && exponent == rhs.exponent + && ruleText == rhs.ruleText + && *sub1 == *rhs.sub1 + && *sub2 == *rhs.sub2; +} + +static void +util_append_llong(UnicodeString& result, const llong& value) +{ + llong n(value); + + if (n < 0) { + result.append(gMinus); + n = -n; + } + if (n == 0) { + result.append(gZero); + } else { + llong ll_10((int32_t)10); + while (n != 0) { + llong nn = n / ll_10; + result.append((UChar)(gZero + llong_asInt(n - nn * ll_10))); + n = nn; + } + } +} + +/** +* Returns a textual representation of the rule. 
This won't +* necessarily be the same as the description that this rule +* was created with, but it will produce the same result. +* @return A textual description of the rule +*/ +static void util_append64(UnicodeString& result, const llong& n) +{ + UChar buffer[256]; + int32_t len = u_lltoa(n, buffer, sizeof(buffer)); + UnicodeString temp(buffer, len); + result.append(temp); +} + +void +NFRule::appendRuleText(UnicodeString& result) const +{ + switch (getType()) { + case kNegativeNumberRule: result.append(gMinusX); break; + case kImproperFractionRule: result.append(gXDotX); break; + case kProperFractionRule: result.append(gZeroDotX); break; + case kMasterRule: result.append(gXDotZero); break; + default: + + // for a normal rule, write out its base value, and if the radix is + // something other than 10, write out the radix (with the preceding + // slash, of course). Then calculate the expected exponent and if + // if isn't the same as the actual exponent, write an appropriate + // number of > signs. Finally, terminate the whole thing with + // a colon. 
+ util_append64(result, baseValue); + if (radix != 10) { + result.append(gSlash); + util_append64(result, radix); + } + int numCarets = expectedExponent() - exponent; + for (int i = 0; i < numCarets; i++) { + result.append(gGreaterThan); + } + break; + } + result.append(gColon); + result.append(gSpace); + + // if the rule text begins with a space, write an apostrophe + // (whitespace after the rule descriptor is ignored; the + // apostrophe is used to make the whitespace significant) + if (ruleText.startsWith(gSpace) && sub1->getPos() != 0) { + result.append(gTick); + } + + // now, write the rule's rule text, inserting appropriate + // substitution tokens in the appropriate places + UnicodeString ruleTextCopy; + ruleTextCopy.setTo(ruleText); + + UnicodeString temp; + sub2->toString(temp); + ruleTextCopy.insert(sub2->getPos(), temp); + sub1->toString(temp); + ruleTextCopy.insert(sub1->getPos(), temp); + + result.append(ruleTextCopy); + + // and finally, top the whole thing off with a semicolon and + // return the result + result.append(gSemicolon); +} + +//----------------------------------------------------------------------- +// formatting +//----------------------------------------------------------------------- + +/** +* Formats the number, and inserts the resulting text into +* toInsertInto. 
+* @param number The number being formatted +* @param toInsertInto The string where the resultant text should +* be inserted +* @param pos The position in toInsertInto where the resultant text +* should be inserted +*/ +void +NFRule::doFormat(llong number, UnicodeString& toInsertInto, int32_t pos) const +{ + // first, insert the rule's rule text into toInsertInto at the + // specified position, then insert the results of the substitutions + // into the right places in toInsertInto (notice we do the + // substitutions in reverse order so that the offsets don't get + // messed up) + toInsertInto.insert(pos, ruleText); + sub2->doSubstitution(number, toInsertInto, pos); + sub1->doSubstitution(number, toInsertInto, pos); +} + +/** +* Formats the number, and inserts the resulting text into +* toInsertInto. +* @param number The number being formatted +* @param toInsertInto The string where the resultant text should +* be inserted +* @param pos The position in toInsertInto where the resultant text +* should be inserted +*/ +void +NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos) const +{ + // first, insert the rule's rule text into toInsertInto at the + // specified position, then insert the results of the substitutions + // into the right places in toInsertInto + // [again, we have two copies of this routine that do the same thing + // so that we don't sacrifice precision in a long by casting it + // to a double] + toInsertInto.insert(pos, ruleText); + sub2->doSubstitution(number, toInsertInto, pos); + sub1->doSubstitution(number, toInsertInto, pos); +} + +/** +* Used by the owning rule set to determine whether to invoke the +* rollback rule (i.e., whether this rule or the one that precedes +* it in the rule set's list should be used to format the number) +* @param The number being formatted +* @return True if the rule set should use the rule that precedes +* this one in its list; false if it should use this rule +*/ +UBool 
+NFRule::shouldRollBack(double number) const +{ + // we roll back if the rule contains a modulus substitution, + // the number being formatted is an even multiple of the rule's + // divisor, and the rule's base value is NOT an even multiple + // of its divisor + // In other words, if the original description had + // 100: << hundred[ >>]; + // that expands into + // 100: << hundred; + // 101: << hundred >>; + // internally. But when we're formatting 200, if we use the rule + // at 101, which would normally apply, we get "two hundred zero". + // To prevent this, we roll back and use the rule at 100 instead. + // This is the logic that makes this happen: the rule at 101 has + // a modulus substitution, its base value isn't an even multiple + // of 100, and the value we're trying to format _is_ an even + // multiple of 100. This is called the "rollback rule." + if ((sub1->isModulusSubstitution()) || (sub2->isModulusSubstitution())) { + llong re = llong_pow(radix, exponent); + return java_fmod(number, llong_asDouble(re)) == 0 && (baseValue % re) != 0; + } + return FALSE; +} + +//----------------------------------------------------------------------- +// parsing +//----------------------------------------------------------------------- + +/** +* Attempts to parse the string with this rule. +* @param text The string being parsed +* @param parsePosition On entry, the value is ignored and assumed to +* be 0. On exit, this has been updated with the position of the first +* character not consumed by matching the text against this rule +* (if this rule doesn't match the text at all, the parse position +* if left unchanged (presumably at 0) and the function returns +* new Long(0)). +* @param isFractionRule True if this rule is contained within a +* fraction rule set. This is only used if the rule has no +* substitutions. +* @return If this rule matched the text, this is the rule's base value +* combined appropriately with the results of parsing the substitutions. 
* If nothing matched, this is new Long(0) and the parse position is
* left unchanged.  The result will be an instance of Long if the
* result is an integer and Double otherwise.  The result is never null.
*
* (The description above is inherited from the Java version.  In this
* port the parsed value is delivered through "resVal" -- set with
* setLong(0) on a prefix mismatch and with setDouble() otherwise -- and
* every return statement in the function returns TRUE.)
*/
#ifdef RBNF_DEBUG
// Debug helper: writes the contents of "us" to the stream "f" by way of
// a temporary char buffer.
static void dumpUS(FILE* f, const UnicodeString& us) {
  int len = us.length();
  char* buf = new char[len+1];
  us.extract(0, len, buf);
  buf[len] = 0;
  fprintf(f, "%s", buf);
  delete[] buf;
}
#endif

UBool
NFRule::doParse(const UnicodeString& text,
                ParsePosition& parsePosition,
                UBool isFractionRule,
                double upperBound,
                Formattable& resVal) const
{
    // internally we operate on a copy of the string being parsed
    // (because we're going to change it) and use our own ParsePosition
    ParsePosition pp;
    UnicodeString workText(text);

    // check to see whether the text before the first substitution
    // matches the text at the beginning of the string being
    // parsed.  If it does, strip that off the front of workText;
    // otherwise, dump out with a mismatch
    UnicodeString prefix;
    prefix.setTo(ruleText, 0, sub1->getPos());

#ifdef RBNF_DEBUG
    // NOTE(review): "%x" is given a pointer here, and the second fprintf
    // below passes "this" although its format string has no conversion
    // for it.  Debug-only code, but worth tidying.
    fprintf(stderr, "doParse %x ", this);
    {
        UnicodeString rt;
        appendRuleText(rt);
        dumpUS(stderr, rt);
    }

    fprintf(stderr, " text: '", this);
    dumpUS(stderr, text);
    fprintf(stderr, "' prefix: '");
    dumpUS(stderr, prefix);
#endif
    stripPrefix(workText, prefix, pp);
    // number of characters stripPrefix() removed from the front of the text
    int32_t prefixLength = text.length() - workText.length();

#ifdef RBNF_DEBUG
    fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1->getPos());
#endif

    // a non-empty prefix was expected (sub1 is not at position 0) but
    // nothing was matched: report the mismatch with a result of 0
    if (pp.getIndex() == 0 && sub1->getPos() != 0) {
        // commented out because ParsePosition doesn't have error index in 1.1.x
        // restored for ICU4C port
        parsePosition.setErrorIndex(pp.getErrorIndex());
        resVal.setLong(0);
        return TRUE;
    }

    // this is the fun part.  The basic guts of the rule-matching
    // logic is matchToDelimiter(), which is called twice.  The first
    // time it searches the input string for the rule text BETWEEN
    // the substitutions and tries to match the intervening text
    // in the input string with the first substitution.  If that
    // succeeds, it then calls it again, this time to look for the
    // rule text after the second substitution and to match the
    // intervening input text against the second substitution.
    //
    // For example, say we have a rule that looks like this:
    //    first << middle >> last;
    // and input text that looks like this:
    //    first one middle two last
    // First we use stripPrefix() to match "first " in both places and
    // strip it off the front, leaving
    //    one middle two last
    // Then we use matchToDelimiter() to match " middle " and try to
    // match "one" against a substitution.  If it's successful, we now
    // have
    //    two last
    // We use matchToDelimiter() a second time to match " last" and
    // try to match "two" against a substitution.  If "two" matches
    // the substitution, we have a successful parse.
    //
    // Since it's possible in many cases to find multiple instances
    // of each of these pieces of rule text in the input string,
    // we need to try all the possible combinations of these
    // locations.  This prevents us from prematurely declaring a mismatch,
    // and makes sure we match as much input text as we can.
    int highWaterMark = 0;
    double result = 0;
    int start = 0;
    // special rules have non-positive base values; they contribute 0
    double tempBaseValue = (baseValue <= 0) ? 0 : llong_asDouble(baseValue);

    UnicodeString temp;
    do {
        // our partial parse result starts out as this rule's base
        // value.  If it finds a successful match, matchToDelimiter()
        // will compose this in some way with what it gets back from
        // the substitution, giving us a new partial parse result
        pp.setIndex(0);

        // "temp" is the rule text between the two substitutions
        temp.setTo(ruleText, sub1->getPos(), sub2->getPos() - sub1->getPos());
        double partialResult = matchToDelimiter(workText, start, tempBaseValue,
            temp, pp, sub1,
            upperBound);

        // if we got a successful match (or were trying to match a
        // null substitution), pp is now pointing at the first unmatched
        // character.  Take note of that, and try matchToDelimiter()
        // on the input text again
        if (pp.getIndex() != 0 || sub1->isNullSubstitution()) {
            start = pp.getIndex();

            UnicodeString workText2;
            workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex());
            ParsePosition pp2;

            // the second matchToDelimiter() will compose our previous
            // partial result with whatever it gets back from its
            // substitution if there's a successful match, giving us
            // a real result ("temp" is now the rule text that follows
            // the second substitution)
            temp.setTo(ruleText, sub2->getPos(), ruleText.length() - sub2->getPos());
            partialResult = matchToDelimiter(workText2, 0, partialResult,
                temp, pp2, sub2,
                upperBound);

            // if we got a successful match on this second
            // matchToDelimiter() call, update the high-water mark
            // and result (if necessary)
            if (pp2.getIndex() != 0 || sub2->isNullSubstitution()) {
                if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) {
                    highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex();
                    result = partialResult;
                }
            }
            // commented out because ParsePosition doesn't have error index in 1.1.x
            // restored for ICU4C port
            else {
                // (this int32_t "temp" shadows the UnicodeString "temp" above)
                int32_t temp = pp2.getErrorIndex() + sub1->getPos() + pp.getIndex();
                if (temp> parsePosition.getErrorIndex()) {
                    parsePosition.setErrorIndex(temp);
                }
            }
        }
        // commented out because ParsePosition doesn't have error index in 1.1.x
        // restored for ICU4C port
        else {
            // (this int32_t "temp" shadows the UnicodeString "temp" above)
            int32_t temp = sub1->getPos() + pp.getErrorIndex();
            if (temp > parsePosition.getErrorIndex()) {
                parsePosition.setErrorIndex(temp);
            }
        }
        // keep trying to match things until the outer matchToDelimiter()
        // call fails to make a match (each time, it picks up where it
        // left off the previous time)
        //
        // NOTE(review): on the success path "start" has just been set to
        // pp.getIndex(), and on the failure path pp.getIndex() is 0, so
        // as written this condition looks like it can never be true and
        // the body runs exactly once -- confirm whether that is intended.
    } while (sub1->getPos() != sub2->getPos()
        && pp.getIndex() > 0
        && pp.getIndex() < workText.length()
        && pp.getIndex() != start);

    // update the caller's ParsePosition with our high-water mark
    // (i.e., it now points at the first character this function
    // didn't match-- the ParsePosition is therefore unchanged if
    // we didn't match anything)
    parsePosition.setIndex(highWaterMark);
    // commented out because ParsePosition doesn't have error index in 1.1.x
    // restored for ICU4C port
    if (highWaterMark > 0) {
        parsePosition.setErrorIndex(0);
    }

    // this is a hack for one unusual condition: Normally, whether this
    // rule belong to a fraction rule set or not is handled by its
    // substitutions.  But if that rule HAS NO substitutions, then
    // we have to account for it here.  By definition, if the matching
    // rule in a fraction rule set has no substitutions, its numerator
    // is 1, and so the result is the reciprocal of its base value.
    if (isFractionRule &&
        highWaterMark > 0 &&
        sub1->isNullSubstitution()) {
        result = 1 / result;
    }

    resVal.setDouble(result);
    return TRUE; // ??? do we need to worry if it is a long or a double?
}

/**
* This function is used by parse() to match the text being parsed
* against a possible prefix string.  This function
* matches characters from the beginning of the string being parsed
* to characters from the prospective prefix.  If they match, pp is
* updated to the first character not matched, and the result is
* the unparsed part of the string.  If they don't match, the whole
* string is returned, and pp is left unchanged.
+* @param text The string being parsed +* @param prefix The text to match against +* @param pp On entry, ignored and assumed to be 0. On exit, points +* to the first unmatched character (assuming the whole prefix matched), +* or is unchanged (if the whole prefix didn't match). +* @return If things match, this is the unparsed part of "text"; +* if they didn't match, this is "text". +*/ +void +NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const +{ + // if the prefix text is empty, dump out without doing anything + if (prefix.length() != 0) { + // use prefixLength() to match the beginning of + // "text" against "prefix". This function returns the + // number of characters from "text" that matched (or 0 if + // we didn't match the whole prefix) + int32_t pfl = prefixLength(text, prefix); + if (pfl != 0) { + // if we got a successful match, update the parse position + // and strip the prefix off of "text" + pp.setIndex(pp.getIndex() + pfl); + text.remove(0, pfl); + } + } +} + +/** +* Used by parse() to match a substitution and any following text. +* "text" is searched for instances of "delimiter". For each instance +* of delimiter, the intervening text is tested to see whether it +* matches the substitution. The longest match wins. +* @param text The string being parsed +* @param startPos The position in "text" where we should start looking +* for "delimiter". +* @param baseValue A partial parse result (often the rule's base value), +* which is combined with the result from matching the substitution +* @param delimiter The string to search "text" for. +* @param pp Ignored and presumed to be 0 on entry. If there's a match, +* on exit this will point to the first unmatched character. +* @param sub If we find "delimiter" in "text", this substitution is used +* to match the text between the beginning of the string and the +* position of "delimiter." 
(If "delimiter" is the empty string, then +* this function just matches against this substitution and updates +* everything accordingly.) +* @param upperBound When matching the substitution, it will only +* consider rules with base values lower than this value. +* @return If there's a match, this is the result of composing +* baseValue with the result of matching the substitution. Otherwise, +* this is new Long(0). It's never null. If the result is an integer, +* this will be an instance of Long; otherwise, it's an instance of +* Double. +* +* !!! note {dlf} in point of fact, in the java code the caller always converts +* the result to a double, so we might as well return one. +*/ +double +NFRule::matchToDelimiter(const UnicodeString& text, + int32_t startPos, + double _baseValue, + const UnicodeString& delimiter, + ParsePosition& pp, + const NFSubstitution* sub, + double upperBound) const +{ + // if "delimiter" contains real (i.e., non-ignorable) text, search + // it for "delimiter" beginning at "start". If that succeeds, then + // use "sub"'s doParse() method to match the text before the + // instance of "delimiter" we just found. + if (!allIgnorable(delimiter)) { + ParsePosition tempPP; + Formattable result; + + // use findText() to search for "delimiter". It returns a two- + // element array: element 0 is the position of the match, and + // element 1 is the number of characters that matched + // "delimiter". + int32_t dLen; + int32_t dPos = findText(text, delimiter, startPos, &dLen); + + // if findText() succeeded, isolate the text preceding the + // match, and use "sub" to match that text + while (dPos >= 0) { + UnicodeString subText; + subText.setTo(text, 0, dPos); + if (subText.length() > 0) { + UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound, + formatter->isLenient(), result); + + // if the substitution could match all the text up to + // where we found "delimiter", then this function has + // a successful match. 
Bump the caller's parse position + // to point to the first character after the text + // that matches "delimiter", and return the result + // we got from parsing the substitution. + if (success && tempPP.getIndex() == dPos) { + pp.setIndex(dPos + dLen); + return result.getDouble(); + } + // commented out because ParsePosition doesn't have error index in 1.1.x + // restored for ICU4C port + else { + if (tempPP.getErrorIndex() > 0) { + pp.setErrorIndex(tempPP.getErrorIndex()); + } else { + pp.setErrorIndex(tempPP.getIndex()); + } + } + } + + // if we didn't match the substitution, search for another + // copy of "delimiter" in "text" and repeat the loop if + // we find it + tempPP.setIndex(0); + dPos = findText(text, delimiter, dPos + dLen, &dLen); + } + // if we make it here, this was an unsuccessful match, and we + // leave pp unchanged and return 0 + pp.setIndex(0); + return 0; + + // if "delimiter" is empty, or consists only of ignorable characters + // (i.e., is semantically empty), thwe we obviously can't search + // for "delimiter". Instead, just use "sub" to parse as much of + // "text" as possible. 
+ } else { + ParsePosition tempPP; + Formattable result; + + // try to match the whole string against the substitution + UBool success = sub->doParse(text, tempPP, _baseValue, upperBound, + formatter->isLenient(), result); + if (success && (tempPP.getIndex() != 0 || sub->isNullSubstitution())) { + // if there's a successful match (or it's a null + // substitution), update pp to point to the first + // character we didn't match, and pass the result from + // sub.doParse() on through to the caller + pp.setIndex(tempPP.getIndex()); + return result.getDouble(); + } + // commented out because ParsePosition doesn't have error index in 1.1.x + // restored for ICU4C port + else { + pp.setErrorIndex(tempPP.getErrorIndex()); + } + + // and if we get to here, then nothing matched, so we return + // 0 and leave pp alone + return 0; + } +} + +/** +* Used by stripPrefix() to match characters. If lenient parse mode +* is off, this just calls startsWith(). If lenient parse mode is on, +* this function uses CollationElementIterators to match characters in +* the strings (only primary-order differences are significant in +* determining whether there's a match). +* @param str The string being tested +* @param prefix The text we're hoping to see at the beginning +* of "str" +* @return If "prefix" is found at the beginning of "str", this +* is the number of characters in "str" that were matched (this +* isn't necessarily the same as the length of "prefix" when matching +* text with a collator). If there's no match, this is 0. +*/ +int32_t +NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix) const +{ + // if we're looking for an empty prefix, it obviously matches + // zero characters. Just go ahead and return 0. 
+ if (prefix.length() == 0) { + return 0; + } + + // go through all this grief if we're in lenient-parse mode + if (formatter->isLenient()) { + // get the formatter's collator and use it to create two + // collation element iterators, one over the target string + // and another over the prefix (right now, we'll throw an + // exception if the collator we get back from the formatter + // isn't a RuleBasedCollator, because RuleBasedCollator defines + // the CollationElementIteratoer protocol. Hopefully, this + // will change someday.) + RuleBasedCollator* collator = (RuleBasedCollator*)formatter->getCollator(); + CollationElementIterator* strIter = collator->createCollationElementIterator(str); + CollationElementIterator* prefixIter = collator->createCollationElementIterator(prefix); + + UErrorCode err = U_ZERO_ERROR; + + // match collation elements between the strings + int32_t oStr = strIter->next(err); + int32_t oPrefix = prefixIter->next(err); + + while (oPrefix != CollationElementIterator::NULLORDER) { + // skip over ignorable characters in the target string + while (CollationElementIterator::primaryOrder(oStr) == 0 + && oStr != CollationElementIterator::NULLORDER) { + oStr = strIter->next(err); + } + + // skip over ignorable characters in the prefix + while (CollationElementIterator::primaryOrder(oPrefix) == 0 + && oPrefix != CollationElementIterator::NULLORDER) { + oPrefix = prefixIter->next(err); + } + + // if skipping over ignorables brought us to the end + // of the target string, we didn't match and return 0 + if (oStr == CollationElementIterator::NULLORDER) { + delete prefixIter; + delete strIter; + return 0; + } + + // if skipping over ignorables brought to the end of + // the prefix, we DID match: drop out of the loop + else if (oPrefix == CollationElementIterator::NULLORDER) { + break; + } + + // match collation elements from the two strings + // (considering only primary differences). 
If we + // get a mismatch, dump out and return 0 + if (CollationElementIterator::primaryOrder(oStr) + != CollationElementIterator::primaryOrder(oPrefix)) { + delete prefixIter; + delete strIter; + return 0; + + // otherwise, advance to the next character in each string + // and loop (we drop out of the loop when we exhaust + // collation elements in the prefix) + } else { + oStr = strIter->next(err); + oPrefix = prefixIter->next(err); + } + } + + delete prefixIter; + delete strIter; + + //---------------------------------------------------------------- + // JDK 1.2-specific API call + // return strIter.getOffset(); + //---------------------------------------------------------------- + // JDK 1.1 HACK (take out for 1.2-specific code) + + // if we make it to here, we have a successful match. Now we + // have to find out HOW MANY characters from the target string + // matched the prefix (there isn't necessarily a one-to-one + // mapping between collation elements and characters). + // In JDK 1.2, there's a simple getOffset() call we can use. + // In JDK 1.1, on the other hand, we have to go through some + // ugly contortions. First, use the collator to compare the + // same number of characters from the prefix and target string. + // If they're equal, we're done. + collator->setStrength(Collator::PRIMARY); + if (str.length() >= prefix.length()) { + UnicodeString temp; + temp.setTo(str, 0, prefix.length()); + if (collator->equals(temp, prefix)) { + return prefix.length(); + } + } + + // if they're not equal, then we have to compare successively + // larger and larger substrings of the target string until we + // get to one that matches the prefix. At that point, we know + // how many characters matched the prefix, and we can return. + int32_t p = 1; + while (p <= str.length()) { + UnicodeString temp; + temp.setTo(str, 0, p); + if (collator->equals(temp, prefix)) { + return p; + } else { + ++p; + } + } + + // SHOULD NEVER GET HERE!!! 
+ return 0; + //---------------------------------------------------------------- + + // If lenient parsing is turned off, forget all that crap above. + // Just use String.startsWith() and be done with it. + } else { + if (str.startsWith(prefix)) { + return prefix.length(); + } else { + return 0; + } + } +} + +/** +* Searches a string for another string. If lenient parsing is off, +* this just calls indexOf(). If lenient parsing is on, this function +* uses CollationElementIterator to match characters, and only +* primary-order differences are significant in determining whether +* there's a match. +* @param str The string to search +* @param key The string to search "str" for +* @param startingAt The index into "str" where the search is to +* begin +* @return A two-element array of ints. Element 0 is the position +* of the match, or -1 if there was no match. Element 1 is the +* number of characters in "str" that matched (which isn't necessarily +* the same as the length of "key") +*/ +int32_t +NFRule::findText(const UnicodeString& str, + const UnicodeString& key, + int32_t startingAt, + int32_t* length) const +{ + // if lenient parsing is turned off, this is easy: just call + // String.indexOf() and we're done + if (!formatter->isLenient()) { + *length = key.length(); + return str.indexOf(key, startingAt); + + // but if lenient parsing is turned ON, we've got some work + // ahead of us + } else { + //---------------------------------------------------------------- + // JDK 1.1 HACK (take out of 1.2-specific code) + + // in JDK 1.2, CollationElementIterator provides us with an + // API to map between character offsets and collation elements + // and we can do this by marching through the string comparing + // collation elements. We can't do that in JDK 1.1. 
Insted, + // we have to go through this horrible slow mess: + int32_t p = startingAt; + int32_t keyLen = 0; + + // basically just isolate smaller and smaller substrings of + // the target string (each running to the end of the string, + // and with the first one running from startingAt to the end) + // and then use prefixLength() to see if the search key is at + // the beginning of each substring. This is excruciatingly + // slow, but it will locate the key and tell use how long the + // matching text was. + UnicodeString temp; + while (p < str.length() && keyLen == 0) { + temp.setTo(str, p, str.length() - p); + keyLen = prefixLength(temp, key); + if (keyLen != 0) { + *length = keyLen; + return p; + } + ++p; + } + // if we make it to here, we didn't find it. Return -1 for the + // location. The length should be ignored, but set it to 0, + // which should be "safe" + *length = 0; + return -1; + + //---------------------------------------------------------------- + // JDK 1.2 version of this routine + //RuleBasedCollator collator = (RuleBasedCollator)formatter.getCollator(); + // + //CollationElementIterator strIter = collator.getCollationElementIterator(str); + //CollationElementIterator keyIter = collator.getCollationElementIterator(key); + // + //int keyStart = -1; + // + //str.setOffset(startingAt); + // + //int oStr = strIter.next(); + //int oKey = keyIter.next(); + //while (oKey != CollationElementIterator.NULLORDER) { + // while (oStr != CollationElementIterator.NULLORDER && + // CollationElementIterator.primaryOrder(oStr) == 0) + // oStr = strIter.next(); + // + // while (oKey != CollationElementIterator.NULLORDER && + // CollationElementIterator.primaryOrder(oKey) == 0) + // oKey = keyIter.next(); + // + // if (oStr == CollationElementIterator.NULLORDER) { + // return new int[] { -1, 0 }; + // } + // + // if (oKey == CollationElementIterator.NULLORDER) { + // break; + // } + // + // if (CollationElementIterator.primaryOrder(oStr) == + // 
CollationElementIterator.primaryOrder(oKey)) { + // keyStart = strIter.getOffset(); + // oStr = strIter.next(); + // oKey = keyIter.next(); + // } else { + // if (keyStart != -1) { + // keyStart = -1; + // keyIter.reset(); + // } else { + // oStr = strIter.next(); + // } + // } + //} + // + //if (oKey == CollationElementIterator.NULLORDER) { + // return new int[] { keyStart, strIter.getOffset() - keyStart }; + //} else { + // return new int[] { -1, 0 }; + //} + } +} + +/** +* Checks to see whether a string consists entirely of ignorable +* characters. +* @param str The string to test. +* @return true if the string is empty of consists entirely of +* characters that the number formatter's collator says are +* ignorable at the primary-order level. false otherwise. +*/ +UBool +NFRule::allIgnorable(const UnicodeString& str) const +{ + // if the string is empty, we can just return true + if (str.length() == 0) { + return TRUE; + } + + // if lenient parsing is turned on, walk through the string with + // a collation element iterator and make sure each collation + // element is 0 (ignorable) at the primary level + if (formatter->isLenient()) { + RuleBasedCollator* collator = (RuleBasedCollator*)(formatter->getCollator()); + CollationElementIterator* iter = collator->createCollationElementIterator(str); + + UErrorCode err = U_ZERO_ERROR; + int32_t o = iter->next(err); + while (o != CollationElementIterator::NULLORDER + && CollationElementIterator::primaryOrder(o) == 0) { + o = iter->next(err); + } + + delete iter; + return o == CollationElementIterator::NULLORDER; + } + // if lenient parsing is turned off, there is no such thing as + // an ignorable character: return true only if the string is empty + return FALSE; +} + +U_NAMESPACE_END + + diff --git a/icu4c/source/i18n/nfrule.h b/icu4c/source/i18n/nfrule.h new file mode 100644 index 00000000000..707670965b8 --- /dev/null +++ b/icu4c/source/i18n/nfrule.h @@ -0,0 +1,104 @@ +/* 
+******************************************************************************* +* Copyright (C) 1997-2001, International Business Machines Corporation and others. All Rights Reserved. +******************************************************************************* +*/ + +#ifndef NFRULE_H +#define NFRULE_H + +#include "unicode/utypes.h" +#include "unicode/unistr.h" + +#include "llong.h" + +#include + +U_NAMESPACE_BEGIN + +class FieldPosition; +class Formattable; +class NFRuleList; +class NFRuleSet; +class NFSubstitution; +class ParsePosition; +class RuleBasedNumberFormat; +class UnicodeString; + +class NFRule { + public: + + enum ERuleType { + kNoBase = 0, + kNegativeNumberRule = -1, + kImproperFractionRule = -2, + kProperFractionRule = -3, + kMasterRule = -4, + kOtherRule = -5 + }; + + static void makeRules(UnicodeString& definition, + const NFRuleSet* ruleSet, + const NFRule* predecessor, + const RuleBasedNumberFormat* rbnf, + NFRuleList& ruleList, + UErrorCode& status); + + NFRule(const RuleBasedNumberFormat* rbnf); + ~NFRule(); + + UBool operator==(const NFRule& rhs) const; + UBool operator!=(const NFRule& rhs) const { return !operator==(rhs); } + + ERuleType getType() const { return (ERuleType)(baseValue <= 0 ? 
llong_asInt(baseValue) : kOtherRule); } + void setType(ERuleType ruleType) { baseValue = (int32_t)ruleType; } + + llong getBaseValue() const { return baseValue; } + void setBaseValue(llong value); + + double getDivisor() const { return pow(radix, exponent); } + + void doFormat(llong number, UnicodeString& toAppendTo, int32_t pos) const; + void doFormat(double number, UnicodeString& toAppendTo, int32_t pos) const; + + UBool doParse(const UnicodeString& text, + ParsePosition& pos, + UBool isFractional, + double upperBound, + Formattable& result) const; + + UBool shouldRollBack(double number) const; + + void appendRuleText(UnicodeString& result) const; + + private: + void parseRuleDescriptor(UnicodeString& descriptor, UErrorCode& status); + void extractSubstitutions(const NFRuleSet* ruleSet, const NFRule* predecessor, const RuleBasedNumberFormat* rbnf, UErrorCode& status); + NFSubstitution* extractSubstitution(const NFRuleSet* ruleSet, const NFRule* predecessor, const RuleBasedNumberFormat* rbnf, UErrorCode& status); + + int16_t expectedExponent() const; + int32_t indexOfAny(const UnicodeString* strings[]) const; + double matchToDelimiter(const UnicodeString& text, int32_t startPos, double baseValue, + const UnicodeString& delimiter, ParsePosition& pp, const NFSubstitution* sub, + double upperBound) const; + void stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const; + + int32_t prefixLength(const UnicodeString& str, const UnicodeString& prefix) const; + UBool allIgnorable(const UnicodeString& str) const; + int32_t findText(const UnicodeString& str, const UnicodeString& key, + int32_t startingAt, int32_t* resultCount) const; + +private: + llong baseValue; + int16_t radix; + int16_t exponent; + UnicodeString ruleText; + NFSubstitution* sub1; + NFSubstitution* sub2; + const RuleBasedNumberFormat* formatter; +}; + +U_NAMESPACE_END + +// NFRULE_H +#endif diff --git a/icu4c/source/i18n/nfsubs.cpp b/icu4c/source/i18n/nfsubs.cpp new file mode 
100644 index 00000000000..7ccf3803f5f --- /dev/null +++ b/icu4c/source/i18n/nfsubs.cpp @@ -0,0 +1,892 @@ +#include "nfsubs.h" + +static const UChar gLessThan = 0x003c; +static const UChar gEquals = 0x003d; +static const UChar gGreaterThan = 0x003e; +static const UChar gPercent = 0x0025; +static const UChar gPound = 0x0023; +static const UChar gZero = 0x0030; +static const UChar gSpace = 0x0020; + +static const UnicodeString gEqualsEquals("=="); +static const UnicodeString gGreaterGreaterGreaterThan(">>>"); +static const UnicodeString gGreaterGreaterThan(">>"); + +NFSubstitution* +NFSubstitution::makeSubstitution(int32_t pos, + const NFRule* rule, + const NFRule* predecessor, + const NFRuleSet* ruleSet, + const RuleBasedNumberFormat* formatter, + const UnicodeString& description, + UErrorCode& status) +{ + // if the description is empty, return a NullSubstitution + if (description.length() == 0) { + return new NullSubstitution(pos, ruleSet, formatter, description, status); + } + + switch (description.charAt(0)) { + // if the description begins with '<'... 
+ case gLessThan: + // throw an exception if the rule is a negative number + // rule + if (rule->getBaseValue() == NFRule::kNegativeNumberRule) { + // throw new IllegalArgumentException("<< not allowed in negative-number rule"); + status = U_PARSE_ERROR; + return NULL; + } + + // if the rule is a fraction rule, return an + // IntegralPartSubstitution + else if (rule->getBaseValue() == NFRule::kImproperFractionRule + || rule->getBaseValue() == NFRule::kProperFractionRule + || rule->getBaseValue() == NFRule::kMasterRule) { + return new IntegralPartSubstitution(pos, ruleSet, formatter, description, status); + } + + // if the rule set containing the rule is a fraction + // rule set, return a NumeratorSubstitution + else if (ruleSet->isFractionRuleSet()) { + return new NumeratorSubstitution(pos, llong_asDouble(rule->getBaseValue()), + formatter->getDefaultRuleSet(), formatter, description, status); + } + + // otherwise, return a MultiplierSubstitution + else { + return new MultiplierSubstitution(pos, rule->getDivisor(), ruleSet, + formatter, description, status); + } + + // if the description begins with '>'... 
+ case gGreaterThan: + // if the rule is a negative-number rule, return + // an AbsoluteValueSubstitution + if (rule->getBaseValue() == NFRule::kNegativeNumberRule) { + return new AbsoluteValueSubstitution(pos, ruleSet, formatter, description, status); + } + + // if the rule is a fraction rule, return a + // FractionalPartSubstitution + else if (rule->getBaseValue() == NFRule::kImproperFractionRule + || rule->getBaseValue() == NFRule::kProperFractionRule + || rule->getBaseValue() == NFRule::kMasterRule) { + return new FractionalPartSubstitution(pos, ruleSet, formatter, description, status); + } + + // if the rule set owning the rule is a fraction rule set, + // throw an exception + else if (ruleSet->isFractionRuleSet()) { + // throw new IllegalArgumentException(">> not allowed in fraction rule set"); + status = U_PARSE_ERROR; + return NULL; + } + + // otherwise, return a ModulusSubstitution + else { + return new ModulusSubstitution(pos, rule->getDivisor(), predecessor, + ruleSet, formatter, description, status); + } + + // if the description begins with '=', always return a + // SameValueSubstitution + case gEquals: + return new SameValueSubstitution(pos, ruleSet, formatter, description, status); + + // and if it's anything else, throw an exception + default: + // throw new IllegalArgumentException("Illegal substitution character"); + status = U_PARSE_ERROR; + } + return NULL; +} + +NFSubstitution::NFSubstitution(int32_t _pos, + const NFRuleSet* _ruleSet, + const RuleBasedNumberFormat* formatter, + const UnicodeString& description, + UErrorCode& status) + : pos(_pos), ruleSet(NULL), numberFormat(NULL) +{ + // the description should begin and end with the same character. + // If it doesn't that's a syntax error. 
Otherwise, + // makeSubstitution() was the only thing that needed to know + // about these characters, so strip them off + UnicodeString workingDescription(description); + if (description.length() >= 2 && description.charAt(0) == description.charAt( + description.length() - 1)) { + workingDescription.remove(description.length() - 1, 1); + workingDescription.remove(0, 1); + } + else if (description.length() != 0) { + // throw new IllegalArgumentException("Illegal substitution syntax"); + status = U_PARSE_ERROR; + return; + } + + // if the description was just two paired token characters + // (i.e., "<<" or ">>"), it uses the rule set it belongs to to + // format its result + if (workingDescription.length() == 0) { + this->ruleSet = _ruleSet; + } + + // if the description contains a rule set name, that's the rule + // set we use to format the result: get a reference to the + // names rule set + else if (workingDescription.charAt(0) == gPercent) { + this->ruleSet = formatter->findRuleSet(workingDescription, status); + } + + // if the description begins with 0 or #, treat it as a + // DecimalFormat pattern, and initialize a DecimalFormat with + // that pattern (then set it to use the DecimalFormatSymbols + // belonging to our formatter) + else if (workingDescription.charAt(0) == gPound || workingDescription.charAt(0) ==gZero) { + this->numberFormat = new DecimalFormat(workingDescription, *(formatter->getDecimalFormatSymbols()), status); + // this->numberFormat->setDecimalFormatSymbols(formatter->getDecimalFormatSymbols()); + } + + // if the description is ">>>", this substitution bypasses the + // usual rule-search process and always uses the rule that precedes + // it in its own rule set's rule list (this is used for place-value + // notations: formats where you want to see a particular part of + // a number even when it's 0) + else if (workingDescription.charAt(0) == gGreaterThan) { + // this causes problems when >>> is used in a frationalPartSubstitution + // 
this->ruleSet = NULL; + this->ruleSet = _ruleSet; + this->numberFormat = NULL; + } + + // and of the description is none of these things, it's a syntax error + else { + // throw new IllegalArgumentException("Illegal substitution syntax"); + status = U_PARSE_ERROR; + } + } + +NFSubstitution::~NFSubstitution() +{ + // cast away const + delete (NumberFormat*)numberFormat; numberFormat = NULL; +} + + /** + * Set's the substitution's divisor. Used by NFRule.setBaseValue(). + * A no-op for all substitutions except multiplier and modulus + * substitutions. + * @param radix The radix of the divisor + * @param exponent The exponent of the divisor + */ +void +NFSubstitution::setDivisor(int32_t radix, int32_t exponent) { + // a no-op for all substitutions except multiplier and modulus substitutions +} + + + //----------------------------------------------------------------------- + // boilerplate + //----------------------------------------------------------------------- + +char NFSubstitution::fgClassID; + +UClassID +NFSubstitution::getDynamicClassID() const { + return getStaticClassID(); +} + + + + /** + * Compares two substitutions for equality + * @param The substitution to compare this one to + * @return true if the two substitutions are functionally equivalent + */ +UBool +NFSubstitution::operator==(const NFSubstitution& rhs) const +{ + // compare class and all of the fields all substitutions have + // in common + // this should be called by subclasses before their own equality tests + return getDynamicClassID() == rhs.getDynamicClassID() + && pos == rhs.pos + && ruleSet == rhs.ruleSet + && *numberFormat == *rhs.numberFormat; +} + + /** + * Returns a textual description of the substitution + * @return A textual description of the substitution. This might + * not be identical to the description it was created from, but + * it'll produce the same result. 
+ */ +void +NFSubstitution::toString(UnicodeString& text) const { + // use tokenChar() to get the character at the beginning and + // end of the substitutin token. In between them will go + // either the name of the rule set it uses, or the pattern of + // the DecimalFormat it uses + text.remove(); + text.append(tokenChar()); + + UnicodeString temp; + if (ruleSet != NULL) { + ruleSet->getName(temp); + } else { + numberFormat->toPattern(temp); + } + text.append(temp); + text.append(tokenChar()); +} + + //----------------------------------------------------------------------- + // formatting + //----------------------------------------------------------------------- + + /** + * Performs a mathematical operation on the number, formats it using + * either ruleSet or decimalFormat, and inserts the result into + * toInsertInto. + * @param number The number being formatted. + * @param toInsertInto The string we insert the result into + * @param pos The position in toInsertInto where the owning rule's + * rule text begins (this value is added to this substitution's + * position to determine exactly where to insert the new text) + */ +void +NFSubstitution::doSubstitution(llong number, UnicodeString& toInsertInto, int32_t _pos) const +{ + if (ruleSet != NULL) { + // perform a transformation on the number that is dependent + // on the type of substitution this is, then just call its + // rule set's format() method to format the result + llong numberToFormat = transformNumber(number); + + ruleSet->format(numberToFormat, toInsertInto, _pos + this->pos); + } else { + // or perform the transformation on the number (preserving + // the result's fractional part if the formatter it set + // to show it), then use that formatter's format() method + // to format the result + double numberToFormat = transformNumber(llong_asDouble(number)); + if (numberFormat->getMaximumFractionDigits() == 0) { + numberToFormat = floor(numberToFormat); + } + + UnicodeString temp; + 
numberFormat->format(numberToFormat, temp); + toInsertInto.insert(_pos + this->pos, temp); + } +} + + /** + * Performs a mathematical operation on the number, formats it using + * either ruleSet or decimalFormat, and inserts the result into + * toInsertInto. + * @param number The number being formatted. + * @param toInsertInto The string we insert the result into + * @param pos The position in toInsertInto where the owning rule's + * rule text begins (this value is added to this substitution's + * position to determine exactly where to insert the new text) + */ +void +NFSubstitution::doSubstitution(double number, UnicodeString& toInsertInto, int32_t _pos) const { + // perform a transformation on the number being formatted that + // is dependent on the type of substitution this is + double numberToFormat = transformNumber(number); + + // if the result is an integer, from here on out we work in integer + // space (saving time and memory and preserving accuracy) + if (numberToFormat == floor(numberToFormat) && ruleSet != NULL) { + ruleSet->format(llong(numberToFormat), toInsertInto, _pos + this->pos); + + // if the result isn't an integer, then call either our rule set's + // format() method or our DecimalFormat's format() method to + // format the result + } else { + if (ruleSet != NULL) { + ruleSet->format(numberToFormat, toInsertInto, _pos + this->pos); + } else { + UnicodeString temp; + numberFormat->format(numberToFormat, temp); + toInsertInto.insert(_pos + this->pos, temp); + } + } + } + + + //----------------------------------------------------------------------- + // parsing + //----------------------------------------------------------------------- + + /** + * Parses a string using the rule set or DecimalFormat belonging + * to this substitution. If there's a match, a mathematical + * operation (the inverse of the one used in formatting) is + * performed on the result of the parse and the value passed in + * and returned as the result. 
The parse position is updated to + * point to the first unmatched character in the string. + * @param text The string to parse + * @param parsePosition On entry, ignored, but assumed to be 0. + * On exit, this is updated to point to the first unmatched + * character (or 0 if the substitution didn't match) + * @param baseValue A partial parse result that should be + * combined with the result of this parse + * @param upperBound When searching the rule set for a rule + * matching the string passed in, only rules with base values + * lower than this are considered + * @param lenientParse If true and matching against rules fails, + * the substitution will also try matching the text against + * numerals using a default-costructed NumberFormat. If false, + * no extra work is done. (This value is false whenever the + * formatter isn't in lenient-parse mode, but is also false + * under some conditions even when the formatter _is_ in + * lenient-parse mode.) + * @return If there's a match, this is the result of composing + * baseValue with whatever was returned from matching the + * characters. This will be either a Long or a Double. If there's + * no match this is new Long(0) (not null), and parsePosition + * is left unchanged. + */ +UBool +NFSubstitution::doParse(const UnicodeString& text, + ParsePosition& parsePosition, + double baseValue, + double upperBound, + UBool lenientParse, + Formattable& result) const +{ +#ifdef RBNF_DEBUG + fprintf(stderr, " %x bv: %g ub: %g\n", this, baseValue, upperBound); +#endif + // figure out the highest base value a rule can have and match + // the text being parsed (this varies according to the type of + // substitutions: multiplier, modulus, and numerator substitutions + // restrict the search to rules with base values lower than their + // own; same-value substitutions leave the upper bound wherever + // it was, and the others allow any rule to match + upperBound = calcUpperBound(upperBound); + + // use our rule set to parse the text. 
If that fails and + // lenient parsing is enabled (this is always false if the + // formatter's lenient-parsing mode is off, but it may also + // be false even when the formatter's lenient-parse mode is + // on), then also try parsing the text using a default- + // constructed NumberFormat + if (ruleSet != NULL) { + ruleSet->parse(text, parsePosition, upperBound, result); + if (lenientParse && !ruleSet->isFractionRuleSet() && parsePosition.getIndex() == 0) { + UErrorCode status = U_ZERO_ERROR; + NumberFormat* fmt = NumberFormat::createInstance(status); + if (U_SUCCESS(status)) { + fmt->parse(text, result, parsePosition); + } + delete fmt; + } + + // ...or use our DecimalFormat to parse the text + } else { + numberFormat->parse(text, result, parsePosition); + } + + // if the parse was successful, we've already advanced the caller's + // parse position (this is the one function that doesn't have one + // of its own). Derive a parse result and return it as a Long, + // if possible, or a Double + if (parsePosition.getIndex() != 0) { + double tempResult = (result.getType() == Formattable::kLong) ? + (double)result.getLong() : + result.getDouble(); + + // composeRuleValue() produces a full parse result from + // the partial parse result passed to this function from + // the caller (this is either the owning rule's base value + // or the partial result obtained from composing the + // owning rule's base value with its other substitution's + // parse result) and the partial parse result obtained by + // matching the substitution (which will be the same value + // the caller would get by parsing just this part of the + // text with RuleBasedNumberFormat.parse() ). 
How the two + // values are used to derive the full parse result depends + // on the types of substitutions: For a regular rule, the + // ultimate result is its multiplier substitution's result + // times the rule's divisor (or the rule's base value) plus + // the modulus substitution's result (which will actually + // supersede part of the rule's base value). For a negative- + // number rule, the result is the negative of its substitution's + // result. For a fraction rule, it's the sum of its two + // substitution results. For a rule in a fraction rule set, + // it's the numerator substitution's result divided by + // the rule's base value. Results from same-value substitutions + // propagate back upard, and null substitutions don't affect + // the result. + tempResult = composeRuleValue(tempResult, baseValue); + result.setDouble(tempResult); + return TRUE; + // if the parse was UNsuccessful, return 0 + } else { + result.setLong(0); + return FALSE; + } + } + + +UBool +NFSubstitution::isNullSubstitution() const { + return FALSE; +} + + /** + * Returns true if this is a modulus substitution. (We didn't do this + * with instanceof partially because it causes source files to + * proliferate and partially because we have to port this to C++.) + * @return true if this object is an instance of ModulusSubstitution + */ +UBool +NFSubstitution::isModulusSubstitution() const { + return FALSE; +} + +//=================================================================== +// SameValueSubstitution +//=================================================================== + +/** + * A substitution that passes the value passed to it through unchanged. + * Represented by == in rule descriptions. 
+ */ +SameValueSubstitution::SameValueSubstitution(int32_t _pos, + const NFRuleSet* _ruleSet, + const RuleBasedNumberFormat* formatter, + const UnicodeString& description, + UErrorCode& status) +: NFSubstitution(_pos, _ruleSet, formatter, description, status) + +{ + if (description == gEqualsEquals) { + // throw new IllegalArgumentException("== is not a legal token"); + status = U_PARSE_ERROR; + } +} + +char SameValueSubstitution::fgClassID; + +UClassID +SameValueSubstitution::getDynamicClassID() const { + return getStaticClassID(); +} + + +//=================================================================== +// MultiplierSubstitution +//=================================================================== + +char MultiplierSubstitution::fgClassID; + +UClassID +MultiplierSubstitution::getDynamicClassID() const { + return getStaticClassID(); +} + +UBool MultiplierSubstitution::operator==(const NFSubstitution& rhs) const +{ + return NFSubstitution::operator==(rhs) && + divisor == ((const MultiplierSubstitution*)&rhs)->divisor; +} + + +//=================================================================== +// ModulusSubstitution +//=================================================================== + +/** + * A substitution that divides the number being formatted by the its rule's + * divisor and formats the remainder. Represented by ">>" in a + * regular rule. 
+ */ +ModulusSubstitution::ModulusSubstitution(int32_t _pos, + double _divisor, + const NFRule* predecessor, + const NFRuleSet* _ruleSet, + const RuleBasedNumberFormat* formatter, + const UnicodeString& description, + UErrorCode& status) + : NFSubstitution(_pos, _ruleSet, formatter, description, status) + , divisor(_divisor) + , ruleToUse(NULL) +{ + ldivisor = _divisor; + + // the owning rule's divisor controls the behavior of this + // substitution: rather than keeping a backpointer to the rule, + // we keep a copy of the divisor + + if (description == gGreaterGreaterGreaterThan) { + // the >>> token doesn't alter how this substituion calculates the + // values it uses for formatting and parsing, but it changes + // what's done with that value after it's obtained: >>> short- + // circuits the rule-search process and goes straight to the + // specified rule to format the substitution value + ruleToUse = predecessor; + } +} + +char ModulusSubstitution::fgClassID; + +UClassID +ModulusSubstitution::getDynamicClassID() const { + return getStaticClassID(); +} + +UBool ModulusSubstitution::operator==(const NFSubstitution& rhs) const +{ + return NFSubstitution::operator==(rhs) && + divisor == ((const ModulusSubstitution*)&rhs)->divisor && + ruleToUse == ((const ModulusSubstitution*)&rhs)->ruleToUse; +} + + //----------------------------------------------------------------------- + // formatting + //----------------------------------------------------------------------- + + /** + * If this is a >>> substitution, use ruleToUse to fill in + * the substitution. Otherwise, just use the superclass function. 
+ * @param number The number being formatted + * @toInsertInto The string to insert the result of this substitution + * into + * @param pos The position of the rule text in toInsertInto + */ + void + ModulusSubstitution::doSubstitution(llong number, UnicodeString& toInsertInto, int32_t _pos) const + { + // if this isn't a >>> substitution, just use the inherited version + // of this function (which uses either a rule set or a DecimalFormat + // to format its substitution value) + if (ruleToUse == NULL) { + NFSubstitution::doSubstitution(number, toInsertInto, _pos); + + // a >>> substitution goes straight to a particular rule to + // format the substitution value + } else { + llong numberToFormat = transformNumber(number); + ruleToUse->doFormat(numberToFormat, toInsertInto, _pos + getPos()); + } + } + + /** + * If this is a >>> substitution, use ruleToUse to fill in + * the substitution. Otherwise, just use the superclass function. + * @param number The number being formatted + * @toInsertInto The string to insert the result of this substitution + * into + * @param pos The position of the rule text in toInsertInto + */ + void + ModulusSubstitution::doSubstitution(double number, UnicodeString& toInsertInto, int32_t _pos) const + { + // if this isn't a >>> substitution, just use the inherited version + // of this function (which uses either a rule set or a DecimalFormat + // to format its substitution value) + if (ruleToUse == NULL) { + NFSubstitution::doSubstitution(number, toInsertInto, _pos); + + // a >>> substitution goes straight to a particular rule to + // format the substitution value + } else { + double numberToFormat = transformNumber(number); + + ruleToUse->doFormat(numberToFormat, toInsertInto, _pos + getPos()); + } + } + + + //----------------------------------------------------------------------- + // parsing + //----------------------------------------------------------------------- + + /** + * If this is a >>> substitution, match only against 
ruleToUse. + * Otherwise, use the superclass function. + * @param text The string to parse + * @param parsePosition Ignored on entry, updated on exit to point to + * the first unmatched character. + * @param baseValue The partial parse result prior to calling this + * routine. + */ +UBool +ModulusSubstitution::doParse(const UnicodeString& text, + ParsePosition& parsePosition, + double baseValue, + double upperBound, + UBool lenientParse, + Formattable& result) const +{ + // if this isn't a >>> substitution, we can just use the + // inherited parse() routine to do the parsing + if (ruleToUse == NULL) { + return NFSubstitution::doParse(text, parsePosition, baseValue, upperBound, lenientParse, result); + + // but if it IS a >>> substitution, we have to do it here: we + // use the specific rule's doParse() method, and then we have to + // do some of the other work of NFRuleSet.parse() + } else { + ruleToUse->doParse(text, parsePosition, FALSE, upperBound, result); + + if (parsePosition.getIndex() != 0) { + double tempResult = result.getDouble(); + tempResult = composeRuleValue(tempResult, baseValue); + result.setDouble(tempResult); + } + + return TRUE; + } +} + + +//=================================================================== +// IntegralPartSubstitution +//=================================================================== + +char IntegralPartSubstitution::fgClassID; + +UClassID +IntegralPartSubstitution::getDynamicClassID() const { + return getStaticClassID(); +} + + +//=================================================================== +// FractionalPartSubstitution +//=================================================================== + + + /** + * Constructs a FractionalPartSubstitution. This object keeps a flag + * telling whether it should format by digits or not. In addition, + * it marks the rule set it calls (if any) as a fraction rule set. 
+ */ +FractionalPartSubstitution::FractionalPartSubstitution(int32_t _pos, + const NFRuleSet* _ruleSet, + const RuleBasedNumberFormat* formatter, + const UnicodeString& description, + UErrorCode& status) + : NFSubstitution(_pos, _ruleSet, formatter, description, status) + , byDigits(FALSE) + , useSpaces(TRUE) + +{ + // akk, ruleSet can change in superclass constructor + if (description == gGreaterGreaterThan || + description == gGreaterGreaterGreaterThan || + _ruleSet == getRuleSet()) { + byDigits = TRUE; + if (description == gGreaterGreaterGreaterThan) { + useSpaces = FALSE; + } + } else { + // cast away const + ((NFRuleSet*)getRuleSet())->makeIntoFractionRuleSet(); + } + + // TODO: Thai doesn't use spaces, so spelling out decimals with + // spaces between the words for each digit is incorrect. + // The rules don't seem to accomodate this, at least I can't figure + // out how to handle it using the rules. Need to provide better + // control over fractional part formatting. + // For now, just check if locale uses the Thai language. + + // useSpaces = strcmp(formatter->locale.getLanguage(), "th") != 0; + } + + //----------------------------------------------------------------------- + // formatting + //----------------------------------------------------------------------- + + /** + * If in "by digits" mode, fills in the substitution one decimal digit + * at a time using the rule set containing this substitution. + * Otherwise, uses the superclass function. 
+ * @param number The number being formatted + * @param toInsertInto The string to insert the result of formatting + * the substitution into + * @param pos The position of the owning rule's rule text in + * toInsertInto + */ +void +FractionalPartSubstitution::doSubstitution(double number, UnicodeString& toInsertInto, int32_t _pos) const +{ + // if we're not in "byDigits" mode, just use the inherited + // doSubstitution() routine + if (!byDigits) { + NFSubstitution::doSubstitution(number, toInsertInto, _pos); + + // if we're in "byDigits" mode, transform the value into an integer + // by moving the decimal point eight places to the right and + // pulling digits off the right one at a time, formatting each digit + // as an integer using this substitution's owning rule set + // (this is slower, but more accurate, than doing it from the + // other end) + } else { + int32_t numberToFormat = (int32_t)round(transformNumber(number) * pow(10, kMaxDecimalDigits)); + // this flag keeps us from formatting trailing zeros. It starts + // out false because we're pulling from the right, and switches + // to true the first time we encounter a non-zero digit + UBool doZeros = FALSE; + for (int32_t i = 0; i < kMaxDecimalDigits; i++) { + int32_t digit = numberToFormat % 10; + if (digit != 0 || doZeros) { + if (doZeros && useSpaces) { + toInsertInto.insert(_pos + getPos(), gSpace); + } + doZeros = TRUE; + getRuleSet()->format(digit, toInsertInto, _pos + getPos()); + } + numberToFormat /= 10; + } + } + } + + //----------------------------------------------------------------------- + // parsing + //----------------------------------------------------------------------- + + /** + * If in "by digits" mode, parses the string as if it were a string + * of individual digits; otherwise, uses the superclass function. 
+ * @param text The string to parse + * @param parsePosition Ignored on entry, but updated on exit to point + * to the first unmatched character + * @param baseValue The partial parse result prior to entering this + * function + * @param upperBound Only consider rules with base values lower than + * this when filling in the substitution + * @param lenientParse If true, try matching the text as numerals if + * matching as words doesn't work + * @return If the match was successful, the current partial parse + * result; otherwise new Long(0). The result is either a Long or + * a Double. + */ +UBool +FractionalPartSubstitution::doParse(const UnicodeString& text, + ParsePosition& parsePosition, + double baseValue, + double upperBound, + UBool lenientParse, + Formattable& resVal) const +{ + // if we're not in byDigits mode, we can just use the inherited + // doParse() + if (!byDigits) { + return NFSubstitution::doParse(text, parsePosition, baseValue, 0, lenientParse, resVal); + + // if we ARE in byDigits mode, parse the text one digit at a time + // using this substitution's owning rule set (we do this by setting + // upperBound to 10 when calling doParse() ) until we reach + // nonmatching text + } else { + UnicodeString workText(text); + ParsePosition workPos(1); + double result = 0; + int32_t digit; + double p10 = 0.1; + + NumberFormat* fmt = NULL; + while (workText.length() > 0 && workPos.getIndex() != 0) { + workPos.setIndex(0); + Formattable temp; + getRuleSet()->parse(workText, workPos, 10, temp); + digit = temp.getType() == Formattable::kLong ? 
+ temp.getLong() : + (int32_t)temp.getDouble(); + + if (lenientParse && workPos.getIndex() == 0) { + if (!fmt) { + UErrorCode status = U_ZERO_ERROR; + fmt = NumberFormat::createInstance(status); + if (U_FAILURE(status)) { + delete fmt; + fmt = NULL; + } + } + if (fmt) { + fmt->parse(workText, temp, workPos); + digit = temp.getLong(); + } + } + + if (workPos.getIndex() != 0) { + result += digit * p10; + p10 /= 10; + parsePosition.setIndex(parsePosition.getIndex() + workPos.getIndex()); + workText.removeBetween(0, workPos.getIndex()); + while (workText.length() > 0 && workText.charAt(0) == gSpace) { + workText.removeBetween(0, 1); + parsePosition.setIndex(parsePosition.getIndex() + 1); + } + } + } + delete fmt; + + result = composeRuleValue(result, baseValue); + resVal.setDouble(result); + return TRUE; + } + } + +UBool +FractionalPartSubstitution::operator==(const NFSubstitution& rhs) const +{ + return NFSubstitution::operator==(rhs) && + ((const FractionalPartSubstitution*)&rhs)->byDigits == byDigits; +} + +char FractionalPartSubstitution::fgClassID; + +UClassID +FractionalPartSubstitution::getDynamicClassID() const { + return getStaticClassID(); +} + + +//=================================================================== +// AbsoluteValueSubstitution +//=================================================================== + +char AbsoluteValueSubstitution::fgClassID; + +UClassID +AbsoluteValueSubstitution::getDynamicClassID() const { + return getStaticClassID(); +} + +//=================================================================== +// NumeratorSubstitution +//=================================================================== + +UBool +NumeratorSubstitution::operator==(const NFSubstitution& rhs) const +{ + return NFSubstitution::operator==(rhs) && + denominator == ((const NumeratorSubstitution*)&rhs)->denominator; +} + +char NumeratorSubstitution::fgClassID; + +UClassID +NumeratorSubstitution::getDynamicClassID() const { + return getStaticClassID(); +} + 
+//=================================================================== +// NullSubstitution +//=================================================================== + +char NullSubstitution::fgClassID; + +UClassID +NullSubstitution::getDynamicClassID() const { + return getStaticClassID(); +} + diff --git a/icu4c/source/i18n/nfsubs.h b/icu4c/source/i18n/nfsubs.h new file mode 100644 index 00000000000..967df574cc4 --- /dev/null +++ b/icu4c/source/i18n/nfsubs.h @@ -0,0 +1,498 @@ +/* +******************************************************************************* +* Copyright (C) 1997-2001, International Business Machines Corporation and others. All Rights Reserved. +******************************************************************************* +*/ + +#ifndef NFSUBS_H +#define NFSUBS_H + +#include "unicode/utypes.h" +#include "unicode/decimfmt.h" +#include "nfrs.h" +#include "nfrule.h" +#include "llong.h" + +U_NAMESPACE_BEGIN + +static double MAX_DOUBLE = 1.7976931348623157e+308; +static double java_fmod(double n, double d) +{ + // c doesn't define '%' for floating point, but java does. + // from the java language spec 15.17: + // "In the remaining cases, where neither an infinity, nor a zero, + // nor NaN is involved, the floating-point remainder r from the + // division of a dividend n by a divisor d is defined by the + // mathematical relation r = n - (d . q) where q is an integer + // that is negative only if n/d is negative and positive only if + // n/d is positive, and whose magnitude is as large as possible + // without exceeding the magnitude of the true mathematical + // quotient of n and d." + // + // I'm not sure if fmod (from what header?) has the same implemenation + + double q = n/d; + q = q < 0 ? 
-floor(-q) : floor(q); + return n - d * q; +} + +static double round(double n) +{ + return floor(n + .5); +} + +class NFSubstitution { + int32_t pos; + const NFRuleSet* ruleSet; + const DecimalFormat* numberFormat; + + protected: + NFSubstitution(int32_t pos, + const NFRuleSet* ruleSet, + const RuleBasedNumberFormat* rbnf, + const UnicodeString& description, + UErrorCode& status); + + const NFRuleSet* getRuleSet() const { return ruleSet; } + const DecimalFormat* getNumberFormat() const { return numberFormat; } + + public: + static NFSubstitution* makeSubstitution(int32_t pos, + const NFRule* rule, + const NFRule* predecessor, + const NFRuleSet* ruleSet, + const RuleBasedNumberFormat* rbnf, + const UnicodeString& description, + UErrorCode& status); + + virtual ~NFSubstitution(); + + virtual UBool operator==(const NFSubstitution& rhs) const; + UBool operator!=(const NFSubstitution& rhs) const { return !operator==(rhs); } + + /** + * Sets the substitution's divisor. Used by NFRule.setBaseValue(). + * A no-op for all substitutions except multiplier and modulus + * substitutions. + * @param radix The radix of the divisor + * @param exponent The exponent of the divisor + */ + virtual void setDivisor(int32_t radix, int32_t exponent); + + /** + * Replaces result with the string describing the substitution. + */ + virtual void toString(UnicodeString& result) const; + + //----------------------------------------------------------------------- + // formatting + //----------------------------------------------------------------------- + + /** + * Performs a mathematical operation on the number, formats it using + * either ruleSet or decimalFormat, and inserts the result into + * toInsertInto. + * @param number The number being formatted. 
+ * @param toInsertInto The string we insert the result into + * @param pos The position in toInsertInto where the owning rule's + * rule text begins (this value is added to this substitution's + * position to determine exactly where to insert the new text) + */ + virtual void doSubstitution(llong number, UnicodeString& toInsertInto, int32_t pos) const; + virtual void doSubstitution(double number, UnicodeString& toInsertInto, int32_t pos) const; + + protected: + /** + * Subclasses override this function to perform some kind of + * mathematical operation on the number. The result of this operation + * is formatted using the rule set or DecimalFormat that this + * substitution refers to, and the result is inserted into the result + * string. + * @param The number being formatted + * @return The result of performing the opreration on the number + */ + virtual llong transformNumber(llong number) const = 0; + virtual double transformNumber(double number) const = 0; + + public: + //----------------------------------------------------------------------- + // parsing + //----------------------------------------------------------------------- + + /** + * Parses a string using the rule set or DecimalFormat belonging + * to this substitution. If there's a match, a mathematical + * operation (the inverse of the one used in formatting) is + * performed on the result of the parse and the value passed in + * and returned as the result. The parse position is updated to + * point to the first unmatched character in the string. + * @param text The string to parse + * @param parsePosition On entry, ignored, but assumed to be 0. 
+ * On exit, this is updated to point to the first unmatched + * character (or 0 if the substitution didn't match) + * @param baseValue A partial parse result that should be + * combined with the result of this parse + * @param upperBound When searching the rule set for a rule + * matching the string passed in, only rules with base values + * lower than this are considered + * @param lenientParse If true and matching against rules fails, + * the substitution will also try matching the text against + * numerals using a default-costructed NumberFormat. If false, + * no extra work is done. (This value is false whenever the + * formatter isn't in lenient-parse mode, but is also false + * under some conditions even when the formatter _is_ in + * lenient-parse mode.) + * @return If there's a match, this is the result of composing + * baseValue with whatever was returned from matching the + * characters. This will be either a Long or a Double. If there's + * no match this is new Long(0) (not null), and parsePosition + * is left unchanged. + */ + virtual UBool doParse(const UnicodeString& text, + ParsePosition& parsePosition, + double baseValue, + double upperBound, + UBool lenientParse, + Formattable& result) const; + + /** + * Derives a new value from the two values passed in. The two values + * are typically either the base values of two rules (the one containing + * the substitution and the one matching the substitution) or partial + * parse results derived in some other way. The operation is generally + * the inverse of the operation performed by transformNumber(). 
+ * @param newRuleValue The value produced by matching this substitution + * @param oldRuleValue The value that was passed to the substitution + * by the rule that owns it + * @return A third value derived from the other two, representing a + * partial parse result + */ + virtual double composeRuleValue(double newRuleValue, double oldRuleValue) const = 0; + + /** + * Calculates an upper bound when searching for a rule that matches + * this substitution. Rules with base values greater than or equal + * to upperBound are not considered. + * @param oldUpperBound The current upper-bound setting. The new + * upper bound can't be any higher. + */ + virtual double calcUpperBound(double oldUpperBound) const = 0; + + //----------------------------------------------------------------------- + // simple accessors + //----------------------------------------------------------------------- + + /** + * Returns the substitution's position in the rule that owns it. + * @return The substitution's position in the rule that owns it. + */ + + int32_t getPos() const { return pos; } + + /** + * Returns the character used in the textual representation of + * substitutions of this type. Used by toString(). + * @return This substitution's token character. + */ + virtual UChar tokenChar() const = 0; + + /** + * Returns true if this is a null substitution. (We didn't do this + * with instanceof partially because it causes source files to + * proliferate and partially because we have to port this to C++.) + * @return true if this object is an instance of NullSubstitution + */ + virtual UBool isNullSubstitution() const; + + /** + * Returns true if this is a modulus substitution. (We didn't do this + * with instanceof partially because it causes source files to + * proliferate and partially because we have to port this to C++.) 
+ * @return true if this object is an instance of ModulusSubstitution + */ + virtual UBool isModulusSubstitution() const; + + private: + static char fgClassID; + + public: + static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; } + virtual UClassID getDynamicClassID(void) const; +}; + +class SameValueSubstitution : public NFSubstitution { + public: + SameValueSubstitution(int32_t pos, + const NFRuleSet* ruleset, + const RuleBasedNumberFormat* formatter, + const UnicodeString& description, + UErrorCode& status); + + llong transformNumber(llong number) const { return number; } + double transformNumber(double number) const { return number; } + double composeRuleValue(double newRuleValue, double oldRuleValue) const { return newRuleValue; } + double calcUpperBound(double oldUpperBound) const { return oldUpperBound; } + UChar tokenChar() const { return (UChar)0x003d; } // '=' + private: + static char fgClassID; + + public: + static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; } + virtual UClassID getDynamicClassID(void) const; +}; + +class MultiplierSubstitution : public NFSubstitution { + double divisor; + llong ldivisor; + + public: + MultiplierSubstitution(int32_t _pos, + double _divisor, + const NFRuleSet* _ruleSet, + const RuleBasedNumberFormat* formatter, + const UnicodeString& description, + UErrorCode& status) + : NFSubstitution(_pos, _ruleSet, formatter, description, status), divisor(_divisor) + { + ldivisor = _divisor; + } + + void setDivisor(int32_t radix, int32_t exponent) { + divisor = pow(radix, exponent); + ldivisor = divisor; + } + + UBool operator==(const NFSubstitution& rhs) const; + + llong transformNumber(llong number) const { + return number / ldivisor; + } + + double transformNumber(double number) const { + return floor(number / divisor); + } + + double composeRuleValue(double newRuleValue, double oldRuleValue) const { + return newRuleValue * divisor; + } + + double calcUpperBound(double oldUpperBound) const { 
return divisor; } + + UChar tokenChar() const { return (UChar)0x003c; } // '<' + private: + static char fgClassID; + + public: + static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; } + virtual UClassID getDynamicClassID(void) const; +}; + +class ModulusSubstitution : public NFSubstitution { + double divisor; + llong ldivisor; + const NFRule* ruleToUse; + public: + ModulusSubstitution(int32_t pos, + double _divisor, + const NFRule* rulePredecessor, + const NFRuleSet* ruleSet, + const RuleBasedNumberFormat* formatter, + const UnicodeString& description, + UErrorCode& status); + + void setDivisor(int32_t radix, int32_t exponent) { + divisor = pow(radix, exponent); + ldivisor = divisor; + } + + UBool operator==(const NFSubstitution& rhs) const; + + void doSubstitution(llong number, UnicodeString& toInsertInto, int32_t pos) const; + void doSubstitution(double number, UnicodeString& toInsertInto, int32_t pos) const; + + llong transformNumber(llong number) const { return number % ldivisor; } + double transformNumber(double number) const { return java_fmod(number, divisor); } + + UBool doParse(const UnicodeString& text, + ParsePosition& parsePosition, + double baseValue, + double upperBound, + UBool lenientParse, + Formattable& result) const; + + double composeRuleValue(double newRuleValue, double oldRuleValue) const { + return oldRuleValue - java_fmod(oldRuleValue, divisor) + newRuleValue; + } + + double calcUpperBound(double oldUpperBound) const { return divisor; } + + UBool isModulusSubstitution() const { return TRUE; } + + UChar tokenChar() const { return (UChar)0x003e; } // '>' + private: + static char fgClassID; + + public: + static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; } + virtual UClassID getDynamicClassID(void) const; +}; + +class IntegralPartSubstitution : public NFSubstitution { + public: + IntegralPartSubstitution(int32_t _pos, + const NFRuleSet* _ruleSet, + const RuleBasedNumberFormat* formatter, + const 
UnicodeString& description, + UErrorCode& status) + : NFSubstitution(_pos, _ruleSet, formatter, description, status) {} + + llong transformNumber(llong number) const { return number; } + double transformNumber(double number) const { return floor(number); } + double composeRuleValue(double newRuleValue, double oldRuleValue) const { return newRuleValue + oldRuleValue; } + double calcUpperBound(double oldUpperBound) const { return MAX_DOUBLE; } + UChar tokenChar() const { return (UChar)0x003c; } // '<' + private: + static char fgClassID; + + public: + static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; } + virtual UClassID getDynamicClassID(void) const; +}; + +class FractionalPartSubstitution : public NFSubstitution { + UBool byDigits; + UBool useSpaces; + enum { kMaxDecimalDigits = 8 }; + public: + FractionalPartSubstitution(int32_t pos, + const NFRuleSet* ruleSet, + const RuleBasedNumberFormat* formatter, + const UnicodeString& description, + UErrorCode& status); + + UBool operator==(const NFSubstitution& rhs) const; + + void doSubstitution(double number, UnicodeString& toInsertInto, int32_t pos) const; + llong transformNumber(llong number) const { return llong::kZero; } + double transformNumber(double number) const { return number - floor(number); } + + UBool doParse(const UnicodeString& text, + ParsePosition& parsePosition, + double baseValue, + double upperBound, + UBool lenientParse, + Formattable& result) const; + + double composeRuleValue(double newRuleValue, double oldRuleValue) const { return newRuleValue + oldRuleValue; } + double calcUpperBound(double oldUpperBound) const { return 0; } + UChar tokenChar() const { return (UChar)0x003e; } // '>' + private: + static char fgClassID; + + public: + static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; } + virtual UClassID getDynamicClassID(void) const; +}; + +class AbsoluteValueSubstitution : public NFSubstitution { + public: + AbsoluteValueSubstitution(int32_t _pos, + const 
NFRuleSet* _ruleSet, + const RuleBasedNumberFormat* formatter, + const UnicodeString& description, + UErrorCode& status) + : NFSubstitution(_pos, _ruleSet, formatter, description, status) {} + + llong transformNumber(llong number) const { return llong_abs(number); } + double transformNumber(double number) const { return fabs(number); } + double composeRuleValue(double newRuleValue, double oldRuleValue) const { return -newRuleValue; } + double calcUpperBound(double oldUpperBound) const { return MAX_DOUBLE; } + UChar tokenChar() const { return (UChar)0x003e; } // '>' + private: + static char fgClassID; + + public: + static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; } + virtual UClassID getDynamicClassID(void) const; +}; + +class NumeratorSubstitution : public NFSubstitution { + double denominator; + llong ldenominator; + public: + NumeratorSubstitution(int32_t _pos, + double _denominator, + const NFRuleSet* _ruleSet, + const RuleBasedNumberFormat* formatter, + const UnicodeString& description, + UErrorCode& status) + : NFSubstitution(_pos, _ruleSet, formatter, description, status), denominator(_denominator) + { + ldenominator = _denominator; + } + + UBool operator==(const NFSubstitution& rhs) const; + + llong transformNumber(llong number) const { return number * ldenominator; } + double transformNumber(double number) const { return round(number * denominator); } + + UBool doParse(const UnicodeString& text, + ParsePosition& parsePosition, + double baseValue, + double upperBound, + UBool lenientParse, + Formattable& result) const + { + // we don't have to do anything special to do the parsing here, + // but we have to turn lenient parsing off-- if we leave it on, + // it SERIOUSLY messes up the algorithm + return NFSubstitution::doParse(text, parsePosition, baseValue, upperBound, FALSE, result); + } + double composeRuleValue(double newRuleValue, double oldRuleValue) const { return newRuleValue / oldRuleValue; } + double calcUpperBound(double 
oldUpperBound) const { return denominator; } + UChar tokenChar() const { return (UChar)0x003c; } // '<' + private: + static char fgClassID; + + public: + static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; } + virtual UClassID getDynamicClassID(void) const; +}; + +class NullSubstitution : public NFSubstitution { + public: + NullSubstitution(int32_t _pos, + const NFRuleSet* _ruleSet, + const RuleBasedNumberFormat* formatter, + const UnicodeString& description, + UErrorCode& status) + : NFSubstitution(_pos, _ruleSet, formatter, description, status) {} + + void toString(UnicodeString& result) const {} + void doSubstitution(double number, UnicodeString& toInsertInto, int32_t _pos) const {} + void doSubstitution(llong number, UnicodeString& toInsertInto, int32_t _pos) const {} + llong transformNumber(llong number) const { return llong::kZero; } + double transformNumber(double number) const { return 0; } + UBool doParse(const UnicodeString& text, + ParsePosition& parsePosition, + double baseValue, + double upperBound, + UBool lenientParse, + Formattable& result) const + { result.setDouble(baseValue); return TRUE; } + double composeRuleValue(double newRuleValue, double oldRuleValue) const { return 0; } // never called + double calcUpperBound(double oldUpperBound) const { return 0; } // never called + UBool isNullSubstitution() const { return TRUE; } + UChar tokenChar() const { return (UChar)0x0020; } // ' ' never called + private: + static char fgClassID; + + public: + static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; } + virtual UClassID getDynamicClassID(void) const; +}; + +U_NAMESPACE_END + +// NFSUBS_H +#endif diff --git a/icu4c/source/i18n/rbnf.cpp b/icu4c/source/i18n/rbnf.cpp new file mode 100644 index 00000000000..374007daf37 --- /dev/null +++ b/icu4c/source/i18n/rbnf.cpp @@ -0,0 +1,623 @@ +/* +******************************************************************************* +* Copyright (C) 1997-2001, International Business 
Machines Corporation and others. All Rights Reserved. +******************************************************************************* +*/ + +#include "unicode/rbnf.h" + +#include "nfrs.h" + +#include "cmemory.h" +#include "cstring.h" +#include "unicode/normlzr.h" +#include "unicode/tblcoll.h" +#include "unicode/uchar.h" +#include "unicode/ucol.h" +#include "unicode/uloc.h" +#include "unicode/unum.h" +#include "unicode/ures.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" + +#include + +static const UnicodeString gPercentPercent("%%"); + +#define kSomeNumberOfBitsDiv2 22 +#define kHalfMaxDouble (double)(1 << kSomeNumberOfBitsDiv2) +#define kMaxDouble (kHalfMaxDouble * kHalfMaxDouble) + +const char RuleBasedNumberFormat::fgClassID = 0; + +RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description, const Locale& alocale, UParseError& perror, UErrorCode& status) + : ruleSets(NULL) + , defaultRuleSet(NULL) + , locale(alocale) + , collator(NULL) + , decimalFormatSymbols(NULL) + , lenient(FALSE) + , lenientParseRules(NULL) +{ + init(description, perror, status); +} + +RuleBasedNumberFormat::RuleBasedNumberFormat(URBNFRuleSetTag tag, const Locale& alocale, UErrorCode& status) + : ruleSets(NULL) + , defaultRuleSet(NULL) + , locale(alocale) + , collator(NULL) + , decimalFormatSymbols(NULL) + , lenient(FALSE) + , lenientParseRules(NULL) +{ + if (U_FAILURE(status)) { + return; + } + + const char* fmt_tag = ""; + switch (tag) { + case URBNF_SPELLOUT: fmt_tag = "SpelloutRules"; break; + case URBNF_ORDINAL: fmt_tag = "OrdinalRules"; break; + case URBNF_DURATION: fmt_tag = "DurationRules"; break; + default: status = U_ILLEGAL_ARGUMENT_ERROR; return; + } + + UResourceBundle* nfrb = ures_open(NULL, locale.getName(), &status); + int32_t len = 0; + const UChar* description = ures_getStringByKey(nfrb, fmt_tag, &len, &status); + if (U_SUCCESS(status)) { + UnicodeString desc(description, len); + UParseError perror; + init (desc, perror, status); + } + 
ures_close(nfrb); +} + +RuleBasedNumberFormat::RuleBasedNumberFormat(const RuleBasedNumberFormat& rhs) + : ruleSets(NULL) + , defaultRuleSet(NULL) + , locale(rhs.locale) + , collator(NULL) + , decimalFormatSymbols(NULL) + , lenient(FALSE) + , lenientParseRules(NULL) +{ + this->operator==(rhs); +} + +RuleBasedNumberFormat& +RuleBasedNumberFormat::operator=(const RuleBasedNumberFormat& rhs) +{ + UErrorCode status = U_ZERO_ERROR; + dispose(); + locale = rhs.locale; + UnicodeString rules = rhs.getRules(); + UParseError perror; + init(rules, perror, status); + lenient = rhs.lenient; + return *this; +} + +RuleBasedNumberFormat::~RuleBasedNumberFormat() +{ + dispose(); +} + +Format* +RuleBasedNumberFormat::clone(void) const +{ + RuleBasedNumberFormat * result = NULL; + UnicodeString rules = getRules(); + int32_t len = rules.length(); + UChar* rulestring = new UChar[len+1]; + if (rulestring) { + rules.extract(0, len, rulestring); + UErrorCode status = U_ZERO_ERROR; + UParseError perror; + result = new RuleBasedNumberFormat(rulestring, locale, perror, status); + if (U_FAILURE(status)) { + delete result; + result = NULL; + } else { + result->lenient = lenient; + } + delete[] rulestring; + } + return result; +} + +UBool +RuleBasedNumberFormat::operator==(const Format& other) const +{ + if (this == &other) { + return TRUE; + } + + if (other.getDynamicClassID() == getStaticClassID()) { + const RuleBasedNumberFormat& rhs = (const RuleBasedNumberFormat&)other; + if (locale == rhs.locale && + lenient == rhs.lenient) { + NFRuleSet** p = ruleSets; + NFRuleSet** q = rhs.ruleSets; + while (*p && *q && (**p == **q)) { + ++p; + ++q; + } + return *q == NULL && *p == NULL; + } + } + + return FALSE; +} + +UnicodeString +RuleBasedNumberFormat::getRules() const +{ + UnicodeString result; + for (NFRuleSet** p = ruleSets; *p; ++p) { + (*p)->appendRules(result); + } + return result; +} + +UnicodeString +RuleBasedNumberFormat::getRuleSetName(int32_t index) const +{ + UnicodeString result; + for 
(NFRuleSet** p = ruleSets; *p; ++p) { + NFRuleSet* rs = *p; + if (rs->isPublic()) { + if (--index == -1) { + rs->getName(result); + return result; + } + } + } + return *(UnicodeString*)NULL; +} + +int32_t +RuleBasedNumberFormat::getNumberOfRuleSetNames() const +{ + int32_t result = 0; + for (NFRuleSet** p = ruleSets; *p; ++p) { + if ((**p).isPublic()) { + ++result; + } + } + return result; +} + +NFRuleSet* +RuleBasedNumberFormat::findRuleSet(const UnicodeString& name, UErrorCode& status) const +{ + if (U_SUCCESS(status)) { + for (NFRuleSet** p = ruleSets; *p; ++p) { + NFRuleSet* rs = *p; + if (rs->isNamed(name)) { + return rs; + } + } + status = U_ILLEGAL_ARGUMENT_ERROR; + } + return NULL; +} + +UnicodeString& +RuleBasedNumberFormat::format(int32_t number, + UnicodeString& toAppendTo, + FieldPosition& pos) const +{ + defaultRuleSet->format(llong(number), toAppendTo, toAppendTo.length()); + return toAppendTo; +} + +#if 0 +UnicodeString& +RuleBasedNumberFormat::format(llong number, + UnicodeString& toAppendTo, + FieldPosition& pos) const +{ + defaultRuleSet->format(number, toAppendTo, toAppendTo.length()); + return toAppendTo; +} +#endif + +UnicodeString& +RuleBasedNumberFormat::format(double number, + UnicodeString& toAppendTo, + FieldPosition& pos) const +{ + defaultRuleSet->format(number, toAppendTo, toAppendTo.length()); + return toAppendTo; +} + + +UnicodeString& +RuleBasedNumberFormat::format(int32_t number, + const UnicodeString& ruleSetName, + UnicodeString& toAppendTo, + FieldPosition& pos, + UErrorCode& status) const +{ + // return format(llong(number), ruleSetName, toAppendTo, pos, status); + if (U_SUCCESS(status)) { + if (ruleSetName.indexOf(gPercentPercent) == 0) { + // throw new IllegalArgumentException("Can't use internal rule set"); + status = U_ILLEGAL_ARGUMENT_ERROR; + } else { + NFRuleSet *rs = findRuleSet(ruleSetName, status); + if (rs) { + rs->format(llong(number), toAppendTo, toAppendTo.length()); + } + } + } + return toAppendTo; + +} + +#if 0 
+UnicodeString& +RuleBasedNumberFormat::format(llong number, + const UnicodeString& ruleSetName, + UnicodeString& toAppendTo, + FieldPosition& pos, + UErrorCode& status) const +{ + if (U_SUCCESS(status)) { + if (ruleSetName.indexOf(gPercentPercent) == 0) { + // throw new IllegalArgumentException("Can't use internal rule set"); + status = U_ILLEGAL_ARGUMENT_ERROR; + } else { + NFRuleSet *rs = findRuleSet(ruleSetName, status); + if (rs) { + rs->format(number, toAppendTo, toAppendTo.length()); + } + } + } + return toAppendTo; +} +#endif + +// make linker happy +UnicodeString& +RuleBasedNumberFormat::format(const Formattable& obj, + UnicodeString& toAppendTo, + FieldPosition& pos, + UErrorCode& status) const +{ + return NumberFormat::format(obj, toAppendTo, pos, status); +} + +UnicodeString& +RuleBasedNumberFormat::format(double number, + const UnicodeString& ruleSetName, + UnicodeString& toAppendTo, + FieldPosition& pos, + UErrorCode& status) const +{ + if (U_SUCCESS(status)) { + if (ruleSetName.indexOf(gPercentPercent) == 0) { + // throw new IllegalArgumentException("Can't use internal rule set"); + status = U_ILLEGAL_ARGUMENT_ERROR; + } else { + NFRuleSet *rs = findRuleSet(ruleSetName, status); + if (rs) { + rs->format(number, toAppendTo, toAppendTo.length()); + } + } + } + return toAppendTo; +} + +void +RuleBasedNumberFormat::parse(const UnicodeString& text, + Formattable& result, + ParsePosition& parsePosition) const +{ + ParsePosition high_pp; + Formattable high_result; + + for (NFRuleSet** p = ruleSets; *p; ++p) { + NFRuleSet *rp = *p; + if (rp->isPublic()) { + ParsePosition working_pp = parsePosition; + Formattable working_result; + + rp->parse(text, working_pp, kMaxDouble, working_result); + if (working_pp.getIndex() > high_pp.getIndex()) { + high_pp = working_pp; + high_result = working_result; + + if (high_pp.getIndex() == text.length()) { + break; + } + } + } + } + + parsePosition = high_pp; + result = high_result; + if (result.getType() == 
Formattable::kDouble) { + int32_t r = (int32_t)result.getDouble(); + if ((double)r == result.getDouble()) { + result.setLong(r); + } + } +} + +void +RuleBasedNumberFormat::setLenient(UBool enabled) +{ + lenient = enabled; + if (!enabled && collator) { + delete collator; + collator = NULL; + } +} + +// All urbnf objects are created through openRules, so we init all of the +// Unicode string constants required by rbnf, nfrs, or nfr here. +static const UnicodeString gLenientParse("%%lenient-parse:"); +static const UChar gSemiColon = 0x003B; +static const UnicodeString gSemiPercent(";%"); + +void +RuleBasedNumberFormat::init(const UnicodeString& rules, UParseError& perror, UErrorCode& status) +{ + // TODO: implement perror + if (U_FAILURE(status)) { + return; + } + + UnicodeString description(rules); + if (!description.length()) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + // start by stripping the trailing whitespace from all the rules + // (this is all the whitespace follwing each semicolon in the + // description). This allows us to look for rule-set boundaries + // by searching for ";%" without having to worry about whitespace + // between the ; and the % + stripWhitespace(description); + + // check to see if there's a set of lenient-parse rules. 
If there + // is, pull them out into our temporary holding place for them, + // and delete them from the description before the real desciption- + // parsing code sees them + UTextOffset lp = description.indexOf(gLenientParse); + if (lp != -1) { + // we've got to make sure we're not in the middle of a rule + // (where "%%lenient-parse" would actually get treated as + // rule text) + if (lp == 0 || description.charAt(lp - 1) == gSemiColon) { + // locate the beginning and end of the actual collation + // rules (there may be whitespace between the name and + // the first token in the description) + int lpEnd = description.indexOf(gSemiPercent, lp); + + if (lpEnd == -1) { + lpEnd = description.length() - 1; + } + int lpStart = lp + gLenientParse.length(); + while (u_isWhitespace(description.charAt(lpStart))) { + ++lpStart; + } + + // copy out the lenient-parse rules and delete them + // from the description + lenientParseRules = new UnicodeString(); + lenientParseRules->setTo(description, lpStart, lpEnd - lpStart); + + description.remove(lp, lpEnd + 1 - lp); + } + } + + // pre-flight parsing the description and count the number of + // rule sets (";%" marks the end of one rule set and the beginning + // of the next) + int numRuleSets = 0; + for (UTextOffset p = description.indexOf(gSemiPercent); p != -1; p = description.indexOf(gSemiPercent, p)) { + ++numRuleSets; + ++p; + } + ++numRuleSets; + + // our rule list is an array of the appropriate size + ruleSets = new NFRuleSet*[numRuleSets + 1]; + for (int i = 0; i <= numRuleSets; ++i) { + ruleSets[i] = NULL; + } + + // divide up the descriptions into individual rule-set descriptions + // and store them in a temporary array. At each step, we also + // new up a rule set, but all this does is initialize its name + // and remove it from its description. 
We can't actually parse + // the rest of the descriptions and finish initializing everything + // because we have to know the names and locations of all the rule + // sets before we can actually set everything up + UnicodeString* ruleSetDescriptions = new UnicodeString[numRuleSets]; + + { + int curRuleSet = 0; + UTextOffset start = 0; + for (UTextOffset p = description.indexOf(gSemiPercent); p != -1; p = description.indexOf(gSemiPercent, start)) { + ruleSetDescriptions[curRuleSet].setTo(description, start, p + 1 - start); + ruleSets[curRuleSet] = new NFRuleSet(ruleSetDescriptions, curRuleSet, status); + ++curRuleSet; + start = p + 1; + } + ruleSetDescriptions[curRuleSet].setTo(description, start, description.length() - start); + ruleSets[curRuleSet] = new NFRuleSet(ruleSetDescriptions, curRuleSet, status); + } + + // now we can take note of the formatter's default rule set, which + // is the last public rule set in the description (it's the last + // rather than the first so that a user can create a new formatter + // from an existing formatter and change its default behavior just + // by appending more rule sets to the end) + // setDefaultRuleSet + { + defaultRuleSet = ruleSets[numRuleSets - 1]; + if (!defaultRuleSet->isPublic()) { + for (int i = numRuleSets - 2; i >= 0; --i) { + if (ruleSets[i]->isPublic()) { + defaultRuleSet = ruleSets[i]; + break; + } + } + } + } + + // finally, we can go back through the temporary descriptions + // list and finish seting up the substructure (and we throw + // away the temporary descriptions as we go) + { + for (int i = 0; i < numRuleSets; i++) { + ruleSets[i]->parseRules(ruleSetDescriptions[i], this, status); + } + } + + delete[] ruleSetDescriptions; +} + +void +RuleBasedNumberFormat::stripWhitespace(UnicodeString& description) +{ + // iterate through the characters... + UnicodeString result; + + int start = 0; + while (start != -1 && start < description.length()) { + // seek to the first non-whitespace character... 
+ while (start < description.length() + && u_isWhitespace(description.charAt(start))) { + ++start; + } + + // locate the next semicolon in the text and copy the text from + // our current position up to that semicolon into the result + UTextOffset p = description.indexOf(gSemiColon, start); + if (p == -1) { + // or if we don't find a semicolon, just copy the rest of + // the string into the result + result.append(description, start, description.length() - start); + start = -1; + } + else if (p < description.length()) { + result.append(description, start, p + 1 - start); + start = p + 1; + } + + // when we get here, we've seeked off the end of the sring, and + // we terminate the loop (we continue until *start* is -1 rather + // than until *p* is -1, because otherwise we'd miss the last + // rule in the description) + else { + start = -1; + } + } + + description.setTo(result); +} + + +void +RuleBasedNumberFormat::dispose() +{ + if (ruleSets) { + for (NFRuleSet** p = ruleSets; *p; ++p) { + delete *p; + } + delete[] ruleSets; + ruleSets = NULL; + } + + delete collator; + + delete decimalFormatSymbols; + + delete lenientParseRules; +} + + +//----------------------------------------------------------------------- +// package-internal API +//----------------------------------------------------------------------- + +/** + * Returns the collator to use for lenient parsing. The collator is lazily created: + * this function creates it the first time it's called. + * @return The collator to use for lenient parsing, or null if lenient parsing + * is turned off. 
+*/ +Collator* +RuleBasedNumberFormat::getCollator() const +{ + // lazy-evaulate the collator + if (collator == NULL && lenient) { + // create a default collator based on the formatter's locale, + // then pull out that collator's rules, append any additional + // rules specified in the description, and create a _new_ + // collator based on the combinaiton of those rules + + UErrorCode status = U_ZERO_ERROR; + + Collator* temp = Collator::createInstance(locale, status); + if (U_SUCCESS(status) && + temp->getDynamicClassID() == RuleBasedCollator::getStaticClassID()) { + + RuleBasedCollator* newCollator = (RuleBasedCollator*)temp; + if (lenientParseRules) { + UnicodeString rules(newCollator->getRules()); + rules.append(*lenientParseRules); + + newCollator = new RuleBasedCollator(rules, status); + } else { + temp = NULL; + } + if (U_SUCCESS(status)) { + newCollator->setDecomposition(Normalizer::DECOMP); + // cast away const + ((RuleBasedNumberFormat*)this)->collator = newCollator; + } else { + delete newCollator; + } + } + delete temp; + } + + // if lenient-parse mode is off, this will be null + // (see setLenientParseMode()) + return collator; +} + + +/** + * Returns the DecimalFormatSymbols object that should be used by all DecimalFormat + * instances owned by this formatter. This object is lazily created: this function + * creates it the first time it's called. + * @return The DecimalFormatSymbols object that should be used by all DecimalFormat + * instances owned by this formatter. +*/ +DecimalFormatSymbols* +RuleBasedNumberFormat::getDecimalFormatSymbols() const +{ + // lazy-evaluate the DecimalFormatSymbols object. 
This object + // is shared by all DecimalFormat instances belonging to this + // formatter + if (decimalFormatSymbols == NULL) { + UErrorCode status = U_ZERO_ERROR; + DecimalFormatSymbols* temp = new DecimalFormatSymbols(locale, status); + if (U_SUCCESS(status)) { + ((RuleBasedNumberFormat*)this)->decimalFormatSymbols = temp; + } else { + delete temp; + } + } + return decimalFormatSymbols; +} + diff --git a/icu4c/source/i18n/unicode/rbnf.h b/icu4c/source/i18n/unicode/rbnf.h new file mode 100644 index 00000000000..be32862f844 --- /dev/null +++ b/icu4c/source/i18n/unicode/rbnf.h @@ -0,0 +1,828 @@ +/* +******************************************************************************* +* Copyright (C) 1997-2001, International Business Machines Corporation and others. All Rights Reserved. +******************************************************************************* +*/ + +#ifndef RBNF_H +#define RBNF_H + +#include "unicode/coll.h" +#include "unicode/dcfmtsym.h" +#include "unicode/fmtable.h" +#include "unicode/locid.h" +#include "unicode/numfmt.h" +#include "unicode/unistr.h" +#include "unicode/utypes.h" + +U_NAMESPACE_BEGIN + +class NFRuleSet; + +/** + * \file + * \brief C++ API: RuleBasedNumberFormat + * + *

Rule Based Number Format C++ API

+ * + *

A class that formats numbers according to a set of rules. This number formatter is + * typically used for spelling out numeric values in words (e.g., 25,376 as + * "twenty-five thousand three hundred seventy-six" or "vingt-cinq mille trois + * cents soixante-seize" or + * "fünfundzwanzigtausenddreihundertsechsundsiebzig"), but can also be used for + * other complicated formatting tasks, such as formatting a number of seconds as hours, + * minutes and seconds (e.g., 3,730 as "1:02:10").

+ * + *

The resources contain three predefined formatters for each locale: spellout, which + * spells out a value in words (123 is "one hundred twenty-three"); ordinal, which + * appends an ordinal suffix to the end of a numeral (123 is "123rd"); and + * duration, which shows a duration in seconds as hours, minutes, and seconds (123 is + * "2:03").  The client can also define more specialized RuleBasedNumberFormats + * by supplying programmer-defined rule sets.

+ * + *

The behavior of a RuleBasedNumberFormat is specified by a textual description + * that is either passed to the constructor as a String or loaded from a resource + * bundle. In its simplest form, the description consists of a semicolon-delimited list of rules. + * Each rule has a string of output text and a value or range of values it is applicable to. + * In a typical spellout rule set, the first twenty rules are the words for the numbers from + * 0 to 19:

+ * + *
zero; one; two; three; four; five; six; seven; eight; nine;
+ * ten; eleven; twelve; thirteen; fourteen; fifteen; sixteen; seventeen; eighteen; nineteen;
+ * + *

For larger numbers, we can use the preceding set of rules to format the ones place, and + * we only have to supply the words for the multiples of 10:

+ * + *
 20: twenty[->>];
+ * 30: thirty[->>];
+ * 40: forty[->>];
+ * 50: fifty[->>];
+ * 60: sixty[->>];
+ * 70: seventy[->>];
+ * 80: eighty[->>];
+ * 90: ninety[->>];
+ * + *

In these rules, the base value is spelled out explicitly and set off from the + * rule's output text with a colon. The rules are in a sorted list, and a rule is applicable + * to all numbers from its own base value to one less than the next rule's base value. The + * ">>" token is called a substitution and tells the formatter to + * isolate the number's ones digit, format it using this same set of rules, and place the + * result at the position of the ">>" token. Text in brackets is omitted if + * the number being formatted is an even multiple of 10 (the hyphen is a literal hyphen; 24 + * is "twenty-four," not "twenty four").

+ * + *

For even larger numbers, we can actually look up several parts of the number in the + * list:

+ * + *
100: << hundred[ >>];
+ * + *

The "<<" represents a new kind of substitution. The << isolates + * the hundreds digit (and any digits to its left), formats it using this same rule set, and + * places the result where the "<<" was. Notice also that the meaning of + * >> has changed: it now refers to both the tens and the ones digits. The meaning of + * both substitutions depends on the rule's base value. The base value determines the rule's divisor, + * which is the highest power of 10 that is less than or equal to the base value (the user + * can change this). To fill in the substitutions, the formatter divides the number being + * formatted by the divisor. The integral quotient is used to fill in the << + * substitution, and the remainder is used to fill in the >> substitution. The meaning + * of the brackets changes similarly: text in brackets is omitted if the value being + * formatted is an even multiple of the rule's divisor. The rules are applied recursively, so + * if a substitution is filled in with text that includes another substitution, that + * substitution is also filled in.

+ * + *

This rule covers values up to 999, at which point we add another rule:

+ * + *
1000: << thousand[ >>];
+ * + *

Again, the meanings of the brackets and substitution tokens shift because the rule's + * base value is a higher power of 10, changing the rule's divisor. This rule can actually be + * used all the way up to 999,999. This allows us to finish out the rules as follows:

+ * + *
 1,000,000: << million[ >>];
+ * 1,000,000,000: << billion[ >>];
+ * 1,000,000,000,000: << trillion[ >>];
+ * 1,000,000,000,000,000: OUT OF RANGE!;
+ * + *

Commas, periods, and spaces can be used in the base values to improve legibility and + * are ignored by the rule parser. The last rule in the list is customarily treated as an + * "overflow rule," applying to everything from its base value on up, and often (as + * in this example) being used to print out an error message or default representation. + * Notice also that the size of the major groupings in large numbers is controlled by the + * spacing of the rules: because in English we group numbers by thousand, the higher rules + * are separated from each other by a factor of 1,000.

+ * + *

To see how these rules actually work in practice, consider the following example: + * Formatting 25,340 with this rule set would work like this:

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
<< thousand >>[the rule whose base value is 1,000 is applicable to 25,340]
twenty->> thousand >>[25,340 over 1,000 is 25. The rule for 20 applies.]
twenty-five thousand >>[25 mod 10 is 5. The rule for 5 is "five."]
twenty-five thousand << hundred >>[25,340 mod 1,000 is 340. The rule for 100 applies.]
twenty-five thousand three hundred >>[340 over 100 is 3. The rule for 3 is "three."]
twenty-five thousand three hundred forty[340 mod 100 is 40. The rule for 40 applies. Since 40 divides + * evenly by 10, the hyphen and substitution in the brackets are omitted.]
+ * + *

The above syntax suffices only to format positive integers. To format negative numbers, + * we add a special rule:

+ * + *
-x: minus >>;
+ * + *

This is called a negative-number rule, and is identified by "-x" + * where the base value would be. This rule is used to format all negative numbers. The + * >> token here means "find the number's absolute value, format it with these + * rules, and put the result here."

+ * + *

We also add a special rule called a fraction rule for numbers with fractional + * parts:

+ * + *
x.x: << point >>;
+ * + *

This rule is used for all positive non-integers (negative non-integers pass through the + * negative-number rule first and then through this rule). Here, the << token refers to + * the number's integral part, and the >> to the number's fractional part. The + * fractional part is formatted as a series of single-digit numbers (e.g., 123.456 would be + * formatted as "one hundred twenty-three point four five six").

+ * + *

To see how this rule syntax is applied to various languages, examine the resource data.

+ * + *

There is actually much more flexibility built into the rule language than the + * description above shows. A formatter may own multiple rule sets, which can be selected by + * the caller, and which can use each other to fill in their substitutions. Substitutions can + * also be filled in with digits, using a DecimalFormat object. There is syntax that can be + * used to alter a rule's divisor in various ways. And there is provision for much more + * flexible fraction handling. A complete description of the rule syntax follows:

+ * + *
+ * + *

The description of a RuleBasedNumberFormat's behavior consists of one or more rule + * sets. Each rule set consists of a name, a colon, and a list of rules. A rule + * set name must begin with a % sign. Rule sets with names that begin with a single % sign + * are public: the caller can specify that they be used to format and parse numbers. + * Rule sets with names that begin with %% are private: they exist only for the use + * of other rule sets. If a formatter only has one rule set, the name may be omitted.

+ * + *

The user can also specify a special "rule set" named %%lenient-parse. + * The body of %%lenient-parse isn't a set of number-formatting rules, but a RuleBasedCollator + * description which is used to define equivalences for lenient parsing. For more information + * on the syntax, see RuleBasedCollator. For more information on lenient parsing, + * see setLenient().

+ * + *

The body of a rule set consists of an ordered, semicolon-delimited list of rules. + * Internally, every rule has a base value, a divisor, rule text, and zero, one, or two substitutions. + * These parameters are controlled by the description syntax, which consists of a rule + * descriptor, a colon, and a rule body.

+ * + *

A rule descriptor can take one of the following forms (text in italics is the + * name of a token):

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
bv:bv specifies the rule's base value. bv is a decimal + * number expressed using ASCII digits. bv may contain spaces, periods, and commas, + * which are ignored. The rule's divisor is the highest power of 10 less than or equal to + * the base value.
bv/rad:bv specifies the rule's base value. The rule's divisor is the + * highest power of rad less than or equal to the base value.
bv>:bv specifies the rule's base value. To calculate the divisor, + * let the radix be 10, and the exponent be the highest exponent of the radix that yields a + * result less than or equal to the base value. Every > character after the base value + * decreases the exponent by 1. If the exponent is positive or 0, the divisor is the radix + * raised to the power of the exponent; otherwise, the divisor is 1.
bv/rad>:bv specifies the rule's base value. To calculate the divisor, + * let the radix be rad, and the exponent be the highest exponent of the radix that + * yields a result less than or equal to the base value. Every > character after the radix + * decreases the exponent by 1. If the exponent is positive or 0, the divisor is the radix + * raised to the power of the exponent; otherwise, the divisor is 1.
-x:The rule is a negative-number rule.
x.x:The rule is an improper fraction rule.
0.x:The rule is a proper fraction rule.
x.0:The rule is a master rule.
nothingIf the rule's rule descriptor is left out, the base value is one plus the + * preceding rule's base value (or zero if this is the first rule in the list) in a normal + * rule set.  In a fraction rule set, the base value is the same as the preceding rule's + * base value.
+ * + *

A rule set may be either a regular rule set or a fraction rule set, depending + * on whether it is used to format a number's integral part (or the whole number) or a + * number's fractional part. Using a rule set to format a number's fractional part makes it a + * fraction rule set.

+ * + *

Which rule is used to format a number is defined according to one of the following + * algorithms: If the rule set is a regular rule set, do the following: + * + *

    + *
  • If the rule set includes a master rule (and the number was passed in as a double), + * use the master rule.  (If the number being formatted was passed in as a long, + * the master rule is ignored.)
  • + *
  • If the number is negative, use the negative-number rule.
  • + *
  • If the number has a fractional part and is greater than 1, use the improper fraction + * rule.
  • + *
  • If the number has a fractional part and is between 0 and 1, use the proper fraction + * rule.
  • + *
  • Binary-search the rule list for the rule with the highest base value less than or equal + * to the number. If that rule has two substitutions, its base value is not an even multiple + * of its divisor, and the number is an even multiple of the rule's divisor, use the + * rule that precedes it in the rule list. Otherwise, use the rule itself.
  • + *
+ * + *

If the rule set is a fraction rule set, do the following: + * + *

    + *
  • Ignore negative-number and fraction rules.
  • + *
  • For each rule in the list, multiply the number being formatted (which will always be + * between 0 and 1) by the rule's base value. Keep track of the distance between the result + * and the nearest integer.
  • + *
  • Use the rule that produced the result closest to zero in the above calculation. In the + * event of a tie or a direct hit, use the first matching rule encountered. (The idea here is + * to try each rule's base value as a possible denominator of a fraction. Whichever + * denominator produces the fraction closest in value to the number being formatted wins.) If + * the rule following the matching rule has the same base value, use it if the numerator of + * the fraction is anything other than 1; if the numerator is 1, use the original matching + * rule. (This is to allow singular and plural forms of the rule text without a lot of extra + * hassle.)
  • + *
+ * + *

A rule's body consists of a string of characters terminated by a semicolon. The rule + * may include zero, one, or two substitution tokens, and a range of text in + * brackets. The brackets denote optional text (and may also include one or both + * substitutions). The exact meanings of the substitution tokens, and under what conditions + * optional text is omitted, depend on the syntax of the substitution token and the context. + * The rest of the text in a rule body is literal text that is output when the rule matches + * the number being formatted.

+ * + *

A substitution token begins and ends with a token character. The token + * character and the context together specify a mathematical operation to be performed on the + * number being formatted. An optional substitution descriptor specifies how the + * value resulting from that operation is used to fill in the substitution. The position of + * the substitution token in the rule body specifies the location of the resultant text in + * the original rule text.

+ * + *

The meanings of the substitution token characters are as follows:

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
>> | in normal rule | Divide the number by the rule's divisor and format the remainder
   | in negative-number rule | Find the absolute value of the number and format the result
   | in fraction or master rule | Isolate the number's fractional part and format it.
   | in rule in fraction rule set | Not allowed.
>>> | in normal rule | Divide the number by the rule's divisor and format the remainder, + * but bypass the normal rule-selection process and just use the + * rule that precedes this one in this rule list.
   | in all other rules | Not allowed.
<< | in normal rule | Divide the number by the rule's divisor and format the quotient
   | in negative-number rule | Not allowed.
   | in fraction or master rule | Isolate the number's integral part and format it.
   | in rule in fraction rule set | Multiply the number by the rule's base value and format the result.
== | in all rule sets | Format the number unchanged
[] | in normal rule | Omit the optional text if the number is an even multiple of the rule's divisor
   | in negative-number rule | Not allowed.
   | in improper-fraction rule | Omit the optional text if the number is between 0 and 1 (same as specifying both an + * x.x rule and a 0.x rule)
   | in master rule | Omit the optional text if the number is an integer (same as specifying both an x.x + * rule and an x.0 rule)
   | in proper-fraction rule | Not allowed.
   | in rule in fraction rule set | Omit the optional text if multiplying the number by the rule's base value yields 1.
+ * + *

The substitution descriptor (i.e., the text between the token characters) may take one + * of three forms:

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
a rule set name | Perform the mathematical operation on the number, and format the result using the + * named rule set.
a DecimalFormat pattern | Perform the mathematical operation on the number, and format the result using a + * DecimalFormat with the specified pattern.  The pattern must begin with 0 or #.
nothing | Perform the mathematical operation on the number, and format the result using the rule + * set containing the current rule, except:
    + *
  • You can't have an empty substitution descriptor with a == substitution.
  • + *
  • If you omit the substitution descriptor in a >> substitution in a fraction rule, + * format the result one digit at a time using the rule set containing the current rule.
  • + *
  • If you omit the substitution descriptor in a << substitution in a rule in a + * fraction rule set, format the result using the default rule set for this formatter.
  • + *
+ *
+ * + *

Whitespace is ignored between a rule set name and a rule set body, between a rule + * descriptor and a rule body, or between rules. If a rule body begins with an apostrophe, + * the apostrophe is ignored, but all text after it becomes significant (this is how you can + * have a rule's rule text begin with whitespace). There is no escape function: the semicolon + * is not allowed in rule set names or in rule text, and the colon is not allowed in rule set + * names. The characters beginning a substitution token are always treated as the beginning + * of a substitution token.

+ * + *

See the resource data and the demo program for annotated examples of real rule sets + * using these features.

+ * + * @author Richard Gillam + * @see NumberFormat + * @see DecimalFormat + * @draft + */ + +/** Tags for the predefined rulesets. */ +enum URBNFRuleSetTag { + URBNF_SPELLOUT, + URBNF_ORDINAL, + URBNF_DURATION, + URBNF_COUNT +}; + +class U_I18N_API RuleBasedNumberFormat : public NumberFormat { +public: + + //----------------------------------------------------------------------- + // constructors + //----------------------------------------------------------------------- + + /** + * Creates a RuleBasedNumberFormat that behaves according to the rules + * passed in. The formatter uses the specified locale to determine the + * characters to use when formatting numerals, and to define equivalences + * for lenient parsing. + * @param rules The formatter rules. + * See the class documentation for a complete explanation of the rule + * syntax. + * @param locale A locale, that governs which characters are used for + * formatting values in numerals, and which characters are equivalent in + * lenient parsing. + * @param perror The parse error if an error was encountered. + * @param status The status indicating whether the constructor succeeded. + * @draft + */ + RuleBasedNumberFormat(const UnicodeString& rules, const Locale& locale, + UParseError& perror, UErrorCode& status); + + /** + * Creates a RuleBasedNumberFormat from a predefined ruleset. The selector + * code choosed among three possible predefined formats: spellout, ordinal, + * and duration. + * @param tag A selector code specifying which kind of formatter to create for that + * locale. There are three legal values: URBNF_SPELLOUT, which creates a formatter that + * spells out a value in words in the desired language, URBNF_ORDINAL, which attaches + * an ordinal suffix from the desired language to the end of a number (e.g. "123rd"), + * and URBNF_DURATION, which formats a duration in seconds as hours, minutes, and seconds. + * @param locale The locale for the formatter. 
+ * @param status The status indicating whether the constructor succeeded. + * @draft + */ + RuleBasedNumberFormat(URBNFRuleSetTag tag, const Locale& locale, UErrorCode& status); + + //----------------------------------------------------------------------- + // boilerplate + //----------------------------------------------------------------------- + + /** + * Copy constructor + */ + RuleBasedNumberFormat(const RuleBasedNumberFormat& rhs); + + /** + * Assignment operator + */ + RuleBasedNumberFormat& operator=(const RuleBasedNumberFormat& rhs); + + /** + * Release memory allocated for a RuleBasedNumberFormat when you are finished with it. + */ + virtual ~RuleBasedNumberFormat(); + + /** + * Clone this object polymorphically. The caller is responsible + * for deleting the result when done. + */ + virtual Format* clone(void) const; + + /** + * Return true if the given Format objects are semantically equal. + * Objects of different subclasses are considered unequal. + */ + virtual UBool operator==(const Format& other) const; + +//----------------------------------------------------------------------- +// public API functions +//----------------------------------------------------------------------- + + /** + * @return the rules that were provided to the RuleBasedNumberFormat. + * @return the result String that was passed in + * @draft + */ + virtual UnicodeString getRules() const; + + /** + * Return the name of the index'th public ruleSet. If index is not valid, + * the function returns null. + * @param index the index of the ruleset + * @return the name of the index'th public ruleSet. + * @draft + */ + virtual UnicodeString getRuleSetName(int32_t index) const; + + /** + * Return the number of public rule set names. + * @return the number of public rule set names. + * @draft + */ + virtual int32_t getNumberOfRuleSetNames() const; + + /** + * Formats the specified number using the default ruleset. + * @param number The number to format. 
+ * @param toAppendTo the string that will hold the (appended) result + * @param pos the fieldposition + * @return A textual representation of the number. + * @draft + */ + virtual UnicodeString& format(int32_t number, + UnicodeString& toAppendTo, + FieldPosition& pos) const; + /** + * Formats the specified number using the default ruleset. + * @param number The number to format. + * @param toAppendTo the string that will hold the (appended) result + * @param pos the fieldposition + * @return A textual representation of the number. + * @draft + */ + virtual UnicodeString& format(double number, + UnicodeString& toAppendTo, + FieldPosition& pos) const; + + /** + * Formats the specified number using the default ruleset. + * @param number The number to format. + * @param ruleSetName The name of the rule set to format the number with. + * This must be the name of a valid public rule set for this formatter. + * @param toAppendTo the string that will hold the (appended) result + * @param pos the fieldposition + * @param status the status + * @return A textual representation of the number. + * @draft + */ + virtual UnicodeString& format(int32_t number, + const UnicodeString& ruleSetName, + UnicodeString& toAppendTo, + FieldPosition& pos, + UErrorCode& status) const; + /** + * Formats the specified number using the default ruleset. + * @param number The number to format. + * @param ruleSetName The name of the rule set to format the number with. + * This must be the name of a valid public rule set for this formatter. + * @param toAppendTo the string that will hold the (appended) result + * @param pos the fieldposition + * @param status the status + * @return A textual representation of the number. + * @draft + */ + virtual UnicodeString& format(double number, + const UnicodeString& ruleSetName, + UnicodeString& toAppendTo, + FieldPosition& pos, + UErrorCode& status) const; + + /** + * Formats the specified number using the default ruleset. 
+ * @param obj The number to format. + * @param toAppendTo the string that will hold the (appended) result + * @param pos the fieldposition + * @param status the status + * @return A textual representation of the number. + * @draft + */ + virtual UnicodeString& format(const Formattable& obj, + UnicodeString& toAppendTo, + FieldPosition& pos, + UErrorCode& status) const; + /** + * Redeclared Format method. + * @stable + */ + UnicodeString& format(const Formattable& obj, + UnicodeString& result, + UErrorCode& status) const; + + /** + * Redeclared NumberFormat method. + * @stable + */ + UnicodeString& format(double number, + UnicodeString& output) const; + + /** + * Redeclared NumberFormat method. + * @stable + */ + UnicodeString& format(int32_t number, + UnicodeString& output) const; + + /** + * Parses the specfied string, beginning at the specified position, according + * to this formatter's rules. This will match the string against all of the + * formatter's public rule sets and return the value corresponding to the longest + * parseable substring. This function's behavior is affected by the lenient + * parse mode. + * @param text The string to parse + * @param result the result of the parse, either a double or a long. + * @param parsePosition On entry, contains the position of the first character + * in "text" to examine. On exit, has been updated to contain the position + * of the first character in "text" that wasn't consumed by the parse. + * @see #setLenientParseMode + * @draft + */ + virtual void parse(const UnicodeString& text, + Formattable& result, + ParsePosition& parsePosition) const; + + + /** + * Redeclared Format method. + * @stable + */ + virtual inline void parse(const UnicodeString& text, + Formattable& result, + UErrorCode& status) const; + + + /** + * Turns lenient parse mode on and off. + * + * When in lenient parse mode, the formatter uses a Collator for parsing the text. + * Only primary differences are treated as significant. 
This means that case + * differences, accent differences, alternate spellings of the same letter + * (e.g., ae and a-umlaut in German), ignorable characters, etc. are ignored in + * matching the text. In many cases, numerals will be accepted in place of words + * or phrases as well. + * + * For example, all of the following will correctly parse as 255 in English in + * lenient-parse mode: + *
"two hundred fifty-five" + *
"two hundred fifty five" + *
"TWO HUNDRED FIFTY-FIVE" + *
"twohundredfiftyfive" + *
"2 hundred fifty-5" + * + * The Collator used is determined by the locale that was + * passed to this object on construction. The description passed to this object + * on construction may supply additional collation rules that are appended to the + * end of the default collator for the locale, enabling additional equivalences + * (such as adding more ignorable characters or permitting spelled-out version of + * symbols; see the demo program for examples). + * + * It's important to emphasize that even strict parsing is relatively lenient: it + * will accept some text that it won't produce as output. In English, for example, + * it will correctly parse "two hundred zero" and "fifteen hundred". + * + * @param enabled If true, turns lenient-parse mode on; if false, turns it off. + * @see RuleBasedCollator + * @draft + */ + virtual void setLenient(UBool enabled); + + /** + * Returns true if lenient-parse mode is turned on. Lenient parsing is off + * by default. + * @return true if lenient-parse mode is turned on. 
+ * @see #setLenientParseMode + * @draft + */ + virtual inline UBool isLenient(void) const; + +private: + void init(const UnicodeString& rules, UParseError& perror, UErrorCode& status); + void dispose(); + void stripWhitespace(UnicodeString& src); + void setDefaultRuleSet(); + void format(double number, NFRuleSet& ruleSet); + NFRuleSet* findRuleSet(const UnicodeString& name, UErrorCode& status) const; + + /* friend access */ + friend class NFSubstitution; + friend class NFRule; + friend class FractionalPartSubstitution; + + inline NFRuleSet * getDefaultRuleSet() const; + Collator * getCollator() const; + DecimalFormatSymbols * getDecimalFormatSymbols() const; + +private: + static const char fgClassID; + +public: + static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; } + virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); } + +private: + NFRuleSet **ruleSets; + NFRuleSet *defaultRuleSet; + Locale locale; + Collator* collator; + DecimalFormatSymbols* decimalFormatSymbols; + UBool lenient; + UnicodeString* lenientParseRules; +}; + +// --------------- + +inline UnicodeString& +RuleBasedNumberFormat::format(const Formattable& obj, + UnicodeString& result, + UErrorCode& status) const { + // Don't use Format:: - use immediate base class only, + // in case immediate base modifies behavior later. + // dlf - the above comment is bogus, if there were a reason to modify + // it, it would be virtual, and there's no reason because it is + // a one-line macro in NumberFormat anyway, just like this one. 
+ return NumberFormat::format(obj, result, status); +} + +inline UnicodeString& +RuleBasedNumberFormat::format(double number, UnicodeString& output) const { + FieldPosition pos(0); + return format(number, output, pos); +} + +inline UnicodeString& +RuleBasedNumberFormat::format(int32_t number, UnicodeString& output) const { + FieldPosition pos(0); + return format(number, output, pos); +} + +inline void +RuleBasedNumberFormat::parse(const UnicodeString& text, Formattable& result, UErrorCode& status) const { + NumberFormat::parse(text, result, status); +} + +inline UBool +RuleBasedNumberFormat::isLenient(void) const { + return lenient; +} + +inline NFRuleSet* +RuleBasedNumberFormat::getDefaultRuleSet() const { + return defaultRuleSet; +} + +U_NAMESPACE_END + +/* RBNF_H */ +#endif diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in index 6fc969f26ad..f2658954b1c 100644 --- a/icu4c/source/test/intltest/Makefile.in +++ b/icu4c/source/test/intltest/Makefile.in @@ -48,7 +48,8 @@ tsmthred.o tsmutex.o tsnmfmt.o tsputil.o tstnorm.o tzbdtest.o \ tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o transtst.o strtest.o thcoll.o \ itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o unhxtrts.o hxuntrts.o \ ufltlgts.o testutil.o transrt.o normconf.o sfwdchit.o indictrn.o\ -jamotest.o srchtest.o +jamotest.o srchtest.o \ +itrbnf.o itrbnfrt.o DEPS = $(OBJECTS:.o=.d) diff --git a/icu4c/source/test/intltest/intltest.dsp b/icu4c/source/test/intltest/intltest.dsp index 19310197441..3fd64a1d46e 100644 --- a/icu4c/source/test/intltest/intltest.dsp +++ b/icu4c/source/test/intltest/intltest.dsp @@ -217,6 +217,14 @@ SOURCE=.\itrbbi.cpp # End Source File # Begin Source File +SOURCE=.\itrbnf.cpp +# End Source File +# Begin Source File + +SOURCE=.\itrbnfrt.cpp +# End Source File +# Begin Source File + SOURCE=.\ittrans.cpp # End Source File # Begin Source File @@ -545,6 +553,14 @@ SOURCE=.\itrbbi.h # End Source File # Begin Source File 
+SOURCE=.\itrbnf.h +# End Source File +# Begin Source File + +SOURCE=.\itrbnfrt.h +# End Source File +# Begin Source File + SOURCE=.\ittrans.h # End Source File # Begin Source File diff --git a/icu4c/source/test/intltest/itmajor.cpp b/icu4c/source/test/intltest/itmajor.cpp index 4b77083985b..bf666976565 100644 --- a/icu4c/source/test/intltest/itmajor.cpp +++ b/icu4c/source/test/intltest/itmajor.cpp @@ -24,6 +24,8 @@ #include "itconv.h" #include "ittrans.h" #include "itrbbi.h" +#include "itrbnf.h" +#include "itrbnfrt.h" #include "normconf.h" #include "tstnorm.h" @@ -102,6 +104,20 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam callTest( test, par ); } break; + case 8: name = "rbnf"; + if (exec) { + logln("TestSuite RuleBasedNumberFormat----"); logln(); + IntlTestRBNF test; + callTest(test, par); + } + break; + case 9: name = "rbnfrt"; + if (exec) { + logln("TestSuite RuleBasedNumberFormat RT----"); logln(); + RbnfRoundTripTest test; + callTest(test, par); + } + break; default: name = ""; break; } diff --git a/icu4c/source/test/intltest/itrbnf.cpp b/icu4c/source/test/intltest/itrbnf.cpp new file mode 100644 index 00000000000..d971569146f --- /dev/null +++ b/icu4c/source/test/intltest/itrbnf.cpp @@ -0,0 +1,618 @@ +/* + ******************************************************************************* + * Copyright (C) 1996-2000, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + */ +#include "itrbnf.h" + +#include "unicode/tblcoll.h" +#include "unicode/coleitr.h" + +// import com.ibm.text.RuleBasedNumberFormat; +// import com.ibm.test.TestFmwk; + +// import java.util.Locale; +// import java.text.NumberFormat; + +// current macro not in icu1.8.1 +#define TESTCASE(id,test) \ + case id: \ + name = #test; \ + if (exec) { \ + logln(#test "---"); \ + logln((UnicodeString)""); \ + test(); \ + } \ + break + +void IntlTestRBNF::runIndexedTest(int32_t index, UBool exec, const char* &name, char* par) +{ + if (exec) logln("TestSuite RuleBasedNumberFormat"); + switch (index) { + TESTCASE(0, TestEnglishSpellout); + TESTCASE(1, TestOrdinalAbbreviations); + TESTCASE(2, TestDurations); + TESTCASE(3, TestSpanishSpellout); + TESTCASE(4, TestFrenchSpellout); + TESTCASE(5, TestSwissFrenchSpellout); + TESTCASE(6, TestItalianSpellout); + TESTCASE(7, TestGermanSpellout); + TESTCASE(8, TestThaiSpellout); + default: + name = ""; + break; + } +} + +void +IntlTestRBNF::TestEnglishSpellout() +{ +#if 0 + // temporary test code + { + int32_t result = 0; + UErrorCode status = U_ZERO_ERROR; + Collator* temp = Collator::createInstance(Locale::US, status); + if (U_SUCCESS(status) && + temp->getDynamicClassID() == RuleBasedCollator::getStaticClassID()) { + + RuleBasedCollator* collator = (RuleBasedCollator*)temp; + UnicodeString rules(collator->getRules()); + UnicodeString tailoring("&'\\u0000' << ' ' << '-'\n"); + tailoring = tailoring.unescape(); + rules.append(tailoring); + + collator = new RuleBasedCollator(rules, status); + if (U_SUCCESS(status)) { + collator->setDecomposition(Normalizer::DECOMP); + + UnicodeString prefix(" hundred"); + UnicodeString str("hundred-fifty"); + + CollationElementIterator* strIter = collator->createCollationElementIterator(str); + CollationElementIterator* prefixIter = collator->createCollationElementIterator(prefix); + + // match collation elements between 
the strings + int32_t oStr = strIter->next(status); + int32_t oPrefix = prefixIter->next(status); + + while (oPrefix != CollationElementIterator::NULLORDER) { + // skip over ignorable characters in the target string + while (CollationElementIterator::primaryOrder(oStr) == 0 + && oStr != CollationElementIterator::NULLORDER) { + oStr = strIter->next(status); + } + + // skip over ignorable characters in the prefix + while (CollationElementIterator::primaryOrder(oPrefix) == 0 + && oPrefix != CollationElementIterator::NULLORDER) { + oPrefix = prefixIter->next(status); + } + + // if skipping over ignorables brought us to the end + // of the target string, we didn't match and return 0 + if (oStr == CollationElementIterator::NULLORDER) { + result = -1; + break; + } + + // if skipping over ignorables brought to the end of + // the prefix, we DID match: drop out of the loop + else if (oPrefix == CollationElementIterator::NULLORDER) { + break; + } + + // match collation elements from the two strings + // (considering only primary differences). 
If we + // get a mismatch, dump out and return 0 + if (CollationElementIterator::primaryOrder(oStr) + != CollationElementIterator::primaryOrder(oPrefix)) { + result = -1; + break; + + // otherwise, advance to the next character in each string + // and loop (we drop out of the loop when we exhaust + // collation elements in the prefix) + } else { + oStr = strIter->next(status); + oPrefix = prefixIter->next(status); + } + } + if (result == 0) { + result = strIter->getOffset(); + } + delete prefixIter; + delete strIter; + } + delete collator; + } + delete temp; + + printf("result: %d\n", result); + } +#endif + + UErrorCode status = U_ZERO_ERROR; + RuleBasedNumberFormat* formatter + = new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale::US, status); + + if (U_FAILURE(status)) { + errln("FAIL: could not construct formatter"); + } else { + const char* testData[][2] = { + { "1", "one" }, + { "2", "two" }, + { "15", "fifteen" }, + { "20", "twenty" }, + { "23", "twenty-three" }, + { "73", "seventy-three" }, + { "88", "eighty-eight" }, + { "100", "one hundred" }, + { "106", "one hundred and six" }, + { "127", "one hundred and twenty-seven" }, + { "200", "two hundred" }, + { "579", "five hundred and seventy-nine" }, + { "1,000", "one thousand" }, + { "2,000", "two thousand" }, + { "3,004", "three thousand and four" }, + { "4,567", "four thousand five hundred and sixty-seven" }, + { "15,943", "fifteen thousand nine hundred and forty-three" }, + { "2,345,678", "two million, three hundred and forty-five thousand, six hundred and seventy-eight" }, + { "-36", "minus thirty-six" }, + { "234.567", "two hundred and thirty-four point five six seven" }, + NULL + }; + + doTest(formatter, testData, TRUE); + + formatter->setLenient(TRUE); + const char* lpTestData[][2] = { + { "2 thousand six HUNDRED fifty-7", "2,657" }, + { "fifteen hundred and zero", "1,500" }, + { "FOurhundred thiRTY six", "436" }, + NULL + }; + doLenientParseTest(formatter, lpTestData); + } +} + +void 
+IntlTestRBNF::TestOrdinalAbbreviations() +{ + UErrorCode status = U_ZERO_ERROR; + RuleBasedNumberFormat* formatter + = new RuleBasedNumberFormat(URBNF_ORDINAL, Locale::US, status); + + if (U_FAILURE(status)) { + errln("FAIL: could not construct formatter"); + } else { + const char* testData[][2] = { + { "1", "1st" }, + { "2", "2nd" }, + { "3", "3rd" }, + { "4", "4th" }, + { "7", "7th" }, + { "10", "10th" }, + { "11", "11th" }, + { "13", "13th" }, + { "20", "20th" }, + { "21", "21st" }, + { "22", "22nd" }, + { "23", "23rd" }, + { "24", "24th" }, + { "33", "33rd" }, + { "102", "102nd" }, + { "312", "312th" }, + { "12,345", "12,345th" }, + NULL + }; + + doTest(formatter, testData, FALSE); + } +} + +void +IntlTestRBNF::TestDurations() +{ + UErrorCode status = U_ZERO_ERROR; + RuleBasedNumberFormat* formatter + = new RuleBasedNumberFormat(URBNF_DURATION, Locale::US, status); + + if (U_FAILURE(status)) { + errln("FAIL: could not construct formatter"); + } else { + const char* testData[][2] = { + { "3,600", "1:00:00" }, //move me and I fail + { "0", "0 sec." }, + { "1", "1 sec." }, + { "24", "24 sec." 
}, + { "60", "1:00" }, + { "73", "1:13" }, + { "145", "2:25" }, + { "666", "11:06" }, + // { "3,600", "1:00:00" }, + { "3,740", "1:02:20" }, + { "10,293", "2:51:33" }, + NULL + }; + + doTest(formatter, testData, TRUE); + + formatter->setLenient(TRUE); + const char* lpTestData[][2] = { + { "2-51-33", "10,293" }, + NULL + }; + doLenientParseTest(formatter, lpTestData); + } +} + +void +IntlTestRBNF::TestSpanishSpellout() +{ + UErrorCode status = U_ZERO_ERROR; + RuleBasedNumberFormat* formatter + = new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale("es", "ES", ""), status); + + if (U_FAILURE(status)) { + errln("FAIL: could not construct formatter"); + } else { + const char* testData[][2] = { + { "1", "uno" }, + { "6", "seis" }, + { "16", "diecis\\u00e9is" }, + { "20", "veinte" }, + { "24", "veinticuatro" }, + { "26", "veintis\\u00e9is" }, + { "73", "setenta y tres" }, + { "88", "ochenta y ocho" }, + { "100", "cien" }, + { "106", "ciento seis" }, + { "127", "ciento veintisiete" }, + { "200", "doscientos" }, + { "579", "quinientos setenta y nueve" }, + { "1,000", "mil" }, + { "2,000", "dos mil" }, + { "3,004", "tres mil cuatro" }, + { "4,567", "cuatro mil quinientos sesenta y siete" }, + { "15,943", "quince mil novecientos cuarenta y tres" }, + { "2,345,678", "dos mill\\u00f3n trescientos cuarenta y cinco mil seiscientos setenta y ocho"}, + { "-36", "menos treinta y seis" }, + { "234.567", "doscientos treinta y cuatro punto cinco seis siete" }, + NULL + }; + + doTest(formatter, testData, TRUE); + } +} + +void +IntlTestRBNF::TestFrenchSpellout() +{ + UErrorCode status = U_ZERO_ERROR; + RuleBasedNumberFormat* formatter + = new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale::FRANCE, status); + + if (U_FAILURE(status)) { + errln("FAIL: could not construct formatter"); + } else { + const char* testData[][2] = { + { "1", "un" }, + { "15", "quinze" }, + { "20", "vingt" }, + { "21", "vingt-et-un" }, + { "23", "vingt-trois" }, + { "62", "soixante-deux" }, + { "70", "soixante-dix" }, 
+ { "71", "soixante et onze" }, + { "73", "soixante-treize" }, + { "80", "quatre-vingts" }, + { "88", "quatre-vingt-huit" }, + { "100", "cent" }, + { "106", "cent six" }, + { "127", "cent vingt-sept" }, + { "200", "deux cents" }, + { "579", "cinq cents soixante-dix-neuf" }, + { "1,000", "mille" }, + { "1,123", "onze cents vingt-trois" }, + { "1,594", "mille cinq cents quatre-vingt-quatorze" }, + { "2,000", "deux mille" }, + { "3,004", "trois mille quatre" }, + { "4,567", "quatre mille cinq cents soixante-sept" }, + { "15,943", "quinze mille neuf cents quarante-trois" }, + { "2,345,678", "deux million trois cents quarante-cinq mille six cents soixante-dix-huit" }, + { "-36", "moins trente-six" }, + { "234.567", "deux cents trente-quatre virgule cinq six sept" }, + NULL + }; + + doTest(formatter, testData, TRUE); + + formatter->setLenient(TRUE); + const char* lpTestData[][2] = { + { "trente-un", "31" }, + { "un cents quatre vingt dix huit", "198" }, + NULL + }; + doLenientParseTest(formatter, lpTestData); + } +} +void +IntlTestRBNF::TestSwissFrenchSpellout() +{ + UErrorCode status = U_ZERO_ERROR; + RuleBasedNumberFormat* formatter + = new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale("fr", "CH", ""), status); + + if (U_FAILURE(status)) { + errln("FAIL: could not construct formatter"); + } else { + const char* testData[][2] = { + { "1", "un" }, + { "15", "quinze" }, + { "20", "vingt" }, + { "21", "vingt-et-un" }, + { "23", "vingt-trois" }, + { "62", "soixante-deux" }, + { "70", "septante" }, + { "71", "septante-et-un" }, + { "73", "septante-trois" }, + { "80", "octante" }, + { "88", "octante-huit" }, + { "100", "cent" }, + { "106", "cent six" }, + { "127", "cent vingt-sept" }, + { "200", "deux cents" }, + { "579", "cinq cents septante-neuf" }, + { "1,000", "mille" }, + { "1,123", "onze cents vingt-trois" }, + { "1,594", "mille cinq cents nonante-quatre" }, + { "2,000", "deux mille" }, + { "3,004", "trois mille quatre" }, + { "4,567", "quatre mille cinq cents 
soixante-sept" }, + { "15,943", "quinze mille neuf cents quarante-trois" }, + { "2,345,678", "deux million trois cents quarante-cinq mille six cents septante-huit" }, + { "-36", "moins trente-six" }, + { "234.567", "deux cents trente-quatre virgule cinq six sept" }, + NULL + }; + + doTest(formatter, testData, TRUE); + } +} + +void +IntlTestRBNF::TestItalianSpellout() +{ + UErrorCode status = U_ZERO_ERROR; + RuleBasedNumberFormat* formatter + = new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale::ITALIAN, status); + + if (U_FAILURE(status)) { + errln("FAIL: could not construct formatter"); + } else { + const char* testData[][2] = { + { "1", "uno" }, + { "15", "quindici" }, + { "20", "venti" }, + { "23", "ventitre" }, + { "73", "settantatre" }, + { "88", "ottantotto" }, + { "100", "cento" }, + { "106", "centosei" }, + { "108", "centotto" }, + { "127", "centoventisette" }, + { "181", "centottantuno" }, + { "200", "duecento" }, + { "579", "cinquecentosettantanove" }, + { "1,000", "mille" }, + { "2,000", "duemila" }, + { "3,004", "tremilaquattro" }, + { "4,567", "quattromilacinquecentosessantasette" }, + { "15,943", "quindicimilanovecentoquarantatre" }, + { "-36", "meno trentisei" }, + { "234.567", "duecentotrentiquattro virgola cinque sei sette" }, + NULL + }; + + doTest(formatter, testData, TRUE); + } +} + +void +IntlTestRBNF::TestGermanSpellout() +{ + UErrorCode status = U_ZERO_ERROR; + RuleBasedNumberFormat* formatter + = new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale::GERMANY, status); + + if (U_FAILURE(status)) { + errln("FAIL: could not construct formatter"); + } else { + const char* testData[][2] = { + { "1", "eins" }, + { "15", "f\\u00fcnfzehn" }, + { "20", "zwanzig" }, + { "23", "dreiundzwanzig" }, + { "73", "dreiundsiebzig" }, + { "88", "achtundachtzig" }, + { "100", "hundert" }, + { "106", "hundertsechs" }, + { "127", "hundertsiebenundzwanzig" }, + { "200", "zweihundert" }, + { "579", "f\\u00fcnfhundertneunundsiebzig" }, + { "1,000", "tausend" }, + { "2,000", 
"zweitausend" },
+            { "3,004", "dreitausendvier" },
+            { "4,567", "viertausendf\\u00fcnfhundertsiebenundsechzig" },
+            { "15,943", "f\\u00fcnfzehntausendneunhundertdreiundvierzig" },
+            { "2,345,678", "zwei Millionen dreihundertf\\u00fcnfundvierzigtausendsechshundertachtundsiebzig" },
+            NULL
+        };
+
+        doTest(formatter, testData, TRUE);
+
+        formatter->setLenient(TRUE);
+        const char* lpTestData[][2] = {
+            { "ein Tausend sechs Hundert fuenfunddreissig", "1,635" },
+            NULL
+        };
+        doLenientParseTest(formatter, lpTestData);
+    }
+}
+
+/**
+ * Spot check of the Thai spellout rules: formats a handful of numbers
+ * (including one fraction) and compares against the expected escaped
+ * Thai text, then parses each result back.
+ */
+void
+IntlTestRBNF::TestThaiSpellout()
+{
+    UErrorCode status = U_ZERO_ERROR;
+    RuleBasedNumberFormat* formatter
+        = new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale("th"), status);
+
+    if (U_FAILURE(status)) {
+        errln("FAIL: could not construct formatter");
+    } else {
+        const char* testData[][2] = {
+            { "0", "\\u0e28\\u0e39\\u0e19\\u0e22\\u0e4c" },
+            { "1", "\\u0e2b\\u0e19\\u0e36\\u0e48\\u0e07" },
+            { "10", "\\u0e2a\\u0e34\\u0e1a" },
+            { "11", "\\u0e2a\\u0e34\\u0e1a\\u0e40\\u0e2d\\u0e47\\u0e14" },
+            { "21", "\\u0e22\\u0e35\\u0e48\\u0e2a\\u0e34\\u0e1a\\u0e40\\u0e2d\\u0e47\\u0e14" },
+            { "101", "\\u0e2b\\u0e19\\u0e36\\u0e48\\u0e07\\u0e23\\u0e49\\u0e2d\\u0e22\\u0e2b\\u0e19\\u0e36\\u0e48\\u0e07" },
+            { "1.234", "\\u0e2b\\u0e19\\u0e36\\u0e48\\u0e07\\u0e08\\u0e38\\u0e14\\u0e2a\\u0e2d\\u0e07\\u0e2a\\u0e32\\u0e21\\u0e2a\\u0e35\\u0e48" },
+            NULL
+        };
+
+        doTest(formatter, testData, TRUE);
+    }
+    // the formatter was allocated with new above and is owned by this test
+    delete formatter;
+}
+
+/**
+ * Table-driven format (and optional parse-back) check.
+ *
+ * Each row of testData is { number as US-formatted digits, expected text with
+ * \uXXXX escapes }; the table ends at the first row whose first entry is NULL.
+ * For every row the number is parsed with a US NumberFormat, formatted with
+ * formatter, and compared against the unescaped expected text.  When
+ * testParsing is true the formatted text is also parsed back with formatter
+ * and compared against the original number.  The first failure is reported
+ * with errln() and ends the run.
+ *
+ * @param formatter   the rule-based formatter under test
+ * @param testData    NULL-terminated table of { number, expected text } rows
+ * @param testParsing TRUE to also verify that the formatted text parses back
+ */
+void
+IntlTestRBNF::doTest(RuleBasedNumberFormat* formatter, const char* testData[][2], UBool testParsing)
+{
+    // man, error reporting would be easier with printf-style syntax for unicode string and formattable
+
+    UErrorCode status = U_ZERO_ERROR;
+    NumberFormat* decFmt = NumberFormat::createInstance(Locale::US, status);
+    if (U_FAILURE(status)) {
+        errln("FAIL: could not create NumberFormat");
+    } else {
+        for (int i = 0; testData[i][0]; ++i) {
+            const char* numString = testData[i][0];
+            const char* expectedWords = testData[i][1];
+
+            Formattable expectedNumber;
+            decFmt->parse(numString, expectedNumber, status);
+            if (U_FAILURE(status)) {
+                errln("FAIL: decFmt could not parse %s", numString);
+                break;
+            } else {
+                UnicodeString actualString;
+                formatter->format(expectedNumber, actualString, status);
+                if (U_FAILURE(status)) {
+                    UnicodeString msg = "Fail: formatter could not format ";
+                    // status holds a failure at this point and ICU calls return
+                    // immediately when handed a failed status, so the number is
+                    // rendered into the message with a fresh error code
+                    UErrorCode msgStatus = U_ZERO_ERROR;
+                    decFmt->format(expectedNumber, msg, msgStatus);
+                    errln(msg);
+                    break;
+                } else {
+                    UnicodeString expectedString = UnicodeString(expectedWords).unescape();
+                    if (actualString != expectedString) {
+                        UnicodeString msg = "FAIL: check failed for ";
+                        decFmt->format(expectedNumber, msg, status);
+                        msg.append(", expected ");
+                        msg.append(expectedString);
+                        msg.append(" but got ");
+                        msg.append(actualString);
+                        errln(msg);
+                        break;
+                    } else if (testParsing) {
+                        Formattable parsedNumber;
+                        formatter->parse(actualString, parsedNumber, status);
+                        if (U_FAILURE(status)) {
+                            UnicodeString msg = "FAIL: formatter could not parse ";
+                            msg.append(actualString);
+                            msg.append(" status code: " );
+                            char buffer[32];
+                            // %x expects an unsigned int; sprintf terminates the string itself
+                            sprintf(buffer, "0x%x", static_cast<unsigned int>(status));
+                            msg.append(buffer);
+                            errln(msg);
+                            break;
+                        } else {
+                            if (parsedNumber != expectedNumber) {
+                                UnicodeString msg = "FAIL: parse failed for ";
+                                msg.append(actualString);
+                                msg.append(", expected ");
+                                decFmt->format(expectedNumber, msg, status);
+                                msg.append(", but got ");
+                                decFmt->format(parsedNumber, msg, status);
+                                errln(msg);
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    // released on every path; deleting a null pointer is a no-op
+    delete decFmt;
+}
+
+/**
+ * Table-driven lenient-parse check.
+ *
+ * Each row of testData is { spelled-out text with \uXXXX escapes, expected
+ * number as US-formatted digits }; the table ends at the first row whose first
+ * entry is NULL.  The text is parsed with formatter, the expected number with
+ * a US NumberFormat, and both results are formatted back to strings with that
+ * NumberFormat for comparison.  The first failure is reported with errln()
+ * and ends the run.
+ *
+ * @param formatter the rule-based formatter under test; this function does
+ *                  not itself switch it to lenient mode
+ * @param testData  NULL-terminated table of { spelled text, number } rows
+ */
+void
+IntlTestRBNF::doLenientParseTest(RuleBasedNumberFormat* formatter, const char* testData[][2])
+{
+    UErrorCode status = U_ZERO_ERROR;
+    NumberFormat* decFmt = NumberFormat::createInstance(Locale::US, status);
+    if (U_FAILURE(status)) {
+        errln("FAIL: could not create NumberFormat");
+    } else {
+        for (int i = 0; testData[i][0]; ++i) {
+            const char* spelledNumber = testData[i][0]; // spelled-out number
+            const char* asciiUSNumber = testData[i][1]; // number as ascii digits formatted for US locale
+
+            UnicodeString spelledNumberString = UnicodeString(spelledNumber).unescape();
+            Formattable actualNumber;
+            formatter->parse(spelledNumberString, actualNumber, status);
+            if (U_FAILURE(status)) {
+                UnicodeString msg = "FAIL: formatter could not parse ";
+                msg.append(spelledNumberString);
+                errln(msg);
+                break;
+            } else {
+                // I changed the logic of this test somewhat from Java-- instead of comparing the
+                // strings, I compare the Formattables. Hmmm, but the Formattables don't compare,
+                // so change it back.
+
+                UnicodeString asciiUSNumberString = asciiUSNumber;
+                Formattable expectedNumber;
+                decFmt->parse(asciiUSNumberString, expectedNumber, status);
+                if (U_FAILURE(status)) {
+                    UnicodeString msg = "FAIL: decFmt could not parse ";
+                    msg.append(asciiUSNumberString);
+                    errln(msg);
+                    break;
+                } else {
+                    UnicodeString actualNumberString;
+                    UnicodeString expectedNumberString;
+                    decFmt->format(actualNumber, actualNumberString, status);
+                    decFmt->format(expectedNumber, expectedNumberString, status);
+                    if (actualNumberString != expectedNumberString) {
+                        // trailing space keeps the number from running into "parsing"
+                        UnicodeString msg = "FAIL: parsing ";
+                        msg.append(asciiUSNumberString);
+                        msg.append("\n");
+                        msg.append(" lenient parse failed for ");
+                        msg.append(spelledNumberString);
+                        msg.append(", expected ");
+                        msg.append(expectedNumberString);
+                        msg.append(", but got ");
+                        msg.append(actualNumberString);
+                        errln(msg);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+    // released on every path; deleting a null pointer is a no-op
+    delete decFmt;
+}
+
diff --git a/icu4c/source/test/intltest/itrbnf.h b/icu4c/source/test/intltest/itrbnf.h
new file mode 100644
index 00000000000..41859ffc21a
--- /dev/null
+++ b/icu4c/source/test/intltest/itrbnf.h
@@ -0,0 +1,74 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2000, International Business Machines Corporation and    *
+ * others. All Rights Reserved.
*
+ *******************************************************************************
+ */
+
+#ifndef ITRBNF_H
+#define ITRBNF_H
+
+#include "intltest.h"
+
+#include "unicode/utypes.h"
+#include "unicode/rbnf.h"
+
+
+/**
+ * Spot-check tests for RuleBasedNumberFormat: each Test* method exercises
+ * one rule set, and the two protected helpers run the table-driven checks.
+ */
+class IntlTestRBNF : public IntlTest {
+ public:
+
+  // IntlTest override
+  virtual void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par);
+
+  /**
+   * Perform a simple spot check on the English spellout rules
+   */
+  virtual void TestEnglishSpellout();
+
+  /**
+   * Perform a simple spot check on the English ordinal-abbreviation rules
+   */
+  virtual void TestOrdinalAbbreviations();
+
+  /**
+   * Perform a simple spot check on the duration-formatting rules
+   */
+  virtual void TestDurations();
+
+  /**
+   * Perform a simple spot check on the Spanish spellout rules
+   */
+  virtual void TestSpanishSpellout();
+
+  /**
+   * Perform a simple spot check on the French spellout rules
+   */
+  virtual void TestFrenchSpellout();
+
+  /**
+   * Perform a simple spot check on the Swiss French spellout rules
+   */
+  virtual void TestSwissFrenchSpellout();
+
+  /**
+   * Perform a simple spot check on the Italian spellout rules
+   */
+  virtual void TestItalianSpellout();
+
+  /**
+   * Perform a simple spot check on the German spellout rules
+   */
+  virtual void TestGermanSpellout();
+
+  /**
+   * Perform a simple spot check on the Thai spellout rules
+   */
+  virtual void TestThaiSpellout();
+
+ protected:
+  /**
+   * Table-driven format check.  Each testData row is
+   * { number as US-formatted digits, expected text with \uXXXX escapes };
+   * the table ends at the first row whose first entry is NULL.  When
+   * testParsing is true the formatted text is also parsed back and compared
+   * with the original number.
+   */
+  virtual void doTest(RuleBasedNumberFormat* formatter, const char* testData[][2], UBool testParsing);
+
+  /**
+   * Table-driven parse check.  Each testData row is
+   * { spelled-out text, expected number as US-formatted digits };
+   * the table ends at the first row whose first entry is NULL.
+   */
+  virtual void doLenientParseTest(RuleBasedNumberFormat* formatter, const char* testData[][2]);
+};
+
+// endif ITRBNF_H
+#endif
diff --git a/icu4c/source/test/intltest/itrbnfrt.cpp b/icu4c/source/test/intltest/itrbnfrt.cpp
new file mode 100644
index 00000000000..60631250f8c
--- /dev/null
+++ b/icu4c/source/test/intltest/itrbnfrt.cpp
@@ -0,0 +1,341 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2000,
International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+
+#include "itrbnfrt.h"
+
+#include "unicode/fmtable.h"
+#include "math.h" // fabs
+
+// current macro not in icu1.8.1
+// Expands to one switch case: publishes the test's name and, when exec is
+// set, logs a banner and runs the test method.
+#define TESTCASE(id,test) \
+    case id: \
+        name = #test; \
+        if (exec) { \
+            logln(#test "---"); \
+            logln((UnicodeString)""); \
+            test(); \
+        } \
+        break
+
+/**
+ * IntlTest dispatch hook: maps index to one round-trip test, reporting its
+ * name through name and running it when exec is set.  An index past the last
+ * test yields an empty name.
+ */
+void RbnfRoundTripTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* par)
+{
+    if (exec) {
+        logln("TestSuite RuleBasedNumberFormatRT");
+    }
+
+    switch (index) {
+        TESTCASE(0, TestEnglishSpelloutRT);
+        TESTCASE(1, TestDurationsRT);
+        TESTCASE(2, TestSpanishSpelloutRT);
+        TESTCASE(3, TestFrenchSpelloutRT);
+        TESTCASE(4, TestSwissFrenchSpelloutRT);
+        TESTCASE(5, TestItalianSpelloutRT);
+        TESTCASE(6, TestGermanSpelloutRT);
+        TESTCASE(7, TestSwedishSpelloutRT);
+        TESTCASE(8, TestDutchSpelloutRT);
+        TESTCASE(9, TestJapaneseSpelloutRT);
+        TESTCASE(10, TestRussianSpelloutRT);
+        TESTCASE(11, TestGreekSpelloutRT);
+    default:
+        name = "";
+        break;
+    }
+}
+
+/**
+ * Round-trip (format, then parse back) check of the English spellout rules
+ * over the range -12345678 .. 12345678
+ */
+void
+RbnfRoundTripTest::TestEnglishSpelloutRT()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    RuleBasedNumberFormat* spellout =
+        new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale::US, ec);
+
+    if (U_SUCCESS(ec)) {
+        doTest(spellout, -12345678, 12345678);
+    } else {
+        errln("failed to construct formatter");
+    }
+    delete spellout;
+}
+
+/**
+ * Round-trip (format, then parse back) check of the duration-formatting
+ * rules over the range 0 .. 12345678
+ */
+void
+RbnfRoundTripTest::TestDurationsRT()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    RuleBasedNumberFormat* durations =
+        new RuleBasedNumberFormat(URBNF_DURATION, Locale::US, ec);
+
+    if (U_SUCCESS(ec)) {
+        doTest(durations, 0, 12345678);
+    } else {
+        errln("failed to construct formatter");
+    }
+    delete durations;
+}
+
+/**
+ * Perform an exhaustive
round-trip test on the Spanish spellout rules
+ */
+void
+RbnfRoundTripTest::TestSpanishSpelloutRT()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    RuleBasedNumberFormat* spellout =
+        new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale("es", "es"), ec);
+
+    if (U_SUCCESS(ec)) {
+        doTest(spellout, -12345678, 12345678);
+    } else {
+        errln("failed to construct formatter");
+    }
+    delete spellout;
+}
+
+/**
+ * Round-trip (format, then parse back) check of the French spellout rules
+ * over the range -12345678 .. 12345678
+ */
+void
+RbnfRoundTripTest::TestFrenchSpelloutRT()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    RuleBasedNumberFormat* spellout =
+        new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale::FRANCE, ec);
+
+    if (U_SUCCESS(ec)) {
+        doTest(spellout, -12345678, 12345678);
+    } else {
+        errln("failed to construct formatter");
+    }
+    delete spellout;
+}
+
+/**
+ * Round-trip (format, then parse back) check of the Swiss French spellout
+ * rules over the range -12345678 .. 12345678
+ */
+void
+RbnfRoundTripTest::TestSwissFrenchSpelloutRT()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    RuleBasedNumberFormat* spellout =
+        new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale("fr", "CH"), ec);
+
+    if (U_SUCCESS(ec)) {
+        doTest(spellout, -12345678, 12345678);
+    } else {
+        errln("failed to construct formatter");
+    }
+    delete spellout;
+}
+
+/**
+ * Round-trip (format, then parse back) check of the Italian spellout rules
+ * over the range -999999 .. 999999
+ */
+void
+RbnfRoundTripTest::TestItalianSpelloutRT()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    RuleBasedNumberFormat* spellout =
+        new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale::ITALIAN, ec);
+
+    if (U_SUCCESS(ec)) {
+        doTest(spellout, -999999, 999999);
+    } else {
+        errln("failed to construct formatter");
+    }
+    delete spellout;
+}
+
+/**
+ * Round-trip (format, then parse back) check of the German spellout rules
+ * over the range 0 .. 12345678
+ */
+void
+RbnfRoundTripTest::TestGermanSpelloutRT()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    RuleBasedNumberFormat* spellout =
+        new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale::GERMANY, ec);
+
+    if (U_SUCCESS(ec)) {
+        doTest(spellout, 0, 12345678);
+    } else {
+        errln("failed to construct formatter");
+    }
+    delete spellout;
+}
+
+/**
+ * Round-trip (format, then parse back) check of the Swedish spellout rules
+ * over the range 0 .. 12345678
+ */
+void
+RbnfRoundTripTest::TestSwedishSpelloutRT()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    RuleBasedNumberFormat* spellout =
+        new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale("sv", "SE"), ec);
+
+    if (U_SUCCESS(ec)) {
+        doTest(spellout, 0, 12345678);
+    } else {
+        errln("failed to construct formatter");
+    }
+    delete spellout;
+}
+
+/**
+ * Round-trip (format, then parse back) check of the Dutch spellout rules
+ * over the range -12345678 .. 12345678
+ */
+void
+RbnfRoundTripTest::TestDutchSpelloutRT()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    RuleBasedNumberFormat* spellout =
+        new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale("nl", "NL"), ec);
+
+    if (U_SUCCESS(ec)) {
+        doTest(spellout, -12345678, 12345678);
+    } else {
+        errln("failed to construct formatter");
+    }
+    delete spellout;
+}
+
+/**
+ * Round-trip (format, then parse back) check of the Japanese spellout rules
+ * over the range 0 .. 12345678
+ */
+void
+RbnfRoundTripTest::TestJapaneseSpelloutRT()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    RuleBasedNumberFormat* spellout =
+        new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale::JAPAN, ec);
+
+    if (U_SUCCESS(ec)) {
+        doTest(spellout, 0, 12345678);
+    } else {
+        errln("failed to construct formatter");
+    }
+    delete spellout;
+}
+
+/**
+ * Round-trip (format, then parse back) check of the Russian spellout rules
+ * over the range 0 .. 12345678
+ */
+void
+RbnfRoundTripTest::TestRussianSpelloutRT()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    RuleBasedNumberFormat* spellout =
+        new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale("ru", "RU"), ec);
+
+    if (U_SUCCESS(ec)) {
+        doTest(spellout, 0, 12345678);
+    } else {
+        errln("failed to construct formatter");
+    }
+    delete spellout;
+}
+
+/**
+ * Round-trip (format, then parse back) check of the Greek spellout rules
+ * over the range 0 .. 12345678
+ */
+void
+RbnfRoundTripTest::TestGreekSpelloutRT()
+{
+    UErrorCode ec = U_ZERO_ERROR;
+    RuleBasedNumberFormat* spellout =
+        new RuleBasedNumberFormat(URBNF_SPELLOUT, Locale("el", "GR"), ec);
+
+    if (U_SUCCESS(ec)) {
+        doTest(spellout, 0, 12345678);
+    } else {
+        errln("failed to construct formatter");
+    }
+    delete spellout;
+}
+
+/**
+ * Formats values between lowLimit and highLimit with formatter, parses each
+ * result back, and reports the first value that does not survive the trip.
+ *
+ * The range is sampled rather than walked exhaustively: the step is 1 while
+ * the magnitude is below 5000, 2737 below 500000, and 267437 beyond that.
+ * Every 1000th sample is logged.  When lowLimit is negative, the fractional
+ * values 1.234, 12.34 and 123.4 are round-tripped as well.
+ *
+ * @param formatter the formatter under test
+ * @param lowLimit  first value to try
+ * @param highLimit inclusive upper bound of the values to try
+ */
+void
+RbnfRoundTripTest::doTest(const RuleBasedNumberFormat* formatter,
+                          double lowLimit,
+                          double highLimit)
+{
+    char msgBuf[128];
+
+    uint32_t samples = 0;
+    double step = 1;
+    for (double value = lowLimit; value <= highLimit; value += step) {
+        if (samples % 1000 == 0) {
+            sprintf(msgBuf, "%.12g", value);
+            logln(msgBuf);
+        }
+
+        // pick the stride to the next sample from the current magnitude
+        const double magnitude = fabs(value);
+        if (magnitude >= 500000) {
+            step = 267437;
+        } else if (magnitude >= 5000) {
+            step = 2737;
+        } else {
+            step = 1;
+        }
+
+        UnicodeString spelled;
+        formatter->format(value, spelled);
+        UErrorCode ec = U_ZERO_ERROR;
+        Formattable parsed;
+        formatter->parse(spelled, parsed, ec);
+        if (U_FAILURE(ec)) {
+            sprintf(msgBuf, "Round-trip status failure: %.12g, status: %d", value, ec);
+            errln(msgBuf);
+            return;
+        }
+
+        // the parse may hand back either a double or a long
+        double roundTripped;
+        if (parsed.getType() == Formattable::kDouble) {
+            roundTripped = parsed.getDouble();
+        } else {
+            roundTripped = (double)parsed.getLong();
+        }
+
+        if (roundTripped != value) {
+            sprintf(msgBuf, "Round-trip failed: %.12g -> %.12g", value, roundTripped);
+            errln(msgBuf);
+            return;
+        }
+
+        ++samples;
+    }
+
+    if (lowLimit < 0) {
+        // fractional samples, run only when the range includes negatives
+        for (double fraction = 1.234; fraction < 1000; fraction *= 10) {
+            UnicodeString spelled;
+            formatter->format(fraction, spelled);
+            UErrorCode ec = U_ZERO_ERROR;
+            Formattable parsed;
+            formatter->parse(spelled, parsed, ec);
+            if (U_FAILURE(ec)) {
+                sprintf(msgBuf, "Round-trip status failure: %.12g, status: %d", fraction, ec);
+                errln(msgBuf);
+                return;
+            }
+
+            double roundTripped;
+            if (parsed.getType() == Formattable::kDouble) {
+                roundTripped = parsed.getDouble();
+            } else {
+                roundTripped = (double)parsed.getLong();
+            }
+
+            if (roundTripped != fraction) {
+                UnicodeString report;
+                sprintf(msgBuf, "Round-trip failed: %.12g -> ", fraction);
+                report.append(msgBuf);
+                report.append(spelled);
+                sprintf(msgBuf, " -> %.12g", roundTripped);
+                report.append(msgBuf);
+                errln(report);
+                return;
+            }
+        }
+    }
+}
+
diff --git a/icu4c/source/test/intltest/itrbnfrt.h b/icu4c/source/test/intltest/itrbnfrt.h
new file mode 100644
index 00000000000..ca211a9fb55
--- /dev/null
+++ b/icu4c/source/test/intltest/itrbnfrt.h
@@ -0,0 +1,86 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2000, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+
+#ifndef ITRBNFRT_H
+#define ITRBNFRT_H
+
+#include "intltest.h"
+
+#include "unicode/utypes.h"
+#include "unicode/rbnf.h"
+
+class RbnfRoundTripTest : public IntlTest {
+
+  // IntlTest override: dispatches index to one of the tests below
+  virtual void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par);
+
+  /**
+   * Round-trip (format, then parse back) check of the English spellout rules
+   */
+  virtual void TestEnglishSpelloutRT();
+
+  /**
+   * Round-trip (format, then parse back) check of the duration-formatting rules
+   */
+  virtual void TestDurationsRT();
+
+  /**
+   * Round-trip (format, then parse back) check of the Spanish spellout rules
+   */
+  virtual void TestSpanishSpelloutRT();
+
+  /**
+   * Round-trip (format, then parse back) check of the French spellout rules
+   */
+  virtual void TestFrenchSpelloutRT();
+
+  /**
+   * Round-trip (format, then parse back) check of the Swiss French spellout rules
+   */
+  virtual void TestSwissFrenchSpelloutRT();
+
+  /**
+   * Round-trip (format, then parse back) check of the Italian spellout rules
+   */
+  virtual void TestItalianSpelloutRT();
+
+  /**
+   * Round-trip (format, then parse back) check of the German spellout rules
+   */
+  virtual void TestGermanSpelloutRT();
+
+  /**
+   * Round-trip (format, then parse back) check of the Swedish spellout rules
+   */
+  virtual void TestSwedishSpelloutRT();
+
+  /**
+   * Round-trip (format, then parse back) check of the Dutch spellout rules
+   */
+  virtual void TestDutchSpelloutRT();
+
+  /**
+   * Round-trip (format, then parse back) check of the Japanese spellout rules
+   */
+  virtual void TestJapaneseSpelloutRT();
+
+  /**
+   * Round-trip (format, then parse back) check of the Russian spellout rules
+   */
+  virtual void TestRussianSpelloutRT();
+
+  /**
+   * Round-trip (format, then parse back) check of the Greek spellout rules
+   */
+  virtual void TestGreekSpelloutRT();
+
+ protected:
+  void doTest(const RuleBasedNumberFormat* formatter, double lowLimit, double highLimit);
+};
+
+// endif ITRBNFRT_H
+#endif