From 440cef61a7b01f78a411cc495a49b40d3eb82b7d Mon Sep 17 00:00:00 2001 From: Robert Melo Date: Tue, 21 Apr 2020 18:25:46 -0300 Subject: [PATCH] ICU-21071 Fix lenient parse rules - Check non-lenient rules before call lenint parsing - Remove logKnownIssue 9503 from test code - Adjust TestAllLocales test on ICU4C - Add lenient checks on ICU4J --- icu4c/source/i18n/nfrule.cpp | 16 ++- icu4c/source/i18n/plurfmt.cpp | 12 +- icu4c/source/test/intltest/itrbnf.cpp | 51 ++++---- .../core/src/com/ibm/icu/text/NFRule.java | 15 ++- .../src/com/ibm/icu/text/PluralFormat.java | 12 +- .../com/ibm/icu/dev/test/format/RbnfTest.java | 109 ++++++++++++++---- 6 files changed, 156 insertions(+), 59 deletions(-) diff --git a/icu4c/source/i18n/nfrule.cpp b/icu4c/source/i18n/nfrule.cpp index 3ad0291649e..4f809f9c803 100644 --- a/icu4c/source/i18n/nfrule.cpp +++ b/icu4c/source/i18n/nfrule.cpp @@ -1297,6 +1297,10 @@ NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErr #if !UCONFIG_NO_COLLATION // go through all this grief if we're in lenient-parse mode if (formatter->isLenient()) { + // Check if non-lenient rule finds the text before call lenient parsing + if (str.startsWith(prefix)) { + return prefix.length(); + } // get the formatter's collator and use it to create two // collation element iterators, one over the target string // and another over the prefix (right now, we'll throw an @@ -1505,9 +1509,15 @@ NFRule::findText(const UnicodeString& str, return str.indexOf(key, startingAt); } else { - // but if lenient parsing is turned ON, we've got some work - // ahead of us - return findTextLenient(str, key, startingAt, length); + // Check if non-lenient rule finds the text before call lenient parsing + *length = key.length(); + int32_t pos = str.indexOf(key, startingAt); + if(pos >= 0) { + return pos; + } else { + // but if lenient parsing is turned ON, we've got some work ahead of us + return findTextLenient(str, key, startingAt, length); + } } } diff --git a/icu4c/source/i18n/plurfmt.cpp b/icu4c/source/i18n/plurfmt.cpp index b99437630e6..aac35c5b094 100644 --- a/icu4c/source/i18n/plurfmt.cpp +++ b/icu4c/source/i18n/plurfmt.cpp @@ -549,9 +549,15 @@ void PluralFormat::parseType(const UnicodeString& source, const NFRule *rbnfLeni UnicodeString currArg = pattern.tempSubString(partStart->getLimit(), partLimit->getIndex() - partStart->getLimit()); if (rbnfLenientScanner != NULL) { - // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us. - int32_t length = -1; - currMatchIndex = rbnfLenientScanner->findTextLenient(source, currArg, startingAt, &length); + // Check if non-lenient rule finds the text before call lenient parsing + int32_t tempIndex = source.indexOf(currArg, startingAt); + if (tempIndex >= 0) { + currMatchIndex = tempIndex; + } else { + // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us. + int32_t length = -1; + currMatchIndex = rbnfLenientScanner->findTextLenient(source, currArg, startingAt, &length); + } } else { currMatchIndex = source.indexOf(currArg, startingAt); diff --git a/icu4c/source/test/intltest/itrbnf.cpp b/icu4c/source/test/intltest/itrbnf.cpp index ed4e865aaa3..efddaa76606 100644 --- a/icu4c/source/test/intltest/itrbnf.cpp +++ b/icu4c/source/test/intltest/itrbnf.cpp @@ -1889,16 +1889,19 @@ IntlTestRBNF::TestAllLocales() UErrorCode status = U_ZERO_ERROR; RuleBasedNumberFormat* f = new RuleBasedNumberFormat((URBNFRuleSetTag)j, *loc, status); - if (status == U_USING_DEFAULT_WARNING || status == U_USING_FALLBACK_WARNING) { - // Skip it. - delete f; - break; - } if (U_FAILURE(status)) { errln(UnicodeString(loc->getName()) + names[j] + "ERROR could not instantiate -> " + u_errorName(status)); continue; } + + Locale actualLocale = f->getLocale(ULOC_ACTUAL_LOCALE, status); + if (actualLocale != *loc) { + // Skip the redundancy + delete f; + break; + } + #if !UCONFIG_NO_COLLATION for (unsigned int numidx = 0; numidx < UPRV_LENGTHOF(numbers); numidx++) { double n = numbers[numidx]; @@ -1936,28 +1939,26 @@ IntlTestRBNF::TestAllLocales() + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble()); } } - if (!quick && !logKnownIssue("9503") ) { - // lenient parse - status = U_ZERO_ERROR; - f->setLenient(TRUE); - f->parse(str, num, status); - if (U_FAILURE(status)) { + // lenient parse + status = U_ZERO_ERROR; + f->setLenient(TRUE); + f->parse(str, num, status); + if (U_FAILURE(status)) { + errln(UnicodeString(loc->getName()) + names[j] + + "ERROR could not parse(lenient) '" + str + "' -> " + u_errorName(status)); + } + // We only check the spellout. The behavior is undefined for numbers < 1 and fractional numbers. + if (j == 0) { + if (num.getType() == Formattable::kLong && num.getLong() != n) { errln(UnicodeString(loc->getName()) + names[j] - + "ERROR could not parse(lenient) '" + str + "' -> " + u_errorName(status)); + + UnicodeString("ERROR could not roundtrip ") + n + + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getLong()); } - // We only check the spellout. The behavior is undefined for numbers < 1 and fractional numbers. - if (j == 0) { - if (num.getType() == Formattable::kLong && num.getLong() != n) { - errln(UnicodeString(loc->getName()) + names[j] - + UnicodeString("ERROR could not roundtrip ") + n - + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getLong()); - } - else if (num.getType() == Formattable::kDouble && (int64_t)(num.getDouble() * 1000) != (int64_t)(n*1000)) { - // The epsilon difference is too high. - errln(UnicodeString(loc->getName()) + names[j] - + UnicodeString("ERROR could not roundtrip ") + n - + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble()); - } + else if (num.getType() == Formattable::kDouble && (int64_t)(num.getDouble() * 1000) != (int64_t)(n*1000)) { + // The epsilon difference is too high. + errln(UnicodeString(loc->getName()) + names[j] + + UnicodeString("ERROR could not roundtrip ") + n + + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble()); } } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java b/icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java index b54fe8aeef2..4a0c4a88cc4 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java @@ -1241,6 +1241,10 @@ final class NFRule { RbnfLenientScanner scanner = formatter.getLenientScanner(); if (scanner != null) { + // Check if non-lenient rule finds the text before call lenient parsing + if (str.startsWith(prefix)) { + return prefix.length(); + } return scanner.prefixLength(str, prefix); } @@ -1290,9 +1294,14 @@ final class NFRule { } if (scanner != null) { - // if lenient parsing is turned ON, we've got some work - // ahead of us - return scanner.findText(str, key, startingAt); + // Check if non-lenient rule finds the text before call lenient parsing + int pos[] = new int[] { str.indexOf(key, startingAt), key.length() }; + if (pos[0] >= 0) { + return pos; + } else { + // if lenient parsing is turned ON, we've got some work ahead of us + return scanner.findText(str, key, startingAt); + } } // if lenient parsing is turned off, this is easy. Just call // String.indexOf() and we're done diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java index dee651369d4..3d73861e188 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java @@ -760,9 +760,15 @@ public class PluralFormat extends UFormat { String currArg = pattern.substring(partStart.getLimit(), partLimit.getIndex()); if (scanner != null) { - // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us. - int[] scannerMatchResult = scanner.findText(source, currArg, startingAt); - currMatchIndex = scannerMatchResult[0]; + // Check if non-lenient rule finds the text before call lenient parsing + int tempPos = source.indexOf(currArg, startingAt); + if (tempPos >= 0) { + currMatchIndex = tempPos; + } else { + // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us. + int[] scannerMatchResult = scanner.findText(source, currArg, startingAt); + currMatchIndex = scannerMatchResult[0]; + } } else { currMatchIndex = source.indexOf(currArg, startingAt); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java index 0dbe81e0f4a..b278649cae6 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java @@ -296,6 +296,17 @@ public class RbnfTest extends TestFmwk { }; doTest(formatter, testData, true); + + String[][] testDataLenient = { + { "fifty-7", "57" }, + { " fifty-7", "57" }, + { " fifty-7", "57" }, + { "2 thousand six HUNDRED fifty-7", "2,657" }, + { "fifteen hundred and zero", "1,500" }, + { "FOurhundred thiRTY six", "436" } + }; + + doParsingTest(formatter, testDataLenient, true); } /** @@ -350,6 +361,12 @@ public class RbnfTest extends TestFmwk { }; doTest(formatter, testData, true); + + String[][] testDataLenient = { + { "2-51-33", "10,293" }, + }; + + doParsingTest(formatter, testDataLenient, true); } /** @@ -425,6 +442,13 @@ public class RbnfTest extends TestFmwk { }; doTest(formatter, testData, true); + + String[][] testDataLenient = { + { "trente-et-un", "31" }, + { "un cent quatre vingt dix huit", "198" }, + }; + + doParsingTest(formatter, testDataLenient, true); } /** @@ -529,6 +553,12 @@ public class RbnfTest extends TestFmwk { }; doTest(formatter, testData, true); + + String[][] testDataLenient = { + { "ein Tausend sechs Hundert fuenfunddreissig", "1,635" }, + }; + + doParsingTest(formatter, testDataLenient, true); } /** @@ -1117,6 +1147,10 @@ public class RbnfTest extends TestFmwk { " (ordinal) " //" (duration) " // English only }; + boolean[] lenientMode = { + false, // non-lenient mode + true // lenient mode + }; double[] numbers = {45.678, 1, 2, 10, 11, 100, 110, 200, 1000, 1111, -1111}; int count = numbers.length; Random r = (count <= numbers.length ? null : createRandom()); @@ -1142,25 +1176,25 @@ public class RbnfTest extends TestFmwk { logln(loc.getName() + names[j] + "success format: " + n + " -> " + s); } - try { - // RBNF parse is extremely slow when lenient option is enabled. - // non-lenient parse - fmt.setLenientParseMode(false); - Number num = fmt.parse(s); - if (isVerbose()) { - logln(loc.getName() + names[j] + "success parse: " + s + " -> " + num); + for (int k = 0; k < lenientMode.length; k++) { + try { + fmt.setLenientParseMode(lenientMode[k]); + Number num = fmt.parse(s); + if (isVerbose()) { + logln(loc.getName() + names[j] + "success parse: " + s + " -> " + num); + } + if (j != 0) { + // TODO: Fix the ordinal rules. + continue; + } + if (n != num.doubleValue()) { + errors.append("\n" + loc + names[j] + "got " + num + " expected " + n); + } + } catch (ParseException pe) { + String msg = loc.getName() + names[j] + "ERROR:" + pe.getMessage(); + logln(msg); + errors.append("\n" + msg); } - if (j != 0) { - // TODO: Fix the ordinal rules. - continue; - } - if (n != num.doubleValue()) { - errors.append("\n" + loc + names[j] + "got " + num + " expected " + n); - } - } catch (ParseException pe) { - String msg = loc.getName() + names[j] + "ERROR:" + pe.getMessage(); - logln(msg); - errors.append("\n" + msg); } } } @@ -1170,10 +1204,12 @@ public class RbnfTest extends TestFmwk { } } - void doTest(RuleBasedNumberFormat formatter, String[][] testData, - boolean testParsing) { - // NumberFormat decFmt = NumberFormat.getInstance(Locale.US); - NumberFormat decFmt = new DecimalFormat("#,###.################"); + NumberFormat createDecimalFormatter() { + return new DecimalFormat("#,###.################"); + } + + void doTest(RuleBasedNumberFormat formatter, String[][] testData, boolean testParsing) { + NumberFormat decFmt = createDecimalFormatter(); try { for (int i = 0; i < testData.length; i++) { String number = testData[i][0]; @@ -1207,6 +1243,35 @@ public class RbnfTest extends TestFmwk { } } + void doParsingTest(RuleBasedNumberFormat formatter, String[][] testData, boolean lenient) { + NumberFormat decFmt = createDecimalFormatter(); + + if (lenient) { + formatter.setLenientParseMode(true); + } + for (int i = 0; i < testData.length; i++) { + try { + String s = testData[i][0]; + Number expectedNumber = decFmt.parse(testData[i][1]); + if (isVerbose()) { + logln("test[" + i + "] spellout value: (" + s + ") target: " + expectedNumber); + } + + Number num = formatter.parse(s); + if (isVerbose()) { + logln("success parse: (" + s + ") -> " + num); + } + + if (expectedNumber.doubleValue() != num.doubleValue()) { + errln("\nParsing (" + s + ") failed: got " + num + " expected " + expectedNumber); + } + } catch (Throwable e) { + e.printStackTrace(); + errln("Test failed with exception: " + e.toString()); + } + } + } + /* Tests the method * public boolean equals(Object that) */