mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-22518 Add a flag to export the output of the reference implementation from the old segmentation monkey tests
This commit is contained in:
parent
699555a5bd
commit
ba1208e49b
2 changed files with 65 additions and 11 deletions
|
@ -3544,6 +3544,9 @@ RBBILineMonkey::~RBBILineMonkey() {
|
|||
//
|
||||
// type = char | word | line | sent | title
|
||||
//
|
||||
// export = (path) Export test cases to (path)_(type).txt in the UCD
|
||||
// test case format.
|
||||
//
|
||||
// Example:
|
||||
// intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
|
||||
//
|
||||
|
@ -3980,6 +3983,8 @@ void RBBITest::TestMonkey() {
|
|||
UnicodeString breakType = "all";
|
||||
Locale locale("en");
|
||||
UBool useUText = false;
|
||||
UBool scalarsOnly = false;
|
||||
std::string exportPath;
|
||||
|
||||
if (quick == false) {
|
||||
loopCount = 10000;
|
||||
|
@ -4004,6 +4009,19 @@ void RBBITest::TestMonkey() {
|
|||
p = u.replaceFirst("", status);
|
||||
}
|
||||
|
||||
RegexMatcher pathMatcher(" *export *= *([^ ]+) *", p, 0, status);
|
||||
if (pathMatcher.find()) {
|
||||
pathMatcher.group(1, status).toUTF8String(exportPath);
|
||||
pathMatcher.reset();
|
||||
p = pathMatcher.replaceFirst("", status);
|
||||
}
|
||||
|
||||
RegexMatcher s(" *scalars_only", p, 0, status);
|
||||
if (s.find()) {
|
||||
scalarsOnly = true;
|
||||
s.reset();
|
||||
p = s.replaceFirst("", status);
|
||||
}
|
||||
|
||||
// m.reset(p);
|
||||
if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
|
||||
|
@ -4019,64 +4037,80 @@ void RBBITest::TestMonkey() {
|
|||
}
|
||||
|
||||
if (breakType == "char" || breakType == "all") {
|
||||
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_char.txt").c_str(), "w");
|
||||
RBBICharMonkey m;
|
||||
BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
|
||||
if (U_SUCCESS(status)) {
|
||||
RunMonkey(bi, m, "char", seed, loopCount, useUText);
|
||||
RunMonkey(bi, m, "char", seed, loopCount, useUText, file, scalarsOnly);
|
||||
if (breakType == "all" && useUText==false) {
|
||||
// Also run a quick test with UText when "all" is specified
|
||||
RunMonkey(bi, m, "char", seed, loopCount, true);
|
||||
RunMonkey(bi, m, "char", seed, loopCount, true, nullptr, scalarsOnly);
|
||||
}
|
||||
}
|
||||
else {
|
||||
errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
|
||||
}
|
||||
delete bi;
|
||||
if (file != nullptr) {
|
||||
fclose(file);
|
||||
}
|
||||
}
|
||||
|
||||
if (breakType == "word" || breakType == "all") {
|
||||
logln("Word Break Monkey Test");
|
||||
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_word.txt").c_str(), "w");
|
||||
RBBIWordMonkey m;
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
|
||||
if (U_SUCCESS(status)) {
|
||||
RunMonkey(bi, m, "word", seed, loopCount, useUText);
|
||||
RunMonkey(bi, m, "word", seed, loopCount, useUText, file, scalarsOnly);
|
||||
}
|
||||
else {
|
||||
errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
|
||||
}
|
||||
delete bi;
|
||||
if (file != nullptr) {
|
||||
fclose(file);
|
||||
}
|
||||
}
|
||||
|
||||
if (breakType == "line" || breakType == "all") {
|
||||
logln("Line Break Monkey Test");
|
||||
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_line.txt").c_str(), "w");
|
||||
RBBILineMonkey m;
|
||||
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
|
||||
if (loopCount >= 10) {
|
||||
loopCount = loopCount / 5; // Line break runs slower than the others.
|
||||
}
|
||||
if (U_SUCCESS(status)) {
|
||||
RunMonkey(bi, m, "line", seed, loopCount, useUText);
|
||||
RunMonkey(bi, m, "line", seed, loopCount, useUText, file, scalarsOnly);
|
||||
}
|
||||
else {
|
||||
errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
|
||||
}
|
||||
delete bi;
|
||||
if (file != nullptr) {
|
||||
fclose(file);
|
||||
}
|
||||
}
|
||||
|
||||
if (breakType == "sent" || breakType == "all" ) {
|
||||
logln("Sentence Break Monkey Test");
|
||||
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_sent.txt").c_str(), "w");
|
||||
RBBISentMonkey m;
|
||||
BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
|
||||
if (loopCount >= 10) {
|
||||
loopCount = loopCount / 10; // Sentence runs slower than the other break types
|
||||
}
|
||||
if (U_SUCCESS(status)) {
|
||||
RunMonkey(bi, m, "sent", seed, loopCount, useUText);
|
||||
RunMonkey(bi, m, "sent", seed, loopCount, useUText, file, scalarsOnly);
|
||||
}
|
||||
else {
|
||||
errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
|
||||
}
|
||||
delete bi;
|
||||
if (file != nullptr) {
|
||||
fclose(file);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -4085,14 +4119,19 @@ void RBBITest::TestMonkey() {
|
|||
//
|
||||
// Run a RBBI monkey test. Common routine, for all break iterator types.
|
||||
// Parameters:
|
||||
// bi - the break iterator to use
|
||||
// mk - MonkeyKind, abstraction for obtaining expected results
|
||||
// name - Name of test (char, word, etc.) for use in error messages
|
||||
// seed - Seed for starting random number generator (parameter from user)
|
||||
// bi - the break iterator to use
|
||||
// mk - MonkeyKind, abstraction for obtaining expected results
|
||||
// name - Name of test (char, word, etc.) for use in error messages
|
||||
// seed - Seed for starting random number generator (parameter from user)
|
||||
// numIterations
|
||||
// exportFile - Pointer to a file to which the test cases will be written in
|
||||
// UCD format. May be null.
|
||||
// scalarsOnly - Only test sequences of Unicode scalar values; if this is false,
|
||||
// arbitrary sequences of code points (including unpaired surrogates)
|
||||
// are tested.
|
||||
//
|
||||
void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
|
||||
int32_t numIterations, UBool useUText) {
|
||||
int32_t numIterations, UBool useUText, FILE *exportFile, UBool scalarsOnly) {
|
||||
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
|
@ -4157,6 +4196,9 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
|
|||
errln("%s:%d c < 0", __FILE__, __LINE__);
|
||||
break;
|
||||
}
|
||||
if (scalarsOnly && U16_IS_SURROGATE(c)) {
|
||||
continue;
|
||||
}
|
||||
// Do not assemble a supplementary character from randomly generated separate surrogates.
|
||||
// (It could be a dictionary character)
|
||||
if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
|
||||
|
@ -4274,6 +4316,16 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
|
|||
}
|
||||
}
|
||||
|
||||
if (exportFile != nullptr) {
|
||||
for (i = 0; i < testText.length();) {
|
||||
fprintf(exportFile, expectedBreaks[i] ? "÷ " : "× ");
|
||||
char32_t const c = testText.char32At(i);
|
||||
fprintf(exportFile, "%04X ", static_cast<uint32_t>(c));
|
||||
i += U16_LENGTH(c);
|
||||
}
|
||||
fprintf(exportFile, expectedBreaks[testText.length()] ? "÷ # 🐒\n" : "× # 🐒\n");
|
||||
}
|
||||
|
||||
// Compare the expected and actual results.
|
||||
for (i=0; i<=testText.length(); i++) {
|
||||
const char *errorType = nullptr;
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "intltest.h"
|
||||
|
@ -128,7 +130,7 @@ private:
|
|||
**/
|
||||
|
||||
void RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
|
||||
int32_t loopCount, UBool useUText);
|
||||
int32_t loopCount, UBool useUText, FILE *exportFile, UBool scalarsOnly);
|
||||
|
||||
// Run one of the Unicode Consortium boundary test data files.
|
||||
void runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi);
|
||||
|
|
Loading…
Add table
Reference in a new issue