ICU-22518 Add a flag to export the output of the reference implementation from the old segmentation monkey tests

This commit is contained in:
Robin Leroy 2023-09-13 18:43:33 +02:00
parent 699555a5bd
commit ba1208e49b
2 changed files with 65 additions and 11 deletions

View file

@ -3544,6 +3544,9 @@ RBBILineMonkey::~RBBILineMonkey() {
//
// type = char | word | line | sent | title
//
// export = (path) Export test cases to (path)_(type).txt in the UCD
// test case format.
//
// Example:
// intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
//
@ -3980,6 +3983,8 @@ void RBBITest::TestMonkey() {
UnicodeString breakType = "all";
Locale locale("en");
UBool useUText = false;
UBool scalarsOnly = false;
std::string exportPath;
if (quick == false) {
loopCount = 10000;
@ -4004,6 +4009,19 @@ void RBBITest::TestMonkey() {
p = u.replaceFirst("", status);
}
RegexMatcher pathMatcher(" *export *= *([^ ]+) *", p, 0, status);
if (pathMatcher.find()) {
pathMatcher.group(1, status).toUTF8String(exportPath);
pathMatcher.reset();
p = pathMatcher.replaceFirst("", status);
}
RegexMatcher s(" *scalars_only", p, 0, status);
if (s.find()) {
scalarsOnly = true;
s.reset();
p = s.replaceFirst("", status);
}
// m.reset(p);
if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
@ -4019,64 +4037,80 @@ void RBBITest::TestMonkey() {
}
if (breakType == "char" || breakType == "all") {
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_char.txt").c_str(), "w");
RBBICharMonkey m;
BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "char", seed, loopCount, useUText);
RunMonkey(bi, m, "char", seed, loopCount, useUText, file, scalarsOnly);
if (breakType == "all" && useUText==false) {
// Also run a quick test with UText when "all" is specified
RunMonkey(bi, m, "char", seed, loopCount, true);
RunMonkey(bi, m, "char", seed, loopCount, true, nullptr, scalarsOnly);
}
}
else {
errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
}
delete bi;
if (file != nullptr) {
fclose(file);
}
}
if (breakType == "word" || breakType == "all") {
logln("Word Break Monkey Test");
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_word.txt").c_str(), "w");
RBBIWordMonkey m;
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "word", seed, loopCount, useUText);
RunMonkey(bi, m, "word", seed, loopCount, useUText, file, scalarsOnly);
}
else {
errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
}
delete bi;
if (file != nullptr) {
fclose(file);
}
}
if (breakType == "line" || breakType == "all") {
logln("Line Break Monkey Test");
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_line.txt").c_str(), "w");
RBBILineMonkey m;
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
if (loopCount >= 10) {
loopCount = loopCount / 5; // Line break runs slower than the others.
}
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "line", seed, loopCount, useUText);
RunMonkey(bi, m, "line", seed, loopCount, useUText, file, scalarsOnly);
}
else {
errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
}
delete bi;
if (file != nullptr) {
fclose(file);
}
}
if (breakType == "sent" || breakType == "all" ) {
logln("Sentence Break Monkey Test");
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_sent.txt").c_str(), "w");
RBBISentMonkey m;
BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
if (loopCount >= 10) {
loopCount = loopCount / 10; // Sentence runs slower than the other break types
}
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "sent", seed, loopCount, useUText);
RunMonkey(bi, m, "sent", seed, loopCount, useUText, file, scalarsOnly);
}
else {
errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
}
delete bi;
if (file != nullptr) {
fclose(file);
}
}
#endif
@ -4085,14 +4119,19 @@ void RBBITest::TestMonkey() {
//
// Run a RBBI monkey test. Common routine, for all break iterator types.
// Parameters:
// bi - the break iterator to use
// mk - MonkeyKind, abstraction for obtaining expected results
// name - Name of test (char, word, etc.) for use in error messages
// seed - Seed for starting random number generator (parameter from user)
// bi - the break iterator to use
// mk - MonkeyKind, abstraction for obtaining expected results
// name - Name of test (char, word, etc.) for use in error messages
// seed - Seed for starting random number generator (parameter from user)
// numIterations
// exportFile - Pointer to a file to which the test cases will be written in
// UCD format. May be null.
// scalarsOnly - Only test sequences of Unicode scalar values; if this is false,
// arbitrary sequences of code points (including unpaired surrogates)
// are tested.
//
void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
int32_t numIterations, UBool useUText) {
int32_t numIterations, UBool useUText, FILE *exportFile, UBool scalarsOnly) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
@ -4157,6 +4196,9 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
errln("%s:%d c < 0", __FILE__, __LINE__);
break;
}
if (scalarsOnly && U16_IS_SURROGATE(c)) {
continue;
}
// Do not assemble a supplementary character from randomly generated separate surrogates.
// (It could be a dictionary character)
if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
@ -4274,6 +4316,16 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
}
}
if (exportFile != nullptr) {
for (i = 0; i < testText.length();) {
fprintf(exportFile, expectedBreaks[i] ? "÷ " : "× ");
char32_t const c = testText.char32At(i);
fprintf(exportFile, "%04X ", static_cast<uint32_t>(c));
i += U16_LENGTH(c);
}
fprintf(exportFile, expectedBreaks[testText.length()] ? "÷ # 🐒\n" : "× # 🐒\n");
}
// Compare the expected and actual results.
for (i=0; i<=testText.length(); i++) {
const char *errorType = nullptr;

View file

@ -17,6 +17,8 @@
#if !UCONFIG_NO_BREAK_ITERATION
#include <stdio.h>
#include <memory>
#include "intltest.h"
@ -128,7 +130,7 @@ private:
**/
void RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
int32_t loopCount, UBool useUText);
int32_t loopCount, UBool useUText, FILE *exportFile, UBool scalarsOnly);
// Run one of the Unicode Consortium boundary test data files.
void runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi);