From 3f959352b5f161dd965d4d0f4a894a97077b2ab7 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 29 Nov 2024 17:15:42 +0100 Subject: [PATCH] ICU-22984 Optimize old monkeys --- icu4c/source/test/intltest/rbbitst.cpp | 114 ++++++++++++++++++------- 1 file changed, 85 insertions(+), 29 deletions(-) diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index c893c844fdd..fd90247bc25 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -2619,8 +2619,7 @@ class SegmentationRule { const SegmentationRule *appliedRule = nullptr; }; - SegmentationRule(std::u16string_view name) - : name_(UnicodeString(name).toUTF8String(std::string{})) {} + SegmentationRule(std::u16string_view name) { UnicodeString(name).toUTF8String(name_); } virtual ~SegmentationRule() = default; virtual void apply(UnicodeString &remapped, std::vector &resolved) const = 0; @@ -2630,7 +2629,7 @@ class SegmentationRule { std::chrono::steady_clock::duration timeSpent() const { return time_spent_; } private: - const std::string name_; + std::string name_; protected: mutable std::chrono::steady_clock::duration time_spent_{}; }; @@ -2651,15 +2650,15 @@ class RemapRule : public SegmentationRule { auto const start = std::chrono::steady_clock::now(); UErrorCode status = U_ZERO_ERROR; UnicodeString result; - int i = 0; - int offset = 0; + std::size_t i = 0; + std::ptrdiff_t offset = 0; std::unique_ptr matcher(pattern_->matcher(remapped, status)); while (matcher->find()) { for (;; ++i) { if (!resolved[i].indexInRemapped.has_value()) { continue; } - if (*resolved[i].indexInRemapped > matcher->start(status)) { + if (*resolved[i].indexInRemapped > static_cast(matcher->start64(status))) { break; } *resolved[i].indexInRemapped += offset; @@ -2668,13 +2667,13 @@ class RemapRule : public SegmentationRule { if (!resolved[i].indexInRemapped.has_value()) { continue; } - if (*resolved[i].indexInRemapped == matcher->end(status)) { + if (*resolved[i].indexInRemapped == static_cast(matcher->end64(status))) { break; } if (resolved[i].appliedRule != nullptr && resolved[i].appliedRule->resolution() == BREAK) { - printf("Replacement rule at remapped indices %d sqq. spans a break: %s", - matcher->start(status), remapped.toUTF8String(std::string{}).c_str()); + printf("Replacement rule at remapped indices %d sqq. spans a break", + matcher->start(status)); std::terminate(); } resolved[i].appliedRule = this; @@ -2696,9 +2695,9 @@ class RemapRule : public SegmentationRule { indices += r.indexInRemapped.has_value() ? std::to_string(*r.indexInRemapped) : "null"; indices += ","; } - printf(("Inconsistent indexInRemapped " + indices + " for new remapped string " + - result.toUTF8String(std::string{}).c_str()) - .c_str()); + std::string s; + puts(("Inconsistent indexInRemapped " + indices + " for new remapped string " + + result.toUTF8String(s)).c_str()); std::terminate(); } remapped = result; @@ -2721,8 +2720,10 @@ class RegexRule : public SegmentationRule { : SegmentationRule(name), resolution_(static_cast(resolution)) { UParseError parseError; UErrorCode status = U_ZERO_ERROR; - before_.reset(RegexPattern::compile(u".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL, - parseError, status)); + before_.reset( + RegexPattern::compile(before, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status)); + endsWithBefore_.reset(RegexPattern::compile( + ".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status)); after_.reset(RegexPattern::compile(after, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status)); U_ASSERT(U_SUCCESS(status)); } @@ -2730,20 +2731,70 @@ class RegexRule : public SegmentationRule { virtual void apply(UnicodeString &remapped, std::vector &resolved) const override { auto const start = std::chrono::steady_clock::now(); UErrorCode status = U_ZERO_ERROR; - for (auto &[indexInRemapped, appliedRule] : resolved) { - if (appliedRule != nullptr) { - continue; - } - if (std::unique_ptr(before_->matcher(remapped, status)) - ->region(0, *indexInRemapped, status) - .matches(status) && - std::unique_ptr(after_->matcher(remapped, status)) - ->region(*indexInRemapped, remapped.length(), status) - .lookingAt(status)) { - appliedRule = this; + // The unicodetools implementation simply tries, for each index, to + // match the string up to the index against /.*(before)/ (with + // `matches`) and the beginning of the string after the index against + // /after/ (with `lookingAt`), but that is very slow, especially for + // nonempty /before/. While the old monkeys are not a production + // implementation, we still do not want them to be too slow, since we + // need to test millions of sample strings. Instead we search for + // /before/ and /after/, and check resulting candidates. This speeds + // things up by a factor of ~40. + // We need to be careful about greedy matching: The first position where + // the rule matches may be before the end of the first /before/ match. + // However, it is both: + // 1. within a /before/ match or at its bounds, + // 2. at the beginning of an /after/ match. + // Further, the /before/ context of the rule matches within the + // aforementioned /before/ match. Note that we need to look for + // overlapping matches, thus calls to `find` are always preceded by a + // reset via `region`. + std::unique_ptr beforeSearch(before_->matcher(remapped, status)); + std::unique_ptr afterSearch(after_->matcher(remapped, status)); + beforeSearch->useAnchoringBounds(false); + afterSearch->useAnchoringBounds(false); + U_ASSERT(U_SUCCESS(status)); + if (beforeSearch->find() && afterSearch->find()) { + for (;;) { + if (afterSearch->start(status) < beforeSearch->start(status)) { + afterSearch->region(beforeSearch->start(status), remapped.length(), status); + if (!afterSearch->find()) { + break; + } + } else if (afterSearch->start(status) > beforeSearch->end(status)) { + if (beforeSearch->start(status) == remapped.length()) { + break; + } + beforeSearch->region(remapped.moveIndex32(beforeSearch->start(status), 1), + remapped.length(), status); + if (!beforeSearch->find()) { + break; + } + } else { + auto const it = std::find_if(resolved.begin(), resolved.end(), [&](auto r) { + return r.indexInRemapped == afterSearch->start(status); + }); + U_ASSERT(it != resolved.end()); + U_ASSERT(U_SUCCESS(status)); + if (it->appliedRule == nullptr && + std::unique_ptr(endsWithBefore_->matcher(remapped, status)) + ->useAnchoringBounds(false) + .region(beforeSearch->start(status), afterSearch->start(status), status) + .matches(status)) { + it->appliedRule = this; + } + if (afterSearch->start(status) == remapped.length()) { + break; + } + afterSearch->region(remapped.moveIndex32(afterSearch->start(status), 1), + remapped.length(), status); + if (!afterSearch->find()) { + break; + } + } + U_ASSERT(U_SUCCESS(status)); } } - U_ASSERT(U_SUCCESS(status)); time_spent_ += std::chrono::steady_clock::now() - start; } @@ -2751,6 +2802,7 @@ class RegexRule : public SegmentationRule { private: std::unique_ptr before_; + std::unique_ptr endsWithBefore_; std::unique_ptr after_; const Resolution resolution_; }; @@ -3005,7 +3057,7 @@ void RBBILineMonkey::setText(const UnicodeString &s) { UnicodeString remapped = s; resolved.clear(); resolved.reserve(s.length() + 1); - for (std::size_t i = 0; i < s.length() + 1; ++i) { + for (int i = 0; i < s.length() + 1; ++i) { resolved.emplace_back(i); } for (const auto& rule : rules) { @@ -3038,8 +3090,12 @@ const std::vector& RBBILineMonkey::charClasses() { RBBILineMonkey::~RBBILineMonkey() { - for (auto const& rule : rules) { - printf("%s : %lld ms\n", rule->name().c_str(), rule->timeSpent() / std::chrono::milliseconds(1)); + constexpr bool debuggingOldMonkeyPerformance = false; + if (debuggingOldMonkeyPerformance) { + for (auto const &rule : rules) { + puts((rule->name() + " : " + std::to_string(rule->timeSpent() / std::chrono::milliseconds(1)) + + " ms").c_str()); + } } delete fCharBI;