From 3f959352b5f161dd965d4d0f4a894a97077b2ab7 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Fri, 29 Nov 2024 17:15:42 +0100
Subject: [PATCH] ICU-22984 Optimize old monkeys

---
 icu4c/source/test/intltest/rbbitst.cpp | 114 ++++++++++++++++++-------
 1 file changed, 85 insertions(+), 29 deletions(-)
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index c893c844fdd..fd90247bc25 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -2619,8 +2619,7 @@ class SegmentationRule {
         const SegmentationRule *appliedRule = nullptr;
     };
 
-    SegmentationRule(std::u16string_view name)
-        : name_(UnicodeString(name).toUTF8String(std::string{})) {}
+    SegmentationRule(std::u16string_view name) { UnicodeString(name).toUTF8String(name_); }
     virtual ~SegmentationRule() = default;
 
     virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const = 0;
@@ -2630,7 +2629,7 @@ class SegmentationRule {
     std::chrono::steady_clock::duration timeSpent() const { return time_spent_; }
 
   private:
-    const std::string name_;
+    std::string name_;
   protected:
     mutable std::chrono::steady_clock::duration time_spent_{};
 };
@@ -2651,15 +2650,15 @@ class RemapRule : public SegmentationRule {
         auto const start = std::chrono::steady_clock::now();
         UErrorCode status = U_ZERO_ERROR;
         UnicodeString result;
-        int i = 0;
-        int offset = 0;
+        std::size_t i = 0;
+        std::ptrdiff_t offset = 0;
         std::unique_ptr<RegexMatcher> matcher(pattern_->matcher(remapped, status));
         while (matcher->find()) {
             for (;; ++i) {
                 if (!resolved[i].indexInRemapped.has_value()) {
                     continue;
                 }
-                if (*resolved[i].indexInRemapped > matcher->start(status)) {
+                if (*resolved[i].indexInRemapped > static_cast<std::size_t>(matcher->start64(status))) {
                     break;
                 }
                 *resolved[i].indexInRemapped += offset;
@@ -2668,13 +2667,13 @@ class RemapRule : public SegmentationRule {
                 if (!resolved[i].indexInRemapped.has_value()) {
                     continue;
                 }
-                if (*resolved[i].indexInRemapped == matcher->end(status)) {
+                if (*resolved[i].indexInRemapped == static_cast<std::size_t>(matcher->end64(status))) {
                     break;
                 }
                 if (resolved[i].appliedRule != nullptr &&
                     resolved[i].appliedRule->resolution() == BREAK) {
-                    printf("Replacement rule at remapped indices %d sqq. spans a break: %s",
-                           matcher->start(status), remapped.toUTF8String(std::string{}).c_str());
+                    printf("Replacement rule at remapped indices %d sqq. spans a break",
+                           matcher->start(status));
                     std::terminate();
                 }
                 resolved[i].appliedRule = this;
@@ -2696,9 +2695,9 @@ class RemapRule : public SegmentationRule {
                 indices += r.indexInRemapped.has_value() ? std::to_string(*r.indexInRemapped) : "null";
                 indices += ",";
             }
-            printf(("Inconsistent indexInRemapped " + indices + " for new remapped string " +
-                    result.toUTF8String(std::string{}).c_str())
-                       .c_str());
+            std::string s;
+            puts(("Inconsistent indexInRemapped " + indices + " for new remapped string " +
+                  result.toUTF8String(s)).c_str());
             std::terminate();
         }
         remapped = result;
@@ -2721,8 +2720,10 @@ class RegexRule : public SegmentationRule {
         : SegmentationRule(name), resolution_(static_cast<Resolution>(resolution)) {
         UParseError parseError;
         UErrorCode status = U_ZERO_ERROR;
-        before_.reset(RegexPattern::compile(u".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL,
-                                            parseError, status));
+        before_.reset(
+            RegexPattern::compile(before, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
+        endsWithBefore_.reset(RegexPattern::compile(
+            ".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
         after_.reset(RegexPattern::compile(after, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
         U_ASSERT(U_SUCCESS(status));
     }
@@ -2730,20 +2731,70 @@ class RegexRule : public SegmentationRule {
     virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const override {
         auto const start = std::chrono::steady_clock::now();
         UErrorCode status = U_ZERO_ERROR;
-        for (auto &[indexInRemapped, appliedRule] : resolved) {
-            if (appliedRule != nullptr) {
-                continue;
-            }
-            if (std::unique_ptr<RegexMatcher>(before_->matcher(remapped, status))
-                    ->region(0, *indexInRemapped, status)
-                    .matches(status) &&
-                std::unique_ptr<RegexMatcher>(after_->matcher(remapped, status))
-                    ->region(*indexInRemapped, remapped.length(), status)
-                    .lookingAt(status)) {
-                appliedRule = this;
+        // The unicodetools implementation simply tries, for each index, to
+        // match the string up to the index against /.*(before)/ (with
+        // `matches`) and the beginning of the string after the index against
+        // /after/ (with `lookingAt`), but that is very slow, especially for
+        // nonempty /before/.  While the old monkeys are not a production
+        // implementation, we still do not want them to be too slow, since we
+        // need to test millions of sample strings.  Instead we search for
+        // /before/ and /after/, and check resulting candidates.  This speeds
+        // things up by a factor of ~40.
+        // We need to be careful about greedy matching: The first position where
+        // the rule matches may be before the end of the first /before/ match.
+        // However, it is both:
+        //   1. within a /before/ match or at its bounds,
+        //   2. at the beginning of an /after/ match.
+        // Further, the /before/ context of the rule matches within the
+        // aforementioned /before/ match.  Note that we need to look for
+        // overlapping matches, thus calls to `find` are always preceded by a
+        // reset via `region`.
+        std::unique_ptr<RegexMatcher> beforeSearch(before_->matcher(remapped, status));
+        std::unique_ptr<RegexMatcher> afterSearch(after_->matcher(remapped, status));
+        beforeSearch->useAnchoringBounds(false);
+        afterSearch->useAnchoringBounds(false);
+        U_ASSERT(U_SUCCESS(status));
+        if (beforeSearch->find() && afterSearch->find()) {
+            for (;;) {
+                if (afterSearch->start(status) < beforeSearch->start(status)) {
+                    afterSearch->region(beforeSearch->start(status), remapped.length(), status);
+                    if (!afterSearch->find()) {
+                        break;
+                    }
+                } else if (afterSearch->start(status) > beforeSearch->end(status)) {
+                    if (beforeSearch->start(status) == remapped.length()) {
+                        break;
+                    }
+                    beforeSearch->region(remapped.moveIndex32(beforeSearch->start(status), 1),
+                                         remapped.length(), status);
+                    if (!beforeSearch->find()) {
+                      break;
+                    }
+                } else {
+                    auto const it = std::find_if(resolved.begin(), resolved.end(), [&](auto r) {
+                        return r.indexInRemapped == afterSearch->start(status);
+                    });
+                    U_ASSERT(it != resolved.end());
+                    U_ASSERT(U_SUCCESS(status));
+                    if (it->appliedRule == nullptr &&
+                        std::unique_ptr<RegexMatcher>(endsWithBefore_->matcher(remapped, status))
+                            ->useAnchoringBounds(false)
+                            .region(beforeSearch->start(status), afterSearch->start(status), status)
+                            .matches(status)) {
+                        it->appliedRule = this;
+                    }
+                    if (afterSearch->start(status) == remapped.length()) {
+                        break;
+                    }
+                    afterSearch->region(remapped.moveIndex32(afterSearch->start(status), 1),
+                                        remapped.length(), status);
+                    if (!afterSearch->find()) {
+                        break;
+                    }
+                }
+                U_ASSERT(U_SUCCESS(status));
             }
         }
-        U_ASSERT(U_SUCCESS(status));
         time_spent_ += std::chrono::steady_clock::now() - start;
     }
 
@@ -2751,6 +2802,7 @@ class RegexRule : public SegmentationRule {
 
   private:
     std::unique_ptr<RegexPattern> before_;
+    std::unique_ptr<RegexPattern> endsWithBefore_;
     std::unique_ptr<RegexPattern> after_;
     const Resolution resolution_;
 };
@@ -3005,7 +3057,7 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
     UnicodeString remapped = s;
     resolved.clear();
     resolved.reserve(s.length() + 1);
-    for (std::size_t i = 0; i < s.length() + 1; ++i) {
+    for (int i = 0; i < s.length() + 1; ++i) {
         resolved.emplace_back(i);
     }
     for (const auto& rule : rules) {
@@ -3038,8 +3090,12 @@ const std::vector<UnicodeSet>& RBBILineMonkey::charClasses() {
 
 
 RBBILineMonkey::~RBBILineMonkey() {
-    for (auto const& rule : rules) {
-        printf("%s : %lld ms\n", rule->name().c_str(), rule->timeSpent() / std::chrono::milliseconds(1));
+    constexpr bool debuggingOldMonkeyPerformance = false;
+    if (debuggingOldMonkeyPerformance) {
+        for (auto const &rule : rules) {
+          puts((rule->name() + " : " + std::to_string(rule->timeSpent() / std::chrono::milliseconds(1)) +
+                " ms").c_str());
+        }
     }
 
     delete fCharBI;