ICU-22984 Move old monkeys

2025-04-05 13:35:32 +00:00 · 2024-11-29 19:52:37 +01:00 · 2024-11-29 19:52:37 +01:00 · 757f27cd35
commit 757f27cd35
parent 3f959352b5
1 changed files with 206 additions and 202 deletions
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -1593,6 +1593,212 @@ unsigned int RBBIMonkeyKind::maxClassNameSize() {
    return maxSize;
 }

+namespace {
+
+class SegmentationRule {  // Abstract base class for a named segmentation rule.
+  public:
+    enum Resolution {  // Each enumerator's value is the UTF-16 code unit of its sign.
+        BREAK = u'÷',
+        NO_BREAK = u'×',
+    };
+    struct BreakContext {  // Per-index state that apply() reads and updates.
+        BreakContext(std::size_t index) : indexInRemapped(index) {}
+        std::optional<std::size_t> indexInRemapped;  // nullopt once RemapRule::apply has swallowed it.
+        const SegmentationRule *appliedRule = nullptr;  // nullptr until some rule claims this index.
+    };
+    // The name is stored converted to UTF-8.
+    SegmentationRule(std::u16string_view name) { UnicodeString(name).toUTF8String(name_); }
+    virtual ~SegmentationRule() = default;
+    // Applies the rule to `remapped`; implementations update `resolved` (and possibly `remapped`).
+    virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const = 0;
+    virtual Resolution resolution() const = 0;
+    const std::string &name() const { return name_; }
+    // Total time accumulated in timeSpent_ by the apply() overrides.
+    std::chrono::steady_clock::duration timeSpent() const { return timeSpent_; }
+
+  private:
+    std::string name_;
+
+  protected:
+    mutable std::chrono::steady_clock::duration timeSpent_{};  // mutable: apply() is const.
+};
+
+class RemapRule : public SegmentationRule {  // Rewrites regex matches in the remapped string.
+  public:
+    RemapRule(const std::u16string_view name, const std::u16string_view pattern,
+              const std::u16string_view replacement)
+        : SegmentationRule(name), replacement_(replacement) {
+        UParseError parseError;
+        UErrorCode status = U_ZERO_ERROR;
+        pattern_.reset(
+            RegexPattern::compile(pattern, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
+        U_ASSERT(U_SUCCESS(status));  // The pattern is expected to compile.
+    }
+    // Replaces all pattern matches in `remapped`, shifting or invalidating the indices in `resolved`.
+    virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const override {
+        auto const start = std::chrono::steady_clock::now();
+        UErrorCode status = U_ZERO_ERROR;
+        UnicodeString result;
+        std::size_t i = 0;          // Position in `resolved`; only ever advances.
+        std::ptrdiff_t offset = 0;  // Shift from old to new indices after the replacements so far.
+        std::unique_ptr<RegexMatcher> matcher(pattern_->matcher(remapped, status));
+        while (matcher->find()) {
+            for (;; ++i) {  // Shift the indices up to and including the start of the match.
+                if (!resolved[i].indexInRemapped.has_value()) {
+                    continue;
+                }
+                if (*resolved[i].indexInRemapped > static_cast<std::size_t>(matcher->start64(status))) {
+                    break;
+                }
+                *resolved[i].indexInRemapped += offset;
+            }
+            for (;; ++i) {  // Invalidate the indices strictly inside the match.
+                if (!resolved[i].indexInRemapped.has_value()) {
+                    continue;
+                }
+                if (*resolved[i].indexInRemapped == static_cast<std::size_t>(matcher->end64(status))) {
+                    break;  // resolved[i] now sits at the end of the match.
+                }
+                if (resolved[i].appliedRule != nullptr &&
+                    resolved[i].appliedRule->resolution() == BREAK) {
+                    fprintf(stderr, "Replacement rule at remapped indices %d sqq. spans a break\n",
+                            matcher->start(status));
+                    std::terminate();  // Diagnostics go to unbuffered stderr: abort need not flush stdout.
+                }
+                resolved[i].appliedRule = this;
+                resolved[i].indexInRemapped = std::nullopt;
+            }
+            matcher->appendReplacement(result, replacement_, status);
+            offset = result.length() - *resolved[i].indexInRemapped;  // Match end maps to end of result.
+        }
+        for (; i < resolved.size(); ++i) {  // Shift the indices from the end of the last match onwards.
+            if (!resolved[i].indexInRemapped.has_value()) {
+                continue;
+            }
+            *resolved[i].indexInRemapped += offset;
+        }
+        matcher->appendTail(result);
+        if (resolved.back().indexInRemapped != result.length()) {  // Last index must be the new end.
+            std::string indices;
+            for (const auto &r : resolved) {
+                indices += r.indexInRemapped.has_value() ? std::to_string(*r.indexInRemapped) : "null";
+                indices += ",";
+            }
+            std::string s;
+            fputs(("Inconsistent indexInRemapped " + indices + " for new remapped string " +
+                   result.toUTF8String(s) + "\n").c_str(),
+                  stderr);
+            std::terminate();  // As above, stderr is used so the message survives the abort.
+        }
+        remapped = result;
+        U_ASSERT(U_SUCCESS(status));
+        timeSpent_ += std::chrono::steady_clock::now() - start;  // Accumulate the time spent here.
+    }
+
+    virtual Resolution resolution() const override { return NO_BREAK; }
+
+  private:
+    std::unique_ptr<RegexPattern> pattern_;
+    UnicodeString replacement_;
+};
+
+class RegexRule : public SegmentationRule {  // Rule given by a /before/ and an /after/ regex.
+  public:
+    template <typename T, typename = std::enable_if_t<std::is_constructible_v<T, Resolution>>>
+    RegexRule(const std::u16string_view name, const std::u16string_view before, T resolution,
+              const std::u16string_view after)
+        : SegmentationRule(name), resolution_(static_cast<Resolution>(resolution)) {
+        UParseError parseError;
+        UErrorCode status = U_ZERO_ERROR;
+        before_.reset(
+            RegexPattern::compile(before, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
+        endsWithBefore_.reset(RegexPattern::compile(
+            ".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
+        after_.reset(RegexPattern::compile(after, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
+        U_ASSERT(U_SUCCESS(status));  // All three patterns are expected to compile.
+    }
+    // Sets `appliedRule` to this rule at each still-unclaimed index where the rule matches.
+    virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const override {
+        auto const start = std::chrono::steady_clock::now();
+        UErrorCode status = U_ZERO_ERROR;
+        // The unicodetools implementation simply tries, for each index, to
+        // match the string up to the index against /.*(before)/ (with
+        // `matches`) and the beginning of the string after the index against
+        // /after/ (with `lookingAt`), but that is very slow, especially for
+        // nonempty /before/.  While the old monkeys are not a production
+        // implementation, we still do not want them to be too slow, since we
+        // need to test millions of sample strings.  Instead we search for
+        // /before/ and /after/, and check resulting candidates.  This speeds
+        // things up by a factor of ~40.
+        // We need to be careful about greedy matching: The first position where
+        // the rule matches may be before the end of the first /before/ match.
+        // However, it is both:
+        //   1. within a /before/ match or at its bounds,
+        //   2. at the beginning of an /after/ match.
+        // Further, the /before/ context of the rule matches within the
+        // aforementioned /before/ match.  Note that we need to look for
+        // overlapping matches, thus calls to `find` are always preceded by a
+        // reset via `region`.
+        std::unique_ptr<RegexMatcher> beforeSearch(before_->matcher(remapped, status));
+        std::unique_ptr<RegexMatcher> afterSearch(after_->matcher(remapped, status));
+        beforeSearch->useAnchoringBounds(false);
+        afterSearch->useAnchoringBounds(false);
+        U_ASSERT(U_SUCCESS(status));
+        if (beforeSearch->find() && afterSearch->find()) {
+            for (;;) {
+                if (afterSearch->start(status) < beforeSearch->start(status)) {  // after is too early
+                    afterSearch->region(beforeSearch->start(status), remapped.length(), status);
+                    if (!afterSearch->find()) {
+                        break;
+                    }
+                } else if (afterSearch->start(status) > beforeSearch->end(status)) {  // after is too late
+                    if (beforeSearch->start(status) == remapped.length()) {
+                        break;  // Already at the end of the string; nothing left to search.
+                    }
+                    beforeSearch->region(remapped.moveIndex32(beforeSearch->start(status), 1),
+                                         remapped.length(), status);  // Resume one code point later.
+                    if (!beforeSearch->find()) {
+                        break;
+                    }
+                } else {  // Candidate: /after/ starts within the bounds of the /before/ match.
+                    auto const it = std::find_if(resolved.begin(), resolved.end(), [&](auto r) {
+                        return r.indexInRemapped == afterSearch->start(status);
+                    });
+                    U_ASSERT(it != resolved.end());  // Each candidate index is expected to have a context.
+                    U_ASSERT(U_SUCCESS(status));
+                    if (it->appliedRule == nullptr &&  // An index already claimed by a rule is left alone.
+                        std::unique_ptr<RegexMatcher>(endsWithBefore_->matcher(remapped, status))
+                            ->useAnchoringBounds(false)
+                            .region(beforeSearch->start(status), afterSearch->start(status), status)
+                            .matches(status)) {
+                        it->appliedRule = this;
+                    }
+                    if (afterSearch->start(status) == remapped.length()) {
+                        break;  // Already at the end of the string; nothing left to search.
+                    }
+                    afterSearch->region(remapped.moveIndex32(afterSearch->start(status), 1),
+                                        remapped.length(), status);  // Resume one code point later.
+                    if (!afterSearch->find()) {
+                        break;
+                    }
+                }
+                U_ASSERT(U_SUCCESS(status));
+            }
+        }
+        timeSpent_ += std::chrono::steady_clock::now() - start;  // Accumulate the time spent here.
+    }
+
+    virtual Resolution resolution() const override { return resolution_; }
+
+  private:
+    std::unique_ptr<RegexPattern> before_;
+    std::unique_ptr<RegexPattern> endsWithBefore_;  // Compiled from ".*(" + before + ")".
+    std::unique_ptr<RegexPattern> after_;
+    const Resolution resolution_;
+};
+
+}  // namespace
+
 //----------------------------------------------------------------------------------------
 //
 //   Random Numbers.  We need a long cycle length since we run overnight tests over
@ -2605,208 +2811,6 @@ RBBISentMonkey::~RBBISentMonkey() {
    delete fExtendSet;
 }

-
-
-class SegmentationRule {
-  public:
-    enum Resolution {
-        BREAK = u'÷',
-        NO_BREAK = u'×',
-    };
-    struct BreakContext {
-        BreakContext(std::size_t index) : indexInRemapped(index) {}
-        std::optional<std::size_t> indexInRemapped;
-        const SegmentationRule *appliedRule = nullptr;
-    };
-
-    SegmentationRule(std::u16string_view name) { UnicodeString(name).toUTF8String(name_); }
-    virtual ~SegmentationRule() = default;
-
-    virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const = 0;
-    virtual Resolution resolution() const = 0;
-    const std::string &name() const { return name_; }
-
-    std::chrono::steady_clock::duration timeSpent() const { return time_spent_; }
-
-  private:
-    std::string name_;
-  protected:
-    mutable std::chrono::steady_clock::duration time_spent_{};
-};
-
-class RemapRule : public SegmentationRule {
-  public:
-    RemapRule(const std::u16string_view name, const std::u16string_view pattern,
-              const std::u16string_view replacement)
-        : SegmentationRule(name), replacement_(replacement) {
-        UParseError parseError;
-        UErrorCode status = U_ZERO_ERROR;
-        pattern_.reset(RegexPattern::compile(pattern,
-                                             UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
-        U_ASSERT(U_SUCCESS(status));
-    }
-
-    virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const override {
-        auto const start = std::chrono::steady_clock::now();
-        UErrorCode status = U_ZERO_ERROR;
-        UnicodeString result;
-        std::size_t i = 0;
-        std::ptrdiff_t offset = 0;
-        std::unique_ptr<RegexMatcher> matcher(pattern_->matcher(remapped, status));
-        while (matcher->find()) {
-            for (;; ++i) {
-                if (!resolved[i].indexInRemapped.has_value()) {
-                    continue;
-                }
-                if (*resolved[i].indexInRemapped > static_cast<std::size_t>(matcher->start64(status))) {
-                    break;
-                }
-                *resolved[i].indexInRemapped += offset;
-            }
-            for (;; ++i) {
-                if (!resolved[i].indexInRemapped.has_value()) {
-                    continue;
-                }
-                if (*resolved[i].indexInRemapped == static_cast<std::size_t>(matcher->end64(status))) {
-                    break;
-                }
-                if (resolved[i].appliedRule != nullptr &&
-                    resolved[i].appliedRule->resolution() == BREAK) {
-                    printf("Replacement rule at remapped indices %d sqq. spans a break",
-                           matcher->start(status));
-                    std::terminate();
-                }
-                resolved[i].appliedRule = this;
-                resolved[i].indexInRemapped = std::nullopt;
-            }
-            matcher->appendReplacement(result, replacement_, status);
-            offset = result.length() - *resolved[i].indexInRemapped;
-        }
-        for (; i < resolved.size(); ++i) {
-            if (!resolved[i].indexInRemapped.has_value()) {
-                continue;
-            }
-            *resolved[i].indexInRemapped += offset;
-        }
-        matcher->appendTail(result);
-        if (resolved.back().indexInRemapped != result.length()) {
-            std::string indices;
-            for (const auto r : resolved) {
-                indices += r.indexInRemapped.has_value() ? std::to_string(*r.indexInRemapped) : "null";
-                indices += ",";
-            }
-            std::string s;
-            puts(("Inconsistent indexInRemapped " + indices + " for new remapped string " +
-                  result.toUTF8String(s)).c_str());
-            std::terminate();
-        }
-        remapped = result;
-        U_ASSERT(U_SUCCESS(status));
-        time_spent_ += std::chrono::steady_clock::now() - start;
-    }
-
-    virtual Resolution resolution() const override { return NO_BREAK; }
-
-  private:
-    std::unique_ptr<RegexPattern> pattern_;
-    UnicodeString replacement_;
-};
-
-class RegexRule : public SegmentationRule {
-  public:
-    template<typename T, typename = std::enable_if_t<std::is_constructible_v<T, Resolution>>>
-    RegexRule(const std::u16string_view name, const std::u16string_view before, T resolution,
-              const std::u16string_view after)
-        : SegmentationRule(name), resolution_(static_cast<Resolution>(resolution)) {
-        UParseError parseError;
-        UErrorCode status = U_ZERO_ERROR;
-        before_.reset(
-            RegexPattern::compile(before, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
-        endsWithBefore_.reset(RegexPattern::compile(
-            ".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
-        after_.reset(RegexPattern::compile(after, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
-        U_ASSERT(U_SUCCESS(status));
-    }
-
-    virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const override {
-        auto const start = std::chrono::steady_clock::now();
-        UErrorCode status = U_ZERO_ERROR;
-        // The unicodetools implementation simply tries, for each index, to
-        // match the string up to the index against /.*(before)/ (with
-        // `matches`) and the beginning of the string after the index against
-        // /after/ (with `lookingAt`), but that is very slow, especially for
-        // nonempty /before/.  While the old monkeys are not a production
-        // implementation, we still do not want them to be too slow, since we
-        // need to test millions of sample strings.  Instead we search for
-        // /before/ and /after/, and check resulting candidates.  This speeds
-        // things up by a factor of ~40.
-        // We need to be careful about greedy matching: The first position where
-        // the rule matches may be before the end of the first /before/ match.
-        // However, it is both:
-        //   1. within a /before/ match or at its bounds,
-        //   2. at the beginning of an /after/ match.
-        // Further, the /before/ context of the rule matches within the
-        // aforementioned /before/ match.  Note that we need to look for
-        // overlapping matches, thus calls to `find` are always preceded by a
-        // reset via `region`.
-        std::unique_ptr<RegexMatcher> beforeSearch(before_->matcher(remapped, status));
-        std::unique_ptr<RegexMatcher> afterSearch(after_->matcher(remapped, status));
-        beforeSearch->useAnchoringBounds(false);
-        afterSearch->useAnchoringBounds(false);
-        U_ASSERT(U_SUCCESS(status));
-        if (beforeSearch->find() && afterSearch->find()) {
-            for (;;) {
-                if (afterSearch->start(status) < beforeSearch->start(status)) {
-                    afterSearch->region(beforeSearch->start(status), remapped.length(), status);
-                    if (!afterSearch->find()) {
-                        break;
-                    }
-                } else if (afterSearch->start(status) > beforeSearch->end(status)) {
-                    if (beforeSearch->start(status) == remapped.length()) {
-                        break;
-                    }
-                    beforeSearch->region(remapped.moveIndex32(beforeSearch->start(status), 1),
-                                         remapped.length(), status);
-                    if (!beforeSearch->find()) {
-                      break;
-                    }
-                } else {
-                    auto const it = std::find_if(resolved.begin(), resolved.end(), [&](auto r) {
-                        return r.indexInRemapped == afterSearch->start(status);
-                    });
-                    U_ASSERT(it != resolved.end());
-                    U_ASSERT(U_SUCCESS(status));
-                    if (it->appliedRule == nullptr &&
-                        std::unique_ptr<RegexMatcher>(endsWithBefore_->matcher(remapped, status))
-                            ->useAnchoringBounds(false)
-                            .region(beforeSearch->start(status), afterSearch->start(status), status)
-                            .matches(status)) {
-                        it->appliedRule = this;
-                    }
-                    if (afterSearch->start(status) == remapped.length()) {
-                        break;
-                    }
-                    afterSearch->region(remapped.moveIndex32(afterSearch->start(status), 1),
-                                        remapped.length(), status);
-                    if (!afterSearch->find()) {
-                        break;
-                    }
-                }
-                U_ASSERT(U_SUCCESS(status));
-            }
-        }
-        time_spent_ += std::chrono::steady_clock::now() - start;
-    }
-
-    virtual Resolution resolution() const override { return resolution_; }
-
-  private:
-    std::unique_ptr<RegexPattern> before_;
-    std::unique_ptr<RegexPattern> endsWithBefore_;
-    std::unique_ptr<RegexPattern> after_;
-    const Resolution resolution_;
-};
-
 //-------------------------------------------------------------------------------------------
 //
 //  RBBILineMonkey