ICU-22984 Generate old Java monkeys

2025-04-10 07:39:16 +00:00 · 2025-01-27 01:56:54 +01:00 · 2025-01-27 01:56:54 +01:00 · 6d8b63ce84
commit 6d8b63ce84
parent 4fc1b7e7f6
4 changed files with 598 additions and 1065 deletions
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RegexRule.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RegexRule.java
@ -0,0 +1,111 @@
+// © 2024 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+package com.ibm.icu.dev.test.rbbi;
+
+import java.util.Arrays;
+import java.util.Optional;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A regex rule expressed as in UAXes #14 and #29.
+ *
+ * The rule consists of two regexes for context before and after a position in
+ * the remapped text,
+ * and of a resolution (break or not) that applies to the corresponding position
+ * in the original
+ * string if both match.
+ */
+class RegexRule extends SegmentationRule {
+    RegexRule(String name, String before, Resolution resolution,
+            String after) {
+        super(name);
+        resolution_ = resolution;
+        before_ = Pattern.compile(expandUnicodeSets(before), Pattern.COMMENTS | Pattern.DOTALL);
+        endsWithBefore_ = Pattern.compile(
+                ".*(" + expandUnicodeSets(before) + ")", Pattern.COMMENTS | Pattern.DOTALL);
+        after_ = Pattern.compile(expandUnicodeSets(after), Pattern.COMMENTS | Pattern.DOTALL);
+    }
+
+    @Override
+    void apply(StringBuilder remapped, BreakContext[] resolved) {
+        // The unicodetools implementation simply tries, for each index, to
+        // match the string up to the index against /.*(before)/ (with
+        // `matches`) and the beginning of the string after the index against
+        // /after/ (with `lookingAt`), but that is very slow, especially for
+        // nonempty /before/. While the old monkeys are not a production
+        // implementation, we still do not want them to be too slow, since we
+        // need to test millions of sample strings. Instead we search for
+        // /before/ and /after/, and check resulting candidates. This speeds
+        // things up by a factor of ~40.
+        // We need to be careful about greedy matching: The first position where
+        // the rule matches may be before the end of the first /before/ match.
+        // However, it is both:
+        // 1. within a /before/ match or at its bounds,
+        // 2. at the beginning of an /after/ match.
+        // Further, the /before/ context of the rule matches within the
+        // aforementioned /before/ match. Note that we need to look for
+        // overlapping matches, thus calls to `find` are always preceded by a
+        // reset via `region`.
+        final Matcher beforeSearch = before_.matcher(remapped);
+        final Matcher afterSearch = after_.matcher(remapped);
+        beforeSearch.useAnchoringBounds(false);
+        afterSearch.useAnchoringBounds(false);
+        if (beforeSearch.find() && afterSearch.find()) {
+            for (;;) {
+                if (afterSearch.start() < beforeSearch.start()) {
+                    afterSearch.region(beforeSearch.start(), remapped.length());
+                    if (!afterSearch.find()) {
+                        break;
+                    }
+                } else if (afterSearch.start() > beforeSearch.end()) {
+                    if (beforeSearch.start() == remapped.length()) {
+                        break;
+                    }
+                    beforeSearch.region(remapped.offsetByCodePoints(beforeSearch.start(), 1),
+                            remapped.length());
+                    if (!beforeSearch.find()) {
+                        break;
+                    }
+                } else {
+                    final Optional<BreakContext> position = Arrays.stream(resolved)
+                            .filter(r -> r.indexInRemapped != null && r.indexInRemapped == afterSearch.start())
+                            .findFirst();
+                    if (!position.isPresent()) {
+                        throw new IllegalArgumentException(("Rule " + name() +
+                                " matched at position " + afterSearch.start() +
+                                " in " + remapped +
+                                " which does not correspond to an index in " +
+                                "the original string"));
+                    }
+                    if (position.get().appliedRule == null &&
+                            endsWithBefore_.matcher(remapped)
+                                    .useAnchoringBounds(false)
+                                    .region(beforeSearch.start(), afterSearch.start())
+                                    .matches()) {
+                        position.get().appliedRule = this;
+                    }
+                    if (afterSearch.start() == remapped.length()) {
+                        break;
+                    }
+                    afterSearch.region(remapped.offsetByCodePoints(afterSearch.start(), 1),
+                            remapped.length());
+                    if (!afterSearch.find()) {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    @Override
+    Resolution resolution() {
+        return resolution_;
+    }
+
+    private final Pattern before_;
+    private final Pattern endsWithBefore_;
+    private final Pattern after_;
+    private final Resolution resolution_;
+}
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RemapRule.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RemapRule.java
@ -0,0 +1,166 @@
+// © 2024 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+package com.ibm.icu.dev.test.rbbi;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A remap rule expressed as in UAXes #14 and #29.
+ *
+ * <p>The application of a remap rule is a normal regex replacement on the
+ * remapped string.  This replacement may use capturing groups.  Any positions
+ * in the original string that correspond to positions within the replaced text
+ * are resolved to NO_BREAK by this rule.
+ */
+public class RemapRule extends SegmentationRule {
+    /**
+     * Creates a rule that replaces every match of {@code pattern} in the
+     * remapped string with {@code replacement}.
+     *
+     * @param name        the name of the rule.
+     * @param pattern     the regex to search for; sets are expanded by
+     *                    {@code expandUnicodeSets}.
+     * @param replacement the replacement, in the syntax of
+     *                    {@link Matcher#appendReplacement}.
+     */
+    RemapRule(String name, String pattern, String replacement) {
+        super(name);
+        replacement_ = replacement;
+        pattern_ = Pattern.compile(expandUnicodeSets(pattern), Pattern.COMMENTS | Pattern.DOTALL);
+    }
+
+    /**
+     * Replaces all matches of the pattern in {@code remapped}, and updates
+     * {@code resolved} accordingly: positions strictly inside a match get
+     * this rule as their applied rule and a null {@code indexInRemapped};
+     * the {@code indexInRemapped} of all other positions is shifted to
+     * refer to the new contents of {@code remapped}.
+     *
+     * @param remapped the remapped string, replaced in place.
+     * @param resolved the contexts of the positions of the original string.
+     * @throws IllegalArgumentException if a position inside a match was
+     *         already resolved to BREAK, or if the resulting indices are
+     *         inconsistent with the new remapped string.
+     */
+    @Override
+    void apply(StringBuilder remapped, BreakContext[] resolved) {
+        // This one has to be a StringBuffer rather than a StringBuilder
+        // because the overload of appendReplacement that takes a
+        // StringBuilder is new in Java 9.
+        final StringBuffer result = new StringBuffer();
+        int i = 0;
+        int offset = 0;
+        // We find all matches of the `pattern_` and replace them according to
+        // the `replacement_`, producing the new remapped string `result`.
+        // For every position i in the original string,
+        // `resolved[i].indexInRemapped` is null if i lies within a replaced
+        // match, and is set to the new index in `result` otherwise, by adding
+        // the accumulated difference `offset` between match lengths and
+        // replacement lengths.
+        // Consider a 4-codepoint, 6 code unit string s = ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩, where
+        // ␠ stands for U+0020 and U+12000 𒀀 and U+1D172 ◌𝅲 each require two code
+        // units, and apply the following two rules:
+        // 1. (?<X>\P{lb=SP}) \p{lb=CM}* → ${X}
+        // 2. \p{lb=CM} → A
+        // The string remapped and the indexInRemapped values change as follows:
+        //   indexInRemapped                  remapped string     rule     final
+        //   (aligned on the initial string)                      applied  offset
+        //    𒀀   ◌́ ␠ ◌𝅲
+        //   0 1 2 3 4 5 6                    ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩     (none)
+        //   0 - - 2 3 4 5                    ⟨ 𒀀, ␠, ◌𝅲 ⟩        1        -1
+        //   0 - - 2 3 - 4                    ⟨ 𒀀, ␠, A ⟩         2        -1
+        //
+        // Note that the last indexInRemapped is always equal to the length of
+        // the remapped string.
+        final Matcher matcher = pattern_.matcher(remapped);
+        while (matcher.find()) {
+            // Positions between the previous match and the start of this one
+            // (inclusive) are kept, shifted by the offset accumulated so far.
+            for (;; ++i) {
+                if (resolved[i].indexInRemapped == null) {
+                    continue;
+                }
+                if (resolved[i].indexInRemapped > matcher.start()) {
+                    break;
+                }
+                resolved[i].indexInRemapped += offset;
+            }
+            // Positions strictly inside the match are resolved by this rule
+            // and no longer correspond to an index in the remapped string.
+            for (;; ++i) {
+                if (resolved[i].indexInRemapped == null) {
+                    continue;
+                }
+                // Note that
+                // `resolved[i].indexInRemapped > matcher.end()` should
+                // never happen with ordinary rules, but could in principle
+                // happen with rules that remap to code point sequences, e.g.,
+                // 1. BC → TYZ
+                // 2. AT → X
+                // applied to ⟨ A, B, C ⟩:
+                //   indexInRemapped    remapped          rule
+                //    A B C
+                //   0 1 2 3            ⟨ A, B, C ⟩       (none)
+                //   0 1 - 4            ⟨ A, T, Y, Z ⟩    1
+                //   0 - - 3            ⟨ X, Y, Z ⟩       2
+                // Where for the application of rule 2, the match ends at
+                // position 2 in remapped, which does not correspond to a
+                // position in the original string.
+                if (resolved[i].indexInRemapped >= matcher.end()) {
+                    break;
+                }
+                if (resolved[i].appliedRule != null &&
+                        resolved[i].appliedRule.resolution() == Resolution.BREAK) {
+                    throw new IllegalArgumentException(
+                            "Replacement rule at remapped indices " +
+                                    matcher.start() +
+                                    " sqq. spans a break");
+                }
+                resolved[i].appliedRule = this;
+                resolved[i].indexInRemapped = null;
+            }
+            // While replacing, we need to check that we are not creating
+            // surrogate pairs.  Since appendReplacement performs two
+            // concatenations (the unreplaced segment and the replacement), we
+            // need to check in two places: whether the unreplaced segment
+            // starts with a trailing surrogate that ends up after a leading
+            // surrogate, and whether the replaced segment starts with a leading
+            // surrogate that ends up after a trailing surrogate.
+            // We break the pair by replacing one of the surrogates with U+FFFF,
+            // which has the same properties for all but line breaking, and the
+            // same behaviour in line breaking (lb=SG and lb=XX are both treated
+            // as lb=AL).
+            final int trailingLead = trailingLeadIndex(result);
+
+            matcher.appendReplacement(result, replacement_);
+
+            breakPairStartingAt(result, trailingLead);
+
+            // Index in `result` at which the replacement text starts.  If the
+            // replacement is empty there is no code unit there, so there is
+            // nothing to check (and nothing that could safely be read).
+            final int replacementStart = matcher.start() + offset;
+            if (replacementStart > 0 && replacementStart < result.length() &&
+                    Character.isHighSurrogate(result.charAt(replacementStart - 1)) &&
+                    Character.isLowSurrogate(result.charAt(replacementStart))) {
+                result.setCharAt(replacementStart, PAIR_BREAKER);
+            }
+            // `result` now ends where the match ended, so this is the shift
+            // for all positions at or after the end of the match.  It must
+            // be computed from the end of the match rather than from the
+            // next tracked position, which may lie beyond it (see above).
+            offset = result.length() - matcher.end();
+        }
+        // Shift the positions after the last match.
+        for (; i < resolved.length; ++i) {
+            if (resolved[i].indexInRemapped == null) {
+                continue;
+            }
+            resolved[i].indexInRemapped += offset;
+        }
+
+        // Appending the tail is one more concatenation that must not create
+        // a surrogate pair.
+        final int trailingLead = trailingLeadIndex(result);
+        matcher.appendTail(result);
+        breakPairStartingAt(result, trailingLead);
+
+        // A null last index is reported as an inconsistency as well, rather
+        // than failing while unboxing.
+        final Integer lastIndex = resolved[resolved.length - 1].indexInRemapped;
+        if (lastIndex == null || lastIndex != result.length()) {
+            StringBuilder indices = new StringBuilder();
+            for (final BreakContext r : resolved) {
+                indices.append(r.indexInRemapped == null ? "null" : r.indexInRemapped.toString());
+                indices.append(",");
+            }
+            throw new IllegalArgumentException("Inconsistent indexInRemapped " + indices + " for new remapped string " +
+                    result);
+        }
+        remapped.setLength(0);
+        remapped.append(result);
+    }
+
+    /** Always returns NO_BREAK. */
+    @Override
+    Resolution resolution() {
+        return Resolution.NO_BREAK;
+    }
+
+    /**
+     * Returns the index of the last code unit of {@code text} if that code
+     * unit is a leading surrogate, and -1 otherwise.
+     */
+    private static int trailingLeadIndex(StringBuffer text) {
+        final int last = text.length() - 1;
+        return last >= 0 && Character.isHighSurrogate(text.charAt(last)) ? last : -1;
+    }
+
+    /**
+     * If {@code leadIndex} is not negative and the code unit following it in
+     * {@code text} is a trailing surrogate, replaces the code unit at
+     * {@code leadIndex} with U+FFFF.
+     */
+    private static void breakPairStartingAt(StringBuffer text, int leadIndex) {
+        if (leadIndex >= 0 && leadIndex + 1 < text.length() &&
+                Character.isLowSurrogate(text.charAt(leadIndex + 1))) {
+            text.setCharAt(leadIndex, PAIR_BREAKER);
+        }
+    }
+
+    /** The code unit U+FFFF, substituted for a surrogate to avoid creating a pair. */
+    private static final char PAIR_BREAKER = '\uFFFF';
+
+    private final Pattern pattern_;
+    private final String replacement_;
+}
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/SegmentationRule.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/SegmentationRule.java
@ -0,0 +1,94 @@
+// © 2024 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+package com.ibm.icu.dev.test.rbbi;
+
+import java.text.ParsePosition;
+
+import javax.swing.RowFilter.Entry;
+
+import com.ibm.icu.impl.Utility;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSet.EntryRange;
+import com.ibm.icu.text.UTF16;
+
+/**
+ * A segmentation rule expressed as in UAXes #14 and #29.
+ * 
+ * Rules are applied sequentially.
+ * Rules operate on a mutable remapped string (which the caller should initially
+ * set to the string
+ * to be segmented), and can resolve positions in the original string to either
+ * BREAK or NO_BREAK.
+ */
+public abstract class SegmentationRule {
+    enum Resolution {
+        BREAK,
+        NO_BREAK,
+    }
+
+    static class BreakContext {
+        BreakContext(int index) {
+            indexInRemapped = index;
+        }
+
+        Integer indexInRemapped;
+        SegmentationRule appliedRule = null;
+    };
+
+    SegmentationRule(String name) {
+        name_ = name;
+    }
+
+    // Returns "\\uhhhh" for a BMP code point and "\\uDhhh\\uDhhh" (UTF-16) for other code points.
+    private String javaUEscape(int codePoint) {
+        if (codePoint <= 0xFFFF) {
+            return "\\u" + Utility.hex(codePoint);
+        } else {
+            return "\\u" + Utility.hex(UTF16.getLeadSurrogate(codePoint)) + "\\u"
+            + Utility.hex(UTF16.getTrailSurrogate(codePoint));
+        }
+    }
+
+    protected String expandUnicodeSets(String regex) {
+        StringBuilder result = new StringBuilder();
+        int i = 0;
+        final boolean java8OrOlder = System.getProperty("java.version").startsWith("1.");
+        while (i < regex.length()) {
+            if (regex.charAt(i) == '[' || regex.charAt(i) == '\\') {
+                ParsePosition pp = new ParsePosition(i);
+                final UnicodeSet set = new UnicodeSet(regex, pp, null);
+                // Regular expressions that match unpaired surrogates apparently behave
+                // differently in Java 8.  Let’s not go there.
+                if (java8OrOlder) {
+                    set.removeAll(new UnicodeSet("[\\uD800-\\uDFFF]"));
+                }
+                // Escape everything.  We could use _generatePattern, but then we would have to
+                // convert \U escapes to sequences of \‌u escapes, and to escape # ourselves.
+                result.append('[');
+                for (EntryRange range : set.ranges()) {
+                    result.append(javaUEscape(range.codepoint));
+                    if (range.codepointEnd != range.codepoint) {
+                        result.append('-');
+                        result.append(javaUEscape(range.codepointEnd));
+                    }
+                }
+                result.append(']');
+                i = pp.getIndex();
+            } else {
+                result.append(regex.charAt(i++));
+            }
+        }
+        return result.toString();
+    }
+
+    abstract void apply(StringBuilder remapped, BreakContext[] resolved);
+
+    abstract Resolution resolution();
+
+    String name() {
+        return name_;
+    }
+
+    private final String name_;
+}