ICU-22984 Generate old Java monkeys

This commit is contained in:
Robin Leroy 2025-01-27 01:56:54 +01:00
parent 4fc1b7e7f6
commit 6d8b63ce84
4 changed files with 598 additions and 1065 deletions
icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi

View file

@ -0,0 +1,111 @@
// © 2024 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.dev.test.rbbi;
import java.util.Arrays;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A rule made of two context regexes and a resolution, written as in UAX #14 and UAX #29.
 *
 * <p>One regex describes the text before a position of the remapped string, the other the text
 * after it. Wherever both match, this rule is recorded as the rule applied at the corresponding
 * position of the original string, unless another rule has already been recorded there.
 */
class RegexRule extends SegmentationRule {
    private final Resolution resolution_;
    /** The context that must precede a position. */
    private final Pattern before_;
    /** Matches any text that ends with the before-context; used to confirm candidates. */
    private final Pattern endsWithBefore_;
    /** The context that must follow a position. */
    private final Pattern after_;

    /**
     * @param name       the name of the rule, passed on to {@link SegmentationRule}
     * @param before     regex (with UnicodeSet syntax) for the context before a position
     * @param resolution what this rule resolves matching positions to
     * @param after      regex (with UnicodeSet syntax) for the context after a position
     */
    RegexRule(String name, String before, Resolution resolution,
              String after) {
        super(name);
        final int flags = Pattern.COMMENTS | Pattern.DOTALL;
        final String expandedBefore = expandUnicodeSets(before);
        resolution_ = resolution;
        before_ = Pattern.compile(expandedBefore, flags);
        endsWithBefore_ = Pattern.compile(".*(" + expandedBefore + ")", flags);
        after_ = Pattern.compile(expandUnicodeSets(after), flags);
    }

    /**
     * Records this rule at every still-unresolved position where both contexts match.
     *
     * @throws IllegalArgumentException if the rule matches at an index of {@code remapped} that
     *     no entry of {@code resolved} maps to
     */
    @Override
    void apply(StringBuilder remapped, BreakContext[] resolved) {
        // The unicodetools implementation tests every index separately: the
        // text up to the index against /.*(before)/ with `matches`, and the
        // text from the index against /after/ with `lookingAt`. That is very
        // slow, in particular when /before/ is nonempty. The old monkeys are
        // not production code, but they run on millions of sample strings, so
        // we search for /before/ and /after/ instead and only check the
        // resulting candidates; this is faster by a factor of ~40.
        //
        // Greedy matching needs care: the first position where the rule
        // matches may lie before the end of the first /before/ match. It is
        // nevertheless
        //   1. inside a /before/ match or at one of its bounds, and
        //   2. at the start of an /after/ match,
        // and the /before/ context of the rule matches within that /before/
        // match. Overlapping matches must be considered, which is why every
        // `find` after the first is preceded by a reset through `region`.
        final Matcher beforeMatch = before_.matcher(remapped);
        final Matcher afterMatch = after_.matcher(remapped);
        beforeMatch.useAnchoringBounds(false);
        afterMatch.useAnchoringBounds(false);
        if (!beforeMatch.find() || !afterMatch.find()) {
            return;
        }
        final int length = remapped.length();
        while (true) {
            final int candidate = afterMatch.start();
            if (candidate < beforeMatch.start()) {
                // The /after/ match lies ahead of the /before/ match; catch up.
                afterMatch.region(beforeMatch.start(), length);
                if (!afterMatch.find()) {
                    return;
                }
                continue;
            }
            if (candidate > beforeMatch.end()) {
                // The /after/ match lies past the /before/ match; look for the
                // next /before/ match, which may overlap the current one.
                final int beforeStart = beforeMatch.start();
                if (beforeStart == length) {
                    return;
                }
                beforeMatch.region(remapped.offsetByCodePoints(beforeStart, 1), length);
                if (!beforeMatch.find()) {
                    return;
                }
                continue;
            }
            // The candidate lies within the /before/ match or at its bounds.
            final BreakContext context = contextAt(resolved, candidate);
            if (context == null) {
                throw new IllegalArgumentException("Rule " + name()
                        + " matched at position " + candidate
                        + " in " + remapped
                        + " which does not correspond to an index in the original string");
            }
            if (context.appliedRule == null
                    && endsWithBefore_.matcher(remapped)
                            .useAnchoringBounds(false)
                            .region(beforeMatch.start(), candidate)
                            .matches()) {
                context.appliedRule = this;
            }
            if (candidate == length) {
                return;
            }
            afterMatch.region(remapped.offsetByCodePoints(candidate, 1), length);
            if (!afterMatch.find()) {
                return;
            }
        }
    }

    /**
     * Returns the first entry of {@code resolved} whose index in the remapped string is
     * {@code remappedIndex}, or null if there is none.
     */
    private static BreakContext contextAt(BreakContext[] resolved, int remappedIndex) {
        for (final BreakContext context : resolved) {
            if (context.indexInRemapped != null && context.indexInRemapped == remappedIndex) {
                return context;
            }
        }
        return null;
    }

    @Override
    Resolution resolution() {
        return resolution_;
    }
}

View file

@ -0,0 +1,166 @@
// © 2024 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.dev.test.rbbi;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A remap rule expressed as in UAXes #14 and #29.
 *
 * <p>Applying a remap rule is an ordinary regex replacement on the remapped string; the
 * replacement may refer to capturing groups. Every position of the original string that
 * corresponds to a position strictly inside a replaced match is resolved to NO_BREAK by this
 * rule.
 */
public class RemapRule extends SegmentationRule {
    private final Pattern pattern_;
    private final String replacement_;

    /**
     * @param name        the name of the rule, passed on to {@link SegmentationRule}
     * @param pattern     regex (with UnicodeSet syntax) whose matches are replaced
     * @param replacement replacement text, in {@link Matcher#appendReplacement} syntax
     */
    RemapRule(String name, String pattern, String replacement) {
        super(name);
        replacement_ = replacement;
        pattern_ = Pattern.compile(expandUnicodeSets(pattern), Pattern.COMMENTS | Pattern.DOTALL);
    }

    /**
     * Replaces every match of the pattern in {@code remapped} and updates {@code resolved}
     * accordingly.
     *
     * @throws IllegalArgumentException if a match spans a position already resolved to BREAK,
     *     or if the last entry of {@code resolved} does not end up at the end of the new
     *     remapped string
     */
    @Override
    void apply(StringBuilder remapped, BreakContext[] resolved) {
        // This has to be a StringBuffer rather than a StringBuilder because
        // the overload of appendReplacement that takes a StringBuilder is new
        // in Java 9.
        final StringBuffer result = new StringBuffer();
        int i = 0;
        int offset = 0;
        // We find all matches of `pattern_` and replace them according to
        // `replacement_`, producing the new remapped string `result`.
        // For every position i in the original string,
        // `resolved[i].indexInRemapped` becomes null if i lies strictly within
        // a replaced match, and is otherwise moved to the new index in
        // `result` by adding `offset`, the difference between the lengths of
        // `result` and of the part of `remapped` consumed so far.
        //
        // Example: the string A, B, C with the two rules
        //   1. B C -> T Y Z
        //   2. A T -> X
        // evolves as follows:
        //   indexInRemapped     remapped      rule applied
        //   A   B   C
        //   0   1   2   3       A, B, C       (none)
        //   0   1   -   4       A, T, Y, Z    1
        //   0   -   -   3       X, Y, Z       2
        //
        // The last indexInRemapped is always equal to the length of the
        // remapped string; this is checked at the end.
        // NOTE(review): the loops below assume that every match is followed by
        // a mapped position, which presumably means patterns never match the
        // empty string at the very end of the text — confirm.
        final Matcher matcher = pattern_.matcher(remapped);
        while (matcher.find()) {
            final int matchStart = matcher.start();
            final int matchEnd = matcher.end();
            // Positions up to and including the start of the match are kept
            // and shifted by the offset accumulated so far.
            for (;; ++i) {
                final BreakContext context = resolved[i];
                if (context.indexInRemapped == null) {
                    continue;
                }
                if (context.indexInRemapped > matchStart) {
                    break;
                }
                context.indexInRemapped += offset;
            }
            // Positions strictly inside the match are resolved by this rule
            // and no longer map to the remapped string.
            // Note that `indexInRemapped > matcher.end()` at the end of this
            // loop should never happen with ordinary rules, but can happen
            // with rules that remap to code point sequences: in the example
            // above, the match of rule 2 ends at position 2 in the remapped
            // string, which does not correspond to any position in the
            // original string.
            for (;; ++i) {
                final BreakContext context = resolved[i];
                if (context.indexInRemapped == null) {
                    continue;
                }
                if (context.indexInRemapped >= matchEnd) {
                    break;
                }
                if (context.appliedRule != null
                        && context.appliedRule.resolution() == Resolution.BREAK) {
                    throw new IllegalArgumentException(
                            "Replacement rule at remapped indices " +
                            matchStart +
                            " sqq. spans a break");
                }
                context.appliedRule = this;
                context.indexInRemapped = null;
            }
            // While replacing, we need to check that we are not creating
            // surrogate pairs. Since appendReplacement performs two
            // concatenations (the unreplaced segment and the replacement), we
            // need to check in two places: whether the unreplaced segment
            // starts with a trailing surrogate that ends up after a leading
            // surrogate, and whether the replacement starts with a trailing
            // surrogate that ends up after a leading surrogate.
            // We break the pair by replacing one of the surrogates with U+FFFF,
            // which has the same properties for all but line breaking, and the
            // same behaviour in line breaking (lb=SG and lb=XX are both treated
            // as lb=AL).
            final int danglingLead = indexOfTrailingLeadSurrogate(result);
            matcher.appendReplacement(result, replacement_);
            splitPairAfter(result, danglingLead);
            // The replacement may be empty, in which case nothing was appended
            // at replacementStart and there is nothing to inspect there.
            final int replacementStart = matchStart + offset;
            if (replacementStart > 0 && replacementStart < result.length()
                    && Character.isHighSurrogate(result.charAt(replacementStart - 1))
                    && Character.isLowSurrogate(result.charAt(replacementStart))) {
                result.setCharAt(replacementStart, '\uFFFF');
            }
            // `result` now stands for everything up to the end of the match.
            // Measuring against the end of the match, rather than against the
            // next mapped position, keeps the offset right even when that
            // position lies beyond the end of the match.
            offset = result.length() - matchEnd;
        }
        // Shift the positions that follow the last match.
        for (; i < resolved.length; ++i) {
            if (resolved[i].indexInRemapped == null) {
                continue;
            }
            resolved[i].indexInRemapped += offset;
        }
        final int danglingLead = indexOfTrailingLeadSurrogate(result);
        matcher.appendTail(result);
        splitPairAfter(result, danglingLead);
        if (resolved[resolved.length - 1].indexInRemapped != result.length()) {
            final StringBuilder indices = new StringBuilder();
            for (final BreakContext context : resolved) {
                // A null Integer is appended as "null".
                indices.append(context.indexInRemapped).append(",");
            }
            throw new IllegalArgumentException("Inconsistent indexInRemapped " + indices + " for new remapped string " +
                    result);
        }
        remapped.setLength(0);
        remapped.append(result);
    }

    /**
     * Returns the index of the last char of {@code text} if that char is a leading (high)
     * surrogate, and -1 otherwise.
     */
    private static int indexOfTrailingLeadSurrogate(StringBuffer text) {
        final int last = text.length() - 1;
        return last >= 0 && Character.isHighSurrogate(text.charAt(last)) ? last : -1;
    }

    /**
     * Replaces the char at {@code leadIndex} by U+FFFF if it is now directly followed by a
     * trailing (low) surrogate. Does nothing if {@code leadIndex} is negative.
     */
    private static void splitPairAfter(StringBuffer text, int leadIndex) {
        if (leadIndex >= 0 && leadIndex + 1 < text.length()
                && Character.isLowSurrogate(text.charAt(leadIndex + 1))) {
            text.setCharAt(leadIndex, '\uFFFF');
        }
    }

    /** Remap rules always resolve the positions they cover to NO_BREAK. */
    @Override
    Resolution resolution() {
        return Resolution.NO_BREAK;
    }
}

View file

@ -0,0 +1,94 @@
// © 2024 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.dev.test.rbbi;
import java.text.ParsePosition;
import javax.swing.RowFilter.Entry;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSet.EntryRange;
import com.ibm.icu.text.UTF16;
/**
 * Base class of the segmentation rules written as in UAX #14 and UAX #29.
 *
 * <p>Rules are applied one after the other. Each rule works on a mutable remapped string, which
 * the caller should initially set to the string to be segmented, and may resolve positions of
 * the original string to either BREAK or NO_BREAK.
 */
public abstract class SegmentationRule {
    /** What a rule decides about a position. */
    enum Resolution {
        BREAK,
        NO_BREAK,
    }

    /** The state of one position of the original string while rules are being applied. */
    static class BreakContext {
        /** Index of this position in the remapped string; rules may set it to null. */
        Integer indexInRemapped;
        /** The rule recorded for this position, or null if there is none yet. */
        SegmentationRule appliedRule = null;

        BreakContext(int index) {
            indexInRemapped = index;
        }
    }

    private final String name_;

    SegmentationRule(String name) {
        name_ = name;
    }

    /**
     * Returns {@code \}{@code uhhhh} for a BMP code point, and the two such escapes of its
     * UTF-16 surrogates for any other code point.
     */
    private static String javaUEscape(int codePoint) {
        if (codePoint > 0xFFFF) {
            return "\\u" + Utility.hex(UTF16.getLeadSurrogate(codePoint))
                    + "\\u" + Utility.hex(UTF16.getTrailSurrogate(codePoint));
        }
        return "\\u" + Utility.hex(codePoint);
    }

    /**
     * Rewrites {@code regex} so that every UnicodeSet in it becomes a bracket expression made
     * only of escaped code points and ranges.
     *
     * <p>Parsing of a UnicodeSet starts at every {@code [} and every backslash; all other
     * characters are copied unchanged.
     */
    protected String expandUnicodeSets(String regex) {
        final boolean java8OrOlder = System.getProperty("java.version").startsWith("1.");
        final StringBuilder expanded = new StringBuilder();
        int cursor = 0;
        while (cursor < regex.length()) {
            final char current = regex.charAt(cursor);
            if (current != '[' && current != '\\') {
                expanded.append(current);
                ++cursor;
                continue;
            }
            final ParsePosition position = new ParsePosition(cursor);
            final UnicodeSet set = new UnicodeSet(regex, position, null);
            // Regular expressions that match unpaired surrogates apparently
            // behave differently in Java 8, so we keep surrogates out of the
            // sets there.
            if (java8OrOlder) {
                set.removeAll(new UnicodeSet("[\\uD800-\\uDFFF]"));
            }
            // Everything is escaped. _generatePattern could be used instead,
            // but then \U escapes would have to be turned into sequences of
            // \u escapes, and # would have to be escaped separately.
            // NOTE(review): a set that ends up empty is written as "[]";
            // presumably no rule produces one — confirm.
            expanded.append('[');
            for (EntryRange range : set.ranges()) {
                expanded.append(javaUEscape(range.codepoint));
                if (range.codepoint != range.codepointEnd) {
                    expanded.append('-').append(javaUEscape(range.codepointEnd));
                }
            }
            expanded.append(']');
            cursor = position.getIndex();
        }
        return expanded.toString();
    }

    /**
     * Applies this rule.
     *
     * @param remapped the remapped string, which the rule may modify
     * @param resolved one context per position of the original string, which the rule may update
     */
    abstract void apply(StringBuilder remapped, BreakContext[] resolved);

    /** Returns what this rule resolves positions to. */
    abstract Resolution resolution();

    /** Returns the name given at construction. */
    String name() {
        return name_;
    }
}