mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-22984 Generate old Java monkeys
This commit is contained in:
parent
4fc1b7e7f6
commit
6d8b63ce84
4 changed files with 598 additions and 1065 deletions
icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,111 @@
|
|||
// © 2024 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
package com.ibm.icu.dev.test.rbbi;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Optional;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* A regex rule expressed as in UAXes #14 and #29.
|
||||
*
|
||||
* The rule consists of two regexes for context before and after a position in
|
||||
* the remapped text,
|
||||
* and of a resolution (break or not) that applies to the corresponding position
|
||||
* in the original
|
||||
* string if both match.
|
||||
*/
|
||||
class RegexRule extends SegmentationRule {
|
||||
RegexRule(String name, String before, Resolution resolution,
|
||||
String after) {
|
||||
super(name);
|
||||
resolution_ = resolution;
|
||||
before_ = Pattern.compile(expandUnicodeSets(before), Pattern.COMMENTS | Pattern.DOTALL);
|
||||
endsWithBefore_ = Pattern.compile(
|
||||
".*(" + expandUnicodeSets(before) + ")", Pattern.COMMENTS | Pattern.DOTALL);
|
||||
after_ = Pattern.compile(expandUnicodeSets(after), Pattern.COMMENTS | Pattern.DOTALL);
|
||||
}
|
||||
|
||||
@Override
|
||||
void apply(StringBuilder remapped, BreakContext[] resolved) {
|
||||
// The unicodetools implementation simply tries, for each index, to
|
||||
// match the string up to the index against /.*(before)/ (with
|
||||
// `matches`) and the beginning of the string after the index against
|
||||
// /after/ (with `lookingAt`), but that is very slow, especially for
|
||||
// nonempty /before/. While the old monkeys are not a production
|
||||
// implementation, we still do not want them to be too slow, since we
|
||||
// need to test millions of sample strings. Instead we search for
|
||||
// /before/ and /after/, and check resulting candidates. This speeds
|
||||
// things up by a factor of ~40.
|
||||
// We need to be careful about greedy matching: The first position where
|
||||
// the rule matches may be before the end of the first /before/ match.
|
||||
// However, it is both:
|
||||
// 1. within a /before/ match or at its bounds,
|
||||
// 2. at the beginning of an /after/ match.
|
||||
// Further, the /before/ context of the rule matches within the
|
||||
// aforementioned /before/ match. Note that we need to look for
|
||||
// overlapping matches, thus calls to `find` are always preceded by a
|
||||
// reset via `region`.
|
||||
final Matcher beforeSearch = before_.matcher(remapped);
|
||||
final Matcher afterSearch = after_.matcher(remapped);
|
||||
beforeSearch.useAnchoringBounds(false);
|
||||
afterSearch.useAnchoringBounds(false);
|
||||
if (beforeSearch.find() && afterSearch.find()) {
|
||||
for (;;) {
|
||||
if (afterSearch.start() < beforeSearch.start()) {
|
||||
afterSearch.region(beforeSearch.start(), remapped.length());
|
||||
if (!afterSearch.find()) {
|
||||
break;
|
||||
}
|
||||
} else if (afterSearch.start() > beforeSearch.end()) {
|
||||
if (beforeSearch.start() == remapped.length()) {
|
||||
break;
|
||||
}
|
||||
beforeSearch.region(remapped.offsetByCodePoints(beforeSearch.start(), 1),
|
||||
remapped.length());
|
||||
if (!beforeSearch.find()) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
final Optional<BreakContext> position = Arrays.stream(resolved)
|
||||
.filter(r -> r.indexInRemapped != null && r.indexInRemapped == afterSearch.start())
|
||||
.findFirst();
|
||||
if (!position.isPresent()) {
|
||||
throw new IllegalArgumentException(("Rule " + name() +
|
||||
" matched at position " + afterSearch.start() +
|
||||
" in " + remapped +
|
||||
" which does not correspond to an index in " +
|
||||
"the original string"));
|
||||
}
|
||||
if (position.get().appliedRule == null &&
|
||||
endsWithBefore_.matcher(remapped)
|
||||
.useAnchoringBounds(false)
|
||||
.region(beforeSearch.start(), afterSearch.start())
|
||||
.matches()) {
|
||||
position.get().appliedRule = this;
|
||||
}
|
||||
if (afterSearch.start() == remapped.length()) {
|
||||
break;
|
||||
}
|
||||
afterSearch.region(remapped.offsetByCodePoints(afterSearch.start(), 1),
|
||||
remapped.length());
|
||||
if (!afterSearch.find()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
Resolution resolution() {
|
||||
return resolution_;
|
||||
}
|
||||
|
||||
private final Pattern before_;
|
||||
private final Pattern endsWithBefore_;
|
||||
private final Pattern after_;
|
||||
private final Resolution resolution_;
|
||||
}
|
|
@ -0,0 +1,166 @@
|
|||
// © 2024 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
package com.ibm.icu.dev.test.rbbi;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* A remap rule expressed as in UAXes #14 and #29.
|
||||
*
|
||||
* The application of a remap rule is a normal regex replacement on the remapped
|
||||
* string. This replacement may use capturing groups. Any positions in the
|
||||
* original string that correspond to positions within the replaced text are
|
||||
* resolved to NO_BREAK by this rule.
|
||||
*/
|
||||
public class RemapRule extends SegmentationRule {
|
||||
RemapRule(String name, String pattern, String replacement) {
|
||||
super(name);
|
||||
replacement_ = replacement;
|
||||
pattern_ = Pattern.compile(expandUnicodeSets(pattern), Pattern.COMMENTS | Pattern.DOTALL);
|
||||
}
|
||||
|
||||
@Override
|
||||
void apply(StringBuilder remapped, BreakContext[] resolved) {
|
||||
// This one has to be a StringBuffer rather than a StringBuilder because the
|
||||
// overload of
|
||||
// AppendReplacement that takes a StringBuilder is new in Java 9.
|
||||
StringBuffer result = new StringBuffer();
|
||||
int i = 0;
|
||||
int offset = 0;
|
||||
// We find all matches of the `pattern_` and replace them according to
|
||||
// the `replacement_`, producing the new remapped string `result`.
|
||||
// For every position i in the original string,
|
||||
// `resolved[i].indexInRemapped` is null if i lies within a replaced
|
||||
// match, and is set to the new index in `result` otherwise, by adding
|
||||
// the accumulated difference `offset` between match lengths and
|
||||
// replacement lengths.
|
||||
// Consider a 4-codepoint, 6 code unit string s = ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩, where
|
||||
// ␠ stands for U+0020 and U+12000 𒀀 and U+1D172 ◌𝅲 each require two code
|
||||
// units, and apply the following two rules:
|
||||
// 1. (?<X>\P{lb=SP}) \p{lb=CM}* → ${X}
|
||||
// 2. \p{lb=CM} → A
|
||||
// The string remapped and the indexInRemapped values change as follows:
|
||||
// indexInRemapped remapped string rule final
|
||||
// (aligned on the initial string) applied offset
|
||||
// 𒀀 ◌́ ␠ ◌𝅲
|
||||
// 0 1 2 3 4 5 6 ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩ (none)
|
||||
// 0 - - 2 3 4 5 ⟨ 𒀀, ␠, ◌𝅲 ⟩ 1 -1
|
||||
// 0 - - 2 3 - 4 ⟨ 𒀀, ␠, A ⟩ 2 -1
|
||||
//
|
||||
// Note that the last indexInRemapped is always equal to the length of
|
||||
// the remapped string.
|
||||
final Matcher matcher = pattern_.matcher(remapped);
|
||||
while (matcher.find()) {
|
||||
for (;; ++i) {
|
||||
if (resolved[i].indexInRemapped == null) {
|
||||
continue;
|
||||
}
|
||||
if (resolved[i].indexInRemapped != null &&
|
||||
resolved[i].indexInRemapped > matcher.start()) {
|
||||
break;
|
||||
}
|
||||
resolved[i].indexInRemapped += offset;
|
||||
}
|
||||
for (;; ++i) {
|
||||
if (resolved[i].indexInRemapped == null) {
|
||||
continue;
|
||||
}
|
||||
// Note that
|
||||
// `*resolved[i].indexInRemapped > matcher.end()` should
|
||||
// never happen with ordinary rules, but could in principle
|
||||
// happen with rules that remap to code point sequences, e.g.,
|
||||
// 1. BC → TYZ
|
||||
// 2. AT → X
|
||||
// applied to ⟨ A, B, C ⟩:
|
||||
// indexInRemapped remapped rule
|
||||
// A B C
|
||||
// 0 1 2 3 ⟨ A, B, C ⟩ (none)
|
||||
// 0 1 - 4 ⟨ A, T, Y, Z ⟩ 1
|
||||
// 0 - - 3 ⟨ X, Y, Z ⟩ 2
|
||||
// Where for the application of rule 2, the match ends at
|
||||
// position 2 in remapped, which does not correspond to a
|
||||
// position in the original string.
|
||||
if (resolved[i].indexInRemapped != null &&
|
||||
resolved[i].indexInRemapped >= matcher.end()) {
|
||||
break;
|
||||
}
|
||||
if (resolved[i].appliedRule != null &&
|
||||
resolved[i].appliedRule.resolution() == Resolution.BREAK) {
|
||||
throw new IllegalArgumentException(
|
||||
"Replacement rule at remapped indices " +
|
||||
matcher.start() +
|
||||
" sqq. spans a break");
|
||||
}
|
||||
resolved[i].appliedRule = this;
|
||||
resolved[i].indexInRemapped = null;
|
||||
}
|
||||
// While replacing, we need to check that we are not creating
|
||||
// surrogate pairs. Since appendReplacement performs two
|
||||
// concatenations (the unreplaced segment and the replacement), we
|
||||
// need to check in two places: whether the unreplaced segment
|
||||
// starts with a trailing surrogate that ends up after a leading
|
||||
// surrogate, and whether the replaced segment starts with a leading
|
||||
// surrogate that ends up after a trailing surrogate.
|
||||
// We break the pair by replacing one of the surrogates with U+FFFF,
|
||||
// which has the same properties for all but line breaking, and the
|
||||
// same behaviour in line breaking (lb=SG and lb=XX are both treated
|
||||
// as lb=AL).
|
||||
Integer trailingLead = null;
|
||||
if (result.length() > 0 && Character.isHighSurrogate(result.charAt(result.length() - 1))) {
|
||||
trailingLead = result.length() - 1;
|
||||
}
|
||||
|
||||
matcher.appendReplacement(result, replacement_);
|
||||
|
||||
if (trailingLead != null && trailingLead + 1 < result.length() &&
|
||||
Character.isLowSurrogate(result.charAt(trailingLead + 1))) {
|
||||
result.setCharAt(trailingLead, '\uFFFF');
|
||||
}
|
||||
|
||||
if (matcher.start() + offset > 0 &&
|
||||
Character.isHighSurrogate(result.charAt(matcher.start() + offset - 1)) &&
|
||||
Character.isLowSurrogate(result.charAt(matcher.start() + offset))) {
|
||||
result.setCharAt(matcher.start() + offset, '\uFFFF');
|
||||
}
|
||||
offset = result.length() - resolved[i].indexInRemapped;
|
||||
}
|
||||
for (; i < resolved.length; ++i) {
|
||||
if (resolved[i].indexInRemapped == null) {
|
||||
continue;
|
||||
}
|
||||
resolved[i].indexInRemapped += offset;
|
||||
}
|
||||
|
||||
Integer trailingLead = null;
|
||||
if (result.length() > 0 && Character.isHighSurrogate(result.charAt(result.length() - 1))) {
|
||||
trailingLead = result.length() - 1;
|
||||
}
|
||||
matcher.appendTail(result);
|
||||
if (trailingLead != null && trailingLead + 1 < result.length() &&
|
||||
Character.isLowSurrogate(result.charAt(trailingLead + 1))) {
|
||||
result.setCharAt(trailingLead, '\uFFFF');
|
||||
}
|
||||
|
||||
if (resolved[resolved.length - 1].indexInRemapped != result.length()) {
|
||||
StringBuilder indices = new StringBuilder();
|
||||
for (final BreakContext r : resolved) {
|
||||
indices.append(r.indexInRemapped == null ? "null" : r.indexInRemapped.toString());
|
||||
indices.append(",");
|
||||
}
|
||||
throw new IllegalArgumentException("Inconsistent indexInRemapped " + indices + " for new remapped string " +
|
||||
result);
|
||||
}
|
||||
remapped.setLength(0);
|
||||
remapped.append(result);
|
||||
}
|
||||
|
||||
@Override
|
||||
Resolution resolution() {
|
||||
return Resolution.NO_BREAK;
|
||||
}
|
||||
|
||||
private final Pattern pattern_;
|
||||
private final String replacement_;
|
||||
}
|
|
@ -0,0 +1,94 @@
|
|||
// © 2024 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
package com.ibm.icu.dev.test.rbbi;
|
||||
|
||||
import java.text.ParsePosition;
|
||||
|
||||
import javax.swing.RowFilter.Entry;
|
||||
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSet.EntryRange;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
/**
|
||||
* A segmentation rule expressed as in UAXes #14 and #29.
|
||||
*
|
||||
* Rules are applied sequentially.
|
||||
* Rules operate on a mutable remapped string (which the caller should initially
|
||||
* set to the string
|
||||
* to be segmented), and can resolve positions in the original string to either
|
||||
* BREAK or NO_BREAK.
|
||||
*/
|
||||
public abstract class SegmentationRule {
|
||||
enum Resolution {
|
||||
BREAK,
|
||||
NO_BREAK,
|
||||
}
|
||||
|
||||
static class BreakContext {
|
||||
BreakContext(int index) {
|
||||
indexInRemapped = index;
|
||||
}
|
||||
|
||||
Integer indexInRemapped;
|
||||
SegmentationRule appliedRule = null;
|
||||
};
|
||||
|
||||
SegmentationRule(String name) {
|
||||
name_ = name;
|
||||
}
|
||||
|
||||
// Returns "\\uhhhh" for a BMP code point and "\\uDhhh\\uDhhh" (UTF-16) for other code points.
|
||||
private String javaUEscape(int codePoint) {
|
||||
if (codePoint <= 0xFFFF) {
|
||||
return "\\u" + Utility.hex(codePoint);
|
||||
} else {
|
||||
return "\\u" + Utility.hex(UTF16.getLeadSurrogate(codePoint)) + "\\u"
|
||||
+ Utility.hex(UTF16.getTrailSurrogate(codePoint));
|
||||
}
|
||||
}
|
||||
|
||||
protected String expandUnicodeSets(String regex) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
int i = 0;
|
||||
final boolean java8OrOlder = System.getProperty("java.version").startsWith("1.");
|
||||
while (i < regex.length()) {
|
||||
if (regex.charAt(i) == '[' || regex.charAt(i) == '\\') {
|
||||
ParsePosition pp = new ParsePosition(i);
|
||||
final UnicodeSet set = new UnicodeSet(regex, pp, null);
|
||||
// Regular expressions that match unpaired surrogates apparently behave
|
||||
// differently in Java 8. Let’s not go there.
|
||||
if (java8OrOlder) {
|
||||
set.removeAll(new UnicodeSet("[\\uD800-\\uDFFF]"));
|
||||
}
|
||||
// Escape everything. We could use _generatePattern, but then we would have to
|
||||
// convert \U escapes to sequences of \u escapes, and to escape # ourselves.
|
||||
result.append('[');
|
||||
for (EntryRange range : set.ranges()) {
|
||||
result.append(javaUEscape(range.codepoint));
|
||||
if (range.codepointEnd != range.codepoint) {
|
||||
result.append('-');
|
||||
result.append(javaUEscape(range.codepointEnd));
|
||||
}
|
||||
}
|
||||
result.append(']');
|
||||
i = pp.getIndex();
|
||||
} else {
|
||||
result.append(regex.charAt(i++));
|
||||
}
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
abstract void apply(StringBuilder remapped, BreakContext[] resolved);
|
||||
|
||||
abstract Resolution resolution();
|
||||
|
||||
String name() {
|
||||
return name_;
|
||||
}
|
||||
|
||||
private final String name_;
|
||||
}
|
Loading…
Add table
Reference in a new issue