mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-22984 Clean up old monkeys
This commit is contained in:
parent
757f27cd35
commit
e59065cc74
2 changed files with 80 additions and 17 deletions
|
@ -212,6 +212,16 @@ The rule updates are done first for ICU4C, and then ported (code changes) or mov
|
|||
|
||||
Updating the test with new or revised rules requires changing the test source code, in `icu4c/source/test/intltest/rbbitst.cpp`. Look for the classes RBBICharMonkey, RBBIWordMonkey, RBBISentMonkey and RBBILineMonkey. The body of each class tracks the corresponding UAX-14 or UAX-29 specifications in defining the character classes and break rules.
|
||||
|
||||
The rules, as well as the partition of the code space used to generate the random sample strings,
|
||||
are defined by regular expressions and Unicode sets generated by GenerateBreakTest in the
|
||||
Unicode tools, which runs as part of MakeUnicodeFiles.
|
||||
Copy the relevant lines from `Generated/UCD/17.0.0/extra/*BreakTest.cpp.txt` into `rbbitst.cpp`.
|
||||
When developing changes to the line breaking algorithms that require changes to property assignments,
|
||||
the generated rules and partition may need to be adjusted for testing.
|
||||
However, the updated rules should only be merged into ICU once the property changes have actually been
|
||||
made in the UCD and imported into ICU, at which point the unmodified generated partition and rules can
|
||||
be used in `rbbitst.cpp`.
|
||||
|
||||
After making changes, as a final check, let the test run for an extended period of time, on the order of several hours.
|
||||
Run it from a terminal, and just interrupt it (Ctrl-C) when it's gone long enough.
|
||||
|
||||
|
|
|
@ -1602,8 +1602,8 @@ class SegmentationRule {
|
|||
NO_BREAK = u'×',
|
||||
};
|
||||
struct BreakContext {
|
||||
BreakContext(std::size_t index) : indexInRemapped(index) {}
|
||||
std::optional<std::size_t> indexInRemapped;
|
||||
BreakContext(int32_t index) : indexInRemapped(index) {}
|
||||
std::optional<int32_t> indexInRemapped;
|
||||
const SegmentationRule *appliedRule = nullptr;
|
||||
};
|
||||
|
||||
|
@ -1639,15 +1639,37 @@ class RemapRule : public SegmentationRule {
|
|||
auto const start = std::chrono::steady_clock::now();
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString result;
|
||||
std::size_t i = 0;
|
||||
std::ptrdiff_t offset = 0;
|
||||
int32_t i = 0;
|
||||
int32_t offset = 0;
|
||||
// We find all matches of the `pattern_` and replace them according to
|
||||
// the `replacement_`, producing the new remapped string `result`.
|
||||
// For every position i in the original string,
|
||||
// `resolved[i].indexInRemapped` is nullopt if i lies within a replaced
|
||||
// match, and is set to the new index in `result` otherwise, by adding
|
||||
// the accumulated difference `offset` between match lengths and
|
||||
// replacement lengths.
|
||||
// Consider a 4-codepoint, 6 code unit string s = ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩, where
|
||||
// ␠ stands for U+0020 and U+12000 𒀀 and U+1D172 ◌𝅲 each require two code
|
||||
// units, and apply the following two rules:
|
||||
// 1. (?<X>\P{lb=SP}) \p{lb=CM}* → ${X}
|
||||
// 2. \p{lb=CM} → A
|
||||
// The string remapped and the indexInRemapped values change as follows:
|
||||
// indexInRemapped remapped string rule final
|
||||
// (aligned on the initial string) applied offset
|
||||
// 𒀀 ◌́ ␠ ◌𝅲
|
||||
// 0 1 2 3 4 5 6 ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩ (none)
|
||||
// 0 - - 2 3 4 5 ⟨ 𒀀, ␠, ◌𝅲 ⟩ 1 -1
|
||||
// 0 - - 2 3 - 4 ⟨ 𒀀, ␠, A ⟩ 2 -1
|
||||
//
|
||||
// Note that the last indexInRemapped is always equal to the length of
|
||||
// the remapped string.
|
||||
std::unique_ptr<RegexMatcher> matcher(pattern_->matcher(remapped, status));
|
||||
while (matcher->find()) {
|
||||
for (;; ++i) {
|
||||
if (!resolved[i].indexInRemapped.has_value()) {
|
||||
continue;
|
||||
}
|
||||
if (*resolved[i].indexInRemapped > static_cast<std::size_t>(matcher->start64(status))) {
|
||||
if (*resolved[i].indexInRemapped > matcher->start(status)) {
|
||||
break;
|
||||
}
|
||||
*resolved[i].indexInRemapped += offset;
|
||||
|
@ -1656,22 +1678,37 @@ class RemapRule : public SegmentationRule {
|
|||
if (!resolved[i].indexInRemapped.has_value()) {
|
||||
continue;
|
||||
}
|
||||
if (*resolved[i].indexInRemapped == static_cast<std::size_t>(matcher->end64(status))) {
|
||||
// Note that
|
||||
// `*resolved[i].indexInRemapped > matcher->end(status)` should
|
||||
// never happen with ordinary rules, but could in principle
|
||||
// happen with rules that remap to code point sequences, e.g.,
|
||||
// 1. BC → TYZ
|
||||
// 2. AT → X
|
||||
// applied to ⟨ A, B, C ⟩:
|
||||
// indexInRemapped remapped rule
|
||||
// A B C
|
||||
// 0 1 2 3 ⟨ A, B, C ⟩ (none)
|
||||
// 0 1 - 4 ⟨ A, T, Y, Z ⟩ 1
|
||||
// 0 - - 3 ⟨ X, Y, Z ⟩ 2
|
||||
// Where for the application of rule 2, the match ends at
|
||||
// position 2 in remapped, which does not correspond to a
|
||||
// position in the original string.
|
||||
if (*resolved[i].indexInRemapped >= matcher->end(status)) {
|
||||
break;
|
||||
}
|
||||
if (resolved[i].appliedRule != nullptr &&
|
||||
resolved[i].appliedRule->resolution() == BREAK) {
|
||||
resolved[i].appliedRule->resolution() == BREAK) {
|
||||
printf("Replacement rule at remapped indices %d sqq. spans a break",
|
||||
matcher->start(status));
|
||||
std::terminate();
|
||||
}
|
||||
resolved[i].appliedRule = this;
|
||||
resolved[i].indexInRemapped = std::nullopt;
|
||||
resolved[i].indexInRemapped.reset();
|
||||
}
|
||||
matcher->appendReplacement(result, replacement_, status);
|
||||
offset = result.length() - *resolved[i].indexInRemapped;
|
||||
}
|
||||
for (; i < resolved.size(); ++i) {
|
||||
for (; i < static_cast<int32_t>(resolved.size()); ++i) {
|
||||
if (!resolved[i].indexInRemapped.has_value()) {
|
||||
continue;
|
||||
}
|
||||
|
@ -1691,7 +1728,10 @@ class RemapRule : public SegmentationRule {
|
|||
std::terminate();
|
||||
}
|
||||
remapped = result;
|
||||
U_ASSERT(U_SUCCESS(status));
|
||||
if (U_FAILURE(status)) {
|
||||
puts(("Failed to apply rule " + name()).c_str());
|
||||
std::terminate();
|
||||
}
|
||||
timeSpent_ += std::chrono::steady_clock::now() - start;
|
||||
}
|
||||
|
||||
|
@ -1715,7 +1755,10 @@ class RegexRule : public SegmentationRule {
|
|||
endsWithBefore_.reset(RegexPattern::compile(
|
||||
".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
|
||||
after_.reset(RegexPattern::compile(after, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
|
||||
U_ASSERT(U_SUCCESS(status));
|
||||
if (U_FAILURE(status)) {
|
||||
puts(("Failed to compile regular expressions for rule " + this->name()).c_str());
|
||||
std::terminate();
|
||||
}
|
||||
}
|
||||
|
||||
virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const override {
|
||||
|
@ -1764,7 +1807,13 @@ class RegexRule : public SegmentationRule {
|
|||
auto const it = std::find_if(resolved.begin(), resolved.end(), [&](auto r) {
|
||||
return r.indexInRemapped == afterSearch->start(status);
|
||||
});
|
||||
U_ASSERT(it != resolved.end());
|
||||
if (it == resolved.end()) {
|
||||
puts(("Rule " + name() +
|
||||
" found a break at a position which does not correspond to an index in "
|
||||
"the original string")
|
||||
.c_str());
|
||||
std::terminate();
|
||||
}
|
||||
U_ASSERT(U_SUCCESS(status));
|
||||
if (it->appliedRule == nullptr &&
|
||||
std::unique_ptr<RegexMatcher>(endsWithBefore_->matcher(remapped, status))
|
||||
|
@ -1785,6 +1834,10 @@ class RegexRule : public SegmentationRule {
|
|||
U_ASSERT(U_SUCCESS(status));
|
||||
}
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
puts(("Failed to apply rule " + name()).c_str());
|
||||
std::terminate();
|
||||
}
|
||||
timeSpent_ += std::chrono::steady_clock::now() - start;
|
||||
}
|
||||
|
||||
|
@ -3035,9 +3088,9 @@ RBBILineMonkey::RBBILineMonkey() :
|
|||
const UnicodeSet lbSA(uR"(\p{lb=SA})", status);
|
||||
for (auto it = partition.begin(); it != partition.end();) {
|
||||
if (lbSA.containsAll(it->second)) {
|
||||
it = partition.erase(it);
|
||||
it = partition.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3079,9 +3132,9 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
|
|||
|
||||
int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
for (std::size_t i = startPos + 1; i < resolved.size(); ++i) {
|
||||
if (resolved[i].appliedRule != nullptr && resolved[i].appliedRule->resolution() ==
|
||||
SegmentationRule::BREAK) {
|
||||
return i;
|
||||
if (resolved[i].appliedRule != nullptr &&
|
||||
resolved[i].appliedRule->resolution() == SegmentationRule::BREAK) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
|
|
Loading…
Add table
Reference in a new issue