mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-04 21:15:35 +00:00
ICU-22295 Remove deprecated RBBI LBCMNoChain option
ICU-22295 Remove deprecated RBBI LBCMNoChain option
This commit is contained in:
parent
b31579920e
commit
84e4762069
8 changed files with 3 additions and 60 deletions
|
@ -206,15 +206,6 @@ Chaining into a rule can be dis-allowed by beginning that rule with a '`^`'. Rul
|
|||
so marked can begin a match after a preceding boundary or at the start of text,
|
||||
but cannot extend a match via chaining from another rule.
|
||||
|
||||
~~The !!LBCMNoChain; statement modifies chaining behavior by preventing chaining
|
||||
from one rule to another from occurring on any character whose Line Break
|
||||
property is Combining Mark. This option is subject to change or removal, and
|
||||
should not be used in general. Within ICU, it is used only with the line break
|
||||
rules. We hope to replace it with something more general.~~
|
||||
|
||||
> :point_right: **Note**: `!!LBCMNoChain` is deprecated, and will be removed
|
||||
> completely from a future version of ICU.
|
||||
|
||||
## Rule Status Values
|
||||
|
||||
Break rules can be tagged with a number, which is called the *rule status*.
|
||||
|
@ -293,7 +284,6 @@ See, for example, this snippet from the [line break rules](https://github.com/un
|
|||
| ~~`!!reverse`~~ | ~~*[deprecated]* The rules that follow are for reverse iteration. No longer needed; any rules in a Reverse rule section are ignored.~~ |
|
||||
| ~~`!!safe_forward`~~ | ~~*[deprecated]* The rules that follow are for safe forward iteration. No longer needed; any rules in such a section are ignored.~~ |
|
||||
| ~~`!!safe_reverse`~~ | ~~*[deprecated]* The rules that follow are for safe reverse iteration. No longer needed; any rules in such a section are ignored.~~ |
|
||||
| ~~`!!LBCMNoChain`~~ | ~~*[deprecated]* Disable chaining when the overlap character matches `\p{Line_Break=Combining_Mark}`~~ |
|
||||
|
||||
## Rule Syntax
|
||||
|
||||
|
@ -377,22 +367,17 @@ Here is the syntax for the boundary rules. (The EBNF Syntax is given below.)
|
|||
exclamation `!`. This syntax is deprecated, and will be removed from a
|
||||
future version of ICU.
|
||||
|
||||
2. `!!LBCMNoChain` was a global option that specified that characters with the
|
||||
line break property of "Combining Mark" would not participate in rule
|
||||
chaining. This option was always considered internal, is deprecated and will
|
||||
be removed from a future version of ICU.
|
||||
|
||||
3. Naked rule characters. Plain text, in the context of a rule, is treated as
|
||||
2. Naked rule characters. Plain text, in the context of a rule, is treated as
|
||||
literal text to be matched, much like normal regular expressions. This turns
|
||||
out to be very error prone, has been the source of bugs in released versions
|
||||
of ICU, and is not useful in implementing normal text boundary rules. A
|
||||
future version will reject literal text that is not escaped.
|
||||
|
||||
4. Exact reverse rules and safe forward rules: planned changes to the break
|
||||
3. Exact reverse rules and safe forward rules: planned changes to the break
|
||||
engine implementation will remove the need for exact reverse rules and safe
|
||||
forward rules.
|
||||
|
||||
5. `{bof}` and `{eof}`, appearing within `[`sets`]`, match the beginning or ending of
|
||||
4. `{bof}` and `{eof}`, appearing within `[`sets`]`, match the beginning or ending of
|
||||
the input text, respectively. This is an internal (not documented) feature
|
||||
that will probably be removed in a future version of ICU. They are currently
|
||||
used by the standard rules for word, line and sentence breaking. An
|
||||
|
|
|
@ -66,7 +66,6 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
|
|||
fForwardTable = nullptr;
|
||||
fRuleStatusVals = nullptr;
|
||||
fChainRules = false;
|
||||
fLBCMNoChain = false;
|
||||
fLookAheadHardBreak = false;
|
||||
fUSetNodes = nullptr;
|
||||
fRuleStatusVals = nullptr;
|
||||
|
|
|
@ -159,9 +159,6 @@ public:
|
|||
UBool fChainRules; // True for chained Unicode TR style rules.
|
||||
// False for traditional regexp rules.
|
||||
|
||||
UBool fLBCMNoChain; // True: suppress chaining of rules on
|
||||
// chars with LineBreak property == CM.
|
||||
|
||||
UBool fLookAheadHardBreak; // True: Look ahead matches cause an
|
||||
// immediate break, no continuing for the
|
||||
// longest match.
|
||||
|
|
|
@ -547,8 +547,6 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
|
|||
UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
|
||||
if (opt == UNICODE_STRING("chain", 5)) {
|
||||
fRB->fChainRules = true;
|
||||
} else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {
|
||||
fRB->fLBCMNoChain = true;
|
||||
} else if (opt == UNICODE_STRING("forward", 7)) {
|
||||
fRB->fDefaultTree = &fRB->fForwardTree;
|
||||
} else if (opt == UNICODE_STRING("reverse", 7)) {
|
||||
|
|
|
@ -458,21 +458,6 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNod
|
|||
|
||||
// We've got a node that can end a match.
|
||||
|
||||
// !!LBCMNoChain implementation: If this node's val corresponds to
|
||||
// the Line Break $CM char class, don't chain from it.
|
||||
// TODO: Remove this. !!LBCMNoChain is deprecated, and is not used
|
||||
// by any of the standard ICU rules.
|
||||
if (fRB->fLBCMNoChain) {
|
||||
UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
|
||||
if (c != -1) {
|
||||
// c == -1 occurs with sets containing only the {eof} marker string.
|
||||
ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
|
||||
if (cLBProp == U_LB_COMBINING_MARK) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Now iterate over the nodes that can start a match, looking for ones
|
||||
// with the same char class as our ending node.
|
||||
RBBINode *startNode;
|
||||
|
|
|
@ -53,9 +53,6 @@ class RBBIRuleBuilder {
|
|||
boolean fChainRules; // True for chained Unicode TR style rules.
|
||||
// False for traditional regexp rules.
|
||||
|
||||
boolean fLBCMNoChain; // True: suppress chaining of rules on
|
||||
// chars with LineBreak property == CM.
|
||||
|
||||
boolean fLookAheadHardBreak; // True: Look ahead matches cause an
|
||||
// immediate break, no continuing for the
|
||||
// longest match.
|
||||
|
|
|
@ -436,8 +436,6 @@ class RBBIRuleScanner {
|
|||
String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
|
||||
if (opt.equals("chain")) {
|
||||
fRB.fChainRules = true;
|
||||
} else if (opt.equals("LBCMNoChain")) {
|
||||
fRB.fLBCMNoChain = true;
|
||||
} else if (opt.equals("forward")) {
|
||||
fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
|
||||
} else if (opt.equals("reverse")) {
|
||||
|
|
|
@ -441,22 +441,6 @@ class RBBITableBuilder {
|
|||
|
||||
// We've got a node that can end a match.
|
||||
|
||||
// !!LBCMNoChain implementation: If this node's val corresponds to
|
||||
// the Line Break $CM char class, don't chain from it.
|
||||
// TODO: Remove this. !!LBCMNoChain is deprecated, and is not used
|
||||
// by any of the standard ICU rules.
|
||||
if (fRB.fLBCMNoChain) {
|
||||
int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal);
|
||||
if (c != -1) {
|
||||
// c == -1 occurs with sets containing only the {eof} marker string.
|
||||
int cLBProp = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
|
||||
if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Now iterate over the nodes that can start a match, looking for ones
|
||||
// with the same char class as our ending node.
|
||||
for (RBBINode startNode : matchStartNodes) {
|
||||
|
|
Loading…
Add table
Reference in a new issue