mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-12677 RBBI, fix incorrect stripping of comments from saved rules.
X-SVN-Rev: 40837
This commit is contained in:
parent
f74b42f960
commit
aaebaf90a3
9 changed files with 77 additions and 35 deletions
|
@ -47,7 +47,7 @@ U_NAMESPACE_BEGIN
|
|||
RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
|
||||
UParseError *parseErr,
|
||||
UErrorCode &status)
|
||||
: fRules(rules)
|
||||
: fRules(rules), fStrippedRules(rules)
|
||||
{
|
||||
fStatus = &status; // status is checked below
|
||||
fParseError = parseErr;
|
||||
|
@ -147,8 +147,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
return NULL;
|
||||
}
|
||||
|
||||
// Remove comments and whitespace from the rules to make it smaller.
|
||||
UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
|
||||
// Remove whitespace from the rules to make it smaller.
|
||||
// The rule parser has already removed comments.
|
||||
fStrippedRules = fScanner->stripRules(fStrippedRules);
|
||||
|
||||
// Calculate the size of each section in the data.
|
||||
// Sizes here are padded up to a multiple of 8 for better memory alignment.
|
||||
|
@ -162,7 +163,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
|
||||
int32_t trieSize = align8(fSetBuilder->getTrieSize());
|
||||
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
|
||||
int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
|
||||
int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
|
||||
|
||||
(void)safeFwdTableSize;
|
||||
|
||||
|
@ -225,7 +226,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
data->fStatusTable = data->fTrie + trieSize;
|
||||
data->fStatusTableLen= statusTableSize;
|
||||
data->fRuleSource = data->fStatusTable + statusTableSize;
|
||||
data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
|
||||
data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar);
|
||||
|
||||
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
|
||||
|
||||
|
@ -245,7 +246,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
|
||||
}
|
||||
|
||||
strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
|
||||
fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
|
||||
|
||||
return data;
|
||||
}
|
||||
|
|
|
@ -130,6 +130,7 @@ public:
|
|||
UErrorCode *fStatus; // Error reporting. Keeping status
|
||||
UParseError *fParseError; // here avoids passing it everywhere.
|
||||
const UnicodeString &fRules; // The rule string that we are compiling
|
||||
UnicodeString fStrippedRules; // The rule string, with comments stripped.
|
||||
|
||||
RBBIRuleScanner *fScanner; // The scanner.
|
||||
RBBINode *fForwardTree; // The parse trees, generated by the scanner,
|
||||
|
|
|
@ -822,27 +822,24 @@ static const UChar chRParen = 0x29;
|
|||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// stripRules Return a rules string without unnecessary
|
||||
// characters.
|
||||
// stripRules Return a rules string without extra spaces.
|
||||
// (Comments are removed separately, during rule parsing.)
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
|
||||
UnicodeString strippedRules;
|
||||
int rulesLength = rules.length();
|
||||
for (int idx = 0; idx < rulesLength; ) {
|
||||
UChar ch = rules[idx++];
|
||||
if (ch == chPound) {
|
||||
while (idx < rulesLength
|
||||
&& ch != chCR && ch != chLF && ch != chNEL)
|
||||
{
|
||||
ch = rules[idx++];
|
||||
}
|
||||
}
|
||||
if (!u_isISOControl(ch)) {
|
||||
strippedRules.append(ch);
|
||||
int32_t rulesLength = rules.length();
|
||||
bool skippingSpaces = false;
|
||||
|
||||
for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) {
|
||||
UChar32 cp = rules.char32At(idx);
|
||||
bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
|
||||
if (skippingSpaces && whiteSpace) {
|
||||
continue;
|
||||
}
|
||||
strippedRules.append(cp);
|
||||
skippingSpaces = whiteSpace;
|
||||
}
|
||||
// strippedRules = strippedRules.unescape();
|
||||
return strippedRules;
|
||||
}
|
||||
|
||||
|
@ -942,6 +939,7 @@ void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
|
|||
// It will be treated as white-space, and serves to break up anything
|
||||
// that might otherwise incorrectly clump together with a comment in
|
||||
// the middle (a variable name, for example.)
|
||||
int32_t commentStart = fScanIndex;
|
||||
for (;;) {
|
||||
c.fChar = nextCharLL();
|
||||
if (c.fChar == (UChar32)-1 || // EOF
|
||||
|
@ -950,6 +948,9 @@ void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
|
|||
c.fChar == chNEL ||
|
||||
c.fChar == chLS) {break;}
|
||||
}
|
||||
for (int32_t i=commentStart; i<fNextIndex-1; ++i) {
|
||||
fRB->fStrippedRules.setCharAt(i, u' ');
|
||||
}
|
||||
}
|
||||
if (c.fChar == (UChar32)-1) {
|
||||
return;
|
||||
|
|
|
@ -1035,7 +1035,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
|
|||
|
||||
builtRules = (const uint8_t *)udata_getMemory(data.getAlias());
|
||||
builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
|
||||
RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status);
|
||||
LocalPointer<RuleBasedBreakIterator> brkItr (new RuleBasedBreakIterator(builtSource, parseError, status));
|
||||
if (U_FAILURE(status)) {
|
||||
errln("%s:%d createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
|
||||
__FILE__, __LINE__, u_errorName(status), parseError.line, parseError.offset);
|
||||
|
@ -1048,7 +1048,6 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
|
|||
errln("%s:%d Built rules and rebuilt rules are different %s", __FILE__, __LINE__, dataFile);
|
||||
return;
|
||||
}
|
||||
delete brkItr;
|
||||
}
|
||||
|
||||
void RBBIAPITest::TestRoundtripRules() {
|
||||
|
|
|
@ -105,6 +105,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
TESTCASE_AUTO(TestBug12932);
|
||||
TESTCASE_AUTO(TestEmoji);
|
||||
TESTCASE_AUTO(TestBug12519);
|
||||
TESTCASE_AUTO(TestBug12677);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
@ -4436,6 +4437,23 @@ void RBBITest::TestBug12519() {
|
|||
assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
|
||||
}
|
||||
|
||||
void RBBITest::TestBug12677() {
|
||||
// Check that stripping of comments from rules for getRules() is not confused by
|
||||
// the presence of '#' characters in the rules that do not introduce comments.
|
||||
UnicodeString rules(u"!!forward; \n"
|
||||
"$x = [ab#]; # a set with a # literal. \n"
|
||||
" # .; # a comment that looks sort of like a rule. \n"
|
||||
" '#' '?'; # a rule with a quoted # \n"
|
||||
);
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UParseError pe;
|
||||
RuleBasedBreakIterator bi (rules, pe, status);
|
||||
assertSuccess(WHERE, status);
|
||||
UnicodeString rtRules = bi.getRules();
|
||||
assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
|
||||
}
|
||||
|
||||
//
|
||||
// TestDebug - A place-holder test for debugging purposes.
|
||||
// For putting in fragments of other tests that can be invoked
|
||||
|
|
|
@ -74,6 +74,7 @@ public:
|
|||
void TestBug12932();
|
||||
void TestEmoji();
|
||||
void TestBug12519();
|
||||
void TestBug12677();
|
||||
|
||||
void TestDebug();
|
||||
void TestProperties();
|
||||
|
|
|
@ -28,6 +28,7 @@ class RBBIRuleBuilder {
|
|||
|
||||
String fDebugEnv; // controls debug trace output
|
||||
String fRules; // The rule string that we are compiling
|
||||
StringBuilder fStrippedRules; // The rule string, with comments stripped.
|
||||
RBBIRuleScanner fScanner; // The scanner.
|
||||
|
||||
|
||||
|
@ -142,6 +143,7 @@ class RBBIRuleBuilder {
|
|||
fDebugEnv = ICUDebug.enabled("rbbi") ?
|
||||
ICUDebug.value("rbbi") : null;
|
||||
fRules = rules;
|
||||
fStrippedRules = new StringBuilder(rules);
|
||||
fUSetNodes = new ArrayList<RBBINode>();
|
||||
fRuleStatusVals = new ArrayList<Integer>();
|
||||
fScanner = new RBBIRuleScanner(this);
|
||||
|
@ -165,8 +167,9 @@ class RBBIRuleBuilder {
|
|||
DataOutputStream dos = new DataOutputStream(os);
|
||||
int i;
|
||||
|
||||
// Remove comments and whitespace from the rules to make it smaller.
|
||||
String strippedRules = RBBIRuleScanner.stripRules(fRules);
|
||||
// Remove whitespace from the rules to make it smaller.
|
||||
// The rule parser has already removed comments.
|
||||
String strippedRules = RBBIRuleScanner.stripRules(fStrippedRules.toString());
|
||||
|
||||
// Calculate the size of each section in the data in bytes.
|
||||
// Sizes here are padded up to a multiple of 8 for better memory alignment.
|
||||
|
|
|
@ -14,6 +14,7 @@ import java.util.HashMap;
|
|||
import com.ibm.icu.impl.Assert;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
|
||||
/**
|
||||
* This class is part of the Rule Based Break Iterator rule compiler.
|
||||
|
@ -696,17 +697,16 @@ class RBBIRuleScanner {
|
|||
static String stripRules(String rules) {
|
||||
StringBuilder strippedRules = new StringBuilder();
|
||||
int rulesLength = rules.length();
|
||||
for (int idx = 0; idx < rulesLength;) {
|
||||
char ch = rules.charAt(idx++);
|
||||
if (ch == '#') {
|
||||
while (idx < rulesLength
|
||||
&& ch != '\r' && ch != '\n' && ch != chNEL) {
|
||||
ch = rules.charAt(idx++);
|
||||
}
|
||||
}
|
||||
if (!UCharacter.isISOControl(ch)) {
|
||||
strippedRules.append(ch);
|
||||
boolean skippingSpaces = false;
|
||||
|
||||
for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) {
|
||||
int cp = rules.codePointAt(idx);
|
||||
boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE);
|
||||
if (skippingSpaces && whiteSpace) {
|
||||
continue;
|
||||
}
|
||||
strippedRules.appendCodePoint(cp);
|
||||
skippingSpaces = whiteSpace;
|
||||
}
|
||||
return strippedRules.toString();
|
||||
}
|
||||
|
@ -800,6 +800,7 @@ class RBBIRuleScanner {
|
|||
// It will be treated as white-space, and serves to break up anything
|
||||
// that might otherwise incorrectly clump together with a comment in
|
||||
// the middle (a variable name, for example.)
|
||||
int commentStart = fScanIndex;
|
||||
for (;;) {
|
||||
c.fChar = nextCharLL();
|
||||
if (c.fChar == -1 || // EOF
|
||||
|
@ -811,6 +812,9 @@ class RBBIRuleScanner {
|
|||
break;
|
||||
}
|
||||
}
|
||||
for (int i=commentStart; i<fNextIndex-1; ++i) {
|
||||
fRB.fStrippedRules.setCharAt(i, ' ');
|
||||
}
|
||||
}
|
||||
if (c.fChar == -1) {
|
||||
return;
|
||||
|
|
|
@ -548,4 +548,18 @@ public class RBBITest extends TestFmwk {
|
|||
assertEquals("", t1.fExpectedBoundaries, t1.fBoundaries);
|
||||
assertEquals("", t2.fExpectedBoundaries, t2.fBoundaries);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestBug12677() {
|
||||
// Check that stripping of comments from rules for getRules() is not confused by
|
||||
// the presence of '#' characters in the rules that do not introduce comments.
|
||||
String rules = "!!forward; \n"
|
||||
+ "$x = [ab#]; # a set with a # literal. \n"
|
||||
+ " # .; # a comment that looks sort of like a rule. \n"
|
||||
+ " '#' '?'; # a rule with a quoted # \n";
|
||||
|
||||
RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
|
||||
String rtRules = bi.toString(); // getRules() in C++
|
||||
assertEquals("Break Iterator rule stripping test", "!!forward; $x = [ab#]; '#' '?'; ", rtRules);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue