ICU-12677 RBBI, fix incorrect stripping of comments from saved rules.

X-SVN-Rev: 40837
This commit is contained in:
Andy Heninger 2018-02-03 19:10:50 +00:00
parent f74b42f960
commit aaebaf90a3
9 changed files with 77 additions and 35 deletions

View file

@ -47,7 +47,7 @@ U_NAMESPACE_BEGIN
RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
UParseError *parseErr,
UErrorCode &status)
: fRules(rules)
: fRules(rules), fStrippedRules(rules)
{
fStatus = &status; // status is checked below
fParseError = parseErr;
@ -147,8 +147,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
return NULL;
}
// Remove comments and whitespace from the rules to make it smaller.
UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
// Remove whitespace from the rules to make it smaller.
// The rule parser has already removed comments.
fStrippedRules = fScanner->stripRules(fStrippedRules);
// Calculate the size of each section in the data.
// Sizes here are padded up to a multiple of 8 for better memory alignment.
@ -162,7 +163,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
int32_t trieSize = align8(fSetBuilder->getTrieSize());
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
(void)safeFwdTableSize;
@ -225,7 +226,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
data->fStatusTable = data->fTrie + trieSize;
data->fStatusTableLen= statusTableSize;
data->fRuleSource = data->fStatusTable + statusTableSize;
data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar);
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
@ -245,7 +246,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
}
strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
return data;
}

View file

@ -130,6 +130,7 @@ public:
UErrorCode *fStatus; // Error reporting. Keeping status
UParseError *fParseError; // here avoids passing it everywhere.
const UnicodeString &fRules; // The rule string that we are compiling
UnicodeString fStrippedRules; // The rule string, with comments stripped.
RBBIRuleScanner *fScanner; // The scanner.
RBBINode *fForwardTree; // The parse trees, generated by the scanner,

View file

@ -822,27 +822,24 @@ static const UChar chRParen = 0x29;
//------------------------------------------------------------------------------
//
// stripRules Return a rules string without unnecessary
// characters.
// stripRules Return a rules string without extra spaces.
// (Comments are removed separately, during rule parsing.)
//
//------------------------------------------------------------------------------
UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
UnicodeString strippedRules;
int rulesLength = rules.length();
for (int idx = 0; idx < rulesLength; ) {
UChar ch = rules[idx++];
if (ch == chPound) {
while (idx < rulesLength
&& ch != chCR && ch != chLF && ch != chNEL)
{
ch = rules[idx++];
}
}
if (!u_isISOControl(ch)) {
strippedRules.append(ch);
int32_t rulesLength = rules.length();
bool skippingSpaces = false;
for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) {
UChar32 cp = rules.char32At(idx);
bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
if (skippingSpaces && whiteSpace) {
continue;
}
strippedRules.append(cp);
skippingSpaces = whiteSpace;
}
// strippedRules = strippedRules.unescape();
return strippedRules;
}
@ -942,6 +939,7 @@ void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
// It will be treated as white-space, and serves to break up anything
// that might otherwise incorrectly clump together with a comment in
// the middle (a variable name, for example.)
int32_t commentStart = fScanIndex;
for (;;) {
c.fChar = nextCharLL();
if (c.fChar == (UChar32)-1 || // EOF
@ -950,6 +948,9 @@ void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
c.fChar == chNEL ||
c.fChar == chLS) {break;}
}
for (int32_t i=commentStart; i<fNextIndex-1; ++i) {
fRB->fStrippedRules.setCharAt(i, u' ');
}
}
if (c.fChar == (UChar32)-1) {
return;

View file

@ -1035,7 +1035,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
builtRules = (const uint8_t *)udata_getMemory(data.getAlias());
builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status);
LocalPointer<RuleBasedBreakIterator> brkItr (new RuleBasedBreakIterator(builtSource, parseError, status));
if (U_FAILURE(status)) {
errln("%s:%d createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
__FILE__, __LINE__, u_errorName(status), parseError.line, parseError.offset);
@ -1048,7 +1048,6 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
errln("%s:%d Built rules and rebuilt rules are different %s", __FILE__, __LINE__, dataFile);
return;
}
delete brkItr;
}
void RBBIAPITest::TestRoundtripRules() {

View file

@ -105,6 +105,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestBug12932);
TESTCASE_AUTO(TestEmoji);
TESTCASE_AUTO(TestBug12519);
TESTCASE_AUTO(TestBug12677);
TESTCASE_AUTO_END;
}
@ -4436,6 +4437,23 @@ void RBBITest::TestBug12519() {
assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
}
void RBBITest::TestBug12677() {
// Check that stripping of comments from rules for getRules() is not confused by
// the presence of '#' characters in the rules that do not introduce comments.
UnicodeString rules(u"!!forward; \n"
"$x = [ab#]; # a set with a # literal. \n"
" # .; # a comment that looks sort of like a rule. \n"
" '#' '?'; # a rule with a quoted # \n"
);
UErrorCode status = U_ZERO_ERROR;
UParseError pe;
RuleBasedBreakIterator bi (rules, pe, status);
assertSuccess(WHERE, status);
UnicodeString rtRules = bi.getRules();
assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
}
//
// TestDebug - A place-holder test for debugging purposes.
// For putting in fragments of other tests that can be invoked

View file

@ -74,6 +74,7 @@ public:
void TestBug12932();
void TestEmoji();
void TestBug12519();
void TestBug12677();
void TestDebug();
void TestProperties();

View file

@ -28,6 +28,7 @@ class RBBIRuleBuilder {
String fDebugEnv; // controls debug trace output
String fRules; // The rule string that we are compiling
StringBuilder fStrippedRules; // The rule string, with comments stripped.
RBBIRuleScanner fScanner; // The scanner.
@ -142,6 +143,7 @@ class RBBIRuleBuilder {
fDebugEnv = ICUDebug.enabled("rbbi") ?
ICUDebug.value("rbbi") : null;
fRules = rules;
fStrippedRules = new StringBuilder(rules);
fUSetNodes = new ArrayList<RBBINode>();
fRuleStatusVals = new ArrayList<Integer>();
fScanner = new RBBIRuleScanner(this);
@ -165,8 +167,9 @@ class RBBIRuleBuilder {
DataOutputStream dos = new DataOutputStream(os);
int i;
// Remove comments and whitespace from the rules to make it smaller.
String strippedRules = RBBIRuleScanner.stripRules(fRules);
// Remove whitespace from the rules to make it smaller.
// The rule parser has already removed comments.
String strippedRules = RBBIRuleScanner.stripRules(fStrippedRules.toString());
// Calculate the size of each section in the data in bytes.
// Sizes here are padded up to a multiple of 8 for better memory alignment.

View file

@ -14,6 +14,7 @@ import java.util.HashMap;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
/**
* This class is part of the Rule Based Break Iterator rule compiler.
@ -696,17 +697,16 @@ class RBBIRuleScanner {
static String stripRules(String rules) {
StringBuilder strippedRules = new StringBuilder();
int rulesLength = rules.length();
for (int idx = 0; idx < rulesLength;) {
char ch = rules.charAt(idx++);
if (ch == '#') {
while (idx < rulesLength
&& ch != '\r' && ch != '\n' && ch != chNEL) {
ch = rules.charAt(idx++);
}
}
if (!UCharacter.isISOControl(ch)) {
strippedRules.append(ch);
boolean skippingSpaces = false;
for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) {
int cp = rules.codePointAt(idx);
boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE);
if (skippingSpaces && whiteSpace) {
continue;
}
strippedRules.appendCodePoint(cp);
skippingSpaces = whiteSpace;
}
return strippedRules.toString();
}
@ -800,6 +800,7 @@ class RBBIRuleScanner {
// It will be treated as white-space, and serves to break up anything
// that might otherwise incorrectly clump together with a comment in
// the middle (a variable name, for example.)
int commentStart = fScanIndex;
for (;;) {
c.fChar = nextCharLL();
if (c.fChar == -1 || // EOF
@ -811,6 +812,9 @@ class RBBIRuleScanner {
break;
}
}
for (int i=commentStart; i<fNextIndex-1; ++i) {
fRB.fStrippedRules.setCharAt(i, ' ');
}
}
if (c.fChar == -1) {
return;

View file

@ -548,4 +548,18 @@ public class RBBITest extends TestFmwk {
assertEquals("", t1.fExpectedBoundaries, t1.fBoundaries);
assertEquals("", t2.fExpectedBoundaries, t2.fBoundaries);
}
@Test
public void TestBug12677() {
// Check that stripping of comments from rules for getRules() is not confused by
// the presence of '#' characters in the rules that do not introduce comments.
String rules = "!!forward; \n"
+ "$x = [ab#]; # a set with a # literal. \n"
+ " # .; # a comment that looks sort of like a rule. \n"
+ " '#' '?'; # a rule with a quoted # \n";
RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
String rtRules = bi.toString(); // getRules() in C++
assertEquals("Break Iterator rule stripping test", "!!forward; $x = [ab#]; '#' '?'; ", rtRules);
}
}