ICU-1572 fix UnicodeSet.toPattern() round trip bugs

X-SVN-Rev: 7245
This commit is contained in:
Alan Liu 2001-12-01 01:33:41 +00:00
parent 221d9f6880
commit 302bf822c7
5 changed files with 129 additions and 26 deletions

View file

@ -402,11 +402,13 @@ void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool useHexEscape)
case BACKSLASH:
case 123/*{*/:
case 125/*}*/:
case SymbolTable::SYMBOL_REF:
case COLON:
buf.append(BACKSLASH);
break;
default:
// Escape whitespace
if (Unicode::isWhitespace(c)) {
if (u_isspace(c)) {
buf.append(BACKSLASH);
}
break;
@ -435,8 +437,9 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
if (pat.length() > 0) {
int32_t i;
int32_t backslashCount = 0;
for (i=0; i<pat.length(); ++i) {
UChar c = pat.charAt(i);
for (i=0; i<pat.length(); ) {
UChar32 c = pat.char32At(i);
i += UTF_CHAR_LENGTH(c);
if (escapeUnprintable && Utility::isUnprintable(c)) {
// If the unprintable character is preceded by an odd
// number of backslashes, then it has been escaped.
@ -940,6 +943,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
const UChar32 NONE = (UChar32) -1;
UChar32 lastChar = NONE; // This is either a char (0..10FFFF) or NONE
UBool isLastLiteral = FALSE; // TRUE if lastChar was a literal
UChar lastOp = 0;
/* This loop iterates over the characters in the pattern. We start at
@ -1269,6 +1273,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
_appendToPat(newPat, lastChar, FALSE);
}
lastChar = c;
isLastLiteral = isLiteral;
}
}
@ -1281,7 +1286,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
// Treat a trailing '$' as indicating ETHER. This code is only
// executed if symbols == NULL; otherwise other code parses the
// anchor.
if (lastChar == (UChar)SymbolTable::SYMBOL_REF) {
if (lastChar == (UChar)SymbolTable::SYMBOL_REF && !isLastLiteral) {
rebuildPattern = TRUE;
newPat.append(lastChar);
add(TransliterationRule::ETHER);

View file

@ -44,10 +44,88 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
CASE(7,TestPropertySet);
CASE(8,TestClone);
CASE(9,TestExhaustive);
CASE(10,TestToPattern);
default: name = ""; break;
}
}
/**
* Test that toPattern() round trips with syntax characters and
* whitespace.
*/
void UnicodeSetTest::TestToPattern() {
for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
// check various combinations to make sure they all work.
if (i != 0 && !toPatternAux(i, i)) continue;
if (!toPatternAux(0, i)) continue;
if (!toPatternAux(i, 0xFFFF)) continue;
}
}
UErrorCode ec = U_ZERO_ERROR;
UnicodeString spat = "[:nonspacing mark:]";
UnicodeSet s(spat, ec);
if (U_FAILURE(ec)) { errln("FAIL: UnicodeSet constructor"); return; }
UnicodeString tpat;
s.toPattern(tpat, TRUE);
UnicodeSet t(tpat, ec);
if (U_FAILURE(ec)) {
errln((UnicodeString)"FAIL: " + spat + ".toPattern() => " + tpat +
": INVALID PATTERN");
} else {
if (s!=t) {
UnicodeString str;
t.toPattern(str, TRUE);
errln((UnicodeString)"FAIL: " + spat + ".toPattern().new UnicodeSet() => " +
str);
}
}
}
/**
 * Round-trip helper: builds the set containing the range start..end,
 * renders it with toPattern() twice (second argument TRUE, then FALSE),
 * and uses checkPat() to verify that each rendering parses back to the
 * same set.
 *
 * @param start first code point of the range under test
 * @param end   last code point of the range under test
 * @return TRUE if both generated patterns round-trip; FALSE as soon as
 *         one of the checkPat() calls fails
 */
UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
    // Build the label used in log and error messages.  The string must be
    // declared before it is appended to: writing
    // "UnicodeString source = source + ..." would read the object inside
    // its own initializer, before its constructor has run.
    // TODO do these in hex
    UnicodeString source;
    source = source + (int32_t)start;
    if (start != end) source = source + ".." + (int32_t)end;

    UnicodeSet testSet;
    testSet.add(start, end);

    // What we want to make sure of is that a pattern generated
    // by toPattern(), with or without escaped unprintables, can
    // be passed back into the UnicodeSet constructor.
    UnicodeString pat0; testSet.toPattern(pat0, TRUE);
    if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;

    UnicodeString pat2; testSet.toPattern(pat2, FALSE);
    if (!checkPat(source, testSet, pat2)) return FALSE;

    // NOTE(review): earlier commented-out code here also checked
    // unescapeLeniently(pat0) and unescapeLeniently(pat2) ("in code"
    // variants); those checks are not performed by this function.

    logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
    return TRUE;
}
/**
 * Verify that a pattern string parses back into the expected set.
 *
 * @param source  label describing the set under test; used only in the
 *                error message
 * @param testSet the set that the pattern is expected to reproduce
 * @param pat     the pattern to parse
 * @return TRUE if pat parses without error into a set equal to testSet;
 *         otherwise FALSE, after reporting the failure through errln()
 */
UBool UnicodeSetTest::checkPat(const UnicodeString& source,
                               const UnicodeSet& testSet,
                               const UnicodeString& pat) {
    UErrorCode ec = U_ZERO_ERROR;
    UnicodeSet testSet2(pat, ec);

    // A pattern that cannot be parsed at all is a round-trip failure in
    // its own right; report it explicitly instead of relying on the set
    // comparison below to happen to catch it.
    if (U_FAILURE(ec)) {
        errln((UnicodeString)"Fail toPattern: " + source + " => " + pat +
              ": INVALID PATTERN");
        return FALSE;
    }

    if (testSet2 != testSet) {
        errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
        return FALSE;
    }
    return TRUE;
}
void
UnicodeSetTest::TestPatterns(void) {
UnicodeSet set;

View file

@ -27,6 +27,12 @@ class UnicodeSetTest: public IntlTest {
private:
/**
* Test that toPattern() round trips with syntax characters and
* whitespace.
*/
void TestToPattern();
void TestPatterns(void);
void TestCategories(void);
void TestAddRemove(void);
@ -52,6 +58,10 @@ private:
private:
UBool toPatternAux(UChar32 start, UChar32 end);
UBool checkPat(const UnicodeString& source, const UnicodeSet& testSet, const UnicodeString& pat);
void _testComplement(int32_t a, UnicodeSet&, UnicodeSet&);
void _testAdd(int32_t a, int32_t b, UnicodeSet&, UnicodeSet&, UnicodeSet&);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $
* $Date: 2001/11/29 22:31:18 $
* $Revision: 1.48 $
* $Date: 2001/12/01 01:31:18 $
* $Revision: 1.49 $
*
*****************************************************************************************
*/
@ -204,7 +204,7 @@ import com.ibm.util.Utility;
* Unicode property
* </table>
* @author Alan Liu
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.48 $ $Date: 2001/11/29 22:31:18 $
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.49 $ $Date: 2001/12/01 01:31:18 $
*/
public class UnicodeSet extends UnicodeFilter {
@ -443,6 +443,8 @@ public class UnicodeSet extends UnicodeFilter {
case '\\': //BACKSLASH:
case '{':
case '}':
case '$':
case ':':
buf.append('\\');
break;
default:
@ -475,8 +477,9 @@ public class UnicodeSet extends UnicodeFilter {
if (pat != null) {
int i;
int backslashCount = 0;
for (i=0; i<pat.length(); ++i) {
char c = pat.charAt(i);
for (i=0; i<pat.length(); ) {
int c = UTF16.charAt(pat, i);
i += UTF16.getCharCount(c);
if (escapeUnprintable && Utility.isUnprintable(c)) {
// If the unprintable character is preceded by an odd
// number of backslashes, then it has been escaped.
@ -488,7 +491,7 @@ public class UnicodeSet extends UnicodeFilter {
Utility.escapeUnprintable(result, c);
backslashCount = 0;
} else {
result.append(c);
UTF16.append(result, c);
if (c == '\\') {
++backslashCount;
} else {
@ -706,7 +709,7 @@ public class UnicodeSet extends UnicodeFilter {
int start = list[i++];
int count = list[i++] - start;
if (index < count) {
return (char)(start + index);
return start + index;
}
index -= count;
}
@ -1114,6 +1117,7 @@ public class UnicodeSet extends UnicodeFilter {
final int NONE = -1;
int lastChar = NONE; // This is either a char (0..10FFFF) or -1
boolean isLastLiteral = false; // TRUE if lastChar was a literal
char lastOp = 0;
/* This loop iterates over the characters in the pattern. We start at
@ -1343,7 +1347,7 @@ public class UnicodeSet extends UnicodeFilter {
if (lastOp != 0) {
throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
}
add((char) lastChar, (char) lastChar);
add(lastChar, lastChar);
if (nestedPatDone) {
// If there was a character before the nested set,
// then we need to insert it in newPat before the
@ -1416,10 +1420,11 @@ public class UnicodeSet extends UnicodeFilter {
} else {
if (lastChar != NONE) {
// We have <char><char>
add((char) lastChar, (char) lastChar);
add(lastChar, lastChar);
_appendToPat(newPat, lastChar, false);
}
lastChar = c;
isLastLiteral = isLiteral;
}
}
@ -1430,9 +1435,9 @@ public class UnicodeSet extends UnicodeFilter {
// Treat a trailing '$' as indicating ETHER. This code is only
// executed if symbols == NULL; otherwise other code parses the
// anchor.
if (lastChar == SymbolTable.SYMBOL_REF) {
if (lastChar == SymbolTable.SYMBOL_REF && !isLastLiteral) {
rebuildPattern = true;
newPat.append(lastChar);
newPat.append((char) lastChar);
add(TransliterationRule.ETHER);
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $
* $Date: 2001/11/29 22:31:18 $
* $Revision: 1.48 $
* $Date: 2001/12/01 01:31:18 $
* $Revision: 1.49 $
*
*****************************************************************************************
*/
@ -204,7 +204,7 @@ import com.ibm.util.Utility;
* Unicode property
* </table>
* @author Alan Liu
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.48 $ $Date: 2001/11/29 22:31:18 $
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.49 $ $Date: 2001/12/01 01:31:18 $
*/
public class UnicodeSet extends UnicodeFilter {
@ -443,6 +443,8 @@ public class UnicodeSet extends UnicodeFilter {
case '\\': //BACKSLASH:
case '{':
case '}':
case '$':
case ':':
buf.append('\\');
break;
default:
@ -475,8 +477,9 @@ public class UnicodeSet extends UnicodeFilter {
if (pat != null) {
int i;
int backslashCount = 0;
for (i=0; i<pat.length(); ++i) {
char c = pat.charAt(i);
for (i=0; i<pat.length(); ) {
int c = UTF16.charAt(pat, i);
i += UTF16.getCharCount(c);
if (escapeUnprintable && Utility.isUnprintable(c)) {
// If the unprintable character is preceded by an odd
// number of backslashes, then it has been escaped.
@ -488,7 +491,7 @@ public class UnicodeSet extends UnicodeFilter {
Utility.escapeUnprintable(result, c);
backslashCount = 0;
} else {
result.append(c);
UTF16.append(result, c);
if (c == '\\') {
++backslashCount;
} else {
@ -706,7 +709,7 @@ public class UnicodeSet extends UnicodeFilter {
int start = list[i++];
int count = list[i++] - start;
if (index < count) {
return (char)(start + index);
return start + index;
}
index -= count;
}
@ -1114,6 +1117,7 @@ public class UnicodeSet extends UnicodeFilter {
final int NONE = -1;
int lastChar = NONE; // This is either a char (0..10FFFF) or -1
boolean isLastLiteral = false; // TRUE if lastChar was a literal
char lastOp = 0;
/* This loop iterates over the characters in the pattern. We start at
@ -1343,7 +1347,7 @@ public class UnicodeSet extends UnicodeFilter {
if (lastOp != 0) {
throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
}
add((char) lastChar, (char) lastChar);
add(lastChar, lastChar);
if (nestedPatDone) {
// If there was a character before the nested set,
// then we need to insert it in newPat before the
@ -1416,10 +1420,11 @@ public class UnicodeSet extends UnicodeFilter {
} else {
if (lastChar != NONE) {
// We have <char><char>
add((char) lastChar, (char) lastChar);
add(lastChar, lastChar);
_appendToPat(newPat, lastChar, false);
}
lastChar = c;
isLastLiteral = isLiteral;
}
}
@ -1430,9 +1435,9 @@ public class UnicodeSet extends UnicodeFilter {
// Treat a trailing '$' as indicating ETHER. This code is only
// executed if symbols == NULL; otherwise other code parses the
// anchor.
if (lastChar == SymbolTable.SYMBOL_REF) {
if (lastChar == SymbolTable.SYMBOL_REF && !isLastLiteral) {
rebuildPattern = true;
newPat.append(lastChar);
newPat.append((char) lastChar);
add(TransliterationRule.ETHER);
}