mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-21 12:40:02 +00:00
ICU-1572 fix UnicodeSet.toPattern() round trip bugs
X-SVN-Rev: 7245
This commit is contained in:
parent
221d9f6880
commit
302bf822c7
5 changed files with 129 additions and 26 deletions
|
@ -402,11 +402,13 @@ void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool useHexEscape)
|
|||
case BACKSLASH:
|
||||
case 123/*{*/:
|
||||
case 125/*}*/:
|
||||
case SymbolTable::SYMBOL_REF:
|
||||
case COLON:
|
||||
buf.append(BACKSLASH);
|
||||
break;
|
||||
default:
|
||||
// Escape whitespace
|
||||
if (Unicode::isWhitespace(c)) {
|
||||
if (u_isspace(c)) {
|
||||
buf.append(BACKSLASH);
|
||||
}
|
||||
break;
|
||||
|
@ -435,8 +437,9 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
|
|||
if (pat.length() > 0) {
|
||||
int32_t i;
|
||||
int32_t backslashCount = 0;
|
||||
for (i=0; i<pat.length(); ++i) {
|
||||
UChar c = pat.charAt(i);
|
||||
for (i=0; i<pat.length(); ) {
|
||||
UChar32 c = pat.char32At(i);
|
||||
i += UTF_CHAR_LENGTH(c);
|
||||
if (escapeUnprintable && Utility::isUnprintable(c)) {
|
||||
// If the unprintable character is preceded by an odd
|
||||
// number of backslashes, then it has been escaped.
|
||||
|
@ -940,6 +943,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
|
||||
const UChar32 NONE = (UChar32) -1;
|
||||
UChar32 lastChar = NONE; // This is either a char (0..10FFFF) or NONE
|
||||
UBool isLastLiteral = FALSE; // TRUE if lastChar was a literal
|
||||
UChar lastOp = 0;
|
||||
|
||||
/* This loop iterates over the characters in the pattern. We start at
|
||||
|
@ -1269,6 +1273,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
_appendToPat(newPat, lastChar, FALSE);
|
||||
}
|
||||
lastChar = c;
|
||||
isLastLiteral = isLiteral;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1281,7 +1286,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
// Treat a trailing '$' as indicating ETHER. This code is only
|
||||
// executed if symbols == NULL; otherwise other code parses the
|
||||
// anchor.
|
||||
if (lastChar == (UChar)SymbolTable::SYMBOL_REF) {
|
||||
if (lastChar == (UChar)SymbolTable::SYMBOL_REF && !isLastLiteral) {
|
||||
rebuildPattern = TRUE;
|
||||
newPat.append(lastChar);
|
||||
add(TransliterationRule::ETHER);
|
||||
|
|
|
@ -44,10 +44,88 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
|
|||
CASE(7,TestPropertySet);
|
||||
CASE(8,TestClone);
|
||||
CASE(9,TestExhaustive);
|
||||
CASE(10,TestToPattern);
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that toPattern() round trips with syntax characters and
|
||||
* whitespace.
|
||||
*/
|
||||
void UnicodeSetTest::TestToPattern() {
|
||||
for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
|
||||
if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
|
||||
// check various combinations to make sure they all work.
|
||||
if (i != 0 && !toPatternAux(i, i)) continue;
|
||||
if (!toPatternAux(0, i)) continue;
|
||||
if (!toPatternAux(i, 0xFFFF)) continue;
|
||||
}
|
||||
}
|
||||
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
UnicodeString spat = "[:nonspacing mark:]";
|
||||
UnicodeSet s(spat, ec);
|
||||
if (U_FAILURE(ec)) { errln("FAIL: UnicodeSet constructor"); return; }
|
||||
UnicodeString tpat;
|
||||
s.toPattern(tpat, TRUE);
|
||||
UnicodeSet t(tpat, ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
errln((UnicodeString)"FAIL: " + spat + ".toPattern() => " + tpat +
|
||||
": INVALID PATTERN");
|
||||
} else {
|
||||
if (s!=t) {
|
||||
UnicodeString str;
|
||||
t.toPattern(str, TRUE);
|
||||
errln((UnicodeString)"FAIL: " + spat + ".toPattern().new UnicodeSet() => " +
|
||||
str);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds the set containing the range [start, end] and checks that the
 * patterns produced by toPattern(), both with and without escaped
 * unprintables, can be passed back into the UnicodeSet constructor and
 * reproduce the same set.
 *
 * @param start first code point of the range under test
 * @param end   last code point of the range under test
 * @return TRUE if both patterns round-trip; FALSE as soon as one does
 *         not (checkPat() has already reported the failure via errln)
 */
UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
    // Human-readable label for log and error messages: "start" or
    // "start..end".
    // NOTE: the label must be constructed *before* it is appended to;
    // the previous form `UnicodeString source = source + ...` read the
    // object inside its own initializer, which is undefined behavior.
    // TODO do these in hex
    UnicodeString source;
    source = source + (int32_t)start;
    if (start != end) source = source + ".." + (int32_t)end;

    UnicodeSet testSet;
    testSet.add(start, end);

    // What we want to make sure of is that a pattern generated
    // by toPattern(), with or without escaped unprintables, can
    // be passed back into the UnicodeSet constructor.
    UnicodeString pat0;
    testSet.toPattern(pat0, TRUE);
    if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;

    // TODO: also check unescapeLeniently(pat0) " (in code)" once an
    // equivalent of the Java helper is available here.

    UnicodeString pat2;
    testSet.toPattern(pat2, FALSE);
    if (!checkPat(source, testSet, pat2)) return FALSE;

    // TODO: also check unescapeLeniently(pat2) " (in code)".

    logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
    return TRUE;
}
|
||||
|
||||
/**
 * Verifies that a pattern string parses and reproduces a given set.
 *
 * @param source  label describing the set under test; used only in the
 *                failure message
 * @param testSet the set that `pat` was generated from
 * @param pat     the pattern to parse back into a UnicodeSet
 * @return TRUE if `pat` parses without error and the resulting set
 *         equals `testSet`; FALSE otherwise (the failure is reported
 *         via errln)
 */
UBool UnicodeSetTest::checkPat(const UnicodeString& source,
                               const UnicodeSet& testSet,
                               const UnicodeString& pat) {
    UErrorCode ec = U_ZERO_ERROR;
    UnicodeSet testSet2(pat, ec);

    // A pattern that fails to parse is a round-trip failure in its own
    // right. Report it explicitly instead of relying on the set
    // comparison below, which says nothing about why the sets differ
    // and could even succeed by accident.
    if (U_FAILURE(ec)) {
        errln((UnicodeString)"Fail toPattern: " + source + " => " + pat +
              ": INVALID PATTERN");
        return FALSE;
    }

    if (testSet2 != testSet) {
        errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
        return FALSE;
    }
    return TRUE;
}
|
||||
|
||||
void
|
||||
UnicodeSetTest::TestPatterns(void) {
|
||||
UnicodeSet set;
|
||||
|
|
|
@ -27,6 +27,12 @@ class UnicodeSetTest: public IntlTest {
|
|||
|
||||
private:
|
||||
|
||||
/**
|
||||
* Test that toPattern() round trips with syntax characters and
|
||||
* whitespace.
|
||||
*/
|
||||
void TestToPattern();
|
||||
|
||||
void TestPatterns(void);
|
||||
void TestCategories(void);
|
||||
void TestAddRemove(void);
|
||||
|
@ -52,6 +58,10 @@ private:
|
|||
|
||||
private:
|
||||
|
||||
UBool toPatternAux(UChar32 start, UChar32 end);
|
||||
|
||||
UBool checkPat(const UnicodeString& source, const UnicodeSet& testSet, const UnicodeString& pat);
|
||||
|
||||
void _testComplement(int32_t a, UnicodeSet&, UnicodeSet&);
|
||||
|
||||
void _testAdd(int32_t a, int32_t b, UnicodeSet&, UnicodeSet&, UnicodeSet&);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $
|
||||
* $Date: 2001/11/29 22:31:18 $
|
||||
* $Revision: 1.48 $
|
||||
* $Date: 2001/12/01 01:31:18 $
|
||||
* $Revision: 1.49 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -204,7 +204,7 @@ import com.ibm.util.Utility;
|
|||
* Unicode property
|
||||
* </table>
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.48 $ $Date: 2001/11/29 22:31:18 $
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.49 $ $Date: 2001/12/01 01:31:18 $
|
||||
*/
|
||||
public class UnicodeSet extends UnicodeFilter {
|
||||
|
||||
|
@ -443,6 +443,8 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
case '\\': //BACKSLASH:
|
||||
case '{':
|
||||
case '}':
|
||||
case '$':
|
||||
case ':':
|
||||
buf.append('\\');
|
||||
break;
|
||||
default:
|
||||
|
@ -475,8 +477,9 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
if (pat != null) {
|
||||
int i;
|
||||
int backslashCount = 0;
|
||||
for (i=0; i<pat.length(); ++i) {
|
||||
char c = pat.charAt(i);
|
||||
for (i=0; i<pat.length(); ) {
|
||||
int c = UTF16.charAt(pat, i);
|
||||
i += UTF16.getCharCount(c);
|
||||
if (escapeUnprintable && Utility.isUnprintable(c)) {
|
||||
// If the unprintable character is preceded by an odd
|
||||
// number of backslashes, then it has been escaped.
|
||||
|
@ -488,7 +491,7 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
Utility.escapeUnprintable(result, c);
|
||||
backslashCount = 0;
|
||||
} else {
|
||||
result.append(c);
|
||||
UTF16.append(result, c);
|
||||
if (c == '\\') {
|
||||
++backslashCount;
|
||||
} else {
|
||||
|
@ -706,7 +709,7 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
int start = list[i++];
|
||||
int count = list[i++] - start;
|
||||
if (index < count) {
|
||||
return (char)(start + index);
|
||||
return start + index;
|
||||
}
|
||||
index -= count;
|
||||
}
|
||||
|
@ -1114,6 +1117,7 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
|
||||
final int NONE = -1;
|
||||
int lastChar = NONE; // This is either a char (0..10FFFF) or -1
|
||||
boolean isLastLiteral = false; // TRUE if lastChar was a literal
|
||||
char lastOp = 0;
|
||||
|
||||
/* This loop iterates over the characters in the pattern. We start at
|
||||
|
@ -1343,7 +1347,7 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
if (lastOp != 0) {
|
||||
throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
|
||||
}
|
||||
add((char) lastChar, (char) lastChar);
|
||||
add(lastChar, lastChar);
|
||||
if (nestedPatDone) {
|
||||
// If there was a character before the nested set,
|
||||
// then we need to insert it in newPat before the
|
||||
|
@ -1416,10 +1420,11 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
} else {
|
||||
if (lastChar != NONE) {
|
||||
// We have <char><char>
|
||||
add((char) lastChar, (char) lastChar);
|
||||
add(lastChar, lastChar);
|
||||
_appendToPat(newPat, lastChar, false);
|
||||
}
|
||||
lastChar = c;
|
||||
isLastLiteral = isLiteral;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1430,9 +1435,9 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
// Treat a trailing '$' as indicating ETHER. This code is only
|
||||
// executed if symbols == NULL; otherwise other code parses the
|
||||
// anchor.
|
||||
if (lastChar == SymbolTable.SYMBOL_REF) {
|
||||
if (lastChar == SymbolTable.SYMBOL_REF && !isLastLiteral) {
|
||||
rebuildPattern = true;
|
||||
newPat.append(lastChar);
|
||||
newPat.append((char) lastChar);
|
||||
add(TransliterationRule.ETHER);
|
||||
}
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $
|
||||
* $Date: 2001/11/29 22:31:18 $
|
||||
* $Revision: 1.48 $
|
||||
* $Date: 2001/12/01 01:31:18 $
|
||||
* $Revision: 1.49 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -204,7 +204,7 @@ import com.ibm.util.Utility;
|
|||
* Unicode property
|
||||
* </table>
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.48 $ $Date: 2001/11/29 22:31:18 $
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.49 $ $Date: 2001/12/01 01:31:18 $
|
||||
*/
|
||||
public class UnicodeSet extends UnicodeFilter {
|
||||
|
||||
|
@ -443,6 +443,8 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
case '\\': //BACKSLASH:
|
||||
case '{':
|
||||
case '}':
|
||||
case '$':
|
||||
case ':':
|
||||
buf.append('\\');
|
||||
break;
|
||||
default:
|
||||
|
@ -475,8 +477,9 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
if (pat != null) {
|
||||
int i;
|
||||
int backslashCount = 0;
|
||||
for (i=0; i<pat.length(); ++i) {
|
||||
char c = pat.charAt(i);
|
||||
for (i=0; i<pat.length(); ) {
|
||||
int c = UTF16.charAt(pat, i);
|
||||
i += UTF16.getCharCount(c);
|
||||
if (escapeUnprintable && Utility.isUnprintable(c)) {
|
||||
// If the unprintable character is preceded by an odd
|
||||
// number of backslashes, then it has been escaped.
|
||||
|
@ -488,7 +491,7 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
Utility.escapeUnprintable(result, c);
|
||||
backslashCount = 0;
|
||||
} else {
|
||||
result.append(c);
|
||||
UTF16.append(result, c);
|
||||
if (c == '\\') {
|
||||
++backslashCount;
|
||||
} else {
|
||||
|
@ -706,7 +709,7 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
int start = list[i++];
|
||||
int count = list[i++] - start;
|
||||
if (index < count) {
|
||||
return (char)(start + index);
|
||||
return start + index;
|
||||
}
|
||||
index -= count;
|
||||
}
|
||||
|
@ -1114,6 +1117,7 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
|
||||
final int NONE = -1;
|
||||
int lastChar = NONE; // This is either a char (0..10FFFF) or -1
|
||||
boolean isLastLiteral = false; // TRUE if lastChar was a literal
|
||||
char lastOp = 0;
|
||||
|
||||
/* This loop iterates over the characters in the pattern. We start at
|
||||
|
@ -1343,7 +1347,7 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
if (lastOp != 0) {
|
||||
throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
|
||||
}
|
||||
add((char) lastChar, (char) lastChar);
|
||||
add(lastChar, lastChar);
|
||||
if (nestedPatDone) {
|
||||
// If there was a character before the nested set,
|
||||
// then we need to insert it in newPat before the
|
||||
|
@ -1416,10 +1420,11 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
} else {
|
||||
if (lastChar != NONE) {
|
||||
// We have <char><char>
|
||||
add((char) lastChar, (char) lastChar);
|
||||
add(lastChar, lastChar);
|
||||
_appendToPat(newPat, lastChar, false);
|
||||
}
|
||||
lastChar = c;
|
||||
isLastLiteral = isLiteral;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1430,9 +1435,9 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
// Treat a trailing '$' as indicating ETHER. This code is only
|
||||
// executed if symbols == NULL; otherwise other code parses the
|
||||
// anchor.
|
||||
if (lastChar == SymbolTable.SYMBOL_REF) {
|
||||
if (lastChar == SymbolTable.SYMBOL_REF && !isLastLiteral) {
|
||||
rebuildPattern = true;
|
||||
newPat.append(lastChar);
|
||||
newPat.append((char) lastChar);
|
||||
add(TransliterationRule.ETHER);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue