mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-4790 more spoof detection
X-SVN-Rev: 25921
This commit is contained in:
parent
c480d63da5
commit
25be8362b9
3 changed files with 104 additions and 34 deletions
|
@ -234,48 +234,89 @@ uspoof_check(const USpoofChecker *sc,
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: add USPOOF_INVISIBLE check
|
||||
|
||||
if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
|
||||
// The basic test is the same for both whole and mixed script confusables.
|
||||
// Compute the set of scripts that every input character has a confusable in.
|
||||
// For this computation an input character is always considered to be
|
||||
// confusable with itself in its own script.
|
||||
// If the number of such scripts is two or more, and the input consisted of
|
||||
// characters all from a single script, we have a whole script confusable.
|
||||
// (The two scripts will be the original script and the one that is confusable)
|
||||
// If the number of such scripts >= one, and the original input contained characters from
|
||||
// more than one script, we have a mixed script confusable. (We can transform
|
||||
// some of the characters, and end up with a visually similar string all in
|
||||
// one script.)
|
||||
|
||||
if (This->fChecks &
|
||||
(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
|
||||
// These are the checks that need to be done on NFKD input
|
||||
NFKDBuffer normalizedInput(text, length, *status);
|
||||
const UChar *nfkdText = normalizedInput.getBuffer();
|
||||
int32_t nfkdLength = normalizedInput.getLength();
|
||||
|
||||
if (scriptCount == -1) {
|
||||
int32_t t;
|
||||
scriptCount = This->scriptScan(text, length, t, *status);
|
||||
if (This->fChecks & USPOOF_INVISIBLE) {
|
||||
|
||||
// scan for more than one occurrence of the same non-spacing mark
|
||||
// in a sequence of non-spacing marks.
|
||||
int32_t i;
|
||||
UChar32 c;
|
||||
UChar32 firstNonspacingMark = 0;
|
||||
UBool haveMultipleMarks = FALSE;
|
||||
UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
|
||||
|
||||
for (i=0; i<length ;) {
|
||||
U16_NEXT(nfkdText, i, nfkdLength, c);
|
||||
if (u_charType(c) != U_NON_SPACING_MARK) {
|
||||
firstNonspacingMark = 0;
|
||||
if (haveMultipleMarks) {
|
||||
marksSeenSoFar.clear();
|
||||
haveMultipleMarks = FALSE;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (firstNonspacingMark == 0) {
|
||||
firstNonspacingMark = c;
|
||||
continue;
|
||||
}
|
||||
if (!haveMultipleMarks) {
|
||||
marksSeenSoFar.add(firstNonspacingMark);
|
||||
haveMultipleMarks = TRUE;
|
||||
}
|
||||
if (marksSeenSoFar.contains(c)) {
|
||||
// report the error, and stop scanning.
|
||||
// No need to find more than the first failure.
|
||||
result |= USPOOF_INVISIBLE;
|
||||
failPos = i;
|
||||
break;
|
||||
}
|
||||
marksSeenSoFar.add(c);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
ScriptSet scripts;
|
||||
This->wholeScriptCheck(nfkdText, nfkdLength, &scripts, *status);
|
||||
int32_t confusableScriptCount = scripts.countMembers();
|
||||
//printf("confusableScriptCount = %d\n", confusableScriptCount);
|
||||
if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
|
||||
// The basic test is the same for both whole and mixed script confusables.
|
||||
// Compute the set of scripts that every input character has a confusable in.
|
||||
// For this computation an input character is always considered to be
|
||||
// confusable with itself in its own script.
|
||||
// If the number of such scripts is two or more, and the input consisted of
|
||||
// characters all from a single script, we have a whole script confusable.
|
||||
// (The two scripts will be the original script and the one that is confusable)
|
||||
// If the number of such scripts >= one, and the original input contained characters from
|
||||
// more than one script, we have a mixed script confusable. (We can transform
|
||||
// some of the characters, and end up with a visually similar string all in
|
||||
// one script.)
|
||||
|
||||
if (scriptCount == -1) {
|
||||
int32_t t;
|
||||
scriptCount = This->scriptScan(text, length, t, *status);
|
||||
}
|
||||
|
||||
ScriptSet scripts;
|
||||
This->wholeScriptCheck(nfkdText, nfkdLength, &scripts, *status);
|
||||
int32_t confusableScriptCount = scripts.countMembers();
|
||||
//printf("confusableScriptCount = %d\n", confusableScriptCount);
|
||||
|
||||
if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
|
||||
confusableScriptCount >= 2 &&
|
||||
scriptCount == 1) {
|
||||
result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
|
||||
}
|
||||
|
||||
if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
|
||||
confusableScriptCount >= 2 &&
|
||||
scriptCount == 1) {
|
||||
result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
|
||||
}
|
||||
|
||||
if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
|
||||
confusableScriptCount >= 1 &&
|
||||
scriptCount > 1) {
|
||||
result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
|
||||
if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
|
||||
confusableScriptCount >= 1 &&
|
||||
scriptCount > 1) {
|
||||
result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (position != NULL && failPos != 0x7fffffff) {
|
||||
*position = failPos;
|
||||
}
|
||||
|
|
|
@ -73,6 +73,12 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
|
|||
testAreConfusable();
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
name = "TestInvisible";
|
||||
if (exec) {
|
||||
testInvisible();
|
||||
}
|
||||
break;
|
||||
default: name=""; break;
|
||||
}
|
||||
}
|
||||
|
@ -206,7 +212,6 @@ void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
|
|||
}
|
||||
|
||||
void IntlTestSpoof::testAreConfusable() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
TEST_SETUP
|
||||
UnicodeString s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. "
|
||||
"A long string that will overflow stack buffers. A long string that will overflow stack buffers. ");
|
||||
|
@ -218,5 +223,27 @@ void IntlTestSpoof::testAreConfusable() {
|
|||
TEST_TEARDOWN;
|
||||
}
|
||||
|
||||
// Test of the USPOOF_INVISIBLE check: uspoof_checkUnicodeString() is expected to
// report USPOOF_INVISIBLE when the same non-spacing mark appears more than once
// within a single run of non-spacing marks, and to report nothing otherwise.
// NOTE(review): `sc` and `status` are not declared in this function; presumably
// they are provided by the TEST_SETUP macro (not visible here) -- confirm.
void IntlTestSpoof::testInvisible() {
|
||||
TEST_SETUP
|
||||
// Case 1: a single combining acute accent (U+0301). No repeated mark, so the
// check result is expected to be 0.
UnicodeString s = UnicodeString("abcd\\u0301ef").unescape();
|
||||
// Sentinel value; lets the test verify that `position` is left untouched
// when no problem is found.
int32_t position = -42;
|
||||
TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(position == -42);
|
||||
|
||||
// Case 2: mark sequence U+0301 U+0302 U+0301 -- the acute accent occurs twice
// in the same run of marks. `position` is not reset here; it still holds the
// sentinel only if case 1 behaved as expected.
UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
|
||||
TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status));
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
// The repeated mark sits at index 6; the expected value 7 is the index just
// past it.
TEST_ASSERT_EQ(7, position);
|
||||
|
||||
// Two acute accents, one from the composed a with acute accent, \u00e1,
|
||||
// and one separate.
// NOTE(review): the expected position 7 presumably refers to an offset in the
// decomposed (NFKD) form of the input rather than in the original string --
// confirm against uspoof_check().
|
||||
position = -42;
|
||||
UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
|
||||
TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status));
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(7, position);
|
||||
TEST_TEARDOWN;
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_SPOOF_DETECT*/
|
||||
|
|
|
@ -29,6 +29,8 @@ public:
|
|||
|
||||
void testAreConfusable();
|
||||
|
||||
void testInvisible();
|
||||
|
||||
// Internal function to run a single skeleton test case.
|
||||
void checkSkeleton(const USpoofChecker *sc, uint32_t flags,
|
||||
const char *input, const char *expected, int32_t lineNum);
|
||||
|
|
Loading…
Add table
Reference in a new issue