ICU-7087 fix spoof detection bugs with buffer handling while computing skeletons, and with supplementary character handling.

X-SVN-Rev: 26628
This commit is contained in:
Andy Heninger 2009-09-14 04:02:38 +00:00
parent d51a3d446c
commit 44b26a579b
6 changed files with 27862 additions and 8 deletions

View file

@ -679,12 +679,13 @@ uspoof_getSkeleton(const USpoofChecker *sc,
// Check the skeleton for NFKD, normalize it if needed.
// Unnormalized results should be very rare.
if (!unorm_isNormalized(result, resultLen, UNORM_NFKD, status)) {
normalizedLen = unorm_normalize(dest, resultLen, UNORM_NFKD, 0, NULL, 0, status);
normalizedLen = unorm_normalize(result, resultLen, UNORM_NFKD, 0, NULL, 0, status);
UChar *normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
if (normedResult == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
*status = U_ZERO_ERROR;
unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normalizedLen+1, status);
result = normedResult;
resultLen = normalizedLen;
@ -776,21 +777,21 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
goto cleanup;
}
*status = U_ZERO_ERROR;
u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars+1,
u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars,
s, length, status);
}
skelLengthInUChars = uspoof_getSkeleton(sc, type, outBuf, lengthInUChars,
skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
outBuf, USPOOF_STACK_BUFFER_SIZE, status);
if (*status == U_BUFFER_OVERFLOW_ERROR) {
*status = U_ZERO_ERROR;
outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar)));
if (outBuf == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
goto cleanup;
}
skelLengthInUChars = uspoof_getSkeleton(sc, type, outBuf, lengthInUChars,
outBuf, USPOOF_STACK_BUFFER_SIZE, status);
*status = U_ZERO_ERROR;
skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
outBuf, skelLengthInUChars+1, status);
}
u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,

View file

@ -289,7 +289,7 @@ void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesL
// This is a little like a Java intern() - any duplicates will be eliminated.
SPUString *smapString = stringPool->addString(mapString, status);
// Add the UChar -> string mapping to the appropriate table.
// Add the UChar32 -> string mapping to the appropriate table.
UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable :
uregex_start(fParseLine, 4, &status) >= 0 ? fSATable :
uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable :

View file

@ -130,7 +130,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
int32_t *low = fSpoofData->fCFUKeys;
int32_t *mid = NULL;
int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
UChar midc;
UChar32 midc;
do {
int32_t delta = (limit-low)/2;
mid = low + delta;

View file

@ -14,6 +14,11 @@
#include "itspoof.h"
#include "unicode/uspoof.h"
#include "unicode/unistr.h"
#include "unicode/regex.h"
#include "unicode/normlzr.h"
#include <stdlib.h>
#include <stdio.h>
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}
@ -79,6 +84,12 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
testInvisible();
}
break;
case 4:
name = "testConfData";
if (exec) {
testConfData();
}
break;
default: name=""; break;
}
}
@ -245,5 +256,144 @@ void IntlTestSpoof::testInvisible() {
TEST_ASSERT_EQ(7, position);
TEST_TEARDOWN;
}
static UnicodeString parseHex(const UnicodeString &in) {
    // Decode a string of space-separated hexadecimal numbers into a string
    // holding the corresponding code points, in order.
    //
    // Hex digits (0-9, A-F, a-f) are accumulated into the current code point.
    // A space ends the current code point, which is appended only if its
    // accumulated value is greater than zero. Any other character is skipped
    // without affecting the value being accumulated.
    UnicodeString decoded;
    UChar32 codePoint = 0;
    const int32_t inputLength = in.length();
    for (int32_t pos = 0; pos < inputLength; ++pos) {
        const UChar ch = in.charAt(pos);
        if (ch >= 0x30 && ch <= 0x39) {
            // Decimal digit '0'..'9'.
            codePoint = (codePoint << 4) + (ch - 0x30);
            continue;
        }
        if ((ch >= 0x41 && ch <= 0x46) || (ch >= 0x61 && ch <= 0x66)) {
            // Letter digit; the low nibble of 'A'/'a' is 1, so adding 9 gives 10..15.
            codePoint = (codePoint << 4) + (ch & 0x0f) + 9;
            continue;
        }
        if (ch == 0x20 && codePoint > 0) {
            // Space: flush the pending code point and start a new one.
            decoded.append(codePoint);
            codePoint = 0;
        }
        // Anything else is bad input and is currently ignored.
    }
    // Flush a final code point that was not followed by a space.
    if (codePoint > 0) {
        decoded.append(codePoint);
    }
    return decoded;
}
//
//  appendHexUChar - append the upper-case hexadecimal form of a code point,
//                   followed by a single space, to a UnicodeString.
//                   Used when formatting error messages.
//
//                   The output follows the number format of confusables.txt:
//                   the low four hex digits are always written, while higher
//                   digits are written only from the first non-zero one onward.
//
static void appendHexUChar(UnicodeString &dest, UChar32 c) {
    UBool emitting = FALSE;
    for (int shift = 28; shift >= 0; shift -= 4) {
        const int nibble = (c >> shift) & 0x0f;
        // Start writing at the first non-zero digit, or unconditionally once
        // the low four digits (shift 12 and below) are reached.
        if (shift <= 12 || nibble != 0) {
            emitting = TRUE;
        }
        if (emitting) {
            const int digitChar = (nibble <= 9) ? (nibble + 0x30) : (nibble - 10 + 0x41);
            dest.append((UChar)digitChar);
        }
    }
    dest.append((UChar)0x20);
}
// testConfData - Check each data item from the Unicode confusables.txt file,
// verify that it transforms correctly in a skeleton.
//
void IntlTestSpoof::testConfData() {
UErrorCode status = U_ZERO_ERROR;
const char *testDataDir = IntlTest::getSourceTestData(status);
TEST_ASSERT_SUCCESS(status);
char buffer[2000];
strcpy(buffer, testDataDir);
strcat(buffer, "confusables.txt");
FILE *f = NULL;
f = fopen(buffer, "rb");
if (f == 0) {
errln("Skipping test spoof/testConfData. File confusables.txt not accessible.");
return;
}
fseek(f, 0, SEEK_END);
int32_t fileSize = ftell(f);
char *fileBuf = new char[fileSize];
fseek(f, 0, SEEK_SET);
int32_t amt_read = fread(fileBuf, 1, fileSize, f);
TEST_ASSERT_EQ(amt_read, fileSize);
TEST_ASSERT(fileSize>0);
if (amt_read != fileSize || fileSize <=0) {
delete [] fileBuf;
return;
}
fclose(f);
UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf, fileSize));
USpoofChecker *sc = uspoof_open(&status);
TEST_ASSERT_SUCCESS(status);
// Parse lines from the confusables.txt file. Example Line:
// FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH ....
// Three fields. The hex fields can contain more than one character,
// and each character may be more than 4 digits (for supplemntals)
// This regular expression matches lines and splits the fields into capture groups.
RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status);
TEST_ASSERT_SUCCESS(status);
while (parseLine.find()) {
UnicodeString from = parseHex(parseLine.group(1, status));
if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) {
// The source character was not NFKD.
// Skip this case; the first step in obtaining a skeleton is to NFKD the input,
// so the mapping in this line of confusables.txt will never be applied.
continue;
}
UnicodeString rawExpected = parseHex(parseLine.group(2, status));
UnicodeString expected;
Normalizer::decompose(rawExpected, TRUE, 0, expected, status);
TEST_ASSERT_SUCCESS(status);
int32_t skeletonType = 0;
UnicodeString tableType = parseLine.group(3, status);
TEST_ASSERT_SUCCESS(status);
if (tableType.indexOf("SL") >= 0) {
skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
} else if (tableType.indexOf("SA") >= 0) {
skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
} else if (tableType.indexOf("ML") >= 0) {
skeletonType = 0;
} else if (tableType.indexOf("MA") >= 0) {
skeletonType = USPOOF_ANY_CASE;
}
UnicodeString actual;
uspoof_getSkeletonUnicodeString(sc, skeletonType, from, actual, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(actual == expected);
if (actual != expected) {
errln(parseLine.group(0, status));
UnicodeString line = "Actual: ";
int i = 0;
while (i < actual.length()) {
appendHexUChar(line, actual.char32At(i));
i = actual.moveIndex32(i, 1);
}
errln(line);
}
if (U_FAILURE(status)) {
break;
}
}
}
#endif // UCONFIG_NO_REGULAR_EXPRESSIONS

View file

@ -32,6 +32,8 @@ public:
void testInvisible();
void testConfData();
// Internal function to run a single skeleton test case.
void checkSkeleton(const USpoofChecker *sc, uint32_t flags,
const char *input, const char *expected, int32_t lineNum);

27701
icu4c/source/test/testdata/confusables.txt vendored Normal file

File diff suppressed because it is too large Load diff