mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-7087 fix spoof detection bugs with buffer handling while computing skeletons, and with supplemental character handling.
X-SVN-Rev: 26628
This commit is contained in:
parent
d51a3d446c
commit
44b26a579b
6 changed files with 27862 additions and 8 deletions
|
@ -679,12 +679,13 @@ uspoof_getSkeleton(const USpoofChecker *sc,
|
|||
// Check the skeleton for NFKD, normalize it if needed.
|
||||
// Unnormalized results should be very rare.
|
||||
if (!unorm_isNormalized(result, resultLen, UNORM_NFKD, status)) {
|
||||
normalizedLen = unorm_normalize(dest, resultLen, UNORM_NFKD, 0, NULL, 0, status);
|
||||
normalizedLen = unorm_normalize(result, resultLen, UNORM_NFKD, 0, NULL, 0, status);
|
||||
UChar *normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
|
||||
if (normedResult == NULL) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
*status = U_ZERO_ERROR;
|
||||
unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normalizedLen+1, status);
|
||||
result = normedResult;
|
||||
resultLen = normalizedLen;
|
||||
|
@ -776,21 +777,21 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
|
|||
goto cleanup;
|
||||
}
|
||||
*status = U_ZERO_ERROR;
|
||||
u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars+1,
|
||||
u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars,
|
||||
s, length, status);
|
||||
}
|
||||
|
||||
skelLengthInUChars = uspoof_getSkeleton(sc, type, outBuf, lengthInUChars,
|
||||
skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
|
||||
outBuf, USPOOF_STACK_BUFFER_SIZE, status);
|
||||
if (*status == U_BUFFER_OVERFLOW_ERROR) {
|
||||
*status = U_ZERO_ERROR;
|
||||
outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar)));
|
||||
if (outBuf == NULL) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
skelLengthInUChars = uspoof_getSkeleton(sc, type, outBuf, lengthInUChars,
|
||||
outBuf, USPOOF_STACK_BUFFER_SIZE, status);
|
||||
*status = U_ZERO_ERROR;
|
||||
skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
|
||||
outBuf, skelLengthInUChars+1, status);
|
||||
}
|
||||
|
||||
u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,
|
||||
|
|
|
@ -289,7 +289,7 @@ void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesL
|
|||
// This is a little like a Java intern() - any duplicates will be eliminated.
|
||||
SPUString *smapString = stringPool->addString(mapString, status);
|
||||
|
||||
// Add the UChar -> string mapping to the appropriate table.
|
||||
// Add the UChar32 -> string mapping to the appropriate table.
|
||||
UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable :
|
||||
uregex_start(fParseLine, 4, &status) >= 0 ? fSATable :
|
||||
uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable :
|
||||
|
|
|
@ -130,7 +130,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
|
|||
int32_t *low = fSpoofData->fCFUKeys;
|
||||
int32_t *mid = NULL;
|
||||
int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
|
||||
UChar midc;
|
||||
UChar32 midc;
|
||||
do {
|
||||
int32_t delta = (limit-low)/2;
|
||||
mid = low + delta;
|
||||
|
|
|
@ -14,6 +14,11 @@
|
|||
|
||||
#include "itspoof.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/regex.h"
|
||||
#include "unicode/normlzr.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// TEST_ASSERT_SUCCESS(status)
//   When U_FAILURE(status) is true, reports the failure through errcheckln(),
//   including the source file, the line number, and the name of the error code
//   as returned by u_errorName(). Does nothing when status is not a failure.
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
|
||||
errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}
|
||||
|
@ -79,6 +84,12 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
|
|||
testInvisible();
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
name = "testConfData";
|
||||
if (exec) {
|
||||
testConfData();
|
||||
}
|
||||
break;
|
||||
default: name=""; break;
|
||||
}
|
||||
}
|
||||
|
@ -245,5 +256,144 @@ void IntlTestSpoof::testInvisible() {
|
|||
TEST_ASSERT_EQ(7, position);
|
||||
TEST_TEARDOWN;
|
||||
}
|
||||
|
||||
|
||||
//
//  parseHex - decode a list of hexadecimal numbers separated by spaces
//             (for example "0041 1D400") into a UnicodeString containing
//             the corresponding code points.
//
//             Characters that are neither hex digits nor a space are ignored.
//             A value of zero is never appended to the result.
//
static UnicodeString parseHex(const UnicodeString &in) {
    UnicodeString decoded;
    UChar32 codePoint = 0;          // Value accumulated for the number being read.
    const int32_t inputLength = in.length();

    for (int32_t pos = 0; pos < inputLength; ++pos) {
        const UChar ch = in.charAt(pos);

        if (ch >= 0x30 && ch <= 0x39) {
            // '0' .. '9'
            codePoint = (codePoint << 4) + (ch - 0x30);
            continue;
        }
        if ((ch >= 0x41 && ch <= 0x46) || (ch >= 0x61 && ch <= 0x66)) {
            // 'A' .. 'F' or 'a' .. 'f': the low nibble is 1..6, so adding 9 gives 10..15.
            codePoint = (codePoint << 4) + (ch & 0x0f) + 9;
            continue;
        }
        if (ch == 0x20 && codePoint > 0) {
            // A space terminates the current number.
            decoded.append(codePoint);
            codePoint = 0;
        }
        // Any other character is bad input; it is silently skipped.
    }

    // Flush a trailing number that was not followed by a space.
    if (codePoint > 0) {
        decoded.append(codePoint);
    }
    return decoded;
}
|
||||
|
||||
|
||||
//
|
||||
// Append the hex form of a UChar32 to a UnicodeString.
|
||||
// Used in formatting error messages.
|
||||
// Match the formatting of numbers in confusables.txt
|
||||
// Minimum of 4 digits, no leading zeroes for positions 5 and up.
|
||||
//
|
||||
static void appendHexUChar(UnicodeString &dest, UChar32 c) {
|
||||
UBool doZeroes = FALSE;
|
||||
for (int bitNum=28; bitNum>=0; bitNum-=4) {
|
||||
if (bitNum <= 12) {
|
||||
doZeroes = TRUE;
|
||||
}
|
||||
int hexDigit = (c>>bitNum) & 0x0f;
|
||||
if (hexDigit != 0 || doZeroes) {
|
||||
doZeroes = TRUE;
|
||||
dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41));
|
||||
}
|
||||
}
|
||||
dest.append((UChar)0x20);
|
||||
}
|
||||
|
||||
// testConfData - Check each data item from the Unicode confusables.txt file,
|
||||
// verify that it transforms correctly in a skeleton.
|
||||
//
|
||||
void IntlTestSpoof::testConfData() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
const char *testDataDir = IntlTest::getSourceTestData(status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
char buffer[2000];
|
||||
strcpy(buffer, testDataDir);
|
||||
strcat(buffer, "confusables.txt");
|
||||
|
||||
FILE *f = NULL;
|
||||
f = fopen(buffer, "rb");
|
||||
if (f == 0) {
|
||||
errln("Skipping test spoof/testConfData. File confusables.txt not accessible.");
|
||||
return;
|
||||
}
|
||||
fseek(f, 0, SEEK_END);
|
||||
int32_t fileSize = ftell(f);
|
||||
char *fileBuf = new char[fileSize];
|
||||
fseek(f, 0, SEEK_SET);
|
||||
int32_t amt_read = fread(fileBuf, 1, fileSize, f);
|
||||
TEST_ASSERT_EQ(amt_read, fileSize);
|
||||
TEST_ASSERT(fileSize>0);
|
||||
if (amt_read != fileSize || fileSize <=0) {
|
||||
delete [] fileBuf;
|
||||
return;
|
||||
}
|
||||
fclose(f);
|
||||
UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf, fileSize));
|
||||
|
||||
USpoofChecker *sc = uspoof_open(&status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
// Parse lines from the confusables.txt file. Example Line:
|
||||
// FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH ....
|
||||
// Three fields. The hex fields can contain more than one character,
|
||||
// and each character may be more than 4 digits (for supplemntals)
|
||||
// This regular expression matches lines and splits the fields into capture groups.
|
||||
RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
while (parseLine.find()) {
|
||||
UnicodeString from = parseHex(parseLine.group(1, status));
|
||||
if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) {
|
||||
// The source character was not NFKD.
|
||||
// Skip this case; the first step in obtaining a skeleton is to NFKD the input,
|
||||
// so the mapping in this line of confusables.txt will never be applied.
|
||||
continue;
|
||||
}
|
||||
|
||||
UnicodeString rawExpected = parseHex(parseLine.group(2, status));
|
||||
UnicodeString expected;
|
||||
Normalizer::decompose(rawExpected, TRUE, 0, expected, status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
int32_t skeletonType = 0;
|
||||
UnicodeString tableType = parseLine.group(3, status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
if (tableType.indexOf("SL") >= 0) {
|
||||
skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
|
||||
} else if (tableType.indexOf("SA") >= 0) {
|
||||
skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
|
||||
} else if (tableType.indexOf("ML") >= 0) {
|
||||
skeletonType = 0;
|
||||
} else if (tableType.indexOf("MA") >= 0) {
|
||||
skeletonType = USPOOF_ANY_CASE;
|
||||
}
|
||||
|
||||
UnicodeString actual;
|
||||
uspoof_getSkeletonUnicodeString(sc, skeletonType, from, actual, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(actual == expected);
|
||||
if (actual != expected) {
|
||||
errln(parseLine.group(0, status));
|
||||
UnicodeString line = "Actual: ";
|
||||
int i = 0;
|
||||
while (i < actual.length()) {
|
||||
appendHexUChar(line, actual.char32At(i));
|
||||
i = actual.moveIndex32(i, 1);
|
||||
}
|
||||
errln(line);
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#endif // UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
|
|
|
@ -32,6 +32,8 @@ public:
|
|||
|
||||
void testInvisible();
|
||||
|
||||
void testConfData();
|
||||
|
||||
// Internal function to run a single skeleton test case.
|
||||
void checkSkeleton(const USpoofChecker *sc, uint32_t flags,
|
||||
const char *input, const char *expected, int32_t lineNum);
|
||||
|
|
27701
icu4c/source/test/testdata/confusables.txt
vendored
Normal file
27701
icu4c/source/test/testdata/confusables.txt
vendored
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue