ICU-13333 Adding combining dot spoof check.

X-SVN-Rev: 41428
This commit is contained in:
Shane Carr 2018-05-22 02:47:31 +00:00
parent 33a0fa7172
commit 0a2aeb017a
8 changed files with 222 additions and 5 deletions

View file

@ -477,7 +477,7 @@ typedef enum USpoofChecks {
*/
USPOOF_CHAR_LIMIT = 64,
/**
/**
* Check that an identifier does not mix numbers from different numbering systems.
* For more information, see UTS 39 section 5.3.
*
@ -485,6 +485,27 @@ typedef enum USpoofChecks {
*/
USPOOF_MIXED_NUMBERS = 128,
/**
* Check that an identifier does not have a combining character following a character in which that
* combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
*
* More specifically, the following characters are forbidden from preceding a U+0307:
* <ul>
* <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
* <li>Latin lowercase letter 'l'</li>
* <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
* <li>Any character whose confusable prototype ends with such a character
* (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
* </ul>
* In addition, combining characters are allowed between the above characters and U+0307 except those
* with combining class 0 or combining class "Above" (230, same class as U+0307).
*
* This list and the number of combining characters considered by this check may grow over time.
*
* @draft ICU 62
*/
USPOOF_HIDDEN_OVERLAY = 256,
/**
* Enable all spoof checks.
*

View file

@ -558,6 +558,13 @@ int32_t checkImpl(const SpoofImpl* This, const UnicodeString& id, CheckResult* c
checkResult->fNumerics = numerics; // UnicodeSet::operator=
}
if (0 != (This->fChecks & USPOOF_HIDDEN_OVERLAY)) {
int32_t index = This->findHiddenOverlay(id, *status);
if (index != -1) {
result |= USPOOF_HIDDEN_OVERLAY;
}
}
if (0 != (This->fChecks & USPOOF_CHAR_LIMIT)) {
int32_t i;

View file

@ -377,6 +377,43 @@ URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UEr
return USPOOF_MINIMALLY_RESTRICTIVE;
}
/**
 * Looks for a U+0307 COMBINING DOT ABOVE that follows a character under which the dot
 * would be hidden.
 *
 * The scan remembers whether the most recent "anchoring" character is a forbidden lead
 * character. Only characters with combining class 0 or 230 update that state; characters
 * with any other combining class are stepped over without changing it.
 *
 * @param input the string to scan
 * @param (unnamed UErrorCode&) accepted for signature compatibility; not read or written here
 * @return the UTF-16 index in input of the first offending U+0307, or -1 if there is none
 */
int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
    const int32_t length = input.length();
    bool leadIsPending = false;
    int32_t pos = 0;
    while (pos < length) {
        const UChar32 current = input.char32At(pos);
        if (current == 0x0307 && leadIsPending) {
            return pos;
        }
        // The class test below relies on U+0307 itself being in combining class 230.
        U_ASSERT(u_getCombiningClass(0x0307) == 230);
        switch (u_getCombiningClass(current)) {
        case 0:    // non-combining character: becomes the new candidate lead
        case 230:  // same class as U+0307: also re-evaluated as a candidate lead
            leadIsPending = isIllegalCombiningDotLeadCharacter(current);
            break;
        default:
            // Any other combining mark leaves the pending state untouched.
            break;
        }
        pos += U16_LENGTH(current);
    }
    return -1;
}
/**
 * Tests whether a code point is, on its own, one of the characters that must not precede
 * U+0307 COMBINING DOT ABOVE. No confusable-data lookup is performed here.
 *
 * The two dotless letters are written as numeric code points rather than as non-ASCII
 * character literals, so the comparison does not depend on the charset the compiler
 * assumes for this source file.
 *
 * @param cp the code point to test
 * @return true if cp is 'i', 'j', 'l', U+0131 (dotless i), U+0237 (dotless j), or has the
 *         Soft_Dotted binary property
 */
static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
    return cp == u'i' || cp == u'j' ||
           cp == 0x0131 /* LATIN SMALL LETTER DOTLESS I */ ||
           cp == 0x0237 /* LATIN SMALL LETTER DOTLESS J */ ||
           cp == u'l' ||
           u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
}
/**
 * Decides whether cp is a forbidden lead character for U+0307, either directly or through
 * the string that fSpoofData->confusableLookup() produces for it.
 *
 * @param cp the code point to test
 * @return true if cp itself passes the no-lookup test, or if the last code point of its
 *         looked-up string differs from cp and passes the no-lookup test
 */
bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
    if (!isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
        UnicodeString prototype;
        fSpoofData->confusableLookup(cp, prototype);
        // Step back one code point from the end to find where the final one starts.
        const int32_t lastStart = prototype.moveIndex32(prototype.length(), -1);
        const UChar32 lastCp = prototype.char32At(lastStart);
        return lastCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(lastCp);
    }
    return true;
}
// Convert a text format hex number. Utility function used by builder code. Static.
@ -532,24 +569,25 @@ uspoof_cleanupDefaultData(void) {
if (gDefaultSpoofData) {
// Will delete, assuming all user-level spoof checkers were closed.
gDefaultSpoofData->removeReference();
gDefaultSpoofData = NULL;
gDefaultSpoofData = nullptr;
gSpoofInitDefaultOnce.reset();
}
return TRUE;
}
static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
spoofDataIsAcceptable,
NULL, // context, would receive dataVersion if supplied.
nullptr, // context, would receive dataVersion if supplied.
&status);
if (U_FAILURE(status)) { return; }
gDefaultSpoofData = new SpoofData(udm, status);
if (U_FAILURE(status)) {
delete gDefaultSpoofData;
gDefaultSpoofData = nullptr;
return;
}
if (gDefaultSpoofData == NULL) {
if (gDefaultSpoofData == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}

View file

@ -83,6 +83,9 @@ public:
void getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& status) const;
URestrictionLevel getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const;
int32_t findHiddenOverlay(const UnicodeString& input, UErrorCode& status) const;
bool isIllegalCombiningDotLeadCharacter(UChar32 cp) const;
/** parse a hex number. Utility used by the builders. */
static UChar32 ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status);

View file

@ -92,6 +92,7 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
TESTCASE_AUTO(testBug12815);
TESTCASE_AUTO(testBug13314_MixedNumbers);
TESTCASE_AUTO(testBug13328_MixedCombiningMarks);
TESTCASE_AUTO(testCombiningDot);
TESTCASE_AUTO_END;
}
@ -710,4 +711,45 @@ void IntlTestSpoof::testBug13328_MixedCombiningMarks() {
failedChecks);
}
void IntlTestSpoof::testCombiningDot() {
UErrorCode status = U_ZERO_ERROR;
LocalUSpoofCheckerPointer sc(uspoof_open(&status));
TEST_ASSERT_SUCCESS(status);
uspoof_setChecks(sc.getAlias(), USPOOF_HIDDEN_OVERLAY, &status);
TEST_ASSERT_SUCCESS(status);
static const struct TestCase {
bool shouldFail;
const char16_t* input;
} cases[] = {
{false, u"i"},
{false, u"j"},
{false, u"l"},
{true, u"i\u0307"},
{true, u"j\u0307"},
{true, u"l\u0307"},
{true, u"ı\u0307"},
{true, u"ȷ\u0307"},
{true, u"𝚤\u0307"},
{true, u"𝑗\u0307"},
{false, u"m\u0307"},
{true, u"1\u0307"},
{true, u"ij\u0307"},
{true, u"i\u0307\u0307"},
{true, u"abci\u0307def"},
{false, u"i\u0301\u0307"}, // U+0301 has combining class ABOVE (230)
{true, u"i\u0320\u0307"}, // U+0320 has combining class BELOW
{true, u"i\u0320\u0321\u0307"}, // U+0321 also has combining class BELOW
{false, u"i\u0320\u0301\u0307"},
{false, u"iz\u0307"},
};
for (auto& cas : cases) {
int32_t failedChecks = uspoof_check2(sc.getAlias(), cas.input, -1, nullptr, &status);
TEST_ASSERT_SUCCESS(status);
int32_t expected = cas.shouldFail ? USPOOF_HIDDEN_OVERLAY : 0;
assertEquals(cas.input, expected, failedChecks);
}
}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO */

View file

@ -54,6 +54,8 @@ public:
void testBug13328_MixedCombiningMarks();
void testCombiningDot();
// Internal function to run a single skeleton test case.
void checkSkeleton(const USpoofChecker *sc, uint32_t flags,
const char *input, const char *expected, int32_t lineNum);

View file

@ -441,6 +441,28 @@ public class SpoofChecker {
*/
public static final int MIXED_NUMBERS = 128;
/**
* Check that an identifier does not have a combining character following a character in which that
* combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
* <p>
* More specifically, the following characters are forbidden from preceding a U+0307:
* <ul>
* <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
* <li>Latin lowercase letter 'l'</li>
* <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
* <li>Any character whose confusable prototype ends with such a character
* (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
* </ul>
* In addition, combining characters are allowed between the above characters and U+0307 except those
* with combining class 0 or combining class "Above" (230, same class as U+0307).
* <p>
* This list and the number of combining characters considered by this check may grow over time.
*
* @draft ICU 62
* @provisional This API might change or be removed in a future release.
*/
public static final int HIDDEN_OVERLAY = 256;
// Update CheckResult.toString() when a new check is added.
/**
@ -1300,6 +1322,13 @@ public class SpoofChecker {
}
}
if (0 != (this.fChecks & HIDDEN_OVERLAY)) {
int index = findHiddenOverlay(text);
if (index != -1) {
result |= HIDDEN_OVERLAY;
}
}
if (0 != (this.fChecks & CHAR_LIMIT)) {
int i;
int c;
@ -1657,6 +1686,44 @@ public class SpoofChecker {
return RestrictionLevel.MINIMALLY_RESTRICTIVE;
}
/**
 * Looks for a U+0307 COMBINING DOT ABOVE that follows a character under which the dot
 * would be hidden.
 *
 * Only characters with combining class 0 or 230 update the "forbidden lead seen" state;
 * characters with any other combining class are stepped over without changing it.
 *
 * @param input the string to scan
 * @return the char index in input of the first offending U+0307, or -1 if there is none
 */
int findHiddenOverlay(String input) {
    final int length = input.length();
    // Scratch buffer handed to the lookup helper on every call so it is allocated once.
    final StringBuilder scratch = new StringBuilder();
    boolean leadIsPending = false;
    int pos = 0;
    while (pos < length) {
        final int current = input.codePointAt(pos);
        if (current == 0x0307 && leadIsPending) {
            return pos;
        }
        // The class test below relies on U+0307 itself being in combining class 230.
        assert UCharacter.getCombiningClass(0x0307) == 230;
        switch (UCharacter.getCombiningClass(current)) {
        case 0:    // non-combining character: becomes the new candidate lead
        case 230:  // same class as U+0307: also re-evaluated as a candidate lead
            leadIsPending = isIllegalCombiningDotLeadCharacter(current, scratch);
            break;
        default:
            // Any other combining mark leaves the pending state untouched.
            break;
        }
        pos += UCharacter.charCount(current);
    }
    return -1;
}
/**
 * Tests whether a code point is, on its own, one of the characters that must not precede
 * U+0307 COMBINING DOT ABOVE. No confusable-data lookup is performed here.
 *
 * The two dotless letters are written as numeric code points rather than as non-ASCII
 * character literals, so the comparison does not depend on the encoding the compiler
 * assumes for this source file.
 *
 * @param cp the code point to test
 * @return true if cp is 'i', 'j', 'l', U+0131 (dotless i), U+0237 (dotless j), or has the
 *         Soft_Dotted binary property
 */
boolean isIllegalCombiningDotLeadCharacterNoLookup(int cp) {
    return cp == 'i' || cp == 'j' ||
            cp == 0x0131 /* LATIN SMALL LETTER DOTLESS I */ ||
            cp == 0x0237 /* LATIN SMALL LETTER DOTLESS J */ ||
            cp == 'l' ||
            UCharacter.hasBinaryProperty(cp, UProperty.SOFT_DOTTED);
}
/**
 * Decides whether cp is a forbidden lead character for U+0307, either directly or through
 * the string that fSpoofData.confusableLookup() produces for it.
 *
 * @param cp the code point to test
 * @param sb scratch buffer; it is cleared and then filled by the lookup, so any previous
 *           contents are lost
 * @return true if cp itself passes the no-lookup test, or if the last code point of its
 *         looked-up string differs from cp and passes the no-lookup test
 */
boolean isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb) {
    if (!isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
        sb.setLength(0);
        fSpoofData.confusableLookup(cp, sb);
        final int lastCp = UCharacter.codePointBefore(sb, sb.length());
        return lastCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(lastCp);
    }
    return true;
}
// Data Members
private int fChecks; // Bit vector of checks to perform.
private SpoofData fSpoofData;

View file

@ -855,4 +855,41 @@ public class SpoofCheckerTest extends TestFmwk {
SpoofChecker.RESTRICTION_LEVEL,
checkResult.checks);
}
/**
 * Exercises SpoofChecker.HIDDEN_OVERLAY in isolation: each table row is run through
 * failsChecks() and the resulting check mask must be exactly HIDDEN_OVERLAY (row expected
 * to fail) or 0 (row expected to pass).
 */
@Test
public void testCombiningDot() {
    // Enable only the hidden-overlay check so no other check bit can appear in the result.
    SpoofChecker checker = new SpoofChecker.Builder().setChecks(SpoofChecker.HIDDEN_OVERLAY).build();
    // Each row: { expected-to-fail flag, input string }.
    Object[][] expectations = new Object[][] {
            // Lead characters on their own are fine.
            {false, "i"},
            {false, "j"},
            {false, "l"},
            // Lead character directly followed by U+0307.
            {true, "i\u0307"},
            {true, "j\u0307"},
            {true, "l\u0307"},
            {true, "ı\u0307"},
            {true, "ȷ\u0307"},
            {true, "𝚤\u0307"},
            {true, "𝑗\u0307"},
            {false, "m\u0307"},
            {true, "1\u0307"},
            {true, "ij\u0307"},
            {true, "i\u0307\u0307"},
            {true, "abci\u0307def"},
            // Intervening combining marks.
            {false, "i\u0301\u0307"}, // U+0301 has combining class ABOVE (230)
            {true, "i\u0320\u0307"}, // U+0320 has combining class BELOW
            {true, "i\u0320\u0321\u0307"}, // U+0321 also has combining class BELOW
            {false, "i\u0320\u0301\u0307"},
            {false, "iz\u0307"},
    };

    for (Object[] row : expectations) {
        String input = (String) row[1];
        boolean shouldFail = (Boolean) row[0];
        CheckResult outcome = new CheckResult();
        checker.failsChecks(input, outcome);
        int expected = 0;
        if (shouldFail) {
            expected = SpoofChecker.HIDDEN_OVERLAY;
        }
        assertEquals(input, expected, outcome.checks);
    }
}
}