diff --git a/icu4c/source/i18n/unicode/uspoof.h b/icu4c/source/i18n/unicode/uspoof.h index 9fcfcd3ede8..002e704d4ac 100644 --- a/icu4c/source/i18n/unicode/uspoof.h +++ b/icu4c/source/i18n/unicode/uspoof.h @@ -477,7 +477,7 @@ typedef enum USpoofChecks { */ USPOOF_CHAR_LIMIT = 64, - /** + /** * Check that an identifier does not mix numbers from different numbering systems. * For more information, see UTS 39 section 5.3. * @@ -485,6 +485,27 @@ typedef enum USpoofChecks { */ USPOOF_MIXED_NUMBERS = 128, + /** + * Check that an identifier does not have a combining character following a character in which that + * combining character would be hidden; for example 'i' followed by a U+0307 combining dot. + * + * More specifically, the following characters are forbidden from preceding a U+0307: + * + * In addition, combining characters are allowed between the above characters and U+0307 except those + * with combining class 0 or combining class "Above" (230, same class as U+0307). + * + * This list and the number of combining characters considered by this check may grow over time. + * + * @draft ICU 62 + */ + USPOOF_HIDDEN_OVERLAY = 256, + /** * Enable all spoof checks. 
* diff --git a/icu4c/source/i18n/uspoof.cpp b/icu4c/source/i18n/uspoof.cpp index 2e8b29e1bba..c66a6e1ffde 100644 --- a/icu4c/source/i18n/uspoof.cpp +++ b/icu4c/source/i18n/uspoof.cpp @@ -558,6 +558,13 @@ int32_t checkImpl(const SpoofImpl* This, const UnicodeString& id, CheckResult* c checkResult->fNumerics = numerics; // UnicodeSet::operator= } + if (0 != (This->fChecks & USPOOF_HIDDEN_OVERLAY)) { + int32_t index = This->findHiddenOverlay(id, *status); + if (index != -1) { + result |= USPOOF_HIDDEN_OVERLAY; + } + } + if (0 != (This->fChecks & USPOOF_CHAR_LIMIT)) { int32_t i; diff --git a/icu4c/source/i18n/uspoof_impl.cpp b/icu4c/source/i18n/uspoof_impl.cpp index 7c68612b0aa..2c1f088b12d 100644 --- a/icu4c/source/i18n/uspoof_impl.cpp +++ b/icu4c/source/i18n/uspoof_impl.cpp @@ -377,6 +377,43 @@ URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UEr return USPOOF_MINIMALLY_RESTRICTIVE; } +int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const { + bool sawLeadCharacter = false; + for (int32_t i=0; iconfusableLookup(cp, skelStr); + UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1)); + if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) { + return true; + } + return false; +} + // Convert a text format hex number. Utility function used by builder code. Static. @@ -532,24 +569,25 @@ uspoof_cleanupDefaultData(void) { if (gDefaultSpoofData) { // Will delete, assuming all user-level spoof checkers were closed. gDefaultSpoofData->removeReference(); - gDefaultSpoofData = NULL; + gDefaultSpoofData = nullptr; gSpoofInitDefaultOnce.reset(); } return TRUE; } static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) { - UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables", + UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables", spoofDataIsAcceptable, - NULL, // context, would receive dataVersion if supplied. 
+ nullptr, // context, would receive dataVersion if supplied. &status); if (U_FAILURE(status)) { return; } gDefaultSpoofData = new SpoofData(udm, status); if (U_FAILURE(status)) { delete gDefaultSpoofData; + gDefaultSpoofData = nullptr; return; } - if (gDefaultSpoofData == NULL) { + if (gDefaultSpoofData == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } diff --git a/icu4c/source/i18n/uspoof_impl.h b/icu4c/source/i18n/uspoof_impl.h index a6b1e73e22d..2985c7317f4 100644 --- a/icu4c/source/i18n/uspoof_impl.h +++ b/icu4c/source/i18n/uspoof_impl.h @@ -83,6 +83,9 @@ public: void getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& status) const; URestrictionLevel getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const; + int32_t findHiddenOverlay(const UnicodeString& input, UErrorCode& status) const; + bool isIllegalCombiningDotLeadCharacter(UChar32 cp) const; + /** parse a hex number. Untility used by the builders. */ static UChar32 ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status); diff --git a/icu4c/source/test/intltest/itspoof.cpp b/icu4c/source/test/intltest/itspoof.cpp index 1e2ec174c8b..13f993959d2 100644 --- a/icu4c/source/test/intltest/itspoof.cpp +++ b/icu4c/source/test/intltest/itspoof.cpp @@ -92,6 +92,7 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name TESTCASE_AUTO(testBug12815); TESTCASE_AUTO(testBug13314_MixedNumbers); TESTCASE_AUTO(testBug13328_MixedCombiningMarks); + TESTCASE_AUTO(testCombiningDot); TESTCASE_AUTO_END; } @@ -710,4 +711,45 @@ void IntlTestSpoof::testBug13328_MixedCombiningMarks() { failedChecks); } +void IntlTestSpoof::testCombiningDot() { + UErrorCode status = U_ZERO_ERROR; + LocalUSpoofCheckerPointer sc(uspoof_open(&status)); + TEST_ASSERT_SUCCESS(status); + uspoof_setChecks(sc.getAlias(), USPOOF_HIDDEN_OVERLAY, &status); + TEST_ASSERT_SUCCESS(status); + + static const struct TestCase { + bool shouldFail; + const char16_t* input; + } 
cases[] = { + {false, u"i"}, + {false, u"j"}, + {false, u"l"}, + {true, u"i\u0307"}, + {true, u"j\u0307"}, + {true, u"l\u0307"}, + {true, u"ı\u0307"}, + {true, u"ȷ\u0307"}, + {true, u"𝚤\u0307"}, + {true, u"𝑗\u0307"}, + {false, u"m\u0307"}, + {true, u"1\u0307"}, + {true, u"ij\u0307"}, + {true, u"i\u0307\u0307"}, + {true, u"abci\u0307def"}, + {false, u"i\u0301\u0307"}, // U+0301 has combining class ABOVE (230) + {true, u"i\u0320\u0307"}, // U+0320 has combining class BELOW + {true, u"i\u0320\u0321\u0307"}, // U+0321 also has combining class BELOW + {false, u"i\u0320\u0301\u0307"}, + {false, u"iz\u0307"}, + }; + + for (auto& cas : cases) { + int32_t failedChecks = uspoof_check2(sc.getAlias(), cas.input, -1, nullptr, &status); + TEST_ASSERT_SUCCESS(status); + int32_t expected = cas.shouldFail ? USPOOF_HIDDEN_OVERLAY : 0; + assertEquals(cas.input, expected, failedChecks); + } +} + #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO */ diff --git a/icu4c/source/test/intltest/itspoof.h b/icu4c/source/test/intltest/itspoof.h index f3212a19821..47cda32cf38 100644 --- a/icu4c/source/test/intltest/itspoof.h +++ b/icu4c/source/test/intltest/itspoof.h @@ -54,6 +54,8 @@ public: void testBug13328_MixedCombiningMarks(); + void testCombiningDot(); + // Internal function to run a single skeleton test case. 
void checkSkeleton(const USpoofChecker *sc, uint32_t flags, const char *input, const char *expected, int32_t lineNum); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java b/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java index 28afb1adbd4..4059b213080 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java @@ -441,6 +441,28 @@ public class SpoofChecker { */ public static final int MIXED_NUMBERS = 128; + /** + * Check that an identifier does not have a combining character following a character in which that + * combining character would be hidden; for example 'i' followed by a U+0307 combining dot. + *

+ * More specifically, the following characters are forbidden from preceding a U+0307: + *

+ * In addition, combining characters are allowed between the above characters and U+0307 except those + * with combining class 0 or combining class "Above" (230, same class as U+0307). + *

+ * This list and the number of combining characters considered by this check may grow over time. + * + * @draft ICU 62 + * @provisional This API might change or be removed in a future release. + */ + public static final int HIDDEN_OVERLAY = 256; + // Update CheckResult.toString() when a new check is added. /** @@ -1300,6 +1322,13 @@ public class SpoofChecker { } } + if (0 != (this.fChecks & HIDDEN_OVERLAY)) { + int index = findHiddenOverlay(text); + if (index != -1) { + result |= HIDDEN_OVERLAY; + } + } + if (0 != (this.fChecks & CHAR_LIMIT)) { int i; int c; @@ -1657,6 +1686,44 @@ public class SpoofChecker { return RestrictionLevel.MINIMALLY_RESTRICTIVE; } + int findHiddenOverlay(String input) { + boolean sawLeadCharacter = false; + StringBuilder sb = new StringBuilder(); + for (int i=0; i