mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-13333 Adding combining dot spoof check.
X-SVN-Rev: 41428
This commit is contained in:
parent
33a0fa7172
commit
0a2aeb017a
8 changed files with 222 additions and 5 deletions
|
@ -477,7 +477,7 @@ typedef enum USpoofChecks {
|
|||
*/
|
||||
USPOOF_CHAR_LIMIT = 64,
|
||||
|
||||
/**
|
||||
/**
|
||||
* Check that an identifier does not mix numbers from different numbering systems.
|
||||
* For more information, see UTS 39 section 5.3.
|
||||
*
|
||||
|
@ -485,6 +485,27 @@ typedef enum USpoofChecks {
|
|||
*/
|
||||
USPOOF_MIXED_NUMBERS = 128,
|
||||
|
||||
/**
|
||||
* Check that an identifier does not have a combining character following a character in which that
|
||||
* combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
|
||||
*
|
||||
* More specifically, the following characters are forbidden from preceding a U+0307:
|
||||
* <ul>
|
||||
* <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
|
||||
* <li>Latin lowercase letter 'l'</li>
|
||||
* <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
|
||||
* <li>Any character whose confusable prototype ends with such a character
|
||||
* (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
|
||||
* </ul>
|
||||
* In addition, combining characters are allowed between the above characters and U+0307 except those
|
||||
* with combining class 0 or combining class "Above" (230, same class as U+0307).
|
||||
*
|
||||
* This list and the number of combining characters considered by this check may grow over time.
|
||||
*
|
||||
* @draft ICU 62
|
||||
*/
|
||||
USPOOF_HIDDEN_OVERLAY = 256,
|
||||
|
||||
/**
|
||||
* Enable all spoof checks.
|
||||
*
|
||||
|
|
|
@ -558,6 +558,13 @@ int32_t checkImpl(const SpoofImpl* This, const UnicodeString& id, CheckResult* c
|
|||
checkResult->fNumerics = numerics; // UnicodeSet::operator=
|
||||
}
|
||||
|
||||
if (0 != (This->fChecks & USPOOF_HIDDEN_OVERLAY)) {
|
||||
int32_t index = This->findHiddenOverlay(id, *status);
|
||||
if (index != -1) {
|
||||
result |= USPOOF_HIDDEN_OVERLAY;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (0 != (This->fChecks & USPOOF_CHAR_LIMIT)) {
|
||||
int32_t i;
|
||||
|
|
|
@ -377,6 +377,43 @@ URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UEr
|
|||
return USPOOF_MINIMALLY_RESTRICTIVE;
|
||||
}
|
||||
|
||||
int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
|
||||
bool sawLeadCharacter = false;
|
||||
for (int32_t i=0; i<input.length();) {
|
||||
UChar32 cp = input.char32At(i);
|
||||
if (sawLeadCharacter && cp == 0x0307) {
|
||||
return i;
|
||||
}
|
||||
uint8_t combiningClass = u_getCombiningClass(cp);
|
||||
// Skip over characters except for those with combining class 0 (non-combining characters) or with
|
||||
// combining class 230 (same class as U+0307)
|
||||
U_ASSERT(u_getCombiningClass(0x0307) == 230);
|
||||
if (combiningClass == 0 || combiningClass == 230) {
|
||||
sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);
|
||||
}
|
||||
i += U16_LENGTH(cp);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
 * Direct (no confusable-table lookup) test for whether a code point must not be
 * followed by U+0307: 'i', 'j', 'l', dotless 'ı' and 'ȷ', or any code point with the
 * Soft_Dotted property.
 */
static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
    switch (cp) {
    case u'i':
    case u'j':
    case u'ı':
    case u'ȷ':
    case u'l':
        return true;
    default:
        return u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
    }
}
|
||||
|
||||
/**
 * Decide whether a code point must not be followed by U+0307.
 *
 * A code point qualifies either directly, or because the last code point of its
 * confusable lookup result differs from it and qualifies directly.
 *
 * @param cp  The code point to test.
 * @return true if cp is a forbidden lead character for a combining dot.
 */
bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
    bool illegal = isIllegalCombiningDotLeadCharacterNoLookup(cp);
    if (!illegal) {
        UnicodeString prototype;
        fSpoofData->confusableLookup(cp, prototype);

        // Step back one code point from the end to find where the last code point starts.
        const int32_t lastStart = prototype.moveIndex32(prototype.length(), -1);
        const UChar32 lastCp = prototype.char32At(lastStart);

        // lastCp == cp was already tested by the direct check above.
        illegal = (lastCp != cp) && isIllegalCombiningDotLeadCharacterNoLookup(lastCp);
    }
    return illegal;
}
|
||||
|
||||
|
||||
|
||||
// Convert a text format hex number. Utility function used by builder code. Static.
|
||||
|
@ -532,24 +569,25 @@ uspoof_cleanupDefaultData(void) {
|
|||
if (gDefaultSpoofData) {
|
||||
// Will delete, assuming all user-level spoof checkers were closed.
|
||||
gDefaultSpoofData->removeReference();
|
||||
gDefaultSpoofData = NULL;
|
||||
gDefaultSpoofData = nullptr;
|
||||
gSpoofInitDefaultOnce.reset();
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
|
||||
UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
|
||||
UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
|
||||
spoofDataIsAcceptable,
|
||||
NULL, // context, would receive dataVersion if supplied.
|
||||
nullptr, // context, would receive dataVersion if supplied.
|
||||
&status);
|
||||
if (U_FAILURE(status)) { return; }
|
||||
gDefaultSpoofData = new SpoofData(udm, status);
|
||||
if (U_FAILURE(status)) {
|
||||
delete gDefaultSpoofData;
|
||||
gDefaultSpoofData = nullptr;
|
||||
return;
|
||||
}
|
||||
if (gDefaultSpoofData == NULL) {
|
||||
if (gDefaultSpoofData == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -83,6 +83,9 @@ public:
|
|||
void getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& status) const;
|
||||
URestrictionLevel getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const;
|
||||
|
||||
int32_t findHiddenOverlay(const UnicodeString& input, UErrorCode& status) const;
|
||||
bool isIllegalCombiningDotLeadCharacter(UChar32 cp) const;
|
||||
|
||||
/** Parse a hex number. Utility used by the builders. */
|
||||
static UChar32 ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status);
|
||||
|
||||
|
|
|
@ -92,6 +92,7 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
|
|||
TESTCASE_AUTO(testBug12815);
|
||||
TESTCASE_AUTO(testBug13314_MixedNumbers);
|
||||
TESTCASE_AUTO(testBug13328_MixedCombiningMarks);
|
||||
TESTCASE_AUTO(testCombiningDot);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
@ -710,4 +711,45 @@ void IntlTestSpoof::testBug13328_MixedCombiningMarks() {
|
|||
failedChecks);
|
||||
}
|
||||
|
||||
void IntlTestSpoof::testCombiningDot() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
LocalUSpoofCheckerPointer sc(uspoof_open(&status));
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
uspoof_setChecks(sc.getAlias(), USPOOF_HIDDEN_OVERLAY, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
static const struct TestCase {
|
||||
bool shouldFail;
|
||||
const char16_t* input;
|
||||
} cases[] = {
|
||||
{false, u"i"},
|
||||
{false, u"j"},
|
||||
{false, u"l"},
|
||||
{true, u"i\u0307"},
|
||||
{true, u"j\u0307"},
|
||||
{true, u"l\u0307"},
|
||||
{true, u"ı\u0307"},
|
||||
{true, u"ȷ\u0307"},
|
||||
{true, u"𝚤\u0307"},
|
||||
{true, u"𝑗\u0307"},
|
||||
{false, u"m\u0307"},
|
||||
{true, u"1\u0307"},
|
||||
{true, u"ij\u0307"},
|
||||
{true, u"i\u0307\u0307"},
|
||||
{true, u"abci\u0307def"},
|
||||
{false, u"i\u0301\u0307"}, // U+0301 has combining class ABOVE (230)
|
||||
{true, u"i\u0320\u0307"}, // U+0320 has combining class BELOW
|
||||
{true, u"i\u0320\u0321\u0307"}, // U+0321 also has combining class BELOW
|
||||
{false, u"i\u0320\u0301\u0307"},
|
||||
{false, u"iz\u0307"},
|
||||
};
|
||||
|
||||
for (auto& cas : cases) {
|
||||
int32_t failedChecks = uspoof_check2(sc.getAlias(), cas.input, -1, nullptr, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
int32_t expected = cas.shouldFail ? USPOOF_HIDDEN_OVERLAY : 0;
|
||||
assertEquals(cas.input, expected, failedChecks);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO */
|
||||
|
|
|
@ -54,6 +54,8 @@ public:
|
|||
|
||||
void testBug13328_MixedCombiningMarks();
|
||||
|
||||
void testCombiningDot();
|
||||
|
||||
// Internal function to run a single skeleton test case.
|
||||
void checkSkeleton(const USpoofChecker *sc, uint32_t flags,
|
||||
const char *input, const char *expected, int32_t lineNum);
|
||||
|
|
|
@ -441,6 +441,28 @@ public class SpoofChecker {
|
|||
*/
|
||||
public static final int MIXED_NUMBERS = 128;
|
||||
|
||||
/**
|
||||
* Check that an identifier does not have a combining character following a character in which that
|
||||
* combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
|
||||
* <p>
|
||||
* More specifically, the following characters are forbidden from preceding a U+0307:
|
||||
* <ul>
|
||||
* <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
|
||||
* <li>Latin lowercase letter 'l'</li>
|
||||
* <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
|
||||
* <li>Any character whose confusable prototype ends with such a character
|
||||
* (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
|
||||
* </ul>
|
||||
* In addition, combining characters are allowed between the above characters and U+0307 except those
|
||||
* with combining class 0 or combining class "Above" (230, same class as U+0307).
|
||||
* <p>
|
||||
* This list and the number of combining characters considered by this check may grow over time.
|
||||
*
|
||||
* @draft ICU 62
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static final int HIDDEN_OVERLAY = 256;
|
||||
|
||||
// Update CheckResult.toString() when a new check is added.
|
||||
|
||||
/**
|
||||
|
@ -1300,6 +1322,13 @@ public class SpoofChecker {
|
|||
}
|
||||
}
|
||||
|
||||
if (0 != (this.fChecks & HIDDEN_OVERLAY)) {
|
||||
int index = findHiddenOverlay(text);
|
||||
if (index != -1) {
|
||||
result |= HIDDEN_OVERLAY;
|
||||
}
|
||||
}
|
||||
|
||||
if (0 != (this.fChecks & CHAR_LIMIT)) {
|
||||
int i;
|
||||
int c;
|
||||
|
@ -1657,6 +1686,44 @@ public class SpoofChecker {
|
|||
return RestrictionLevel.MINIMALLY_RESTRICTIVE;
|
||||
}
|
||||
|
||||
int findHiddenOverlay(String input) {
|
||||
boolean sawLeadCharacter = false;
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i=0; i<input.length();) {
|
||||
int cp = input.codePointAt(i);
|
||||
if (sawLeadCharacter && cp == 0x0307) {
|
||||
return i;
|
||||
}
|
||||
int combiningClass = UCharacter.getCombiningClass(cp);
|
||||
// Skip over characters except for those with combining class 0 (non-combining characters) or with
|
||||
// combining class 230 (same class as U+0307)
|
||||
assert UCharacter.getCombiningClass(0x0307) == 230;
|
||||
if (combiningClass == 0 || combiningClass == 230) {
|
||||
sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp, sb);
|
||||
}
|
||||
i += UCharacter.charCount(cp);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
boolean isIllegalCombiningDotLeadCharacterNoLookup(int cp) {
|
||||
return cp == 'i' || cp == 'j' || cp == 'ı' || cp == 'ȷ' || cp == 'l' ||
|
||||
UCharacter.hasBinaryProperty(cp, UProperty.SOFT_DOTTED);
|
||||
}
|
||||
|
||||
boolean isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb) {
|
||||
if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
|
||||
return true;
|
||||
}
|
||||
sb.setLength(0);
|
||||
fSpoofData.confusableLookup(cp, sb);
|
||||
int finalCp = UCharacter.codePointBefore(sb, sb.length());
|
||||
if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Data Members
|
||||
private int fChecks; // Bit vector of checks to perform.
|
||||
private SpoofData fSpoofData;
|
||||
|
|
|
@ -855,4 +855,41 @@ public class SpoofCheckerTest extends TestFmwk {
|
|||
SpoofChecker.RESTRICTION_LEVEL,
|
||||
checkResult.checks);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCombiningDot() {
|
||||
SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.HIDDEN_OVERLAY).build();
|
||||
|
||||
Object[][] cases = new Object[][] {
|
||||
{false, "i"},
|
||||
{false, "j"},
|
||||
{false, "l"},
|
||||
{true, "i\u0307"},
|
||||
{true, "j\u0307"},
|
||||
{true, "l\u0307"},
|
||||
{true, "ı\u0307"},
|
||||
{true, "ȷ\u0307"},
|
||||
{true, "𝚤\u0307"},
|
||||
{true, "𝑗\u0307"},
|
||||
{false, "m\u0307"},
|
||||
{true, "1\u0307"},
|
||||
{true, "ij\u0307"},
|
||||
{true, "i\u0307\u0307"},
|
||||
{true, "abci\u0307def"},
|
||||
{false, "i\u0301\u0307"}, // U+0301 has combining class ABOVE (230)
|
||||
{true, "i\u0320\u0307"}, // U+0320 has combining class BELOW
|
||||
{true, "i\u0320\u0321\u0307"}, // U+0321 also has combining class BELOW
|
||||
{false, "i\u0320\u0301\u0307"},
|
||||
{false, "iz\u0307"},
|
||||
};
|
||||
|
||||
for (Object[] cas : cases) {
|
||||
boolean shouldFail = (Boolean) cas[0];
|
||||
String input = (String) cas[1];
|
||||
CheckResult result = new CheckResult();
|
||||
sc.failsChecks(input, result);
|
||||
int expected = shouldFail ? SpoofChecker.HIDDEN_OVERLAY : 0;
|
||||
assertEquals(input, expected, result.checks);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue