ICU-22404 Strip default ignorable code points in the skeleton for confusable detection

This commit is contained in:
Robin Leroy 2023-08-09 16:03:23 +02:00
parent 86193b1b98
commit a6fc915e05
4 changed files with 46 additions and 2 deletions

View file

@ -721,7 +721,9 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
for (inputIndex=0; inputIndex < normalizedLen; ) {
UChar32 c = nfdId.char32At(inputIndex);
inputIndex += U16_LENGTH(c);
This->fSpoofData->confusableLookup(c, skelStr);
if (!u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
This->fSpoofData->confusableLookup(c, skelStr);
}
}
gNfdNormalizer->normalize(skelStr, dest, *status);

View file

@ -140,6 +140,18 @@ void IntlTestSpoof::testSpoofAPI() {
TEST_ASSERT(UnicodeString("lllOO") == dest);
TEST_ASSERT(&dest == &retStr);
TEST_TEARDOWN;
TEST_SETUP
// Example from UTS #55, Section 5.1.3 (https://www.unicode.org/reports/tr55/#General-Security-Profile),
// of a Persian minimal pair whose members differ only by a ZWNJ (U+200C ZERO WIDTH NON-JOINER).
const UnicodeString behrooz(u"بهروز");
const UnicodeString update(u"به‌روز");
// These strings differ only by a ZWNJ.
TEST_ASSERT(UnicodeString(update).findAndReplace(u"\u200C", u"") == behrooz);
int32_t checkResults = uspoof_areConfusableUnicodeString(sc, behrooz, update, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
TEST_TEARDOWN;
}
@ -384,6 +396,13 @@ void IntlTestSpoof::testConfData() {
continue;
}
if (u_hasBinaryProperty(from.char32At(0), UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
// The source character is a default ignorable code point.
// Skip this case; the second step in obtaining a skeleton is to remove default ignorable
// code points, so the mapping in this line of confusables.txt will never be applied.
continue;
}
UnicodeString rawExpected = parseHex(parseLine.group(2, status));
UnicodeString expected;
Normalizer::decompose(rawExpected, false /*NFD*/, 0, expected, status);

View file

@ -32,6 +32,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.ICUBinary.Authenticate;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
@ -1509,7 +1510,9 @@ public class SpoofChecker {
for (int inputIndex = 0; inputIndex < normalizedLen;) {
int c = Character.codePointAt(nfdId, inputIndex);
inputIndex += Character.charCount(c);
this.fSpoofData.confusableLookup(c, skelSB);
if (!UCharacter.hasBinaryProperty(c, UProperty.DEFAULT_IGNORABLE_CODE_POINT)) {
this.fSpoofData.confusableLookup(c, skelSB);
}
}
String skelStr = skelSB.toString();
skelStr = nfdNormalizer.normalize(skelStr);

View file

@ -31,7 +31,10 @@ import org.junit.runners.JUnit4;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.dev.test.TestUtil.JavaVendor;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.SpoofChecker;
@ -66,6 +69,7 @@ public class SpoofCheckerTest extends TestFmwk {
String han_Hiragana = "\u3086\u308A \u77F3\u7530"; // Hiragana, space, Han
// Set built from the pattern \p{di} (Default_Ignorable_Code_Point); used to skip
// confusables.txt entries whose source contains such a code point.
static final UnicodeSet DEFAULT_IGNORABLE_CODE_POINT = new UnicodeSet("\\p{di}");
/*
* Test basic constructor.
@ -376,6 +380,15 @@ public class SpoofCheckerTest extends TestFmwk {
s = "I1l0O";
String dest = sc.getSkeleton(SpoofChecker.ANY_CASE, s);
assertEquals("", dest, "lllOO");
// Example from UTS #55, Section 5.1.3 (https://www.unicode.org/reports/tr55/#General-Security-Profile),
// of a Persian minimal pair whose members differ only by a ZWNJ (U+200C ZERO WIDTH NON-JOINER).
final String behrooz = "بهروز";
final String update = "به‌روز";
// These strings differ only by a ZWNJ.
assertEquals("", update.replace("\u200C", ""), behrooz);
checkResult = sc.areConfusable(behrooz, update);
assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResult);
}
@Test
@ -728,6 +741,13 @@ public class SpoofCheckerTest extends TestFmwk {
continue;
}
if (DEFAULT_IGNORABLE_CODE_POINT.containsSome(from)) {
// The source character is a default ignorable code point.
// Skip this case; the second step in obtaining a skeleton is to remove default ignorable
// code points, so the mapping in this line of confusables.txt will never be applied.
continue;
}
String rawExpected = parseHex(parseLine.group(2));
String expected = normalizer.normalize(rawExpected);