mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 05:55:35 +00:00
ICU-22404 Strip default ignorable code points in the skeleton for confusable detection
This commit is contained in:
parent
86193b1b98
commit
a6fc915e05
4 changed files with 46 additions and 2 deletions
|
@ -721,7 +721,9 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
|
|||
for (inputIndex=0; inputIndex < normalizedLen; ) {
|
||||
UChar32 c = nfdId.char32At(inputIndex);
|
||||
inputIndex += U16_LENGTH(c);
|
||||
This->fSpoofData->confusableLookup(c, skelStr);
|
||||
if (!u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
|
||||
This->fSpoofData->confusableLookup(c, skelStr);
|
||||
}
|
||||
}
|
||||
|
||||
gNfdNormalizer->normalize(skelStr, dest, *status);
|
||||
|
|
|
@ -140,6 +140,18 @@ void IntlTestSpoof::testSpoofAPI() {
|
|||
TEST_ASSERT(UnicodeString("lllOO") == dest);
|
||||
TEST_ASSERT(&dest == &retStr);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
TEST_SETUP
|
||||
// Example from UTS #55, Section 5.1.3 https://www.unicode.org/reports/tr55/#General-Security-Profile
|
||||
// of a minimal pair with a ZWNJ in Persian.
|
||||
const UnicodeString behrooz(u"بهروز");
|
||||
const UnicodeString update(u"بهروز");
|
||||
// These strings differ only by a ZWNJ.
|
||||
TEST_ASSERT(UnicodeString(update).findAndReplace(u"\u200C", u"") == behrooz);
|
||||
int32_t checkResults = uspoof_areConfusableUnicodeString(sc, behrooz, update, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
|
||||
TEST_TEARDOWN;
|
||||
}
|
||||
|
||||
|
||||
|
@ -384,6 +396,13 @@ void IntlTestSpoof::testConfData() {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (u_hasBinaryProperty(from.char32At(0), UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
|
||||
// The source character is a default ignorable code point.
|
||||
// Skip this case; the second step in obtaining a skeleton is to remove default ignorable code points (DIs),
|
||||
// so the mapping in this line of confusables.txt will never be applied.
|
||||
continue;
|
||||
}
|
||||
|
||||
UnicodeString rawExpected = parseHex(parseLine.group(2, status));
|
||||
UnicodeString expected;
|
||||
Normalizer::decompose(rawExpected, false /*NFD*/, 0, expected, status);
|
||||
|
|
|
@ -32,6 +32,7 @@ import java.util.regex.Matcher;
|
|||
import java.util.regex.Pattern;
|
||||
|
||||
import com.ibm.icu.impl.ICUBinary;
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
import com.ibm.icu.impl.ICUBinary.Authenticate;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
|
@ -1509,7 +1510,9 @@ public class SpoofChecker {
|
|||
for (int inputIndex = 0; inputIndex < normalizedLen;) {
|
||||
int c = Character.codePointAt(nfdId, inputIndex);
|
||||
inputIndex += Character.charCount(c);
|
||||
this.fSpoofData.confusableLookup(c, skelSB);
|
||||
if (!UCharacter.hasBinaryProperty(c, UProperty.DEFAULT_IGNORABLE_CODE_POINT)) {
|
||||
this.fSpoofData.confusableLookup(c, skelSB);
|
||||
}
|
||||
}
|
||||
String skelStr = skelSB.toString();
|
||||
skelStr = nfdNormalizer.normalize(skelStr);
|
||||
|
|
|
@ -31,7 +31,10 @@ import org.junit.runners.JUnit4;
|
|||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.dev.test.TestUtil;
|
||||
import com.ibm.icu.dev.test.TestUtil.JavaVendor;
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
import com.ibm.icu.text.SpoofChecker;
|
||||
|
@ -66,6 +69,7 @@ public class SpoofCheckerTest extends TestFmwk {
|
|||
|
||||
String han_Hiragana = "\u3086\u308A \u77F3\u7530"; // Hiragana, space, Han
|
||||
|
||||
static final UnicodeSet DEFAULT_IGNORABLE_CODE_POINT = new UnicodeSet("\\p{di}");
|
||||
|
||||
/*
|
||||
* Test basic constructor.
|
||||
|
@ -376,6 +380,15 @@ public class SpoofCheckerTest extends TestFmwk {
|
|||
s = "I1l0O";
|
||||
String dest = sc.getSkeleton(SpoofChecker.ANY_CASE, s);
|
||||
assertEquals("", dest, "lllOO");
|
||||
|
||||
// Example from UTS #55, Section 5.1.3 https://www.unicode.org/reports/tr55/#General-Security-Profile,
|
||||
// of a minimal pair with a ZWNJ in Persian.
|
||||
final String behrooz = "بهروز";
|
||||
final String update = "بهروز";
|
||||
// These strings differ only by a ZWNJ.
|
||||
assertEquals("", update.replace("\u200C", ""), behrooz);
|
||||
checkResult = sc.areConfusable(behrooz, update);
|
||||
assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResult);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -728,6 +741,13 @@ public class SpoofCheckerTest extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (DEFAULT_IGNORABLE_CODE_POINT.containsSome(from)) {
|
||||
// The source character is a default ignorable code point.
|
||||
// Skip this case; the second step in obtaining a skeleton is to remove default ignorable code points (DIs),
|
||||
// so the mapping in this line of confusables.txt will never be applied.
|
||||
continue;
|
||||
}
|
||||
|
||||
String rawExpected = parseHex(parseLine.group(2));
|
||||
String expected = normalizer.normalize(rawExpected);
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue