ICU-22332 bidiSkeleton and LTR- and RTL-confusabilities

This commit is contained in:
Robin Leroy 2023-08-11 17:19:10 +02:00
parent fb0f36203a
commit f79fe9347a
8 changed files with 708 additions and 28 deletions
icu4c/source
icu4j/main
classes/core/src/com/ibm/icu/text
tests/core/src/com/ibm/icu/dev/test/text

View file

@ -19,6 +19,7 @@
#ifndef USPOOF_H
#define USPOOF_H
#include "unicode/ubidi.h"
#include "unicode/utypes.h"
#include "unicode/uset.h"
#include "unicode/parseerr.h"
@ -83,6 +84,25 @@
* the instance should be created once (e.g., upon application startup), and the efficient
* {@link uspoof_areConfusable} method can be used at runtime.
*
* If the paragraph direction used to display the strings is known, the bidi function should be used instead:
*
* \code{.c}
* UErrorCode status = U_ZERO_ERROR;
* // These strings look identical when rendered in a left-to-right context.
* // They look distinct in a right-to-left context.
* UChar* str1 = (UChar*) u"A1\u05D0"; // A1א
* UChar* str2 = (UChar*) u"A\u05D01"; // Aא1
*
* USpoofChecker* sc = uspoof_open(&status);
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
*
* int32_t bitmask = uspoof_areBidiConfusable(sc, UBIDI_LTR, str1, -1, str2, -1, &status);
* UBool result = bitmask != 0;
* // areBidiConfusable: 1 (status: U_ZERO_ERROR)
* printf("areBidiConfusable: %d (status: %s)\n", result, u_errorName(status));
* uspoof_close(sc);
* \endcode
*
* <p>
* The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call
* {@link uspoof_close} when the object goes out of scope:
@ -519,7 +539,7 @@ typedef enum USpoofChecks {
/**
* Constants from UAX #39 for use in {@link uspoof_setRestrictionLevel}, and
* Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and
* for returned identifier restriction levels in check results.
*
* @stable ICU 51
@ -633,8 +653,8 @@ uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLeng
/**
* Open a Spoof Checker from the source form of the spoof data.
* The input corresponds to the Unicode data file confusables.txt
* as described in Unicode UAX #39. The syntax of the source data
* is as described in UAX #39 for this file, and the content of
* as described in Unicode Technical Standard #39. The syntax of the source data
* is as described in UTS #39 for this file, and the content of
* this file is acceptable input.
*
* The character encoding of the (char *) input text is UTF-8.
@ -1111,7 +1131,7 @@ uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *
/**
* Check the whether two specified strings are visually confusable.
* Check whether two specified strings are visually confusable.
*
* If the strings are confusable, the return value will be nonzero, as long as
* {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
@ -1159,7 +1179,58 @@ uspoof_areConfusable(const USpoofChecker *sc,
const UChar *id2, int32_t length2,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* Check whether two specified strings are visually confusable when
* displayed in a context with the given paragraph direction.
*
* If the strings are confusable, the return value will be nonzero, as long as
* {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
*
* The bits in the return value correspond to flags for each of the classes of
* confusables applicable to the two input strings. According to UTS 39
* section 4, the possible flags are:
*
* <ul>
* <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
* <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
* <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
* </ul>
*
* If one or more of the above flags were not listed in uspoof_setChecks(), this
* function will never report that class of confusable. The check
* {@link USPOOF_CONFUSABLE} enables all three flags.
*
*
* @param sc The USpoofChecker
* @param direction The paragraph direction with which the identifiers are
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
* @param id1 The first of the two identifiers to be compared for
* confusability. The strings are in UTF-16 format.
* @param length1 the length of the first identifier, expressed in
* 16 bit UTF-16 code units, or -1 if the string is
* nul terminated.
* @param id2 The second of the two identifiers to be compared for
* confusability. The identifiers are in UTF-16 format.
* @param length2 The length of the second identifier, expressed in
* 16 bit UTF-16 code units, or -1 if the string is
* nul terminated.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Confusability of the identifiers is not reported here,
* but through this function's return value.
* @return An integer value with bit(s) set corresponding to
* the type of confusability found, as defined by
* enum USpoofChecks. Zero is returned if the identifiers
* are not confusable.
*
* @draft ICU 74
*/
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction,
const UChar *id1, int32_t length1,
const UChar *id2, int32_t length2,
UErrorCode *status);
#endif /* U_HIDE_DRAFT_API */
/**
* A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
@ -1192,14 +1263,45 @@ uspoof_areConfusableUTF8(const USpoofChecker *sc,
const char *id2, int32_t length2,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* A version of {@link uspoof_areBidiConfusable} accepting strings in UTF-8 format.
*
* @param sc The USpoofChecker
* @param direction The paragraph direction with which the identifiers are
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
* @param id1 The first of the two identifiers to be compared for
* confusability. The strings are in UTF-8 format.
* @param length1 the length of the first identifier, in bytes, or -1
* if the string is nul terminated.
* @param id2 The second of the two identifiers to be compared for
* confusability. The strings are in UTF-8 format.
* @param length2 The length of the second string in bytes, or -1
* if the string is nul terminated.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Confusability of the strings is not reported here,
* but through this function's return value.
* @return An integer value with bit(s) set corresponding to
* the type of confusability found, as defined by
* enum USpoofChecks. Zero is returned if the strings
* are not confusable.
*
* @draft ICU 74
*
* @see uspoof_areBidiConfusable
*/
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction,
const char *id1, int32_t length1,
const char *id2, int32_t length2,
UErrorCode *status);
#endif /* U_HIDE_DRAFT_API */
/**
* Get the "skeleton" for an identifier.
* Skeletons are a transformation of the input identifier;
* Two identifiers are confusable if their skeletons are identical.
* See Unicode UAX #39 for additional information.
* See Unicode Technical Standard #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
@ -1233,11 +1335,50 @@ uspoof_getSkeleton(const USpoofChecker *sc,
UChar *dest, int32_t destCapacity,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* Get the "bidiSkeleton" for an identifier and a direction.
* Skeletons are a transformation of the input identifier;
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
* they are RTL-confusable if their RTL bidiSkeletons are identical.
* See Unicode Technical Standard #39 for additional information:
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
* set of existing identifiers, by creating an efficiently
* searchable collection of the skeletons.
*
* @param sc The USpoofChecker.
* @param direction The context direction with which the identifier will be
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
* @param id The input identifier whose skeleton will be computed.
* @param length The length of the input identifier, expressed in 16 bit
* UTF-16 code units, or -1 if the string is zero terminated.
* @param dest The output buffer, to receive the skeleton string.
* @param destCapacity The length of the output buffer, in 16 bit units.
* The destCapacity may be zero, in which case the function will
* return the actual length of the skeleton.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* @return The length of the skeleton string. The returned length
* is always that of the complete skeleton, even when the
* supplied buffer is too small (or of zero length)
*
* @draft ICU 74
* @see uspoof_areBidiConfusable
*/
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc,
UBiDiDirection direction,
const UChar *id, int32_t length,
UChar *dest, int32_t destCapacity, UErrorCode *status);
#endif /* U_HIDE_DRAFT_API */
/**
* Get the "skeleton" for an identifier.
* Skeletons are a transformation of the input identifier;
* Two identifiers are confusable if their skeletons are identical.
* See Unicode UAX #39 for additional information.
* See Unicode Technical Standard #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
@ -1273,6 +1414,46 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
char *dest, int32_t destCapacity,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* Get the "bidiSkeleton" for an identifier and a direction.
* Skeletons are a transformation of the input identifier;
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
* they are RTL-confusable if their RTL bidiSkeletons are identical.
* See Unicode Technical Standard #39 for additional information:
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
* set of existing identifiers, by creating an efficiently
* searchable collection of the skeletons.
*
* @param sc The USpoofChecker
* @param direction The context direction with which the identifier will be
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
* @param id The UTF-8 format identifier whose skeleton will be computed.
* @param length The length of the input string, in bytes,
* or -1 if the string is zero terminated.
* @param dest The output buffer, to receive the skeleton string.
* @param destCapacity The length of the output buffer, in bytes.
* The destCapacity may be zero, in which case the function will
* return the actual length of the skeleton.
* @param status The error code, set if an error occurred while attempting to
* perform the check. Possible Errors include U_INVALID_CHAR_FOUND
* for invalid UTF-8 sequences, and
* U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
* to hold the complete skeleton.
* @return The length of the skeleton string, in bytes. The returned length
* is always that of the complete skeleton, even when the
* supplied buffer is too small (or of zero length)
*
* @draft ICU 74
*/
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction,
const char *id, int32_t length, char *dest,
int32_t destCapacity, UErrorCode *status);
#endif /* U_HIDE_DRAFT_API */
/**
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
* in http://unicode.org/Public/security/latest/xidmodifications.txt
@ -1510,11 +1691,42 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
const icu::UnicodeString &s2,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* A version of {@link uspoof_areBidiConfusable} accepting UnicodeStrings.
*
* @param sc The USpoofChecker
* @param direction The paragraph direction with which the identifiers are
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
* @param s1 The first of the two identifiers to be compared for
* confusability. The strings are icu::UnicodeString objects.
* @param s2 The second of the two identifiers to be compared for
* confusability. The strings are icu::UnicodeString objects.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Confusability of the identifiers is not reported here,
* but through this function's return value.
* @return An integer value with bit(s) set corresponding to
* the type of confusability found, as defined by
* enum USpoofChecks. Zero is returned if the identifiers
* are not confusable.
*
* @draft ICU 74
*
* @see uspoof_areBidiConfusable
*/
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc,
UBiDiDirection direction,
const icu::UnicodeString &s1,
const icu::UnicodeString &s2,
UErrorCode *status);
#endif /* U_HIDE_DRAFT_API */
/**
* Get the "skeleton" for an identifier.
* Skeletons are a transformation of the input identifier;
* Two identifiers are confusable if their skeletons are identical.
* See Unicode UAX #39 for additional information.
* See Unicode Technical Standard #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
@ -1540,6 +1752,36 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
icu::UnicodeString &dest,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* Get the "bidiSkeleton" for an identifier and a direction.
* Skeletons are a transformation of the input identifier;
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
* they are RTL-confusable if their RTL bidiSkeletons are identical.
* See Unicode Technical Standard #39 for additional information:
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
* set of existing identifiers, by creating an efficiently
* searchable collection of the skeletons.
*
* @param sc The USpoofChecker.
* @param direction The context direction with which the identifier will be
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
* @param id The input identifier whose bidiSkeleton will be computed.
* @param dest The output identifier, to receive the skeleton string.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* @return A reference to the destination (skeleton) string.
*
* @draft ICU 74
*/
U_I18N_API icu::UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(
const USpoofChecker *sc, UBiDiDirection direction, const icu::UnicodeString &id,
icu::UnicodeString &dest, UErrorCode *status);
#endif /* U_HIDE_DRAFT_API */
/**
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
* in http://unicode.org/Public/security/latest/xidmodifications.txt

View file

@ -15,6 +15,7 @@
*
* Unicode Spoof Detection
*/
#include "unicode/ubidi.h"
#include "unicode/utypes.h"
#include "unicode/normalizer2.h"
#include "unicode/uspoof.h"
@ -538,6 +539,90 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
return result;
}
// UTF-16 entry point for the bidi-aware confusability check.
// Wraps both input buffers in UnicodeString objects and delegates to
// uspoof_areBidiConfusableUnicodeString(). A length of -1 marks the
// corresponding string as NUL-terminated. If either wrapper ends up bogus,
// *status is set to U_ILLEGAL_ARGUMENT_ERROR and 0 is returned.
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction,
                                                   const char16_t *id1, int32_t length1,
                                                   const char16_t *id2, int32_t length2,
                                                   UErrorCode *status) {
    // Aliasing constructor: the first argument tells it whether the text is NUL-terminated.
    UnicodeString first((length1 == -1), id1, length1);
    UnicodeString second((length2 == -1), id2, length2);
    if (!first.isBogus() && !second.isBogus()) {
        return uspoof_areBidiConfusableUnicodeString(sc, direction, first, second, status);
    }
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return 0;
}
// UTF-8 entry point for the bidi-aware confusability check.
// Converts both inputs to UnicodeString and delegates to
// uspoof_areBidiConfusableUnicodeString(). A length of -1 means the string
// is NUL-terminated; any smaller length is rejected with
// U_ILLEGAL_ARGUMENT_ERROR and a return value of 0.
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction,
                                                       const char *id1, int32_t length1, const char *id2,
                                                       int32_t length2, UErrorCode *status) {
    const bool lengthsAcceptable = (length1 >= -1) && (length2 >= -1);
    if (!lengthsAcceptable) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Resolve the byte count of the first string, measuring it when the caller passed -1.
    int32_t byteCount1 = length1;
    if (byteCount1 < 0) {
        byteCount1 = static_cast<int32_t>(uprv_strlen(id1));
    }
    UnicodeString first = UnicodeString::fromUTF8(StringPiece(id1, byteCount1));

    // Same for the second string.
    int32_t byteCount2 = length2;
    if (byteCount2 < 0) {
        byteCount2 = static_cast<int32_t>(uprv_strlen(id2));
    }
    UnicodeString second = UnicodeString::fromUTF8(StringPiece(id2, byteCount2));

    return uspoof_areBidiConfusableUnicodeString(sc, direction, first, second, status);
}
// Checks whether id1 and id2 are confusable when displayed with the given
// paragraph direction, by comparing their bidiSkeletons.
//
// Returns 0 when the skeletons differ or when an error occurred; otherwise
// returns the applicable USPOOF_*_CONFUSABLE flags, masked by the checks
// enabled on the checker.
//
// Errors reported through *status:
//   - whatever SpoofImpl::validateThis() reports for sc;
//   - U_INVALID_STATE_ERROR if none of the USPOOF_CONFUSABLE checks is enabled;
//   - any failure from computing the skeletons or the resolved script sets.
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc,
                                                                UBiDiDirection direction,
                                                                const icu::UnicodeString &id1,
                                                                const icu::UnicodeString &id2,
                                                                UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }
    //
    // See section 4 of UTS 39 for the algorithm for checking whether two strings are confusable,
    // and for definitions of the types (single, whole, mixed-script) of confusables.

    // We only care about a few of the check flags. Ignore the others.
    // If no tests relevant to this function have been specified, return an error.
    // TODO: is this really the right thing to do? It's probably an error on the caller's part,
    // but logically we would just return 0 (no error).
    if ((This->fChecks & USPOOF_CONFUSABLE) == 0) {
        *status = U_INVALID_STATE_ERROR;
        return 0;
    }

    // Compute the skeletons and check for confusability.
    UnicodeString id1Skeleton;
    uspoof_getBidiSkeletonUnicodeString(sc, direction, id1, id1Skeleton, status);
    UnicodeString id2Skeleton;
    uspoof_getBidiSkeletonUnicodeString(sc, direction, id2, id2Skeleton, status);
    if (U_FAILURE(*status)) {
        return 0;
    }
    if (id1Skeleton != id2Skeleton) {
        return 0;
    }

    // If we get here, the strings are confusable. Now we just need to set the flags for the appropriate
    // classes of confusables according to UTS 39 section 4. Start by computing the resolved script sets
    // of id1 and id2.
    ScriptSet id1RSS;
    This->getResolvedScriptSet(id1, id1RSS, *status);
    ScriptSet id2RSS;
    This->getResolvedScriptSet(id2, id2RSS, *status);
    // Do not classify based on script sets whose computation failed: never
    // hand back confusability flags together with a failure code.
    if (U_FAILURE(*status)) {
        return 0;
    }

    // Turn on all applicable flags
    uint32_t result = 0;
    if (id1RSS.intersects(id2RSS)) {
        result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    } else {
        result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
        if (!id1RSS.isEmpty() && !id2RSS.isEmpty()) {
            result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
        }
    }

    // Turn off flags that the user doesn't want
    return result & This->fChecks;
}
U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
@ -697,6 +782,60 @@ uspoof_getSkeleton(const USpoofChecker *sc,
return destStr.length();
}
// UTF-16 buffer variant of uspoof_getBidiSkeletonUnicodeString().
// Wraps the input buffer in a UnicodeString, computes the bidiSkeleton and
// extracts it into dest. A length of -1 marks the input as NUL-terminated.
// If the wrapper ends up bogus, *status is set to U_ILLEGAL_ARGUMENT_ERROR
// and 0 is returned. Otherwise the value returned by
// UnicodeString::extract() is passed back to the caller.
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc, UBiDiDirection direction,
                                                const UChar *id, int32_t length, UChar *dest,
                                                int32_t destCapacity, UErrorCode *status) {
    // Aliasing constructor: the first argument tells it whether the text is NUL-terminated.
    UnicodeString source((length == -1), id, length);
    if (!source.isBogus()) {
        UnicodeString skeleton;
        uspoof_getBidiSkeletonUnicodeString(sc, direction, source, skeleton, status);
        return skeleton.extract(dest, destCapacity, *status);
    }
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return 0;
}
U_I18N_API UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(const USpoofChecker *sc,
UBiDiDirection direction,
const UnicodeString &id,
UnicodeString &dest,
UErrorCode *status) {
dest.remove();
if (direction != UBIDI_LTR && direction != UBIDI_RTL) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return dest;
}
UBiDi *bidi = ubidi_open();
ubidi_setPara(bidi, id.getBuffer(), id.length(), direction,
/*embeddingLevels*/ nullptr, status);
if (U_FAILURE(*status)) {
ubidi_close(bidi);
return dest;
}
UnicodeString reordered;
int32_t const size = ubidi_getProcessedLength(bidi);
UChar* const reorderedBuffer = reordered.getBuffer(size);
if (reorderedBuffer == nullptr) {
*status = U_MEMORY_ALLOCATION_ERROR;
ubidi_close(bidi);
return dest;
}
ubidi_writeReordered(bidi, reorderedBuffer, size,
UBIDI_KEEP_BASE_COMBINING | UBIDI_DO_MIRRORING, status);
reordered.releaseBuffer(size);
ubidi_close(bidi);
if (U_FAILURE(*status)) {
return dest;
}
// The type parameter is deprecated since ICU 58; any number may be passed.
constexpr uint32_t deprecatedType = 58;
return uspoof_getSkeletonUnicodeString(sc, deprecatedType, reordered, dest, status);
}
U_I18N_API UnicodeString & U_EXPORT2
@ -730,12 +869,8 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
return dest;
}
U_CAPI int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
uint32_t type,
const char *id, int32_t length,
char *dest, int32_t destCapacity,
U_CAPI int32_t U_EXPORT2 uspoof_getSkeletonUTF8(const USpoofChecker *sc, uint32_t type, const char *id,
int32_t length, char *dest, int32_t destCapacity,
UErrorCode *status) {
SpoofImpl::validateThis(sc, *status);
if (U_FAILURE(*status)) {
@ -746,7 +881,8 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
return 0;
}
UnicodeString srcStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : static_cast<int32_t>(uprv_strlen(id))));
UnicodeString srcStr = UnicodeString::fromUTF8(
StringPiece(id, length >= 0 ? length : static_cast<int32_t>(uprv_strlen(id))));
UnicodeString destStr;
uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status);
if (U_FAILURE(*status)) {
@ -754,8 +890,28 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
}
int32_t lengthInUTF8 = 0;
u_strToUTF8(dest, destCapacity, &lengthInUTF8,
destStr.getBuffer(), destStr.length(), status);
u_strToUTF8(dest, destCapacity, &lengthInUTF8, destStr.getBuffer(), destStr.length(), status);
return lengthInUTF8;
}
// UTF-8 variant of uspoof_getBidiSkeletonUnicodeString().
// Converts the input to a UnicodeString, computes the bidiSkeleton and
// converts the result back to UTF-8 in dest. A length of -1 means the input
// is NUL-terminated; any smaller length is rejected with
// U_ILLEGAL_ARGUMENT_ERROR. Returns the length reported by u_strToUTF8(),
// or 0 if the skeleton could not be computed.
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction,
                                                    const char *id, int32_t length, char *dest,
                                                    int32_t destCapacity, UErrorCode *status) {
    if (length < -1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Measure the input ourselves when the caller passed -1.
    const int32_t byteCount = (length < 0) ? static_cast<int32_t>(uprv_strlen(id)) : length;
    UnicodeString source = UnicodeString::fromUTF8(StringPiece(id, byteCount));

    UnicodeString skeleton;
    uspoof_getBidiSkeletonUnicodeString(sc, direction, source, skeleton, status);

    int32_t utf8Length = 0;
    if (U_SUCCESS(*status)) {
        u_strToUTF8(dest, destCapacity, &utf8Length, skeleton.getBuffer(), skeleton.length(), status);
    }
    return utf8Length;
}

View file

@ -545,6 +545,26 @@ static void TestUSpoofCAPI(void) {
TEST_TEARDOWN;
/*
* uspoof_areBidiConfusable()
*/
TEST_SETUP
int32_t checkResults;
checkResults = uspoof_areBidiConfusable(sc, UBIDI_LTR, scLatin, -1, scMixed, -1, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
checkResults = uspoof_areBidiConfusable(sc, UBIDI_LTR, goodGreek, -1, scLatin, -1, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(0, checkResults);
checkResults = uspoof_areBidiConfusable(sc, UBIDI_LTR, lll_Latin_a, -1, lll_Latin_b, -1, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
TEST_TEARDOWN;
/*
* areConfusableUTF8
*/
@ -577,6 +597,38 @@ static void TestUSpoofCAPI(void) {
TEST_TEARDOWN;
/*
* areBidiConfusableUTF8
*/
TEST_SETUP
int32_t checkResults;
char s1[200];
char s2[200];
u_strToUTF8(s1, sizeof(s1), NULL, scLatin, -1, &status);
u_strToUTF8(s2, sizeof(s2), NULL, scMixed, -1, &status);
TEST_ASSERT_SUCCESS(status);
checkResults = uspoof_areBidiConfusableUTF8(sc, UBIDI_LTR, s1, -1, s2, -1, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
u_strToUTF8(s1, sizeof(s1), NULL, goodGreek, -1, &status);
u_strToUTF8(s2, sizeof(s2), NULL, scLatin, -1, &status);
TEST_ASSERT_SUCCESS(status);
checkResults = uspoof_areBidiConfusableUTF8(sc, UBIDI_LTR, s1, -1, s2, -1, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(0, checkResults);
u_strToUTF8(s1, sizeof(s1), NULL, lll_Latin_a, -1, &status);
u_strToUTF8(s2, sizeof(s2), NULL, lll_Latin_b, -1, &status);
TEST_ASSERT_SUCCESS(status);
checkResults = uspoof_areBidiConfusableUTF8(sc, UBIDI_LTR, s1, -1, s2, -1, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
TEST_TEARDOWN;
/*
* getSkeleton
@ -602,6 +654,31 @@ static void TestUSpoofCAPI(void) {
TEST_TEARDOWN;
/*
* getBidiSkeleton
*/
TEST_SETUP
UChar dest[100];
int32_t skelLength;
skelLength = uspoof_getBidiSkeleton(sc, UBIDI_LTR, lll_Latin_a, -1, dest, UPRV_LENGTHOF(dest), &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(0, u_strcmp(lll_Skel, dest));
TEST_ASSERT_EQ(u_strlen(lll_Skel), skelLength);
skelLength = uspoof_getBidiSkeletonUTF8(sc, UBIDI_LTR, goodLatinUTF8, -1, (char *)dest,
UPRV_LENGTHOF(dest), &status);
TEST_ASSERT_SUCCESS(status);
skelLength = uspoof_getBidiSkeleton(sc, UBIDI_LTR, lll_Latin_a, -1, NULL, 0, &status);
TEST_ASSERT_EQ(U_BUFFER_OVERFLOW_ERROR, status);
TEST_ASSERT_EQ(3, skelLength);
status = U_ZERO_ERROR;
TEST_TEARDOWN;
/*
* get Inclusion and Recommended sets
*/

View file

@ -917,7 +917,7 @@ group: charset_detector
group: spoof_detection
uspoof.o uspoof_build.o uspoof_conf.o uspoof_impl.o scriptset.o
deps
uniset_props regex unorm uscript
uniset_props regex unorm uscript ubidi
group: alphabetic_index
alphaindex.o

View file

@ -95,6 +95,7 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(testSpoofAPI);
TESTCASE_AUTO(testSkeleton);
TESTCASE_AUTO(testBidiSkeleton);
TESTCASE_AUTO(testAreConfusable);
TESTCASE_AUTO(testInvisible);
TESTCASE_AUTO(testConfData);
@ -154,10 +155,13 @@ void IntlTestSpoof::testSpoofAPI() {
TEST_TEARDOWN;
}
#define CHECK_SKELETON(type, input, expected) \
UPRV_BLOCK_MACRO_BEGIN { checkSkeleton(sc, type, input, expected, __LINE__); } \
UPRV_BLOCK_MACRO_END
#define CHECK_SKELETON(type, input, expected) UPRV_BLOCK_MACRO_BEGIN { \
checkSkeleton(sc, type, input, expected, __LINE__); \
} UPRV_BLOCK_MACRO_END
// Runs one bidiSkeleton test case through checkBidiSkeleton(), recording the
// line of the call site. Arguments are forwarded positionally, so the
// parameter names mirror checkBidiSkeleton(sc, input, direction, expected, lineNum).
// Relies on a variable named `sc` being in scope at the point of use.
#define CHECK_BIDI_SKELETON(input, direction, expected) \
    UPRV_BLOCK_MACRO_BEGIN { checkBidiSkeleton(sc, input, direction, expected, __LINE__); } \
    UPRV_BLOCK_MACRO_END
// testSkeleton. Spot check a number of confusable skeleton substitutions from the
@ -227,6 +231,15 @@ void IntlTestSpoof::testSkeleton() {
TEST_TEARDOWN;
}
// testBidiSkeleton. Spot check bidiSkeleton computation for two mixed-direction
// inputs, each in a left-to-right and in a right-to-left paragraph.
// The CHECK_BIDI_SKELETON arguments are passed positionally as
// (input, direction, expected skeleton).
void IntlTestSpoof::testBidiSkeleton() {
TEST_SETUP
// Both inputs are expected to yield the same skeleton for UBIDI_LTR...
CHECK_BIDI_SKELETON(u"A1<שׂ", UBIDI_LTR, u"Al<ש\u0307");
CHECK_BIDI_SKELETON(u"Αשֺ>1", UBIDI_LTR, u"Al<ש\u0307");
// ...and different skeletons for UBIDI_RTL.
CHECK_BIDI_SKELETON(u"A1<שׂ", UBIDI_RTL, u"ש\u0307>Al");
CHECK_BIDI_SKELETON(u"Αשֺ>1", UBIDI_RTL, u"l<ש\u0307A");
TEST_TEARDOWN;
}
//
// Run a single confusable skeleton transformation test case.
@ -252,6 +265,31 @@ void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
}
}
//
// Run a single confusable bidiSkeleton transformation test case.
//
void IntlTestSpoof::checkBidiSkeleton(const USpoofChecker *sc, const UnicodeString &input,
UBiDiDirection direction, const UnicodeString &expected,
int32_t lineNum) {
UnicodeString uInput = input.unescape();
UnicodeString uExpected = expected.unescape();
UErrorCode status = U_ZERO_ERROR;
UnicodeString actual;
uspoof_getBidiSkeletonUnicodeString(sc, direction, uInput, actual, &status);
if (U_FAILURE(status)) {
errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum,
u_errorName(status));
return;
}
if (uExpected != actual) {
errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
__FILE__, __LINE__, lineNum);
errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString("\"\n") +
UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\""));
}
}
void IntlTestSpoof::testAreConfusable() {
TEST_SETUP
UnicodeString s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. "
@ -265,6 +303,20 @@ void IntlTestSpoof::testAreConfusable() {
TEST_TEARDOWN;
}
// testAreBidiConfusable. Checks that uspoof_areBidiConfusableUnicodeString()
// reports two strings as single-script confusable in a left-to-right paragraph.
// NOTE(review): no TESTCASE_AUTO(testAreBidiConfusable) entry is visible in the
// runIndexedTest excerpt; confirm that this test is actually registered and run.
void IntlTestSpoof::testAreBidiConfusable() {
TEST_SETUP
const UnicodeString jHyphen2(u"J-2");
// The following string has RLMs around the 2, flipping it; it uses an
// EN DASH instead of the HYPHEN-MINUS above.
// NOTE(review): as written, the escapes put the second U+200F after the
// EN DASH rather than directly after the 2; confirm which order is intended.
const UnicodeString j2Dash(u"J\u200F2\u2013\u200F");
// NOTE(review): the literal below presumably carries the same characters
// unescaped (RLMs are invisible); verify they survived editing, since a plain
// two-character literal cannot equal j2Dash.
TEST_ASSERT(j2Dash == u"J2");
int32_t result = uspoof_areBidiConfusableUnicodeString(sc, UBIDI_LTR, jHyphen2, j2Dash, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, result);
TEST_TEARDOWN;
}
void IntlTestSpoof::testInvisible() {
TEST_SETUP
UnicodeString s = UnicodeString("abcd\\u0301ef").unescape();

View file

@ -30,8 +30,12 @@ public:
void testSkeleton();
void testBidiSkeleton();
void testAreConfusable();
void testAreBidiConfusable();
void testInvisible();
void testConfData();
@ -56,9 +60,11 @@ public:
void testCombiningDot();
// Internal function to run a single skeleton test case.
void checkSkeleton(const USpoofChecker *sc, uint32_t flags,
const char *input, const char *expected, int32_t lineNum);
// Internal functions to run a single skeleton test case.
void checkSkeleton(const USpoofChecker *sc, uint32_t flags, const char *input, const char *expected,
int32_t lineNum);
void checkBidiSkeleton(const USpoofChecker *sc, const UnicodeString &input, UBiDiDirection direction,
const UnicodeString &expected, int32_t lineNum);
};
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO

View file

@ -81,6 +81,22 @@ import com.ibm.icu.util.ULocale;
* application startup), and the more efficient {@link SpoofChecker#areConfusable} method can be used at runtime.
*
* <p>
* If the paragraph direction used to display the strings is known, it should be passed to {@link SpoofChecker#areConfusable}:
*
* <pre>
* <code>
* // These strings look identical when rendered in a left-to-right context.
* // They look distinct in a right-to-left context.
* String s1 = "A1\u05D0"; // A1א
* String s2 = "A\u05D01"; // Aא1
*
* SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
* int result = sc.areConfusable(Bidi.DIRECTION_LEFT_TO_RIGHT, s1, s2);
* System.out.println(result != 0); // true
* </code>
* </pre>
*
* <p>
* UTS 39 defines two strings to be <em>confusable</em> if they map to the same skeleton. A <em>skeleton</em> is a
* sequence of families of confusable characters, where each family has a single exemplar character.
* {@link SpoofChecker#getSkeleton} computes the skeleton for a particular string, so the following snippet is
@ -1422,7 +1438,7 @@ public class SpoofChecker {
}
/**
* Check the whether two specified strings are visually confusable. The types of confusability to be tested - single
* Check whether two specified strings are visually confusable. The types of confusability to be tested - single
* script, mixed script, or whole script - are determined by the check options set for the SpoofChecker.
*
* The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE
@ -1442,7 +1458,7 @@ public class SpoofChecker {
*/
public int areConfusable(String s1, String s2) {
//
// See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
// See section 4 of UTS #39 for the algorithm for checking whether two strings are confusable,
// and for definitions of the types (single, whole, mixed-script) of confusables.
// We only care about a few of the check flags. Ignore the others.
@ -1479,12 +1495,104 @@ public class SpoofChecker {
}
}
// Turn off flags that the user doesn't want
return result & fChecks;
}
/**
* Check whether two specified strings are visually confusable when displayed in a paragraph with the given direction.
* The types of confusability to be tested - single script, mixed script, or whole script - are determined by the
* check options set for the SpoofChecker.
*
* The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, and
* WHOLE_SCRIPT_CONFUSABLE. At least one of these tests must be selected.
*
* ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case
* folded for comparison and display to the user, do not select the ANY_CASE option.
*
*
* @param direction The paragraph direction with which the identifiers are displayed.
* Must be either {@link Bidi#DIRECTION_LEFT_TO_RIGHT} or {@link Bidi#DIRECTION_RIGHT_TO_LEFT}.
* @param s1
* The first of the two strings to be compared for confusability.
* @param s2
* The second of the two strings to be compared for confusability.
* @return Non-zero if s1 and s2 are confusable. If not 0, the value will indicate the type(s) of confusability
* found, as defined by spoof check test constants.
* @draft ICU 74
*/
public int areConfusable(int direction, CharSequence s1, CharSequence s2) {
//
// See section 4 of UTS #39 for the algorithm for checking whether two strings are confusable,
// and for definitions of the types (single, whole, mixed-script) of confusables.
// We only care about a few of the check flags. Ignore the others.
// If no tests relevant to this function have been specified, signal an error.
// TODO: is this really the right thing to do? It's probably an error on
// the caller's part, but logically we would just return 0 (no error).
if ((this.fChecks & CONFUSABLE) == 0) {
throw new IllegalArgumentException("No confusable checks are enabled.");
}
// Compute the skeletons and check for confusability.
String s1Skeleton = getBidiSkeleton(direction, s1);
String s2Skeleton = getBidiSkeleton(direction, s2);
if (!s1Skeleton.equals(s2Skeleton)) {
return 0;
}
// If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes
// of confusables according to UTS 39 section 4.
// Start by computing the resolved script sets of s1 and s2.
ScriptSet s1RSS = new ScriptSet();
getResolvedScriptSet(s1, s1RSS);
ScriptSet s2RSS = new ScriptSet();
getResolvedScriptSet(s2, s2RSS);
// Turn on all applicable flags
int result = 0;
if (s1RSS.intersects(s2RSS)) {
result |= SINGLE_SCRIPT_CONFUSABLE;
} else {
result |= MIXED_SCRIPT_CONFUSABLE;
if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) {
result |= WHOLE_SCRIPT_CONFUSABLE;
}
}
// Turn off flags that the user doesn't want
result &= fChecks;
return result;
}
/**
 * Get the "bidiSkeleton" for an identifier string and a direction.
 * Skeletons are a transformation of the input string;
 * two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
 * they are RTL-confusable if their RTL bidiSkeletons are identical.
 * See Unicode Technical Standard #39 for additional information:
 * https://www.unicode.org/reports/tr39/#Confusable_Detection.
 *
 * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of
 * some large set of existing identifiers, by creating an efficiently searchable collection of the skeletons.
 *
 * Skeletons are computed using the algorithm and data described in UTS #39.
 *
 * @param direction The paragraph direction with which the string is displayed.
 *            Must be either {@link Bidi#DIRECTION_LEFT_TO_RIGHT} or {@link Bidi#DIRECTION_RIGHT_TO_LEFT}.
 * @param str The input string whose bidiSkeleton will be generated.
 * @return The output skeleton string.
 * @throws IllegalArgumentException if direction is not one of the two accepted values.
 *
 * @draft ICU 74
 */
public String getBidiSkeleton(int direction, CharSequence str) {
    final boolean supportedDirection =
            direction == Bidi.DIRECTION_LEFT_TO_RIGHT || direction == Bidi.DIRECTION_RIGHT_TO_LEFT;
    if (!supportedDirection) {
        throw new IllegalArgumentException("direction should be DIRECTION_LEFT_TO_RIGHT or DIRECTION_RIGHT_TO_LEFT");
    }

    // Reorder the text for the requested paragraph direction, then take the ordinary skeleton of
    // the reordered text.
    final int reorderingOptions = Bidi.KEEP_BASE_COMBINING | Bidi.DO_MIRRORING;
    final String reordered = new Bidi(str.toString(), direction).writeReordered(reorderingOptions);
    return getSkeleton(reordered);
}
/**
* Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; Two strings are
* confusable if their skeletons are identical. See Unicode Technical Standard #39 (UTS #39) for additional information.

View file

@ -36,6 +36,7 @@ import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Bidi;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.SpoofChecker;
import com.ibm.icu.text.SpoofChecker.CheckResult;
@ -455,6 +456,16 @@ public class SpoofCheckerTest extends TestFmwk {
}
// Checks getBidiSkeleton for two inputs, each in both paragraph directions.
@Test
public void TestBidiSkeleton() {
    // directions[i] is the paragraph direction used for inputAndExpected[i],
    // which holds { input identifier, expected bidiSkeleton }.
    final int[] directions = {
        Bidi.DIRECTION_LEFT_TO_RIGHT,
        Bidi.DIRECTION_LEFT_TO_RIGHT,
        Bidi.DIRECTION_RIGHT_TO_LEFT,
        Bidi.DIRECTION_RIGHT_TO_LEFT,
    };
    final String[][] inputAndExpected = {
        { "A1<שׂ", "Al<ש\u0307" },
        { "Αשֺ>1", "Al<ש\u0307" },
        { "A1<שׂ", "ש\u0307>Al" },
        { "Αשֺ>1", "l<ש\u0307A" },
    };

    final SpoofChecker checker = new SpoofChecker.Builder().build();
    for (int i = 0; i < directions.length; ++i) {
        checkBidiSkeleton(checker, directions[i], inputAndExpected[i][0], inputAndExpected[i][1], "TestBidiSkeleton");
    }
}
// Internal function to run a single skeleton test case.
//
// Run a single confusable skeleton transformation test case.
@ -470,6 +481,19 @@ public class SpoofCheckerTest extends TestFmwk {
assertEquals(testName + " test at line " + lineNumberOfTest + " : Expected (escaped): " + expected, uExpected, actual);
}
// Internal function to run a single bidiSkeleton test case.
//
// Computes the bidiSkeleton of input for the given paragraph direction with sc and asserts that it
// equals expected. testName is prefixed to the assertion message so that a failure identifies the
// calling test, as the non-bidi skeleton helper does.
//
void checkBidiSkeleton(SpoofChecker sc, int direction, String input, String expected, String testName) {
    // Any direction other than LEFT_TO_RIGHT is labelled "RTL" in the message.
    final String directionLabel = direction == Bidi.DIRECTION_LEFT_TO_RIGHT ? "LTR" : "RTL";
    assertEquals(
        testName + ": bidiSkeleton(" + directionLabel + ", \"" + input + "\")",
        expected,
        sc.getBidiSkeleton(direction, input));
}
@Test
public void TestAreConfusable() {
SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
@ -480,6 +504,21 @@ public class SpoofCheckerTest extends TestFmwk {
assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, sc.areConfusable(s1, s2));
}
// Checks that two strings which differ in logical order and in the dash character used are
// reported as single-script confusable in a left-to-right paragraph.
@Test
public void TestAreBidiConfusable() {
    SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
    final String jHyphen2 = "J-2";
    // The following string has RLMs around the 2, flipping it; it uses an
    // EN DASH instead of the HYPHEN-MINUS above.
    final String j2Dash = "J\u200F2\u2013\u200F";
    // Sanity check of the escaped string against its displayed form. The RLMs stay escaped because
    // invisible characters in a source literal are easily dropped by editors and tools, which
    // would make this assertion fail; only the visible EN DASH is written literally.
    assertEquals("Unescaped display of j2Dash", "J\u200F2–\u200F", j2Dash);
    assertEquals(
        "Expected single-script confusability",
        SpoofChecker.SINGLE_SCRIPT_CONFUSABLE,
        sc.areConfusable(Bidi.DIRECTION_LEFT_TO_RIGHT, jHyphen2, j2Dash));
}
@Test
public void TestConfusableFlagVariants() {
// The spoof checker should only return those tests that the user requested. This test makes sure that