mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-16 10:17:23 +00:00
ICU-22332 bidiSkeleton and LTR- and RTL-confusabilities
This commit is contained in:
parent
fb0f36203a
commit
f79fe9347a
8 changed files with 708 additions and 28 deletions
icu4c/source
i18n
test
icu4j/main
classes/core/src/com/ibm/icu/text
tests/core/src/com/ibm/icu/dev/test/text
|
@ -19,6 +19,7 @@
|
|||
#ifndef USPOOF_H
|
||||
#define USPOOF_H
|
||||
|
||||
#include "unicode/ubidi.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "unicode/parseerr.h"
|
||||
|
@ -83,6 +84,25 @@
|
|||
* the instance should be created once (e.g., upon application startup), and the efficient
|
||||
* {@link uspoof_areConfusable} method can be used at runtime.
|
||||
*
|
||||
* If the paragraph direction used to display the strings is known, the bidi function should be used instead:
|
||||
*
|
||||
* \code{.c}
|
||||
* UErrorCode status = U_ZERO_ERROR;
|
||||
* // These strings look identical when rendered in a left-to-right context.
|
||||
* // They look distinct in a right-to-left context.
|
||||
* UChar* str1 = (UChar*) u"A1\u05D0"; // A1א
|
||||
* UChar* str2 = (UChar*) u"A\u05D01"; // Aא1
|
||||
*
|
||||
* USpoofChecker* sc = uspoof_open(&status);
|
||||
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
|
||||
*
|
||||
* uint32_t bitmask = uspoof_areBidiConfusable(sc, UBIDI_LTR, str1, -1, str2, -1, &status);
|
||||
* UBool result = bitmask != 0;
|
||||
* // areBidiConfusable: 1 (status: U_ZERO_ERROR)
|
||||
* printf("areBidiConfusable: %d (status: %s)\n", result, u_errorName(status));
|
||||
* uspoof_close(sc);
|
||||
* \endcode
|
||||
*
|
||||
* <p>
|
||||
* The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call
|
||||
* {@link uspoof_close} when the object goes out of scope:
|
||||
|
@ -519,7 +539,7 @@ typedef enum USpoofChecks {
|
|||
|
||||
|
||||
/**
|
||||
* Constants from UAX #39 for use in {@link uspoof_setRestrictionLevel}, and
|
||||
* Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and
|
||||
* for returned identifier restriction levels in check results.
|
||||
*
|
||||
* @stable ICU 51
|
||||
|
@ -633,8 +653,8 @@ uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLeng
|
|||
/**
|
||||
* Open a Spoof Checker from the source form of the spoof data.
|
||||
* The input corresponds to the Unicode data file confusables.txt
|
||||
* as described in Unicode UAX #39. The syntax of the source data
|
||||
* is as described in UAX #39 for this file, and the content of
|
||||
* as described in Unicode Technical Standard #39. The syntax of the source data
|
||||
* is as described in UTS #39 for this file, and the content of
|
||||
* this file is acceptable input.
|
||||
*
|
||||
* The character encoding of the (char *) input text is UTF-8.
|
||||
|
@ -1111,7 +1131,7 @@ uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *
|
|||
|
||||
|
||||
/**
|
||||
* Check the whether two specified strings are visually confusable.
|
||||
* Check whether two specified strings are visually confusable.
|
||||
*
|
||||
* If the strings are confusable, the return value will be nonzero, as long as
|
||||
* {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
|
||||
|
@ -1159,7 +1179,58 @@ uspoof_areConfusable(const USpoofChecker *sc,
|
|||
const UChar *id2, int32_t length2,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Check whether two specified strings are visually confusable when
|
||||
* displayed in a context with the given paragraph direction.
|
||||
*
|
||||
* If the strings are confusable, the return value will be nonzero, as long as
|
||||
* {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
|
||||
*
|
||||
* The bits in the return value correspond to flags for each of the classes of
|
||||
* confusables applicable to the two input strings. According to UTS 39
|
||||
* section 4, the possible flags are:
|
||||
*
|
||||
* <ul>
|
||||
* <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
|
||||
* <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
|
||||
* <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
|
||||
* </ul>
|
||||
*
|
||||
* If one or more of the above flags were not listed in uspoof_setChecks(), this
|
||||
* function will never report that class of confusable. The check
|
||||
* {@link USPOOF_CONFUSABLE} enables all three flags.
|
||||
*
|
||||
*
|
||||
* @param sc The USpoofChecker
|
||||
* @param direction The paragraph direction with which the identifiers are
|
||||
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
|
||||
* @param id1 The first of the two identifiers to be compared for
|
||||
* confusability. The strings are in UTF-16 format.
|
||||
* @param length1 the length of the first identifier, expressed in
|
||||
* 16 bit UTF-16 code units, or -1 if the string is
|
||||
* nul terminated.
|
||||
* @param id2 The second of the two identifiers to be compared for
|
||||
* confusability. The identifiers are in UTF-16 format.
|
||||
* @param length2 The length of the second identifier, expressed in
|
||||
* 16 bit UTF-16 code units, or -1 if the string is
|
||||
* nul terminated.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* Confusability of the identifiers is not reported here,
|
||||
* but through this function's return value.
|
||||
* @return An integer value with bit(s) set corresponding to
|
||||
* the type of confusability found, as defined by
|
||||
* enum USpoofChecks. Zero is returned if the identifiers
|
||||
* are not confusable.
|
||||
*
|
||||
* @draft ICU 74
|
||||
*/
|
||||
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction,
|
||||
const UChar *id1, int32_t length1,
|
||||
const UChar *id2, int32_t length2,
|
||||
UErrorCode *status);
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
/**
|
||||
* A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
|
||||
|
@ -1192,14 +1263,45 @@ uspoof_areConfusableUTF8(const USpoofChecker *sc,
|
|||
const char *id2, int32_t length2,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* A version of {@link uspoof_areBidiConfusable} accepting strings in UTF-8 format.
|
||||
*
|
||||
* @param sc The USpoofChecker
|
||||
* @param direction The paragraph direction with which the identifiers are
|
||||
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
|
||||
* @param id1 The first of the two identifiers to be compared for
|
||||
* confusability. The strings are in UTF-8 format.
|
||||
* @param length1 The length of the first identifier, in bytes, or -1
|
||||
* if the string is nul terminated.
|
||||
* @param id2 The second of the two identifiers to be compared for
|
||||
* confusability. The strings are in UTF-8 format.
|
||||
* @param length2 The length of the second string in bytes, or -1
|
||||
* if the string is nul terminated.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* Confusability of the strings is not reported here,
|
||||
* but through this function's return value.
|
||||
* @return An integer value with bit(s) set corresponding to
|
||||
* the type of confusability found, as defined by
|
||||
* enum USpoofChecks. Zero is returned if the strings
|
||||
* are not confusable.
|
||||
*
|
||||
* @draft ICU 74
|
||||
*
|
||||
* @see uspoof_areBidiConfusable
|
||||
*/
|
||||
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction,
|
||||
const char *id1, int32_t length1,
|
||||
const char *id2, int32_t length2,
|
||||
UErrorCode *status);
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
/**
|
||||
* Get the "skeleton" for an identifier.
|
||||
* Skeletons are a transformation of the input identifier;
|
||||
* Two identifiers are confusable if their skeletons are identical.
|
||||
* See Unicode UAX #39 for additional information.
|
||||
* See Unicode Technical Standard #39 for additional information.
|
||||
*
|
||||
* Using skeletons directly makes it possible to quickly check
|
||||
* whether an identifier is confusable with any of some large
|
||||
|
@ -1233,11 +1335,50 @@ uspoof_getSkeleton(const USpoofChecker *sc,
|
|||
UChar *dest, int32_t destCapacity,
|
||||
UErrorCode *status);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Get the "bidiSkeleton" for an identifier and a direction.
|
||||
* Skeletons are a transformation of the input identifier;
|
||||
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
|
||||
* they are RTL-confusable if their RTL bidiSkeletons are identical.
|
||||
* See Unicode Technical Standard #39 for additional information:
|
||||
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
|
||||
*
|
||||
* Using skeletons directly makes it possible to quickly check
|
||||
* whether an identifier is confusable with any of some large
|
||||
* set of existing identifiers, by creating an efficiently
|
||||
* searchable collection of the skeletons.
|
||||
*
|
||||
* @param sc The USpoofChecker.
|
||||
* @param direction The context direction with which the identifier will be
|
||||
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
|
||||
* @param id The input identifier whose skeleton will be computed.
|
||||
* @param length The length of the input identifier, expressed in 16 bit
|
||||
* UTF-16 code units, or -1 if the string is zero terminated.
|
||||
* @param dest The output buffer, to receive the skeleton string.
|
||||
* @param destCapacity The length of the output buffer, in 16 bit units.
|
||||
* The destCapacity may be zero, in which case the function will
|
||||
* return the actual length of the skeleton.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* @return The length of the skeleton string. The returned length
|
||||
* is always that of the complete skeleton, even when the
|
||||
* supplied buffer is too small (or of zero length)
|
||||
*
|
||||
* @draft ICU 74
|
||||
* @see uspoof_areBidiConfusable
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc,
|
||||
UBiDiDirection direction,
|
||||
const UChar *id, int32_t length,
|
||||
UChar *dest, int32_t destCapacity, UErrorCode *status);
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
/**
|
||||
* Get the "skeleton" for an identifier.
|
||||
* Skeletons are a transformation of the input identifier;
|
||||
* Two identifiers are confusable if their skeletons are identical.
|
||||
* See Unicode UAX #39 for additional information.
|
||||
* See Unicode Technical Standard #39 for additional information.
|
||||
*
|
||||
* Using skeletons directly makes it possible to quickly check
|
||||
* whether an identifier is confusable with any of some large
|
||||
|
@ -1273,6 +1414,46 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
|
|||
char *dest, int32_t destCapacity,
|
||||
UErrorCode *status);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Get the "bidiSkeleton" for an identifier and a direction.
|
||||
* Skeletons are a transformation of the input identifier;
|
||||
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
|
||||
* they are RTL-confusable if their RTL bidiSkeletons are identical.
|
||||
* See Unicode Technical Standard #39 for additional information:
|
||||
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
|
||||
*
|
||||
* Using skeletons directly makes it possible to quickly check
|
||||
* whether an identifier is confusable with any of some large
|
||||
* set of existing identifiers, by creating an efficiently
|
||||
* searchable collection of the skeletons.
|
||||
*
|
||||
* @param sc The USpoofChecker
|
||||
* @param direction The context direction with which the identifier will be
|
||||
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
|
||||
* @param id The UTF-8 format identifier whose skeleton will be computed.
|
||||
* @param length The length of the input string, in bytes,
|
||||
* or -1 if the string is zero terminated.
|
||||
* @param dest The output buffer, to receive the skeleton string.
|
||||
* @param destCapacity The length of the output buffer, in bytes.
|
||||
* The destCapacity may be zero, in which case the function will
|
||||
* return the actual length of the skeleton.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check. Possible Errors include U_INVALID_CHAR_FOUND
|
||||
* for invalid UTF-8 sequences, and
|
||||
* U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
|
||||
* to hold the complete skeleton.
|
||||
* @return The length of the skeleton string, in bytes. The returned length
|
||||
* is always that of the complete skeleton, even when the
|
||||
* supplied buffer is too small (or of zero length)
|
||||
*
|
||||
* @draft ICU 74
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction,
|
||||
const char *id, int32_t length, char *dest,
|
||||
int32_t destCapacity, UErrorCode *status);
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
/**
|
||||
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
|
||||
* in http://unicode.org/Public/security/latest/xidmodifications.txt
|
||||
|
@ -1510,11 +1691,42 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
|
|||
const icu::UnicodeString &s2,
|
||||
UErrorCode *status);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* A version of {@link uspoof_areBidiConfusable} accepting UnicodeStrings.
|
||||
*
|
||||
* @param sc The USpoofChecker
|
||||
* @param direction The paragraph direction with which the identifiers are
|
||||
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
|
||||
* @param s1 The first of the two identifiers to be compared for
|
||||
* confusability. The strings are in UTF-16 format.
|
||||
* @param s2 The second of the two identifiers to be compared for
|
||||
* confusability. The strings are in UTF-16 format.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* Confusability of the identifiers is not reported here,
|
||||
* but through this function's return value.
|
||||
* @return An integer value with bit(s) set corresponding to
|
||||
* the type of confusability found, as defined by
|
||||
* enum USpoofChecks. Zero is returned if the identifiers
|
||||
* are not confusable.
|
||||
*
|
||||
* @draft ICU 74
|
||||
*
|
||||
* @see uspoof_areBidiConfusable
|
||||
*/
|
||||
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc,
|
||||
UBiDiDirection direction,
|
||||
const icu::UnicodeString &s1,
|
||||
const icu::UnicodeString &s2,
|
||||
UErrorCode *status);
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
/**
|
||||
* Get the "skeleton" for an identifier.
|
||||
* Skeletons are a transformation of the input identifier;
|
||||
* Two identifiers are confusable if their skeletons are identical.
|
||||
* See Unicode UAX #39 for additional information.
|
||||
* See Unicode Technical Standard #39 for additional information.
|
||||
*
|
||||
* Using skeletons directly makes it possible to quickly check
|
||||
* whether an identifier is confusable with any of some large
|
||||
|
@ -1540,6 +1752,36 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
|
|||
icu::UnicodeString &dest,
|
||||
UErrorCode *status);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Get the "bidiSkeleton" for an identifier and a direction.
|
||||
* Skeletons are a transformation of the input identifier;
|
||||
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
|
||||
* they are RTL-confusable if their RTL bidiSkeletons are identical.
|
||||
* See Unicode Technical Standard #39 for additional information:
|
||||
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
|
||||
*
|
||||
* Using skeletons directly makes it possible to quickly check
|
||||
* whether an identifier is confusable with any of some large
|
||||
* set of existing identifiers, by creating an efficiently
|
||||
* searchable collection of the skeletons.
|
||||
*
|
||||
* @param sc The USpoofChecker.
|
||||
* @param direction The context direction with which the identifier will be
|
||||
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
|
||||
* @param id The input identifier whose bidiSkeleton will be computed.
|
||||
* @param dest The output identifier, to receive the skeleton string.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* @return A reference to the destination (skeleton) string.
|
||||
*
|
||||
* @draft ICU 74
|
||||
*/
|
||||
U_I18N_API icu::UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(
|
||||
const USpoofChecker *sc, UBiDiDirection direction, const icu::UnicodeString &id,
|
||||
icu::UnicodeString &dest, UErrorCode *status);
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
/**
|
||||
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
|
||||
* in http://unicode.org/Public/security/latest/xidmodifications.txt
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
*
|
||||
* Unicode Spoof Detection
|
||||
*/
|
||||
#include "unicode/ubidi.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/uspoof.h"
|
||||
|
@ -538,6 +539,90 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
|
|||
return result;
|
||||
}
|
||||
|
||||
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction,
|
||||
const char16_t *id1, int32_t length1,
|
||||
const char16_t *id2, int32_t length2,
|
||||
UErrorCode *status) {
|
||||
UnicodeString id1Str((length1 == -1), id1, length1); // Aliasing constructor
|
||||
UnicodeString id2Str((length2 == -1), id2, length2); // Aliasing constructor
|
||||
if (id1Str.isBogus() || id2Str.isBogus()) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
return uspoof_areBidiConfusableUnicodeString(sc, direction, id1Str, id2Str, status);
|
||||
}
|
||||
|
||||
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction,
|
||||
const char *id1, int32_t length1, const char *id2,
|
||||
int32_t length2, UErrorCode *status) {
|
||||
if (length1 < -1 || length2 < -1) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
UnicodeString id1Str = UnicodeString::fromUTF8(
|
||||
StringPiece(id1, length1 >= 0 ? length1 : static_cast<int32_t>(uprv_strlen(id1))));
|
||||
UnicodeString id2Str = UnicodeString::fromUTF8(
|
||||
StringPiece(id2, length2 >= 0 ? length2 : static_cast<int32_t>(uprv_strlen(id2))));
|
||||
return uspoof_areBidiConfusableUnicodeString(sc, direction, id1Str, id2Str, status);
|
||||
}
|
||||
|
||||
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc,
|
||||
UBiDiDirection direction,
|
||||
const icu::UnicodeString &id1,
|
||||
const icu::UnicodeString &id2,
|
||||
UErrorCode *status) {
|
||||
const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
//
|
||||
// See section 4 of UTS 39 for the algorithm for checking whether two strings are confusable,
|
||||
// and for definitions of the types (single, whole, mixed-script) of confusables.
|
||||
|
||||
// We only care about a few of the check flags. Ignore the others.
|
||||
// If no tests relevant to this function have been specified, return an error.
|
||||
// TODO: is this really the right thing to do? It's probably an error on the caller's part,
|
||||
// but logically we would just return 0 (no error).
|
||||
if ((This->fChecks & USPOOF_CONFUSABLE) == 0) {
|
||||
*status = U_INVALID_STATE_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Compute the skeletons and check for confusability.
|
||||
UnicodeString id1Skeleton;
|
||||
uspoof_getBidiSkeletonUnicodeString(sc, direction, id1, id1Skeleton, status);
|
||||
UnicodeString id2Skeleton;
|
||||
uspoof_getBidiSkeletonUnicodeString(sc, direction, id2, id2Skeleton, status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
if (id1Skeleton != id2Skeleton) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// If we get here, the strings are confusable. Now we just need to set the flags for the appropriate
|
||||
// classes of confusables according to UTS 39 section 4. Start by computing the resolved script sets
|
||||
// of id1 and id2.
|
||||
ScriptSet id1RSS;
|
||||
This->getResolvedScriptSet(id1, id1RSS, *status);
|
||||
ScriptSet id2RSS;
|
||||
This->getResolvedScriptSet(id2, id2RSS, *status);
|
||||
|
||||
// Turn on all applicable flags
|
||||
uint32_t result = 0;
|
||||
if (id1RSS.intersects(id2RSS)) {
|
||||
result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
|
||||
} else {
|
||||
result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
|
||||
if (!id1RSS.isEmpty() && !id2RSS.isEmpty()) {
|
||||
result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
|
||||
}
|
||||
}
|
||||
|
||||
// Turn off flags that the user doesn't want
|
||||
return result & This->fChecks;
|
||||
}
|
||||
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uspoof_checkUnicodeString(const USpoofChecker *sc,
|
||||
|
@ -697,6 +782,60 @@ uspoof_getSkeleton(const USpoofChecker *sc,
|
|||
return destStr.length();
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc, UBiDiDirection direction,
|
||||
const UChar *id, int32_t length, UChar *dest,
|
||||
int32_t destCapacity, UErrorCode *status) {
|
||||
UnicodeString idStr((length == -1), id, length); // Aliasing constructor
|
||||
if (idStr.isBogus()) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
UnicodeString destStr;
|
||||
uspoof_getBidiSkeletonUnicodeString(sc, direction, idStr, destStr, status);
|
||||
return destStr.extract(dest, destCapacity, *status);
|
||||
}
|
||||
|
||||
|
||||
|
||||
U_I18N_API UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(const USpoofChecker *sc,
|
||||
UBiDiDirection direction,
|
||||
const UnicodeString &id,
|
||||
UnicodeString &dest,
|
||||
UErrorCode *status) {
|
||||
dest.remove();
|
||||
if (direction != UBIDI_LTR && direction != UBIDI_RTL) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return dest;
|
||||
}
|
||||
UBiDi *bidi = ubidi_open();
|
||||
ubidi_setPara(bidi, id.getBuffer(), id.length(), direction,
|
||||
/*embeddingLevels*/ nullptr, status);
|
||||
if (U_FAILURE(*status)) {
|
||||
ubidi_close(bidi);
|
||||
return dest;
|
||||
}
|
||||
UnicodeString reordered;
|
||||
int32_t const size = ubidi_getProcessedLength(bidi);
|
||||
UChar* const reorderedBuffer = reordered.getBuffer(size);
|
||||
if (reorderedBuffer == nullptr) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
ubidi_close(bidi);
|
||||
return dest;
|
||||
}
|
||||
ubidi_writeReordered(bidi, reorderedBuffer, size,
|
||||
UBIDI_KEEP_BASE_COMBINING | UBIDI_DO_MIRRORING, status);
|
||||
reordered.releaseBuffer(size);
|
||||
ubidi_close(bidi);
|
||||
|
||||
if (U_FAILURE(*status)) {
|
||||
return dest;
|
||||
}
|
||||
|
||||
// The type parameter is deprecated since ICU 58; any number may be passed.
|
||||
constexpr uint32_t deprecatedType = 58;
|
||||
return uspoof_getSkeletonUnicodeString(sc, deprecatedType, reordered, dest, status);
|
||||
}
|
||||
|
||||
|
||||
|
||||
U_I18N_API UnicodeString & U_EXPORT2
|
||||
|
@ -730,12 +869,8 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
|
|||
return dest;
|
||||
}
|
||||
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
|
||||
uint32_t type,
|
||||
const char *id, int32_t length,
|
||||
char *dest, int32_t destCapacity,
|
||||
U_CAPI int32_t U_EXPORT2 uspoof_getSkeletonUTF8(const USpoofChecker *sc, uint32_t type, const char *id,
|
||||
int32_t length, char *dest, int32_t destCapacity,
|
||||
UErrorCode *status) {
|
||||
SpoofImpl::validateThis(sc, *status);
|
||||
if (U_FAILURE(*status)) {
|
||||
|
@ -746,7 +881,8 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
|
|||
return 0;
|
||||
}
|
||||
|
||||
UnicodeString srcStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : static_cast<int32_t>(uprv_strlen(id))));
|
||||
UnicodeString srcStr = UnicodeString::fromUTF8(
|
||||
StringPiece(id, length >= 0 ? length : static_cast<int32_t>(uprv_strlen(id))));
|
||||
UnicodeString destStr;
|
||||
uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status);
|
||||
if (U_FAILURE(*status)) {
|
||||
|
@ -754,8 +890,28 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
|
|||
}
|
||||
|
||||
int32_t lengthInUTF8 = 0;
|
||||
u_strToUTF8(dest, destCapacity, &lengthInUTF8,
|
||||
destStr.getBuffer(), destStr.length(), status);
|
||||
u_strToUTF8(dest, destCapacity, &lengthInUTF8, destStr.getBuffer(), destStr.length(), status);
|
||||
return lengthInUTF8;
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction,
|
||||
const char *id, int32_t length, char *dest,
|
||||
int32_t destCapacity, UErrorCode *status) {
|
||||
if (length < -1) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
UnicodeString srcStr = UnicodeString::fromUTF8(
|
||||
StringPiece(id, length >= 0 ? length : static_cast<int32_t>(uprv_strlen(id))));
|
||||
UnicodeString destStr;
|
||||
uspoof_getBidiSkeletonUnicodeString(sc, direction, srcStr, destStr, status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t lengthInUTF8 = 0;
|
||||
u_strToUTF8(dest, destCapacity, &lengthInUTF8, destStr.getBuffer(), destStr.length(), status);
|
||||
return lengthInUTF8;
|
||||
}
|
||||
|
||||
|
|
|
@ -545,6 +545,26 @@ static void TestUSpoofCAPI(void) {
|
|||
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/*
|
||||
* uspoof_areBidiConfusable()
|
||||
*/
|
||||
TEST_SETUP
|
||||
int32_t checkResults;
|
||||
|
||||
checkResults = uspoof_areBidiConfusable(sc, UBIDI_LTR, scLatin, -1, scMixed, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
|
||||
|
||||
checkResults = uspoof_areBidiConfusable(sc, UBIDI_LTR, goodGreek, -1, scLatin, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(0, checkResults);
|
||||
|
||||
checkResults = uspoof_areBidiConfusable(sc, UBIDI_LTR, lll_Latin_a, -1, lll_Latin_b, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
|
||||
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/*
|
||||
* areConfusableUTF8
|
||||
*/
|
||||
|
@ -577,6 +597,38 @@ static void TestUSpoofCAPI(void) {
|
|||
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/*
|
||||
* areBidiConfusableUTF8
|
||||
*/
|
||||
TEST_SETUP
|
||||
int32_t checkResults;
|
||||
char s1[200];
|
||||
char s2[200];
|
||||
|
||||
|
||||
u_strToUTF8(s1, sizeof(s1), NULL, scLatin, -1, &status);
|
||||
u_strToUTF8(s2, sizeof(s2), NULL, scMixed, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
checkResults = uspoof_areBidiConfusableUTF8(sc, UBIDI_LTR, s1, -1, s2, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
|
||||
|
||||
u_strToUTF8(s1, sizeof(s1), NULL, goodGreek, -1, &status);
|
||||
u_strToUTF8(s2, sizeof(s2), NULL, scLatin, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
checkResults = uspoof_areBidiConfusableUTF8(sc, UBIDI_LTR, s1, -1, s2, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(0, checkResults);
|
||||
|
||||
u_strToUTF8(s1, sizeof(s1), NULL, lll_Latin_a, -1, &status);
|
||||
u_strToUTF8(s2, sizeof(s2), NULL, lll_Latin_b, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
checkResults = uspoof_areBidiConfusableUTF8(sc, UBIDI_LTR, s1, -1, s2, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
|
||||
|
||||
TEST_TEARDOWN;
|
||||
|
||||
|
||||
/*
|
||||
* getSkeleton
|
||||
|
@ -602,6 +654,31 @@ static void TestUSpoofCAPI(void) {
|
|||
|
||||
TEST_TEARDOWN;
|
||||
|
||||
|
||||
/*
|
||||
* getBidiSkeleton
|
||||
*/
|
||||
|
||||
TEST_SETUP
|
||||
UChar dest[100];
|
||||
int32_t skelLength;
|
||||
|
||||
skelLength = uspoof_getBidiSkeleton(sc, UBIDI_LTR, lll_Latin_a, -1, dest, UPRV_LENGTHOF(dest), &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(0, u_strcmp(lll_Skel, dest));
|
||||
TEST_ASSERT_EQ(u_strlen(lll_Skel), skelLength);
|
||||
|
||||
skelLength = uspoof_getBidiSkeletonUTF8(sc, UBIDI_LTR, goodLatinUTF8, -1, (char *)dest,
|
||||
UPRV_LENGTHOF(dest), &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
skelLength = uspoof_getBidiSkeleton(sc, UBIDI_LTR, lll_Latin_a, -1, NULL, 0, &status);
|
||||
TEST_ASSERT_EQ(U_BUFFER_OVERFLOW_ERROR, status);
|
||||
TEST_ASSERT_EQ(3, skelLength);
|
||||
status = U_ZERO_ERROR;
|
||||
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/*
|
||||
* get Inclusion and Recommended sets
|
||||
*/
|
||||
|
|
|
@ -917,7 +917,7 @@ group: charset_detector
|
|||
group: spoof_detection
|
||||
uspoof.o uspoof_build.o uspoof_conf.o uspoof_impl.o scriptset.o
|
||||
deps
|
||||
uniset_props regex unorm uscript
|
||||
uniset_props regex unorm uscript ubidi
|
||||
|
||||
group: alphabetic_index
|
||||
alphaindex.o
|
||||
|
|
|
@ -95,6 +95,7 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
|
|||
TESTCASE_AUTO_BEGIN;
|
||||
TESTCASE_AUTO(testSpoofAPI);
|
||||
TESTCASE_AUTO(testSkeleton);
|
||||
TESTCASE_AUTO(testBidiSkeleton);
|
||||
TESTCASE_AUTO(testAreConfusable);
|
||||
TESTCASE_AUTO(testInvisible);
|
||||
TESTCASE_AUTO(testConfData);
|
||||
|
@ -154,10 +155,13 @@ void IntlTestSpoof::testSpoofAPI() {
|
|||
TEST_TEARDOWN;
|
||||
}
|
||||
|
||||
#define CHECK_SKELETON(type, input, expected) \
|
||||
UPRV_BLOCK_MACRO_BEGIN { checkSkeleton(sc, type, input, expected, __LINE__); } \
|
||||
UPRV_BLOCK_MACRO_END
|
||||
|
||||
#define CHECK_SKELETON(type, input, expected) UPRV_BLOCK_MACRO_BEGIN { \
|
||||
checkSkeleton(sc, type, input, expected, __LINE__); \
|
||||
} UPRV_BLOCK_MACRO_END
|
||||
#define CHECK_BIDI_SKELETON(type, input, expected) \
|
||||
UPRV_BLOCK_MACRO_BEGIN { checkBidiSkeleton(sc, type, input, expected, __LINE__); } \
|
||||
UPRV_BLOCK_MACRO_END
|
||||
|
||||
|
||||
// testSkeleton. Spot check a number of confusable skeleton substitutions from the
|
||||
|
@ -227,6 +231,15 @@ void IntlTestSpoof::testSkeleton() {
|
|||
TEST_TEARDOWN;
|
||||
}
|
||||
|
||||
void IntlTestSpoof::testBidiSkeleton() {
|
||||
TEST_SETUP
|
||||
CHECK_BIDI_SKELETON(u"A1<שׂ", UBIDI_LTR, u"Al<ש\u0307");
|
||||
CHECK_BIDI_SKELETON(u"Αשֺ>1", UBIDI_LTR, u"Al<ש\u0307");
|
||||
CHECK_BIDI_SKELETON(u"A1<שׂ", UBIDI_RTL, u"ש\u0307>Al");
|
||||
CHECK_BIDI_SKELETON(u"Αשֺ>1", UBIDI_RTL, u"l<ש\u0307A");
|
||||
TEST_TEARDOWN;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Run a single confusable skeleton transformation test case.
|
||||
|
@ -252,6 +265,31 @@ void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
|
|||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Run a single confusable bidiSkeleton transformation test case.
|
||||
//
|
||||
void IntlTestSpoof::checkBidiSkeleton(const USpoofChecker *sc, const UnicodeString &input,
|
||||
UBiDiDirection direction, const UnicodeString &expected,
|
||||
int32_t lineNum) {
|
||||
UnicodeString uInput = input.unescape();
|
||||
UnicodeString uExpected = expected.unescape();
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString actual;
|
||||
uspoof_getBidiSkeletonUnicodeString(sc, direction, uInput, actual, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum,
|
||||
u_errorName(status));
|
||||
return;
|
||||
}
|
||||
if (uExpected != actual) {
|
||||
errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
|
||||
__FILE__, __LINE__, lineNum);
|
||||
errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString("\"\n") +
|
||||
UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\""));
|
||||
}
|
||||
}
|
||||
|
||||
void IntlTestSpoof::testAreConfusable() {
|
||||
TEST_SETUP
|
||||
UnicodeString s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. "
|
||||
|
@ -265,6 +303,20 @@ void IntlTestSpoof::testAreConfusable() {
|
|||
TEST_TEARDOWN;
|
||||
}
|
||||
|
||||
// Checks that uspoof_areBidiConfusableUnicodeString, called with UBIDI_LTR,
// reports USPOOF_SINGLE_SCRIPT_CONFUSABLE for "J-2" and a string built from
// J, RLM, 2, EN DASH, RLM.
void IntlTestSpoof::testAreBidiConfusable() {
|
||||
TEST_SETUP
|
||||
const UnicodeString jHyphen2(u"J-2");
|
||||
// The following string has RLMs around the 2–, flipping it; it uses an
|
||||
// EN DASH instead of the HYPHEN-MINUS above.
|
||||
const UnicodeString j2Dash(u"J\u200F2\u2013\u200F");
|
||||
// Same string written without escapes, to show how it appears in source.
TEST_ASSERT(j2Dash == u"J2–");
|
||||
int32_t result = uspoof_areBidiConfusableUnicodeString(sc, UBIDI_LTR, jHyphen2, j2Dash, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, result);
|
||||
|
||||
TEST_TEARDOWN;
|
||||
}
|
||||
|
||||
void IntlTestSpoof::testInvisible() {
|
||||
TEST_SETUP
|
||||
UnicodeString s = UnicodeString("abcd\\u0301ef").unescape();
|
||||
|
|
|
@ -30,8 +30,12 @@ public:
|
|||
|
||||
void testSkeleton();
|
||||
|
||||
void testBidiSkeleton();
|
||||
|
||||
void testAreConfusable();
|
||||
|
||||
void testAreBidiConfusable();
|
||||
|
||||
void testInvisible();
|
||||
|
||||
void testConfData();
|
||||
|
@ -56,9 +60,11 @@ public:
|
|||
|
||||
void testCombiningDot();
|
||||
|
||||
// Internal function to run a single skeleton test case.
|
||||
void checkSkeleton(const USpoofChecker *sc, uint32_t flags,
|
||||
const char *input, const char *expected, int32_t lineNum);
|
||||
// Internal functions to run a single skeleton test case.
|
||||
void checkSkeleton(const USpoofChecker *sc, uint32_t flags, const char *input, const char *expected,
|
||||
int32_t lineNum);
|
||||
void checkBidiSkeleton(const USpoofChecker *sc, const UnicodeString &input, UBiDiDirection direction,
|
||||
const UnicodeString &expected, int32_t lineNum);
|
||||
};
|
||||
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
|
||||
|
|
|
@ -81,6 +81,22 @@ import com.ibm.icu.util.ULocale;
|
|||
* application startup), and the more efficient {@link SpoofChecker#areConfusable} method can be used at runtime.
|
||||
*
|
||||
* <p>
|
||||
* If the paragraph direction used to display the strings is known, it should be passed to {@link SpoofChecker#areConfusable}:
|
||||
*
|
||||
* <pre>
|
||||
* <code>
|
||||
* // These strings look identical when rendered in a left-to-right context.
|
||||
* // They look distinct in a right-to-left context.
|
||||
* String s1 = "A1\u05D0"; // A1א
|
||||
* String s2 = "A\u05D01"; // Aא1
|
||||
*
|
||||
* SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
|
||||
* int result = sc.areConfusable(Bidi.DIRECTION_LEFT_TO_RIGHT, s1, s2);
|
||||
* System.out.println(result != 0); // true
|
||||
* </code>
|
||||
* </pre>
|
||||
*
|
||||
* <p>
|
||||
* UTS 39 defines two strings to be <em>confusable</em> if they map to the same skeleton. A <em>skeleton</em> is a
|
||||
* sequence of families of confusable characters, where each family has a single exemplar character.
|
||||
* {@link SpoofChecker#getSkeleton} computes the skeleton for a particular string, so the following snippet is
|
||||
|
@ -1422,7 +1438,7 @@ public class SpoofChecker {
|
|||
}
|
||||
|
||||
/**
|
||||
* Check the whether two specified strings are visually confusable. The types of confusability to be tested - single
|
||||
* Check whether two specified strings are visually confusable. The types of confusability to be tested - single
|
||||
* script, mixed script, or whole script - are determined by the check options set for the SpoofChecker.
|
||||
*
|
||||
* The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE
|
||||
|
@ -1442,7 +1458,7 @@ public class SpoofChecker {
|
|||
*/
|
||||
public int areConfusable(String s1, String s2) {
|
||||
//
|
||||
// See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
|
||||
// See section 4 of UTS #39 for the algorithm for checking whether two strings are confusable,
|
||||
// and for definitions of the types (single, whole, mixed-script) of confusables.
|
||||
|
||||
// We only care about a few of the check flags. Ignore the others.
|
||||
|
@ -1479,12 +1495,104 @@ public class SpoofChecker {
|
|||
}
|
||||
}
|
||||
|
||||
// Turn off flags that the user doesn't want
|
||||
return result & fChecks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether two specified strings are visually confusable when displayed in a paragraph with the given direction.
|
||||
* The types of confusability to be tested—single script, mixed script, or whole script—are determined by the check options set for the SpoofChecker.
|
||||
*
|
||||
* The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, and
|
||||
* WHOLE_SCRIPT_CONFUSABLE. At least one of these tests must be selected.
|
||||
*
|
||||
* ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case
|
||||
* folded for comparison and display to the user, do not select the ANY_CASE option.
|
||||
*
|
||||
*
|
||||
* @param direction The paragraph direction with which the identifiers are displayed.
|
||||
* Must be either {@link Bidi#DIRECTION_LEFT_TO_RIGHT} or {@link Bidi#DIRECTION_RIGHT_TO_LEFT}.
|
||||
* @param s1
|
||||
* The first of the two strings to be compared for confusability.
|
||||
* @param s2
|
||||
* The second of the two strings to be compared for confusability.
|
||||
* @return Non-zero if s1 and s2 are confusable. If not 0, the value will indicate the type(s) of confusability
|
||||
* found, as defined by spoof check test constants.
|
||||
* @draft ICU 74
|
||||
*/
|
||||
public int areConfusable(int direction, CharSequence s1, CharSequence s2) {
|
||||
//
|
||||
// See section 4 of UTS #39 for the algorithm for checking whether two strings are confusable,
|
||||
// and for definitions of the types (single, whole, mixed-script) of confusables.
|
||||
|
||||
// We only care about a few of the check flags. Ignore the others.
|
||||
// If no tests relevant to this function have been specified, signal an error.
|
||||
// TODO: is this really the right thing to do? It's probably an error on
|
||||
// the caller's part, but logically we would just return 0 (no error).
|
||||
if ((this.fChecks & CONFUSABLE) == 0) {
|
||||
throw new IllegalArgumentException("No confusable checks are enabled.");
|
||||
}
|
||||
|
||||
// Compute the skeletons and check for confusability.
|
||||
String s1Skeleton = getBidiSkeleton(direction, s1);
|
||||
String s2Skeleton = getBidiSkeleton(direction, s2);
|
||||
if (!s1Skeleton.equals(s2Skeleton)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes
|
||||
// of confusables according to UTS 39 section 4.
|
||||
// Start by computing the resolved script sets of s1 and s2.
|
||||
ScriptSet s1RSS = new ScriptSet();
|
||||
getResolvedScriptSet(s1, s1RSS);
|
||||
ScriptSet s2RSS = new ScriptSet();
|
||||
getResolvedScriptSet(s2, s2RSS);
|
||||
|
||||
// Turn on all applicable flags
|
||||
int result = 0;
|
||||
if (s1RSS.intersects(s2RSS)) {
|
||||
result |= SINGLE_SCRIPT_CONFUSABLE;
|
||||
} else {
|
||||
result |= MIXED_SCRIPT_CONFUSABLE;
|
||||
if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) {
|
||||
result |= WHOLE_SCRIPT_CONFUSABLE;
|
||||
}
|
||||
}
|
||||
|
||||
// Turn off flags that the user doesn't want
|
||||
result &= fChecks;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the "bidiSkeleton" for an identifier string and a direction.
|
||||
* Skeletons are a transformation of the input string.
|
||||
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
|
||||
* they are RTL-confusable if their RTL bidiSkeletons are identical.
|
||||
* See Unicode Technical Standard #39 for additional information:
|
||||
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
|
||||
*
|
||||
* Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some
|
||||
* large set of existing identifiers, by creating an efficiently searchable collection of the skeletons.
|
||||
*
|
||||
* Skeletons are computed using the algorithm and data described in UTS #39.
|
||||
*
|
||||
* @param direction The paragraph direction with which the string is displayed.
|
||||
* Must be either {@link Bidi#DIRECTION_LEFT_TO_RIGHT} or {@link Bidi#DIRECTION_RIGHT_TO_LEFT}.
|
||||
* @param str The input string whose bidiSkeleton will be generated.
|
||||
* @return The output skeleton string.
|
||||
*
|
||||
* @draft ICU 74
|
||||
*/
|
||||
public String getBidiSkeleton(int direction, CharSequence str) {
|
||||
if (direction != Bidi.DIRECTION_LEFT_TO_RIGHT && direction != Bidi.DIRECTION_RIGHT_TO_LEFT) {
|
||||
throw new IllegalArgumentException("direction should be DIRECTION_LEFT_TO_RIGHT or DIRECTION_RIGHT_TO_LEFT");
|
||||
}
|
||||
Bidi bidi = new Bidi(str.toString(), direction);
|
||||
return getSkeleton(bidi.writeReordered(Bidi.KEEP_BASE_COMBINING | Bidi.DO_MIRRORING));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; Two strings are
|
||||
* confusable if their skeletons are identical. See Unicode UTS #39 for additional information.
|
||||
|
|
|
@ -36,6 +36,7 @@ import com.ibm.icu.impl.Utility;
|
|||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.Bidi;
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
import com.ibm.icu.text.SpoofChecker;
|
||||
import com.ibm.icu.text.SpoofChecker.CheckResult;
|
||||
|
@ -455,6 +456,16 @@ public class SpoofCheckerTest extends TestFmwk {
|
|||
|
||||
}
|
||||
|
||||
// Spot-check SpoofChecker.getBidiSkeleton through checkBidiSkeleton.
// The two inputs are expected to yield the same skeleton for
// DIRECTION_LEFT_TO_RIGHT and two different skeletons for DIRECTION_RIGHT_TO_LEFT.
@Test
|
||||
public void TestBidiSkeleton() {
|
||||
final SpoofChecker sc = new SpoofChecker.Builder().build();
|
||||
final String testName = "TestBidiSkeleton";
|
||||
// Left-to-right paragraph: both inputs share one expected skeleton.
checkBidiSkeleton(sc, Bidi.DIRECTION_LEFT_TO_RIGHT, "A1<שׂ", "Al<ש\u0307", testName);
|
||||
checkBidiSkeleton(sc, Bidi.DIRECTION_LEFT_TO_RIGHT, "Αשֺ>1", "Al<ש\u0307", testName);
|
||||
// Right-to-left paragraph: the expected skeletons differ.
checkBidiSkeleton(sc, Bidi.DIRECTION_RIGHT_TO_LEFT, "A1<שׂ", "ש\u0307>Al", testName);
|
||||
checkBidiSkeleton(sc, Bidi.DIRECTION_RIGHT_TO_LEFT, "Αשֺ>1", "l<ש\u0307A", testName);
|
||||
}
|
||||
|
||||
// Internal function to run a single skeleton test case.
|
||||
//
|
||||
// Run a single confusable skeleton transformation test case.
|
||||
|
@ -470,6 +481,19 @@ public class SpoofCheckerTest extends TestFmwk {
|
|||
assertEquals(testName + " test at line " + lineNumberOfTest + " : Expected (escaped): " + expected, uExpected, actual);
|
||||
}
|
||||
|
||||
// Internal function to run a single skeleton test case.
|
||||
//
|
||||
// Run a single confusable skeleton transformation test case.
|
||||
//
|
||||
void checkBidiSkeleton(SpoofChecker sc, int direction, String input, String expected, String testName) {
|
||||
assertEquals(
|
||||
"bidiSkeleton(" +
|
||||
(direction == Bidi.DIRECTION_LEFT_TO_RIGHT ? "LTR" : "RTL") +
|
||||
", \"" + input +"\")",
|
||||
expected,
|
||||
sc.getBidiSkeleton(direction, input));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestAreConfusable() {
|
||||
SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
|
||||
|
@ -480,6 +504,21 @@ public class SpoofCheckerTest extends TestFmwk {
|
|||
assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, sc.areConfusable(s1, s2));
|
||||
}
|
||||
|
||||
// Checks that areConfusable with DIRECTION_LEFT_TO_RIGHT reports
// SINGLE_SCRIPT_CONFUSABLE for "J-2" and a string built from
// J, RLM, 2, EN DASH, RLM.
@Test
|
||||
public void TestAreBidiConfusable() {
|
||||
SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
|
||||
final String jHyphen2 = "J-2";
|
||||
// The following string has RLMs around the 2–, flipping it; it uses an
|
||||
// EN DASH instead of the HYPHEN-MINUS above.
|
||||
final String j2Dash = "J\u200F2\u2013\u200F";
|
||||
// Same string written without escapes, to show how it appears in source.
assertEquals("Unescaped display of j2Dash", "J2–", j2Dash);
|
||||
|
||||
assertEquals(
|
||||
"Expected single-script confusability",
|
||||
SpoofChecker.SINGLE_SCRIPT_CONFUSABLE,
|
||||
sc.areConfusable(Bidi.DIRECTION_LEFT_TO_RIGHT, jHyphen2, j2Dash));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestConfusableFlagVariants() {
|
||||
// The spoof checker should only return those tests that the user requested. This test makes sure that
|
||||
|
|
Loading…
Add table
Reference in a new issue