diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in index 253427eabe3..ffb82c0ef4f 100644 --- a/icu4c/source/i18n/Makefile.in +++ b/icu4c/source/i18n/Makefile.in @@ -86,7 +86,7 @@ tmunit.o tmutamt.o tmutfmt.o currpinf.o \ uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o uspoof_wsconf.o decfmtst.o smpdtfst.o \ ztrans.o zrule.o vzone.o fphdlimp.o fpositer.o locdspnm.o \ decNumber.o decContext.o alphaindex.o tznames.o tznames_impl.o tzgnames.o \ -tzfmt.o compactdecimalformat.o gender.o region.o +tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o identifier_info.o ## Header files to install HEADERS = $(srcdir)/unicode/*.h diff --git a/icu4c/source/i18n/i18n.vcxproj b/icu4c/source/i18n/i18n.vcxproj index b1f49722ac3..d583f7bbb26 100644 --- a/icu4c/source/i18n/i18n.vcxproj +++ b/icu4c/source/i18n/i18n.vcxproj @@ -302,6 +302,7 @@ + @@ -323,6 +324,7 @@ + @@ -1546,6 +1548,8 @@ ..\..\include\unicode\%(Filename)%(Extension);%(Outputs) + + @@ -1562,4 +1566,4 @@ - \ No newline at end of file + diff --git a/icu4c/source/i18n/i18n.vcxproj.filters b/icu4c/source/i18n/i18n.vcxproj.filters index 080b9d49797..55688468fe4 100644 --- a/icu4c/source/i18n/i18n.vcxproj.filters +++ b/icu4c/source/i18n/i18n.vcxproj.filters @@ -456,6 +456,12 @@ charset detect + + spoof + + + spoof + spoof @@ -759,6 +765,12 @@ charset detect + + spoof + + + spoof + spoof @@ -1017,4 +1029,4 @@ formatting - \ No newline at end of file + diff --git a/icu4c/source/i18n/identifier_info.cpp b/icu4c/source/i18n/identifier_info.cpp new file mode 100644 index 00000000000..30c73a5b270 --- /dev/null +++ b/icu4c/source/i18n/identifier_info.cpp @@ -0,0 +1,314 @@ +/* +********************************************************************** +* Copyright (C) 2012-2013, International Business Machines +* Corporation and others. All Rights Reserved. 
+********************************************************************** +*/ + +#include "unicode/utypes.h" + +#include "unicode/uchar.h" +#include "unicode/utf16.h" + +#include "identifier_info.h" +#include "mutex.h" +#include "scriptset.h" +#include "ucln_in.h" +#include "uvector.h" + +U_NAMESPACE_BEGIN + +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) + +static UMutex gInitMutex = U_MUTEX_INITIALIZER; +static UBool gStaticsAreInitialized = FALSE; + +UnicodeSet *IdentifierInfo::ASCII; +ScriptSet *IdentifierInfo::JAPANESE; +ScriptSet *IdentifierInfo::CHINESE; +ScriptSet *IdentifierInfo::KOREAN; +ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN; + +UBool IdentifierInfo::cleanup() { + delete ASCII; + ASCII = NULL; + delete JAPANESE; + JAPANESE = NULL; + delete CHINESE; + CHINESE = NULL; + delete KOREAN; + KOREAN = NULL; + delete CONFUSABLE_WITH_LATIN; + CONFUSABLE_WITH_LATIN = NULL; + gStaticsAreInitialized = FALSE; + return TRUE; +} + +U_CDECL_BEGIN +static UBool U_CALLCONV +IdentifierInfo_cleanup(void) { + return IdentifierInfo::cleanup(); +} +U_CDECL_END + + +IdentifierInfo::IdentifierInfo(UErrorCode &status): + fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), + fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { + if (U_FAILURE(status)) { + return; + } + { + Mutex lock(&gInitMutex); + if (!gStaticsAreInitialized) { + ASCII = new UnicodeSet(0, 0x7f); + JAPANESE = new ScriptSet(); + CHINESE = new ScriptSet(); + KOREAN = new ScriptSet(); + CONFUSABLE_WITH_LATIN = new ScriptSet(); + if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL + || CONFUSABLE_WITH_LATIN == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + ASCII->freeze(); + JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) + .set(USCRIPT_KATAKANA, status); + CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); + KOREAN->set(USCRIPT_LATIN, 
status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); + CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) + .set(USCRIPT_CHEROKEE, status); + ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); + gStaticsAreInitialized = TRUE; + } + } + fIdentifier = new UnicodeString(); + fRequiredScripts = new ScriptSet(); + fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); + uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); + fCommonAmongAlternates = new ScriptSet(); + fNumerics = new UnicodeSet(); + fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); + + if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || + fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { + status = U_MEMORY_ALLOCATION_ERROR; + } +}; + +IdentifierInfo::~IdentifierInfo() { + delete fIdentifier; + delete fRequiredScripts; + uhash_close(fScriptSetSet); + delete fCommonAmongAlternates; + delete fNumerics; + delete fIdentifierProfile; +}; + + +IdentifierInfo &IdentifierInfo::clear() { + fRequiredScripts->resetAll(); + uhash_removeAll(fScriptSetSet); + fNumerics->clear(); + fCommonAmongAlternates->resetAll(); + return *this; +} + + +IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { + *fIdentifierProfile = identifierProfile; + return *this; +} + + +const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { + return *fIdentifierProfile; +} + + +IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + *fIdentifier = identifier; + clear(); + ScriptSet scriptsForCP; + UChar32 cp; + for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { + cp = identifier.char32At(i); + // Store a representative character for each kind of decimal digit + if (u_charType(cp) == 
U_DECIMAL_DIGIT_NUMBER) { + // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value + fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); + } + UScriptCode extensions[500]; + int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status); + if (U_FAILURE(status)) { + return *this; + } + scriptsForCP.resetAll(); + for (int32_t j=0; jUnion(scriptsForCP); + break; + default: + if (!fRequiredScripts->intersects(scriptsForCP) + && !uhash_geti(fScriptSetSet, &scriptsForCP)) { + // If the set hasn't been added already, add it + // (Add a copy, fScriptSetSet takes ownership of the copy.) + uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); + } + break; + } + } + // Now make a final pass through ScriptSetSet to remove alternates that came before singles. + // [Kana], [Kana Hira] => [Kana] + // This is relatively infrequent, so doesn't have to be optimized. + // We also compute any commonalities among the alternates. 
+ if (uhash_count(fScriptSetSet) > 0) { + fCommonAmongAlternates->setAll(); + for (int32_t it = -1;;) { + const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); + if (nextHashEl == NULL) { + break; + } + ScriptSet *next = static_cast(nextHashEl->key.pointer); + // [Kana], [Kana Hira] => [Kana] + if (fRequiredScripts->intersects(*next)) { + uhash_removeElement(fScriptSetSet, nextHashEl); + } else { + fCommonAmongAlternates->intersect(*next); + // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] + for (int32_t otherIt = -1;;) { + const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); + if (otherHashEl == NULL) { + break; + } + ScriptSet *other = static_cast(otherHashEl->key.pointer); + if (next != other && next->contains(*other)) { + uhash_removeElement(fScriptSetSet, nextHashEl); + break; + } + } + } + } + } + if (uhash_count(fScriptSetSet) == 0) { + fCommonAmongAlternates->resetAll(); + } + return *this; +} + + +const UnicodeString *IdentifierInfo::getIdentifier() const { + return fIdentifier; +} + +const ScriptSet *IdentifierInfo::getScripts() const { + return fRequiredScripts; +} + +const UHashtable *IdentifierInfo::getAlternates() const { + return fScriptSetSet; +} + + +const UnicodeSet *IdentifierInfo::getNumerics() const { + return fNumerics; +} + +const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { + return fCommonAmongAlternates; +} + +URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { + if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { + return USPOOF_UNRESTRICTIVE; + } + if (ASCII->containsAll(*fIdentifier)) { + return USPOOF_ASCII; + } + // This is a bit tricky. We look at a number of factors. + // The number of scripts in the text. + // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) + // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) 
+ + // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the + // time it is created, in setIdentifier(). + int32_t cardinalityPlus = fRequiredScripts->countMembers() + + (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); + if (cardinalityPlus < 2) { + return USPOOF_HIGHLY_RESTRICTIVE; + } + if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) + || containsWithAlternates(*KOREAN, *fRequiredScripts)) { + return USPOOF_HIGHLY_RESTRICTIVE; + } + if (cardinalityPlus == 2 && + fRequiredScripts->test(USCRIPT_LATIN, status) && + !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { + return USPOOF_MODERATELY_RESTRICTIVE; + } + return USPOOF_MINIMALLY_RESTRICTIVE; +} + +int32_t IdentifierInfo::getScriptCount() const { + // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. + int32_t count = fRequiredScripts->countMembers() + + (fCommonAmongAlternates->countMembers() == 0 ? 
uhash_count(fScriptSetSet) : 1); + return count; +} + + + +UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { + if (!container.contains(containee)) { + return FALSE; + } + for (int32_t iter = -1; ;) { + const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); + if (hashEl == NULL) { + break; + } + ScriptSet *alternatives = static_cast(hashEl->key.pointer); + if (!container.intersects(*alternatives)) { + return false; + } + } + return true; +} + +UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { + UVector sorted(status); + if (U_FAILURE(status)) { + return dest; + } + for (int32_t pos = -1; ;) { + const UHashElement *el = uhash_nextElement(alternates, &pos); + if (el == NULL) { + break; + } + ScriptSet *ss = static_cast(el->key.pointer); + sorted.addElement(ss, status); + } + sorted.sort(uhash_compareScriptSet, status); + UnicodeString separator = UNICODE_STRING_SIMPLE("; "); + for (int32_t i=0; i0) { + dest.append(separator); + } + ScriptSet *ss = static_cast(sorted.elementAt(i)); + ss->displayScripts(dest); + } + return dest; +} + +U_NAMESPACE_END + diff --git a/icu4c/source/i18n/identifier_info.h b/icu4c/source/i18n/identifier_info.h new file mode 100644 index 00000000000..49c98217209 --- /dev/null +++ b/icu4c/source/i18n/identifier_info.h @@ -0,0 +1,200 @@ +/* +********************************************************************** +* Copyright (C) 2013, International Business Machines +* Corporation and others. All Rights Reserved. 
+********************************************************************** +* +* identifier_info.h +* +* created on: 2013 Jan 7 +* created by: Andy Heninger +*/ + +#ifndef __IDENTIFIER_INFO_H__ +#define __IDENTIFIER_INFO_H__ + +#include "unicode/utypes.h" + +#include "unicode/uniset.h" +#include "unicode/uspoof.h" +#include "uhash.h" + +U_NAMESPACE_BEGIN + +class ScriptSet; + +// TODO(andy): review consistency of reference vs pointer arguments to the functions. + +/** + * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile + * then setIdentifier. Available methods include: + *
<ol>
+ * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in + * each of these.
+ * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be + * either Katakana or Hiragana.
+ * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
+ * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in + * the identifier.
+ * <li>call getRestrictionLevel to see what the UTS36 restriction level is.
+ * </ol>
+ * + * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo + */ +class U_I18N_API IdentifierInfo : public UMemory { + + public: + /** + * Create an identifier info object. Subsequently, call setIdentifier(), etc. + * @internal + */ + IdentifierInfo(UErrorCode &status); + + /** + * Destructor + */ + virtual ~IdentifierInfo(); + + private: + /* Disallow copying for now. Can be added if there's a need. */ + IdentifierInfo(const IdentifierInfo &other); + + public: + + /** + * Set the identifier profile: the characters that are to be allowed in the identifier. + * + * @param identifierProfile the characters that are to be allowed in the identifier + * @return this + * @internal + */ + IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile); + + /** + * Get the identifier profile: the characters that are to be allowed in the identifier. + * + * @return The characters that are to be allowed in the identifier. + * @internal + */ + const UnicodeSet &getIdentifierProfile() const; + + + /** + * Set an identifier to analyze. Afterwards, call methods like getScripts() + * + * @param identifier the identifier to analyze + * @param status Errorcode, set if errors occur. + * @return this + * @internal + */ + IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status); + + + /** + * Get the identifier that was analyzed. The returned string is owned by the ICU library, + * and must not be deleted by the caller. + * + * @return the identifier that was analyzed. + * @internal + */ + const UnicodeString *getIdentifier() const; + + + /** + * Get the scripts found in the identifiers. + * + * @return the set of explicit scripts. + * @internal + */ + const ScriptSet *getScripts() const; + + /** + * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then + * the set consisting of those scripts will be returned. + * + * @return a uhash, with each key being of type (ScriptSet *). 
+ * This is a set, not a map, so the value stored in the uhash is not relevant. + * (It is, in fact, 1). + * Ownership of the uhash and its contents remains with the IdentifierInfo object, + * and remains valid until a new identifier is set or until the object is deleted. + * @internal + */ + const UHashtable *getAlternates() const; + + /** + * Get the representative characters (zeros) for the numerics found in the identifier. + * + * @return the set of representative zero digits, one per decimal number system found. + * @internal + */ + const UnicodeSet *getNumerics() const; + + /** + * Find out which scripts are in common among the alternates. + * + * @return the set of scripts that are in common among the alternates. + * @internal + */ + const ScriptSet *getCommonAmongAlternates() const; + + /** + * Get the number of scripts appearing in the identifier. + * Note: Common and Inherited scripts are omitted from the count. + * Note: Result may be high when the identifier contains characters + * with alternate scripts. The distinction between + * 0, 1 and > 1 will remain valid, however. + * @return the number of scripts. + */ + int32_t getScriptCount() const; + + /** + * Find the "tightest" restriction level that the identifier satisfies. + * + * @return the restriction level. + * @internal + */ + URestrictionLevel getRestrictionLevel(UErrorCode &status) const; + + UnicodeString toString() const; + + /** + * Produce a readable string of alternates. + * + * @param alternates a UHashtable of ScriptSets. + * Keys only, no meaningful values in the UHash. + * @return display form + * @internal + */ + static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status); + + /** + * Static memory cleanup function. 
+ * @internal + */ + static UBool cleanup(); + private: + + IdentifierInfo & clear(); + UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const; + + UnicodeString *fIdentifier; + ScriptSet *fRequiredScripts; + UHashtable *fScriptSetSet; + ScriptSet *fCommonAmongAlternates; + UnicodeSet *fNumerics; + UnicodeSet *fIdentifierProfile; + + static UnicodeSet *ASCII; + static ScriptSet *JAPANESE; + static ScriptSet *CHINESE; + static ScriptSet *KOREAN; + static ScriptSet *CONFUSABLE_WITH_LATIN; + + + +}; + +U_NAMESPACE_END + +#endif // __IDENTIFIER_INFO_H__ + diff --git a/icu4c/source/i18n/scriptset.cpp b/icu4c/source/i18n/scriptset.cpp new file mode 100644 index 00000000000..809e5f63a9f --- /dev/null +++ b/icu4c/source/i18n/scriptset.cpp @@ -0,0 +1,276 @@ +/* +********************************************************************** +* Copyright (C) 2013, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* +* scriptset.cpp +* +* created on: 2013 Jan 7 +* created by: Andy Heninger +*/ + +#include "unicode/utypes.h" + +#include "unicode/uchar.h" +#include "unicode/unistr.h" + +#include "scriptset.h" +#include "uassert.h" + +U_NAMESPACE_BEGIN + +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) + +//---------------------------------------------------------------------------- +// +// ScriptSet implementation +// +//---------------------------------------------------------------------------- +ScriptSet::ScriptSet() { + for (uint32_t i=0; i= (int32_t)sizeof(bits) * 8) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return FALSE; + } + uint32_t index = script / 32; + uint32_t bit = 1 << (script & 31); + return ((bits[index] & bit) != 0); +} + + +ScriptSet &ScriptSet::set(UScriptCode script, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + if (script < 0 || script >= (int32_t)sizeof(bits) * 8) { + status = 
U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + uint32_t index = script / 32; + uint32_t bit = 1 << (script & 31); + bits[index] |= bit; + return *this; +} + +ScriptSet &ScriptSet::reset(UScriptCode script, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + if (script < 0 || script >= (int32_t)sizeof(bits) * 8) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + uint32_t index = script / 32; + uint32_t bit = 1 << (script & 31); + bits[index] &= ~bit; + return *this; +} + + + +ScriptSet &ScriptSet::Union(const ScriptSet &other) { + for (uint32_t i=0; iintersect(t); + } + return *this; +} + +UBool ScriptSet::intersects(const ScriptSet &other) const { + for (uint32_t i=0; i 0) { + count++; + x &= (x - 1); // and off the least significant one bit. + } + } + return count; +} + +int32_t ScriptSet::hashCode() const { + int32_t hash = 0; + for (int32_t i=0; i= 0; i = nextSetBit(i + 1)) { + if (!firstTime) { + dest.append(0x20); + } + firstTime = FALSE; + const char *scriptName = uscript_getShortName((UScriptCode(i))); + dest.append(UnicodeString(scriptName, -1, US_INV)); + } + return dest; +} + +ScriptSet &ScriptSet::parseScripts(const UnicodeString &scriptString, UErrorCode &status) { + resetAll(); + if (U_FAILURE(status)) { + return *this; + } + UnicodeString oneScriptName; + for (int32_t i=0; i 0) { + char buf[40]; + oneScriptName.extract(0, oneScriptName.length(), buf, sizeof(buf)-1, US_INV); + buf[sizeof(buf)-1] = 0; + int32_t sc = u_getPropertyValueEnum(UCHAR_SCRIPT, buf); + if (sc == UCHAR_INVALID_CODE) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } else { + this->set((UScriptCode)sc, status); + } + if (U_FAILURE(status)) { + return *this; + } + oneScriptName.remove(); + } + } + return *this; +} + +U_NAMESPACE_END + +U_CAPI UBool U_EXPORT2 +uhash_equalsScriptSet(const UElement key1, const UElement key2) { + icu::ScriptSet *s1 = static_cast(key1.pointer); + icu::ScriptSet *s2 = static_cast(key2.pointer); + return (*s1 == *s2); +} + +U_CAPI 
int8_t U_EXPORT2 +uhash_compareScriptSet(UElement key0, UElement key1) { + icu::ScriptSet *s0 = static_cast(key0.pointer); + icu::ScriptSet *s1 = static_cast(key1.pointer); + int32_t diff = s0->countMembers() - s1->countMembers(); + if (diff != 0) return diff; + int32_t i0 = s0->nextSetBit(0); + int32_t i1 = s1->nextSetBit(0); + while ((diff = i0-i1) == 0 && i0 > 0) { + i0 = s0->nextSetBit(i0+1); + i1 = s1->nextSetBit(i1+1); + } + return (int8_t)diff; +} + +U_CAPI int32_t U_EXPORT2 +uhash_hashScriptSet(const UElement key) { + icu::ScriptSet *s = static_cast(key.pointer); + return s->hashCode(); +} + +U_CAPI void U_EXPORT2 +uhash_deleteScriptSet(void *obj) { + icu::ScriptSet *s = static_cast(obj); + delete s; +} diff --git a/icu4c/source/i18n/scriptset.h b/icu4c/source/i18n/scriptset.h new file mode 100644 index 00000000000..62af5d591d1 --- /dev/null +++ b/icu4c/source/i18n/scriptset.h @@ -0,0 +1,76 @@ +/* +********************************************************************** +* Copyright (C) 2013, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* +* scriptset.h +* +* created on: 2013 Jan 7 +* created by: Andy Heninger +*/ + +#ifndef __SCRIPTSET_H__ +#define __SCRIPTSET_H__ + +#include "unicode/utypes.h" +#include "unicode/uobject.h" +#include "unicode/uscript.h" + +#include "uelement.h" + +U_NAMESPACE_BEGIN + +//------------------------------------------------------------------------------- +// +// ScriptSet - A bit set representing a set of scripts. +// +// This class was originally used exclusively with script sets appearing +// as part of the spoof check whole script confusable binary data. Its +// use has since become more general, but the continued use to wrap +// prebuilt binary data does constrain the design. 
+// +//------------------------------------------------------------------------------- +class U_I18N_API ScriptSet: public UMemory { + public: + ScriptSet(); + ScriptSet(const ScriptSet &other); + ~ScriptSet(); + + UBool operator == (const ScriptSet &other) const; + ScriptSet & operator = (const ScriptSet &other); + + UBool test(UScriptCode script, UErrorCode &status) const; + ScriptSet &Union(const ScriptSet &other); + ScriptSet &set(UScriptCode script, UErrorCode &status); + ScriptSet &reset(UScriptCode script, UErrorCode &status); + ScriptSet &intersect(const ScriptSet &other); + ScriptSet &intersect(UScriptCode script, UErrorCode &status); + UBool intersects(const ScriptSet &other) const; // Sets contain at least one script in common. + UBool contains(const ScriptSet &other) const; // All set bits in other are also set in this. + + ScriptSet &setAll(); + ScriptSet &resetAll(); + int32_t countMembers() const; + int32_t hashCode() const; + int32_t nextSetBit(int32_t script) const; + + UnicodeString &displayScripts(UnicodeString &dest) const; // append script names to dest string. + ScriptSet & parseScripts(const UnicodeString &scriptsString, UErrorCode &status); // Replaces ScriptSet contents. + + private: + uint32_t bits[6]; +}; + +U_NAMESPACE_END + +U_CAPI UBool U_EXPORT2 +uhash_compareScriptSet(const UElement key1, const UElement key2); + +U_CAPI int32_t U_EXPORT2 +uhash_hashScriptSet(const UElement key); + +U_CAPI void U_EXPORT2 +uhash_deleteScriptSet(void *obj); + +#endif // __SCRIPTSET_H__ diff --git a/icu4c/source/i18n/ucln_in.h b/icu4c/source/i18n/ucln_in.h index 494f73e2aac..ee0846702e9 100644 --- a/icu4c/source/i18n/ucln_in.h +++ b/icu4c/source/i18n/ucln_in.h @@ -26,6 +26,8 @@ as the functions are suppose to be called. It's usually best to have child dependencies called first. 
*/ typedef enum ECleanupI18NType { UCLN_I18N_START = -1, + UCLN_I18N_IDENTIFIER_INFO, + UCLN_I18N_SPOOF, UCLN_I18N_TRANSLITERATOR, UCLN_I18N_REGEX, UCLN_I18N_ISLAMIC_CALENDAR, diff --git a/icu4c/source/i18n/unicode/alphaindex.h b/icu4c/source/i18n/unicode/alphaindex.h index a4dbbbd50cd..e6b4cd78075 100644 --- a/icu4c/source/i18n/unicode/alphaindex.h +++ b/icu4c/source/i18n/unicode/alphaindex.h @@ -180,6 +180,21 @@ class U_I18N_API AlphabeticIndex: public UObject { */ AlphabeticIndex(const Locale &locale, UErrorCode &status); + /** + * Construct an AlphabeticIndex that uses a specific collator. + * + * The index will be created with no labels; the addLabels() function must be called + * after creation to add the desired labels to the index. + * + * The index adopts the collator, and is responsible for deleting it. + * The caller should make no further use of the collator after creating the index. + * + * @param collator The collator to use to order the contents of this index. + * @param status Error code, will be set with the reason if the + * operation fails. + * @draft ICU 51 + */ + AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status); /** * Construct an AlphabeticIndex that uses a specific collator. diff --git a/icu4c/source/i18n/unicode/uspoof.h b/icu4c/source/i18n/unicode/uspoof.h index 0a0c2ff37b8..08d7be884bb 100644 --- a/icu4c/source/i18n/unicode/uspoof.h +++ b/icu4c/source/i18n/unicode/uspoof.h @@ -1,6 +1,6 @@ /* *************************************************************************** -* Copyright (C) 2008-2012, International Business Machines Corporation +* Copyright (C) 2008-2013, International Business Machines Corporation * and others. All Rights Reserved. *************************************************************************** * file name: uspoof.h @@ -188,11 +188,27 @@ typedef enum USpoofChecks { Any Case Confusable. */ USPOOF_ANY_CASE = 8, + /** + * Check that an identifier is no looser than the specified RestrictionLevel. 
+ * The default if uspoof_setRestrictionLevel() is not called is HIGHLY_RESTRICTIVE. + * + * If USPOOF_AUX_INFO is enabled the actual restriction level of the + * identifier being tested will also be returned by uspoof_check(). + * + * @see URestrictionLevel + * @see uspoof_setRestrictionLevel + * @see USPOOF_AUX_INFO + * + * @stable ICU 51 + */ + USPOOF_RESTRICTION_LEVEL = 16, + /** Check that an identifier contains only characters from a * single script (plus chars from the common and inherited scripts.) * Applies to checks of a single identifier check only. + * @deprecated ICU 51 Use RESTRICTION_LEVEL instead. */ - USPOOF_SINGLE_SCRIPT = 16, + USPOOF_SINGLE_SCRIPT = USPOOF_RESTRICTION_LEVEL, /** Check an identifier for the presence of invisible characters, * such as zero-width spaces, or character sequences that are @@ -208,10 +224,78 @@ typedef enum USpoofChecks { */ USPOOF_CHAR_LIMIT = 64, - USPOOF_ALL_CHECKS = 0x7f + /** + * Check that an identifier does not include decimal digits from + * more than one numbering system. + * + * @draft ICU 51 + */ + USPOOF_MIXED_NUMBERS = 128, + + /** + * Enable all spoof checks. + * + * @stable ICU 4.6 + */ + USPOOF_ALL_CHECKS = 0xFFFF, + + /** + * Enable the return of auxiliary (non-error) information in the + * upper bits of the check results value. + * + * If this "check" is not enabled, the results of uspoof_check() will be zero when an + * identifier passes all of the enabled checks. + * + * If this "check" is enabled, (uspoof_check() & USPOOF_ALL_CHECKS) will be zero + * when an identifier passes all checks. + * + * @draft ICU 51 + */ + USPOOF_AUX_INFO = 0x40000000 + } USpoofChecks; + /** + * Constants from UAX #39 for use in setRestrictionLevel(), and + * for returned identifier restriction levels in check results. 
+ * @draft ICU 51 + */ + typedef enum URestrictionLevel { + /** + * Only ASCII characters: U+0000..U+007F + * + * @draft ICU 51 + */ + USPOOF_ASCII = 0x10000000, + /** + * All characters in each identifier must be from a single script, or from the combinations: Latin + Han + + * Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul. Note that this level will satisfy the + * vast majority of Latin-script users; also that TR36 has ASCII instead of Latin. + * + * @draft ICU 51 + */ + USPOOF_HIGHLY_RESTRICTIVE = 0x20000000, + /** + * Allow Latin with other scripts except Cyrillic, Greek, Cherokee. Otherwise, the same as Highly Restrictive. + * + * @draft ICU 51 + */ + USPOOF_MODERATELY_RESTRICTIVE = 0x30000000, + /** + * Allow arbitrary mixtures of scripts. Otherwise, the same as Moderately Restrictive. + * + * @draft ICU 51 + */ + USPOOF_MINIMALLY_RESTRICTIVE = 0x40000000, + /** + * Any valid identifiers, including characters outside of the Identifier Profile. + * + * @draft ICU 51 + */ + USPOOF_UNRESTRICTIVE = 0x50000000 + } URestrictionLevel; + /** * Create a Unicode Spoof Checker, configured to perform all * checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT. @@ -255,7 +339,7 @@ uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLeng * Open a Spoof Checker from the source form of the spoof data. * The Three inputs correspond to the Unicode data files confusables.txt * confusablesWholeScript.txt and xidmdifications.txt as described in - * Unicode UAX 39. The syntax of the source data is as described in UAX 39 for + * Unicode UAX #39. The syntax of the source data is as described in UAX #39 for * these files, and the content of these files is acceptable input. * * The character encoding of the (char *) input text is UTF-8. 
@@ -357,6 +441,28 @@ uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status); U_STABLE int32_t U_EXPORT2 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status); +/** + * Set the loosest restriction level allowed. The default if this function + * is not called is HIGHLY_RESTRICTIVE. + * Calling this function also enables the RESTRICTION_LEVEL check. + * @param restrictionLevel The loosest restriction level allowed. + * @see URestrictionLevel + * @draft ICU 51 + */ +U_DRAFT void U_EXPORT2 +uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel); + + +/** + * Get the Restriction Level that will be tested if the checks include RESTRICTION_LEVEL. + * + * @return The restriction level + * @see URestrictionLevel + * @draft ICU 51 + */ +U_DRAFT URestrictionLevel U_EXPORT2 +uspoof_getRestrictionLevel(const USpoofChecker *sc); + /** * Limit characters that are acceptable in identifiers being checked to those * normally used with the languages associated with the specified locales. @@ -488,7 +594,7 @@ uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status); * characters that are permitted. Ownership of the set * remains with the caller. The incoming set is cloned by * this function, so there are no restrictions on modifying - * or deleting the USet after calling this function. + * or deleting the UnicodeSet after calling this function. * @param status The error code, set if this function encounters a problem. * @stable ICU 4.2 */ @@ -527,31 +633,29 @@ uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status); * The set of checks to be performed is specified with uspoof_setChecks(). * * @param sc The USpoofChecker - * @param text The string to be checked for possible security issues, + * @param id The identifier to be checked for possible security issues, * in UTF-16 format. 
* @param length the length of the string to be checked, expressed in * 16 bit UTF-16 code units, or -1 if the string is * zero terminated. - * @param position An out parameter that receives the index of the - * first string position that fails the allowed character - * limitation checks. - * This parameter may be null if the position information - * is not needed. - * If the string passes the requested checks the - * parameter value will not be set. + * @param position An out parameter. + * Originally, the index of the first string position that failed a check. + * Now, always returns zero. + * This parameter may be null. * @param status The error code, set if an error occurred while attempting to * perform the check. * Spoofing or security issues detected with the input string are * not reported here, but through the function's return value. * @return An integer value with bits set for any potential security * or spoofing issues detected. The bits are defined by - * enum USpoofChecks. Zero is returned if no issues - * are found with the input string. + * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) + * will be zero if the input string passes all of the + * enabled checks. * @stable ICU 4.2 */ U_STABLE int32_t U_EXPORT2 uspoof_check(const USpoofChecker *sc, - const UChar *text, int32_t length, + const UChar *id, int32_t length, int32_t *position, UErrorCode *status); @@ -562,16 +666,14 @@ uspoof_check(const USpoofChecker *sc, * The set of checks to be performed is specified with uspoof_setChecks(). * * @param sc The USpoofChecker - * @param text A UTF-8 string to be checked for possible security issues. + * @param id A identifier to be checked for possible security issues, in UTF8 format. * @param length the length of the string to be checked, or -1 if the string is * zero terminated. - * @param position An out parameter that receives the index of the - * first string position that fails the allowed character - * limitation checks. 
- * This parameter may be null if the position information - * is not needed. - * If the string passes the requested checks the - * parameter value will not be set. + * @param position An out parameter. + * Originally, the index of the first string position that failed a check. + * Now, always returns zero. + * This parameter may be null. + * @deprecated ICU 51 * @param status The error code, set if an error occurred while attempting to * perform the check. * Spoofing or security issues detected with the input string are @@ -580,13 +682,14 @@ uspoof_check(const USpoofChecker *sc, * a status of U_INVALID_CHAR_FOUND will be returned. * @return An integer value with bits set for any potential security * or spoofing issues detected. The bits are defined by - * enum USpoofChecks. Zero is returned if no issues - * are found with the input string. + * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) + * will be zero if the input string passes all of the + * enabled checks. * @stable ICU 4.2 */ U_STABLE int32_t U_EXPORT2 uspoof_checkUTF8(const USpoofChecker *sc, - const char *text, int32_t length, + const char *id, int32_t length, int32_t *position, UErrorCode *status); @@ -598,28 +701,26 @@ uspoof_checkUTF8(const USpoofChecker *sc, * The set of checks to be performed is specified with uspoof_setChecks(). * * @param sc The USpoofChecker - * @param text A UnicodeString to be checked for possible security issues. - * @param position An out parameter that receives the index of the - * first string position that fails the allowed character - * limitation checks. - * This parameter may be null if the position information - * is not needed. - * If the string passes the requested checks the - * parameter value will not be set. + * @param id A identifier to be checked for possible security issues. + * @param position An out parameter. + * Originally, the index of the first string position that failed a check. + * Now, always returns zero. + * This parameter may be null. 
+ * @deprecated ICU 51 * @param status The error code, set if an error occurred while attempting to * perform the check. * Spoofing or security issues detected with the input string are * not reported here, but through the function's return value. - * @return An integer value with bits set for any potential security * or spoofing issues detected. The bits are defined by - * enum USpoofChecks. Zero is returned if no issues - * are found with the input string. + * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) + * will be zero if the input string passes all of the + * enabled checks. * @stable ICU 4.2 */ U_STABLE int32_t U_EXPORT2 uspoof_checkUnicodeString(const USpoofChecker *sc, - const icu::UnicodeString &text, + const icu::UnicodeString &id, int32_t *position, UErrorCode *status); @@ -645,30 +746,30 @@ uspoof_checkUnicodeString(const USpoofChecker *sc, * * * @param sc The USpoofChecker - * @param s1 The first of the two strings to be compared for + * @param id1 The first of the two identifiers to be compared for * confusability. The strings are in UTF-16 format. - * @param length1 the length of the first string, expressed in + * @param length1 the length of the first identifier, expressed in * 16 bit UTF-16 code units, or -1 if the string is - * zero terminated. - * @param s2 The second of the two strings to be compared for - * confusability. The strings are in UTF-16 format. - * @param length2 The length of the second string, expressed in + * nul terminated. + * @param id2 The second of the two identifiers to be compared for + * confusability. The identifiers are in UTF-16 format. + * @param length2 The length of the second identifier, expressed in * 16 bit UTF-16 code units, or -1 if the string is - * zero terminated. + * nul terminated. * @param status The error code, set if an error occurred while attempting to * perform the check.
- * Confusability of the strings is not reported here, + * Confusability of the identifiers is not reported here, * but through this function's return value. * @return An integer value with bit(s) set corresponding to * the type of confusability found, as defined by - * enum USpoofChecks. Zero is returned if the strings + * enum USpoofChecks. Zero is returned if the identifiers * are not confusable. * @stable ICU 4.2 */ U_STABLE int32_t U_EXPORT2 uspoof_areConfusable(const USpoofChecker *sc, - const UChar *s1, int32_t length1, - const UChar *s2, int32_t length2, + const UChar *id1, int32_t length1, + const UChar *id2, int32_t length2, UErrorCode *status); @@ -680,14 +781,14 @@ uspoof_areConfusable(const USpoofChecker *sc, * USpoofChecker. * * @param sc The USpoofChecker - * @param s1 The first of the two strings to be compared for + * @param id1 The first of the two identifiers to be compared for + * confusability. The strings are in UTF-8 format. + * @param length1 the length of the first identifier, in bytes, or -1 + * if the string is nul terminated. + * @param id2 The second of the two identifiers to be compared for * confusability. The strings are in UTF-8 format. - * @param length1 the length of the first string, in bytes, or -1 - * if the string is zero terminated. - * @param s2 The second of the two strings to be compared for - * confusability. The strings are in UTF-18 format. * @param length2 The length of the second string in bytes, or -1 - * if the string is zero terminated. + * if the string is nul terminated. * @param status The error code, set if an error occurred while attempting to * perform the check.
* Confusability of the strings is not reported here, @@ -700,8 +801,8 @@ uspoof_areConfusable(const USpoofChecker *sc, */ U_STABLE int32_t U_EXPORT2 uspoof_areConfusableUTF8(const USpoofChecker *sc, - const char *s1, int32_t length1, - const char *s2, int32_t length2, + const char *id1, int32_t length1, + const char *id2, int32_t length2, UErrorCode *status); @@ -715,17 +816,17 @@ uspoof_areConfusableUTF8(const USpoofChecker *sc, * USpoofChecker. * * @param sc The USpoofChecker - * @param s1 The first of the two strings to be compared for + * @param id1 The first of the two identifiers to be compared for + * confusability. The strings are in UTF-8 format. + * @param id2 The second of the two identifiers to be compared for * confusability. The strings are in UTF-8 format. - * @param s2 The second of the two strings to be compared for - * confusability. The strings are in UTF-18 format. * @param status The error code, set if an error occurred while attempting to * perform the check. - * Confusability of the strings is not reported here, + * Confusability of the identifiers is not reported here, * but through this function's return value. * @return An integer value with bit(s) set corresponding to * the type of confusability found, as defined by - * enum USpoofChecks. Zero is returned if the strings + * enum USpoofChecks. Zero is returned if the identifiers * are not confusable. * @stable ICU 4.2 */ @@ -738,10 +839,10 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc, /** - * Get the "skeleton" for an identifier string. - * Skeletons are a transformation of the input string; - * Two strings are confusable if their skeletons are identical. - * See Unicode UAX 39 for additional information. + * Get the "skeleton" for an identifier. + * Skeletons are a transformation of the input identifier; + * Two identifiers are confusable if their skeletons are identical. + * See Unicode UAX #39 for additional information. 
* * Using skeletons directly makes it possible to quickly check * whether an identifier is confusable with any of some large @@ -754,8 +855,8 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc, * The default is Mixed-Script, Lowercase. * Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and * USPOOF_ANY_CASE_CONFUSABLE. The two flags may be ORed. - * @param s The input string whose skeleton will be computed. - * @param length The length of the input string, expressed in 16 bit + * @param id The input identifier whose skeleton will be computed. + * @param length The length of the input identifier, expressed in 16 bit * UTF-16 code units, or -1 if the string is zero terminated. * @param dest The output buffer, to receive the skeleton string. * @param destCapacity The length of the output buffer, in 16 bit units. @@ -772,15 +873,15 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc, U_STABLE int32_t U_EXPORT2 uspoof_getSkeleton(const USpoofChecker *sc, uint32_t type, - const UChar *s, int32_t length, + const UChar *id, int32_t length, UChar *dest, int32_t destCapacity, UErrorCode *status); /** - * Get the "skeleton" for an identifier string. - * Skeletons are a transformation of the input string; - * Two strings are confusable if their skeletons are identical. - * See Unicode UAX 39 for additional information. + * Get the "skeleton" for an identifier. + * Skeletons are a transformation of the input identifier; + * Two identifiers are confusable if their skeletons are identical. + * See Unicode UAX #39 for additional information. * * Using skeletons directly makes it possible to quickly check * whether an identifier is confusable with any of some large @@ -793,7 +894,7 @@ uspoof_getSkeleton(const USpoofChecker *sc, * The default is Mixed-Script, Lowercase. * Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and * USPOOF_ANY_CASE. The two flags may be ORed. - * @param s The UTF-8 format input string whose skeleton will be computed. 
+ * @param id The UTF-8 format identifier whose skeleton will be computed. * @param length The length of the input string, in bytes, * or -1 if the string is zero terminated. * @param dest The output buffer, to receive the skeleton string. @@ -814,16 +915,16 @@ uspoof_getSkeleton(const USpoofChecker *sc, U_STABLE int32_t U_EXPORT2 uspoof_getSkeletonUTF8(const USpoofChecker *sc, uint32_t type, - const char *s, int32_t length, + const char *id, int32_t length, char *dest, int32_t destCapacity, UErrorCode *status); #if U_SHOW_CPLUSPLUS_API /** - * Get the "skeleton" for an identifier string. - * Skeletons are a transformation of the input string; - * Two strings are confusable if their skeletons are identical. - * See Unicode UAX 39 for additional information. + * Get the "skeleton" for an identifier. + * Skeletons are a transformation of the input identifier; + * Two identifiers are confusable if their skeletons are identical. + * See Unicode UAX #39 for additional information. * * Using skeletons directly makes it possible to quickly check * whether an identifier is confusable with any of some large @@ -836,8 +937,8 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc, * The default is Mixed-Script, Lowercase. * Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and * USPOOF_ANY_CASE_CONFUSABLE. The two flags may be ORed. - * @param s The input string whose skeleton will be computed. - * @param dest The output string, to receive the skeleton string. + * @param id The input identifier whose skeleton will be computed. + * @param dest The output identifier, to receive the skeleton string. * @param status The error code, set if an error occurred while attempting to * perform the check. * @return A reference to the destination (skeleton) string. 
@@ -847,17 +948,83 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc, U_I18N_API icu::UnicodeString & U_EXPORT2 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, uint32_t type, - const icu::UnicodeString &s, + const icu::UnicodeString &id, icu::UnicodeString &dest, UErrorCode *status); #endif /* U_SHOW_CPLUSPLUS_API */ +/** + * Get the set of Candidate Characters for Inclusion in Identifiers, as defined + * in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers + * + * The returned set is frozen. Ownership of the set remains with the ICU library; it must not + * be deleted by the caller. + * + * @param status The error code, set if a problem occurs while creating the set. + * + * @draft ICU 51 + */ +U_DRAFT const USet * U_EXPORT2 +uspoof_getInclusionSet(UErrorCode *status); + +/** + * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined + * in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Recommended_Scripts + * + * The returned set is frozen. Ownership of the set remains with the ICU library; it must not + * be deleted by the caller. + * + * @param status The error code, set if a problem occurs while creating the set. + * + * @draft ICU 51 + */ +U_DRAFT const USet * U_EXPORT2 +uspoof_getRecommendedSet(UErrorCode *status); + + + +#if U_SHOW_CPLUSPLUS_API + +/** + * Get the set of Candidate Characters for Inclusion in Identifiers, as defined + * in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers + * + * The returned set is frozen. Ownership of the set remains with the ICU library; it must not + * be deleted by the caller. + * + * @param status The error code, set if a problem occurs while creating the set. 
+ * + * @draft ICU 51 + */ +U_DRAFT const UnicodeSet * U_EXPORT2 +uspoof_getInclusionUnicodeSet(UErrorCode *status); + +/** + * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined + * in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Recommended_Scripts + * + * The returned set is frozen. Ownership of the set remains with the ICU library; it must not + * be deleted by the caller. + * + * @param status The error code, set if a problem occurs while creating the set. + * + * @draft ICU 51 + */ +U_DRAFT const UnicodeSet * U_EXPORT2 +uspoof_getRecommendedUnicodeSet(UErrorCode *status); + +#endif /* U_SHOW_CPLUSPLUS_API */ + /** * Serialize the data for a spoof detector into a chunk of memory. * The flattened spoof detection tables can later be used to efficiently * instantiate a new Spoof Detector. * + * The serialized spoof checker includes only the data compiled from the + * Unicode data tables by uspoof_openFromSource(); it does not + * include any other state or configuration that may have been set. + * * @param sc the Spoof Detector whose data is to be serialized. * @param data a pointer to 32-bit-aligned memory to be filled with the data, * can be NULL if capacity==0 diff --git a/icu4c/source/i18n/uspoof.cpp b/icu4c/source/i18n/uspoof.cpp index 32f4322a17d..68531228e2b 100644 --- a/icu4c/source/i18n/uspoof.cpp +++ b/icu4c/source/i18n/uspoof.cpp @@ -1,6 +1,6 @@ /* *************************************************************************** -* Copyright (C) 2008-2012, International Business Machines Corporation +* Copyright (C) 2008-2013, International Business Machines Corporation * and others. All Rights Reserved.
*************************************************************************** * file name: uspoof.cpp @@ -14,13 +14,19 @@ * Unicode Spoof Detection */ #include "unicode/utypes.h" +#include "unicode/normalizer2.h" #include "unicode/uspoof.h" -#include "unicode/unorm.h" #include "unicode/ustring.h" #include "unicode/utf16.h" #include "cmemory.h" -#include "uspoof_impl.h" +#include "cstring.h" +#include "identifier_info.h" +#include "mutex.h" +#include "scriptset.h" #include "uassert.h" +#include "ucln_in.h" +#include "uspoof_impl.h" +#include "umutex.h" #if !UCONFIG_NO_NORMALIZATION @@ -28,17 +34,74 @@ U_NAMESPACE_USE +// +// Static Objects used by the spoof impl, their thread safe initialization and their cleanup. +// +static UnicodeSet *gInclusionSet = NULL; +static UnicodeSet *gRecommendedSet = NULL; +static const Normalizer2 *gNfdNormalizer = NULL; +static UMutex gInitMutex = U_MUTEX_INITIALIZER; + +static UBool U_CALLCONV +uspoof_cleanup(void) { + delete gInclusionSet; + gInclusionSet = NULL; + delete gRecommendedSet; + gRecommendedSet = NULL; + gNfdNormalizer = NULL; + return TRUE; +} + +static void initializeStatics() { + Mutex m(&gInitMutex); + UErrorCode status = U_ZERO_ERROR; + if (gInclusionSet == NULL) { + gInclusionSet = new UnicodeSet(UnicodeString("[" + "\\-.\\u00B7\\u05F3\\u05F4\\u0F0B\\u200C\\u200D\\u2019]"), status); + gRecommendedSet = new UnicodeSet(UnicodeString("[" + "[0-z\\u00C0-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-" + "\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B\\u021E" + "\\u021F\\u0226-\\u0233\\u02BB\\u02BC\\u02EC\\u0300-\\u0304" + "\\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-" + "\\u0328\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339" + "\\u0342-\\u0345\\u037B-\\u03CE\\u03FC-\\u045F\\u048A-\\u0525" + "\\u0531-\\u0586\\u05D0-\\u05F2\\u0621-\\u063F\\u0641-\\u0655" + "\\u0660-\\u0669\\u0670-\\u068D\\u068F-\\u06D5\\u06E5\\u06E6" + "\\u06EE-\\u06FF\\u0750-\\u07B1\\u0901-\\u0939\\u093C-\\u094D" + 
"\\u0950\\u0960-\\u0972\\u0979-\\u0A4D\\u0A5C-\\u0A74\\u0A81-" + "\\u0B43\\u0B47-\\u0B61\\u0B66-\\u0C56\\u0C60\\u0C61\\u0C66-" + "\\u0CD6\\u0CE0-\\u0CEF\\u0D02-\\u0D28\\u0D2A-\\u0D39\\u0D3D-" + "\\u0D43\\u0D46-\\u0D4D\\u0D57-\\u0D61\\u0D66-\\u0D8E\\u0D91-" + "\\u0DA5\\u0DA7-\\u0DDE\\u0DF2\\u0E01-\\u0ED9\\u0F00\\u0F20-" + "\\u0F8B\\u0F90-\\u109D\\u10D0-\\u10F0\\u10F7-\\u10FA\\u1200-" + "\\u135A\\u135F\\u1380-\\u138F\\u1401-\\u167F\\u1780-\\u17A2" + "\\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7-" + "\\u17DC\\u17E0-\\u17E9\\u1810-\\u18A8\\u18AA-\\u18F5\\u1E00-" + "\\u1E99\\u1F00-\\u1FFC\\u2D30-\\u2D65\\u2D80-\\u2DDE\\u3005-" + "\\u3007\\u3041-\\u31B7\\u3400-\\u9FCB\\uA000-\\uA48C\\uA67F" + "\\uA717-\\uA71F\\uA788\\uAA60-\\uAA7B\\uAC00-\\uD7A3\\uFA0E-" + "\\uFA29\\U00020000-" + "\\U0002B734]-[[:Cn:][:nfkcqc=n:][:XIDC=n:]]]"), status); + gNfdNormalizer = Normalizer2::getNFDInstance(status); + } + ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup); + U_ASSERT(U_SUCCESS(status)); // TODO: remove after testing. 
+ return; +} + + U_CAPI USpoofChecker * U_EXPORT2 uspoof_open(UErrorCode *status) { if (U_FAILURE(*status)) { return NULL; } + initializeStatics(); SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status); if (U_FAILURE(*status)) { delete si; si = NULL; } - return (USpoofChecker *)si; + return reinterpret_cast(si); } @@ -48,6 +111,7 @@ uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLeng if (U_FAILURE(*status)) { return NULL; } + initializeStatics(); SpoofData *sd = new SpoofData(data, length, *status); SpoofImpl *si = new SpoofImpl(sd, *status); if (U_FAILURE(*status)) { @@ -80,7 +144,7 @@ uspoof_clone(const USpoofChecker *sc, UErrorCode *status) { delete result; result = NULL; } - return (USpoofChecker *)result; + return reinterpret_cast(result); } @@ -119,6 +183,25 @@ uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) { return This->fChecks; } +U_CAPI void U_EXPORT2 +uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel) { + UErrorCode status = U_ZERO_ERROR; + SpoofImpl *This = SpoofImpl::validateThis(sc, status); + if (This != NULL) { + This->fRestrictionLevel = restrictionLevel; + } +} + +U_CAPI URestrictionLevel U_EXPORT2 +uspoof_getRestrictionLevel(const USpoofChecker *sc) { + UErrorCode status = U_ZERO_ERROR; + const SpoofImpl *This = SpoofImpl::validateThis(sc, status); + if (This == NULL) { + return USPOOF_UNRESTRICTIVE; + } + return This->fRestrictionLevel; +} + U_CAPI void U_EXPORT2 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) { SpoofImpl *This = SpoofImpl::validateThis(sc, *status); @@ -141,7 +224,7 @@ uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) { U_CAPI const USet * U_EXPORT2 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) { const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status); - return reinterpret_cast(result); + return result->toUSet(); } U_CAPI const UnicodeSet * U_EXPORT2 
@@ -156,7 +239,7 @@ uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) { U_CAPI void U_EXPORT2 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) { - const UnicodeSet *set = reinterpret_cast(chars); + const UnicodeSet *set = UnicodeSet::fromUSet(chars); uspoof_setAllowedUnicodeSet(sc, set, status); } @@ -185,7 +268,7 @@ uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCo U_CAPI int32_t U_EXPORT2 uspoof_check(const USpoofChecker *sc, - const UChar *text, int32_t length, + const UChar *id, int32_t length, int32_t *position, UErrorCode *status) { @@ -197,40 +280,209 @@ uspoof_check(const USpoofChecker *sc, *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } - if (length == -1) { - // It's not worth the bother to handle nul terminated strings everywhere. - // Just get the length and be done with it. - length = u_strlen(text); + UnicodeString idStr((length == -1), id, length); // Aliasing constructor. + int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status); + return result; +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_checkUTF8(const USpoofChecker *sc, + const char *id, int32_t length, + int32_t *position, + UErrorCode *status) { + + if (U_FAILURE(*status)) { + return 0; } + UnicodeString idStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id))); + int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status); + return result; +} - int32_t result = 0; - int32_t failPos = 0x7fffffff; // TODO: do we have a #define for max int32? - // A count of the number of non-Common or inherited scripts. - // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests. - // Share the computation when possible. scriptCount == -1 means that we haven't - // done it yet. 
- int32_t scriptCount = -1; +U_CAPI int32_t U_EXPORT2 +uspoof_areConfusable(const USpoofChecker *sc, + const UChar *id1, int32_t length1, + const UChar *id2, int32_t length2, + UErrorCode *status) { + SpoofImpl::validateThis(sc, *status); + if (U_FAILURE(*status)) { + return 0; + } + if (length1 < -1 || length2 < -1) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + UnicodeString id1Str((length1==-1), id1, length1); // Aliasing constructor + UnicodeString id2Str((length2==-1), id2, length2); // Aliasing constructor + return uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status); +} - if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) { - scriptCount = This->scriptScan(text, length, failPos, *status); - // printf("scriptCount (clipped to 2) = %d\n", scriptCount); - if ( scriptCount >= 2) { - // Note: scriptCount == 2 covers all cases of the number of scripts >= 2 - result |= USPOOF_SINGLE_SCRIPT; + +U_CAPI int32_t U_EXPORT2 +uspoof_areConfusableUTF8(const USpoofChecker *sc, + const char *id1, int32_t length1, + const char *id2, int32_t length2, + UErrorCode *status) { + SpoofImpl::validateThis(sc, *status); + if (U_FAILURE(*status)) { + return 0; + } + if (length1 < -1 || length2 < -1) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + UnicodeString id1Str = UnicodeString::fromUTF8(StringPiece(id1, length1>=0? length1 : uprv_strlen(id1))); + UnicodeString id2Str = UnicodeString::fromUTF8(StringPiece(id2, length2>=0? 
length2 : uprv_strlen(id2))); + int32_t results = uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status); + return results; +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_areConfusableUnicodeString(const USpoofChecker *sc, + const icu::UnicodeString &id1, + const icu::UnicodeString &id2, + UErrorCode *status) { + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (U_FAILURE(*status)) { + return 0; + } + // + // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable, + // and for definitions of the types (single, whole, mixed-script) of confusables. + + // We only care about a few of the check flags. Ignore the others. + // If no tests relavant to this function have been specified, return an error. + // TODO: is this really the right thing to do? It's probably an error on the caller's part, + // but logically we would just return 0 (no error). + if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | + USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) { + *status = U_INVALID_STATE_ERROR; + return 0; + } + int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE; + + int32_t result = 0; + IdentifierInfo *identifierInfo = This->getIdentifierInfo(*status); + if (U_FAILURE(*status)) { + return 0; + } + identifierInfo->setIdentifier(id1, *status); + int32_t id1ScriptCount = identifierInfo->getScriptCount(); + identifierInfo->setIdentifier(id2, *status); + int32_t id2ScriptCount = identifierInfo->getScriptCount(); + This->releaseIdentifierInfo(identifierInfo); + identifierInfo = NULL; + + if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) { + UnicodeString id1Skeleton; + UnicodeString id2Skeleton; + if (id1ScriptCount <= 1 && id2ScriptCount <= 1) { + flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE; + uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status); + uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status); + if (id1Skeleton == 
id2Skeleton) { + result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE; + } } } - if (This->fChecks & USPOOF_CHAR_LIMIT) { + if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) { + // If the two inputs are single script confusable they cannot also be + // mixed or whole script confusable, according to the UAX39 definitions. + // So we can skip those tests. + return result; + } + + // Two identifiers are whole script confusable if each is of a single script + // and they are mixed script confusable. + UBool possiblyWholeScriptConfusables = + id1ScriptCount <= 1 && id2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE); + + // + // Mixed Script Check + // + if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) { + // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us + // the mixed script table skeleton, which is what we want. + // The Any Case / Lower Case bit in the skelton flags was set at the top of the function. + UnicodeString id1Skeleton; + UnicodeString id2Skeleton; + flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE; + uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status); + uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status); + if (id1Skeleton == id2Skeleton) { + result |= USPOOF_MIXED_SCRIPT_CONFUSABLE; + if (possiblyWholeScriptConfusables) { + result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE; + } + } + } + + return result; +} + + + + +U_CAPI int32_t U_EXPORT2 +uspoof_checkUnicodeString(const USpoofChecker *sc, + const icu::UnicodeString &id, + int32_t *position, + UErrorCode *status) { + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (This == NULL) { + return 0; + } + int32_t result = 0; + + IdentifierInfo *identifierInfo = NULL; + if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) { + identifierInfo = This->getIdentifierInfo(*status); + if (U_FAILURE(*status)) { + goto cleanupAndReturn; + } + 
identifierInfo->setIdentifier(id, *status); + identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet); + } + + + if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) { + URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status); + if (idRestrictionLevel > This->fRestrictionLevel) { + result |= USPOOF_RESTRICTION_LEVEL; + } + if (This->fChecks & USPOOF_AUX_INFO) { + result |= idRestrictionLevel; + } + } + + if ((This->fChecks) & USPOOF_MIXED_NUMBERS) { + const UnicodeSet *numerics = identifierInfo->getNumerics(); + if (numerics->size() > 1) { + result |= USPOOF_MIXED_NUMBERS; + } + + // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier. + // We have no easy way to do the same in C. + // if (checkResult != null) { + // checkResult.numerics = numerics; + // } + } + + + if (This->fChecks & (USPOOF_CHAR_LIMIT)) { int32_t i; UChar32 c; + int32_t length = id.length(); for (i=0; ifAllowedCharsSet->contains(c)) { result |= USPOOF_CHAR_LIMIT; - if (i < failPos) { - failPos = i; - } break; } } @@ -239,9 +491,9 @@ uspoof_check(const USpoofChecker *sc, if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) { // These are the checks that need to be done on NFD input - NFDBuffer normalizedInput(text, length, *status); - const UChar *nfdText = normalizedInput.getBuffer(); - int32_t nfdLength = normalizedInput.getLength(); + UnicodeString nfdText; + gNfdNormalizer->normalize(id, nfdText, *status); + int32_t nfdLength = nfdText.length(); if (This->fChecks & USPOOF_INVISIBLE) { @@ -254,7 +506,8 @@ uspoof_check(const USpoofChecker *sc, UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence. for (i=0; i length) { - failPos = length; - } break; } marksSeenSoFar.add(c); @@ -292,22 +539,29 @@ uspoof_check(const USpoofChecker *sc, // The basic test is the same for both whole and mixed script confusables. 
// Compute the set of scripts that every input character has a confusable in. // For this computation an input character is always considered to be - // confusable with itself in its own script. + // confusable with itself in its own script. + // // If the number of such scripts is two or more, and the input consisted of - // characters all from a single script, we have a whole script confusable. - // (The two scripts will be the original script and the one that is confusable) + // characters all from a single script, we have a whole script confusable. + // (The two scripts will be the original script and the one that is confusable) + // // If the number of such scripts >= one, and the original input contained characters from - // more than one script, we have a mixed script confusable. (We can transform - // some of the characters, and end up with a visually similar string all in - // one script.) + // more than one script, we have a mixed script confusable. (We can transform + // some of the characters, and end up with a visually similar string all in + // one script.) 
- if (scriptCount == -1) { - int32_t t; - scriptCount = This->scriptScan(text, length, t, *status); + if (identifierInfo == NULL) { + identifierInfo = This->getIdentifierInfo(*status); + if (U_FAILURE(*status)) { + goto cleanupAndReturn; + } + identifierInfo->setIdentifier(id, *status); } + + int32_t scriptCount = identifierInfo->getScriptCount(); ScriptSet scripts; - This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status); + This->wholeScriptCheck(nfdText, &scripts, *status); int32_t confusableScriptCount = scripts.countMembers(); //printf("confusableScriptCount = %d\n", confusableScriptCount); @@ -324,297 +578,52 @@ uspoof_check(const USpoofChecker *sc, } } } - if (position != NULL && failPos != 0x7fffffff) { - *position = failPos; + +cleanupAndReturn: + This->releaseIdentifierInfo(identifierInfo); + if (position != NULL) { + *position = 0; } return result; } -U_CAPI int32_t U_EXPORT2 -uspoof_checkUTF8(const USpoofChecker *sc, - const char *text, int32_t length, - int32_t *position, - UErrorCode *status) { - - if (U_FAILURE(*status)) { - return 0; - } - UChar stackBuf[USPOOF_STACK_BUFFER_SIZE]; - UChar* text16 = stackBuf; - int32_t len16; - - u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status); - if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { - return 0; - } - if (*status == U_BUFFER_OVERFLOW_ERROR) { - text16 = static_cast(uprv_malloc(len16 * sizeof(UChar) + 2)); - if (text16 == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - *status = U_ZERO_ERROR; - u_strFromUTF8(text16, len16+1, NULL, text, length, status); - } - - int32_t position16 = -1; - int32_t result = uspoof_check(sc, text16, len16, &position16, status); - if (U_FAILURE(*status)) { - return 0; - } - - if (position16 > 0) { - // Translate a UTF-16 based error position back to a UTF-8 offset. - // u_strToUTF8() in preflight mode is an easy way to do it. 
- U_ASSERT(position16 <= len16); - u_strToUTF8(NULL, 0, position, text16, position16, status); - if (position != NULL && *position > 0) { - // position is the required buffer length from u_strToUTF8, which includes - // space for a terminating NULL, which we don't want, hence the -1. - *position -= 1; - } - *status = U_ZERO_ERROR; // u_strToUTF8, above sets BUFFER_OVERFLOW_ERROR. - } - - if (text16 != stackBuf) { - uprv_free(text16); - } - return result; - -} - -/* A convenience wrapper around the public uspoof_getSkeleton that handles - * allocating a larger buffer than provided if the original is too small. - */ -static UChar *getSkeleton(const USpoofChecker *sc, uint32_t type, const UChar *s, int32_t inputLength, - UChar *dest, int32_t destCapacity, int32_t *outputLength, UErrorCode *status) { - int32_t requiredCapacity = 0; - UChar *buf = dest; - - if (U_FAILURE(*status)) { - return NULL; - } - requiredCapacity = uspoof_getSkeleton(sc, type, s, inputLength, dest, destCapacity, status); - if (*status == U_BUFFER_OVERFLOW_ERROR) { - buf = static_cast(uprv_malloc(requiredCapacity * sizeof(UChar))); - if (buf == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - *status = U_ZERO_ERROR; - uspoof_getSkeleton(sc, type, s, inputLength, buf, requiredCapacity, status); - } - *outputLength = requiredCapacity; - return buf; -} - - -U_CAPI int32_t U_EXPORT2 -uspoof_areConfusable(const USpoofChecker *sc, - const UChar *s1, int32_t length1, - const UChar *s2, int32_t length2, - UErrorCode *status) { - const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); - if (U_FAILURE(*status)) { - return 0; - } - // - // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable, - // and for definitions of the types (single, whole, mixed-script) of confusables. - - // We only care about a few of the check flags. Ignore the others. - // If no tests relavant to this function have been specified, return an error. 
- // TODO: is this really the right thing to do? It's probably an error on the caller's part, - // but logically we would just return 0 (no error). - if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | - USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) { - *status = U_INVALID_STATE_ERROR; - return 0; - } - int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE; - UChar s1SkeletonBuf[USPOOF_STACK_BUFFER_SIZE]; - UChar *s1Skeleton; - int32_t s1SkeletonLength = 0; - - UChar s2SkeletonBuf[USPOOF_STACK_BUFFER_SIZE]; - UChar *s2Skeleton; - int32_t s2SkeletonLength = 0; - - int32_t result = 0; - int32_t t; - int32_t s1ScriptCount = This->scriptScan(s1, length1, t, *status); - int32_t s2ScriptCount = This->scriptScan(s2, length2, t, *status); - - if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) { - // Do the Single Script compare. - if (s1ScriptCount <= 1 && s2ScriptCount <= 1) { - flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE; - s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf, - sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status); - s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf, - sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status); - if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) { - result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE; - } - if (s1Skeleton != s1SkeletonBuf) { - uprv_free(s1Skeleton); - } - if (s2Skeleton != s2SkeletonBuf) { - uprv_free(s2Skeleton); - } - } - } - - if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) { - // If the two inputs are single script confusable they cannot also be - // mixed or whole script confusable, according to the UAX39 definitions. - // So we can skip those tests. - return result; - } - - // Optimization for whole script confusables test: two identifiers are whole script confusable if - // each is of a single script and they are mixed script confusable. 
- UBool possiblyWholeScriptConfusables = - s1ScriptCount <= 1 && s2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE); - - // - // Mixed Script Check - // - if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) { - // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us - // the mixed script table skeleton, which is what we want. - // The Any Case / Lower Case bit in the skelton flags was set at the top of the function. - flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE; - s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf, - sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status); - s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf, - sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status); - if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) { - result |= USPOOF_MIXED_SCRIPT_CONFUSABLE; - if (possiblyWholeScriptConfusables) { - result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE; - } - } - if (s1Skeleton != s1SkeletonBuf) { - uprv_free(s1Skeleton); - } - if (s2Skeleton != s2SkeletonBuf) { - uprv_free(s2Skeleton); - } - } - - return result; -} - - -// Convenience function for converting a UTF-8 input to a UChar * string, including -// reallocating a buffer when required. Parameters and their interpretation mostly -// match u_strFromUTF8. 
- -static UChar * convertFromUTF8(UChar *outBuf, int32_t outBufCapacity, int32_t *outputLength, - const char *in, int32_t inLength, UErrorCode *status) { - if (U_FAILURE(*status)) { - return NULL; - } - UChar *dest = outBuf; - u_strFromUTF8(dest, outBufCapacity, outputLength, in, inLength, status); - if (*status == U_BUFFER_OVERFLOW_ERROR) { - dest = static_cast(uprv_malloc(*outputLength * sizeof(UChar))); - if (dest == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - *status = U_ZERO_ERROR; - u_strFromUTF8(dest, *outputLength, NULL, in, inLength, status); - } - return dest; -} - - - -U_CAPI int32_t U_EXPORT2 -uspoof_areConfusableUTF8(const USpoofChecker *sc, - const char *s1, int32_t length1, - const char *s2, int32_t length2, - UErrorCode *status) { - - SpoofImpl::validateThis(sc, *status); - if (U_FAILURE(*status)) { - return 0; - } - - UChar s1Buf[USPOOF_STACK_BUFFER_SIZE]; - int32_t lengthS1U; - UChar *s1U = convertFromUTF8(s1Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS1U, s1, length1, status); - - UChar s2Buf[USPOOF_STACK_BUFFER_SIZE]; - int32_t lengthS2U; - UChar *s2U = convertFromUTF8(s2Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS2U, s2, length2, status); - - int32_t results = uspoof_areConfusable(sc, s1U, lengthS1U, s2U, lengthS2U, status); - - if (s1U != s1Buf) { - uprv_free(s1U); - } - if (s2U != s2Buf) { - uprv_free(s2U); - } - return results; -} - - -U_CAPI int32_t U_EXPORT2 -uspoof_areConfusableUnicodeString(const USpoofChecker *sc, - const icu::UnicodeString &s1, - const icu::UnicodeString &s2, - UErrorCode *status) { - - const UChar *u1 = s1.getBuffer(); - int32_t length1 = s1.length(); - const UChar *u2 = s2.getBuffer(); - int32_t length2 = s2.length(); - - int32_t results = uspoof_areConfusable(sc, u1, length1, u2, length2, status); - return results; -} - - - - -U_CAPI int32_t U_EXPORT2 -uspoof_checkUnicodeString(const USpoofChecker *sc, - const icu::UnicodeString &text, - int32_t *position, - UErrorCode *status) { - int32_t result = 
uspoof_check(sc, text.getBuffer(), text.length(), position, status); - return result; -} - - U_CAPI int32_t U_EXPORT2 uspoof_getSkeleton(const USpoofChecker *sc, uint32_t type, - const UChar *s, int32_t length, + const UChar *id, int32_t length, UChar *dest, int32_t destCapacity, UErrorCode *status) { - // TODO: this function could be sped up a bit - // Skip the input normalization when not needed, work from callers data. - // Put the initial skeleton straight into the caller's destination buffer. - // It probably won't need normalization. - // But these would make the structure more complicated. - - const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + SpoofImpl::validateThis(sc, *status); if (U_FAILURE(*status)) { return 0; } - if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) || - (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) { + if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } + UnicodeString idStr((length==-1), id, length); // Aliasing constructor + UnicodeString destStr; + uspoof_getSkeletonUnicodeString(sc, type, idStr, destStr, status); + destStr.extract(dest, destCapacity, *status); + return destStr.length(); +} + + + +U_I18N_API UnicodeString & U_EXPORT2 +uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, + uint32_t type, + const UnicodeString &id, + UnicodeString &dest, + UErrorCode *status) { + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (U_FAILURE(*status)) { + return dest; + } + int32_t tableMask = 0; switch (type) { case 0: @@ -631,120 +640,24 @@ uspoof_getSkeleton(const USpoofChecker *sc, break; default: *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; + return dest; } - // NFD transform of the user supplied input - - UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE]; - UChar *nfdInput = nfdStackBuf; - int32_t normalizedLen = unorm_normalize( - s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, 
status); - if (*status == U_BUFFER_OVERFLOW_ERROR) { - nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar)); - if (nfdInput == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - *status = U_ZERO_ERROR; - normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0, - nfdInput, normalizedLen+1, status); - } - if (U_FAILURE(*status)) { - if (nfdInput != nfdStackBuf) { - uprv_free(nfdInput); - } - return 0; - } - - // buffer to hold the Unicode defined skeleton mappings for a single code point - UChar buf[USPOOF_MAX_SKELETON_EXPANSION]; + UnicodeString nfdId; + gNfdNormalizer->normalize(id, nfdId, *status); // Apply the skeleton mapping to the NFD normalized input string // Accumulate the skeleton, possibly unnormalized, in a UnicodeString. int32_t inputIndex = 0; UnicodeString skelStr; - while (inputIndex < normalizedLen) { - UChar32 c; - U16_NEXT(nfdInput, inputIndex, normalizedLen, c); - int32_t replaceLen = This->confusableLookup(c, tableMask, buf); - skelStr.append(buf, replaceLen); + int32_t normalizedLen = nfdId.length(); + for (inputIndex=0; inputIndex < normalizedLen; ) { + UChar32 c = nfdId.char32At(inputIndex); + inputIndex += U16_LENGTH(c); + This->confusableLookup(c, tableMask, skelStr); } - if (nfdInput != nfdStackBuf) { - uprv_free(nfdInput); - } - - const UChar *result = skelStr.getBuffer(); - int32_t resultLen = skelStr.length(); - UChar *normedResult = NULL; - - // Check the skeleton for NFD, normalize it if needed. - // Unnormalized results should be very rare. 
- if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) { - normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status); - normedResult = static_cast(uprv_malloc((normalizedLen+1)*sizeof(UChar))); - if (normedResult == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - *status = U_ZERO_ERROR; - unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status); - result = normedResult; - resultLen = normalizedLen; - } - - // Copy the skeleton to the caller's buffer - if (U_SUCCESS(*status)) { - if (destCapacity == 0 || resultLen > destCapacity) { - *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING; - } else { - u_memcpy(dest, result, resultLen); - if (destCapacity > resultLen) { - dest[resultLen] = 0; - } else { - *status = U_STRING_NOT_TERMINATED_WARNING; - } - } - } - uprv_free(normedResult); - return resultLen; -} - - - -U_I18N_API UnicodeString & U_EXPORT2 -uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, - uint32_t type, - const UnicodeString &s, - UnicodeString &dest, - UErrorCode *status) { - if (U_FAILURE(*status)) { - return dest; - } - dest.remove(); - - const UChar *str = s.getBuffer(); - int32_t strLen = s.length(); - UChar smallBuf[USPOOF_STACK_BUFFER_SIZE]; - UChar *buf = smallBuf; - int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, USPOOF_STACK_BUFFER_SIZE, status); - if (*status == U_BUFFER_OVERFLOW_ERROR) { - buf = static_cast(uprv_malloc((outputSize+1)*sizeof(UChar))); - if (buf == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return dest; - } - *status = U_ZERO_ERROR; - uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status); - } - if (U_SUCCESS(*status)) { - dest.setTo(buf, outputSize); - } - - if (buf != smallBuf) { - uprv_free(buf); - } + gNfdNormalizer->normalize(skelStr, dest, *status); return dest; } @@ -752,62 +665,29 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, U_CAPI 
int32_t U_EXPORT2 uspoof_getSkeletonUTF8(const USpoofChecker *sc, uint32_t type, - const char *s, int32_t length, + const char *id, int32_t length, char *dest, int32_t destCapacity, UErrorCode *status) { - // Lacking a UTF-8 normalization API, just converting the input to - // UTF-16 seems as good an approach as any. In typical use, input will - // be an identifier, which is to say not too long for stack buffers. + SpoofImpl::validateThis(sc, *status); if (U_FAILURE(*status)) { return 0; } - // Buffers for the UChar form of the input and skeleton strings. - UChar smallInBuf[USPOOF_STACK_BUFFER_SIZE]; - UChar *inBuf = smallInBuf; - UChar smallOutBuf[USPOOF_STACK_BUFFER_SIZE]; - UChar *outBuf = smallOutBuf; - - int32_t lengthInUChars = 0; - int32_t skelLengthInUChars = 0; - int32_t skelLengthInUTF8 = 0; - - u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars, - s, length, status); - if (*status == U_BUFFER_OVERFLOW_ERROR) { - inBuf = static_cast(uprv_malloc((lengthInUChars+1)*sizeof(UChar))); - if (inBuf == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - goto cleanup; - } - *status = U_ZERO_ERROR; - u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars, - s, length, status); - } - - skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars, - outBuf, USPOOF_STACK_BUFFER_SIZE, status); - if (*status == U_BUFFER_OVERFLOW_ERROR) { - outBuf = static_cast(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar))); - if (outBuf == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - goto cleanup; - } - *status = U_ZERO_ERROR; - skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars, - outBuf, skelLengthInUChars+1, status); + if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; } - u_strToUTF8(dest, destCapacity, &skelLengthInUTF8, - outBuf, skelLengthInUChars, status); + UnicodeString srcStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? 
length : uprv_strlen(id))); + UnicodeString destStr; + uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status); + if (U_FAILURE(*status)) { + return 0; + } - cleanup: - if (inBuf != smallInBuf) { - uprv_free(inBuf); - } - if (outBuf != smallOutBuf) { - uprv_free(outBuf); - } - return skelLengthInUTF8; + int32_t lengthInUTF8 = 0; + u_strToUTF8(dest, destCapacity, &lengthInUTF8, + destStr.getBuffer(), destStr.length(), status); + return lengthInUTF8; } @@ -827,4 +707,30 @@ uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *stat return dataSize; } -#endif +U_CAPI const USet * U_EXPORT2 +uspoof_getInclusionSet(UErrorCode *) { + initializeStatics(); + return gInclusionSet->toUSet(); +} + +U_CAPI const USet * U_EXPORT2 +uspoof_getRecommendedSet(UErrorCode *) { + initializeStatics(); + return gRecommendedSet->toUSet(); +} + +U_I18N_API const UnicodeSet * U_EXPORT2 +uspoof_getInclusionUnicodeSet(UErrorCode *) { + initializeStatics(); + return gInclusionSet; +} + +U_I18N_API const UnicodeSet * U_EXPORT2 +uspoof_getRecommendedUnicodeSet(UErrorCode *) { + initializeStatics(); + return gRecommendedSet; +} + + + +#endif // !UCONFIG_NO_NORMALIZATION diff --git a/icu4c/source/i18n/uspoof_impl.cpp b/icu4c/source/i18n/uspoof_impl.cpp index 891b3e7b994..841d7887548 100644 --- a/icu4c/source/i18n/uspoof_impl.cpp +++ b/icu4c/source/i18n/uspoof_impl.cpp @@ -1,19 +1,20 @@ /* ********************************************************************** -* Copyright (C) 2008-2011, International Business Machines +* Copyright (C) 2008-2013, International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** */ #include "unicode/utypes.h" #include "unicode/uspoof.h" -#include "unicode/unorm.h" #include "unicode/uchar.h" #include "unicode/uniset.h" #include "unicode/utf16.h" #include "utrie2.h" #include "cmemory.h" #include "cstring.h" +#include "identifier_info.h" +#include "scriptset.h" #include "udatamem.h" #include "umutex.h" #include "udataswp.h" @@ -28,37 +29,41 @@ U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) : - fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) , fAllowedLocales(uprv_strdup("")) { + fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , + fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { if (U_FAILURE(status)) { return; } - fMagic = USPOOF_MAGIC; fSpoofData = data; - fChecks = USPOOF_ALL_CHECKS; - UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); - if (allowedCharsSet == NULL || fAllowedLocales == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - allowedCharsSet->freeze(); - fAllowedCharsSet = allowedCharsSet; -} + fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; - -SpoofImpl::SpoofImpl() { - fMagic = USPOOF_MAGIC; - fSpoofData = NULL; - fChecks = USPOOF_ALL_CHECKS; UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); allowedCharsSet->freeze(); fAllowedCharsSet = allowedCharsSet; fAllowedLocales = uprv_strdup(""); + if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + fMagic = USPOOF_MAGIC; +} + + +SpoofImpl::SpoofImpl() : + fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , + fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { + UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); + allowedCharsSet->freeze(); + fAllowedCharsSet = allowedCharsSet; + fAllowedLocales = uprv_strdup(""); + fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; } // 
Copy Constructor, used by the user level clone() function. SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : - fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) { + fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , + fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { if (U_FAILURE(status)) { return; } @@ -72,6 +77,7 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : status = U_MEMORY_ALLOCATION_ERROR; } fAllowedLocales = uprv_strdup(src.fAllowedLocales); + fRestrictionLevel = src.fRestrictionLevel; } SpoofImpl::~SpoofImpl() { @@ -82,6 +88,7 @@ SpoofImpl::~SpoofImpl() { } delete fAllowedCharsSet; uprv_free((void *)fAllowedLocales); + delete fCachedIdentifierInfo; } // @@ -121,10 +128,10 @@ SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { // implementation. // // Given a source character, produce the corresponding -// replacement character(s) +// replacement character(s), appending them to the dest string. // //--------------------------------------------------------------------------------------- -int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const { +int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const { // Binary search the spoof data key table for the inChar int32_t *low = fSpoofData->fCFUKeys; @@ -148,7 +155,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de if (inChar != midc) { // Char not found. It maps to itself. int i = 0; - U16_APPEND_UNSAFE(destBuf, i, inChar) + dest.append(inChar); return i; } foundChar: @@ -176,7 +183,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de // No key entry for this char & table. // The input char maps to itself. 
int i = 0; - U16_APPEND_UNSAFE(destBuf, i, inChar) + dest.append(inChar); return i; } @@ -188,7 +195,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de // an index into the string table (for longer strings) uint16_t value = fSpoofData->fCFUValues[keyTableIndex]; if (stringLen == 1) { - destBuf[0] = value; + dest.append((UChar)value); return 1; } @@ -212,9 +219,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen); UChar *src = &fSpoofData->fCFUStrings[value]; - for (ix=0; ixfAnyCaseTrie : fSpoofData->fLowerCaseTrie; result->setAll(); - while (inputIdx < length) { - U16_NEXT(text, inputIdx, length, c); + int32_t length = text.length(); + for (int32_t inputIdx=0; inputIdx < length;) { + UChar32 c = text.char32At(inputIdx); + inputIdx += U16_LENGTH(c); uint32_t index = utrie2_get32(table, c); if (index == 0) { // No confusables in another script for this char. @@ -249,7 +253,7 @@ void SpoofImpl::wholeScriptCheck( // Until then, grab the script from the char and intersect it with the set. UScriptCode cpScript = uscript_getScript(c, &status); U_ASSERT(cpScript > USCRIPT_INHERITED); - result->intersect(cpScript); + result->intersect(cpScript, status); } else if (index == 1) { // Script == Common or Inherited. Nothing to do. 
} else { @@ -371,47 +375,6 @@ void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UEr } -int32_t SpoofImpl::scriptScan - (const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const { - if (U_FAILURE(status)) { - return 0; - } - int32_t inputIdx = 0; - UChar32 c; - int32_t scriptCount = 0; - UScriptCode lastScript = USCRIPT_INVALID_CODE; - UScriptCode sc = USCRIPT_INVALID_CODE; - while ((inputIdx < length || length == -1) && scriptCount < 2) { - U16_NEXT(text, inputIdx, length, c); - if (c == 0 && length == -1) { - break; - } - sc = uscript_getScript(c, &status); - if (sc == USCRIPT_COMMON || sc == USCRIPT_INHERITED || sc == USCRIPT_UNKNOWN) { - continue; - } - - // Temporary fix: fold Japanese Hiragana and Katakana into Han. - // Names are allowed to mix these scripts. - // A more general solution will follow later for characters that are - // used with multiple scripts. - - if (sc == USCRIPT_HIRAGANA || sc == USCRIPT_KATAKANA || sc == USCRIPT_HANGUL) { - sc = USCRIPT_HAN; - } - - if (sc != lastScript) { - scriptCount++; - lastScript = sc; - } - } - if (scriptCount == 2) { - pos = inputIdx; - } - return scriptCount; -} - - // Convert a text format hex number. Utility function used by builder code. Static. // Input: UChar *string text. Output: a UChar32 // Input has been pre-checked, and will have no non-hex chars. @@ -443,6 +406,54 @@ UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorC return (UChar32)val; } +// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create. +// Maintain a one-element cache, which is sufficient to avoid repeatedly +// creating new ones unless we get multi-thread concurrency in spoof +// check operations, which should be statistically uncommon. + +// These functions are used in place of new & delete of an IdentifierInfo. +// They will recycle the IdentifierInfo when possible. 
+// They are logically const, and used within const functions that must be thread safe. +IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const { + IdentifierInfo *returnIdInfo = NULL; + if (U_FAILURE(status)) { + return returnIdInfo; + } + SpoofImpl *nonConstThis = const_cast(this); + { + Mutex m; + returnIdInfo = nonConstThis->fCachedIdentifierInfo; + nonConstThis->fCachedIdentifierInfo = NULL; + } + if (returnIdInfo == NULL) { + returnIdInfo = new IdentifierInfo(status); + if (U_SUCCESS(status) && returnIdInfo == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } + if (U_FAILURE(status) && returnIdInfo != NULL) { + delete returnIdInfo; + returnIdInfo = NULL; + } + } + return returnIdInfo; +} + + +void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const { + if (idInfo != NULL) { + SpoofImpl *nonConstThis = const_cast(this); + { + Mutex m; + if (nonConstThis->fCachedIdentifierInfo == NULL) { + nonConstThis->fCachedIdentifierInfo = idInfo; + idInfo = NULL; + } + } + delete idInfo; + } +}; + + //---------------------------------------------------------------------------------------------- @@ -673,149 +684,6 @@ void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { } -//---------------------------------------------------------------------------- -// -// ScriptSet implementation -// -//---------------------------------------------------------------------------- -ScriptSet::ScriptSet() { - for (uint32_t i=0; i 0) { - count++; - x &= (x - 1); // and off the least significant one bit. - } - } - return count; -} - - - -//----------------------------------------------------------------------------- -// -// NFDBuffer Implementation. 
-// -//----------------------------------------------------------------------------- - -NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) { - fNormalizedText = NULL; - fNormalizedTextLength = 0; - fOriginalText = text; - if (U_FAILURE(status)) { - return; - } - fNormalizedText = fSmallBuf; - fNormalizedTextLength = unorm_normalize( - text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status); - if (status == U_BUFFER_OVERFLOW_ERROR) { - status = U_ZERO_ERROR; - fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar)); - if (fNormalizedText == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - } else { - fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0, - fNormalizedText, fNormalizedTextLength+1, &status); - } - } -} - - -NFDBuffer::~NFDBuffer() { - if (fNormalizedText != fSmallBuf) { - uprv_free(fNormalizedText); - } - fNormalizedText = 0; -} - -const UChar *NFDBuffer::getBuffer() { - return fNormalizedText; -} - -int32_t NFDBuffer::getLength() { - return fNormalizedTextLength; -} - - - - - U_NAMESPACE_END U_NAMESPACE_USE diff --git a/icu4c/source/i18n/uspoof_impl.h b/icu4c/source/i18n/uspoof_impl.h index f89af082e1a..dc9dda9da0f 100644 --- a/icu4c/source/i18n/uspoof_impl.h +++ b/icu4c/source/i18n/uspoof_impl.h @@ -1,6 +1,6 @@ /* *************************************************************************** -* Copyright (C) 2008-2011, International Business Machines Corporation +* Copyright (C) 2008-2013, International Business Machines Corporation * and others. All Rights Reserved. *************************************************************************** * @@ -15,10 +15,10 @@ #include "unicode/utypes.h" #include "unicode/uspoof.h" -#include "utrie2.h" #include "unicode/uscript.h" #include "unicode/udata.h" +#include "utrie2.h" #if !UCONFIG_NO_NORMALIZATION @@ -37,10 +37,11 @@ U_NAMESPACE_BEGIN // Magic number for sanity checking spoof data. 
#define USPOOF_MAGIC 0x3845fdef +class IdentifierInfo; +class ScriptSet; class SpoofData; struct SpoofDataHeader; struct SpoofStringLengthsElement; -class ScriptSet; /** * Class SpoofImpl corresponds directly to the plain C API opaque type @@ -65,7 +66,7 @@ public: * One of USPOOF_SL_TABLE_FLAG, USPOOF_MA_TABLE_FLAG, etc. * @return The length in UTF-16 code units of the substition string. */ - int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const; + int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &destBuf) const; /** Set and Get AllowedLocales, implementations of the corresponding API */ void setAllowedLocales(const char *localesList, UErrorCode &status); @@ -83,23 +84,18 @@ public: // Return the test bit flag to be ORed into the eventual user return value // if a Spoof opportunity is detected. void wholeScriptCheck( - const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const; + const UnicodeString &text, ScriptSet *result, UErrorCode &status) const; - /** Scan a string to determine how many scripts it includes. - * Ignore characters with script=Common and scirpt=Inherited. - * @param text The UChar text to be scanned - * @param length The length of the input text, -1 for nul termintated. - * @param pos An out parameter, set to the first input postion at which - * a second script was encountered, ignoring Common and Inherited. - * @param status For errors. - * @return the number of (non-common,inherited) scripts encountered, - * clipped to a max of two. - */ - int32_t scriptScan(const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const; - static UClassID U_EXPORT2 getStaticClassID(void); virtual UClassID getDynamicClassID(void) const; + // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create. 
+ // Maintain a one-element cache, which is sufficient to avoid repeatedly + // creating new ones unless we get multi-thread concurrency in spoof + // check operations, which should be statistically uncommon. + IdentifierInfo *getIdentifierInfo(UErrorCode &status) const; + void releaseIdentifierInfo(IdentifierInfo *idInfo) const; + // // Data Members // @@ -113,6 +109,9 @@ public: // for this Spoof Checker. Defaults to all chars. const char *fAllowedLocales; // The list of allowed locales. + URestrictionLevel fRestrictionLevel; // The maximum restriction level for an acceptable identifier. + + IdentifierInfo *fCachedIdentifierInfo; // Do not use directly. See getIdentifierInfo().:w }; @@ -179,67 +178,6 @@ struct SpoofStringLengthsElement { }; -//------------------------------------------------------------------------------- -// -// ScriptSet - Wrapper class for the Script code bit sets that are part of the -// whole script confusable data. -// -// This class is used both at data build and at run time. -// The constructor is only used at build time. -// At run time, just point at the prebuilt data and go. -// -//------------------------------------------------------------------------------- -class ScriptSet: public UMemory { - public: - ScriptSet(); - ~ScriptSet(); - - UBool operator == (const ScriptSet &other); - ScriptSet & operator = (const ScriptSet &other); - - void Union(const ScriptSet &other); - void Union(UScriptCode script); - void intersect(const ScriptSet &other); - void intersect(UScriptCode script); - void setAll(); - void resetAll(); - int32_t countMembers(); - - private: - uint32_t bits[6]; -}; - - - - -//------------------------------------------------------------------------------- -// -// NFDBuffer A little class to handle the NFD normalization that is -// needed on incoming identifiers to be checked. -// Takes care of buffer handling and normalization -// -// Instances of this class are intended to be stack-allocated. 
-// -// TODO: how to map position offsets back to user values? -// -//-------------------------------------------------------------------------------- -class NFDBuffer: public UMemory { -public: - NFDBuffer(const UChar *text, int32_t length, UErrorCode &status); - ~NFDBuffer(); - const UChar *getBuffer(); - int32_t getLength(); - - private: - const UChar *fOriginalText; - UChar *fNormalizedText; - int32_t fNormalizedTextLength; - UChar fSmallBuf[USPOOF_STACK_BUFFER_SIZE]; -}; - - - - //------------------------------------------------------------------------------------- // diff --git a/icu4c/source/i18n/uspoof_wsconf.cpp b/icu4c/source/i18n/uspoof_wsconf.cpp index 6ddc9c33a05..ca232834fa2 100644 --- a/icu4c/source/i18n/uspoof_wsconf.cpp +++ b/icu4c/source/i18n/uspoof_wsconf.cpp @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 2008-2012, International Business Machines +* Copyright (C) 2008-2013, International Business Machines * Corporation and others. All Rights Reserved. 
* ****************************************************************************** @@ -29,6 +29,7 @@ #include "unicode/uregex.h" #include "unicode/ustring.h" #include "cmemory.h" +#include "scriptset.h" #include "uspoof_impl.h" #include "uhash.h" #include "uvector.h" @@ -244,8 +245,8 @@ void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, scriptSets->addElement(bsset, status); utrie2_set32(table, cp, setIndex, &status); } - bsset->sset->Union(targScript); - bsset->sset->Union(srcScript); + bsset->sset->set(targScript, status); + bsset->sset->set(srcScript, status); if (U_FAILURE(status)) { goto cleanup; diff --git a/icu4c/source/test/cintltst/spooftest.c b/icu4c/source/test/cintltst/spooftest.c index 75a8c66d91e..7096ad263a9 100644 --- a/icu4c/source/test/cintltst/spooftest.c +++ b/icu4c/source/test/cintltst/spooftest.c @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2009-2012, International Business Machines Corporation and + * Copyright (c) 2009-2013, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ /******************************************************************************** @@ -408,10 +408,13 @@ static void TestUSpoofCAPI(void) { TEST_ASSERT_SUCCESS(status); uset_close(tmpSet); - /* Latin Identifier should now fail; other non-latin test cases should still be OK */ + /* Latin Identifier should now fail; other non-latin test cases should still be OK + * Note: fail of CHAR_LIMIT also causes the restriction level to be USPOOF_UNRESTRICTIVE + * which will give us a USPOOF_RESTRICTION_LEVEL failure. 
+ */ checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); - TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults); + TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT | USPOOF_RESTRICTION_LEVEL, checkResults); checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); @@ -432,7 +435,7 @@ static void TestUSpoofCAPI(void) { checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); - TEST_ASSERT_EQ(666, position); + TEST_ASSERT_EQ(0, position); u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodCyrl, -1, &status); TEST_ASSERT_SUCCESS(status); @@ -446,7 +449,7 @@ static void TestUSpoofCAPI(void) { checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults); - TEST_ASSERT_EQ(2, position); + TEST_ASSERT_EQ(0, position); TEST_TEARDOWN; diff --git a/icu4c/source/test/intltest/itspoof.cpp b/icu4c/source/test/intltest/itspoof.cpp index d33119c34c6..7aeeea788b3 100644 --- a/icu4c/source/test/intltest/itspoof.cpp +++ b/icu4c/source/test/intltest/itspoof.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2011, International Business Machines Corporation +* Copyright (C) 2011-2013, International Business Machines Corporation * and others. All Rights Reserved. 
********************************************************************** */ @@ -13,11 +13,18 @@ #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO #include "itspoof.h" -#include "unicode/uspoof.h" -#include "unicode/unistr.h" -#include "unicode/regex.h" + #include "unicode/normlzr.h" +#include "unicode/regex.h" +#include "unicode/unistr.h" +#include "unicode/uscript.h" +#include "unicode/uspoof.h" + #include "cstring.h" +#include "identifier_info.h" +#include "scriptset.h" +#include "uhash.h" + #include #include @@ -27,6 +34,9 @@ #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \ errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};} +#define TEST_ASSERT_MSG(expr, msg) {if ((expr)==FALSE) { \ + errln("Test Failure at file %s, line %d, %s: \"%s\" is false.\n", __FILE__, __LINE__, msg, #expr);};} + #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \ errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \ __FILE__, __LINE__, #a, (a), #b, (b)); }} @@ -35,6 +45,8 @@ errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \ __FILE__, __LINE__, #a, (a), #b, (b)); }} +#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0]))) + /* * TEST_SETUP and TEST_TEARDOWN * macros to handle the boilerplate around setting up test case. 
@@ -67,37 +79,63 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name testSpoofAPI(); } break; - case 1: + case 1: name = "TestSkeleton"; if (exec) { testSkeleton(); } break; - case 2: + case 2: name = "TestAreConfusable"; if (exec) { testAreConfusable(); } break; - case 3: + case 3: name = "TestInvisible"; if (exec) { testInvisible(); } break; - case 4: + case 4: name = "testConfData"; if (exec) { testConfData(); } break; - case 5: + case 5: name = "testBug8654"; if (exec) { testBug8654(); } break; - default: name=""; break; + case 6: + name = "testIdentifierInfo"; + if (exec) { + testIdentifierInfo(); + } + break; + case 7: + name = "testScriptSet"; + if (exec) { + testScriptSet(); + } + break; + case 8: + name = "testRestrictionLevel"; + if (exec) { + testRestrictionLevel(); + } + break; + case 9: + name = "testMixedNumbers"; + if (exec) { + testMixedNumbers(); + } + break; + + + default: name=""; break; } } @@ -110,7 +148,7 @@ void IntlTestSpoof::testSpoofAPI() { int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); - TEST_ASSERT_EQ(666, position); + TEST_ASSERT_EQ(0, position); TEST_TEARDOWN; TEST_SETUP @@ -250,12 +288,12 @@ void IntlTestSpoof::testInvisible() { int32_t position = -42; TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status)); TEST_ASSERT_SUCCESS(status); - TEST_ASSERT(position == -42); + TEST_ASSERT(0 == position); UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape(); TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status)); TEST_ASSERT_SUCCESS(status); - TEST_ASSERT_EQ(7, position); + TEST_ASSERT_EQ(0, position); // Two acute accents, one from the composed a with acute accent, \u00e1, // and one separate. 
@@ -263,7 +301,7 @@ void IntlTestSpoof::testInvisible() { UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape(); TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status)); TEST_ASSERT_SUCCESS(status); - TEST_ASSERT_EQ(7, position); + TEST_ASSERT_EQ(0, position); TEST_TEARDOWN; } @@ -273,7 +311,7 @@ void IntlTestSpoof::testBug8654() { int32_t position = -42; TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s, &position, &status) & USPOOF_INVISIBLE ); TEST_ASSERT_SUCCESS(status); - TEST_ASSERT_EQ(3, position); + TEST_ASSERT_EQ(0, position); TEST_TEARDOWN; } @@ -414,3 +452,305 @@ void IntlTestSpoof::testConfData() { } #endif // UCONFIG_NO_REGULAR_EXPRESSIONS +// testIdentifierInfo. Note that IdentifierInfo is not public ICU API at this time +void IntlTestSpoof::testIdentifierInfo() { + UErrorCode status = U_ZERO_ERROR; + ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status); + ScriptSet bitset2; bitset2.set(USCRIPT_HANGUL, status); + TEST_ASSERT(bitset12.contains(bitset2)); + TEST_ASSERT(bitset12.contains(bitset12)); + TEST_ASSERT(!bitset2.contains(bitset12)); + + ScriptSet arabSet; arabSet.set(USCRIPT_ARABIC, status); + ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status); + UElement arabEl; arabEl.pointer = &arabSet; + UElement latinEl; latinEl.pointer = &latinSet; + TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0); + TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0); + + UnicodeString scriptString; + bitset12.displayScripts(scriptString); + TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString); + + status = U_ZERO_ERROR; + UHashtable *alternates = uhash_open(uhash_hashScriptSet ,uhash_compareScriptSet, NULL, &status); + uhash_puti(alternates, &bitset12, 1, &status); + uhash_puti(alternates, &bitset2, 1, &status); + UnicodeString alternatesString; + IdentifierInfo::displayAlternates(alternatesString, alternates, status); + 
TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang; Hang Latn") == alternatesString); + TEST_ASSERT_SUCCESS(status); + + status = U_ZERO_ERROR; + ScriptSet tScriptSet; + tScriptSet.parseScripts(scriptString, status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(bitset12 == tScriptSet); + UnicodeString ss; + ss.remove(); + uhash_close(alternates); + + struct Test { + const char *fTestString; + URestrictionLevel fRestrictionLevel; + const char *fNumerics; + const char *fScripts; + const char *fAlternates; + const char *fCommonAlternates; + } tests[] = { + {"\\u0061\\u2665", USPOOF_UNRESTRICTIVE, "[]", "Latn", "", ""}, + {"\\u0061\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"}, + {"\\u0061\\u30FC\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hira Kana", "Hira Kana"}, + {"\\u0061\\u30FC\\u3006\\u30A2", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""}, + {"\\u30A2\\u0061\\u30FC\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""}, + {"\\u0061\\u0031\\u0661", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660]", "Latn", "Arab Thaa", "Arab Thaa"}, + {"\\u0061\\u0031\\u0661\\u06F1", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660\\u06F0]", "Latn Arab", "", ""}, + {"\\u0661\\u30FC\\u3006\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_UNRESTRICTIVE, + "[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab Deva", "", ""}, + {"\\u0061\\u30A2\\u30FC\\u3006\\u0031\\u0967\\u0661\\u06F1", USPOOF_UNRESTRICTIVE, + "[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab Deva", "", ""} + }; + + int testNum; + for (testNum = 0; testNum < LENGTHOF(tests); testNum++) { + char testNumStr[40]; + sprintf(testNumStr, "testNum = %d", testNum); + Test &test = tests[testNum]; + status = U_ZERO_ERROR; + UnicodeString testString(test.fTestString); // Note: may do charset conversion. 
+ testString = testString.unescape(); + IdentifierInfo idInfo(status); + TEST_ASSERT_SUCCESS(status); + idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status)); + idInfo.setIdentifier(testString, status); + TEST_ASSERT_MSG(*idInfo.getIdentifier() == testString, testNumStr); + + URestrictionLevel restrictionLevel = test.fRestrictionLevel; + TEST_ASSERT_MSG(restrictionLevel == idInfo.getRestrictionLevel(status), testNumStr); + + status = U_ZERO_ERROR; + UnicodeSet numerics(UnicodeString(test.fNumerics).unescape(), status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_MSG(numerics == *idInfo.getNumerics(), testNumStr); + + ScriptSet scripts; + scripts.parseScripts(UnicodeString(test.fScripts), status); + TEST_ASSERT_MSG(scripts == *idInfo.getScripts(), testNumStr); + + UnicodeString alternatesStr; + IdentifierInfo::displayAlternates(alternatesStr, idInfo.getAlternates(), status); + TEST_ASSERT_MSG(UnicodeString(test.fAlternates) == alternatesStr, testNumStr); + + ScriptSet commonAlternates; + commonAlternates.parseScripts(UnicodeString(test.fCommonAlternates), status); + TEST_ASSERT_MSG(commonAlternates == *idInfo.getCommonAmongAlternates(), testNumStr); + } + + // Test of getScriptCount() + // Script and or Script Extension for chars used in the tests + // \\u3013 ; Bopo Hang Hani Hira Kana # So GETA MARK + // \\uA838 ; Deva Gujr Guru Kthi Takr # Sc NORTH INDIC RUPEE MARK + // \\u0951 ; Deva Latn # Mn DEVANAGARI STRESS SIGN UDATTA + // + // \\u0370 ; Greek # L GREEK CAPITAL LETTER HETA + // \\u0481 ; Cyrillic # L& CYRILLIC SMALL LETTER KOPPA + // \\u0904 ; Devanagari # Lo DEVANAGARI LETTER SHORT A + // \\u3041 ; Hiragana # Lo HIRAGANA LETTER SMALL A + // 1234 ; Common # ascii digits + // \\u0300 ; Inherited # Mn COMBINING GRAVE ACCENT + + struct ScriptTest { + const char *fTestString; + int32_t fScriptCount; + } scriptTests[] = { + {"Hello", 1}, + {"Hello\\u0370", 2}, + {"1234", 0}, + {"Hello1234\\u0300", 1}, // Common and Inherited are ignored. 
+ {"\\u0030", 0}, + {"abc\\u0951", 1}, + {"abc\\u3013", 2}, + {"\\uA838\\u0951", 1}, // Triggers commonAmongAlternates path. + {"\\u3013\\uA838", 2} + }; + + status = U_ZERO_ERROR; + IdentifierInfo identifierInfo(status); + for (testNum=0; testNum= USCRIPT_CODE_LIMIT); + s2.intersect(s1); + TEST_ASSERT(s2 == s1); + + s2.setAll(); + s2.reset(USCRIPT_COMMON, status); + s2.intersect(s1); + TEST_ASSERT(s2.countMembers() == 1); + + s1.resetAll(); + s1.set(USCRIPT_AFAKA, status); + s1.set(USCRIPT_VAI, status); + s1.set(USCRIPT_INHERITED, status); + int32_t n = -1; + for (int32_t i=0; i<4; i++) { + n = s1.nextSetBit(n+1); + switch (i) { + case 0: TEST_ASSERT_EQ(USCRIPT_INHERITED, n); break; + case 1: TEST_ASSERT_EQ(USCRIPT_VAI, n); break; + case 2: TEST_ASSERT_EQ(USCRIPT_AFAKA, n); break; + case 3: TEST_ASSERT_EQ(-1, (int32_t)n); break; + default: TEST_ASSERT(FALSE); + } + } + TEST_ASSERT_SUCCESS(status); +} + + +void IntlTestSpoof::testRestrictionLevel() { + struct Test { + const char *fId; + URestrictionLevel fExpectedRestrictionLevel; + } tests[] = { + {"\\u0061\\u03B3\\u2665", USPOOF_UNRESTRICTIVE}, + {"a", USPOOF_ASCII}, + {"\\u03B3", USPOOF_HIGHLY_RESTRICTIVE}, + {"\\u0061\\u30A2\\u30FC", USPOOF_HIGHLY_RESTRICTIVE}, + {"\\u0061\\u0904", USPOOF_MODERATELY_RESTRICTIVE}, + {"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE} + }; + char msgBuffer[100]; + + URestrictionLevel restrictionLevels[] = { USPOOF_ASCII, USPOOF_HIGHLY_RESTRICTIVE, + USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE, USPOOF_UNRESTRICTIVE}; + + UErrorCode status = U_ZERO_ERROR; + IdentifierInfo idInfo(status); + TEST_ASSERT_SUCCESS(status); + idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status)); + TEST_ASSERT_SUCCESS(status); + for (int32_t testNum=0; testNum < LENGTHOF(tests); testNum++) { + status = U_ZERO_ERROR; + const Test &test = tests[testNum]; + UnicodeString testString = UnicodeString(test.fId).unescape(); + URestrictionLevel expectedLevel = 
test.fExpectedRestrictionLevel; + idInfo.setIdentifier(testString, status); + sprintf(msgBuffer, "testNum = %d ", testNum); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_MSG(expectedLevel == idInfo.getRestrictionLevel(status), msgBuffer); + for (int levelIndex=0; levelIndex levelSetInSpoofChecker || + !uspoof_getRecommendedUnicodeSet(&status)->containsAll(testString); + sprintf(msgBuffer, "testNum = %d, levelIndex = %d", testNum, levelIndex); + TEST_ASSERT_MSG(expectedFailure == actualValue, msgBuffer); + TEST_ASSERT_SUCCESS(status); + uspoof_close(sc); + } + } +} + + +void IntlTestSpoof::testMixedNumbers() { + struct Test { + const char *fTestString; + const char *fExpectedSet; + } tests[] = { + {"1", "[0]"}, + {"\\u0967", "[\\u0966]"}, + {"1\\u0967", "[0\\u0966]"}, + {"\\u0661\\u06F1", "[\\u0660\\u06F0]"} + }; + UErrorCode status = U_ZERO_ERROR; + IdentifierInfo idInfo(status); + for (int32_t testNum=0; testNum < LENGTHOF(tests); testNum++) { + char msgBuf[100]; + sprintf(msgBuf, "testNum = %d ", testNum); + Test &test = tests[testNum]; + + status = U_ZERO_ERROR; + UnicodeString testString = UnicodeString(test.fTestString).unescape(); + UnicodeSet expectedSet(UnicodeString(test.fExpectedSet).unescape(), status); + idInfo.setIdentifier(testString, status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_MSG(expectedSet == *idInfo.getNumerics(), msgBuf); + + status = U_ZERO_ERROR; + USpoofChecker *sc = uspoof_open(&status); + uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this + int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status); + UBool mixedNumberFailure = ((result & USPOOF_MIXED_NUMBERS) != 0); + TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf); + uspoof_close(sc); + } +} diff --git a/icu4c/source/test/intltest/itspoof.h b/icu4c/source/test/intltest/itspoof.h index abd54f5a930..a92c2ef48b2 100644 --- a/icu4c/source/test/intltest/itspoof.h +++ b/icu4c/source/test/intltest/itspoof.h @@ -1,6 +1,6 @@ /* 
********************************************************************** -* Copyright (C) 2011, International Business Machines Corporation +* Copyright (C) 2011-2013, International Business Machines Corporation * and others. All Rights Reserved. ********************************************************************** */ @@ -36,6 +36,14 @@ public: void testBug8654(); + void testIdentifierInfo(); + + void testScriptSet(); + + void testRestrictionLevel(); + + void testMixedNumbers(); + // Internal function to run a single skeleton test case. void checkSkeleton(const USpoofChecker *sc, uint32_t flags, const char *input, const char *expected, int32_t lineNum);