ICU-9440 spoof checker, merge updates from branch.

X-SVN-Rev: 33162
This commit is contained in:
Andy Heninger 2013-02-11 04:51:14 +00:00
parent 87158d4fba
commit e06001f2d0
17 changed files with 2013 additions and 883 deletions

View file

@ -86,7 +86,7 @@ tmunit.o tmutamt.o tmutfmt.o currpinf.o \
uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o uspoof_wsconf.o decfmtst.o smpdtfst.o \
ztrans.o zrule.o vzone.o fphdlimp.o fpositer.o locdspnm.o \
decNumber.o decContext.o alphaindex.o tznames.o tznames_impl.o tzgnames.o \
tzfmt.o compactdecimalformat.o gender.o region.o
tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o identifier_info.o
## Header files to install
HEADERS = $(srcdir)/unicode/*.h

View file

@ -302,6 +302,7 @@
<ClCompile Include="gregocal.cpp" />
<ClCompile Include="gregoimp.cpp" />
<ClCompile Include="hebrwcal.cpp" />
<ClCompile Include="identifier_info.cpp" />
<ClCompile Include="indiancal.cpp" />
<ClCompile Include="islamcal.cpp" />
<ClCompile Include="japancal.cpp" />
@ -323,6 +324,7 @@
<ClCompile Include="reldtfmt.cpp" />
<ClCompile Include="selfmt.cpp" />
<ClCompile Include="simpletz.cpp" />
<ClCompile Include="scriptset.cpp" />
<ClCompile Include="smpdtfmt.cpp" />
<ClCompile Include="smpdtfst.cpp" />
<ClCompile Include="taiwncal.cpp" />
@ -1546,6 +1548,8 @@
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
<ClInclude Include="identifier_info.h" />
<ClInclude Include="scriptset.h" />
<ClInclude Include="uspoof_conf.h" />
<ClInclude Include="uspoof_impl.h" />
<ClInclude Include="uspoof_wsconf.h" />
@ -1562,4 +1566,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View file

@ -456,6 +456,12 @@
<ClCompile Include="ucsdet.cpp">
<Filter>charset detect</Filter>
</ClCompile>
<ClCompile Include="identifier_info.cpp">
<Filter>spoof</Filter>
</ClCompile>
<ClCompile Include="scriptset.cpp">
<Filter>spoof</Filter>
</ClCompile>
<ClCompile Include="uspoof.cpp">
<Filter>spoof</Filter>
</ClCompile>
@ -759,6 +765,12 @@
<ClInclude Include="inputext.h">
<Filter>charset detect</Filter>
</ClInclude>
<ClInclude Include="identifier_info.h">
<Filter>spoof</Filter>
</ClInclude>
<ClInclude Include="scriptset.h">
<Filter>spoof</Filter>
</ClInclude>
<ClInclude Include="uspoof_conf.h">
<Filter>spoof</Filter>
</ClInclude>
@ -1017,4 +1029,4 @@
<Filter>formatting</Filter>
</CustomBuild>
</ItemGroup>
</Project>
</Project>

View file

@ -0,0 +1,314 @@
/*
**********************************************************************
* Copyright (C) 2012-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/utf16.h"
#include "identifier_info.h"
#include "mutex.h"
#include "scriptset.h"
#include "ucln_in.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
static UMutex gInitMutex = U_MUTEX_INITIALIZER;
static UBool gStaticsAreInitialized = FALSE;
UnicodeSet *IdentifierInfo::ASCII;
ScriptSet *IdentifierInfo::JAPANESE;
ScriptSet *IdentifierInfo::CHINESE;
ScriptSet *IdentifierInfo::KOREAN;
ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN;
// Library-cleanup hook: frees the shared, lazily created constant sets and
// resets the "initialized" flag so that the next IdentifierInfo constructor
// builds them again. Always returns TRUE.
UBool IdentifierInfo::cleanup() {
    // Free all five shared objects first ...
    delete ASCII;
    delete JAPANESE;
    delete CHINESE;
    delete KOREAN;
    delete CONFUSABLE_WITH_LATIN;

    // ... then clear the pointers so nothing dangles.
    ASCII = NULL;
    JAPANESE = NULL;
    CHINESE = NULL;
    KOREAN = NULL;
    CONFUSABLE_WITH_LATIN = NULL;

    gStaticsAreInitialized = FALSE;
    return TRUE;
}
// Free-function trampoline that forwards to the static member
// IdentifierInfo::cleanup(). This is the function handed to
// ucln_i18n_registerCleanup() by the IdentifierInfo constructor.
U_CDECL_BEGIN
static UBool U_CALLCONV
IdentifierInfo_cleanup(void) {
return IdentifierInfo::cleanup();
}
U_CDECL_END
IdentifierInfo::IdentifierInfo(UErrorCode &status):
fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
if (U_FAILURE(status)) {
return;
}
{
Mutex lock(&gInitMutex);
if (!gStaticsAreInitialized) {
ASCII = new UnicodeSet(0, 0x7f);
JAPANESE = new ScriptSet();
CHINESE = new ScriptSet();
KOREAN = new ScriptSet();
CONFUSABLE_WITH_LATIN = new ScriptSet();
if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
|| CONFUSABLE_WITH_LATIN == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
ASCII->freeze();
JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
.set(USCRIPT_KATAKANA, status);
CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
.set(USCRIPT_CHEROKEE, status);
ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
gStaticsAreInitialized = TRUE;
}
}
fIdentifier = new UnicodeString();
fRequiredScripts = new ScriptSet();
fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
fCommonAmongAlternates = new ScriptSet();
fNumerics = new UnicodeSet();
fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
status = U_MEMORY_ALLOCATION_ERROR;
}
};
// Destructor: releases every per-instance object created by the constructor.
// The shared static sets are not touched here; cleanup() handles those.
IdentifierInfo::~IdentifierInfo() {
    // Close the alternates table first, then the plain heap objects.
    uhash_close(fScriptSetSet);

    delete fIdentifierProfile;
    delete fNumerics;
    delete fCommonAmongAlternates;
    delete fRequiredScripts;
    delete fIdentifier;
}
// Discard the results of the previous analysis: required scripts, alternate
// script sets, their commonality, and the numerics set. The stored identifier
// and the identifier profile are left as they are.
// @return *this
IdentifierInfo &IdentifierInfo::clear() {
    // Script bit sets.
    fCommonAmongAlternates->resetAll();
    fRequiredScripts->resetAll();

    // Containers.
    fNumerics->clear();
    uhash_removeAll(fScriptSetSet);

    return *this;
}
// Replace the identifier profile (the set of characters permitted in an
// identifier) with a copy of the supplied set.
// @param identifierProfile the characters to allow; copied, not adopted.
// @return *this
IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
    UnicodeSet &profile = *fIdentifierProfile;
    profile = identifierProfile;
    return *this;
}
// @return a reference to the current identifier profile, which remains owned
//         by this object.
const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
    const UnicodeSet &profile = *fIdentifierProfile;
    return profile;
}
// Analyze a new identifier, replacing the results of any previous analysis.
//
// Copies the identifier into fIdentifier, clear()s the per-identifier state, then
// walks the string one code point at a time, recording:
//   - in fNumerics, (cp - numeric value) for every decimal digit;
//   - in fRequiredScripts, the script of every code point whose script extensions
//     (minus Common and Inherited) name exactly one script;
//   - in fScriptSetSet, a copy of the candidate script set for code points that
//     have more than one, unless fRequiredScripts already intersects that set or
//     the lookup finds the set already present.
// A final pass prunes fScriptSetSet and computes fCommonAmongAlternates.
//
// @param identifier the identifier to analyze
// @param status     error code; nothing is done if it already indicates a failure.
//                   If uscript_getScriptExtensions() fails, the function returns
//                   at that point with the analysis only partially updated.
// @return *this
IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
}
*fIdentifier = identifier;
clear();
// Scratch set, reused for every code point.
ScriptSet scriptsForCP;
UChar32 cp;
// Advance by one or two UTF-16 code units depending on the code point just read.
for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
cp = identifier.char32At(i);
// Store a representative character for each kind of decimal digit
if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
// Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
}
// Fixed-size buffer for the script extensions of this code point.
UScriptCode extensions[500];
int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
if (U_FAILURE(status)) {
return *this;
}
scriptsForCP.resetAll();
for (int32_t j=0; j<extensionsCount; j++) {
scriptsForCP.set(extensions[j], status);
}
// Common and Inherited never count as a script of the identifier.
scriptsForCP.reset(USCRIPT_COMMON, status);
scriptsForCP.reset(USCRIPT_INHERITED, status);
switch (scriptsForCP.countMembers()) {
// Nothing left after removing Common/Inherited: the code point adds no script.
case 0: break;
case 1:
// Single script, record it.
fRequiredScripts->Union(scriptsForCP);
break;
default:
// Several candidate scripts: only record the alternatives if none of them
// is already a required script.
if (!fRequiredScripts->intersects(scriptsForCP)
&& !uhash_geti(fScriptSetSet, &scriptsForCP)) {
// If the set hasn't been added already, add it
// (Add a copy, fScriptSetSet takes ownership of the copy.)
uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
}
break;
}
}
// Now make a final pass through ScriptSetSet to remove alternates that came before singles.
// [Kana], [Kana Hira] => [Kana]
// This is relatively infrequent, so doesn't have to be optimized.
// We also compute any commonalities among the alternates.
if (uhash_count(fScriptSetSet) > 0) {
// Start from "all scripts" and narrow down by intersecting with each surviving alternate.
fCommonAmongAlternates->setAll();
for (int32_t it = -1;;) {
const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
if (nextHashEl == NULL) {
break;
}
ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
// [Kana], [Kana Hira] => [Kana]
if (fRequiredScripts->intersects(*next)) {
uhash_removeElement(fScriptSetSet, nextHashEl);
} else {
fCommonAmongAlternates->intersect(*next);
// [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
// Drop 'next' if some other entry in the table is a subset of it.
for (int32_t otherIt = -1;;) {
const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
if (otherHashEl == NULL) {
break;
}
ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
if (next != other && next->contains(*other)) {
uhash_removeElement(fScriptSetSet, nextHashEl);
break;
}
}
}
}
}
// With no alternates left there is nothing in common among them.
if (uhash_count(fScriptSetSet) == 0) {
fCommonAmongAlternates->resetAll();
}
return *this;
}
// Read-only accessors for the results of the most recent setIdentifier() call.
// Every returned object remains owned by this IdentifierInfo.

// @return the identifier that was last passed to setIdentifier().
const UnicodeString *IdentifierInfo::getIdentifier() const {
    return fIdentifier;
}

// @return the scripts recorded as required by the identifier.
const ScriptSet *IdentifierInfo::getScripts() const {
    return fRequiredScripts;
}

// @return the hash table whose keys are the alternate ScriptSets.
const UHashtable *IdentifierInfo::getAlternates() const {
    return fScriptSetSet;
}

// @return the set of representative zero digits collected from the identifier.
const UnicodeSet *IdentifierInfo::getNumerics() const {
    return fNumerics;
}

// @return the scripts shared by every alternate script set.
const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
    return fCommonAmongAlternates;
}
// Determine the tightest restriction level satisfied by the identifier that
// was last analyzed with setIdentifier().
//
// @param status error code, passed through to ScriptSet::test().
// @return one of the USPOOF_* URestrictionLevel constants.
URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
    const UnicodeString &identifier = *fIdentifier;

    // Characters outside the profile, or digits from more than one numbering
    // system, put the identifier in the loosest category straight away.
    if (!fIdentifierProfile->containsAll(identifier) || getNumerics()->size() > 1) {
        return USPOOF_UNRESTRICTIVE;
    }
    if (ASCII->containsAll(identifier)) {
        return USPOOF_ASCII;
    }

    // Script cardinality "plus":
    //   the number of required scripts,
    //   plus 1 if the alternates have something in common (eg [Arab Thaa]; [Arab Syrc]),
    //   plus the number of alternates otherwise.
    // This only works because cardinality is only ever tested up to 2.
    // Common and Inherited were already removed from the required scripts by
    // setIdentifier().
    int32_t cardinalityPlus = fRequiredScripts->countMembers();
    if (fCommonAmongAlternates->countMembers() == 0) {
        cardinalityPlus += uhash_count(fScriptSetSet);
    } else {
        cardinalityPlus += 1;
    }

    const ScriptSet &required = *fRequiredScripts;

    // A single script, or one of the recognized CJK combinations.
    if (cardinalityPlus < 2
            || containsWithAlternates(*JAPANESE, required)
            || containsWithAlternates(*CHINESE, required)
            || containsWithAlternates(*KOREAN, required)) {
        return USPOOF_HIGHLY_RESTRICTIVE;
    }

    // Latin plus exactly one other script that is not easily confused with Latin.
    if (cardinalityPlus == 2
            && required.test(USCRIPT_LATIN, status)
            && !required.intersects(*CONFUSABLE_WITH_LATIN)) {
        return USPOOF_MODERATELY_RESTRICTIVE;
    }

    return USPOOF_MINIMALLY_RESTRICTIVE;
}
// Number of scripts in the analyzed identifier: the required scripts, plus
// either 1 (when the alternates share at least one script) or the number of
// alternate sets (when they share none).
// Common and Inherited were removed by setIdentifier() and are not counted.
int32_t IdentifierInfo::getScriptCount() const {
    const int32_t requiredCount = fRequiredScripts->countMembers();
    if (fCommonAmongAlternates->countMembers() == 0) {
        return requiredCount + uhash_count(fScriptSetSet);
    }
    return requiredCount + 1;
}
// Test whether 'container' covers the identifier's scripts: it must contain all
// of 'containee', and it must share at least one script with every alternate
// script set stored in fScriptSetSet.
// @return TRUE if both conditions hold, FALSE otherwise.
UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
    if (!container.contains(containee)) {
        return FALSE;
    }
    int32_t pos = -1;
    const UHashElement *element = uhash_nextElement(fScriptSetSet, &pos);
    while (element != NULL) {
        const ScriptSet *alternatives = static_cast<ScriptSet *>(element->key.pointer);
        if (!container.intersects(*alternatives)) {
            return FALSE;
        }
        element = uhash_nextElement(fScriptSetSet, &pos);
    }
    return TRUE;
}
// Append a readable rendering of a table of alternate script sets to 'dest'.
// The sets are sorted with uhash_compareScriptSet() and written out separated
// by "; ".
//
// @param dest       string that receives the appended text.
// @param alternates hash table whose keys are ScriptSet pointers.
// @param status     error code; 'dest' is returned untouched if the temporary
//                   vector cannot be set up.
// @return dest
UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
    UVector sorted(status);
    if (U_FAILURE(status)) {
        return dest;
    }

    // Collect the keys; the vector only borrows the pointers.
    int32_t pos = -1;
    const UHashElement *element;
    while ((element = uhash_nextElement(alternates, &pos)) != NULL) {
        ScriptSet *scripts = static_cast<ScriptSet *>(element->key.pointer);
        sorted.addElement(scripts, status);
    }
    sorted.sort(uhash_compareScriptSet, status);

    UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
    const int32_t total = sorted.size();
    for (int32_t k = 0; k < total; ++k) {
        if (k != 0) {
            dest.append(separator);
        }
        static_cast<ScriptSet *>(sorted.elementAt(k))->displayScripts(dest);
    }
    return dest;
}
U_NAMESPACE_END

View file

@ -0,0 +1,200 @@
/*
**********************************************************************
* Copyright (C) 2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* identifier_info.h
*
* created on: 2013 Jan 7
* created by: Andy Heninger
*/
#ifndef __IDENTIFIER_INFO_H__
#define __IDENTIFIER_INFO_H__
#include "unicode/utypes.h"
#include "unicode/uniset.h"
#include "unicode/uspoof.h"
#include "uhash.h"
U_NAMESPACE_BEGIN
class ScriptSet;
// TODO(andy): review consistency of reference vs pointer arguments to the functions.
/**
* This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
* then setIdentifier. Available methods include:
* <ol>
* <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
* each of these.
* <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
* either Katakana or Hiragana.
* <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
* <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
* the identifier.
* <li>call getRestrictionLevel to see what the UTS36 restriction level is.
* </ol>
*
* This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
*/
class U_I18N_API IdentifierInfo : public UMemory {
public:
/**
* Create an identifier info object. Subsequently, call setIdentifier(), etc.
* @param status Error code, set if errors occur.
* @internal
*/
IdentifierInfo(UErrorCode &status);
/**
* Destructor
*/
virtual ~IdentifierInfo();
private:
/* Disallow copying for now. Can be added if there's a need. */
IdentifierInfo(const IdentifierInfo &other);
public:
/**
* Set the identifier profile: the characters that are to be allowed in the identifier.
*
* @param identifierProfile the characters that are to be allowed in the identifier
* @return this
* @internal
*/
IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
/**
* Get the identifier profile: the characters that are to be allowed in the identifier.
*
* @return The characters that are to be allowed in the identifier.
* @internal
*/
const UnicodeSet &getIdentifierProfile() const;
/**
* Set an identifier to analyze. Afterwards, call methods like getScripts()
*
* @param identifier the identifier to analyze
* @param status Error code, set if errors occur.
* @return this
* @internal
*/
IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
/**
* Get the identifier that was analyzed. The returned string is owned by the ICU library,
* and must not be deleted by the caller.
*
* @return the identifier that was analyzed.
* @internal
*/
const UnicodeString *getIdentifier() const;
/**
* Get the scripts found in the identifier.
*
* @return the set of explicit scripts.
* @internal
*/
const ScriptSet *getScripts() const;
/**
* Get the set of alternate scripts found in the identifier. That is, when a character can be in two scripts, then
* the set consisting of those scripts will be returned.
*
* @return a uhash, with each key being of type (ScriptSet *).
* This is a set, not a map, so the value stored in the uhash is not relevant.
* (It is, in fact, 1).
* Ownership of the uhash and its contents remains with the IdentifierInfo object,
* and remains valid until a new identifier is set or until the object is deleted.
* @internal
*/
const UHashtable *getAlternates() const;
/**
* Get the representative characters (zeros) for the numerics found in the identifier.
*
* @return the set of representative zero digits, one for each decimal number system
* that appears in the identifier.
* @internal
*/
const UnicodeSet *getNumerics() const;
/**
* Find out which scripts are in common among the alternates.
*
* @return the set of scripts that are in common among the alternates.
* @internal
*/
const ScriptSet *getCommonAmongAlternates() const;
/**
* Get the number of scripts appearing in the identifier.
* Note: Common and Inherited scripts are omitted from the count.
* Note: Result may be high when the identifier contains characters
* with alternate scripts. The distinction between
* 0, 1 and > 1 will remain valid, however.
* @return the number of scripts.
*/
int32_t getScriptCount() const;
/**
* Find the "tightest" restriction level that the identifier satisfies.
*
* @param status Error code, set if errors occur.
* @return the restriction level.
* @internal
*/
URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
/**
* NOTE(review): no definition of toString() appears alongside the other member
* definitions in identifier_info.cpp - confirm that it is implemented before use.
*/
UnicodeString toString() const;
/**
* Produce a readable string of alternates.
*
* @param dest the string to which the display form is appended.
* @param alternates a UHashtable of ScriptSets.
* Keys only, no meaningful values in the UHash.
* @param status Error code, set if errors occur.
* @return display form (a reference to dest)
* @internal
*/
static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
/**
* Static memory cleanup function.
* @internal
*/
static UBool cleanup();
private:
/* Reset the per-identifier analysis results. */
IdentifierInfo & clear();
/* True if container contains containee and intersects every alternate script set. */
UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
/* Per-instance state. All of these are allocated in the constructor and deleted in the destructor. */
UnicodeString *fIdentifier;
ScriptSet *fRequiredScripts;
UHashtable *fScriptSetSet;
ScriptSet *fCommonAmongAlternates;
UnicodeSet *fNumerics;
UnicodeSet *fIdentifierProfile;
/* Shared constants, created on first construction and released by cleanup(). */
static UnicodeSet *ASCII;
static ScriptSet *JAPANESE;
static ScriptSet *CHINESE;
static ScriptSet *KOREAN;
static ScriptSet *CONFUSABLE_WITH_LATIN;
};
U_NAMESPACE_END
#endif // __IDENTIFIER_INFO_H__

View file

@ -0,0 +1,276 @@
/*
**********************************************************************
* Copyright (C) 2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* scriptset.cpp
*
* created on: 2013 Jan 7
* created by: Andy Heninger
*/
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/unistr.h"
#include "scriptset.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
//----------------------------------------------------------------------------
//
// ScriptSet implementation
//
//----------------------------------------------------------------------------
// Default constructor: creates the empty set (every bit cleared).
ScriptSet::ScriptSet() {
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t w = 0; w < wordCount; ++w) {
        bits[w] = 0;
    }
}
// Destructor: nothing to release, the bit array is stored inline.
ScriptSet::~ScriptSet() {
}
// Copy constructor: delegates to the assignment operator, which copies every
// word of the bit array.
ScriptSet::ScriptSet(const ScriptSet &other) {
    this->operator=(other);
}
// Assignment: make this set hold exactly the scripts in 'other'.
// @return *this
ScriptSet & ScriptSet::operator =(const ScriptSet &other) {
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t w = 0; w < wordCount; ++w) {
        bits[w] = other.bits[w];
    }
    return *this;
}
// Equality: TRUE when both sets contain exactly the same scripts.
UBool ScriptSet::operator == (const ScriptSet &other) const {
    const int32_t wordCount = LENGTHOF(bits);
    int32_t w = 0;
    // Skip over the leading words that agree.
    while (w < wordCount && bits[w] == other.bits[w]) {
        ++w;
    }
    // Equal only if no mismatching word was found.
    return (w == wordCount) ? TRUE : FALSE;
}
// Test whether a script is a member of this set.
//
// @param script the script code to look up.
// @param status error code. FALSE is returned if it already indicates failure;
//               set to U_ILLEGAL_ARGUMENT_ERROR if 'script' is negative or does
//               not fit in the bit array.
// @return TRUE if the script's bit is set.
UBool ScriptSet::test(UScriptCode script, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return FALSE;
    }
    if (script < 0 || script >= (int32_t)sizeof(bits) * 8) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return FALSE;
    }
    uint32_t index = script / 32;
    // Shift an unsigned 1: for bit 31 a signed "1 << 31" would shift into the
    // sign bit of an int.
    uint32_t bit = (uint32_t)1 << (script & 31);
    return ((bits[index] & bit) != 0);
}
// Add a script to this set.
//
// @param script the script code to add.
// @param status error code. Nothing is done if it already indicates failure;
//               set to U_ILLEGAL_ARGUMENT_ERROR if 'script' is negative or does
//               not fit in the bit array.
// @return *this, so that calls can be chained.
ScriptSet &ScriptSet::set(UScriptCode script, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    if (script < 0 || script >= (int32_t)sizeof(bits) * 8) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    uint32_t index = script / 32;
    // Shift an unsigned 1: for bit 31 a signed "1 << 31" would shift into the
    // sign bit of an int.
    uint32_t bit = (uint32_t)1 << (script & 31);
    bits[index] |= bit;
    return *this;
}
// Remove a script from this set.
//
// @param script the script code to remove.
// @param status error code. Nothing is done if it already indicates failure;
//               set to U_ILLEGAL_ARGUMENT_ERROR if 'script' is negative or does
//               not fit in the bit array.
// @return *this, so that calls can be chained.
ScriptSet &ScriptSet::reset(UScriptCode script, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    if (script < 0 || script >= (int32_t)sizeof(bits) * 8) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    uint32_t index = script / 32;
    // Shift an unsigned 1: for bit 31 a signed "1 << 31" would shift into the
    // sign bit of an int.
    uint32_t bit = (uint32_t)1 << (script & 31);
    bits[index] &= ~bit;
    return *this;
}
// Set union: add every script in 'other' to this set.
// @return *this
ScriptSet &ScriptSet::Union(const ScriptSet &other) {
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t w = 0; w < wordCount; ++w) {
        bits[w] = bits[w] | other.bits[w];
    }
    return *this;
}
// Set intersection: keep only the scripts that are also in 'other'.
// @return *this
ScriptSet &ScriptSet::intersect(const ScriptSet &other) {
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t w = 0; w < wordCount; ++w) {
        bits[w] = bits[w] & other.bits[w];
    }
    return *this;
}
// Intersect this set with the single-element set { script }.
// If 'script' cannot be represented (status ends up as a failure), this set is
// left unchanged.
// @return *this
ScriptSet &ScriptSet::intersect(UScriptCode script, UErrorCode &status) {
    ScriptSet singleton;
    singleton.set(script, status);
    if (U_FAILURE(status)) {
        return *this;
    }
    return this->intersect(singleton);
}
// @return TRUE if this set and 'other' have at least one script in common.
UBool ScriptSet::intersects(const ScriptSet &other) const {
    const int32_t wordCount = LENGTHOF(bits);
    int32_t w = 0;
    // Skip words that share no bits.
    while (w < wordCount && (bits[w] & other.bits[w]) == 0) {
        ++w;
    }
    return (w < wordCount) ? TRUE : FALSE;
}
// @return TRUE if every script in 'other' is also in this set.
UBool ScriptSet::contains(const ScriptSet &other) const {
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t w = 0; w < wordCount; ++w) {
        // Every bit of 'other' must survive the intersection with this set.
        if ((bits[w] & other.bits[w]) != other.bits[w]) {
            return FALSE;
        }
    }
    return TRUE;
}
// Turn on every bit of the set, including bits that correspond to no script.
// @return *this
ScriptSet &ScriptSet::setAll() {
    const int32_t wordCount = LENGTHOF(bits);
    int32_t w = 0;
    while (w < wordCount) {
        bits[w++] = 0xffffffffu;
    }
    return *this;
}
// Make this the empty set.
// @return *this
ScriptSet &ScriptSet::resetAll() {
    const int32_t wordCount = LENGTHOF(bits);
    int32_t w = 0;
    while (w < wordCount) {
        bits[w++] = 0;
    }
    return *this;
}
// @return the number of bits that are set.
int32_t ScriptSet::countMembers() const {
    // Each step of the inner loop clears the lowest set bit of the word, so
    // the work is proportional to the number of members; the sets are
    // normally sparse.
    int32_t members = 0;
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t w = 0; w < wordCount; ++w) {
        for (uint32_t word = bits[w]; word != 0; word &= (word - 1)) {
            ++members;
        }
    }
    return members;
}
// @return a hash value: the exclusive-or of all words of the bit array.
int32_t ScriptSet::hashCode() const {
    int32_t hash = 0;
    const int32_t wordCount = LENGTHOF(bits);
    int32_t w = 0;
    while (w < wordCount) {
        hash ^= bits[w];
        ++w;
    }
    return hash;
}
// Find the first member at or after a given position.
// @param fromIndex the bit index at which to start looking.
// @return the index of the first set bit >= fromIndex, or -1 if there is none
//         or if fromIndex is negative.
int32_t ScriptSet::nextSetBit(int32_t fromIndex) const {
    // TODO: Wants a better implementation.
    if (fromIndex < 0) {
        return -1;
    }
    const int32_t bitLimit = (int32_t)sizeof(bits)*8;
    UErrorCode status = U_ZERO_ERROR;
    int32_t candidate = fromIndex;
    while (candidate < bitLimit) {
        if (test((UScriptCode)candidate, status)) {
            return candidate;
        }
        ++candidate;
    }
    return -1;
}
// Append the short names of all member scripts to 'dest', separated by a
// single space (U+0020).
// @return dest
UnicodeString &ScriptSet::displayScripts(UnicodeString &dest) const {
    int32_t written = 0;
    int32_t script = nextSetBit(0);
    while (script >= 0) {
        if (written > 0) {
            dest.append(0x20);
        }
        const char *scriptName = uscript_getShortName((UScriptCode(script)));
        dest.append(UnicodeString(scriptName, -1, US_INV));
        ++written;
        script = nextSetBit(script + 1);
    }
    return dest;
}
// Replace the contents of this set with the scripts named in a string.
//
// The string is split into names at characters for which u_isUWhiteSpace() is
// true. Each name is looked up as a value of the UCHAR_SCRIPT property and the
// resulting script is added to the set.
//
// @param scriptString the script names to parse.
// @param status       error code. The set is emptied even if status already
//                     indicates failure, but nothing is parsed in that case.
//                     Set to U_ILLEGAL_ARGUMENT_ERROR when a name is not
//                     recognized; parsing stops at the first failure.
// @return *this
ScriptSet &ScriptSet::parseScripts(const UnicodeString &scriptString, UErrorCode &status) {
resetAll();
if (U_FAILURE(status)) {
return *this;
}
// Accumulates the characters of the name currently being read.
UnicodeString oneScriptName;
for (int32_t i=0; i<scriptString.length();) {
UChar32 c = scriptString.char32At(i);
i = scriptString.moveIndex32(i, 1);
if (!u_isUWhiteSpace(c)) {
oneScriptName.append(c);
// Keep accumulating unless this was the last character of the input,
// in which case fall through and process the final name.
if (i < scriptString.length()) {
continue;
}
}
// Reached white space or the end of input: process the pending name, if any.
if (oneScriptName.length() > 0) {
char buf[40];
// Extract at most sizeof(buf)-1 chars, then force termination in the last slot.
oneScriptName.extract(0, oneScriptName.length(), buf, sizeof(buf)-1, US_INV);
buf[sizeof(buf)-1] = 0;
int32_t sc = u_getPropertyValueEnum(UCHAR_SCRIPT, buf);
if (sc == UCHAR_INVALID_CODE) {
status = U_ILLEGAL_ARGUMENT_ERROR;
} else {
this->set((UScriptCode)sc, status);
}
if (U_FAILURE(status)) {
return *this;
}
// Start collecting the next name.
oneScriptName.remove();
}
}
return *this;
}
U_NAMESPACE_END
// Equality callback for ScriptSet keys: both elements must hold ScriptSet
// pointers. Returns the result of ScriptSet::operator==.
U_CAPI UBool U_EXPORT2
uhash_equalsScriptSet(const UElement key1, const UElement key2) {
    const icu::ScriptSet &lhs = *static_cast<icu::ScriptSet *>(key1.pointer);
    const icu::ScriptSet &rhs = *static_cast<icu::ScriptSet *>(key2.pointer);
    return lhs == rhs;
}
// Ordering callback for ScriptSet elements (both elements must hold ScriptSet
// pointers). Sets are ordered first by their number of members, then by
// comparing their members pairwise in increasing bit-index order.
//
// @return -1, 0 or +1 when key0 sorts before, the same as, or after key1.
U_CAPI int8_t U_EXPORT2
uhash_compareScriptSet(UElement key0, UElement key1) {
    icu::ScriptSet *s0 = static_cast<icu::ScriptSet *>(key0.pointer);
    icu::ScriptSet *s1 = static_cast<icu::ScriptSet *>(key1.pointer);
    int32_t diff = s0->countMembers() - s1->countMembers();
    if (diff == 0) {
        int32_t i0 = s0->nextSetBit(0);
        int32_t i1 = s1->nextSetBit(0);
        // nextSetBit() returns -1 once a set is exhausted; index 0 is a valid
        // member and must not end the walk.
        while ((diff = i0-i1) == 0 && i0 >= 0) {
            i0 = s0->nextSetBit(i0+1);
            i1 = s1->nextSetBit(i1+1);
        }
    }
    // Return only the sign: the raw difference can exceed the int8_t range and
    // would change sign when narrowed.
    return (int8_t)((diff > 0) - (diff < 0));
}
// Hash callback for ScriptSet keys: forwards to ScriptSet::hashCode().
U_CAPI int32_t U_EXPORT2
uhash_hashScriptSet(const UElement key) {
    return static_cast<icu::ScriptSet *>(key.pointer)->hashCode();
}
// Deleter callback: destroys a heap-allocated ScriptSet passed as a void pointer.
U_CAPI void U_EXPORT2
uhash_deleteScriptSet(void *obj) {
    delete static_cast<icu::ScriptSet *>(obj);
}

View file

@ -0,0 +1,76 @@
/*
**********************************************************************
* Copyright (C) 2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* scriptset.h
*
* created on: 2013 Jan 7
* created by: Andy Heninger
*/
#ifndef __SCRIPTSET_H__
#define __SCRIPTSET_H__
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/uscript.h"
#include "uelement.h"
U_NAMESPACE_BEGIN
//-------------------------------------------------------------------------------
//
// ScriptSet - A bit set representing a set of scripts.
//
// This class was originally used exclusively with script sets appearing
// as part of the spoof check whole script confusable binary data. Its
// use has since become more general, but the continued use to wrap
// prebuilt binary data does constrain the design.
//
//-------------------------------------------------------------------------------
class U_I18N_API ScriptSet: public UMemory {
public:
// Construction, copying and comparison.
ScriptSet();
ScriptSet(const ScriptSet &other);
~ScriptSet();
UBool operator == (const ScriptSet &other) const;
ScriptSet & operator = (const ScriptSet &other);
// Single-script queries and updates. Each takes a status argument for error reporting.
UBool test(UScriptCode script, UErrorCode &status) const;
ScriptSet &Union(const ScriptSet &other);
ScriptSet &set(UScriptCode script, UErrorCode &status);
ScriptSet &reset(UScriptCode script, UErrorCode &status);
ScriptSet &intersect(const ScriptSet &other);
ScriptSet &intersect(UScriptCode script, UErrorCode &status);
UBool intersects(const ScriptSet &other) const; // Sets contain at least one script in common.
UBool contains(const ScriptSet &other) const; // All set bits in other are also set in this.
ScriptSet &setAll();
ScriptSet &resetAll();
int32_t countMembers() const;
int32_t hashCode() const;
int32_t nextSetBit(int32_t script) const;
UnicodeString &displayScripts(UnicodeString &dest) const; // append script names to dest string.
ScriptSet & parseScripts(const UnicodeString &scriptsString, UErrorCode &status); // Replaces ScriptSet contents.
private:
// The set itself: one bit per script code, 6 x 32 = 192 bits in total.
uint32_t bits[6];
};
U_NAMESPACE_END
// C-linkage callbacks for storing ScriptSet pointers in ICU containers.

// Equality test for two elements holding ScriptSet pointers; returns TRUE when
// the sets are equal. This is the function to use as a hash table key comparator.
U_CAPI UBool U_EXPORT2
uhash_equalsScriptSet(const UElement key1, const UElement key2);
// Ordering function for two elements holding ScriptSet pointers, for sorting.
// The result is negative, zero or positive; it is NOT an equality test.
U_CAPI int8_t U_EXPORT2
uhash_compareScriptSet(UElement key0, UElement key1);
// Hash function for an element holding a ScriptSet pointer.
U_CAPI int32_t U_EXPORT2
uhash_hashScriptSet(const UElement key);
// Deleter for a heap-allocated ScriptSet.
U_CAPI void U_EXPORT2
uhash_deleteScriptSet(void *obj);
#endif // __SCRIPTSET_H__

View file

@ -26,6 +26,8 @@ as the functions are suppose to be called.
It's usually best to have child dependencies called first. */
typedef enum ECleanupI18NType {
UCLN_I18N_START = -1,
UCLN_I18N_IDENTIFIER_INFO,
UCLN_I18N_SPOOF,
UCLN_I18N_TRANSLITERATOR,
UCLN_I18N_REGEX,
UCLN_I18N_ISLAMIC_CALENDAR,

View file

@ -180,6 +180,21 @@ class U_I18N_API AlphabeticIndex: public UObject {
*/
AlphabeticIndex(const Locale &locale, UErrorCode &status);
/**
* Construct an AlphabeticIndex that uses a specific collator.
*
* The index will be created with no labels; the addLabels() function must be called
* after creation to add the desired labels to the index.
*
* The index adopts the collator, and is responsible for deleting it.
* The caller should make no further use of the collator after creating the index.
*
* @param collator The collator to use to order the contents of this index.
* @param status Error code, will be set with the reason if the
* operation fails.
* @draft ICU 51
*/
AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status);
/**
* Construct an AlphabeticIndex that uses a specific collator.

View file

@ -1,6 +1,6 @@
/*
***************************************************************************
* Copyright (C) 2008-2012, International Business Machines Corporation
* Copyright (C) 2008-2013, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
* file name: uspoof.h
@ -188,11 +188,27 @@ typedef enum USpoofChecks {
Any Case Confusable. */
USPOOF_ANY_CASE = 8,
/**
* Check that an identifier is no looser than the specified RestrictionLevel.
* The default if uspoof_setRestrictionLevel() is not called is HIGHLY_RESTRICTIVE.
*
* If USPOOF_AUX_INFO is enabled the actual restriction level of the
* identifier being tested will also be returned by uspoof_check().
*
* @see URestrictionLevel
* @see uspoof_setRestrictionLevel
* @see USPOOF_AUX_INFO
*
* @stable ICU 51
*/
USPOOF_RESTRICTION_LEVEL = 16,
/** Check that an identifier contains only characters from a
* single script (plus chars from the common and inherited scripts.)
* Applies to checks of a single identifier check only.
* @deprecated ICU 51 Use RESTRICTION_LEVEL instead.
*/
USPOOF_SINGLE_SCRIPT = 16,
USPOOF_SINGLE_SCRIPT = USPOOF_RESTRICTION_LEVEL,
/** Check an identifier for the presence of invisible characters,
* such as zero-width spaces, or character sequences that are
@ -208,10 +224,78 @@ typedef enum USpoofChecks {
*/
USPOOF_CHAR_LIMIT = 64,
USPOOF_ALL_CHECKS = 0x7f
/**
* Check that an identifier does not include decimal digits from
* more than one numbering system.
*
* @draft ICU 51
*/
USPOOF_MIXED_NUMBERS = 128,
/**
* Enable all spoof checks.
*
* @stable ICU 4.6
*/
USPOOF_ALL_CHECKS = 0xFFFF,
/**
* Enable the return of auxiliary (non-error) information in the
* upper bits of the check results value.
*
* If this "check" is not enabled, the results of uspoof_check() will be zero when an
* identifier passes all of the enabled checks.
*
* If this "check" is enabled, (uspoof_check() & USPOOF_ALL_CHECKS) will be zero
* when an identifier passes all checks.
*
* @draft ICU 51
*/
USPOOF_AUX_INFO = 0x40000000
} USpoofChecks;
/**
* Constants from UAX #39 for use in setRestrictionLevel(), and
* for returned identifier restriction levels in check results.
* @draft ICU 51
*/
typedef enum URestrictionLevel {
/**
* Only ASCII characters: U+0000..U+007F
*
* @draft ICU 51
*/
USPOOF_ASCII = 0x10000000,
/**
* All characters in each identifier must be from a single script, or from the combinations: Latin + Han +
* Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul. Note that this level will satisfy the
* vast majority of Latin-script users; also that TR36 has ASCII instead of Latin.
*
* @draft ICU 51
*/
USPOOF_HIGHLY_RESTRICTIVE = 0x20000000,
/**
* Allow Latin with other scripts except Cyrillic, Greek, Cherokee. Otherwise, the same as Highly Restrictive.
*
* @draft ICU 51
*/
USPOOF_MODERATELY_RESTRICTIVE = 0x30000000,
/**
* Allow arbitrary mixtures of scripts. Otherwise, the same as Moderately Restrictive.
*
* @draft ICU 51
*/
USPOOF_MINIMALLY_RESTRICTIVE = 0x40000000,
/**
* Any valid identifiers, including characters outside of the Identifier Profile.
*
* @draft ICU 51
*/
USPOOF_UNRESTRICTIVE = 0x50000000
} URestrictionLevel;
/**
* Create a Unicode Spoof Checker, configured to perform all
* checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT.
@ -255,7 +339,7 @@ uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLeng
* Open a Spoof Checker from the source form of the spoof data.
* The Three inputs correspond to the Unicode data files confusables.txt
* confusablesWholeScript.txt and xidmodifications.txt as described in
* Unicode UAX 39. The syntax of the source data is as described in UAX 39 for
* Unicode UAX #39. The syntax of the source data is as described in UAX #39 for
* these files, and the content of these files is acceptable input.
*
* The character encoding of the (char *) input text is UTF-8.
@ -357,6 +441,28 @@ uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status);
U_STABLE int32_t U_EXPORT2
uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
/**
* Set the loosest restriction level allowed. The default if this function
* is not called is HIGHLY_RESTRICTIVE.
* Calling this function also enables the RESTRICTION_LEVEL check.
* @param restrictionLevel The loosest restriction level allowed.
* @see URestrictionLevel
* @draft ICU 51
*/
U_DRAFT void U_EXPORT2
uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
/**
* Get the Restriction Level that will be tested if the checks include RESTRICTION_LEVEL.
*
* @return The restriction level
* @see URestrictionLevel
* @draft ICU 51
*/
U_DRAFT URestrictionLevel U_EXPORT2
uspoof_getRestrictionLevel(const USpoofChecker *sc);
/**
* Limit characters that are acceptable in identifiers being checked to those
* normally used with the languages associated with the specified locales.
@ -488,7 +594,7 @@ uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status);
* characters that are permitted. Ownership of the set
* remains with the caller. The incoming set is cloned by
* this function, so there are no restrictions on modifying
* or deleting the USet after calling this function.
* or deleting the UnicodeSet after calling this function.
* @param status The error code, set if this function encounters a problem.
* @stable ICU 4.2
*/
@ -527,31 +633,29 @@ uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status);
* The set of checks to be performed is specified with uspoof_setChecks().
*
* @param sc The USpoofChecker
* @param text The string to be checked for possible security issues,
* @param id The identifier to be checked for possible security issues,
* in UTF-16 format.
* @param length the length of the string to be checked, expressed in
* 16 bit UTF-16 code units, or -1 if the string is
* zero terminated.
* @param position An out parameter that receives the index of the
* first string position that fails the allowed character
* limitation checks.
* This parameter may be null if the position information
* is not needed.
* If the string passes the requested checks the
* parameter value will not be set.
* @param position An out parameter.
* Originally, the index of the first string position that failed a check.
* Now, always returns zero.
* This parameter may be null.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Spoofing or security issues detected with the input string are
* not reported here, but through the function's return value.
* @return An integer value with bits set for any potential security
* or spoofing issues detected. The bits are defined by
* enum USpoofChecks. Zero is returned if no issues
* are found with the input string.
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
* will be zero if the input string passes all of the
* enabled checks.
* @stable ICU 4.2
*/
U_STABLE int32_t U_EXPORT2
uspoof_check(const USpoofChecker *sc,
const UChar *text, int32_t length,
const UChar *id, int32_t length,
int32_t *position,
UErrorCode *status);
@ -562,16 +666,14 @@ uspoof_check(const USpoofChecker *sc,
* The set of checks to be performed is specified with uspoof_setChecks().
*
* @param sc The USpoofChecker
* @param text A UTF-8 string to be checked for possible security issues.
* @param id An identifier to be checked for possible security issues, in UTF-8 format.
* @param length the length of the string to be checked, or -1 if the string is
* zero terminated.
* @param position An out parameter that receives the index of the
* first string position that fails the allowed character
* limitation checks.
* This parameter may be null if the position information
* is not needed.
* If the string passes the requested checks the
* parameter value will not be set.
* @param position An out parameter.
* Originally, the index of the first string position that failed a check.
* Now, always returns zero.
* This parameter may be null.
* @deprecated ICU 51
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Spoofing or security issues detected with the input string are
@ -580,13 +682,14 @@ uspoof_check(const USpoofChecker *sc,
* a status of U_INVALID_CHAR_FOUND will be returned.
* @return An integer value with bits set for any potential security
* or spoofing issues detected. The bits are defined by
* enum USpoofChecks. Zero is returned if no issues
* are found with the input string.
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
* will be zero if the input string passes all of the
* enabled checks.
* @stable ICU 4.2
*/
U_STABLE int32_t U_EXPORT2
uspoof_checkUTF8(const USpoofChecker *sc,
const char *text, int32_t length,
const char *id, int32_t length,
int32_t *position,
UErrorCode *status);
@ -598,28 +701,26 @@ uspoof_checkUTF8(const USpoofChecker *sc,
* The set of checks to be performed is specified with uspoof_setChecks().
*
* @param sc The USpoofChecker
* @param text A UnicodeString to be checked for possible security issues.
* @param position An out parameter that receives the index of the
* first string position that fails the allowed character
* limitation checks.
* This parameter may be null if the position information
* is not needed.
* If the string passes the requested checks the
* parameter value will not be set.
* @param id An identifier to be checked for possible security issues.
* @param position An out parameter.
* Originally, the index of the first string position that failed a check.
* Now, always returns zero.
* This parameter may be null.
* @deprecated ICU 51
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Spoofing or security issues detected with the input string are
* not reported here, but through the function's return value.
* @return An integer value with bits set for any potential security
* or spoofing issues detected. The bits are defined by
* enum USpoofChecks. Zero is returned if no issues
* are found with the input string.
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
* will be zero if the input string passes all of the
* enabled checks.
* @stable ICU 4.2
*/
U_STABLE int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
const icu::UnicodeString &text,
const icu::UnicodeString &id,
int32_t *position,
UErrorCode *status);
@ -645,30 +746,30 @@ uspoof_checkUnicodeString(const USpoofChecker *sc,
*
*
* @param sc The USpoofChecker
* @param s1 The first of the two strings to be compared for
* @param id1 The first of the two identifiers to be compared for
* confusability. The strings are in UTF-16 format.
* @param length1 the length of the first string, expressed in
* @param length1 the length of the first identifier, expressed in
* 16 bit UTF-16 code units, or -1 if the string is
* zero terminated.
* @param s2 The second of the two strings to be compared for
* confusability. The strings are in UTF-16 format.
* @param length2 The length of the second string, expressed in
* nul terminated.
* @param id2 The second of the two identifiers to be compared for
* confusability. The identifiers are in UTF-16 format.
* @param length2 The length of the second identifier, expressed in
* 16 bit UTF-16 code units, or -1 if the string is
* zero terminated.
* nul terminated.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Confusability of the strings is not reported here,
* Confusability of the identifiers is not reported here,
* but through this function's return value.
* @return An integer value with bit(s) set corresponding to
* the type of confusability found, as defined by
* enum USpoofChecks. Zero is returned if the strings
* enum USpoofChecks. Zero is returned if the identifiers
* are not confusable.
* @stable ICU 4.2
*/
U_STABLE int32_t U_EXPORT2
uspoof_areConfusable(const USpoofChecker *sc,
const UChar *s1, int32_t length1,
const UChar *s2, int32_t length2,
const UChar *id1, int32_t length1,
const UChar *id2, int32_t length2,
UErrorCode *status);
@ -680,14 +781,14 @@ uspoof_areConfusable(const USpoofChecker *sc,
* USpoofChecker.
*
* @param sc The USpoofChecker
* @param s1 The first of the two strings to be compared for
* @param id1 The first of the two identifiers to be compared for
* confusability. The strings are in UTF-8 format.
* @param length1 the length of the first identifier, in bytes, or -1
* if the string is nul terminated.
* @param id2 The second of the two identifiers to be compared for
* confusability. The strings are in UTF-8 format.
* @param length1 the length of the first string, in bytes, or -1
* if the string is zero terminated.
* @param s2 The second of the two strings to be compared for
* confusability. The strings are in UTF-8 format.
* @param length2 The length of the second string in bytes, or -1
* if the string is zero terminated.
* if the string is nul terminated.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Confusability of the strings is not reported here,
@ -700,8 +801,8 @@ uspoof_areConfusable(const USpoofChecker *sc,
*/
U_STABLE int32_t U_EXPORT2
uspoof_areConfusableUTF8(const USpoofChecker *sc,
const char *s1, int32_t length1,
const char *s2, int32_t length2,
const char *id1, int32_t length1,
const char *id2, int32_t length2,
UErrorCode *status);
@ -715,17 +816,17 @@ uspoof_areConfusableUTF8(const USpoofChecker *sc,
* USpoofChecker.
*
* @param sc The USpoofChecker
* @param s1 The first of the two strings to be compared for
* @param id1 The first of the two identifiers to be compared for
* confusability. The strings are in UTF-8 format.
* @param id2 The second of the two identifiers to be compared for
* confusability. The strings are in UTF-8 format.
* @param s2 The second of the two strings to be compared for
* confusability. The strings are in UTF-18 format.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Confusability of the strings is not reported here,
* Confusability of the identifiers is not reported here,
* but through this function's return value.
* @return An integer value with bit(s) set corresponding to
* the type of confusability found, as defined by
* enum USpoofChecks. Zero is returned if the strings
* enum USpoofChecks. Zero is returned if the identifiers
* are not confusable.
* @stable ICU 4.2
*/
@ -738,10 +839,10 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
/**
* Get the "skeleton" for an identifier string.
* Skeletons are a transformation of the input string;
* Two strings are confusable if their skeletons are identical.
* See Unicode UAX 39 for additional information.
* Get the "skeleton" for an identifier.
* Skeletons are a transformation of the input identifier;
* Two identifiers are confusable if their skeletons are identical.
* See Unicode UAX #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
@ -754,8 +855,8 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
* The default is Mixed-Script, Lowercase.
* Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and
* USPOOF_ANY_CASE_CONFUSABLE. The two flags may be ORed.
* @param s The input string whose skeleton will be computed.
* @param length The length of the input string, expressed in 16 bit
* @param id The input identifier whose skeleton will be computed.
* @param length The length of the input identifier, expressed in 16 bit
* UTF-16 code units, or -1 if the string is zero terminated.
* @param dest The output buffer, to receive the skeleton string.
* @param destCapacity The length of the output buffer, in 16 bit units.
@ -772,15 +873,15 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
U_STABLE int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc,
uint32_t type,
const UChar *s, int32_t length,
const UChar *id, int32_t length,
UChar *dest, int32_t destCapacity,
UErrorCode *status);
/**
* Get the "skeleton" for an identifier string.
* Skeletons are a transformation of the input string;
* Two strings are confusable if their skeletons are identical.
* See Unicode UAX 39 for additional information.
* Get the "skeleton" for an identifier.
* Skeletons are a transformation of the input identifier;
* Two identifiers are confusable if their skeletons are identical.
* See Unicode UAX #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
@ -793,7 +894,7 @@ uspoof_getSkeleton(const USpoofChecker *sc,
* The default is Mixed-Script, Lowercase.
* Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and
* USPOOF_ANY_CASE. The two flags may be ORed.
* @param s The UTF-8 format input string whose skeleton will be computed.
* @param id The UTF-8 format identifier whose skeleton will be computed.
* @param length The length of the input string, in bytes,
* or -1 if the string is zero terminated.
* @param dest The output buffer, to receive the skeleton string.
@ -814,16 +915,16 @@ uspoof_getSkeleton(const USpoofChecker *sc,
U_STABLE int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
uint32_t type,
const char *s, int32_t length,
const char *id, int32_t length,
char *dest, int32_t destCapacity,
UErrorCode *status);
#if U_SHOW_CPLUSPLUS_API
/**
* Get the "skeleton" for an identifier string.
* Skeletons are a transformation of the input string;
* Two strings are confusable if their skeletons are identical.
* See Unicode UAX 39 for additional information.
* Get the "skeleton" for an identifier.
* Skeletons are a transformation of the input identifier;
* Two identifiers are confusable if their skeletons are identical.
* See Unicode UAX #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
@ -836,8 +937,8 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
* The default is Mixed-Script, Lowercase.
* Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and
* USPOOF_ANY_CASE_CONFUSABLE. The two flags may be ORed.
* @param s The input string whose skeleton will be computed.
* @param dest The output string, to receive the skeleton string.
* @param id The input identifier whose skeleton will be computed.
* @param dest The output identifier, to receive the skeleton string.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* @return A reference to the destination (skeleton) string.
@ -847,17 +948,83 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
U_I18N_API icu::UnicodeString & U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
uint32_t type,
const icu::UnicodeString &s,
const icu::UnicodeString &id,
icu::UnicodeString &dest,
UErrorCode *status);
#endif /* U_SHOW_CPLUSPLUS_API */
/**
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
* in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers
*
* The returned set is frozen. Ownership of the set remains with the ICU library; it must not
* be deleted by the caller.
*
* @param status The error code, set if a problem occurs while creating the set.
*
* @draft ICU 51
*/
U_DRAFT const USet * U_EXPORT2
uspoof_getInclusionSet(UErrorCode *status);
/**
* Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
* in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Recommended_Scripts
*
* The returned set is frozen. Ownership of the set remains with the ICU library; it must not
* be deleted by the caller.
*
* @param status The error code, set if a problem occurs while creating the set.
*
* @draft ICU 51
*/
U_DRAFT const USet * U_EXPORT2
uspoof_getRecommendedSet(UErrorCode *status);
#if U_SHOW_CPLUSPLUS_API
/**
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
* in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers
*
* The returned set is frozen. Ownership of the set remains with the ICU library; it must not
* be deleted by the caller.
*
* @param status The error code, set if a problem occurs while creating the set.
*
* @draft ICU 51
*/
U_DRAFT const UnicodeSet * U_EXPORT2
uspoof_getInclusionUnicodeSet(UErrorCode *status);
/**
* Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
* in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Recommended_Scripts
*
* The returned set is frozen. Ownership of the set remains with the ICU library; it must not
* be deleted by the caller.
*
* @param status The error code, set if a problem occurs while creating the set.
*
* @draft ICU 51
*/
U_DRAFT const UnicodeSet * U_EXPORT2
uspoof_getRecommendedUnicodeSet(UErrorCode *status);
#endif /* U_SHOW_CPLUSPLUS_API */
/**
* Serialize the data for a spoof detector into a chunk of memory.
* The flattened spoof detection tables can later be used to efficiently
* instantiate a new Spoof Detector.
*
* The serialized spoof checker includes only the data compiled from the
* Unicode data tables by uspoof_openFromSource(); it does not include
* any other state or configuration that may have been set.
*
* @param sc the Spoof Detector whose data is to be serialized.
* @param data a pointer to 32-bit-aligned memory to be filled with the data,
* can be NULL if capacity==0

File diff suppressed because it is too large Load diff

View file

@ -1,19 +1,20 @@
/*
**********************************************************************
* Copyright (C) 2008-2011, International Business Machines
* Copyright (C) 2008-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "unicode/unorm.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "utrie2.h"
#include "cmemory.h"
#include "cstring.h"
#include "identifier_info.h"
#include "scriptset.h"
#include "udatamem.h"
#include "umutex.h"
#include "udataswp.h"
@ -28,37 +29,41 @@ U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) , fAllowedLocales(uprv_strdup("")) {
fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
if (U_FAILURE(status)) {
return;
}
fMagic = USPOOF_MAGIC;
fSpoofData = data;
fChecks = USPOOF_ALL_CHECKS;
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
if (allowedCharsSet == NULL || fAllowedLocales == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
allowedCharsSet->freeze();
fAllowedCharsSet = allowedCharsSet;
}
fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
SpoofImpl::SpoofImpl() {
fMagic = USPOOF_MAGIC;
fSpoofData = NULL;
fChecks = USPOOF_ALL_CHECKS;
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
allowedCharsSet->freeze();
fAllowedCharsSet = allowedCharsSet;
fAllowedLocales = uprv_strdup("");
if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
fMagic = USPOOF_MAGIC;
}
// Default constructor.
//   Produces a checker with no spoof data attached, all checks enabled, an
//   allowed-character set covering U+0000..U+10FFFF, an empty allowed-locales
//   string and the HIGHLY_RESTRICTIVE restriction level.
//
//   This constructor has no UErrorCode parameter, so an allocation failure
//   cannot be reported here; the corresponding member is simply left NULL.
SpoofImpl::SpoofImpl() :
    fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
    fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
    UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
    // The status-taking constructor treats a NULL result from new as possible,
    // so do not dereference the pointer unconditionally here either.
    if (allowedCharsSet != NULL) {
        allowedCharsSet->freeze();
    }
    fAllowedCharsSet = allowedCharsSet;
    fAllowedLocales = uprv_strdup("");
    fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
}
// Copy Constructor, used by the user level clone() function.
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) {
fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
if (U_FAILURE(status)) {
return;
}
@ -72,6 +77,7 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
status = U_MEMORY_ALLOCATION_ERROR;
}
fAllowedLocales = uprv_strdup(src.fAllowedLocales);
fRestrictionLevel = src.fRestrictionLevel;
}
SpoofImpl::~SpoofImpl() {
@ -82,6 +88,7 @@ SpoofImpl::~SpoofImpl() {
}
delete fAllowedCharsSet;
uprv_free((void *)fAllowedLocales);
delete fCachedIdentifierInfo;
}
//
@ -121,10 +128,10 @@ SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
// implementation.
//
// Given a source character, produce the corresponding
// replacement character(s)
// replacement character(s), appending them to the dest string.
//
//---------------------------------------------------------------------------------------
int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const {
int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
// Binary search the spoof data key table for the inChar
int32_t *low = fSpoofData->fCFUKeys;
@ -148,7 +155,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
if (inChar != midc) {
// Char not found. It maps to itself.
int i = 0;
U16_APPEND_UNSAFE(destBuf, i, inChar)
dest.append(inChar);
return i;
}
foundChar:
@ -176,7 +183,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
// No key entry for this char & table.
// The input char maps to itself.
int i = 0;
U16_APPEND_UNSAFE(destBuf, i, inChar)
dest.append(inChar);
return i;
}
@ -188,7 +195,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
// an index into the string table (for longer strings)
uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
if (stringLen == 1) {
destBuf[0] = value;
dest.append((UChar)value);
return 1;
}
@ -212,9 +219,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
UChar *src = &fSpoofData->fCFUStrings[value];
for (ix=0; ix<stringLen; ix++) {
destBuf[ix] = src[ix];
}
dest.append(src, stringLen);
return stringLen;
}
@ -231,16 +236,15 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
//
//---------------------------------------------------------------------------------------
void SpoofImpl::wholeScriptCheck(
const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const {
int32_t inputIdx = 0;
UChar32 c;
const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
UTrie2 *table =
(fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
result->setAll();
while (inputIdx < length) {
U16_NEXT(text, inputIdx, length, c);
int32_t length = text.length();
for (int32_t inputIdx=0; inputIdx < length;) {
UChar32 c = text.char32At(inputIdx);
inputIdx += U16_LENGTH(c);
uint32_t index = utrie2_get32(table, c);
if (index == 0) {
// No confusables in another script for this char.
@ -249,7 +253,7 @@ void SpoofImpl::wholeScriptCheck(
// Until then, grab the script from the char and intersect it with the set.
UScriptCode cpScript = uscript_getScript(c, &status);
U_ASSERT(cpScript > USCRIPT_INHERITED);
result->intersect(cpScript);
result->intersect(cpScript, status);
} else if (index == 1) {
// Script == Common or Inherited. Nothing to do.
} else {
@ -371,47 +375,6 @@ void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UEr
}
// Scan text and count the distinct scripts encountered, stopping at two.
//   text    The UTF-16 text to scan.
//   length  Number of UTF-16 code units, or -1 if text is NUL terminated.
//   pos     Out parameter. Written only when a second script is found; it then
//           holds the scan index just past the character that introduced it.
//   status  Nothing is scanned and 0 is returned if this is already a failure.
//   return  0, 1 or 2: the number of script changes seen, clipped at two.
int32_t SpoofImpl::scriptScan
(const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const {
if (U_FAILURE(status)) {
return 0;
}
int32_t inputIdx = 0;
UChar32 c;
int32_t scriptCount = 0;
UScriptCode lastScript = USCRIPT_INVALID_CODE;
UScriptCode sc = USCRIPT_INVALID_CODE;
// Keep reading code points until the input is exhausted or a second script shows up.
while ((inputIdx < length || length == -1) && scriptCount < 2) {
U16_NEXT(text, inputIdx, length, c);
// For NUL-terminated input (length == -1) a zero code point marks the end.
if (c == 0 && length == -1) {
break;
}
sc = uscript_getScript(c, &status);
// Common, Inherited and Unknown characters do not count toward any script.
if (sc == USCRIPT_COMMON || sc == USCRIPT_INHERITED || sc == USCRIPT_UNKNOWN) {
continue;
}
// Temporary fix: fold Hiragana, Katakana and Hangul into Han.
// Names are allowed to mix these scripts.
// A more general solution will follow later for characters that are
// used with multiple scripts.
if (sc == USCRIPT_HIRAGANA || sc == USCRIPT_KATAKANA || sc == USCRIPT_HANGUL) {
sc = USCRIPT_HAN;
}
// Count a new script each time the current script differs from the previous one.
if (sc != lastScript) {
scriptCount++;
lastScript = sc;
}
}
// Report where the second script was detected; pos is untouched otherwise.
if (scriptCount == 2) {
pos = inputIdx;
}
return scriptCount;
}
// Convert a text format hex number. Utility function used by builder code. Static.
// Input: UChar *string text. Output: a UChar32
// Input has been pre-checked, and will have no non-hex chars.
@ -443,6 +406,54 @@ UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorC
return (UChar32)val;
}
// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
// Maintain a one-element cache, which is sufficient to avoid repeatedly
// creating new ones unless we get multi-thread concurrency in spoof
// check operations, which should be statistically uncommon.
// These functions are used in place of new & delete of an IdentifierInfo.
// They will recycle the IdentifierInfo when possible.
// They are logically const, and used within const functions that must be thread safe.
// Obtain an IdentifierInfo for the caller's use.
//   Takes the cached instance if one is available (leaving the cache empty),
//   otherwise creates a new one. Returns NULL, with status set to a failure,
//   if status was already a failure on entry or if creation fails.
//   Hand the object back with releaseIdentifierInfo() when finished.
IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return NULL;
    }

    // Logically const: only the cache slot is modified, under the mutex.
    SpoofImpl *self = const_cast<SpoofImpl *>(this);
    IdentifierInfo *idInfo = NULL;
    {
        Mutex lock;
        idInfo = self->fCachedIdentifierInfo;
        self->fCachedIdentifierInfo = NULL;
    }
    if (idInfo != NULL) {
        return idInfo;
    }

    // Cache was empty; build a fresh object.
    idInfo = new IdentifierInfo(status);
    if (idInfo == NULL) {
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return NULL;
    }
    if (U_FAILURE(status)) {
        // Construction reported an error; do not hand out the object.
        delete idInfo;
        return NULL;
    }
    return idInfo;
}
// Give back an IdentifierInfo previously obtained from getIdentifierInfo().
//   If the one-element cache is empty the object is stored there for reuse;
//   otherwise it is deleted. A NULL idInfo is ignored.
void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
    if (idInfo != NULL) {
        // Logically const: only the cache slot is modified, under the mutex.
        SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
        {
            Mutex m;
            if (nonConstThis->fCachedIdentifierInfo == NULL) {
                nonConstThis->fCachedIdentifierInfo = idInfo;
                idInfo = NULL;    // Ownership moved to the cache.
            }
        }
        // Deletes the object only if the cache did not take it (delete NULL is a no-op).
        delete idInfo;
    }
}
//----------------------------------------------------------------------------------------------
@ -673,149 +684,6 @@ void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
}
//----------------------------------------------------------------------------
//
// ScriptSet implementation
//
//----------------------------------------------------------------------------
// Construct an empty set: every word of the bit array is cleared.
ScriptSet::ScriptSet() {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    while (w < wordCount) {
        bits[w] = 0;
        ++w;
    }
}
// Destructor. The body is empty; there is nothing to release.
ScriptSet::~ScriptSet() {
}
// Equality: TRUE when every word of the two bit arrays matches.
UBool ScriptSet::operator == (const ScriptSet &other) {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    // Advance while the words agree; stopping early means a mismatch was found.
    while (w < wordCount && bits[w] == other.bits[w]) {
        ++w;
    }
    if (w < wordCount) {
        return FALSE;
    }
    return TRUE;
}
// Add a single script to this set.
//   script  The script code whose bit is to be set.
void ScriptSet::Union(UScriptCode script) {
    uint32_t index = script / 32;
    // Shift an unsigned one so that bit 31 can be produced without touching a sign bit.
    uint32_t bit   = (uint32_t)1 << (script & 31);
    // index selects a 32-bit word, so it must be below the number of words in bits[].
    U_ASSERT(index < sizeof(bits)/sizeof(uint32_t));
    bits[index] |= bit;
}
void ScriptSet::Union(const ScriptSet &other) {
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
bits[i] |= other.bits[i];
}
}
void ScriptSet::intersect(const ScriptSet &other) {
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
bits[i] &= other.bits[i];
}
}
// Intersect this set with a single script.
//   Afterwards the set holds at most that one script: its bit is kept only if
//   it was already set, and every other bit is cleared.
//   script  The script code to intersect with.
void ScriptSet::intersect(UScriptCode script) {
    uint32_t index = script / 32;
    // Shift an unsigned one so that bit 31 can be produced without touching a sign bit.
    uint32_t bit   = (uint32_t)1 << (script & 31);
    // index selects a 32-bit word, so it must be below the number of words in bits[].
    U_ASSERT(index < sizeof(bits)/sizeof(uint32_t));
    for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
        // Only the word holding the script's bit can keep anything.
        bits[i] = (i == index) ? (bits[i] & bit) : 0;
    }
}
// Assignment: copy the bit array of other, word by word, into this set.
ScriptSet & ScriptSet::operator =(const ScriptSet &other) {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    while (w < wordCount) {
        bits[w] = other.bits[w];
        ++w;
    }
    return *this;
}
void ScriptSet::setAll() {
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
bits[i] = 0xffffffffu;
}
}
void ScriptSet::resetAll() {
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
bits[i] = 0;
}
}
// Return the number of bits that are set.
//   Each pass of the inner loop clears the lowest set bit, so the work is
//   proportional to the number of one bits, which is normally small here.
int32_t ScriptSet::countMembers() {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    int32_t total = 0;
    for (uint32_t w = 0; w < wordCount; ++w) {
        for (uint32_t remaining = bits[w]; remaining != 0; remaining &= (remaining - 1)) {
            ++total;
        }
    }
    return total;
}
//-----------------------------------------------------------------------------
//
// NFDBuffer Implementation.
//
//-----------------------------------------------------------------------------
// Normalize text to NFD, into fSmallBuf when it fits and into a uprv_malloc'ed
// buffer otherwise.
//   text    The input text; the pointer is also saved in fOriginalText.
//   length  Passed through to unorm_normalize() as the input length.
//   status  If already a failure on entry, no normalization is attempted and
//           the buffer pointer is left NULL. Set to U_MEMORY_ALLOCATION_ERROR
//           if the larger buffer cannot be allocated.
NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
fNormalizedText = NULL;
fNormalizedTextLength = 0;
fOriginalText = text;
if (U_FAILURE(status)) {
return;
}
// First attempt: normalize into the fixed-size member buffer.
fNormalizedText = fSmallBuf;
fNormalizedTextLength = unorm_normalize(
text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
if (status == U_BUFFER_OVERFLOW_ERROR) {
// The member buffer was too small. The first call's return value is used to
// size the allocation (plus one extra UChar), then normalization is redone.
status = U_ZERO_ERROR;
fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar));
if (fNormalizedText == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
} else {
fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0,
fNormalizedText, fNormalizedTextLength+1, &status);
}
}
}
// Destructor: free the normalized-text buffer unless it is the member array
// fSmallBuf, then clear the pointer.
NFDBuffer::~NFDBuffer() {
    const bool usesMemberBuffer = (fNormalizedText == fSmallBuf);
    if (!usesMemberBuffer) {
        uprv_free(fNormalizedText);
    }
    fNormalizedText = NULL;
}
// Return the pointer to the normalized text held by this object.
// It is NULL if the constructor was entered with a failing status or could
// not allocate a buffer.
const UChar *NFDBuffer::getBuffer() {
return fNormalizedText;
}
// Return the normalized-text length recorded by the constructor, i.e. the
// value returned by its last unorm_normalize() call (0 if none was made).
int32_t NFDBuffer::getLength() {
return fNormalizedTextLength;
}
U_NAMESPACE_END
U_NAMESPACE_USE

View file

@ -1,6 +1,6 @@
/*
***************************************************************************
* Copyright (C) 2008-2011, International Business Machines Corporation
* Copyright (C) 2008-2013, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
*
@ -15,10 +15,10 @@
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "utrie2.h"
#include "unicode/uscript.h"
#include "unicode/udata.h"
#include "utrie2.h"
#if !UCONFIG_NO_NORMALIZATION
@ -37,10 +37,11 @@ U_NAMESPACE_BEGIN
// Magic number for sanity checking spoof data.
#define USPOOF_MAGIC 0x3845fdef
class IdentifierInfo;
class ScriptSet;
class SpoofData;
struct SpoofDataHeader;
struct SpoofStringLengthsElement;
class ScriptSet;
/**
* Class SpoofImpl corresponds directly to the plain C API opaque type
@ -65,7 +66,7 @@ public:
* One of USPOOF_SL_TABLE_FLAG, USPOOF_MA_TABLE_FLAG, etc.
* @return The length in UTF-16 code units of the substitution string.
*/
int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const;
int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &destBuf) const;
/** Set and Get AllowedLocales, implementations of the corresponding API */
void setAllowedLocales(const char *localesList, UErrorCode &status);
@ -83,23 +84,18 @@ public:
// Return the test bit flag to be ORed into the eventual user return value
// if a Spoof opportunity is detected.
void wholeScriptCheck(
const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const;
const UnicodeString &text, ScriptSet *result, UErrorCode &status) const;
/** Scan a string to determine how many scripts it includes.
* Ignore characters with script=Common and script=Inherited.
* @param text The UChar text to be scanned
* @param length The length of the input text, -1 for nul terminated.
* @param pos An out parameter, set to the first input position at which
* a second script was encountered, ignoring Common and Inherited.
* @param status For errors.
* @return the number of (non-common,inherited) scripts encountered,
* clipped to a max of two.
*/
int32_t scriptScan(const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const;
static UClassID U_EXPORT2 getStaticClassID(void);
virtual UClassID getDynamicClassID(void) const;
// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
// Maintain a one-element cache, which is sufficient to avoid repeatedly
// creating new ones unless we get multi-thread concurrency in spoof
// check operations, which should be statistically uncommon.
IdentifierInfo *getIdentifierInfo(UErrorCode &status) const;
void releaseIdentifierInfo(IdentifierInfo *idInfo) const;
//
// Data Members
//
@ -113,6 +109,9 @@ public:
// for this Spoof Checker. Defaults to all chars.
const char *fAllowedLocales; // The list of allowed locales.
URestrictionLevel fRestrictionLevel; // The maximum restriction level for an acceptable identifier.
IdentifierInfo *fCachedIdentifierInfo; // Do not use directly. See getIdentifierInfo().
};
@ -179,67 +178,6 @@ struct SpoofStringLengthsElement {
};
//-------------------------------------------------------------------------------
//
// ScriptSet - Wrapper class for the Script code bit sets that are part of the
// whole script confusable data.
//
// This class is used both at data build and at run time.
// The constructor is only used at build time.
// At run time, just point at the prebuilt data and go.
//
//-------------------------------------------------------------------------------
// Declaration only: the member functions are implemented elsewhere, so the
// notes below describe what the signatures show. Anything marked "presumably"
// is an assumption to be confirmed against the implementation.
class ScriptSet: public UMemory {
public:
ScriptSet();
~ScriptSet();
// Equality comparison with another ScriptSet.
// NOTE(review): not declared const, so it cannot be called on a const ScriptSet.
UBool operator == (const ScriptSet &other);
// Assignment from another ScriptSet; returns a reference to a ScriptSet.
ScriptSet & operator = (const ScriptSet &other);
// Presumably adds every script in `other` to this set -- confirm.
void Union(const ScriptSet &other);
// Presumably adds the single script `script` to this set -- confirm.
void Union(UScriptCode script);
// Presumably keeps only the scripts also present in `other` -- confirm.
void intersect(const ScriptSet &other);
// Presumably reduces this set to at most the single script `script` -- confirm.
void intersect(UScriptCode script);
// Presumably sets every bit / clears every bit, respectively -- confirm.
void setAll();
void resetAll();
// Presumably returns the number of scripts in the set -- confirm.
// NOTE(review): not declared const.
int32_t countMembers();
private:
// Backing storage: 6 x 32 = 192 bits.
// Presumably one bit per UScriptCode value -- confirm against the implementation.
uint32_t bits[6];
};
//-------------------------------------------------------------------------------
//
// NFDBuffer A little class to handle the NFD normalization that is
// needed on incoming identifiers to be checked.
// Takes care of buffer handling and normalization
//
// Instances of this class are intended to be stack-allocated.
//
// TODO: how to map position offsets back to user values?
//
//--------------------------------------------------------------------------------
// Declaration only: the member functions are implemented elsewhere. Notes
// marked "presumably" are assumptions to be confirmed against the implementation.
class NFDBuffer: public UMemory {
public:
// Takes the input text and its length, plus an error status in/out parameter.
// Presumably performs the NFD normalization during construction -- confirm.
NFDBuffer(const UChar *text, int32_t length, UErrorCode &status);
~NFDBuffer();
// Accessors for a UChar buffer and a length.
// Presumably the normalized text and its length -- confirm.
const UChar *getBuffer();
int32_t getLength();
private:
// Pointer to the caller's input text; const, so not modified through this member.
const UChar *fOriginalText;
// Presumably points at fSmallBuf, a heap buffer, or the original text,
// depending on size and whether normalization was needed -- confirm.
UChar *fNormalizedText;
int32_t fNormalizedTextLength;
// Fixed-size in-object buffer of USPOOF_STACK_BUFFER_SIZE UChars.
UChar fSmallBuf[USPOOF_STACK_BUFFER_SIZE];
};
//-------------------------------------------------------------------------------------
//

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2008-2012, International Business Machines
* Copyright (C) 2008-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -29,6 +29,7 @@
#include "unicode/uregex.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "scriptset.h"
#include "uspoof_impl.h"
#include "uhash.h"
#include "uvector.h"
@ -244,8 +245,8 @@ void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
scriptSets->addElement(bsset, status);
utrie2_set32(table, cp, setIndex, &status);
}
bsset->sset->Union(targScript);
bsset->sset->Union(srcScript);
bsset->sset->set(targScript, status);
bsset->sset->set(srcScript, status);
if (U_FAILURE(status)) {
goto cleanup;

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2009-2012, International Business Machines Corporation and
* Copyright (c) 2009-2013, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@ -408,10 +408,13 @@ static void TestUSpoofCAPI(void) {
TEST_ASSERT_SUCCESS(status);
uset_close(tmpSet);
/* Latin Identifier should now fail; other non-latin test cases should still be OK */
/* Latin Identifier should now fail; other non-latin test cases should still be OK
* Note: fail of CHAR_LIMIT also causes the restriction level to be USPOOF_UNRESTRICTIVE
* which will give us a USPOOF_RESTRICTION_LEVEL failure.
*/
checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults);
TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT | USPOOF_RESTRICTION_LEVEL, checkResults);
checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
@ -432,7 +435,7 @@ static void TestUSpoofCAPI(void) {
checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(0, checkResults);
TEST_ASSERT_EQ(666, position);
TEST_ASSERT_EQ(0, position);
u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodCyrl, -1, &status);
TEST_ASSERT_SUCCESS(status);
@ -446,7 +449,7 @@ static void TestUSpoofCAPI(void) {
checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults);
TEST_ASSERT_EQ(2, position);
TEST_ASSERT_EQ(0, position);
TEST_TEARDOWN;

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2011, International Business Machines Corporation
* Copyright (C) 2011-2013, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
*/
@ -13,11 +13,18 @@
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
#include "itspoof.h"
#include "unicode/uspoof.h"
#include "unicode/unistr.h"
#include "unicode/regex.h"
#include "unicode/normlzr.h"
#include "unicode/regex.h"
#include "unicode/unistr.h"
#include "unicode/uscript.h"
#include "unicode/uspoof.h"
#include "cstring.h"
#include "identifier_info.h"
#include "scriptset.h"
#include "uhash.h"
#include <stdlib.h>
#include <stdio.h>
@ -27,6 +34,9 @@
// TEST_ASSERT(expr)
//   Reports a failure through errln() when expr compares equal to FALSE.
//   The message carries the file, line and the stringified expression.
//   The macro only reports; it contains no return, so the calling test continues.
#define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}
// TEST_ASSERT_MSG(expr, msg)
//   Same as TEST_ASSERT, with an extra caller-supplied C string (msg) included
//   in the failure output, e.g. to identify the row of a data-driven test.
#define TEST_ASSERT_MSG(expr, msg) {if ((expr)==FALSE) { \
errln("Test Failure at file %s, line %d, %s: \"%s\" is false.\n", __FILE__, __LINE__, msg, #expr);};}
// TEST_ASSERT_EQ(a, b)
//   Reports a failure through errln() when a != b, printing both stringified
//   expressions and both values. The values are formatted with %d, so the
//   arguments are expected to be int-compatible. Each argument is evaluated
//   twice when the comparison fails (once for the test, once for the message).
#define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
__FILE__, __LINE__, #a, (a), #b, (b)); }}
@ -35,6 +45,8 @@
errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
__FILE__, __LINE__, #a, (a), #b, (b)); }}
// LENGTHOF(array): element count of a true array (not a pointer), as an int32_t.
#define LENGTHOF(array) ((int32_t)(sizeof(array) / sizeof(*(array))))
/*
* TEST_SETUP and TEST_TEARDOWN
* macros to handle the boilerplate around setting up test case.
@ -67,37 +79,63 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
testSpoofAPI();
}
break;
case 1:
case 1:
name = "TestSkeleton";
if (exec) {
testSkeleton();
}
break;
case 2:
case 2:
name = "TestAreConfusable";
if (exec) {
testAreConfusable();
}
break;
case 3:
case 3:
name = "TestInvisible";
if (exec) {
testInvisible();
}
break;
case 4:
case 4:
name = "testConfData";
if (exec) {
testConfData();
}
break;
case 5:
case 5:
name = "testBug8654";
if (exec) {
testBug8654();
}
break;
default: name=""; break;
case 6:
name = "testIdentifierInfo";
if (exec) {
testIdentifierInfo();
}
break;
case 7:
name = "testScriptSet";
if (exec) {
testScriptSet();
}
break;
case 8:
name = "testRestrictionLevel";
if (exec) {
testRestrictionLevel();
}
break;
case 9:
name = "testMixedNumbers";
if (exec) {
testMixedNumbers();
}
break;
default: name=""; break;
}
}
@ -110,7 +148,7 @@ void IntlTestSpoof::testSpoofAPI() {
int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(0, checkResults);
TEST_ASSERT_EQ(666, position);
TEST_ASSERT_EQ(0, position);
TEST_TEARDOWN;
TEST_SETUP
@ -250,12 +288,12 @@ void IntlTestSpoof::testInvisible() {
int32_t position = -42;
TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(position == -42);
TEST_ASSERT(0 == position);
UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status));
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(7, position);
TEST_ASSERT_EQ(0, position);
// Two acute accents, one from the composed a with acute accent, \u00e1,
// and one separate.
@ -263,7 +301,7 @@ void IntlTestSpoof::testInvisible() {
UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status));
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(7, position);
TEST_ASSERT_EQ(0, position);
TEST_TEARDOWN;
}
@ -273,7 +311,7 @@ void IntlTestSpoof::testBug8654() {
int32_t position = -42;
TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s, &position, &status) & USPOOF_INVISIBLE );
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(3, position);
TEST_ASSERT_EQ(0, position);
TEST_TEARDOWN;
}
@ -414,3 +452,305 @@ void IntlTestSpoof::testConfData() {
}
#endif // UCONFIG_NO_REGULAR_EXPRESSIONS
// testIdentifierInfo. Note that IdentifierInfo is not public ICU API at this time
// Exercises, in order:
//   1. ScriptSet helpers: contains(), the uhash compare function,
//      displayScripts() and parseScripts().
//   2. IdentifierInfo::displayAlternates() on a hand-built hash table.
//   3. A data-driven table checking, per identifier, the restriction level,
//      numerics, scripts, alternates and common-among-alternates values
//      reported by IdentifierInfo.
//   4. A data-driven table checking IdentifierInfo::getScriptCount().
// Failures are reported through the TEST_ASSERT* macros; the test does not
// stop at the first failure.
void IntlTestSpoof::testIdentifierInfo() {
UErrorCode status = U_ZERO_ERROR;
// bitset12 = {Latin, Hangul}, bitset2 = {Hangul}; used for the containment checks.
ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status);
ScriptSet bitset2; bitset2.set(USCRIPT_HANGUL, status);
TEST_ASSERT(bitset12.contains(bitset2));
TEST_ASSERT(bitset12.contains(bitset12));
TEST_ASSERT(!bitset2.contains(bitset12));
// Check the ordering produced by the ScriptSet hash-table comparison function.
ScriptSet arabSet; arabSet.set(USCRIPT_ARABIC, status);
ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status);
UElement arabEl; arabEl.pointer = &arabSet;
UElement latinEl; latinEl.pointer = &latinSet;
TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0);
TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0);
UnicodeString scriptString;
bitset12.displayScripts(scriptString);
TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString);
status = U_ZERO_ERROR;
// The hash table keys are the addresses of the stack ScriptSets above, and no
// key deleter is passed here, so the table must be closed before they go out
// of scope (done below).
// NOTE(review): status from uhash_open/uhash_puti is only asserted after
// displayAlternates().
UHashtable *alternates = uhash_open(uhash_hashScriptSet ,uhash_compareScriptSet, NULL, &status);
uhash_puti(alternates, &bitset12, 1, &status);
uhash_puti(alternates, &bitset2, 1, &status);
UnicodeString alternatesString;
IdentifierInfo::displayAlternates(alternatesString, alternates, status);
TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang; Hang Latn") == alternatesString);
TEST_ASSERT_SUCCESS(status);
status = U_ZERO_ERROR;
// Round trip: parsing the displayScripts() output must reproduce the original set.
ScriptSet tScriptSet;
tScriptSet.parseScripts(scriptString, status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(bitset12 == tScriptSet);
// NOTE(review): ss is created and cleared but not used anywhere else in this function.
UnicodeString ss;
ss.remove();
uhash_close(alternates);
// Data-driven IdentifierInfo checks. The strings hold \uXXXX escapes that are
// expanded with UnicodeString::unescape() before use.
struct Test {
const char *fTestString;
URestrictionLevel fRestrictionLevel;
const char *fNumerics;
const char *fScripts;
const char *fAlternates;
const char *fCommonAlternates;
} tests[] = {
{"\\u0061\\u2665", USPOOF_UNRESTRICTIVE, "[]", "Latn", "", ""},
{"\\u0061\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"},
{"\\u0061\\u30FC\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hira Kana", "Hira Kana"},
{"\\u0061\\u30FC\\u3006\\u30A2", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
{"\\u30A2\\u0061\\u30FC\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
{"\\u0061\\u0031\\u0661", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660]", "Latn", "Arab Thaa", "Arab Thaa"},
{"\\u0061\\u0031\\u0661\\u06F1", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660\\u06F0]", "Latn Arab", "", ""},
{"\\u0661\\u30FC\\u3006\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_UNRESTRICTIVE,
"[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab Deva", "", ""},
{"\\u0061\\u30A2\\u30FC\\u3006\\u0031\\u0967\\u0661\\u06F1", USPOOF_UNRESTRICTIVE,
"[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab Deva", "", ""}
};
int testNum;
for (testNum = 0; testNum < LENGTHOF(tests); testNum++) {
// testNumStr identifies the failing table row in TEST_ASSERT_MSG output.
char testNumStr[40];
sprintf(testNumStr, "testNum = %d", testNum);
Test &test = tests[testNum];
status = U_ZERO_ERROR;
UnicodeString testString(test.fTestString); // Note: may do charset conversion.
testString = testString.unescape();
// A fresh IdentifierInfo is constructed for every table row.
IdentifierInfo idInfo(status);
TEST_ASSERT_SUCCESS(status);
// NOTE(review): the pointer returned by uspoof_getRecommendedUnicodeSet() is
// dereferenced without a null/status check.
idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status));
idInfo.setIdentifier(testString, status);
TEST_ASSERT_MSG(*idInfo.getIdentifier() == testString, testNumStr);
URestrictionLevel restrictionLevel = test.fRestrictionLevel;
TEST_ASSERT_MSG(restrictionLevel == idInfo.getRestrictionLevel(status), testNumStr);
status = U_ZERO_ERROR;
// fNumerics is a UnicodeSet pattern; "[]" is the empty set.
UnicodeSet numerics(UnicodeString(test.fNumerics).unescape(), status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_MSG(numerics == *idInfo.getNumerics(), testNumStr);
ScriptSet scripts;
scripts.parseScripts(UnicodeString(test.fScripts), status);
TEST_ASSERT_MSG(scripts == *idInfo.getScripts(), testNumStr);
// Alternates are compared in their display-string form.
UnicodeString alternatesStr;
IdentifierInfo::displayAlternates(alternatesStr, idInfo.getAlternates(), status);
TEST_ASSERT_MSG(UnicodeString(test.fAlternates) == alternatesStr, testNumStr);
ScriptSet commonAlternates;
commonAlternates.parseScripts(UnicodeString(test.fCommonAlternates), status);
TEST_ASSERT_MSG(commonAlternates == *idInfo.getCommonAmongAlternates(), testNumStr);
}
// Test of getScriptCount()
// Script and or Script Extension for chars used in the tests
// \\u3013 ; Bopo Hang Hani Hira Kana # So GETA MARK
// \\uA838 ; Deva Gujr Guru Kthi Takr # Sc NORTH INDIC RUPEE MARK
// \\u0951 ; Deva Latn # Mn DEVANAGARI STRESS SIGN UDATTA
//
// \\u0370 ; Greek # L GREEK CAPITAL LETTER HETA
// \\u0481 ; Cyrillic # L& CYRILLIC SMALL LETTER KOPPA
// \\u0904 ; Devanagari # Lo DEVANAGARI LETTER SHORT A
// \\u3041 ; Hiragana # Lo HIRAGANA LETTER SMALL A
// 1234 ; Common # ascii digits
// \\u0300 ; Inherited # Mn COMBINING GRAVE ACCENT
struct ScriptTest {
const char *fTestString;
int32_t fScriptCount;
} scriptTests[] = {
{"Hello", 1},
{"Hello\\u0370", 2},
{"1234", 0},
{"Hello1234\\u0300", 1}, // Common and Inherited are ignored.
{"\\u0030", 0},
{"abc\\u0951", 1},
{"abc\\u3013", 2},
{"\\uA838\\u0951", 1}, // Triggers commonAmongAlternates path.
{"\\u3013\\uA838", 2}
};
status = U_ZERO_ERROR;
// Unlike the loop above, a single IdentifierInfo is reused for every row here.
// NOTE(review): status is not asserted after construction or after setIdentifier().
IdentifierInfo identifierInfo(status);
for (testNum=0; testNum<LENGTHOF(scriptTests); testNum++) {
ScriptTest &test = scriptTests[testNum];
char msgBuf[100];
sprintf(msgBuf, "testNum = %d ", testNum);
UnicodeString testString = UnicodeString(test.fTestString).unescape();
status = U_ZERO_ERROR;
identifierInfo.setIdentifier(testString, status);
int32_t scriptCount = identifierInfo.getScriptCount();
TEST_ASSERT_MSG(test.fScriptCount == scriptCount, msgBuf);
}
}
// Unit test for the ScriptSet class: equality, set/reset/test of single
// scripts, setAll/resetAll, contains/intersects, assignment, countMembers,
// intersect, and iteration with nextSetBit().
// Failures are reported through the TEST_ASSERT* macros; the test does not
// stop at the first failure.
void IntlTestSpoof::testScriptSet() {
ScriptSet s1;
ScriptSet s2;
UErrorCode status = U_ZERO_ERROR;
// Two default-constructed sets must compare equal.
TEST_ASSERT(s1 == s2);
s1.set(USCRIPT_ARABIC,status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(!(s1 == s2));
TEST_ASSERT(s1.test(USCRIPT_ARABIC, status));
TEST_ASSERT(s1.test(USCRIPT_GREEK, status) == FALSE);
status = U_ZERO_ERROR;
// Removing the only member must make s1 equal to the untouched s2 again.
s1.reset(USCRIPT_ARABIC, status);
TEST_ASSERT(s1 == s2);
status = U_ZERO_ERROR;
// setAll()/resetAll() are probed with three script codes, including
// USCRIPT_CODE_LIMIT itself.
s1.setAll();
TEST_ASSERT(s1.test(USCRIPT_COMMON, status));
TEST_ASSERT(s1.test(USCRIPT_ETHIOPIC, status));
TEST_ASSERT(s1.test(USCRIPT_CODE_LIMIT, status));
s1.resetAll();
TEST_ASSERT(!s1.test(USCRIPT_COMMON, status));
TEST_ASSERT(!s1.test(USCRIPT_ETHIOPIC, status));
TEST_ASSERT(!s1.test(USCRIPT_CODE_LIMIT, status));
status = U_ZERO_ERROR;
// contains() vs intersects(): s1 = {Takri, Blissymbols}, s2 = everything.
s1.set(USCRIPT_TAKRI, status);
s1.set(USCRIPT_BLISSYMBOLS, status);
s2.setAll();
TEST_ASSERT(s2.contains(s1));
TEST_ASSERT(!s1.contains(s2));
TEST_ASSERT(s2.intersects(s1));
TEST_ASSERT(s1.intersects(s2));
// After dropping Takri from s2 neither set contains the other, but they
// still share Blissymbols.
s2.reset(USCRIPT_TAKRI, status);
TEST_ASSERT(!s2.contains(s1));
TEST_ASSERT(!s1.contains(s2));
TEST_ASSERT(s1.intersects(s2));
TEST_ASSERT(s2.intersects(s1));
TEST_ASSERT_SUCCESS(status);
status = U_ZERO_ERROR;
// Assignment, countMembers() and intersect().
s1.resetAll();
s1.set(USCRIPT_NKO, status);
s1.set(USCRIPT_COMMON, status);
s2 = s1;
TEST_ASSERT(s2 == s1);
TEST_ASSERT_EQ(2, s2.countMembers());
s2.intersect(s1);
TEST_ASSERT(s2 == s1);
s2.setAll();
TEST_ASSERT(!(s2 == s1));
TEST_ASSERT(s2.countMembers() >= USCRIPT_CODE_LIMIT);
s2.intersect(s1);
TEST_ASSERT(s2 == s1);
// (everything except Common) intersected with {Nko, Common} leaves one member.
s2.setAll();
s2.reset(USCRIPT_COMMON, status);
s2.intersect(s1);
TEST_ASSERT(s2.countMembers() == 1);
// Iteration with nextSetBit(): three members are set, and the fourth call is
// expected to return -1. The expected order is Inherited, Vai, Afaka, which is
// presumably ascending script-code order -- confirm against uscript.h.
s1.resetAll();
s1.set(USCRIPT_AFAKA, status);
s1.set(USCRIPT_VAI, status);
s1.set(USCRIPT_INHERITED, status);
int32_t n = -1;
for (int32_t i=0; i<4; i++) {
// Resume the search one past the previously returned bit.
n = s1.nextSetBit(n+1);
switch (i) {
case 0: TEST_ASSERT_EQ(USCRIPT_INHERITED, n); break;
case 1: TEST_ASSERT_EQ(USCRIPT_VAI, n); break;
case 2: TEST_ASSERT_EQ(USCRIPT_AFAKA, n); break;
case 3: TEST_ASSERT_EQ(-1, (int32_t)n); break;
default: TEST_ASSERT(FALSE);
}
}
TEST_ASSERT_SUCCESS(status);
}
// Data-driven test of restriction levels.
// For each identifier in the table:
//   - IdentifierInfo::getRestrictionLevel() must return the expected level;
//   - for every level in restrictionLevels[], a spoof checker configured with
//     only the USPOOF_RESTRICTION_LEVEL check and that level must fail exactly
//     when the identifier's expected level is greater than the configured one,
//     or the identifier has characters outside the recommended set.
// Failures are reported through the TEST_ASSERT* macros; the test does not
// stop at the first failure.
void IntlTestSpoof::testRestrictionLevel() {
struct Test {
const char *fId;
URestrictionLevel fExpectedRestrictionLevel;
} tests[] = {
{"\\u0061\\u03B3\\u2665", USPOOF_UNRESTRICTIVE},
{"a", USPOOF_ASCII},
{"\\u03B3", USPOOF_HIGHLY_RESTRICTIVE},
{"\\u0061\\u30A2\\u30FC", USPOOF_HIGHLY_RESTRICTIVE},
{"\\u0061\\u0904", USPOOF_MODERATELY_RESTRICTIVE},
{"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE}
};
// msgBuffer is reused for both the per-identifier and per-level messages.
char msgBuffer[100];
URestrictionLevel restrictionLevels[] = { USPOOF_ASCII, USPOOF_HIGHLY_RESTRICTIVE,
USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE, USPOOF_UNRESTRICTIVE};
UErrorCode status = U_ZERO_ERROR;
// One IdentifierInfo, with the recommended set as its profile, is shared by all rows.
IdentifierInfo idInfo(status);
TEST_ASSERT_SUCCESS(status);
// NOTE(review): the returned pointer is dereferenced before status is checked.
idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status));
TEST_ASSERT_SUCCESS(status);
for (int32_t testNum=0; testNum < LENGTHOF(tests); testNum++) {
status = U_ZERO_ERROR;
const Test &test = tests[testNum];
// Expand the \uXXXX escapes in the table entry.
UnicodeString testString = UnicodeString(test.fId).unescape();
URestrictionLevel expectedLevel = test.fExpectedRestrictionLevel;
idInfo.setIdentifier(testString, status);
sprintf(msgBuffer, "testNum = %d ", testNum);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_MSG(expectedLevel == idInfo.getRestrictionLevel(status), msgBuffer);
for (int levelIndex=0; levelIndex<LENGTHOF(restrictionLevels); levelIndex++) {
status = U_ZERO_ERROR;
URestrictionLevel levelSetInSpoofChecker = restrictionLevels[levelIndex];
// A new checker is opened and closed for every (identifier, level) pair.
USpoofChecker *sc = uspoof_open(&status);
uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status);
uspoof_setAllowedChars(sc, uspoof_getRecommendedSet(&status), &status);
uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker);
// Any non-zero check result counts as a failure of the identifier.
UBool actualValue = uspoof_checkUnicodeString(sc, testString, NULL, &status) != 0;
// we want to fail if the text is (say) MODERATE and the testLevel is ASCII
UBool expectedFailure = expectedLevel > levelSetInSpoofChecker ||
!uspoof_getRecommendedUnicodeSet(&status)->containsAll(testString);
sprintf(msgBuffer, "testNum = %d, levelIndex = %d", testNum, levelIndex);
TEST_ASSERT_MSG(expectedFailure == actualValue, msgBuffer);
TEST_ASSERT_SUCCESS(status);
uspoof_close(sc);
}
}
}
// Data-driven test of mixed-number detection.
// For each test string (with \uXXXX escapes expanded):
//   - IdentifierInfo::getNumerics() must equal the expected UnicodeSet;
//   - a spoof checker with only USPOOF_MIXED_NUMBERS enabled must flag the
//     string exactly when the expected set has more than one member.
// The error status is asserted after each group of ICU calls so that a
// failure to construct or run the checker is reported as such, rather than
// surfacing only as a mismatch in the mixed-number expectation.
// Failures are reported through the TEST_ASSERT* macros; the test does not
// stop at the first failure.
void IntlTestSpoof::testMixedNumbers() {
    struct Test {
        const char *fTestString;
        const char *fExpectedSet;
    } tests[] = {
        {"1", "[0]"},
        {"\\u0967", "[\\u0966]"},
        {"1\\u0967", "[0\\u0966]"},
        {"\\u0661\\u06F1", "[\\u0660\\u06F0]"}
    };
    UErrorCode status = U_ZERO_ERROR;
    // One IdentifierInfo is shared by all table rows.
    IdentifierInfo idInfo(status);
    TEST_ASSERT_SUCCESS(status);
    for (int32_t testNum=0; testNum < LENGTHOF(tests); testNum++) {
        // msgBuf identifies the failing table row in TEST_ASSERT_MSG output.
        char msgBuf[100];
        sprintf(msgBuf, "testNum = %d ", testNum);
        Test &test = tests[testNum];

        status = U_ZERO_ERROR;
        UnicodeString testString = UnicodeString(test.fTestString).unescape();
        UnicodeSet expectedSet(UnicodeString(test.fExpectedSet).unescape(), status);
        idInfo.setIdentifier(testString, status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_MSG(expectedSet == *idInfo.getNumerics(), msgBuf);

        status = U_ZERO_ERROR;
        USpoofChecker *sc = uspoof_open(&status);
        uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this
        int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status);
        // Covers uspoof_open, uspoof_setChecks and the check itself.
        TEST_ASSERT_SUCCESS(status);
        UBool mixedNumberFailure = ((result & USPOOF_MIXED_NUMBERS) != 0);
        TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf);
        uspoof_close(sc);
    }
}

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2011, International Business Machines Corporation
* Copyright (C) 2011-2013, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
*/
@ -36,6 +36,14 @@ public:
void testBug8654();
// Tests of the internal IdentifierInfo class (not public ICU API).
void testIdentifierInfo();
// Unit test of the internal ScriptSet class.
void testScriptSet();
// Data-driven test of restriction levels, via IdentifierInfo and the spoof checker.
void testRestrictionLevel();
// Data-driven test of the USPOOF_MIXED_NUMBERS check and IdentifierInfo numerics.
void testMixedNumbers();
// Internal function to run a single skeleton test case.
void checkSkeleton(const USpoofChecker *sc, uint32_t flags,
const char *input, const char *expected, int32_t lineNum);