mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-9440 spoof checker, merge updates from branch.
X-SVN-Rev: 33162
This commit is contained in:
parent
87158d4fba
commit
e06001f2d0
17 changed files with 2013 additions and 883 deletions
|
@ -86,7 +86,7 @@ tmunit.o tmutamt.o tmutfmt.o currpinf.o \
|
|||
uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o uspoof_wsconf.o decfmtst.o smpdtfst.o \
|
||||
ztrans.o zrule.o vzone.o fphdlimp.o fpositer.o locdspnm.o \
|
||||
decNumber.o decContext.o alphaindex.o tznames.o tznames_impl.o tzgnames.o \
|
||||
tzfmt.o compactdecimalformat.o gender.o region.o
|
||||
tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o identifier_info.o
|
||||
|
||||
## Header files to install
|
||||
HEADERS = $(srcdir)/unicode/*.h
|
||||
|
|
|
@ -302,6 +302,7 @@
|
|||
<ClCompile Include="gregocal.cpp" />
|
||||
<ClCompile Include="gregoimp.cpp" />
|
||||
<ClCompile Include="hebrwcal.cpp" />
|
||||
<ClCompile Include="identifier_info.cpp" />
|
||||
<ClCompile Include="indiancal.cpp" />
|
||||
<ClCompile Include="islamcal.cpp" />
|
||||
<ClCompile Include="japancal.cpp" />
|
||||
|
@ -323,6 +324,7 @@
|
|||
<ClCompile Include="reldtfmt.cpp" />
|
||||
<ClCompile Include="selfmt.cpp" />
|
||||
<ClCompile Include="simpletz.cpp" />
|
||||
<ClCompile Include="scriptset.cpp" />
|
||||
<ClCompile Include="smpdtfmt.cpp" />
|
||||
<ClCompile Include="smpdtfst.cpp" />
|
||||
<ClCompile Include="taiwncal.cpp" />
|
||||
|
@ -1546,6 +1548,8 @@
|
|||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
</CustomBuild>
|
||||
<ClInclude Include="identifier_info.h" />
|
||||
<ClInclude Include="scriptset.h" />
|
||||
<ClInclude Include="uspoof_conf.h" />
|
||||
<ClInclude Include="uspoof_impl.h" />
|
||||
<ClInclude Include="uspoof_wsconf.h" />
|
||||
|
@ -1562,4 +1566,4 @@
|
|||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -456,6 +456,12 @@
|
|||
<ClCompile Include="ucsdet.cpp">
|
||||
<Filter>charset detect</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="identifier_info.cpp">
|
||||
<Filter>spoof</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="scriptset.cpp">
|
||||
<Filter>spoof</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="uspoof.cpp">
|
||||
<Filter>spoof</Filter>
|
||||
</ClCompile>
|
||||
|
@ -759,6 +765,12 @@
|
|||
<ClInclude Include="inputext.h">
|
||||
<Filter>charset detect</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="identifier_info.h">
|
||||
<Filter>spoof</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="scriptset.h">
|
||||
<Filter>spoof</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="uspoof_conf.h">
|
||||
<Filter>spoof</Filter>
|
||||
</ClInclude>
|
||||
|
@ -1017,4 +1029,4 @@
|
|||
<Filter>formatting</Filter>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
314
icu4c/source/i18n/identifier_info.cpp
Normal file
314
icu4c/source/i18n/identifier_info.cpp
Normal file
|
@ -0,0 +1,314 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2012-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/utf16.h"
|
||||
|
||||
#include "identifier_info.h"
|
||||
#include "mutex.h"
|
||||
#include "scriptset.h"
|
||||
#include "ucln_in.h"
|
||||
#include "uvector.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
static UMutex gInitMutex = U_MUTEX_INITIALIZER;
|
||||
static UBool gStaticsAreInitialized = FALSE;
|
||||
|
||||
UnicodeSet *IdentifierInfo::ASCII;
|
||||
ScriptSet *IdentifierInfo::JAPANESE;
|
||||
ScriptSet *IdentifierInfo::CHINESE;
|
||||
ScriptSet *IdentifierInfo::KOREAN;
|
||||
ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN;
|
||||
|
||||
/**
 * Release the shared constant sets and mark them as not initialized, so that
 * the next IdentifierInfo constructor call rebuilds them.
 *
 * @return TRUE, unconditionally.
 */
UBool IdentifierInfo::cleanup() {
    delete CONFUSABLE_WITH_LATIN;  CONFUSABLE_WITH_LATIN = NULL;
    delete KOREAN;                 KOREAN = NULL;
    delete CHINESE;                CHINESE = NULL;
    delete JAPANESE;               JAPANESE = NULL;
    delete ASCII;                  ASCII = NULL;

    gStaticsAreInitialized = FALSE;
    return TRUE;
}
|
||||
|
||||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV
|
||||
IdentifierInfo_cleanup(void) {
|
||||
return IdentifierInfo::cleanup();
|
||||
}
|
||||
U_CDECL_END
|
||||
|
||||
|
||||
/**
 * Construct an empty IdentifierInfo.
 *
 * The first successful call also builds the shared constant sets (ASCII,
 * JAPANESE, CHINESE, KOREAN, CONFUSABLE_WITH_LATIN) under gInitMutex and
 * registers the i18n cleanup hook that releases them.
 *
 * @param status  In/out error code. Nothing is done if it already indicates a
 *                failure. Set to U_MEMORY_ALLOCATION_ERROR if any allocation
 *                fails; the object must not be used in that case.
 */
IdentifierInfo::IdentifierInfo(UErrorCode &status):
        fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
        fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
    if (U_FAILURE(status)) {
        return;
    }
    {
        Mutex lock(&gInitMutex);
        if (!gStaticsAreInitialized) {
            ASCII = new UnicodeSet(0, 0x7f);
            JAPANESE = new ScriptSet();
            CHINESE = new ScriptSet();
            KOREAN = new ScriptSet();
            CONFUSABLE_WITH_LATIN = new ScriptSet();
            if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
                    || CONFUSABLE_WITH_LATIN == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
            } else {
                ASCII->freeze();
                JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
                         .set(USCRIPT_KATAKANA, status);
                CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
                KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
                CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
                         .set(USCRIPT_CHEROKEE, status);
            }
            if (U_FAILURE(status)) {
                // Do not leave half-built statics behind: a later constructor
                // call would overwrite (and leak) whatever was allocated here.
                cleanup();
                return;
            }
            ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
            gStaticsAreInitialized = TRUE;
        }
    }

    fIdentifier = new UnicodeString();
    fRequiredScripts = new ScriptSet();
    // NOTE(review): uhash_compareScriptSet is an ordering function (it returns
    // 0 for equal sets). A hash key comparator is presumably expected to
    // return nonzero for equal keys - confirm whether uhash_equalsScriptSet
    // was intended here.
    fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
    if (fScriptSetSet != NULL) {
        // Only install the deleter on a table that was actually created;
        // the table owns its ScriptSet keys from here on.
        uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
    }
    fCommonAmongAlternates = new ScriptSet();
    fNumerics = new UnicodeSet();
    fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);

    if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
                              fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
|
||||
|
||||
/**
 * Destructor. Releases every per-object member allocated by the constructor.
 */
IdentifierInfo::~IdentifierInfo() {
    // The table was given uhash_deleteScriptSet as key deleter in the
    // constructor, so closing it also frees the ScriptSet keys it holds.
    uhash_close(fScriptSetSet);

    delete fIdentifierProfile;
    delete fNumerics;
    delete fCommonAmongAlternates;
    delete fRequiredScripts;
    delete fIdentifier;
}
|
||||
|
||||
|
||||
/**
 * Discard the analysis results of the previous identifier. The identifier
 * string itself and the identifier profile are left untouched.
 *
 * @return this
 */
IdentifierInfo &IdentifierInfo::clear() {
    // Script information.
    fRequiredScripts->resetAll();
    fCommonAmongAlternates->resetAll();
    uhash_removeAll(fScriptSetSet);

    // Decimal digit information.
    fNumerics->clear();

    return *this;
}
|
||||
|
||||
|
||||
/**
 * Replace the identifier profile with a copy of the given set.
 *
 * @param identifierProfile  the characters that are to be allowed in the identifier
 * @return this
 */
IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
    UnicodeSet &profile = *fIdentifierProfile;
    profile = identifierProfile;
    return *this;
}
|
||||
|
||||
|
||||
/**
 * @return the current identifier profile; the set stays owned by this object.
 */
const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
    const UnicodeSet &profile = *fIdentifierProfile;
    return profile;
}
|
||||
|
||||
|
||||
/**
 * Store a copy of the identifier and analyze it.
 *
 * For every code point this records:
 *   - the zero digit of its decimal number system, if it is a decimal digit
 *     (into fNumerics);
 *   - its script, if it has exactly one script after Common and Inherited are
 *     removed (into fRequiredScripts);
 *   - its whole script set, if it has several candidate scripts and none of
 *     them is already required (into fScriptSetSet).
 * A final pass prunes the alternates and computes the scripts common to all
 * remaining alternates (fCommonAmongAlternates).
 *
 * @param identifier  the identifier to analyze
 * @param status      In/out error code. Nothing is done if it already
 *                    indicates a failure. On failure the analysis results are
 *                    incomplete and must not be relied upon.
 * @return this
 */
IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    *fIdentifier = identifier;
    clear();
    ScriptSet scriptsForCP;
    UChar32 cp;
    for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
        cp = identifier.char32At(i);
        // Store a representative character for each kind of decimal digit
        if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
            // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
            fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
        }
        UScriptCode extensions[500];
        int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
        if (U_FAILURE(status)) {
            return *this;
        }
        scriptsForCP.resetAll();
        for (int32_t j = 0; j < extensionsCount; j++) {
            scriptsForCP.set(extensions[j], status);
        }
        scriptsForCP.reset(USCRIPT_COMMON, status);
        scriptsForCP.reset(USCRIPT_INHERITED, status);
        if (U_FAILURE(status)) {
            // A script code did not fit into ScriptSet; scriptsForCP is
            // incomplete, so do not record anything derived from it.
            return *this;
        }
        switch (scriptsForCP.countMembers()) {
          case 0:
            break;
          case 1:
            // Single script, record it.
            fRequiredScripts->Union(scriptsForCP);
            break;
          default:
            if (!fRequiredScripts->intersects(scriptsForCP)
                    && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
                // If the set hasn't been added already, add it
                // (Add a copy, fScriptSetSet takes ownership of the copy.)
                ScriptSet *ownedCopy = new ScriptSet(scriptsForCP);
                if (ownedCopy == NULL) {
                    // Never hand a NULL key to the hash table.
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return *this;
                }
                uhash_puti(fScriptSetSet, ownedCopy, 1, &status);
                if (U_FAILURE(status)) {
                    return *this;
                }
            }
            break;
        }
    }
    // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
    // [Kana], [Kana Hira] => [Kana]
    // This is relatively infrequent, so doesn't have to be optimized.
    // We also compute any commonalities among the alternates.
    if (uhash_count(fScriptSetSet) > 0) {
        fCommonAmongAlternates->setAll();
        for (int32_t it = -1;;) {
            const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
            if (nextHashEl == NULL) {
                break;
            }
            ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
            // [Kana], [Kana Hira] => [Kana]
            if (fRequiredScripts->intersects(*next)) {
                uhash_removeElement(fScriptSetSet, nextHashEl);
            } else {
                fCommonAmongAlternates->intersect(*next);
                // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
                for (int32_t otherIt = -1;;) {
                    const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
                    if (otherHashEl == NULL) {
                        break;
                    }
                    ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
                    if (next != other && next->contains(*other)) {
                        // 'next' is a superset of another alternate; drop it.
                        // The key is deleted with the element, so stop using 'next'.
                        uhash_removeElement(fScriptSetSet, nextHashEl);
                        break;
                    }
                }
            }
        }
    }
    if (uhash_count(fScriptSetSet) == 0) {
        fCommonAmongAlternates->resetAll();
    }
    return *this;
}
|
||||
|
||||
|
||||
const UnicodeString *IdentifierInfo::getIdentifier() const {
|
||||
return fIdentifier;
|
||||
}
|
||||
|
||||
const ScriptSet *IdentifierInfo::getScripts() const {
|
||||
return fRequiredScripts;
|
||||
}
|
||||
|
||||
const UHashtable *IdentifierInfo::getAlternates() const {
|
||||
return fScriptSetSet;
|
||||
}
|
||||
|
||||
|
||||
const UnicodeSet *IdentifierInfo::getNumerics() const {
|
||||
return fNumerics;
|
||||
}
|
||||
|
||||
const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
|
||||
return fCommonAmongAlternates;
|
||||
}
|
||||
|
||||
/**
 * Determine the tightest restriction level that the current identifier satisfies.
 *
 * @param status  error code, passed through to ScriptSet::test().
 * @return the restriction level.
 */
URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
    // Characters outside the profile, or digits from more than one decimal
    // number system, rule out every restricted level.
    if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
        return USPOOF_UNRESTRICTIVE;
    }
    if (ASCII->containsAll(*fIdentifier)) {
        return USPOOF_ASCII;
    }

    // This is a bit tricky. We look at a number of factors.
    // The number of scripts in the text.
    // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
    // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
    //
    // getScriptCount() computes exactly this sum. The required scripts omit
    // COMMON and INHERITED; they are taken out in setIdentifier().
    const int32_t cardinalityPlus = getScriptCount();
    if (cardinalityPlus < 2) {
        return USPOOF_HIGHLY_RESTRICTIVE;
    }

    const ScriptSet &required = *fRequiredScripts;
    if (containsWithAlternates(*JAPANESE, required)
            || containsWithAlternates(*CHINESE, required)
            || containsWithAlternates(*KOREAN, required)) {
        return USPOOF_HIGHLY_RESTRICTIVE;
    }

    // Latin plus one other script that is not Cyrillic, Greek or Cherokee.
    if (cardinalityPlus == 2
            && required.test(USCRIPT_LATIN, status)
            && !required.intersects(*CONFUSABLE_WITH_LATIN)) {
        return USPOOF_MODERATELY_RESTRICTIVE;
    }
    return USPOOF_MINIMALLY_RESTRICTIVE;
}
|
||||
|
||||
/**
 * Count the scripts of the identifier: the required scripts, plus either 1
 * (when the alternates share at least one script) or the number of alternate
 * sets (when they share none).
 *
 * Common and Inherited were removed by setIdentifier() and are not counted.
 */
int32_t IdentifierInfo::getScriptCount() const {
    int32_t alternatesContribution;
    if (fCommonAmongAlternates->countMembers() == 0) {
        alternatesContribution = uhash_count(fScriptSetSet);
    } else {
        alternatesContribution = 1;
    }
    return fRequiredScripts->countMembers() + alternatesContribution;
}
|
||||
|
||||
|
||||
|
||||
/**
 * Test whether 'container' covers 'containee' and, in addition, shares at
 * least one script with every alternate script set of this identifier.
 *
 * @return TRUE if both conditions hold, FALSE otherwise.
 */
UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
    if (!container.contains(containee)) {
        return FALSE;
    }
    int32_t pos = -1;
    const UHashElement *element;
    while ((element = uhash_nextElement(fScriptSetSet, &pos)) != NULL) {
        const ScriptSet *alternatives = static_cast<ScriptSet *>(element->key.pointer);
        if (!container.intersects(*alternatives)) {
            return FALSE;
        }
    }
    return TRUE;
}
|
||||
|
||||
/**
 * Append a readable list of the alternate script sets to dest. The sets are
 * sorted with uhash_compareScriptSet and separated by "; ".
 *
 * @param dest        string to append to
 * @param alternates  hash table whose keys are ScriptSet pointers
 * @param status      error code; dest is returned unchanged if the work
 *                    vector cannot be created.
 * @return dest
 */
UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
    UVector sorted(status);
    if (U_FAILURE(status)) {
        return dest;
    }

    // Collect the keys; the vector only borrows the pointers.
    int32_t pos = -1;
    const UHashElement *element;
    while ((element = uhash_nextElement(alternates, &pos)) != NULL) {
        sorted.addElement(static_cast<ScriptSet *>(element->key.pointer), status);
    }
    sorted.sort(uhash_compareScriptSet, status);

    UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
    const int32_t count = sorted.size();
    for (int32_t index = 0; index < count; ++index) {
        if (index != 0) {
            dest.append(separator);
        }
        const ScriptSet *scripts = static_cast<ScriptSet *>(sorted.elementAt(index));
        scripts->displayScripts(dest);
    }
    return dest;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
200
icu4c/source/i18n/identifier_info.h
Normal file
200
icu4c/source/i18n/identifier_info.h
Normal file
|
@ -0,0 +1,200 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
* identifier_info.h
|
||||
*
|
||||
* created on: 2013 Jan 7
|
||||
* created by: Andy Heninger
|
||||
*/
|
||||
|
||||
#ifndef __IDENTIFIER_INFO_H__
|
||||
#define __IDENTIFIER_INFO_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "uhash.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class ScriptSet;
|
||||
|
||||
// TODO(andy): review consistency of reference vs pointer arguments to the functions.
|
||||
|
||||
/**
|
||||
* This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
|
||||
* then setIdentifier. Available methods include:
|
||||
* <ol>
|
||||
* <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
|
||||
* each of these.
|
||||
* <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
|
||||
* either Katakana or Hiragana.
|
||||
* <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
|
||||
* <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
|
||||
* the identifier.
|
||||
* <li>call getRestrictionLevel to see what the UTS36 restriction level is.
|
||||
* </ol>
|
||||
*
|
||||
* This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
|
||||
*/
|
||||
class U_I18N_API IdentifierInfo : public UMemory {

  public:
    /**
     * Create an identifier info object. Subsequently, call setIdentifier(), etc.
     * @param status  Errorcode, set if errors occur (e.g. out of memory).
     * @internal
     */
    IdentifierInfo(UErrorCode &status);

    /**
      * Destructor
      */
    virtual ~IdentifierInfo();

  private:
    /* Disallow copying for now. Can be added if there's a need.
     * Both operations are declared and intentionally left undefined: the
     * object owns its members through raw pointers, so a compiler-generated
     * copy or assignment would lead to double deletion. */
    IdentifierInfo(const IdentifierInfo &other);
    IdentifierInfo &operator=(const IdentifierInfo &other);

  public:

    /**
     * Set the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @param identifierProfile the characters that are to be allowed in the identifier
     * @return this
     * @internal
     */
    IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);

    /**
     * Get the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @return The characters that are to be allowed in the identifier.
     * @internal
     */
    const UnicodeSet &getIdentifierProfile() const;


    /**
     * Set an identifier to analyze. Afterwards, call methods like getScripts()
     *
     * @param identifier the identifier to analyze
     * @param status Errorcode, set if errors occur.
     * @return this
     * @internal
     */
    IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);


    /**
     * Get the identifier that was analyzed. The returned string is owned by the ICU library,
     * and must not be deleted by the caller.
     *
     * @return the identifier that was analyzed.
     * @internal
     */
    const UnicodeString *getIdentifier() const;


    /**
     * Get the scripts found in the identifiers.
     *
     * @return the set of explicit scripts.
     * @internal
     */
    const ScriptSet *getScripts() const;

    /**
     * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
     * the set consisting of those scripts will be returned.
     *
     * @return a uhash, with each key being of type (ScriptSet *).
     *         This is a set, not a map, so the value stored in the uhash is not relevant.
     *         (It is, in fact, 1).
     *         Ownership of the uhash and its contents remains with the IdentifierInfo object,
     *         and remains valid until a new identifier is set or until the object is deleted.
     * @internal
     */
    const UHashtable *getAlternates() const;

    /**
     * Get the representative characters (zeros) for the numerics found in the identifier.
     *
     * @return the set of zero digits, one for each decimal number system found in the identifier.
     * @internal
     */
    const UnicodeSet *getNumerics() const;

    /**
     * Find out which scripts are in common among the alternates.
     *
     * @return the set of scripts that are in common among the alternates.
     * @internal
     */
    const ScriptSet *getCommonAmongAlternates() const;

    /**
      * Get the number of scripts appearing in the identifier.
      *   Note: Common and Inherited scripts are omitted from the count.
      *   Note: Result may be high when the identifier contains characters
      *         with alternate scripts. The distinction between
      *         0, 1 and > 1 will remain valid, however.
      * @return the number of scripts.
      */
    int32_t getScriptCount() const;

    /**
     * Find the "tightest" restriction level that the identifier satisfies.
     *
     * @param status Errorcode, set if errors occur.
     * @return the restriction level.
     * @internal
     */
    URestrictionLevel getRestrictionLevel(UErrorCode &status) const;

    // NOTE(review): no definition of toString() appears in identifier_info.cpp;
    // calling it would presumably fail to link - confirm or remove.
    UnicodeString toString() const;

    /**
     * Produce a readable string of alternates.
     *
     * @param dest       the string to which the display form is appended.
     * @param alternates a UHashtable of UScriptSets.
     *                   Keys only, no meaningful values in the UHash.
     * @param status     Errorcode, set if errors occur.
     * @return display form (a reference to dest)
     * @internal
     */
    static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);

    /**
     * Static memory cleanup function.
     * @internal
     */
    static UBool cleanup();

  private:

    IdentifierInfo  & clear();
    UBool             containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;

    // Per-object state; all of it is allocated in the constructor and owned by this object.
    UnicodeString     *fIdentifier;             // copy of the identifier being analyzed
    ScriptSet         *fRequiredScripts;        // scripts of the single-script characters
    UHashtable        *fScriptSetSet;           // keys: ScriptSet* of multi-script characters
    ScriptSet         *fCommonAmongAlternates;  // scripts shared by every key of fScriptSetSet
    UnicodeSet        *fNumerics;               // zero digits of the decimal systems seen
    UnicodeSet        *fIdentifierProfile;      // characters allowed in identifiers

    // Shared constants, created lazily by the constructor and freed by cleanup().
    static UnicodeSet *ASCII;
    static ScriptSet  *JAPANESE;
    static ScriptSet  *CHINESE;
    static ScriptSet  *KOREAN;
    static ScriptSet  *CONFUSABLE_WITH_LATIN;

};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __IDENTIFIER_INFO_H__
|
||||
|
276
icu4c/source/i18n/scriptset.cpp
Normal file
276
icu4c/source/i18n/scriptset.cpp
Normal file
|
@ -0,0 +1,276 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
* scriptset.cpp
|
||||
*
|
||||
* created on: 2013 Jan 7
|
||||
* created by: Andy Heninger
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/unistr.h"
|
||||
|
||||
#include "scriptset.h"
|
||||
#include "uassert.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// ScriptSet implementation
|
||||
//
|
||||
//----------------------------------------------------------------------------
|
||||
// Construct an empty set: no script bits are set.
ScriptSet::ScriptSet() {
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t word = 0; word < wordCount; ++word) {
        bits[word] = 0;
    }
}

// Nothing to release; the bit array is stored inline.
ScriptSet::~ScriptSet() {
}

// Copy construction is expressed through assignment.
ScriptSet::ScriptSet(const ScriptSet &other) {
    operator=(other);
}
|
||||
|
||||
|
||||
// Copy every word of the bit array from 'other'.
ScriptSet & ScriptSet::operator =(const ScriptSet &other) {
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t word = 0; word < wordCount; ++word) {
        bits[word] = other.bits[word];
    }
    return *this;
}
|
||||
|
||||
|
||||
// Two sets are equal when every word of their bit arrays matches.
UBool ScriptSet::operator == (const ScriptSet &other) const {
    const int32_t wordCount = LENGTHOF(bits);
    int32_t word = 0;
    while (word < wordCount && bits[word] == other.bits[word]) {
        ++word;
    }
    // Reaching the end means no mismatching word was found.
    return (word == wordCount) ? TRUE : FALSE;
}
|
||||
|
||||
/**
 * Test whether a script is a member of this set.
 *
 * @param script  the script code to look up
 * @param status  In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR if the
 *                script code is negative or does not fit in the bit array.
 * @return TRUE if the script's bit is set; FALSE otherwise or on error.
 */
UBool ScriptSet::test(UScriptCode script, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return FALSE;
    }
    if (script < 0 || script >= (int32_t)sizeof(bits) * 8) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return FALSE;
    }
    uint32_t index = script / 32;
    // Shift an unsigned 1: for (script & 31) == 31 a signed "1 << 31" would
    // shift into the sign bit of int.
    uint32_t bit   = (uint32_t)1 << (script & 31);
    return ((bits[index] & bit) != 0);
}
|
||||
|
||||
|
||||
/**
 * Add a script to this set.
 *
 * @param script  the script code to add
 * @param status  In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR if the
 *                script code is negative or does not fit in the bit array;
 *                the set is left unchanged in that case.
 * @return this
 */
ScriptSet &ScriptSet::set(UScriptCode script, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    if (script < 0 || script >= (int32_t)sizeof(bits) * 8) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    uint32_t index = script / 32;
    // Shift an unsigned 1: for (script & 31) == 31 a signed "1 << 31" would
    // shift into the sign bit of int.
    uint32_t bit   = (uint32_t)1 << (script & 31);
    bits[index] |= bit;
    return *this;
}
|
||||
|
||||
/**
 * Remove a script from this set.
 *
 * @param script  the script code to remove
 * @param status  In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR if the
 *                script code is negative or does not fit in the bit array;
 *                the set is left unchanged in that case.
 * @return this
 */
ScriptSet &ScriptSet::reset(UScriptCode script, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    if (script < 0 || script >= (int32_t)sizeof(bits) * 8) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    uint32_t index = script / 32;
    // Shift an unsigned 1: for (script & 31) == 31 a signed "1 << 31" would
    // shift into the sign bit of int.
    uint32_t bit   = (uint32_t)1 << (script & 31);
    bits[index] &= ~bit;
    return *this;
}
|
||||
|
||||
|
||||
|
||||
// Add every script of 'other' to this set (word-wise OR).
ScriptSet &ScriptSet::Union(const ScriptSet &other) {
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t word = 0; word < wordCount; ++word) {
        bits[word] = bits[word] | other.bits[word];
    }
    return *this;
}
|
||||
|
||||
// Keep only the scripts that are also in 'other' (word-wise AND).
ScriptSet &ScriptSet::intersect(const ScriptSet &other) {
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t word = 0; word < wordCount; ++word) {
        bits[word] = bits[word] & other.bits[word];
    }
    return *this;
}
|
||||
|
||||
// Intersect this set with the single-element set {script}.
// If the script code is rejected by set(), this set is left unchanged.
ScriptSet &ScriptSet::intersect(UScriptCode script, UErrorCode &status) {
    ScriptSet single;
    single.set(script, status);
    if (U_FAILURE(status)) {
        return *this;
    }
    return intersect(single);
}
|
||||
|
||||
// TRUE if this set and 'other' have at least one script in common.
UBool ScriptSet::intersects(const ScriptSet &other) const {
    const int32_t wordCount = LENGTHOF(bits);
    int32_t word = 0;
    while (word < wordCount && (bits[word] & other.bits[word]) == 0) {
        ++word;
    }
    // Stopping early means a word with a shared bit was found.
    return (word < wordCount) ? TRUE : FALSE;
}
|
||||
|
||||
// TRUE if every script of 'other' is also in this set, that is,
// (this AND other) == other for every word.
UBool ScriptSet::contains(const ScriptSet &other) const {
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t word = 0; word < wordCount; ++word) {
        if ((bits[word] & other.bits[word]) != other.bits[word]) {
            return FALSE;
        }
    }
    return TRUE;
}
|
||||
|
||||
|
||||
// Turn on every bit of the bit array.
ScriptSet &ScriptSet::setAll() {
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t word = 0; word < wordCount; ++word) {
        bits[word] = 0xffffffffu;
    }
    return *this;
}
|
||||
|
||||
|
||||
// Turn off every bit of the bit array, leaving an empty set.
ScriptSet &ScriptSet::resetAll() {
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t word = 0; word < wordCount; ++word) {
        bits[word] = 0;
    }
    return *this;
}
|
||||
|
||||
// Number of scripts in the set.
// Clearing the lowest set bit once per iteration makes the cost proportional
// to the number of set bits, which is expected to be small.
int32_t ScriptSet::countMembers() const {
    int32_t members = 0;
    const int32_t wordCount = LENGTHOF(bits);
    for (int32_t word = 0; word < wordCount; ++word) {
        for (uint32_t remaining = bits[word]; remaining != 0; remaining &= (remaining - 1)) {
            ++members;
        }
    }
    return members;
}
|
||||
|
||||
// Hash code: the XOR of all words of the bit array.
int32_t ScriptSet::hashCode() const {
    int32_t hash = 0;
    const int32_t wordCount = LENGTHOF(bits);
    int32_t word = 0;
    while (word < wordCount) {
        hash ^= bits[word];
        ++word;
    }
    return hash;
}
|
||||
|
||||
// Return the index of the first set bit at or after fromIndex,
// or -1 if fromIndex is negative or no such bit exists.
int32_t ScriptSet::nextSetBit(int32_t fromIndex) const {
    // TODO: Wants a better implementation.
    if (fromIndex < 0) {
        return -1;
    }
    const int32_t bitCount = (int32_t)sizeof(bits)*8;
    UErrorCode status = U_ZERO_ERROR;
    int32_t candidate = fromIndex;
    while (candidate < bitCount) {
        if (test((UScriptCode)candidate, status)) {
            return candidate;
        }
        ++candidate;
    }
    return -1;
}
|
||||
|
||||
// Append the short names of all scripts in the set to dest, in increasing
// script code order, separated by single spaces.
UnicodeString &ScriptSet::displayScripts(UnicodeString &dest) const {
    UBool needSeparator = FALSE;
    int32_t scriptIndex = nextSetBit(0);
    while (scriptIndex >= 0) {
        if (needSeparator) {
            dest.append(0x20);
        }
        needSeparator = TRUE;
        const char *shortName = uscript_getShortName((UScriptCode)scriptIndex);
        dest.append(UnicodeString(shortName, -1, US_INV));
        scriptIndex = nextSetBit(scriptIndex + 1);
    }
    return dest;
}
|
||||
|
||||
// Replace the contents of this set with the scripts named in scriptString.
// Names are separated by white space and looked up as values of the
// UCHAR_SCRIPT property. An unknown name sets U_ILLEGAL_ARGUMENT_ERROR and
// stops parsing; scripts parsed before the error remain set.
// The set is cleared even when status already indicates a failure on entry.
ScriptSet &ScriptSet::parseScripts(const UnicodeString &scriptString, UErrorCode &status) {
    resetAll();
    if (U_FAILURE(status)) {
        return *this;
    }
    UnicodeString token;
    const int32_t inputLength = scriptString.length();
    int32_t pos = 0;
    while (pos < inputLength) {
        const UChar32 c = scriptString.char32At(pos);
        pos = scriptString.moveIndex32(pos, 1);

        const UBool isSeparator = u_isUWhiteSpace(c);
        if (!isSeparator) {
            token.append(c);
        }
        // A name is complete at a separator or at the end of the input.
        if (!isSeparator && pos < inputLength) {
            continue;
        }
        if (token.length() == 0) {
            continue;
        }

        char buf[40];
        token.extract(0, token.length(), buf, sizeof(buf)-1, US_INV);
        buf[sizeof(buf)-1] = 0;
        const int32_t sc = u_getPropertyValueEnum(UCHAR_SCRIPT, buf);
        if (sc != UCHAR_INVALID_CODE) {
            this->set((UScriptCode)sc, status);
        } else {
            status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        if (U_FAILURE(status)) {
            return *this;
        }
        token.remove();
    }
    return *this;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
// Equality test for two UElements whose pointers refer to ScriptSets.
U_CAPI UBool U_EXPORT2
uhash_equalsScriptSet(const UElement key1, const UElement key2) {
    const icu::ScriptSet &lhs = *static_cast<icu::ScriptSet *>(key1.pointer);
    const icu::ScriptSet &rhs = *static_cast<icu::ScriptSet *>(key2.pointer);
    return lhs == rhs;
}
|
||||
|
||||
/**
 * Ordering function for two UElements whose pointers refer to ScriptSets.
 *
 * Sets are ordered first by their number of members, then by their lowest
 * differing script index.
 *
 * @return a negative value, zero, or a positive value if the first set sorts
 *         before, equal to, or after the second. The result is always one of
 *         -1, 0, +1.
 */
U_CAPI int8_t U_EXPORT2
uhash_compareScriptSet(UElement key0, UElement key1) {
    icu::ScriptSet *s0 = static_cast<icu::ScriptSet *>(key0.pointer);
    icu::ScriptSet *s1 = static_cast<icu::ScriptSet *>(key1.pointer);
    int32_t diff = s0->countMembers() - s1->countMembers();
    if (diff == 0) {
        int32_t i0 = s0->nextSetBit(0);
        int32_t i1 = s1->nextSetBit(0);
        // Walk both sets in step. Index 0 is a valid script, so keep going
        // while the index is >= 0; nextSetBit() returns -1 once a set is
        // exhausted.
        while ((diff = i0-i1) == 0 && i0 >= 0) {
            i0 = s0->nextSetBit(i0+1);
            i1 = s1->nextSetBit(i1+1);
        }
    }
    // Reduce to the sign. The raw difference can be as large as 191 in
    // magnitude (bit indices 0..191), which does not fit in int8_t and would
    // change sign when truncated.
    return (int8_t)((diff > 0) - (diff < 0));
}
|
||||
|
||||
// Hash function for a UElement whose pointer refers to a ScriptSet.
U_CAPI int32_t U_EXPORT2
uhash_hashScriptSet(const UElement key) {
    const icu::ScriptSet *scripts = static_cast<icu::ScriptSet *>(key.pointer);
    const int32_t hash = scripts->hashCode();
    return hash;
}
|
||||
|
||||
// Deleter for ScriptSet objects that are handed over as untyped pointers.
U_CAPI void U_EXPORT2
uhash_deleteScriptSet(void *obj) {
    delete static_cast<icu::ScriptSet *>(obj);
}
|
76
icu4c/source/i18n/scriptset.h
Normal file
76
icu4c/source/i18n/scriptset.h
Normal file
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
* scriptset.h
|
||||
*
|
||||
* created on: 2013 Jan 7
|
||||
* created by: Andy Heninger
|
||||
*/
|
||||
|
||||
#ifndef __SCRIPTSET_H__
|
||||
#define __SCRIPTSET_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/uscript.h"
|
||||
|
||||
#include "uelement.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// ScriptSet - A bit set representing a set of scripts.
|
||||
//
|
||||
// This class was originally used exclusively with script sets appearing
|
||||
// as part of the spoof check whole script confusable binary data. Its
|
||||
// use has since become more general, but the continued use to wrap
|
||||
// prebuilt binary data does constrain the design.
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
class U_I18N_API ScriptSet: public UMemory {
|
||||
public:
|
||||
ScriptSet();
|
||||
ScriptSet(const ScriptSet &other);
|
||||
~ScriptSet();
|
||||
|
||||
UBool operator == (const ScriptSet &other) const;
|
||||
ScriptSet & operator = (const ScriptSet &other);
|
||||
|
||||
UBool test(UScriptCode script, UErrorCode &status) const;  // NOTE(review): presumably TRUE when script is a member - confirm in scriptset.cpp.
|
||||
ScriptSet &Union(const ScriptSet &other);  // NOTE(review): presumably adds other's members to this set - confirm in scriptset.cpp.
|
||||
ScriptSet &set(UScriptCode script, UErrorCode &status);  // NOTE(review): presumably adds one script - confirm in scriptset.cpp.
|
||||
ScriptSet &reset(UScriptCode script, UErrorCode &status);  // NOTE(review): presumably removes one script - confirm in scriptset.cpp.
|
||||
ScriptSet &intersect(const ScriptSet &other);
|
||||
ScriptSet &intersect(UScriptCode script, UErrorCode &status);
|
||||
UBool intersects(const ScriptSet &other) const; // Sets contain at least one script in common.
|
||||
UBool contains(const ScriptSet &other) const; // All set bits in other are also set in this.
|
||||
|
||||
ScriptSet &setAll();
|
||||
ScriptSet &resetAll();
|
||||
int32_t countMembers() const;
|
||||
int32_t hashCode() const;  // Value returned by uhash_hashScriptSet().
|
||||
int32_t nextSetBit(int32_t script) const;  // NOTE(review): callers iterate with nextSetBit(0), then nextSetBit(previous+1); end-of-set return value not visible here - confirm.
|
||||
|
||||
UnicodeString &displayScripts(UnicodeString &dest) const; // append script names to dest string.
|
||||
ScriptSet & parseScripts(const UnicodeString &scriptsString, UErrorCode &status); // Replaces ScriptSet contents.
|
||||
|
||||
private:
|
||||
uint32_t bits[6];  // Bit storage: six 32-bit words, i.e. room for 192 one-bit flags.
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
uhash_compareScriptSet(const UElement key1, const UElement key2);
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_hashScriptSet(const UElement key);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uhash_deleteScriptSet(void *obj);
|
||||
|
||||
#endif // __SCRIPTSET_H__
|
|
@ -26,6 +26,8 @@ as the functions are suppose to be called.
|
|||
It's usually best to have child dependencies called first. */
|
||||
typedef enum ECleanupI18NType {
|
||||
UCLN_I18N_START = -1,
|
||||
UCLN_I18N_IDENTIFIER_INFO,
|
||||
UCLN_I18N_SPOOF,
|
||||
UCLN_I18N_TRANSLITERATOR,
|
||||
UCLN_I18N_REGEX,
|
||||
UCLN_I18N_ISLAMIC_CALENDAR,
|
||||
|
|
|
@ -180,6 +180,21 @@ class U_I18N_API AlphabeticIndex: public UObject {
|
|||
*/
|
||||
AlphabeticIndex(const Locale &locale, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Construct an AlphabeticIndex that uses a specific collator.
|
||||
*
|
||||
* The index will be created with no labels; the addLabels() function must be called
|
||||
* after creation to add the desired labels to the index.
|
||||
*
|
||||
* The index adopts the collator, and is responsible for deleting it.
|
||||
* The caller should make no further use of the collator after creating the index.
|
||||
*
|
||||
* @param collator The collator to use to order the contents of this index.
|
||||
* @param status Error code, will be set with the reason if the
|
||||
* operation fails.
|
||||
* @draft ICU 51
|
||||
*/
|
||||
AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Construct an AlphabeticIndex that uses a specific collator.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2008-2012, International Business Machines Corporation
|
||||
* Copyright (C) 2008-2013, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
***************************************************************************
|
||||
* file name: uspoof.h
|
||||
|
@ -188,11 +188,27 @@ typedef enum USpoofChecks {
|
|||
Any Case Confusable. */
|
||||
USPOOF_ANY_CASE = 8,
|
||||
|
||||
/**
|
||||
* Check that an identifier is no looser than the specified RestrictionLevel.
|
||||
* The default if uspoof_setRestrictionLevel() is not called is HIGHLY_RESTRICTIVE.
|
||||
*
|
||||
* If USPOOF_AUX_INFO is enabled the actual restriction level of the
|
||||
* identifier being tested will also be returned by uspoof_check().
|
||||
*
|
||||
* @see URestrictionLevel
|
||||
* @see uspoof_setRestrictionLevel
|
||||
* @see USPOOF_AUX_INFO
|
||||
*
|
||||
* @stable ICU 51
|
||||
*/
|
||||
USPOOF_RESTRICTION_LEVEL = 16,
|
||||
|
||||
/** Check that an identifier contains only characters from a
|
||||
* single script (plus chars from the common and inherited scripts.)
|
||||
* Applies to checks of a single identifier check only.
|
||||
* @deprecated ICU 51 Use RESTRICTION_LEVEL instead.
|
||||
*/
|
||||
USPOOF_SINGLE_SCRIPT = 16,
|
||||
USPOOF_SINGLE_SCRIPT = USPOOF_RESTRICTION_LEVEL,
|
||||
|
||||
/** Check an identifier for the presence of invisible characters,
|
||||
* such as zero-width spaces, or character sequences that are
|
||||
|
@ -208,10 +224,78 @@ typedef enum USpoofChecks {
|
|||
*/
|
||||
USPOOF_CHAR_LIMIT = 64,
|
||||
|
||||
USPOOF_ALL_CHECKS = 0x7f
|
||||
/**
|
||||
* Check that an identifier does not include decimal digits from
|
||||
* more than one numbering system.
|
||||
*
|
||||
* @draft ICU 51
|
||||
*/
|
||||
USPOOF_MIXED_NUMBERS = 128,
|
||||
|
||||
/**
|
||||
* Enable all spoof checks.
|
||||
*
|
||||
* @stable ICU 4.6
|
||||
*/
|
||||
USPOOF_ALL_CHECKS = 0xFFFF,
|
||||
|
||||
/**
|
||||
* Enable the return of auxiliary (non-error) information in the
|
||||
* upper bits of the check results value.
|
||||
*
|
||||
* If this "check" is not enabled, the results of uspoof_check() will be zero when an
|
||||
* identifier passes all of the enabled checks.
|
||||
*
|
||||
* If this "check" is enabled, (uspoof_check() & USPOOF_ALL_CHECKS) will be zero
|
||||
* when an identifier passes all checks.
|
||||
*
|
||||
* @draft ICU 51
|
||||
*/
|
||||
USPOOF_AUX_INFO = 0x40000000
|
||||
|
||||
} USpoofChecks;
|
||||
|
||||
|
||||
/**
|
||||
* Constants from UAX #39 for use in setRestrictionLevel(), and
|
||||
* for returned identifier restriction levels in check results.
|
||||
* @draft ICU 51
|
||||
*/
|
||||
typedef enum URestrictionLevel {
|
||||
/**
|
||||
* Only ASCII characters: U+0000..U+007F
|
||||
*
|
||||
* @draft ICU 51
|
||||
*/
|
||||
USPOOF_ASCII = 0x10000000,
|
||||
/**
|
||||
* All characters in each identifier must be from a single script, or from the combinations: Latin + Han +
|
||||
* Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul. Note that this level will satisfy the
|
||||
* vast majority of Latin-script users; also that TR36 has ASCII instead of Latin.
|
||||
*
|
||||
* @draft ICU 51
|
||||
*/
|
||||
USPOOF_HIGHLY_RESTRICTIVE = 0x20000000,
|
||||
/**
|
||||
* Allow Latin with other scripts except Cyrillic, Greek, Cherokee. Otherwise, the same as Highly Restrictive.
|
||||
*
|
||||
* @draft ICU 51
|
||||
*/
|
||||
USPOOF_MODERATELY_RESTRICTIVE = 0x30000000,
|
||||
/**
|
||||
* Allow arbitrary mixtures of scripts. Otherwise, the same as Moderately Restrictive.
|
||||
*
|
||||
* @draft ICU 51
|
||||
*/
|
||||
USPOOF_MINIMALLY_RESTRICTIVE = 0x40000000,
|
||||
/**
|
||||
* Any valid identifiers, including characters outside of the Identifier Profile.
|
||||
*
|
||||
* @draft ICU 51
|
||||
*/
|
||||
USPOOF_UNRESTRICTIVE = 0x50000000
|
||||
} URestrictionLevel;
|
||||
|
||||
/**
|
||||
* Create a Unicode Spoof Checker, configured to perform all
|
||||
* checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT.
|
||||
|
@ -255,7 +339,7 @@ uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLeng
|
|||
* Open a Spoof Checker from the source form of the spoof data.
|
||||
* The Three inputs correspond to the Unicode data files confusables.txt
|
||||
* confusablesWholeScript.txt and xidmodifications.txt as described in
|
||||
* Unicode UAX 39. The syntax of the source data is as described in UAX 39 for
|
||||
* Unicode UAX #39. The syntax of the source data is as described in UAX #39 for
|
||||
* these files, and the content of these files is acceptable input.
|
||||
*
|
||||
* The character encoding of the (char *) input text is UTF-8.
|
||||
|
@ -357,6 +441,28 @@ uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status);
|
|||
U_STABLE int32_t U_EXPORT2
|
||||
uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Set the loosest restriction level allowed. The default if this function
|
||||
* is not called is HIGHLY_RESTRICTIVE.
|
||||
* Calling this function also enables the RESTRICTION_LEVEL check.
|
||||
* @param restrictionLevel The loosest restriction level allowed.
|
||||
* @see URestrictionLevel
|
||||
* @draft ICU 51
|
||||
*/
|
||||
U_DRAFT void U_EXPORT2
|
||||
uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
|
||||
|
||||
|
||||
/**
|
||||
* Get the Restriction Level that will be tested if the checks include RESTRICTION_LEVEL.
|
||||
*
|
||||
* @return The restriction level
|
||||
* @see URestrictionLevel
|
||||
* @draft ICU 51
|
||||
*/
|
||||
U_DRAFT URestrictionLevel U_EXPORT2
|
||||
uspoof_getRestrictionLevel(const USpoofChecker *sc);
|
||||
|
||||
/**
|
||||
* Limit characters that are acceptable in identifiers being checked to those
|
||||
* normally used with the languages associated with the specified locales.
|
||||
|
@ -488,7 +594,7 @@ uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status);
|
|||
* characters that are permitted. Ownership of the set
|
||||
* remains with the caller. The incoming set is cloned by
|
||||
* this function, so there are no restrictions on modifying
|
||||
* or deleting the USet after calling this function.
|
||||
* or deleting the UnicodeSet after calling this function.
|
||||
* @param status The error code, set if this function encounters a problem.
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
|
@ -527,31 +633,29 @@ uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status);
|
|||
* The set of checks to be performed is specified with uspoof_setChecks().
|
||||
*
|
||||
* @param sc The USpoofChecker
|
||||
* @param text The string to be checked for possible security issues,
|
||||
* @param id The identifier to be checked for possible security issues,
|
||||
* in UTF-16 format.
|
||||
* @param length the length of the string to be checked, expressed in
|
||||
* 16 bit UTF-16 code units, or -1 if the string is
|
||||
* zero terminated.
|
||||
* @param position An out parameter that receives the index of the
|
||||
* first string position that fails the allowed character
|
||||
* limitation checks.
|
||||
* This parameter may be null if the position information
|
||||
* is not needed.
|
||||
* If the string passes the requested checks the
|
||||
* parameter value will not be set.
|
||||
* @param position An out parameter.
|
||||
* Originally, the index of the first string position that failed a check.
|
||||
* Now, always returns zero.
|
||||
* This parameter may be null.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* Spoofing or security issues detected with the input string are
|
||||
* not reported here, but through the function's return value.
|
||||
* @return An integer value with bits set for any potential security
|
||||
* or spoofing issues detected. The bits are defined by
|
||||
* enum USpoofChecks. Zero is returned if no issues
|
||||
* are found with the input string.
|
||||
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
|
||||
* will be zero if the input string passes all of the
|
||||
* enabled checks.
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
U_STABLE int32_t U_EXPORT2
|
||||
uspoof_check(const USpoofChecker *sc,
|
||||
const UChar *text, int32_t length,
|
||||
const UChar *id, int32_t length,
|
||||
int32_t *position,
|
||||
UErrorCode *status);
|
||||
|
||||
|
@ -562,16 +666,14 @@ uspoof_check(const USpoofChecker *sc,
|
|||
* The set of checks to be performed is specified with uspoof_setChecks().
|
||||
*
|
||||
* @param sc The USpoofChecker
|
||||
* @param text A UTF-8 string to be checked for possible security issues.
|
||||
* @param id An identifier to be checked for possible security issues, in UTF-8 format.
|
||||
* @param length the length of the string to be checked, or -1 if the string is
|
||||
* zero terminated.
|
||||
* @param position An out parameter that receives the index of the
|
||||
* first string position that fails the allowed character
|
||||
* limitation checks.
|
||||
* This parameter may be null if the position information
|
||||
* is not needed.
|
||||
* If the string passes the requested checks the
|
||||
* parameter value will not be set.
|
||||
* @param position An out parameter.
|
||||
* Originally, the index of the first string position that failed a check.
|
||||
* Now, always returns zero.
|
||||
* This parameter may be null.
|
||||
* @deprecated ICU 51
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* Spoofing or security issues detected with the input string are
|
||||
|
@ -580,13 +682,14 @@ uspoof_check(const USpoofChecker *sc,
|
|||
* a status of U_INVALID_CHAR_FOUND will be returned.
|
||||
* @return An integer value with bits set for any potential security
|
||||
* or spoofing issues detected. The bits are defined by
|
||||
* enum USpoofChecks. Zero is returned if no issues
|
||||
* are found with the input string.
|
||||
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
|
||||
* will be zero if the input string passes all of the
|
||||
* enabled checks.
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
U_STABLE int32_t U_EXPORT2
|
||||
uspoof_checkUTF8(const USpoofChecker *sc,
|
||||
const char *text, int32_t length,
|
||||
const char *id, int32_t length,
|
||||
int32_t *position,
|
||||
UErrorCode *status);
|
||||
|
||||
|
@ -598,28 +701,26 @@ uspoof_checkUTF8(const USpoofChecker *sc,
|
|||
* The set of checks to be performed is specified with uspoof_setChecks().
|
||||
*
|
||||
* @param sc The USpoofChecker
|
||||
* @param text A UnicodeString to be checked for possible security issues.
|
||||
* @param position An out parameter that receives the index of the
|
||||
* first string position that fails the allowed character
|
||||
* limitation checks.
|
||||
* This parameter may be null if the position information
|
||||
* is not needed.
|
||||
* If the string passes the requested checks the
|
||||
* parameter value will not be set.
|
||||
* @param id An identifier to be checked for possible security issues.
|
||||
* @param position An out parameter.
|
||||
* Originally, the index of the first string position that failed a check.
|
||||
* Now, always returns zero.
|
||||
* This parameter may be null.
|
||||
* @deprecated ICU 51
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* Spoofing or security issues detected with the input string are
|
||||
* not reported here, but through the function's return value.
|
||||
|
||||
* @return An integer value with bits set for any potential security
|
||||
* or spoofing issues detected. The bits are defined by
|
||||
* enum USpoofChecks. Zero is returned if no issues
|
||||
* are found with the input string.
|
||||
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
|
||||
* will be zero if the input string passes all of the
|
||||
* enabled checks.
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
U_STABLE int32_t U_EXPORT2
|
||||
uspoof_checkUnicodeString(const USpoofChecker *sc,
|
||||
const icu::UnicodeString &text,
|
||||
const icu::UnicodeString &id,
|
||||
int32_t *position,
|
||||
UErrorCode *status);
|
||||
|
||||
|
@ -645,30 +746,30 @@ uspoof_checkUnicodeString(const USpoofChecker *sc,
|
|||
*
|
||||
*
|
||||
* @param sc The USpoofChecker
|
||||
* @param s1 The first of the two strings to be compared for
|
||||
* @param id1 The first of the two identifiers to be compared for
|
||||
* confusability. The strings are in UTF-16 format.
|
||||
* @param length1 the length of the first string, expressed in
|
||||
* @param length1 the length of the first identifier, expressed in
|
||||
* 16 bit UTF-16 code units, or -1 if the string is
|
||||
* zero terminated.
|
||||
* @param s2 The second of the two strings to be compared for
|
||||
* confusability. The strings are in UTF-16 format.
|
||||
* @param length2 The length of the second string, expressed in
|
||||
* nul terminated.
|
||||
* @param id2 The second of the two identifiers to be compared for
|
||||
* confusability. The identifiers are in UTF-16 format.
|
||||
* @param length2 The length of the second identifier, expressed in
|
||||
* 16 bit UTF-16 code units, or -1 if the string is
|
||||
* zero terminated.
|
||||
* nul terminated.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* Confusability of the strings is not reported here,
|
||||
* Confusability of the identifiers is not reported here,
|
||||
* but through this function's return value.
|
||||
* @return An integer value with bit(s) set corresponding to
|
||||
* the type of confusability found, as defined by
|
||||
* enum USpoofChecks. Zero is returned if the strings
|
||||
* enum USpoofChecks. Zero is returned if the identifiers
|
||||
* are not confusable.
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
U_STABLE int32_t U_EXPORT2
|
||||
uspoof_areConfusable(const USpoofChecker *sc,
|
||||
const UChar *s1, int32_t length1,
|
||||
const UChar *s2, int32_t length2,
|
||||
const UChar *id1, int32_t length1,
|
||||
const UChar *id2, int32_t length2,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
|
@ -680,14 +781,14 @@ uspoof_areConfusable(const USpoofChecker *sc,
|
|||
* USpoofChecker.
|
||||
*
|
||||
* @param sc The USpoofChecker
|
||||
* @param s1 The first of the two strings to be compared for
|
||||
* @param id1 The first of the two identifiers to be compared for
|
||||
* confusability. The strings are in UTF-8 format.
|
||||
* @param length1 the length of the first identifier, in bytes, or -1
|
||||
* if the string is nul terminated.
|
||||
* @param id2 The second of the two identifiers to be compared for
|
||||
* confusability. The strings are in UTF-8 format.
|
||||
* @param length1 the length of the first string, in bytes, or -1
|
||||
* if the string is zero terminated.
|
||||
* @param s2 The second of the two strings to be compared for
|
||||
* confusability. The strings are in UTF-18 format.
|
||||
* @param length2 The length of the second string in bytes, or -1
|
||||
* if the string is zero terminated.
|
||||
* if the string is nul terminated.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* Confusability of the strings is not reported here,
|
||||
|
@ -700,8 +801,8 @@ uspoof_areConfusable(const USpoofChecker *sc,
|
|||
*/
|
||||
U_STABLE int32_t U_EXPORT2
|
||||
uspoof_areConfusableUTF8(const USpoofChecker *sc,
|
||||
const char *s1, int32_t length1,
|
||||
const char *s2, int32_t length2,
|
||||
const char *id1, int32_t length1,
|
||||
const char *id2, int32_t length2,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
|
@ -715,17 +816,17 @@ uspoof_areConfusableUTF8(const USpoofChecker *sc,
|
|||
* USpoofChecker.
|
||||
*
|
||||
* @param sc The USpoofChecker
|
||||
* @param s1 The first of the two strings to be compared for
|
||||
* @param id1 The first of the two identifiers to be compared for
|
||||
* confusability. The strings are in UTF-8 format.
|
||||
* @param id2 The second of the two identifiers to be compared for
|
||||
* confusability. The strings are in UTF-8 format.
|
||||
* @param s2 The second of the two strings to be compared for
|
||||
* confusability. The strings are in UTF-18 format.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* Confusability of the strings is not reported here,
|
||||
* Confusability of the identifiers is not reported here,
|
||||
* but through this function's return value.
|
||||
* @return An integer value with bit(s) set corresponding to
|
||||
* the type of confusability found, as defined by
|
||||
* enum USpoofChecks. Zero is returned if the strings
|
||||
* enum USpoofChecks. Zero is returned if the identifiers
|
||||
* are not confusable.
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
|
@ -738,10 +839,10 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
|
|||
|
||||
|
||||
/**
|
||||
* Get the "skeleton" for an identifier string.
|
||||
* Skeletons are a transformation of the input string;
|
||||
* Two strings are confusable if their skeletons are identical.
|
||||
* See Unicode UAX 39 for additional information.
|
||||
* Get the "skeleton" for an identifier.
|
||||
* Skeletons are a transformation of the input identifier;
|
||||
* Two identifiers are confusable if their skeletons are identical.
|
||||
* See Unicode UAX #39 for additional information.
|
||||
*
|
||||
* Using skeletons directly makes it possible to quickly check
|
||||
* whether an identifier is confusable with any of some large
|
||||
|
@ -754,8 +855,8 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
|
|||
* The default is Mixed-Script, Lowercase.
|
||||
* Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and
|
||||
* USPOOF_ANY_CASE_CONFUSABLE. The two flags may be ORed.
|
||||
* @param s The input string whose skeleton will be computed.
|
||||
* @param length The length of the input string, expressed in 16 bit
|
||||
* @param id The input identifier whose skeleton will be computed.
|
||||
* @param length The length of the input identifier, expressed in 16 bit
|
||||
* UTF-16 code units, or -1 if the string is zero terminated.
|
||||
* @param dest The output buffer, to receive the skeleton string.
|
||||
* @param destCapacity The length of the output buffer, in 16 bit units.
|
||||
|
@ -772,15 +873,15 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
|
|||
U_STABLE int32_t U_EXPORT2
|
||||
uspoof_getSkeleton(const USpoofChecker *sc,
|
||||
uint32_t type,
|
||||
const UChar *s, int32_t length,
|
||||
const UChar *id, int32_t length,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Get the "skeleton" for an identifier string.
|
||||
* Skeletons are a transformation of the input string;
|
||||
* Two strings are confusable if their skeletons are identical.
|
||||
* See Unicode UAX 39 for additional information.
|
||||
* Get the "skeleton" for an identifier.
|
||||
* Skeletons are a transformation of the input identifier;
|
||||
* Two identifiers are confusable if their skeletons are identical.
|
||||
* See Unicode UAX #39 for additional information.
|
||||
*
|
||||
* Using skeletons directly makes it possible to quickly check
|
||||
* whether an identifier is confusable with any of some large
|
||||
|
@ -793,7 +894,7 @@ uspoof_getSkeleton(const USpoofChecker *sc,
|
|||
* The default is Mixed-Script, Lowercase.
|
||||
* Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and
|
||||
* USPOOF_ANY_CASE. The two flags may be ORed.
|
||||
* @param s The UTF-8 format input string whose skeleton will be computed.
|
||||
* @param id The UTF-8 format identifier whose skeleton will be computed.
|
||||
* @param length The length of the input string, in bytes,
|
||||
* or -1 if the string is zero terminated.
|
||||
* @param dest The output buffer, to receive the skeleton string.
|
||||
|
@ -814,16 +915,16 @@ uspoof_getSkeleton(const USpoofChecker *sc,
|
|||
U_STABLE int32_t U_EXPORT2
|
||||
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
|
||||
uint32_t type,
|
||||
const char *s, int32_t length,
|
||||
const char *id, int32_t length,
|
||||
char *dest, int32_t destCapacity,
|
||||
UErrorCode *status);
|
||||
|
||||
#if U_SHOW_CPLUSPLUS_API
|
||||
/**
|
||||
* Get the "skeleton" for an identifier string.
|
||||
* Skeletons are a transformation of the input string;
|
||||
* Two strings are confusable if their skeletons are identical.
|
||||
* See Unicode UAX 39 for additional information.
|
||||
* Get the "skeleton" for an identifier.
|
||||
* Skeletons are a transformation of the input identifier;
|
||||
* Two identifiers are confusable if their skeletons are identical.
|
||||
* See Unicode UAX #39 for additional information.
|
||||
*
|
||||
* Using skeletons directly makes it possible to quickly check
|
||||
* whether an identifier is confusable with any of some large
|
||||
|
@ -836,8 +937,8 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
|
|||
* The default is Mixed-Script, Lowercase.
|
||||
* Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and
|
||||
* USPOOF_ANY_CASE_CONFUSABLE. The two flags may be ORed.
|
||||
* @param s The input string whose skeleton will be computed.
|
||||
* @param dest The output string, to receive the skeleton string.
|
||||
* @param id The input identifier whose skeleton will be computed.
|
||||
* @param dest The output identifier, to receive the skeleton string.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* @return A reference to the destination (skeleton) string.
|
||||
|
@ -847,17 +948,83 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
|
|||
U_I18N_API icu::UnicodeString & U_EXPORT2
|
||||
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
|
||||
uint32_t type,
|
||||
const icu::UnicodeString &s,
|
||||
const icu::UnicodeString &id,
|
||||
icu::UnicodeString &dest,
|
||||
UErrorCode *status);
|
||||
#endif /* U_SHOW_CPLUSPLUS_API */
|
||||
|
||||
|
||||
/**
|
||||
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
|
||||
* in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers
|
||||
*
|
||||
* The returned set is frozen. Ownership of the set remains with the ICU library; it must not
|
||||
* be deleted by the caller.
|
||||
*
|
||||
* @param status The error code, set if a problem occurs while creating the set.
|
||||
*
|
||||
* @draft ICU 51
|
||||
*/
|
||||
U_DRAFT const USet * U_EXPORT2
|
||||
uspoof_getInclusionSet(UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
|
||||
* in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Recommended_Scripts
|
||||
*
|
||||
* The returned set is frozen. Ownership of the set remains with the ICU library; it must not
|
||||
* be deleted by the caller.
|
||||
*
|
||||
* @param status The error code, set if a problem occurs while creating the set.
|
||||
*
|
||||
* @draft ICU 51
|
||||
*/
|
||||
U_DRAFT const USet * U_EXPORT2
|
||||
uspoof_getRecommendedSet(UErrorCode *status);
|
||||
|
||||
|
||||
|
||||
#if U_SHOW_CPLUSPLUS_API
|
||||
|
||||
/**
|
||||
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
|
||||
* in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers
|
||||
*
|
||||
* The returned set is frozen. Ownership of the set remains with the ICU library; it must not
|
||||
* be deleted by the caller.
|
||||
*
|
||||
* @param status The error code, set if a problem occurs while creating the set.
|
||||
*
|
||||
* @draft ICU 51
|
||||
*/
|
||||
U_DRAFT const UnicodeSet * U_EXPORT2
|
||||
uspoof_getInclusionUnicodeSet(UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
|
||||
* in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Recommended_Scripts
|
||||
*
|
||||
* The returned set is frozen. Ownership of the set remains with the ICU library; it must not
|
||||
* be deleted by the caller.
|
||||
*
|
||||
* @param status The error code, set if a problem occurs while creating the set.
|
||||
*
|
||||
* @draft ICU 51
|
||||
*/
|
||||
U_DRAFT const UnicodeSet * U_EXPORT2
|
||||
uspoof_getRecommendedUnicodeSet(UErrorCode *status);
|
||||
|
||||
#endif /* U_SHOW_CPLUSPLUS_API */
|
||||
|
||||
/**
|
||||
* Serialize the data for a spoof detector into a chunk of memory.
|
||||
* The flattened spoof detection tables can later be used to efficiently
|
||||
* instantiate a new Spoof Detector.
|
||||
*
|
||||
* The serialized spoof checker includes only the data compiled from the
|
||||
* Unicode data tables by uspoof_openFromSource(); it does not include
|
||||
* any other state or configuration that may have been set.
|
||||
*
|
||||
* @param sc the Spoof Detector whose data is to be serialized.
|
||||
* @param data a pointer to 32-bit-aligned memory to be filled with the data,
|
||||
* can be NULL if capacity==0
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,19 +1,20 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2008-2011, International Business Machines
|
||||
* Copyright (C) 2008-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "utrie2.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "identifier_info.h"
|
||||
#include "scriptset.h"
|
||||
#include "udatamem.h"
|
||||
#include "umutex.h"
|
||||
#include "udataswp.h"
|
||||
|
@ -28,37 +29,41 @@ U_NAMESPACE_BEGIN
|
|||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
|
||||
|
||||
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
|
||||
fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) , fAllowedLocales(uprv_strdup("")) {
|
||||
fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
|
||||
fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
fMagic = USPOOF_MAGIC;
|
||||
fSpoofData = data;
|
||||
fChecks = USPOOF_ALL_CHECKS;
|
||||
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
|
||||
if (allowedCharsSet == NULL || fAllowedLocales == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
allowedCharsSet->freeze();
|
||||
fAllowedCharsSet = allowedCharsSet;
|
||||
}
|
||||
fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
|
||||
|
||||
|
||||
SpoofImpl::SpoofImpl() {
|
||||
fMagic = USPOOF_MAGIC;
|
||||
fSpoofData = NULL;
|
||||
fChecks = USPOOF_ALL_CHECKS;
|
||||
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
|
||||
allowedCharsSet->freeze();
|
||||
fAllowedCharsSet = allowedCharsSet;
|
||||
fAllowedLocales = uprv_strdup("");
|
||||
if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
fMagic = USPOOF_MAGIC;
|
||||
}
|
||||
|
||||
|
||||
SpoofImpl::SpoofImpl() :
|
||||
fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
|
||||
fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
|
||||
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
|
||||
allowedCharsSet->freeze();
|
||||
fAllowedCharsSet = allowedCharsSet;
|
||||
fAllowedLocales = uprv_strdup("");
|
||||
fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
|
||||
}
|
||||
|
||||
|
||||
// Copy Constructor, used by the user level clone() function.
|
||||
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
|
||||
fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) {
|
||||
fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
|
||||
fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
@ -72,6 +77,7 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
|
|||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
fAllowedLocales = uprv_strdup(src.fAllowedLocales);
|
||||
fRestrictionLevel = src.fRestrictionLevel;
|
||||
}
|
||||
|
||||
SpoofImpl::~SpoofImpl() {
|
||||
|
@ -82,6 +88,7 @@ SpoofImpl::~SpoofImpl() {
|
|||
}
|
||||
delete fAllowedCharsSet;
|
||||
uprv_free((void *)fAllowedLocales);
|
||||
delete fCachedIdentifierInfo;
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -121,10 +128,10 @@ SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
|
|||
// implementation.
|
||||
//
|
||||
// Given a source character, produce the corresponding
|
||||
// replacement character(s)
|
||||
// replacement character(s), appending them to the dest string.
|
||||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const {
|
||||
int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
|
||||
|
||||
// Binary search the spoof data key table for the inChar
|
||||
int32_t *low = fSpoofData->fCFUKeys;
|
||||
|
@ -148,7 +155,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
|
|||
if (inChar != midc) {
|
||||
// Char not found. It maps to itself.
|
||||
int i = 0;
|
||||
U16_APPEND_UNSAFE(destBuf, i, inChar)
|
||||
dest.append(inChar);
|
||||
return i;
|
||||
}
|
||||
foundChar:
|
||||
|
@ -176,7 +183,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
|
|||
// No key entry for this char & table.
|
||||
// The input char maps to itself.
|
||||
int i = 0;
|
||||
U16_APPEND_UNSAFE(destBuf, i, inChar)
|
||||
dest.append(inChar);
|
||||
return i;
|
||||
}
|
||||
|
||||
|
@ -188,7 +195,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
|
|||
// an index into the string table (for longer strings)
|
||||
uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
|
||||
if (stringLen == 1) {
|
||||
destBuf[0] = value;
|
||||
dest.append((UChar)value);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -212,9 +219,7 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
|
|||
|
||||
U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
|
||||
UChar *src = &fSpoofData->fCFUStrings[value];
|
||||
for (ix=0; ix<stringLen; ix++) {
|
||||
destBuf[ix] = src[ix];
|
||||
}
|
||||
dest.append(src, stringLen);
|
||||
return stringLen;
|
||||
}
|
||||
|
||||
|
@ -231,16 +236,15 @@ int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *de
|
|||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
void SpoofImpl::wholeScriptCheck(
|
||||
const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const {
|
||||
|
||||
int32_t inputIdx = 0;
|
||||
UChar32 c;
|
||||
const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
|
||||
|
||||
UTrie2 *table =
|
||||
(fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
|
||||
result->setAll();
|
||||
while (inputIdx < length) {
|
||||
U16_NEXT(text, inputIdx, length, c);
|
||||
int32_t length = text.length();
|
||||
for (int32_t inputIdx=0; inputIdx < length;) {
|
||||
UChar32 c = text.char32At(inputIdx);
|
||||
inputIdx += U16_LENGTH(c);
|
||||
uint32_t index = utrie2_get32(table, c);
|
||||
if (index == 0) {
|
||||
// No confusables in another script for this char.
|
||||
|
@ -249,7 +253,7 @@ void SpoofImpl::wholeScriptCheck(
|
|||
// Until then, grab the script from the char and intersect it with the set.
|
||||
UScriptCode cpScript = uscript_getScript(c, &status);
|
||||
U_ASSERT(cpScript > USCRIPT_INHERITED);
|
||||
result->intersect(cpScript);
|
||||
result->intersect(cpScript, status);
|
||||
} else if (index == 1) {
|
||||
// Script == Common or Inherited. Nothing to do.
|
||||
} else {
|
||||
|
@ -371,47 +375,6 @@ void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UEr
|
|||
}
|
||||
|
||||
|
||||
// Count the distinct scripts appearing in a UTF-16 string, stopping as soon
// as a second one is seen.
//   text    the UTF-16 text to scan
//   length  number of code units in text, or -1 if text is NUL terminated
//   pos     written only when a second script is found; receives the input
//           index just past the character that introduced it
//   status  ICU error code; on entry failure the function returns 0 at once
// Returns 0, 1 or 2.
// Characters whose script is Common, Inherited or Unknown are skipped.
int32_t SpoofImpl::scriptScan
        (const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }

    const UBool nulTerminated = (length == -1);
    int32_t     idx           = 0;
    int32_t     numScripts    = 0;
    UScriptCode prevScript    = USCRIPT_INVALID_CODE;

    while (numScripts < 2 && (nulTerminated || idx < length)) {
        UChar32 cp;
        U16_NEXT(text, idx, length, cp);
        if (nulTerminated && cp == 0) {
            break;
        }

        UScriptCode script = uscript_getScript(cp, &status);
        switch (script) {
        case USCRIPT_COMMON:
        case USCRIPT_INHERITED:
        case USCRIPT_UNKNOWN:
            // These never count as a script of their own.
            continue;

        case USCRIPT_HIRAGANA:
        case USCRIPT_KATAKANA:
        case USCRIPT_HANGUL:
            // Temporary fix: Hiragana, Katakana and Hangul are folded into Han,
            // because names are allowed to mix these scripts.
            // A more general solution will follow later for characters that
            // are used with multiple scripts.
            script = USCRIPT_HAN;
            break;

        default:
            break;
        }

        if (script != prevScript) {
            prevScript = script;
            ++numScripts;
        }
    }

    if (numScripts == 2) {
        pos = idx;
    }
    return numScripts;
}
|
||||
|
||||
|
||||
// Convert a text format hex number. Utility function used by builder code. Static.
|
||||
// Input: UChar *string text. Output: a UChar32
|
||||
// Input has been pre-checked, and will have no non-hex chars.
|
||||
|
@ -443,6 +406,54 @@ UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorC
|
|||
return (UChar32)val;
|
||||
}
|
||||
|
||||
// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
|
||||
// Maintain a one-element cache, which is sufficient to avoid repeatedly
|
||||
// creating new ones unless we get multi-thread concurrency in spoof
|
||||
// check operations, which should be statistically uncommon.
|
||||
|
||||
// These functions are used in place of new & delete of an IdentifierInfo.
|
||||
// They will recycle the IdentifierInfo when possible.
|
||||
// They are logically const, and used within const functions that must be thread safe.
|
||||
// Obtain an IdentifierInfo for the caller's use: the cached instance if one
// is available, otherwise a newly allocated one.
//   status  ICU error code. If it already indicates failure, NULL is returned
//           and nothing is touched. Set to U_MEMORY_ALLOCATION_ERROR when a
//           new object cannot be allocated.
// Returns the object, or NULL on failure.
IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return NULL;
    }

    // Logically const: only the cache slot is modified.
    SpoofImpl *self = const_cast<SpoofImpl *>(this);
    IdentifierInfo *info = NULL;
    {
        // Take whatever is in the cache slot and empty the slot, under the Mutex.
        Mutex lock;
        info = self->fCachedIdentifierInfo;
        self->fCachedIdentifierInfo = NULL;
    }
    if (info != NULL) {
        return info;
    }

    // Cache was empty; create a new one.
    info = new IdentifierInfo(status);
    if (info == NULL) {
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return NULL;
    }
    if (U_FAILURE(status)) {
        // The object was allocated but its constructor reported an error.
        delete info;
        return NULL;
    }
    return info;
}
|
||||
|
||||
|
||||
// Give back an IdentifierInfo previously obtained from getIdentifierInfo().
// If the one-element cache is empty the object is parked there for reuse;
// otherwise it is deleted. A NULL argument is ignored.
void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
    if (idInfo == NULL) {
        return;
    }
    // Logically const: only the cache slot is modified.
    SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
    {
        Mutex m;
        if (nonConstThis->fCachedIdentifierInfo == NULL) {
            nonConstThis->fCachedIdentifierInfo = idInfo;
            idInfo = NULL;      // Ownership moved to the cache.
        }
    }
    // Still non-NULL only when the cache slot was already occupied.
    delete idInfo;
}
|
||||
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------------
|
||||
|
@ -673,149 +684,6 @@ void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
|
|||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// ScriptSet implementation
|
||||
//
|
||||
//----------------------------------------------------------------------------
|
||||
// Construct an empty set: every word of the bit array is cleared.
ScriptSet::ScriptSet() {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    while (w < wordCount) {
        bits[w] = 0;
        ++w;
    }
}
|
||||
|
||||
// Destructor. Nothing to do.
ScriptSet::~ScriptSet() {}
|
||||
|
||||
// Equality: TRUE when every word of the two bit arrays matches.
UBool ScriptSet::operator == (const ScriptSet &other) {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    // Advance while the words agree; stop at the first mismatch.
    while (w < wordCount && bits[w] == other.bits[w]) {
        ++w;
    }
    return (w == wordCount) ? TRUE : FALSE;
}
|
||||
|
||||
// Add a single script to this set.
//   script  the script code whose bit is to be turned on.
// A script code that does not fit in the bit array trips the assertion in
// debug builds and is ignored otherwise, so no out-of-bounds write can occur.
void ScriptSet::Union(UScriptCode script) {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t index = script / 32;
    // Unsigned shift: shifting a signed 1 into bit 31 is not well defined.
    uint32_t bit   = (uint32_t)1 << (script & 31);
    // The index must address one of the words of bits[].
    U_ASSERT(index < wordCount);
    if (index >= wordCount) {
        return;
    }
    bits[index] |= bit;
}
|
||||
|
||||
|
||||
void ScriptSet::Union(const ScriptSet &other) {
|
||||
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
|
||||
bits[i] |= other.bits[i];
|
||||
}
|
||||
}
|
||||
|
||||
void ScriptSet::intersect(const ScriptSet &other) {
|
||||
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
|
||||
bits[i] &= other.bits[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Intersect this set with the single-element set { script }.
// Afterwards the set contains script if it did before, and nothing else.
// A script code that does not fit in the bit array trips the assertion in
// debug builds; otherwise the result is the empty set and no out-of-bounds
// access occurs.
void ScriptSet::intersect(UScriptCode script) {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t index = script / 32;
    // Unsigned shift: shifting a signed 1 into bit 31 is not well defined.
    uint32_t bit   = (uint32_t)1 << (script & 31);
    // The index must address one of the words of bits[].
    U_ASSERT(index < wordCount);
    for (uint32_t i=0; i<wordCount; i++) {
        if (i == index) {
            bits[i] &= bit;     // Keep only the bit for this script.
        } else {
            bits[i] = 0;        // Every other word is cleared.
        }
    }
}
|
||||
|
||||
|
||||
// Assignment: copy the bit array of other, word by word. Returns *this.
ScriptSet & ScriptSet::operator =(const ScriptSet &other) {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    while (w < wordCount) {
        bits[w] = other.bits[w];
        ++w;
    }
    return *this;
}
|
||||
|
||||
|
||||
void ScriptSet::setAll() {
|
||||
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
|
||||
bits[i] = 0xffffffffu;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ScriptSet::resetAll() {
|
||||
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
|
||||
bits[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the number of scripts (one bits) in the set.
int32_t ScriptSet::countMembers() {
    // Clearing the lowest set bit on each step makes the work proportional
    // to the number of one bits, which is expected to be small in practice.
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    int32_t total = 0;
    for (uint32_t w = 0; w < wordCount; ++w) {
        for (uint32_t remaining = bits[w]; remaining != 0; remaining &= (remaining - 1)) {
            ++total;
        }
    }
    return total;
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// NFDBuffer Implementation.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// Normalize the input text to NFD, keeping the result in this object.
//   text    the UTF-16 text to normalize
//   length  length of text, passed straight through to unorm_normalize()
//   status  ICU error code. If already a failure, nothing is normalized and
//           the buffer pointer stays NULL. Set to U_MEMORY_ALLOCATION_ERROR
//           if a heap buffer is needed and cannot be allocated.
// The in-object buffer is tried first; a heap buffer is used only when
// unorm_normalize() reports that the result does not fit.
NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
    fOriginalText         = text;
    fNormalizedText       = NULL;
    fNormalizedTextLength = 0;
    if (U_FAILURE(status)) {
        return;
    }

    // First attempt, into the small in-object buffer.
    fNormalizedText = fSmallBuf;
    fNormalizedTextLength = unorm_normalize(
        text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return;
    }

    // Overflow: the first call returned the length required.
    // Allocate that much (plus one) and normalize again.
    status = U_ZERO_ERROR;
    const int32_t capacity = fNormalizedTextLength + 1;
    fNormalizedText = (UChar *)uprv_malloc(capacity*sizeof(UChar));
    if (fNormalizedText == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0,
                                            fNormalizedText, capacity, &status);
}
|
||||
|
||||
|
||||
// Destructor. Frees the normalized-text buffer unless it is the in-object
// small buffer, then clears the pointer.
NFDBuffer::~NFDBuffer() {
    UChar *buffer = fNormalizedText;
    fNormalizedText = 0;
    if (buffer != fSmallBuf) {
        uprv_free(buffer);
    }
}
|
||||
|
||||
// Return a pointer to the normalized text held by this object.
// The pointer is NULL if the constructor did not obtain a buffer.
const UChar *NFDBuffer::getBuffer() {
    const UChar *normalized = fNormalizedText;
    return normalized;
}
|
||||
|
||||
// Return the length recorded for the normalized text, as last reported by
// unorm_normalize() in the constructor (0 if normalization was never run).
int32_t NFDBuffer::getLength() {
    int32_t normalizedLength = fNormalizedTextLength;
    return normalizedLength;
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2008-2011, International Business Machines Corporation
|
||||
* Copyright (C) 2008-2013, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
***************************************************************************
|
||||
*
|
||||
|
@ -15,10 +15,10 @@
|
|||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "utrie2.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/udata.h"
|
||||
|
||||
#include "utrie2.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
|
@ -37,10 +37,11 @@ U_NAMESPACE_BEGIN
|
|||
// Magic number for sanity checking spoof data.
|
||||
#define USPOOF_MAGIC 0x3845fdef
|
||||
|
||||
class IdentifierInfo;
|
||||
class ScriptSet;
|
||||
class SpoofData;
|
||||
struct SpoofDataHeader;
|
||||
struct SpoofStringLengthsElement;
|
||||
class ScriptSet;
|
||||
|
||||
/**
|
||||
* Class SpoofImpl corresponds directly to the plain C API opaque type
|
||||
|
@ -65,7 +66,7 @@ public:
|
|||
* One of USPOOF_SL_TABLE_FLAG, USPOOF_MA_TABLE_FLAG, etc.
|
||||
* @return The length in UTF-16 code units of the substitution string.
|
||||
*/
|
||||
int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const;
|
||||
int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &destBuf) const;
|
||||
|
||||
/** Set and Get AllowedLocales, implementations of the corresponding API */
|
||||
void setAllowedLocales(const char *localesList, UErrorCode &status);
|
||||
|
@ -83,23 +84,18 @@ public:
|
|||
// Return the test bit flag to be ORed into the eventual user return value
|
||||
// if a Spoof opportunity is detected.
|
||||
void wholeScriptCheck(
|
||||
const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const;
|
||||
const UnicodeString &text, ScriptSet *result, UErrorCode &status) const;
|
||||
|
||||
/** Scan a string to determine how many scripts it includes.
|
||||
* Ignore characters with script=Common and script=Inherited.
|
||||
* @param text The UChar text to be scanned
|
||||
* @param length The length of the input text, -1 for nul terminated.
|
||||
* @param pos An out parameter, set to the first input position at which
|
||||
* a second script was encountered, ignoring Common and Inherited.
|
||||
* @param status For errors.
|
||||
* @return the number of (non-common,inherited) scripts encountered,
|
||||
* clipped to a max of two.
|
||||
*/
|
||||
int32_t scriptScan(const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const;
|
||||
|
||||
static UClassID U_EXPORT2 getStaticClassID(void);
|
||||
virtual UClassID getDynamicClassID(void) const;
|
||||
|
||||
// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
|
||||
// Maintain a one-element cache, which is sufficient to avoid repeatedly
|
||||
// creating new ones unless we get multi-thread concurrency in spoof
|
||||
// check operations, which should be statistically uncommon.
|
||||
IdentifierInfo *getIdentifierInfo(UErrorCode &status) const;
|
||||
void releaseIdentifierInfo(IdentifierInfo *idInfo) const;
|
||||
|
||||
//
|
||||
// Data Members
|
||||
//
|
||||
|
@ -113,6 +109,9 @@ public:
|
|||
// for this Spoof Checker. Defaults to all chars.
|
||||
|
||||
const char *fAllowedLocales; // The list of allowed locales.
|
||||
URestrictionLevel fRestrictionLevel; // The maximum restriction level for an acceptable identifier.
|
||||
|
||||
IdentifierInfo *fCachedIdentifierInfo; // Do not use directly. See getIdentifierInfo().
|
||||
};
|
||||
|
||||
|
||||
|
@ -179,67 +178,6 @@ struct SpoofStringLengthsElement {
|
|||
};
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// ScriptSet - Wrapper class for the Script code bit sets that are part of the
|
||||
// whole script confusable data.
|
||||
//
|
||||
// This class is used both at data build and at run time.
|
||||
// The constructor is only used at build time.
|
||||
// At run time, just point at the prebuilt data and go.
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
class ScriptSet: public UMemory {
|
||||
public:
|
||||
ScriptSet();
|
||||
~ScriptSet();
|
||||
|
||||
UBool operator == (const ScriptSet &other);
|
||||
ScriptSet & operator = (const ScriptSet &other);
|
||||
|
||||
void Union(const ScriptSet &other);
|
||||
void Union(UScriptCode script);
|
||||
void intersect(const ScriptSet &other);
|
||||
void intersect(UScriptCode script);
|
||||
void setAll();
|
||||
void resetAll();
|
||||
int32_t countMembers();
|
||||
|
||||
private:
|
||||
uint32_t bits[6];
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// NFDBuffer A little class to handle the NFD normalization that is
|
||||
// needed on incoming identifiers to be checked.
|
||||
// Takes care of buffer handling and normalization
|
||||
//
|
||||
// Instances of this class are intended to be stack-allocated.
|
||||
//
|
||||
// TODO: how to map position offsets back to user values?
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
class NFDBuffer: public UMemory {
|
||||
public:
|
||||
NFDBuffer(const UChar *text, int32_t length, UErrorCode &status);
|
||||
~NFDBuffer();
|
||||
const UChar *getBuffer();
|
||||
int32_t getLength();
|
||||
|
||||
private:
|
||||
const UChar *fOriginalText;
|
||||
UChar *fNormalizedText;
|
||||
int32_t fNormalizedTextLength;
|
||||
UChar fSmallBuf[USPOOF_STACK_BUFFER_SIZE];
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2008-2012, International Business Machines
|
||||
* Copyright (C) 2008-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
|
@ -29,6 +29,7 @@
|
|||
#include "unicode/uregex.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cmemory.h"
|
||||
#include "scriptset.h"
|
||||
#include "uspoof_impl.h"
|
||||
#include "uhash.h"
|
||||
#include "uvector.h"
|
||||
|
@ -244,8 +245,8 @@ void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
|
|||
scriptSets->addElement(bsset, status);
|
||||
utrie2_set32(table, cp, setIndex, &status);
|
||||
}
|
||||
bsset->sset->Union(targScript);
|
||||
bsset->sset->Union(srcScript);
|
||||
bsset->sset->set(targScript, status);
|
||||
bsset->sset->set(srcScript, status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
goto cleanup;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2009-2012, International Business Machines Corporation and
|
||||
* Copyright (c) 2009-2013, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/********************************************************************************
|
||||
|
@ -408,10 +408,13 @@ static void TestUSpoofCAPI(void) {
|
|||
TEST_ASSERT_SUCCESS(status);
|
||||
uset_close(tmpSet);
|
||||
|
||||
/* Latin Identifier should now fail; other non-latin test cases should still be OK */
|
||||
/* Latin Identifier should now fail; other non-latin test cases should still be OK
|
||||
* Note: fail of CHAR_LIMIT also causes the restriction level to be USPOOF_UNRESTRICTIVE
|
||||
* which will give us a USPOOF_RESTRICTION_LEVEL failure.
|
||||
*/
|
||||
checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults);
|
||||
TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT | USPOOF_RESTRICTION_LEVEL, checkResults);
|
||||
|
||||
checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
@ -432,7 +435,7 @@ static void TestUSpoofCAPI(void) {
|
|||
checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(0, checkResults);
|
||||
TEST_ASSERT_EQ(666, position);
|
||||
TEST_ASSERT_EQ(0, position);
|
||||
|
||||
u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodCyrl, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
@ -446,7 +449,7 @@ static void TestUSpoofCAPI(void) {
|
|||
checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults);
|
||||
TEST_ASSERT_EQ(2, position);
|
||||
TEST_ASSERT_EQ(0, position);
|
||||
|
||||
TEST_TEARDOWN;
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2011, International Business Machines Corporation
|
||||
* Copyright (C) 2011-2013, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -13,11 +13,18 @@
|
|||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
|
||||
|
||||
#include "itspoof.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/regex.h"
|
||||
|
||||
#include "unicode/normlzr.h"
|
||||
#include "unicode/regex.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/uspoof.h"
|
||||
|
||||
#include "cstring.h"
|
||||
#include "identifier_info.h"
|
||||
#include "scriptset.h"
|
||||
#include "uhash.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
|
@ -27,6 +34,9 @@
|
|||
#define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
|
||||
errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}
|
||||
|
||||
#define TEST_ASSERT_MSG(expr, msg) {if ((expr)==FALSE) { \
|
||||
errln("Test Failure at file %s, line %d, %s: \"%s\" is false.\n", __FILE__, __LINE__, msg, #expr);};}
|
||||
|
||||
#define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
|
||||
errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
|
||||
__FILE__, __LINE__, #a, (a), #b, (b)); }}
|
||||
|
@ -35,6 +45,8 @@
|
|||
errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
|
||||
__FILE__, __LINE__, #a, (a), #b, (b)); }}
|
||||
|
||||
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
|
||||
|
||||
/*
|
||||
* TEST_SETUP and TEST_TEARDOWN
|
||||
* macros to handle the boilerplate around setting up test case.
|
||||
|
@ -67,37 +79,63 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
|
|||
testSpoofAPI();
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
case 1:
|
||||
name = "TestSkeleton";
|
||||
if (exec) {
|
||||
testSkeleton();
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
case 2:
|
||||
name = "TestAreConfusable";
|
||||
if (exec) {
|
||||
testAreConfusable();
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
case 3:
|
||||
name = "TestInvisible";
|
||||
if (exec) {
|
||||
testInvisible();
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
case 4:
|
||||
name = "testConfData";
|
||||
if (exec) {
|
||||
testConfData();
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
case 5:
|
||||
name = "testBug8654";
|
||||
if (exec) {
|
||||
testBug8654();
|
||||
}
|
||||
break;
|
||||
default: name=""; break;
|
||||
case 6:
|
||||
name = "testIdentifierInfo";
|
||||
if (exec) {
|
||||
testIdentifierInfo();
|
||||
}
|
||||
break;
|
||||
case 7:
|
||||
name = "testScriptSet";
|
||||
if (exec) {
|
||||
testScriptSet();
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
name = "testRestrictionLevel";
|
||||
if (exec) {
|
||||
testRestrictionLevel();
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
name = "testMixedNumbers";
|
||||
if (exec) {
|
||||
testMixedNumbers();
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
default: name=""; break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -110,7 +148,7 @@ void IntlTestSpoof::testSpoofAPI() {
|
|||
int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(0, checkResults);
|
||||
TEST_ASSERT_EQ(666, position);
|
||||
TEST_ASSERT_EQ(0, position);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
TEST_SETUP
|
||||
|
@ -250,12 +288,12 @@ void IntlTestSpoof::testInvisible() {
|
|||
int32_t position = -42;
|
||||
TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(position == -42);
|
||||
TEST_ASSERT(0 == position);
|
||||
|
||||
UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
|
||||
TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status));
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(7, position);
|
||||
TEST_ASSERT_EQ(0, position);
|
||||
|
||||
// Two acute accents, one from the composed a with acute accent, \u00e1,
|
||||
// and one separate.
|
||||
|
@ -263,7 +301,7 @@ void IntlTestSpoof::testInvisible() {
|
|||
UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
|
||||
TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status));
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(7, position);
|
||||
TEST_ASSERT_EQ(0, position);
|
||||
TEST_TEARDOWN;
|
||||
}
|
||||
|
||||
|
@ -273,7 +311,7 @@ void IntlTestSpoof::testBug8654() {
|
|||
int32_t position = -42;
|
||||
TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s, &position, &status) & USPOOF_INVISIBLE );
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(3, position);
|
||||
TEST_ASSERT_EQ(0, position);
|
||||
TEST_TEARDOWN;
|
||||
}
|
||||
|
||||
|
@ -414,3 +452,305 @@ void IntlTestSpoof::testConfData() {
|
|||
}
|
||||
#endif // UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
// testIdentifierInfo. Note that IdentifierInfo is not public ICU API at this time
|
||||
void IntlTestSpoof::testIdentifierInfo() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status);
|
||||
ScriptSet bitset2; bitset2.set(USCRIPT_HANGUL, status);
|
||||
TEST_ASSERT(bitset12.contains(bitset2));
|
||||
TEST_ASSERT(bitset12.contains(bitset12));
|
||||
TEST_ASSERT(!bitset2.contains(bitset12));
|
||||
|
||||
ScriptSet arabSet; arabSet.set(USCRIPT_ARABIC, status);
|
||||
ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status);
|
||||
UElement arabEl; arabEl.pointer = &arabSet;
|
||||
UElement latinEl; latinEl.pointer = &latinSet;
|
||||
TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0);
|
||||
TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0);
|
||||
|
||||
UnicodeString scriptString;
|
||||
bitset12.displayScripts(scriptString);
|
||||
TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
UHashtable *alternates = uhash_open(uhash_hashScriptSet ,uhash_compareScriptSet, NULL, &status);
|
||||
uhash_puti(alternates, &bitset12, 1, &status);
|
||||
uhash_puti(alternates, &bitset2, 1, &status);
|
||||
UnicodeString alternatesString;
|
||||
IdentifierInfo::displayAlternates(alternatesString, alternates, status);
|
||||
TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang; Hang Latn") == alternatesString);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
ScriptSet tScriptSet;
|
||||
tScriptSet.parseScripts(scriptString, status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(bitset12 == tScriptSet);
|
||||
UnicodeString ss;
|
||||
ss.remove();
|
||||
uhash_close(alternates);
|
||||
|
||||
struct Test {
|
||||
const char *fTestString;
|
||||
URestrictionLevel fRestrictionLevel;
|
||||
const char *fNumerics;
|
||||
const char *fScripts;
|
||||
const char *fAlternates;
|
||||
const char *fCommonAlternates;
|
||||
} tests[] = {
|
||||
{"\\u0061\\u2665", USPOOF_UNRESTRICTIVE, "[]", "Latn", "", ""},
|
||||
{"\\u0061\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"},
|
||||
{"\\u0061\\u30FC\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hira Kana", "Hira Kana"},
|
||||
{"\\u0061\\u30FC\\u3006\\u30A2", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
|
||||
{"\\u30A2\\u0061\\u30FC\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
|
||||
{"\\u0061\\u0031\\u0661", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660]", "Latn", "Arab Thaa", "Arab Thaa"},
|
||||
{"\\u0061\\u0031\\u0661\\u06F1", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660\\u06F0]", "Latn Arab", "", ""},
|
||||
{"\\u0661\\u30FC\\u3006\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_UNRESTRICTIVE,
|
||||
"[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab Deva", "", ""},
|
||||
{"\\u0061\\u30A2\\u30FC\\u3006\\u0031\\u0967\\u0661\\u06F1", USPOOF_UNRESTRICTIVE,
|
||||
"[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab Deva", "", ""}
|
||||
};
|
||||
|
||||
int testNum;
|
||||
for (testNum = 0; testNum < LENGTHOF(tests); testNum++) {
|
||||
char testNumStr[40];
|
||||
sprintf(testNumStr, "testNum = %d", testNum);
|
||||
Test &test = tests[testNum];
|
||||
status = U_ZERO_ERROR;
|
||||
UnicodeString testString(test.fTestString); // Note: may do charset conversion.
|
||||
testString = testString.unescape();
|
||||
IdentifierInfo idInfo(status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status));
|
||||
idInfo.setIdentifier(testString, status);
|
||||
TEST_ASSERT_MSG(*idInfo.getIdentifier() == testString, testNumStr);
|
||||
|
||||
URestrictionLevel restrictionLevel = test.fRestrictionLevel;
|
||||
TEST_ASSERT_MSG(restrictionLevel == idInfo.getRestrictionLevel(status), testNumStr);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
UnicodeSet numerics(UnicodeString(test.fNumerics).unescape(), status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_MSG(numerics == *idInfo.getNumerics(), testNumStr);
|
||||
|
||||
ScriptSet scripts;
|
||||
scripts.parseScripts(UnicodeString(test.fScripts), status);
|
||||
TEST_ASSERT_MSG(scripts == *idInfo.getScripts(), testNumStr);
|
||||
|
||||
UnicodeString alternatesStr;
|
||||
IdentifierInfo::displayAlternates(alternatesStr, idInfo.getAlternates(), status);
|
||||
TEST_ASSERT_MSG(UnicodeString(test.fAlternates) == alternatesStr, testNumStr);
|
||||
|
||||
ScriptSet commonAlternates;
|
||||
commonAlternates.parseScripts(UnicodeString(test.fCommonAlternates), status);
|
||||
TEST_ASSERT_MSG(commonAlternates == *idInfo.getCommonAmongAlternates(), testNumStr);
|
||||
}
|
||||
|
||||
// Test of getScriptCount()
|
||||
// Script and or Script Extension for chars used in the tests
|
||||
// \\u3013 ; Bopo Hang Hani Hira Kana # So GETA MARK
|
||||
// \\uA838 ; Deva Gujr Guru Kthi Takr # Sc NORTH INDIC RUPEE MARK
|
||||
// \\u0951 ; Deva Latn # Mn DEVANAGARI STRESS SIGN UDATTA
|
||||
//
|
||||
// \\u0370 ; Greek # L GREEK CAPITAL LETTER HETA
|
||||
// \\u0481 ; Cyrillic # L& CYRILLIC SMALL LETTER KOPPA
|
||||
// \\u0904 ; Devanagari # Lo DEVANAGARI LETTER SHORT A
|
||||
// \\u3041 ; Hiragana # Lo HIRAGANA LETTER SMALL A
|
||||
// 1234 ; Common # ascii digits
|
||||
// \\u0300 ; Inherited # Mn COMBINING GRAVE ACCENT
|
||||
|
||||
struct ScriptTest {
|
||||
const char *fTestString;
|
||||
int32_t fScriptCount;
|
||||
} scriptTests[] = {
|
||||
{"Hello", 1},
|
||||
{"Hello\\u0370", 2},
|
||||
{"1234", 0},
|
||||
{"Hello1234\\u0300", 1}, // Common and Inherited are ignored.
|
||||
{"\\u0030", 0},
|
||||
{"abc\\u0951", 1},
|
||||
{"abc\\u3013", 2},
|
||||
{"\\uA838\\u0951", 1}, // Triggers commonAmongAlternates path.
|
||||
{"\\u3013\\uA838", 2}
|
||||
};
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
IdentifierInfo identifierInfo(status);
|
||||
for (testNum=0; testNum<LENGTHOF(scriptTests); testNum++) {
|
||||
ScriptTest &test = scriptTests[testNum];
|
||||
char msgBuf[100];
|
||||
sprintf(msgBuf, "testNum = %d ", testNum);
|
||||
UnicodeString testString = UnicodeString(test.fTestString).unescape();
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
identifierInfo.setIdentifier(testString, status);
|
||||
int32_t scriptCount = identifierInfo.getScriptCount();
|
||||
TEST_ASSERT_MSG(test.fScriptCount == scriptCount, msgBuf);
|
||||
}
|
||||
}
|
||||
|
||||
void IntlTestSpoof::testScriptSet() {
|
||||
ScriptSet s1;
|
||||
ScriptSet s2;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
TEST_ASSERT(s1 == s2);
|
||||
s1.set(USCRIPT_ARABIC,status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(!(s1 == s2));
|
||||
TEST_ASSERT(s1.test(USCRIPT_ARABIC, status));
|
||||
TEST_ASSERT(s1.test(USCRIPT_GREEK, status) == FALSE);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
s1.reset(USCRIPT_ARABIC, status);
|
||||
TEST_ASSERT(s1 == s2);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
s1.setAll();
|
||||
TEST_ASSERT(s1.test(USCRIPT_COMMON, status));
|
||||
TEST_ASSERT(s1.test(USCRIPT_ETHIOPIC, status));
|
||||
TEST_ASSERT(s1.test(USCRIPT_CODE_LIMIT, status));
|
||||
s1.resetAll();
|
||||
TEST_ASSERT(!s1.test(USCRIPT_COMMON, status));
|
||||
TEST_ASSERT(!s1.test(USCRIPT_ETHIOPIC, status));
|
||||
TEST_ASSERT(!s1.test(USCRIPT_CODE_LIMIT, status));
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
s1.set(USCRIPT_TAKRI, status);
|
||||
s1.set(USCRIPT_BLISSYMBOLS, status);
|
||||
s2.setAll();
|
||||
TEST_ASSERT(s2.contains(s1));
|
||||
TEST_ASSERT(!s1.contains(s2));
|
||||
TEST_ASSERT(s2.intersects(s1));
|
||||
TEST_ASSERT(s1.intersects(s2));
|
||||
s2.reset(USCRIPT_TAKRI, status);
|
||||
TEST_ASSERT(!s2.contains(s1));
|
||||
TEST_ASSERT(!s1.contains(s2));
|
||||
TEST_ASSERT(s1.intersects(s2));
|
||||
TEST_ASSERT(s2.intersects(s1));
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
s1.resetAll();
|
||||
s1.set(USCRIPT_NKO, status);
|
||||
s1.set(USCRIPT_COMMON, status);
|
||||
s2 = s1;
|
||||
TEST_ASSERT(s2 == s1);
|
||||
TEST_ASSERT_EQ(2, s2.countMembers());
|
||||
s2.intersect(s1);
|
||||
TEST_ASSERT(s2 == s1);
|
||||
s2.setAll();
|
||||
TEST_ASSERT(!(s2 == s1));
|
||||
TEST_ASSERT(s2.countMembers() >= USCRIPT_CODE_LIMIT);
|
||||
s2.intersect(s1);
|
||||
TEST_ASSERT(s2 == s1);
|
||||
|
||||
s2.setAll();
|
||||
s2.reset(USCRIPT_COMMON, status);
|
||||
s2.intersect(s1);
|
||||
TEST_ASSERT(s2.countMembers() == 1);
|
||||
|
||||
s1.resetAll();
|
||||
s1.set(USCRIPT_AFAKA, status);
|
||||
s1.set(USCRIPT_VAI, status);
|
||||
s1.set(USCRIPT_INHERITED, status);
|
||||
int32_t n = -1;
|
||||
for (int32_t i=0; i<4; i++) {
|
||||
n = s1.nextSetBit(n+1);
|
||||
switch (i) {
|
||||
case 0: TEST_ASSERT_EQ(USCRIPT_INHERITED, n); break;
|
||||
case 1: TEST_ASSERT_EQ(USCRIPT_VAI, n); break;
|
||||
case 2: TEST_ASSERT_EQ(USCRIPT_AFAKA, n); break;
|
||||
case 3: TEST_ASSERT_EQ(-1, (int32_t)n); break;
|
||||
default: TEST_ASSERT(FALSE);
|
||||
}
|
||||
}
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
}
|
||||
|
||||
|
||||
void IntlTestSpoof::testRestrictionLevel() {
|
||||
struct Test {
|
||||
const char *fId;
|
||||
URestrictionLevel fExpectedRestrictionLevel;
|
||||
} tests[] = {
|
||||
{"\\u0061\\u03B3\\u2665", USPOOF_UNRESTRICTIVE},
|
||||
{"a", USPOOF_ASCII},
|
||||
{"\\u03B3", USPOOF_HIGHLY_RESTRICTIVE},
|
||||
{"\\u0061\\u30A2\\u30FC", USPOOF_HIGHLY_RESTRICTIVE},
|
||||
{"\\u0061\\u0904", USPOOF_MODERATELY_RESTRICTIVE},
|
||||
{"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE}
|
||||
};
|
||||
char msgBuffer[100];
|
||||
|
||||
URestrictionLevel restrictionLevels[] = { USPOOF_ASCII, USPOOF_HIGHLY_RESTRICTIVE,
|
||||
USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE, USPOOF_UNRESTRICTIVE};
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
IdentifierInfo idInfo(status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status));
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
for (int32_t testNum=0; testNum < LENGTHOF(tests); testNum++) {
|
||||
status = U_ZERO_ERROR;
|
||||
const Test &test = tests[testNum];
|
||||
UnicodeString testString = UnicodeString(test.fId).unescape();
|
||||
URestrictionLevel expectedLevel = test.fExpectedRestrictionLevel;
|
||||
idInfo.setIdentifier(testString, status);
|
||||
sprintf(msgBuffer, "testNum = %d ", testNum);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_MSG(expectedLevel == idInfo.getRestrictionLevel(status), msgBuffer);
|
||||
for (int levelIndex=0; levelIndex<LENGTHOF(restrictionLevels); levelIndex++) {
|
||||
status = U_ZERO_ERROR;
|
||||
URestrictionLevel levelSetInSpoofChecker = restrictionLevels[levelIndex];
|
||||
USpoofChecker *sc = uspoof_open(&status);
|
||||
uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status);
|
||||
uspoof_setAllowedChars(sc, uspoof_getRecommendedSet(&status), &status);
|
||||
uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker);
|
||||
UBool actualValue = uspoof_checkUnicodeString(sc, testString, NULL, &status) != 0;
|
||||
|
||||
// we want to fail if the text is (say) MODERATE and the testLevel is ASCII
|
||||
UBool expectedFailure = expectedLevel > levelSetInSpoofChecker ||
|
||||
!uspoof_getRecommendedUnicodeSet(&status)->containsAll(testString);
|
||||
sprintf(msgBuffer, "testNum = %d, levelIndex = %d", testNum, levelIndex);
|
||||
TEST_ASSERT_MSG(expectedFailure == actualValue, msgBuffer);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
uspoof_close(sc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void IntlTestSpoof::testMixedNumbers() {
|
||||
struct Test {
|
||||
const char *fTestString;
|
||||
const char *fExpectedSet;
|
||||
} tests[] = {
|
||||
{"1", "[0]"},
|
||||
{"\\u0967", "[\\u0966]"},
|
||||
{"1\\u0967", "[0\\u0966]"},
|
||||
{"\\u0661\\u06F1", "[\\u0660\\u06F0]"}
|
||||
};
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
IdentifierInfo idInfo(status);
|
||||
for (int32_t testNum=0; testNum < LENGTHOF(tests); testNum++) {
|
||||
char msgBuf[100];
|
||||
sprintf(msgBuf, "testNum = %d ", testNum);
|
||||
Test &test = tests[testNum];
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
UnicodeString testString = UnicodeString(test.fTestString).unescape();
|
||||
UnicodeSet expectedSet(UnicodeString(test.fExpectedSet).unescape(), status);
|
||||
idInfo.setIdentifier(testString, status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_MSG(expectedSet == *idInfo.getNumerics(), msgBuf);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
USpoofChecker *sc = uspoof_open(&status);
|
||||
uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this
|
||||
int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status);
|
||||
UBool mixedNumberFailure = ((result & USPOOF_MIXED_NUMBERS) != 0);
|
||||
TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf);
|
||||
uspoof_close(sc);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2011, International Business Machines Corporation
|
||||
* Copyright (C) 2011-2013, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -36,6 +36,14 @@ public:
|
|||
|
||||
void testBug8654();
|
||||
|
||||
void testIdentifierInfo();
|
||||
|
||||
void testScriptSet();
|
||||
|
||||
void testRestrictionLevel();
|
||||
|
||||
void testMixedNumbers();
|
||||
|
||||
// Internal function to run a single skeleton test case.
|
||||
void checkSkeleton(const USpoofChecker *sc, uint32_t flags,
|
||||
const char *input, const char *expected, int32_t lineNum);
|
||||
|
|
Loading…
Add table
Reference in a new issue