ICU-12549 Updating SpoofChecker to latest Unicode specification.

X-SVN-Rev: 39218
This commit is contained in:
Shane Carr 2016-09-13 22:15:13 +00:00
parent 85f8d034a7
commit 2ceb565df3
19 changed files with 1721 additions and 1404 deletions

View file

@ -362,7 +362,7 @@ public:
UnicodeSet();
/**
* Constructs a set containing the given range. If <code>end >
* Constructs a set containing the given range. If <code>end <
* start</code> then an empty set is created.
*
* @param start first character, inclusive, of range

View file

@ -92,10 +92,10 @@ csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.
wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o zonemeta.o \
standardplural.o upluralrules.o plurrule.o plurfmt.o selfmt.o dtitvfmt.o dtitvinf.o udateintervalformat.o \
tmunit.o tmutamt.o tmutfmt.o currpinf.o \
uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o uspoof_wsconf.o decfmtst.o smpdtfst.o \
uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o decfmtst.o smpdtfst.o \
ztrans.o zrule.o vzone.o fphdlimp.o fpositer.o ufieldpositer.o \
decNumber.o decContext.o alphaindex.o tznames.o tznames_impl.o tzgnames.o \
tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o identifier_info.o \
tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o \
uregion.o reldatefmt.o quantityformatter.o measunit.o \
sharedbreakiterator.o scientificnumberformatter.o digitgrouping.o \
digitinterval.o digitformatter.o digitaffix.o valueformatter.o \

View file

@ -337,7 +337,6 @@
<ClCompile Include="gregocal.cpp" />
<ClCompile Include="gregoimp.cpp" />
<ClCompile Include="hebrwcal.cpp" />
<ClCompile Include="identifier_info.cpp" />
<ClCompile Include="indiancal.cpp" />
<ClCompile Include="islamcal.cpp" />
<ClCompile Include="japancal.cpp" />
@ -464,7 +463,6 @@
<ClCompile Include="uspoof_build.cpp" />
<ClCompile Include="uspoof_conf.cpp" />
<ClCompile Include="uspoof_impl.cpp" />
<ClCompile Include="uspoof_wsconf.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="unicode\alphaindex.h">
@ -1686,11 +1684,9 @@
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
<ClInclude Include="identifier_info.h" />
<ClInclude Include="scriptset.h" />
<ClInclude Include="uspoof_conf.h" />
<ClInclude Include="uspoof_impl.h" />
<ClInclude Include="uspoof_wsconf.h" />
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="i18n.rc" />

View file

@ -501,9 +501,6 @@
<ClCompile Include="ucsdet.cpp">
<Filter>charset detect</Filter>
</ClCompile>
<ClCompile Include="identifier_info.cpp">
<Filter>spoof</Filter>
</ClCompile>
<ClCompile Include="scriptset.cpp">
<Filter>spoof</Filter>
</ClCompile>
@ -519,9 +516,6 @@
<ClCompile Include="uspoof_impl.cpp">
<Filter>spoof</Filter>
</ClCompile>
<ClCompile Include="uspoof_wsconf.cpp">
<Filter>spoof</Filter>
</ClCompile>
<ClCompile Include="alphaindex.cpp">
<Filter>collation</Filter>
</ClCompile>
@ -943,9 +937,6 @@
<ClInclude Include="inputext.h">
<Filter>charset detect</Filter>
</ClInclude>
<ClInclude Include="identifier_info.h">
<Filter>spoof</Filter>
</ClInclude>
<ClInclude Include="scriptset.h">
<Filter>spoof</Filter>
</ClInclude>
@ -955,9 +946,6 @@
<ClInclude Include="uspoof_impl.h">
<Filter>spoof</Filter>
</ClInclude>
<ClInclude Include="uspoof_wsconf.h">
<Filter>spoof</Filter>
</ClInclude>
<ClInclude Include="tzgnames.h">
<Filter>formatting</Filter>
</ClInclude>

View file

@ -193,6 +193,15 @@ int32_t ScriptSet::nextSetBit(int32_t fromIndex) const {
return -1;
}
// Returns TRUE when every word of the bit storage is zero, i.e. no script
// is a member of this set; returns FALSE as soon as a non-zero word is seen.
UBool ScriptSet::isEmpty() const {
    const uint32_t *word = bits;
    const uint32_t *const pastLast = bits + UPRV_LENGTHOF(bits);
    while (word != pastLast) {
        if (*word != 0) {
            return FALSE;
        }
        ++word;
    }
    return TRUE;
}
UnicodeString &ScriptSet::displayScripts(UnicodeString &dest) const {
UBool firstTime = TRUE;
for (int32_t i = nextSetBit(0); i >= 0; i = nextSetBit(i + 1)) {
@ -240,6 +249,41 @@ ScriptSet &ScriptSet::parseScripts(const UnicodeString &scriptString, UErrorCode
return *this;
}
/**
 * Looks up the script extensions of a code point with
 * uscript_getScriptExtensions() and sets each returned script in this set.
 * Scripts already present in the set are left untouched.
 *
 * @param codePoint the code point whose script extensions are added.
 * @param status    in/out error code. Nothing is done if it already indicates
 *                  a failure on entry. Set to U_MEMORY_ALLOCATION_ERROR if the
 *                  scratch buffer cannot be grown, or to whatever failure the
 *                  lookup or set() reports.
 */
void ScriptSet::setScriptExtensions(UChar32 codePoint, UErrorCode& status) {
    if (U_FAILURE(status)) { return; }
    static const int32_t FIRST_GUESS_SCRIPT_CAPACITY = 5;
    MaybeStackArray<UScriptCode,FIRST_GUESS_SCRIPT_CAPACITY> scripts;
    // A private status is used so that a recoverable buffer overflow never
    // leaks into the caller's error code.
    UErrorCode internalStatus = U_ZERO_ERROR;
    int32_t script_count = -1;

    while (TRUE) {
        // Pass the buffer's current capacity rather than the initial guess.
        // After a resize the buffer is larger; reporting the old capacity
        // would overflow again on every pass and the loop would never end.
        script_count = uscript_getScriptExtensions(
            codePoint, scripts.getAlias(), scripts.getCapacity(), &internalStatus);
        if (internalStatus == U_BUFFER_OVERFLOW_ERROR) {
            // Need to allocate more space: grow to the count the call reported.
            if (scripts.resize(script_count) == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            internalStatus = U_ZERO_ERROR;
        } else {
            break;
        }
    }

    // Check if we failed for some reason other than buffer overflow
    if (U_FAILURE(internalStatus)) {
        status = internalStatus;
        return;
    }

    // Load the scripts into the ScriptSet and return
    for (int32_t i = 0; i < script_count; i++) {
        this->set(scripts[i], status);
        if (U_FAILURE(status)) { return; }
    }
}
U_NAMESPACE_END
U_CAPI UBool U_EXPORT2

View file

@ -58,9 +58,14 @@ class U_I18N_API ScriptSet: public UMemory {
int32_t hashCode() const;
int32_t nextSetBit(int32_t script) const;
UBool isEmpty() const;
UnicodeString &displayScripts(UnicodeString &dest) const; // append script names to dest string.
ScriptSet & parseScripts(const UnicodeString &scriptsString, UErrorCode &status); // Replaces ScriptSet contents.
// Wraps around UScript::getScriptExtensions() and adds the corresponding scripts to this instance.
void setScriptExtensions(UChar32 codePoint, UErrorCode& status);
private:
uint32_t bits[6];
};

View file

@ -26,8 +26,8 @@ as the functions are suppose to be called.
It's usually best to have child dependencies called first. */
typedef enum ECleanupI18NType {
UCLN_I18N_START = -1,
UCLN_I18N_IDENTIFIER_INFO,
UCLN_I18N_SPOOF,
UCLN_I18N_SPOOFDATA,
UCLN_I18N_TRANSLITERATOR,
UCLN_I18N_REGEX,
UCLN_I18N_ISLAMIC_CALENDAR,

File diff suppressed because it is too large Load diff

View file

@ -22,7 +22,6 @@
#include "unicode/utf16.h"
#include "cmemory.h"
#include "cstring.h"
#include "identifier_info.h"
#include "mutex.h"
#include "scriptset.h"
#include "uassert.h"
@ -42,9 +41,7 @@ U_NAMESPACE_USE
static UnicodeSet *gInclusionSet = NULL;
static UnicodeSet *gRecommendedSet = NULL;
static const Normalizer2 *gNfdNormalizer = NULL;
static SpoofData *gDefaultSpoofData = NULL;
static UInitOnce gSpoofInitStaticsOnce = U_INITONCE_INITIALIZER;
static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
static UBool U_CALLCONV
uspoof_cleanup(void) {
@ -53,83 +50,78 @@ uspoof_cleanup(void) {
delete gRecommendedSet;
gRecommendedSet = NULL;
gNfdNormalizer = NULL;
if (gDefaultSpoofData) {
gDefaultSpoofData->removeReference(); // Will delete, assuming all user-level spoof checkers were closed.
}
gDefaultSpoofData = NULL;
gSpoofInitStaticsOnce.reset();
gSpoofInitDefaultOnce.reset();
return TRUE;
}
static void U_CALLCONV initializeStatics(UErrorCode &status) {
static const char *inclusionPat =
"[\\u0027\\u002D-\\u002E\\u003A\\u00B7\\u0375\\u058A\\u05F3-\\u05F4"
"\\u06FD-\\u06FE\\u0F0B\\u200C-\\u200D\\u2010\\u2019\\u2027\\u30A0\\u30FB]";
"['\\-.\\:\\u00B7\\u0375\\u058A\\u05F3\\u05F4\\u06FD\\u06FE\\u0F0B\\u200C\\u200D\\u2010\\u"
"2019\\u2027\\u30A0\\u30FB]";
gInclusionSet = new UnicodeSet(UnicodeString(inclusionPat, -1, US_INV), status);
gInclusionSet->freeze();
// Note: data from http://unicode.org/Public/security/latest/xidmodifications.txt version 8.0.0
// There is no tooling to generate this from the .txt file, hand extracted with editor macros.
// Ultimately, data will be available as character properties, eliminating this.
// Note: data from http://unicode.org/Public/security/9.0.0/IdentifierStatus.txt
// There is tooling to generate this constant in the unicodetools project:
// org.unicode.text.tools.RecommendedSetGenerator
// It will print the Java and C++ code to the console for easy copy-paste into this file.
// Note: concatenated string constants do not work with UNICODE_STRING_SIMPLE on all platforms.
static const char *recommendedPat =
"[\\u0030-\\u0039\\u0041-\\u005A\\u005F\\u0061-\\u007A\\u00C0-\\u00D6\\u00D8-\\u00F6"
"\\u00F8-\\u0131\\u0134-\\u013E\\u0141-\\u0148\\u014A-\\u017E\\u018F\\u01A0-\\u01A1"
"\\u01AF-\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B"
"\\u021E-\\u021F\\u0226-\\u0233\\u0259\\u02BB-\\u02BC\\u02EC\\u0300-\\u0304\\u0306-\\u030C"
"\\u030F-\\u0311\\u0313-\\u0314\\u031B\\u0323-\\u0328\\u032D-\\u032E\\u0330-\\u0331"
"\\u0335\\u0338-\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386\\u0388-\\u038A\\u038C"
"\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-\\u0529\\u052E-\\u052F\\u0531-\\u0556"
"\\u0559\\u0561-\\u0586\\u05B4\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0620-\\u063F\\u0641-\\u0655"
"\\u0660-\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-\\u06D3\\u06D5\\u06E5-\\u06E6"
"\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC\\u08B2\\u0901-\\u094D\\u094F-\\u0950"
"\\u0956-\\u0957\\u0960-\\u0963\\u0966-\\u096F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983"
"\\u0985-\\u098C\\u098F-\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9"
"\\u09BC-\\u09C4\\u09C7-\\u09C8\\u09CB-\\u09CE\\u09D7\\u09E0-\\u09E3\\u09E6-\\u09F1"
"\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F-\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32"
"\\u0A35\\u0A38-\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47-\\u0A48\\u0A4B-\\u0A4D\\u0A5C"
"\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0"
"\\u0AB2-\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD\\u0AD0"
"\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F-\\u0B10\\u0B13-\\u0B28"
"\\u0B2A-\\u0B30\\u0B32-\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47-\\u0B48\\u0B4B-\\u0B4D"
"\\u0B56-\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82-\\u0B83\\u0B85-\\u0B8A"
"\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99-\\u0B9A\\u0B9C\\u0B9E-\\u0B9F\\u0BA3-\\u0BA4"
"\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD0"
"\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C03\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28"
"\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55-\\u0C56"
"\\u0C60-\\u0C61\\u0C66-\\u0C6F\\u0C82-\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8"
"\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0CD5-\\u0CD6"
"\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1-\\u0CF2\\u0D02-\\u0D03\\u0D05-\\u0D0C\\u0D0E-\\u0D10"
"\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D57\\u0D60-\\u0D61"
"\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82-\\u0D83\\u0D85-\\u0D8E\\u0D91-\\u0D96\\u0D9A-\\u0DA5"
"\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0DD4\\u0DD6"
"\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\u0E59"
"\\u0E81-\\u0E82\\u0E84\\u0E87-\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F"
"\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA-\\u0EAB\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD"
"\\u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9\\u0EDE-\\u0EDF\\u0F00\\u0F20-\\u0F29"
"\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F56"
"\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C\\u0F71-\\u0F72\\u0F74\\u0F7A-\\u0F80"
"\\u0F82-\\u0F84\\u0F86-\\u0F92\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6"
"\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D"
"\\u10C7\\u10CD\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D"
"\\u1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0"
"\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310"
"\\u1312-\\u1315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7"
"\\u17A9-\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1E00-\\u1E99"
"\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D"
"\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72\\u1F74\\u1F76\\u1F78"
"\\u1F7A\\u1F7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8"
"\\u1FCA\\u1FCC\\u1FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC"
"\\u1FF2-\\u1FF4\\u1FF6-\\u1FF8\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6"
"\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6"
"\\u2DD8-\\u2DDE\\u3005-\\u3007\\u3041-\\u3096\\u3099-\\u309A\\u309D-\\u309E\\u30A1-\\u30FA"
"\\u30FC-\\u30FE\\u3105-\\u312D\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-\\u9FD5\\uA660-\\uA661"
"\\uA674-\\uA67B\\uA67F\\uA69F\\uA717-\\uA71F\\uA788\\uA78D-\\uA78E\\uA790-\\uA793"
"\\uA7A0-\\uA7AA\\uA7FA\\uA9E7-\\uA9FE\\uAA60-\\uAA76\\uAA7A-\\uAA7F\\uAB01-\\uAB06"
"\\uAB09-\\uAB0E\\uAB11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E-\\uFA0F"
"\\uFA11\\uFA13-\\uFA14\\uFA1F\\uFA21\\uFA23-\\uFA24\\uFA27-\\uFA29\\U00020000-\\U0002A6D6"
"\\U0002A700-\\U0002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1]";
"[0-9A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u0131\\u0134-\\u013E\\u0141-\\u014"
"8\\u014A-\\u017E\\u018F\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E"
"6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B\\u021E\\u021F\\u0226-\\u0233\\u0259\\u02BB\\u02B"
"C\\u02EC\\u0300-\\u0304\\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\\u03"
"28\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386"
"\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-\\u0529\\u05"
"2E\\u052F\\u0531-\\u0556\\u0559\\u0561-\\u0586\\u05B4\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0"
"620-\\u063F\\u0641-\\u0655\\u0660-\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-"
"\\u06D3\\u06D5\\u06E5\\u06E6\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC\\u08B2"
"\\u08B6-\\u08BD\\u0901-\\u094D\\u094F\\u0950\\u0956\\u0957\\u0960-\\u0963\\u0966-\\u096"
"F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u0"
"9A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BC-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CE\\u"
"09D7\\u09E0-\\u09E3\\u09E6-\\u09F1\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-"
"\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A35\\u0A38\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47\\u0A48\\"
"u0A4B-\\u0A4D\\u0A5C\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A9"
"3-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0"
"ACB-\\u0ACD\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F\\"
"u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47"
"\\u0B48\\u0B4B-\\u0B4D\\u0B56\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82\\u0B83"
"\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3"
"\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0B"
"D0\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C03\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u"
"0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56"
"\\u0C60\\u0C61\\u0C66-\\u0C6F\\u0C80\\u0C82\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92"
"-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0"
"CD5\\u0CD6\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1\\u0CF2\\u0D02\\u0D03\\u0D05-\\u0D0C\\u0"
"D0E-\\u0D10\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D54-\\u0D57"
"\\u0D60\\u0D61\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82\\u0D83\\u0D85-\\u0D8E\\u0D91-\\u0D9"
"6\\u0D9A-\\u0DA5\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0"
"DD4\\u0DD6\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\"
"u0E59\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u"
"0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD\\"
"u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9\\u0EDE\\u0EDF\\u0F00\\u0F20-\\u0F29"
"\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F"
"56\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C\\u0F71\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0"
"F82-\\u0F84\\u0F86-\\u0F92\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6"
"\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D\\u10"
"C7\\u10CD\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D\\u"
"1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2"
"-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1"
"315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7\\u17A9-"
"\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1C80-\\u1C88\\u1E00-\\u1E9"
"9\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1"
"F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72\\u1F74\\u1F76\\u1F78\\u1F7A\\u1F"
"7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1"
"FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-"
"\\u1FF8\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0"
"-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u3"
"005-\\u3007\\u3041-\\u3096\\u3099\\u309A\\u309D\\u309E\\u30A1-\\u30FA\\u30FC-\\u30FE\\u"
"3105-\\u312D\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-\\u9FD5\\uA660\\uA661\\uA674-\\uA67B"
"\\uA67F\\uA69F\\uA717-\\uA71F\\uA788\\uA78D\\uA78E\\uA790-\\uA793\\uA7A0-\\uA7AA\\uA7AE"
"\\uA7FA\\uA9E7-\\uA9FE\\uAA60-\\uAA76\\uAA7A-\\uAA7F\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB"
"11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E\\uFA0F\\uFA11\\uFA13\\uF"
"A14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\U00020000-\\U0002A6D6\\U0002A700-\\U0"
"002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1]";
gRecommendedSet = new UnicodeSet(UnicodeString(recommendedPat, -1, US_INV), status);
gRecommendedSet->freeze();
@ -137,11 +129,6 @@ static void U_CALLCONV initializeStatics(UErrorCode &status) {
ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup);
}
static void U_CALLCONV initializeDefaultData(UErrorCode &status) {
gDefaultSpoofData = SpoofData::getDefault(status);
ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup);
}
// Hands initializeStatics to umtx_initOnce under the gSpoofInitStaticsOnce
// guard, reporting any failure through *status.
U_CFUNC void uspoof_internalInitStatics(UErrorCode *status) {
    UErrorCode &errorCode = *status;
    umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, errorCode);
}
@ -149,14 +136,10 @@ U_CFUNC void uspoof_internalInitStatics(UErrorCode *status) {
U_CAPI USpoofChecker * U_EXPORT2
uspoof_open(UErrorCode *status) {
umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, *status);
umtx_initOnce(gSpoofInitDefaultOnce, &initializeDefaultData, *status);
if (U_FAILURE(*status)) {
return NULL;
}
SpoofImpl *si = new SpoofImpl(gDefaultSpoofData, *status);
if (si) {
gDefaultSpoofData->addReference();
}
SpoofImpl *si = new SpoofImpl(*status);
if (U_SUCCESS(*status) && si == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
}
@ -164,7 +147,7 @@ uspoof_open(UErrorCode *status) {
delete si;
si = NULL;
}
return reinterpret_cast<USpoofChecker *>(si);
return si->asUSpoofChecker();
}
@ -190,9 +173,9 @@ uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLeng
}
if (pActualLength != NULL) {
*pActualLength = sd->fRawData->fLength;
*pActualLength = sd->size();
}
return reinterpret_cast<USpoofChecker *>(si);
return si->asUSpoofChecker();
}
@ -207,7 +190,7 @@ uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
delete result;
result = NULL;
}
return reinterpret_cast<USpoofChecker *>(result);
return result->asUSpoofChecker();
}
@ -335,7 +318,23 @@ uspoof_check(const USpoofChecker *sc,
const UChar *id, int32_t length,
int32_t *position,
UErrorCode *status) {
// Backwards compatibility:
if (position != NULL) {
*position = 0;
}
// Delegate to uspoof_check2
return uspoof_check2(sc, id, length, NULL, status);
}
U_CAPI int32_t U_EXPORT2
uspoof_check2(const USpoofChecker *sc,
const UChar* id, int32_t length,
USpoofCheckResult* checkResult,
UErrorCode *status) {
const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
if (This == NULL) {
return 0;
@ -345,7 +344,7 @@ uspoof_check(const USpoofChecker *sc,
return 0;
}
UnicodeString idStr((length == -1), id, length); // Aliasing constructor.
int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
int32_t result = uspoof_check2UnicodeString(sc, idStr, checkResult, status);
return result;
}
@ -356,11 +355,27 @@ uspoof_checkUTF8(const USpoofChecker *sc,
int32_t *position,
UErrorCode *status) {
// Backwards compatibility:
if (position != NULL) {
*position = 0;
}
// Delegate to uspoof_check2
return uspoof_check2UTF8(sc, id, length, NULL, status);
}
U_CAPI int32_t U_EXPORT2
uspoof_check2UTF8(const USpoofChecker *sc,
const char *id, int32_t length,
USpoofCheckResult* checkResult,
UErrorCode *status) {
if (U_FAILURE(*status)) {
return 0;
}
UnicodeString idStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id)));
int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
int32_t result = uspoof_check2UnicodeString(sc, idStr, checkResult, status);
return result;
}
@ -414,7 +429,7 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
if (U_FAILURE(*status)) {
return 0;
}
//
//
// See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
// and for definitions of the types (single, whole, mixed-script) of confusables.
@ -422,125 +437,95 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
// If no tests relevant to this function have been specified, return an error.
// TODO: is this really the right thing to do? It's probably an error on the caller's part,
// but logically we would just return 0 (no error).
if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
if ((This->fChecks & USPOOF_CONFUSABLE) == 0) {
*status = U_INVALID_STATE_ERROR;
return 0;
}
int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
int32_t result = 0;
IdentifierInfo *identifierInfo = This->getIdentifierInfo(*status);
if (U_FAILURE(*status)) {
// Compute the skeletons and check for confusability.
UnicodeString id1Skeleton;
uspoof_getSkeletonUnicodeString(sc, 0 /* deprecated */, id1, id1Skeleton, status);
UnicodeString id2Skeleton;
uspoof_getSkeletonUnicodeString(sc, 0 /* deprecated */, id2, id2Skeleton, status);
if (U_FAILURE(*status)) { return 0; }
if (id1Skeleton != id2Skeleton) {
return 0;
}
identifierInfo->setIdentifier(id1, *status);
int32_t id1ScriptCount = identifierInfo->getScriptCount();
int32_t id1FirstScript = identifierInfo->getScripts()->nextSetBit(0);
identifierInfo->setIdentifier(id2, *status);
int32_t id2ScriptCount = identifierInfo->getScriptCount();
int32_t id2FirstScript = identifierInfo->getScripts()->nextSetBit(0);
This->releaseIdentifierInfo(identifierInfo);
identifierInfo = NULL;
if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
UnicodeString id1Skeleton;
UnicodeString id2Skeleton;
if (id1ScriptCount <= 1 && id2ScriptCount <= 1 && id1FirstScript == id2FirstScript) {
flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
if (id1Skeleton == id2Skeleton) {
result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
}
// If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes
// of confusables according to UTS 39 section 4.
// Start by computing the resolved script sets of id1 and id2.
ScriptSet id1RSS;
This->getResolvedScriptSet(id1, id1RSS, *status);
ScriptSet id2RSS;
This->getResolvedScriptSet(id2, id2RSS, *status);
// Turn on all applicable flags
int32_t result = 0;
if (id1RSS.intersects(id2RSS)) {
result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
} else {
result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
if (!id1RSS.isEmpty() && !id2RSS.isEmpty()) {
result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
}
}
if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
// If the two inputs are single script confusable they cannot also be
// mixed or whole script confusable, according to the UAX39 definitions.
// So we can skip those tests.
return result;
// Turn off flags that the user doesn't want
if ((This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) == 0) {
result &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
}
// Two identifiers are whole script confusable if each is of a single script
// and they are mixed script confusable.
UBool possiblyWholeScriptConfusables =
id1ScriptCount <= 1 && id2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
//
// Mixed Script Check
//
if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
// For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
// the mixed script table skeleton, which is what we want.
// The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
UnicodeString id1Skeleton;
UnicodeString id2Skeleton;
flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
if (id1Skeleton == id2Skeleton) {
result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
if (possiblyWholeScriptConfusables) {
result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
}
}
if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) == 0) {
result &= ~USPOOF_MIXED_SCRIPT_CONFUSABLE;
}
if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) == 0) {
result &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE;
}
return result;
}
U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
const icu::UnicodeString &id,
const icu::UnicodeString &id,
int32_t *position,
UErrorCode *status) {
const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
if (This == NULL) {
return 0;
// Backwards compatibility:
if (position != NULL) {
*position = 0;
}
// Delegate to uspoof_check2
return uspoof_check2UnicodeString(sc, id, NULL, status);
}
int32_t checkImpl(const SpoofImpl* This, const UnicodeString& id, CheckResult* checkResult, UErrorCode* status) {
U_ASSERT(This != NULL);
U_ASSERT(checkResult != NULL);
checkResult->clear();
int32_t result = 0;
IdentifierInfo *identifierInfo = NULL;
if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) {
identifierInfo = This->getIdentifierInfo(*status);
if (U_FAILURE(*status)) {
goto cleanupAndReturn;
}
identifierInfo->setIdentifier(id, *status);
identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
}
if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) {
URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status);
if (0 != (This->fChecks & USPOOF_RESTRICTION_LEVEL)) {
URestrictionLevel idRestrictionLevel = This->getRestrictionLevel(id, *status);
if (idRestrictionLevel > This->fRestrictionLevel) {
result |= USPOOF_RESTRICTION_LEVEL;
}
if (This->fChecks & USPOOF_AUX_INFO) {
result |= idRestrictionLevel;
}
checkResult->fRestrictionLevel = idRestrictionLevel;
}
if ((This->fChecks) & USPOOF_MIXED_NUMBERS) {
const UnicodeSet *numerics = identifierInfo->getNumerics();
if (numerics->size() > 1) {
if (0 != (This->fChecks & USPOOF_MIXED_NUMBERS)) {
UnicodeSet numerics;
This->getNumerics(id, numerics, *status);
if (numerics.size() > 1) {
result |= USPOOF_MIXED_NUMBERS;
}
// TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
// We have no easy way to do the same in C.
// if (checkResult != null) {
// checkResult.numerics = numerics;
// }
checkResult->fNumerics = numerics; // UnicodeSet::operator=
}
if (This->fChecks & (USPOOF_CHAR_LIMIT)) {
if (0 != (This->fChecks & USPOOF_CHAR_LIMIT)) {
int32_t i;
UChar32 c;
int32_t length = id.length();
@ -554,103 +539,74 @@ uspoof_checkUnicodeString(const USpoofChecker *sc,
}
}
if (This->fChecks &
(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
// These are the checks that need to be done on NFD input
if (0 != (This->fChecks & USPOOF_INVISIBLE)) {
// This check needs to be done on NFD input
UnicodeString nfdText;
gNfdNormalizer->normalize(id, nfdText, *status);
int32_t nfdLength = nfdText.length();
if (This->fChecks & USPOOF_INVISIBLE) {
// scan for more than one occurence of the same non-spacing mark
// in a sequence of non-spacing marks.
int32_t i;
UChar32 c;
UChar32 firstNonspacingMark = 0;
UBool haveMultipleMarks = FALSE;
UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
for (i=0; i<nfdLength ;) {
c = nfdText.char32At(i);
i += U16_LENGTH(c);
if (u_charType(c) != U_NON_SPACING_MARK) {
firstNonspacingMark = 0;
if (haveMultipleMarks) {
marksSeenSoFar.clear();
haveMultipleMarks = FALSE;
}
continue;
}
if (firstNonspacingMark == 0) {
firstNonspacingMark = c;
continue;
}
if (!haveMultipleMarks) {
marksSeenSoFar.add(firstNonspacingMark);
haveMultipleMarks = TRUE;
}
if (marksSeenSoFar.contains(c)) {
// report the error, and stop scanning.
// No need to find more than the first failure.
result |= USPOOF_INVISIBLE;
break;
}
marksSeenSoFar.add(c);
}
}
// scan for more than one occurrence of the same non-spacing mark
// in a sequence of non-spacing marks.
int32_t i;
UChar32 c;
UChar32 firstNonspacingMark = 0;
UBool haveMultipleMarks = FALSE;
UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
// The basic test is the same for both whole and mixed script confusables.
// Compute the set of scripts that every input character has a confusable in.
// For this computation an input character is always considered to be
// confusable with itself in its own script.
//
// If the number of such scripts is two or more, and the input consisted of
// characters all from a single script, we have a whole script confusable.
// (The two scripts will be the original script and the one that is confusable)
//
// If the number of such scripts >= one, and the original input contained characters from
// more than one script, we have a mixed script confusable. (We can transform
// some of the characters, and end up with a visually similar string all in
// one script.)
if (identifierInfo == NULL) {
identifierInfo = This->getIdentifierInfo(*status);
if (U_FAILURE(*status)) {
goto cleanupAndReturn;
for (i=0; i<nfdLength ;) {
c = nfdText.char32At(i);
i += U16_LENGTH(c);
if (u_charType(c) != U_NON_SPACING_MARK) {
firstNonspacingMark = 0;
if (haveMultipleMarks) {
marksSeenSoFar.clear();
haveMultipleMarks = FALSE;
}
identifierInfo->setIdentifier(id, *status);
continue;
}
int32_t scriptCount = identifierInfo->getScriptCount();
ScriptSet scripts;
This->wholeScriptCheck(nfdText, &scripts, *status);
int32_t confusableScriptCount = scripts.countMembers();
//printf("confusableScriptCount = %d\n", confusableScriptCount);
if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
confusableScriptCount >= 2 &&
scriptCount == 1) {
result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
if (firstNonspacingMark == 0) {
firstNonspacingMark = c;
continue;
}
if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
confusableScriptCount >= 1 &&
scriptCount > 1) {
result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
if (!haveMultipleMarks) {
marksSeenSoFar.add(firstNonspacingMark);
haveMultipleMarks = TRUE;
}
if (marksSeenSoFar.contains(c)) {
// report the error, and stop scanning.
// No need to find more than the first failure.
result |= USPOOF_INVISIBLE;
break;
}
marksSeenSoFar.add(c);
}
}
cleanupAndReturn:
This->releaseIdentifierInfo(identifierInfo);
if (position != NULL) {
*position = 0;
checkResult->fChecks = result;
return checkResult->toCombinedBitmask(This->fChecks);
}
/**
 * Runs the configured spoof checks on `id`.
 *
 * @param sc          The spoof checker; validated with SpoofImpl::validateThis().
 * @param id          The identifier to check.
 * @param checkResult Optional result object to fill in. When NULL, a temporary
 *                    CheckResult on the stack is used and discarded.
 * @param status      ICU error code, passed through to validation and checkImpl().
 * @return The value returned by checkImpl(), or 0 when either `sc` or a
 *         non-NULL `checkResult` fails validation.
 */
U_CAPI int32_t U_EXPORT2
uspoof_check2UnicodeString(const USpoofChecker *sc,
                           const icu::UnicodeString &id,
                           USpoofCheckResult* checkResult,
                           UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        // The return type is an int32_t bitmask, so report "no checks failed"
        // as 0 rather than the boolean constant FALSE.
        return 0;
    }

    if (checkResult == NULL) {
        // Stack-allocate the checkResult since this method doesn't return it
        CheckResult stackCheckResult;
        return checkImpl(This, id, &stackCheckResult, status);
    }

    CheckResult* ThisCheckResult = CheckResult::validateThis(checkResult, *status);
    if (ThisCheckResult == NULL) {
        return 0;
    }
    return checkImpl(This, id, ThisCheckResult, status);
}
@ -681,7 +637,7 @@ uspoof_getSkeleton(const USpoofChecker *sc,
U_I18N_API UnicodeString & U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
uint32_t type,
uint32_t /*type*/,
const UnicodeString &id,
UnicodeString &dest,
UErrorCode *status) {
@ -690,21 +646,9 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
return dest;
}
int32_t tableMask = 0;
switch (type) {
case 0:
tableMask = USPOOF_ML_TABLE_FLAG;
break;
case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
tableMask = USPOOF_SL_TABLE_FLAG;
break;
case USPOOF_ANY_CASE:
tableMask = USPOOF_MA_TABLE_FLAG;
break;
case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
tableMask = USPOOF_SA_TABLE_FLAG;
break;
default:
// Check that at least one of the CONFUSABLE flags is turned on. If not,
// return an error.
if ((This->fChecks & USPOOF_CONFUSABLE) == 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return dest;
}
@ -720,7 +664,7 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
for (inputIndex=0; inputIndex < normalizedLen; ) {
UChar32 c = nfdId.char32At(inputIndex);
inputIndex += U16_LENGTH(c);
This->confusableLookup(c, tableMask, skelStr);
This->fSpoofData->confusableLookup(c, skelStr);
}
gNfdNormalizer->normalize(skelStr, dest, *status);
@ -764,13 +708,8 @@ uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *stat
U_ASSERT(U_FAILURE(*status));
return 0;
}
int32_t dataSize = This->fSpoofData->fRawData->fLength;
if (capacity < dataSize) {
*status = U_BUFFER_OVERFLOW_ERROR;
return dataSize;
}
uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
return dataSize;
return This->fSpoofData->serialize(buf, capacity, *status);
}
U_CAPI const USet * U_EXPORT2
@ -797,6 +736,48 @@ uspoof_getRecommendedUnicodeSet(UErrorCode *status) {
return gRecommendedSet;
}
//------------------
// CheckResult APIs
//------------------
/**
 * Allocates a new, empty check-result object for use with the C API.
 *
 * @param status ICU error code. If it already indicates a failure on entry,
 *               nothing is allocated and NULL is returned. Set to
 *               U_MEMORY_ALLOCATION_ERROR if the allocation fails.
 * @return The new object, to be released with uspoof_closeCheckResult(),
 *         or NULL on failure.
 */
U_CAPI USpoofCheckResult* U_EXPORT2
uspoof_openCheckResult(UErrorCode *status) {
    // Do not allocate when an earlier call has already failed: a caller that
    // checks the status first would never see, and so never close, the object.
    if (U_FAILURE(*status)) {
        return NULL;
    }
    CheckResult* checkResult = new CheckResult();
    if (checkResult == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    return checkResult->asUSpoofCheckResult();
}
/**
 * Destroys a check-result object obtained from the C API.
 * A pointer that fails CheckResult::validateThis() is not deleted; the
 * validation error is deliberately dropped because this function has no
 * status parameter.
 */
U_CAPI void U_EXPORT2
uspoof_closeCheckResult(USpoofCheckResult* checkResult) {
    UErrorCode ignoredStatus = U_ZERO_ERROR;
    // validateThis() yields NULL for an invalid handle, and deleting NULL is a no-op.
    delete CheckResult::validateThis(checkResult, ignoredStatus);
}
/**
 * Returns the fChecks value stored in a check result.
 * Yields 0 when `status` indicates a failure after validating the handle.
 */
U_CAPI int32_t U_EXPORT2
uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status) {
    const CheckResult* validated = CheckResult::validateThis(checkResult, *status);
    if (U_SUCCESS(*status)) {
        return validated->fChecks;
    }
    return 0;
}
/**
 * Returns the restriction level stored in a check result.
 * Yields USPOOF_UNRESTRICTIVE when `status` indicates a failure after
 * validating the handle.
 */
U_CAPI URestrictionLevel U_EXPORT2
uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status) {
    const CheckResult* validated = CheckResult::validateThis(checkResult, *status);
    if (U_SUCCESS(*status)) {
        return validated->fRestrictionLevel;
    }
    return USPOOF_UNRESTRICTIVE;
}
/**
 * Returns the numerics set stored in a check result, viewed as a USet.
 * Yields NULL when `status` indicates a failure after validating the handle.
 */
U_CAPI const USet* U_EXPORT2
uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status) {
    const CheckResult* validated = CheckResult::validateThis(checkResult, *status);
    const USet* numerics = NULL;
    if (U_SUCCESS(*status)) {
        numerics = validated->fNumerics.toUSet();
    }
    return numerics;
}
#endif // !UCONFIG_NO_NORMALIZATION

View file

@ -37,7 +37,6 @@
#include "uassert.h"
#include "uarrsort.h"
#include "uspoof_conf.h"
#include "uspoof_wsconf.h"
#if !UCONFIG_NO_NORMALIZATION
@ -50,7 +49,7 @@ U_CFUNC void uspoof_internalInitStatics(UErrorCode *status);
U_CAPI USpoofChecker * U_EXPORT2
uspoof_openFromSource(const char *confusables, int32_t confusablesLen,
const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
const char* /*confusablesWholeScript*/, int32_t /*confusablesWholeScriptLen*/,
int32_t *errorType, UParseError *pe, UErrorCode *status) {
uspoof_internalInitStatics(status);
if (U_FAILURE(*status)) {
@ -76,7 +75,6 @@ uspoof_openFromSource(const char *confusables, int32_t confusablesLen,
// Compile the binary data from the source (text) format.
ConfusabledataBuilder::buildConfusableData(This, confusables, confusablesLen, errorType, pe, *status);
buildWSConfusableData(This, confusablesWholeScript, confusablesWholeScriptLen, pe, *status);
if (U_FAILURE(*status)) {
delete This;

View file

@ -45,8 +45,7 @@ U_NAMESPACE_USE
//
// The binary structures are described in uspoof_impl.h
//
// 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA
// tables. Each maps from a UChar32 to a String.
// 1. Parse the data, making a hash table mapping from a UChar32 to a String.
//
// 2. Sort all of the strings encountered by length, since they will need to
// be stored in that order in the final string table.
@ -63,7 +62,7 @@ U_NAMESPACE_USE
SPUString::SPUString(UnicodeString *s) {
fStr = s;
fStrTableIndex = 0;
fCharOrStrTableIndex = 0;
}
@ -145,15 +144,11 @@ SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
fSpoofImpl(spImpl),
fInput(NULL),
fSLTable(NULL),
fSATable(NULL),
fMLTable(NULL),
fMATable(NULL),
fTable(NULL),
fKeySet(NULL),
fKeyVec(NULL),
fValueVec(NULL),
fStringTable(NULL),
fStringLengthsTable(NULL),
stringPool(NULL),
fParseLine(NULL),
fParseHexNum(NULL),
@ -162,10 +157,7 @@ ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &stat
if (U_FAILURE(status)) {
return;
}
fSLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
fSATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
fMLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
fMATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
fTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
fKeySet = new UnicodeSet();
fKeyVec = new UVector(status);
fValueVec = new UVector(status);
@ -177,14 +169,10 @@ ConfusabledataBuilder::~ConfusabledataBuilder() {
uprv_free(fInput);
uregex_close(fParseLine);
uregex_close(fParseHexNum);
uhash_close(fSLTable);
uhash_close(fSATable);
uhash_close(fMLTable);
uhash_close(fMATable);
uhash_close(fTable);
delete fKeySet;
delete fKeyVec;
delete fStringTable;
delete fStringLengthsTable;
delete fValueVec;
delete stringPool;
}
@ -230,7 +218,7 @@ void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesL
// any line. What was matched is determined by examining which capture groups have a match.
// Capture Group 1: the source char
// Capture Group 2: the replacement chars
// Capture Group 3-6 the table type, SL, SA, ML, or MA
// Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
// Capture Group 7: A blank or comment only line.
// Capture Group 8: A syntactically invalid line. Anything that didn't match before.
// Example Line from the confusables.txt source file:
@ -296,41 +284,12 @@ void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesL
// This a little like a Java intern() - any duplicates will be eliminated.
SPUString *smapString = stringPool->addString(mapString, status);
// Add the UChar32 -> string mapping to the appropriate table.
UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable :
uregex_start(fParseLine, 4, &status) >= 0 ? fSATable :
uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable :
uregex_start(fParseLine, 6, &status) >= 0 ? fMATable :
NULL;
if (U_SUCCESS(status) && table == NULL) {
status = U_PARSE_ERROR;
}
if (U_FAILURE(status)) {
return;
}
// Add the UChar32 -> string mapping to the table.
// For Unicode 8, the SL, SA and ML tables have been discontinued.
// All input data from confusables.txt is tagged MA.
// ICU spoof check functions should ignore the specified table and always
// use this MA Data.
// For now, implement by populating the MA data into all four tables, and
// keep the multiple table implementation in place, in case it comes back
// at some time in the future.
// There is no run time size penalty to keeping the four table implementation -
// the data is shared when it's the same between tables.
if (table != fMATable) {
status = U_PARSE_ERROR;
return;
};
// uhash_iput(table, keyChar, smapString, &status);
uhash_iput(fSLTable, keyChar, smapString, &status);
uhash_iput(fSATable, keyChar, smapString, &status);
uhash_iput(fMLTable, keyChar, smapString, &status);
uhash_iput(fMATable, keyChar, smapString, &status);
uhash_iput(fTable, keyChar, smapString, &status);
if (U_FAILURE(status)) { return; }
fKeySet->add(keyChar);
if (U_FAILURE(status)) {
return;
}
}
// Input data is now all parsed and collected.
@ -343,43 +302,24 @@ void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesL
// Build up the string array, and record the index of each string therein
// in the (build time only) string pool.
// Strings of length one are not entered into the strings array.
// At the same time, build up the string lengths table, which records the
// position in the string table of the first string of each length >= 4.
// (Strings in the table are sorted by length)
stringPool->sort(status);
fStringTable = new UnicodeString();
fStringLengthsTable = new UVector(status);
int32_t previousStringLength = 0;
int32_t previousStringIndex = 0;
int32_t poolSize = stringPool->size();
int32_t i;
for (i=0; i<poolSize; i++) {
SPUString *s = stringPool->getByIndex(i);
int32_t strLen = s->fStr->length();
int32_t strIndex = fStringTable->length();
U_ASSERT(strLen >= previousStringLength);
if (strLen == 1) {
// strings of length one do not get an entry in the string table.
// Keep the single string character itself here, which is the same
// convention that is used in the final run-time string table index.
s->fStrTableIndex = s->fStr->charAt(0);
s->fCharOrStrTableIndex = s->fStr->charAt(0);
} else {
if ((strLen > previousStringLength) && (previousStringLength >= 4)) {
fStringLengthsTable->addElement(previousStringIndex, status);
fStringLengthsTable->addElement(previousStringLength, status);
}
s->fStrTableIndex = strIndex;
s->fCharOrStrTableIndex = strIndex;
fStringTable->append(*(s->fStr));
}
previousStringLength = strLen;
previousStringIndex = strIndex;
}
// Make the final entry to the string lengths table.
// (it holds an entry for the _last_ string of each length, so adding the
// final one doesn't happen in the main loop because no longer string was encountered.)
if (previousStringLength >= 4) {
fStringLengthsTable->addElement(previousStringIndex, status);
fStringLengthsTable->addElement(previousStringLength, status);
}
// Construct the compile-time Key and Value tables
@ -398,10 +338,15 @@ void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesL
// code points requires a nested loop.
for (UChar32 keyChar=fKeySet->getRangeStart(range);
keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status);
addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status);
addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status);
addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status);
SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar));
U_ASSERT(targetMapping != NULL);
int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar,
targetMapping->fStr->length());
int32_t value = targetMapping->fCharOrStrTableIndex;
fKeyVec->addElement(key, status);
fValueVec->addElement(value, status);
}
}
@ -437,14 +382,14 @@ void ConfusabledataBuilder::outputData(UErrorCode &status) {
return;
}
int i;
int32_t previousKey = 0;
UChar32 previousCodePoint = 0;
for (i=0; i<numKeys; i++) {
int32_t key = fKeyVec->elementAti(i);
(void)previousKey; // Suppress unused variable warning on gcc.
U_ASSERT((key & 0x00ffffff) >= (previousKey & 0x00ffffff));
U_ASSERT((key & 0xff000000) != 0);
UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key);
// strictly greater because there can be only one entry per code point
U_ASSERT(codePoint > previousCodePoint);
keys[i] = key;
previousKey = key;
previousCodePoint = codePoint;
}
SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData);
@ -486,143 +431,6 @@ void ConfusabledataBuilder::outputData(UErrorCode &status) {
rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData);
rawData->fCFUStringTableLen = stringsLength;
fSpoofImpl->fSpoofData->fCFUStrings = strings;
// The String Lengths Table
// While copying into the runtime array do some sanity checks on the values
// Each complete entry contains two fields, an index and an offset.
// Lengths should increase with each entry.
// Offsets should be less than the size of the string table.
int32_t lengthTableLength = fStringLengthsTable->size();
uint16_t *stringLengths =
static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(lengthTableLength*sizeof(uint16_t), status));
if (U_FAILURE(status)) {
return;
}
int32_t destIndex = 0;
uint32_t previousLength = 0;
for (i=0; i<lengthTableLength; i+=2) {
uint32_t offset = static_cast<uint32_t>(fStringLengthsTable->elementAti(i));
uint32_t length = static_cast<uint32_t>(fStringLengthsTable->elementAti(i+1));
U_ASSERT(offset < stringsLength);
U_ASSERT(length < 40);
(void)previousLength; // Suppress unused variable warning on gcc.
U_ASSERT(length > previousLength);
stringLengths[destIndex++] = static_cast<uint16_t>(offset);
stringLengths[destIndex++] = static_cast<uint16_t>(length);
previousLength = length;
}
rawData = fSpoofImpl->fSpoofData->fRawData;
rawData->fCFUStringLengths = (int32_t)((char *)stringLengths - (char *)rawData);
// Note: StringLengthsSize in the raw data is the number of complete entries,
// each consisting of a pair of 16 bit values, hence the divide by 2.
rawData->fCFUStringLengthsSize = lengthTableLength / 2;
fSpoofImpl->fSpoofData->fCFUStringLengths =
reinterpret_cast<SpoofStringLengthsElement *>(stringLengths);
}
// addKeyEntry Construction of the confusable Key and Mapping Values tables.
// This is an intermediate point in the building process.
// We already have the mappings in the hash tables fSLTable, etc.
// This function builds corresponding run-time style table entries into
// fKeyVec and fValueVec
// Adds (or merges) the run-time key/value entry for `keyChar` taken from one
// build-time hash table.
//
// If `table` has no mapping for keyChar, nothing happens. If one of the
// trailing fKeyVec entries for keyChar already carries the same mapping
// string, `tableFlag` is OR-ed into that existing key. Otherwise a new
// key/value pair is appended to fKeyVec/fValueVec.
void ConfusabledataBuilder::addKeyEntry(
UChar32 keyChar, // The key character
UHashtable *table, // The table, one of SATable, MATable, etc.
int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc.
UErrorCode &status) {
SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(table, keyChar));
if (targetMapping == NULL) {
// No mapping for this key character.
// (This function is called for all four tables for each key char that
// is seen anywhere, so these no-entry cases are very much expected.)
return;
}
// Check whether there is already an entry with the correct mapping.
// If so, simply set the flag in the keyTable saying that the existing entry
// applies to the table that we're doing now.
UBool keyHasMultipleValues = FALSE;
int32_t i;
// Walk backwards from the end of fKeyVec; only the trailing run of entries
// whose low 24 bits equal keyChar is examined.
for (i=fKeyVec->size()-1; i>=0 ; i--) {
int32_t key = fKeyVec->elementAti(i);
if ((key & 0x0ffffff) != keyChar) {
// We have now checked all existing key entries for this key char (if any)
// without finding one with the same mapping.
break;
}
UnicodeString mapping = getMapping(i);
if (mapping == *(targetMapping->fStr)) {
// The run time entry we are currently testing has the correct mapping.
// Set the flag in it indicating that it applies to the new table also.
key |= tableFlag;
fKeyVec->setElementAt(key, i);
return;
}
// An entry for keyChar exists, but with a different mapping string.
keyHasMultipleValues = TRUE;
}
// Need to add a new entry to the binary data being built for this mapping.
// Includes adding entries to both the key table and the parallel values table.
int32_t newKey = keyChar | tableFlag;
if (keyHasMultipleValues) {
newKey |= USPOOF_KEY_MULTIPLE_VALUES;
}
// The key's length field stores (string length - 1), capped at 3; the capped
// value therefore stands for every mapping of four or more UChars.
int32_t adjustedMappingLength = targetMapping->fStr->length() - 1;
if (adjustedMappingLength>3) {
adjustedMappingLength = 3;
}
newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT;
int32_t newData = targetMapping->fStrTableIndex;
fKeyVec->addElement(newKey, status);
fValueVec->addElement(newData, status);
// If the preceding key entry is for the same key character (but with a different mapping)
// set the multiple-values flag on it.
if (keyHasMultipleValues) {
// size()-1 is the entry just appended, so size()-2 is its predecessor.
int32_t previousKeyIndex = fKeyVec->size() - 2;
int32_t previousKey = fKeyVec->elementAti(previousKeyIndex);
previousKey |= USPOOF_KEY_MULTIPLE_VALUES;
fKeyVec->setElementAt(previousKey, previousKeyIndex);
}
}
// Rebuilds the mapping string for entry `index` of the parallel
// fKeyVec/fValueVec vectors, decoding it according to the length field
// stored in the key. Returns an empty string if the length field has an
// unexpected value (which also trips U_ASSERT in debug builds).
UnicodeString ConfusabledataBuilder::getMapping(int32_t index) {
int32_t key = fKeyVec->elementAti(index);
int32_t value = fValueVec->elementAti(index);
int32_t length = USPOOF_KEY_LENGTH_FIELD(key);
int32_t lastIndexWithLen;
switch (length) {
case 0:
// Length field 0: the value is the single mapped UChar itself.
return UnicodeString(static_cast<UChar>(value));
case 1:
case 2:
// Length field 1 or 2: the value is an offset into fStringTable and the
// string is length+1 UChars long.
return UnicodeString(*fStringTable, value, length+1);
case 3:
// Length field 3: the real length comes from fStringLengthsTable, which is
// read here as (index, length) pairs; the first pair whose index is >= value
// supplies the length.
length = 0;
int32_t i;
for (i=0; i<fStringLengthsTable->size(); i+=2) {
lastIndexWithLen = fStringLengthsTable->elementAti(i);
if (value <= lastIndexWithLen) {
length = fStringLengthsTable->elementAti(i+1);
break;
}
}
U_ASSERT(length>=3);
return UnicodeString(*fStringTable, value, length);
default:
U_ASSERT(FALSE);
}
return UnicodeString();
}
#endif

View file

@ -38,9 +38,9 @@ U_NAMESPACE_BEGIN
struct SPUString : public UMemory {
UnicodeString *fStr; // The actual string.
int32_t fStrTableIndex; // Index into the final runtime data for this string.
// (or, for length 1, the single string char itself,
// there being no string table entry for it.)
int32_t fCharOrStrTableIndex; // Index into the final runtime data for this
// string (or, for length 1, the single string char
// itself, there being no string table entry for it.)
SPUString(UnicodeString *s);
~SPUString();
};
@ -88,10 +88,7 @@ class ConfusabledataBuilder : public UMemory {
private:
SpoofImpl *fSpoofImpl;
UChar *fInput;
UHashtable *fSLTable;
UHashtable *fSATable;
UHashtable *fMLTable;
UHashtable *fMATable;
UHashtable *fTable;
UnicodeSet *fKeySet; // A set of all keys (UChar32s) that go into the four mapping tables.
// The binary data is first assembled into the following four collections, then
@ -99,7 +96,6 @@ class ConfusabledataBuilder : public UMemory {
UVector *fKeyVec;
UVector *fValueVec;
UnicodeString *fStringTable;
UVector *fStringLengthsTable;
SPUStringPool *stringPool;
URegularExpression *fParseLine;

View file

@ -15,11 +15,11 @@
#include "utrie2.h"
#include "cmemory.h"
#include "cstring.h"
#include "identifier_info.h"
#include "scriptset.h"
#include "umutex.h"
#include "udataswp.h"
#include "uassert.h"
#include "ucln_in.h"
#include "uspoof_impl.h"
#if !UCONFIG_NO_NORMALIZATION
@ -29,14 +29,38 @@ U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(data), fAllowedCharsSet(NULL) ,
fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
if (U_FAILURE(status)) {
return;
}
// Builds a checker around caller-supplied spoof data.
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
    // construct() resets fSpoofData to NULL, so the supplied data has to be
    // installed afterwards.
    construct(status);
    this->fSpoofData = data;
}
// Builds a checker that uses the default spoof data.
SpoofImpl::SpoofImpl(UErrorCode& status) {
    construct(status);
    // TODO: Fetch the default data at the point of first use rather than here,
    // so that it can be loaded lazily. See #12696.
    this->fSpoofData = SpoofData::getDefault(status);
}
// Builds a checker that uses the default spoof data, for callers that have
// no error code to pass in; any failure recorded in the local status is dropped.
SpoofImpl::SpoofImpl() {
    UErrorCode localStatus = U_ZERO_ERROR;
    construct(localStatus);
    // TODO: Fetch the default data at the point of first use rather than here,
    // so that it can be loaded lazily. See #12696.
    this->fSpoofData = SpoofData::getDefault(localStatus);
}
void SpoofImpl::construct(UErrorCode& status) {
fMagic = USPOOF_MAGIC;
fChecks = USPOOF_ALL_CHECKS;
fSpoofData = NULL;
fAllowedCharsSet = NULL;
fAllowedLocales = NULL;
fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
if (U_FAILURE(status)) { return; }
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
allowedCharsSet->freeze();
fAllowedCharsSet = allowedCharsSet;
@ -45,25 +69,13 @@ SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
fMagic = USPOOF_MAGIC;
}
SpoofImpl::SpoofImpl() :
fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
allowedCharsSet->freeze();
fAllowedCharsSet = allowedCharsSet;
fAllowedLocales = uprv_strdup("");
fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
}
// Copy Constructor, used by the user level clone() function.
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
fAllowedLocales(NULL) {
if (U_FAILURE(status)) {
return;
}
@ -88,7 +100,11 @@ SpoofImpl::~SpoofImpl() {
}
delete fAllowedCharsSet;
uprv_free((void *)fAllowedLocales);
delete fCachedIdentifierInfo;
}
// Exposes this object to the C API as a USpoofChecker handle.
USpoofChecker *SpoofImpl::asUSpoofChecker() {
    USpoofChecker *handle = reinterpret_cast<USpoofChecker*>(this);
    return handle;
}
//
@ -104,12 +120,11 @@ const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &st
return NULL;
}
SpoofImpl *This = (SpoofImpl *)sc;
if (This->fMagic != USPOOF_MAGIC ||
This->fSpoofData == NULL) {
if (This->fMagic != USPOOF_MAGIC) {
status = U_INVALID_FORMAT_ERROR;
return NULL;
}
if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
return NULL;
}
return This;
@ -121,148 +136,6 @@ SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
}
//--------------------------------------------------------------------------------------
//
// confusableLookup() This is the heart of the confusable skeleton generation
// implementation.
//
// Given a source character, produce the corresponding
// replacement character(s), appending them to the dest string.
//
//---------------------------------------------------------------------------------------
// Looks up inChar in the confusable key table and appends its replacement to dest.
//
//   inChar     the code point to map.
//   tableMask  flag bit(s) selecting which table's entry is wanted; an entry
//              applies when (key flags & tableMask) is non-zero.
//   dest       receives the replacement; inChar itself is appended when no
//              applicable entry exists.
//
// Returns 0 when inChar was appended unchanged, 1 for a single-UChar mapping,
// otherwise the number of UChars copied from the string table.
int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
// Binary search the spoof data key table for the inChar
int32_t *low = fSpoofData->fCFUKeys;
int32_t *mid = NULL;
int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
UChar32 midc;
// NOTE(review): the do/while body runs at least once and reads *mid, so this
// appears to assume the key table is non-empty -- confirm.
do {
int32_t delta = ((int32_t)(limit-low))/2;
mid = low + delta;
// NOTE(review): the code point is masked with 0x1fffff here but with
// 0x00ffffff in the adjacent-key scans below -- confirm which is intended.
midc = *mid & 0x1fffff;
if (inChar == midc) {
goto foundChar;
} else if (inChar < midc) {
limit = mid;
} else {
low = mid;
}
} while (low < limit-1);
// The loop ended without an exact hit; the only remaining candidate is *low.
mid = low;
midc = *mid & 0x1fffff;
if (inChar != midc) {
// Char not found. It maps to itself.
int i = 0;
dest.append(inChar);
return i;
}
foundChar:
// The top byte of the key holds the table flags and the length field.
int32_t keyFlags = *mid & 0xff000000;
if ((keyFlags & tableMask) == 0) {
// We found the right key char, but the entry doesn't pertain to the
// table we need. See if there is an adjacent key that does
if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
int32_t *altMid;
// NOTE(review): neither scan checks altMid against the bounds of the key
// table; they stop only when the neighbouring key's code point differs.
for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
keyFlags = *altMid & 0xff000000;
if (keyFlags & tableMask) {
mid = altMid;
goto foundKey;
}
}
for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
keyFlags = *altMid & 0xff000000;
if (keyFlags & tableMask) {
mid = altMid;
goto foundKey;
}
}
}
// No key entry for this char & table.
// The input char maps to itself.
int i = 0;
dest.append(inChar);
return i;
}
foundKey:
// The length field stores (length - 1), hence the +1.
int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
// Value is either a UChar (for strings of length 1) or
// an index into the string table (for longer strings)
uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
if (stringLen == 1) {
dest.append((UChar)value);
return 1;
}
// String length of 4 from the above lookup is used for all strings of length >= 4.
// For these, get the real length from the string lengths table,
// which maps string table indexes to lengths.
// All strings of the same length are stored contiguously in the string table.
// 'value' from the lookup above is the starting index for the desired string.
int32_t ix;
if (stringLen == 4) {
int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
// The first entry whose fLastString is >= value gives the real length.
for (ix = 0; ix < stringLengthsLimit; ix++) {
if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
break;
}
}
U_ASSERT(ix < stringLengthsLimit);
}
U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
UChar *src = &fSpoofData->fCFUStrings[value];
dest.append(src, stringLen);
return stringLen;
}
//---------------------------------------------------------------------------------------
//
// wholeScriptCheck()
//
// Input text is already normalized to NFD
// Return the set of scripts, each of which can represent something that is
// confusable with the input text. The script of the input text
// is included; input consisting of characters from a single script will
// always produce a result consisting of a set containing that script.
//
//---------------------------------------------------------------------------------------
void SpoofImpl::wholeScriptCheck(
const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
UTrie2 *table =
(fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
result->setAll();
int32_t length = text.length();
for (int32_t inputIdx=0; inputIdx < length;) {
UChar32 c = text.char32At(inputIdx);
inputIdx += U16_LENGTH(c);
uint32_t index = utrie2_get32(table, c);
if (index == 0) {
// No confusables in another script for this char.
// TODO: we should change the data to have sets with just the single script
// bit for the script of this char. Gets rid of this special case.
// Until then, grab the script from the char and intersect it with the set.
UScriptCode cpScript = uscript_getScript(c, &status);
U_ASSERT(cpScript > USCRIPT_INHERITED);
result->intersect(cpScript, status);
} else if (index == 1) {
// Script == Common or Inherited. Nothing to do.
} else {
result->intersect(fSpoofData->fScriptSets[index]);
}
}
}
void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
UnicodeSet allowedChars;
UnicodeSet *tmpSet = NULL;
@ -374,6 +247,137 @@ void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UEr
}
}
// Fills `result` with the augmented script set of one code point, following
// UTS 39 section 5.1. On a failing status after reading the script
// extensions, `result` is left holding whatever was set so far.
void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
    // Section 5.1 step 1, as a table: when the set contains the first script
    // of a row, the second script of that row is added as well.
    static const UScriptCode kAugmentations[][2] = {
        { USCRIPT_HAN,      USCRIPT_HAN_WITH_BOPOMOFO },
        { USCRIPT_HAN,      USCRIPT_JAPANESE },
        { USCRIPT_HAN,      USCRIPT_KOREAN },
        { USCRIPT_HIRAGANA, USCRIPT_JAPANESE },
        { USCRIPT_KATAKANA, USCRIPT_JAPANESE },
        { USCRIPT_HANGUL,   USCRIPT_KOREAN },
        { USCRIPT_BOPOMOFO, USCRIPT_HAN_WITH_BOPOMOFO }
    };
    const int32_t augmentationCount =
        static_cast<int32_t>(sizeof(kAugmentations) / sizeof(kAugmentations[0]));

    result.resetAll();
    result.setScriptExtensions(codePoint, status);
    if (U_FAILURE(status)) {
        return;
    }

    for (int32_t row = 0; row < augmentationCount; ++row) {
        if (result.test(kAugmentations[row][0], status)) {
            result.set(kAugmentations[row][1], status);
        }
    }

    // Section 5.1 step 2: Common and Inherited characters match every script.
    if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
        result.setAll();
    }
}
// Computes the resolved script set for a string, according to UTS 39 section 5.1.
void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
}
// Computes the resolved script set for a string, omitting characters having the specified script.
// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
result.setAll();
ScriptSet temp;
UChar32 codePoint;
for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
codePoint = input.char32At(i);
// Compute the augmented script set for the character
getAugmentedScriptSet(codePoint, temp, status);
if (U_FAILURE(status)) { return; }
// Intersect the augmented script set with the resolved script set, but only if the character doesn't
// have the script specified in the function call
if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
result.intersect(temp);
}
}
}
// Computes the set of numerics for a string, according to UTS 39 section 5.3.
void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
result.clear();
UChar32 codePoint;
for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
codePoint = input.char32At(i);
// Store a representative character for each kind of decimal digit
if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
// Store the zero character as a representative for comparison.
// Unicode guarantees it is codePoint - value
result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
}
}
}
// Determines the restriction level of `input` by walking the steps of
// UTS 39 section 5.2 in order. Returns USPOOF_UNRESTRICTIVE if a script-set
// computation reports a failure through `status`.
URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
    // Step 1: any character outside the allowed set makes the input unrestricted.
    if (!fAllowedCharsSet->containsAll(input)) {
        return USPOOF_UNRESTRICTIVE;
    }

    // Step 2: all-ASCII input. Java uses a static UnicodeSet for this test;
    // here a plain scan over the code units avoids the static variable.
    const int32_t inputLength = input.length();
    int32_t asciiPrefix = 0;
    while (asciiPrefix < inputLength && input.charAt(asciiPrefix) <= 0x7f) {
        ++asciiPrefix;
    }
    if (asciiPrefix == inputLength) {
        return USPOOF_ASCII;
    }

    // Step 3: resolved script set of the whole input.
    ScriptSet resolvedAll;
    getResolvedScriptSet(input, resolvedAll, status);
    if (U_FAILURE(status)) {
        return USPOOF_UNRESTRICTIVE;
    }

    // Step 4: a non-empty resolved set means a single script covers everything.
    if (!resolvedAll.isEmpty()) {
        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
    }

    // Step 5: resolve again, this time leaving out the Latin characters.
    ScriptSet resolvedSansLatin;
    getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedSansLatin, status);
    if (U_FAILURE(status)) {
        return USPOOF_UNRESTRICTIVE;
    }

    // Step 6: Latin combined with one of the CJK writing systems.
    if (resolvedSansLatin.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
            || resolvedSansLatin.test(USCRIPT_JAPANESE, status)
            || resolvedSansLatin.test(USCRIPT_KOREAN, status)) {
        return USPOOF_HIGHLY_RESTRICTIVE;
    }

    // Step 7: Latin combined with one other script, provided that script is
    // not Cyrillic, Greek or Cherokee.
    if (!resolvedSansLatin.isEmpty()
            && !(resolvedSansLatin.test(USCRIPT_CYRILLIC, status)
                 || resolvedSansLatin.test(USCRIPT_GREEK, status)
                 || resolvedSansLatin.test(USCRIPT_CHEROKEE, status))) {
        return USPOOF_MODERATELY_RESTRICTIVE;
    }

    // Step 8: everything else.
    return USPOOF_MINIMALLY_RESTRICTIVE;
}
// Convert a text format hex number. Utility function used by builder code. Static.
// Input: UChar *string text. Output: a UChar32
@ -406,55 +410,60 @@ UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorC
return (UChar32)val;
}
// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
// Maintain a one-element cache, which is sufficient to avoid repeatedly
// creating new ones unless we get multi-thread concurrency in spoof
// check operations, which should be statistically uncommon.
// These functions are used in place of new & delete of an IdentifierInfo.
// They will recycle the IdentifierInfo when possible.
// They are logically const, and used within const functions that must be thread safe.
IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
IdentifierInfo *returnIdInfo = NULL;
if (U_FAILURE(status)) {
return returnIdInfo;
}
SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
{
Mutex m;
returnIdInfo = nonConstThis->fCachedIdentifierInfo;
nonConstThis->fCachedIdentifierInfo = NULL;
}
if (returnIdInfo == NULL) {
returnIdInfo = new IdentifierInfo(status);
if (U_SUCCESS(status) && returnIdInfo == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
if (U_FAILURE(status) && returnIdInfo != NULL) {
delete returnIdInfo;
returnIdInfo = NULL;
}
}
return returnIdInfo;
//-----------------------------------------
//
// class CheckResult Implementation
//
//-----------------------------------------
// Build an empty result object: stamp the magic number that validateThis()
// later checks, then put the remaining fields into their cleared state.
CheckResult::CheckResult()
        : fMagic(USPOOF_CHECK_MAGIC)
{
    this->clear();
}
// Expose this object through the opaque C API handle type.
// This is a pure pointer reinterpretation; no data is copied.
USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
    USpoofCheckResult *handle = reinterpret_cast<USpoofCheckResult *>(this);
    return handle;
}
void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
if (idInfo != NULL) {
SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
{
Mutex m;
if (nonConstThis->fCachedIdentifierInfo == NULL) {
nonConstThis->fCachedIdentifierInfo = idInfo;
idInfo = NULL;
}
}
delete idInfo;
//
// Incoming parameter check on Status and the CheckResult object
// received from the C API.
//
// Turn a C API handle back into a CheckResult, verifying it on the way.
//   - If status already indicates failure, returns NULL and leaves status alone.
//   - A NULL handle sets U_ILLEGAL_ARGUMENT_ERROR and returns NULL.
//   - A handle whose magic field does not match USPOOF_CHECK_MAGIC sets
//     U_INVALID_FORMAT_ERROR and returns NULL.
const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    const CheckResult *validated = NULL;
    if (ptr == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
    } else {
        const CheckResult *asImpl = reinterpret_cast<const CheckResult *>(ptr);
        if (asImpl->fMagic == USPOOF_CHECK_MAGIC) {
            validated = asImpl;
        } else {
            status = U_INVALID_FORMAT_ERROR;
        }
    }
    return validated;
}
// Non-const flavour: delegates to the const overload for the actual checks
// and then strips the const qualifier from the result.
CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
    const USpoofCheckResult *constHandle = ptr;
    const CheckResult *checked = CheckResult::validateThis(constHandle, status);
    return const_cast<CheckResult *>(checked);
}
// Return the result fields to their "nothing checked yet" state.
// The magic number is deliberately left untouched.
void CheckResult::clear() {
    fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
    fNumerics.clear();
    fChecks = 0;
}
// Collapse this result into the single int32_t used by the older check API.
// The failed-check bits are always returned; the restriction level is OR-ed in
// only when the caller enabled USPOOF_AUX_INFO and a level was actually set.
int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
    if ((enabledChecks & USPOOF_AUX_INFO) == 0) {
        return fChecks;
    }
    if (fRestrictionLevel == USPOOF_UNDEFINED_RESTRICTIVE) {
        return fChecks;
    }
    return fChecks | fRestrictionLevel;
}
// Nothing to release explicitly; members clean up after themselves.
CheckResult::~CheckResult() {}
//----------------------------------------------------------------------------------------------
//
@ -463,12 +472,14 @@ void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
//----------------------------------------------------------------------------------------------
UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
UBool SpoofData::validateDataVersion(UErrorCode &status) const {
if (U_FAILURE(status) ||
rawData == NULL ||
rawData->fMagic != USPOOF_MAGIC ||
rawData->fFormatVersion[0] > 1 ||
rawData->fFormatVersion[1] > 0) {
fRawData == NULL ||
fRawData->fMagic != USPOOF_MAGIC ||
fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
fRawData->fFormatVersion[1] != 0 ||
fRawData->fFormatVersion[2] != 0 ||
fRawData->fFormatVersion[3] != 0) {
status = U_INVALID_FORMAT_ERROR;
return FALSE;
}
@ -487,7 +498,7 @@ spoofDataIsAcceptable(void *context,
pInfo->dataFormat[1] == 0x66 &&
pInfo->dataFormat[2] == 0x75 &&
pInfo->dataFormat[3] == 0x20 &&
pInfo->formatVersion[0] == 1
pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
) {
UVersionInfo *version = static_cast<UVersionInfo *>(context);
if(version != NULL) {
@ -499,32 +510,61 @@ spoofDataIsAcceptable(void *context,
}
}
// Methods for the loading of the default confusables data file. The confusable
// data is loaded only when it is needed.
//
// SpoofData::getDefault() - return a wrapper around the spoof data that is
// baked into the default ICU data.
// SpoofData::getDefault() - Return the default confusables data, and call the
// initOnce() if it is not available. Adds a reference
// to the SpoofData that the caller is responsible for
// decrementing when they are done with the data.
//
// Called once, from the initOnce() function in uspoof_impl.cpp; the resulting
// SpoofData is shared by all spoof checkers using the default data.
// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
// is shared by all spoof checkers using the default data.
//
SpoofData *SpoofData::getDefault(UErrorCode &status) {
// uspoof_cleanupDefaultData - Called during cleanup.
//
static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
static SpoofData* gDefaultSpoofData;
// Cleanup hook for the shared default spoof data.
// Drops the reference held through gDefaultSpoofData, forgets the pointer and
// re-arms the init-once guard. Does nothing when no default data is recorded.
// Always reports TRUE.
static UBool U_CALLCONV
uspoof_cleanupDefaultData(void) {
    if (gDefaultSpoofData == NULL) {
        return TRUE;
    }
    // Will delete, assuming all user-level spoof checkers were closed.
    gDefaultSpoofData->removeReference();
    gDefaultSpoofData = NULL;
    gSpoofInitDefaultOnce.reset();
    return TRUE;
}
static void uspoof_loadDefaultData(UErrorCode& status) {
UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
spoofDataIsAcceptable,
NULL, // context, would receive dataVersion if supplied.
&status);
if (U_FAILURE(status)) { return; }
gDefaultSpoofData = new SpoofData(udm, status);
if (U_FAILURE(status)) {
return NULL;
delete gDefaultSpoofData;
return;
}
SpoofData *This = new SpoofData(udm, status);
if (U_FAILURE(status)) {
delete This;
return NULL;
}
if (This == NULL) {
if (gDefaultSpoofData == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
return This;
ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
}
// Hand out the shared default confusables data, loading it on first use.
// On success the returned object has had a reference added on behalf of the
// caller; on failure NULL is returned and status carries the error.
SpoofData* SpoofData::getDefault(UErrorCode& status) {
    umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
    SpoofData *shared = NULL;
    if (!U_FAILURE(status)) {
        shared = gDefaultSpoofData;
        shared->addReference();
    }
    return shared;
}
SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
{
reset();
@ -535,7 +575,7 @@ SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
// fRawData is non-const because it may be constructed by the data builder.
fRawData = reinterpret_cast<SpoofDataHeader *>(
const_cast<void *>(udata_getMemory(udm)));
validateDataVersion(fRawData, status);
validateDataVersion(status);
initPtrs(status);
}
@ -556,7 +596,7 @@ SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
status = U_INVALID_FORMAT_ERROR;
return;
}
validateDataVersion(fRawData, status);
validateDataVersion(status);
initPtrs(status);
}
@ -584,7 +624,7 @@ SpoofData::SpoofData(UErrorCode &status) {
uprv_memset(fRawData, 0, initialSize);
fRawData->fMagic = USPOOF_MAGIC;
fRawData->fFormatVersion[0] = 1;
fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
fRawData->fFormatVersion[1] = 0;
fRawData->fFormatVersion[2] = 0;
fRawData->fFormatVersion[3] = 0;
@ -602,11 +642,7 @@ void SpoofData::reset() {
fRefCount = 1;
fCFUKeys = NULL;
fCFUValues = NULL;
fCFUStringLengths = NULL;
fCFUStrings = NULL;
fAnyCaseTrie = NULL;
fLowerCaseTrie = NULL;
fScriptSets = NULL;
}
@ -628,7 +664,6 @@ void SpoofData::reset() {
void SpoofData::initPtrs(UErrorCode &status) {
fCFUKeys = NULL;
fCFUValues = NULL;
fCFUStringLengths = NULL;
fCFUStrings = NULL;
if (U_FAILURE(status)) {
return;
@ -639,33 +674,13 @@ void SpoofData::initPtrs(UErrorCode &status) {
if (fRawData->fCFUStringIndex != 0) {
fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
}
if (fRawData->fCFUStringLengths != 0) {
fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
}
if (fRawData->fCFUStringTable != 0) {
fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
}
if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {
fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
(char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
}
if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {
fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
(char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
}
if (fRawData->fScriptSets != 0) {
fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
}
}
SpoofData::~SpoofData() {
utrie2_close(fAnyCaseTrie);
fAnyCaseTrie = NULL;
utrie2_close(fLowerCaseTrie);
fLowerCaseTrie = NULL;
if (fDataOwned) {
uprv_free(fRawData);
}
@ -710,6 +725,78 @@ void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
return (char *)fRawData + returnOffset;
}
// Copy the raw data image into a caller-supplied buffer.
//   buf       destination buffer
//   capacity  size of buf in bytes
//   status    set to U_BUFFER_OVERFLOW_ERROR when capacity is too small,
//             in which case nothing is copied
// Returns the number of bytes the image occupies (fLength), whether or not
// the copy took place.
int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
    const int32_t needed = fRawData->fLength;
    if (needed <= capacity) {
        uprv_memcpy(buf, fRawData, needed);
    } else {
        status = U_BUFFER_OVERFLOW_ERROR;
    }
    return needed;
}
// Total size of the raw data image, as recorded in its header (fLength).
int32_t SpoofData::size() const {
    const int32_t totalBytes = fRawData->fLength;
    return totalBytes;
}
//-------------------------------
//
// Front-end APIs for SpoofData
//
//-------------------------------
// Look up the confusable skeleton for a single code point and append it to dest.
//   inChar  the code point to look up
//   dest    string that receives the skeleton (appended, not replaced)
// Returns the value produced by appendValueTo() when the code point has a
// table entry; otherwise the code point maps to itself, is appended as-is,
// and 1 is returned.
int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
    // Perform a binary search.
    // [lo, hi), i.e lo is inclusive, hi is exclusive.
    // The result after the loop will be in lo.
    int32_t lo = 0;
    int32_t hi = length();

    // An empty key table has no entry to probe: the loop below always reads
    // at least one key, so bail out before touching fCFUKeys at all.
    if (hi <= 0) {
        dest.append(inChar);
        return 1;
    }

    do {
        int32_t mid = (lo + hi) / 2;
        UChar32 midChar = codePointAt(mid);
        if (midChar > inChar) {
            hi = mid;
        } else if (midChar < inChar) {
            lo = mid;
        } else {
            // Found result.  Break early.
            lo = mid;
            break;
        }
    } while (hi - lo > 1);

    // Did we find an entry?  If not, the char maps to itself.
    if (codePointAt(lo) != inChar) {
        dest.append(inChar);
        return 1;
    }

    // Add the element to the string builder and return.
    return appendValueTo(lo, dest);
}
// Number of entries in the confusable key table (header field fCFUKeysSize).
int32_t SpoofData::length() const {
    const int32_t entryCount = fRawData->fCFUKeysSize;
    return entryCount;
}
// Code point stored in the key at the given table index.
// No bounds checking is done on index.
UChar32 SpoofData::codePointAt(int32_t index) const {
    const int32_t key = fCFUKeys[index];
    return ConfusableDataUtils::keyToCodePoint(key);
}
// Append the skeleton stored for table entry `index` to dest.
// The length comes from the key; the value slot holds either the single UChar
// itself (length 1) or an offset into the string table (longer skeletons).
// Returns the skeleton length taken from the key.
int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
    const int32_t skeletonLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
    const uint16_t value = fCFUValues[index];

    if (skeletonLength != 1) {
        // Multi-unit skeleton: value is an index into the shared string table.
        dest.append(fCFUStrings + value, skeletonLength);
        return skeletonLength;
    }

    // Single-unit skeleton: value is the character itself.
    dest.append(static_cast<UChar>(value));
    return skeletonLength;
}
U_NAMESPACE_END
@ -741,7 +828,10 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou
pInfo->dataFormat[1]==0x66 &&
pInfo->dataFormat[2]==0x75 &&
pInfo->dataFormat[3]==0x20 &&
pInfo->formatVersion[0]==1 )) {
pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
pInfo->formatVersion[1]==0 &&
pInfo->formatVersion[2]==0 &&
pInfo->formatVersion[3]==0 )) {
udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
"(format version %02x %02x %02x %02x) is not recognized\n",
pInfo->dataFormat[0], pInfo->dataFormat[1],
@ -830,26 +920,6 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou
sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
// String Lengths Section
sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths);
sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
// Any Case Trie
sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie);
sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
// Lower Case Trie
sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie);
sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
// Script Sets. The data is an array of int32_t
sectionStart = ds->readUInt32(spoofDH->fScriptSets);
sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
// And, last, swap the header itself.
// int32_t fMagic // swap this
// uint8_t fFormatVersion[4] // Do not swap this, just copy

View file

@ -15,6 +15,7 @@
#ifndef USPOOFIM_H
#define USPOOFIM_H
#include "uassert.h"
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "unicode/uscript.h"
@ -39,11 +40,13 @@ U_NAMESPACE_BEGIN
// Magic number for sanity checking spoof data.
#define USPOOF_MAGIC 0x3845fdef
class IdentifierInfo;
// Magic number for sanity checking spoof checkers.
#define USPOOF_CHECK_MAGIC 0x2734ecde
class ScriptSet;
class SpoofData;
struct SpoofDataHeader;
struct SpoofStringLengthsElement;
class ConfusableDataUtils;
/**
* Class SpoofImpl corresponds directly to the plain C API opaque type
@ -51,25 +54,20 @@ struct SpoofStringLengthsElement;
*/
class SpoofImpl : public UObject {
public:
SpoofImpl(SpoofData *data, UErrorCode &status);
SpoofImpl();
virtual ~SpoofImpl();
SpoofImpl(SpoofData *data, UErrorCode& status);
SpoofImpl(UErrorCode& status);
SpoofImpl();
void construct(UErrorCode& status);
virtual ~SpoofImpl();
/** Copy constructor, used by the user level uspoof_clone() function.
*/
SpoofImpl(const SpoofImpl &src, UErrorCode &status);
USpoofChecker *asUSpoofChecker();
static SpoofImpl *validateThis(USpoofChecker *sc, UErrorCode &status);
static const SpoofImpl *validateThis(const USpoofChecker *sc, UErrorCode &status);
/** Get the confusable skeleton transform for a single code point.
* The result is a string with a length between 1 and 18.
* @param tableMask bit flag specifying which confusable table to use.
* One of USPOOF_SL_TABLE_FLAG, USPOOF_MA_TABLE_FLAG, etc.
* @return The length in UTF-16 code units of the substition string.
*/
int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &destBuf) const;
/** Set and Get AllowedLocales, implementations of the corresponding API */
void setAllowedLocales(const char *localesList, UErrorCode &status);
const char * getAllowedLocales(UErrorCode &status);
@ -78,26 +76,19 @@ public:
// the specified locale. Part of the implementation of setAllowedLocales.
void addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status);
// Functions implementing the features of UTS 39 section 5.
static void getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status);
void getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const;
void getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const;
void getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& status) const;
URestrictionLevel getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const;
/** Parse a hex number. Utility used by the builders. */
static UChar32 ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status);
// Implementation for Whole Script tests.
// Return the test bit flag to be ORed into the eventual user return value
// if a Spoof opportunity is detected.
void wholeScriptCheck(
const UnicodeString &text, ScriptSet *result, UErrorCode &status) const;
static UClassID U_EXPORT2 getStaticClassID(void);
virtual UClassID getDynamicClassID(void) const;
// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
// Maintain a one-element cache, which is sufficient to avoid repeatedly
// creating new ones unless we get multi-thread concurrency in spoof
// check operations, which should be statistically uncommon.
IdentifierInfo *getIdentifierInfo(UErrorCode &status) const;
void releaseIdentifierInfo(IdentifierInfo *idInfo) const;
//
// Data Members
//
@ -108,14 +99,36 @@ public:
SpoofData *fSpoofData;
const UnicodeSet *fAllowedCharsSet; // The UnicodeSet of allowed characters.
// for this Spoof Checker. Defaults to all chars.
// for this Spoof Checker. Defaults to all chars.
const char *fAllowedLocales; // The list of allowed locales.
URestrictionLevel fRestrictionLevel; // The maximum restriction level for an acceptable identifier.
IdentifierInfo *fCachedIdentifierInfo; // Do not use directly. See getIdentifierInfo().:w
};
/**
* Class CheckResult corresponds directly to the plain C API opaque type
* USpoofCheckResult. One can be cast to the other.
*/
class CheckResult : public UObject {
public:
// Creates an empty result: fMagic is set to USPOOF_CHECK_MAGIC and the
// remaining fields are put in their cleared state.
CheckResult();
virtual ~CheckResult();
// Reinterprets this object as the opaque C API handle type.
USpoofCheckResult *asUSpoofCheckResult();
// Convert a C API handle back to a CheckResult. Returns NULL if status is
// already a failure; sets U_ILLEGAL_ARGUMENT_ERROR for a NULL handle and
// U_INVALID_FORMAT_ERROR when the magic number does not match.
static CheckResult *validateThis(USpoofCheckResult *ptr, UErrorCode &status);
static const CheckResult *validateThis(const USpoofCheckResult *ptr, UErrorCode &status);
// Resets fChecks to 0, empties fNumerics and sets fRestrictionLevel to
// USPOOF_UNDEFINED_RESTRICTIVE. fMagic is not modified.
void clear();
// Used to convert this CheckResult to the older int32_t return value API.
// Returns fChecks, with fRestrictionLevel OR-ed in only when the argument has
// USPOOF_AUX_INFO set and the level is not USPOOF_UNDEFINED_RESTRICTIVE.
int32_t toCombinedBitmask(int32_t expectedChecks);
// Data Members (all held by value)
int32_t fMagic; // Internal sanity check.
int32_t fChecks; // Bit vector of checks that were failed.
UnicodeSet fNumerics; // Set of numerics found in the string.
URestrictionLevel fRestrictionLevel; // The restriction level of the string.
};
//
@ -127,14 +140,7 @@ public:
//
// The keys are stored as a sorted array of 32 bit ints.
// bits 0-23 a code point value
// bits 24-31 flags
// 24: 1 if entry applies to SL table
// 25: 1 if entry applies to SA table
// 26: 1 if entry applies to ML table
// 27: 1 if entry applies to MA table
// 28: 1 if there are multiple entries for this code point.
// 29-30: length of value string, in UChars.
// values are (1, 2, 3, other)
// bits 24-31 length of value string, in UChars (between 1 and 256 UChars).
// The key table is sorted in ascending code point order. (not on the
// 32 bit int value, the length bits do not participate in the sorting.)
//
@ -154,33 +160,25 @@ public:
//
// There is no nul character or other mark between adjacent strings.
//
// String Lengths table
// The length of strings from 1 to 3 is flagged in the key table.
// For strings of length 4 or longer, the string length table provides a
// mapping between an index into the string table and the corresponding length.
// Strings of these lengths are rare, so lookup time is not an issue.
// Each entry consists of
// uint16_t index of the _last_ string with this length
// uint16_t the length
//
// Flag bits in the Key entries
#define USPOOF_SL_TABLE_FLAG (1<<24)
#define USPOOF_SA_TABLE_FLAG (1<<25)
#define USPOOF_ML_TABLE_FLAG (1<<26)
#define USPOOF_MA_TABLE_FLAG (1<<27)
#define USPOOF_KEY_MULTIPLE_VALUES (1<<28)
#define USPOOF_KEY_LENGTH_SHIFT 29
#define USPOOF_KEY_LENGTH_FIELD(x) (((x)>>29) & 3)
struct SpoofStringLengthsElement {
uint16_t fLastString; // index in string table of last string with this length
uint16_t fStrLength; // Length of strings
// Internal functions for manipulating confusable data table keys
#define USPOOF_CONFUSABLE_DATA_FORMAT_VERSION 2 // version for ICU 58
// Helpers for packing and unpacking confusable data table keys.
// A key is a 32 bit value:
//   bits  0-23  the code point
//   bits 24-31  (length of the value string in UChars) - 1, i.e. lengths 1..256
class ConfusableDataUtils {
public:
    // Extract the code point (low 24 bits) from a key.
    inline static UChar32 keyToCodePoint(int32_t key) {
        return key & 0x00ffffff;
    }

    // Extract the value string length (1..256) from the high byte of a key.
    inline static int32_t keyToLength(int32_t key) {
        // Work on the unsigned bit pattern so the shift is a plain logical shift.
        return static_cast<int32_t>(static_cast<uint32_t>(key) >> 24) + 1;
    }

    // Pack a code point and a value string length (1..256) into a key.
    inline static int32_t codePointAndLengthToKey(UChar32 codePoint, int32_t length) {
        U_ASSERT((codePoint & 0x00ffffff) == codePoint);
        U_ASSERT(length >= 1 && length <= 256);
        // Shift in unsigned arithmetic: for lengths above 128 the result sets
        // bit 31, which a signed left shift cannot represent portably.
        const uint32_t lengthBits = static_cast<uint32_t>(length - 1) << 24;
        return codePoint | static_cast<int32_t>(lengthBits);
    }
};
//-------------------------------------------------------------------------------------
//
// SpoofData
@ -197,7 +195,9 @@ struct SpoofStringLengthsElement {
//---------------------------------------------------------------------------------------
class SpoofData: public UMemory {
public:
static SpoofData *getDefault(UErrorCode &status); // Load standard ICU spoof data.
static SpoofData* getDefault(UErrorCode &status); // Get standard ICU spoof data.
static void releaseDefault(); // Cleanup reference to default spoof data.
SpoofData(UErrorCode &status); // Create new spoof data wrapper.
// Only used when building new data from rules.
@ -212,7 +212,8 @@ class SpoofData: public UMemory {
// Check raw Spoof Data Version compatibility.
// Return TRUE it looks good.
static UBool validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status);
UBool validateDataVersion(UErrorCode &status) const;
~SpoofData(); // Destructor not normally used.
// Use removeReference() instead.
// Reference Counting functions.
@ -222,6 +223,35 @@ class SpoofData: public UMemory {
SpoofData *addReference();
void removeReference();
// Reset all fields to an initial state.
// Called from the top of all constructors.
void reset();
// Copy this instance's raw data buffer to the specified address.
int32_t serialize(void *buf, int32_t capacity, UErrorCode &status) const;
// Get the total number of bytes of data backed by this SpoofData.
// Not to be confused with length, which returns the number of confusable entries.
int32_t size() const;
// Get the confusable skeleton transform for a single code point.
// The result is a string with a length between 1 and 18 as of Unicode 9.
// This is the main public endpoint for this class.
// @return The length in UTF-16 code units of the substitution string.
int32_t confusableLookup(UChar32 inChar, UnicodeString &dest) const;
// Get the number of confusable entries in this SpoofData.
int32_t length() const;
// Get the code point (key) at the specified index.
UChar32 codePointAt(int32_t index) const;
// Get the confusable skeleton (value) at the specified index.
// Append it to the specified UnicodeString&.
// @return The length in UTF-16 code units of the skeleton string.
int32_t appendValueTo(int32_t index, UnicodeString& dest) const;
private:
// Reserve space in the raw data. For use by builder when putting together a
// new set of data. Init the new storage to zero, to prevent inconsistent
// results if it is not all otherwise set by the requester.
@ -232,10 +262,6 @@ class SpoofData: public UMemory {
// initialize the pointers from this object to the raw data.
void initPtrs(UErrorCode &status);
// Reset all fields to an initial state.
// Called from the top of all constructors.
void reset();
SpoofDataHeader *fRawData; // Ptr to the raw memory-mapped data
UBool fDataOwned; // True if the raw data is owned, and needs
// to be deleted when refcount goes to zero.
@ -249,15 +275,10 @@ class SpoofData: public UMemory {
// Confusable data
int32_t *fCFUKeys;
uint16_t *fCFUValues;
SpoofStringLengthsElement *fCFUStringLengths;
UChar *fCFUStrings;
// Whole Script Confusable Data
UTrie2 *fAnyCaseTrie;
UTrie2 *fLowerCaseTrie;
ScriptSet *fScriptSets;
};
friend class ConfusabledataBuilder;
};
//---------------------------------------------------------------------------------------
//
@ -286,49 +307,13 @@ struct SpoofDataHeader {
int32_t fCFUStringTable; // byte offset of String table
int32_t fCFUStringTableLen; // length of string table (in 16 bit UChars)
int32_t fCFUStringLengths; // byte offset to String Lengths table
int32_t fCFUStringLengthsSize; // number of entries in lengths table. (2 x 16 bits each)
// The following sections are for data from confusablesWholeScript.txt
int32_t fAnyCaseTrie; // byte offset to the serialized Any Case Trie
int32_t fAnyCaseTrieLength; // Length (bytes) of the serialized Any Case Trie
int32_t fLowerCaseTrie; // byte offset to the serialized Lower Case Trie
int32_t fLowerCaseTrieLength; // Length (bytes) of the serialized Lower Case Trie
int32_t fScriptSets; // byte offset to array of ScriptSets
int32_t fScriptSetsLength; // Number of ScriptSets (24 bytes each)
// The following sections are for data from xidmodifications.txt
int32_t unused[15]; // Padding, Room for Expansion
};
};
//
// Structure for the Whole Script Confusable Data
// See Unicode UAX-39, Unicode Security Mechanisms, for a description of the
// Whole Script confusable data
//
// The data provides mappings from code points to a set of scripts
// that contain characters that might be confused with the code point.
// There are two mappings, one for lower case only, and one for characters
// of any case.
//
// The actual data consists of a utrie2 to map from a code point to an offset,
// and an array of UScriptSets (essentially bit maps) that is indexed
// by the offsets obtained from the Trie.
//
//
U_NAMESPACE_END
#endif /* __cplusplus */

View file

@ -222,7 +222,7 @@ static void TestUSpoofCAPI(void) {
checkResults = uspoof_check(sc2, scMixed, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT, checkResults);
uspoof_close(sc2);
free(buf);
@ -299,7 +299,7 @@ static void TestUSpoofCAPI(void) {
checkResults = uspoof_check(clone2, scMixed, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT, checkResults);
uspoof_close(clone2);
TEST_TEARDOWN;
@ -318,7 +318,7 @@ static void TestUSpoofCAPI(void) {
result = uspoof_check(sc, scMixed, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, result);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT, result);
TEST_TEARDOWN
@ -428,7 +428,7 @@ static void TestUSpoofCAPI(void) {
checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
TEST_ASSERT_EQ(0, checkResults);
TEST_TEARDOWN;
/*
@ -436,7 +436,7 @@ static void TestUSpoofCAPI(void) {
*/
TEST_SETUP
char utf8buf[200];
int32_t checkResults;
int32_t checkResults, checkResults2;
int32_t position;
u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodLatin, -1, &status);
@ -457,12 +457,61 @@ static void TestUSpoofCAPI(void) {
TEST_ASSERT_SUCCESS(status);
position = 666;
checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
checkResults2 = uspoof_check(sc, scMixed, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT , checkResults);
TEST_ASSERT_EQ(0, position);
TEST_ASSERT_EQ(checkResults , checkResults2);
TEST_TEARDOWN;
/*
* uspoof_check2 variants
*/
TEST_SETUP
int32_t result1, result2;
char utf8buf[200];
uspoof_setChecks(sc, USPOOF_ALL_CHECKS | USPOOF_AUX_INFO, &status);
USpoofCheckResult* checkResult = uspoof_openCheckResult(&status);
TEST_ASSERT_SUCCESS(status);
const UChar* tests[] = { goodLatin, scMixed, scLatin,
goodCyrl, goodGreek, lll_Latin_a, lll_Latin_b, han_Hiragana };
for (int32_t i=0; i<sizeof(tests)/sizeof(UChar*); i++) {
const UChar* str = tests[i];
// Basic test
result1 = uspoof_check(sc, str, -1, NULL, &status);
result2 = uspoof_check2(sc, str, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(result1, result2);
// With check result parameter
result1 = uspoof_check(sc, str, -1, NULL, &status);
result2 = uspoof_check2(sc, str, -1, checkResult, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(result1, result2);
// Checks from checkResult should be same as those from bitmask
TEST_ASSERT_EQ(result1 & USPOOF_ALL_CHECKS, uspoof_getCheckResultChecks(checkResult, &status));
// Restriction level from checkResult should be same as that from bitmask
URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult, &status);
TEST_ASSERT_EQ(result1 & restrictionLevel, restrictionLevel);
// UTF8 endpoint
u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodLatin, -1, &status);
TEST_ASSERT_SUCCESS(status);
result1 = uspoof_checkUTF8(sc, utf8buf, -1, NULL, &status);
result2 = uspoof_check2UTF8(sc, utf8buf, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(result1, result2);
}
uspoof_closeCheckResult(checkResult);
TEST_TEARDOWN;
/*
* uspoof_areConfusable()
*/

View file

@ -803,8 +803,7 @@ group: charset_detector
uclean_i18n
group: spoof_detection
uspoof.o uspoof_build.o uspoof_conf.o uspoof_impl.o uspoof_wsconf.o
identifier_info.o scriptset.o
uspoof.o uspoof_build.o uspoof_conf.o uspoof_impl.o scriptset.o
deps
uniset_props regex unorm uscript

View file

@ -23,7 +23,6 @@
#include "unicode/uspoof.h"
#include "cstring.h"
#include "identifier_info.h"
#include "scriptset.h"
#include "uhash.h"
@ -58,11 +57,15 @@
USpoofChecker *sc; \
sc = uspoof_open(&status); \
TEST_ASSERT_SUCCESS(status); \
USpoofCheckResult *checkResult; \
checkResult = uspoof_openCheckResult(&status); \
TEST_ASSERT_SUCCESS(status); \
if (U_SUCCESS(status)){
#define TEST_TEARDOWN \
} \
TEST_ASSERT_SUCCESS(status); \
uspoof_closeCheckResult(checkResult); \
uspoof_close(sc); \
}
@ -81,7 +84,6 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
TESTCASE_AUTO(testInvisible);
TESTCASE_AUTO(testConfData);
TESTCASE_AUTO(testBug8654);
TESTCASE_AUTO(testIdentifierInfo);
TESTCASE_AUTO(testScriptSet);
TESTCASE_AUTO(testRestrictionLevel);
TESTCASE_AUTO(testMixedNumbers);
@ -105,6 +107,7 @@ void IntlTestSpoof::testSpoofAPI() {
UnicodeString s1("cxs");
UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs"
int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
TEST_TEARDOWN;
@ -223,8 +226,9 @@ void IntlTestSpoof::testAreConfusable() {
"A long string that will overflow stack buffers. A long string that will overflow stack buffers. ");
UnicodeString s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "
"A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. ");
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));
int32_t result = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, result);
TEST_TEARDOWN;
}
@ -398,146 +402,6 @@ void IntlTestSpoof::testConfData() {
}
}
// testIdentifierInfo. Note that IdentifierInfo is not public ICU API at this time
void IntlTestSpoof::testIdentifierInfo() {
UErrorCode status = U_ZERO_ERROR;
ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status);
ScriptSet bitset2; bitset2.set(USCRIPT_HANGUL, status);
TEST_ASSERT(bitset12.contains(bitset2));
TEST_ASSERT(bitset12.contains(bitset12));
TEST_ASSERT(!bitset2.contains(bitset12));
ScriptSet arabSet; arabSet.set(USCRIPT_ARABIC, status);
ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status);
UElement arabEl; arabEl.pointer = &arabSet;
UElement latinEl; latinEl.pointer = &latinSet;
TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0);
TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0);
UnicodeString scriptString;
bitset12.displayScripts(scriptString);
TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString);
status = U_ZERO_ERROR;
UHashtable *alternates = uhash_open(uhash_hashScriptSet ,uhash_compareScriptSet, NULL, &status);
uhash_puti(alternates, &bitset12, 1, &status);
uhash_puti(alternates, &bitset2, 1, &status);
UnicodeString alternatesString;
IdentifierInfo::displayAlternates(alternatesString, alternates, status);
TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang; Hang Latn") == alternatesString);
TEST_ASSERT_SUCCESS(status);
status = U_ZERO_ERROR;
ScriptSet tScriptSet;
tScriptSet.parseScripts(scriptString, status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(bitset12 == tScriptSet);
UnicodeString ss;
ss.remove();
uhash_close(alternates);
struct Test {
const char *fTestString;
URestrictionLevel fRestrictionLevel;
const char *fNumerics;
const char *fScripts;
const char *fAlternates;
const char *fCommonAlternates;
} tests[] = {
{"\\u0061\\u2665", USPOOF_UNRESTRICTIVE, "[]", "Latn", "", ""},
{"\\u0061\\u303C", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"},
{"\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hira Kana", "Hira Kana"},
{"\\u0061\\u30FC\\u303C\\u30A2", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
{"\\u30A2\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
{"\\u0061\\u0031\\u0661", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660]", "Latn", "Arab Thaa", "Arab Thaa"},
{"\\u0061\\u0031\\u0661\\u06F1", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660\\u06F0]", "Latn Arab", "", ""},
{"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_UNRESTRICTIVE,
"[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
{"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", USPOOF_UNRESTRICTIVE,
"[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}
};
int testNum;
for (testNum = 0; testNum < UPRV_LENGTHOF(tests); testNum++) {
char testNumStr[40];
sprintf(testNumStr, "testNum = %d", testNum);
Test &test = tests[testNum];
status = U_ZERO_ERROR;
UnicodeString testString(test.fTestString); // Note: may do charset conversion.
testString = testString.unescape();
IdentifierInfo idInfo(status);
TEST_ASSERT_SUCCESS(status);
UnicodeSet allowedChars;
// Allowed Identifier Characters. In addition to the Recommended Set,
// allow u303c, which has an interesting script extension of Hani Hira Kana.
allowedChars.addAll(*uspoof_getRecommendedUnicodeSet(&status)).add(0x303C);
idInfo.setIdentifierProfile(allowedChars);
idInfo.setIdentifier(testString, status);
TEST_ASSERT_MSG(*idInfo.getIdentifier() == testString, testNumStr);
URestrictionLevel restrictionLevel = test.fRestrictionLevel;
TEST_ASSERT_MSG(restrictionLevel == idInfo.getRestrictionLevel(status), testNumStr);
status = U_ZERO_ERROR;
UnicodeSet numerics(UnicodeString(test.fNumerics).unescape(), status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_MSG(numerics == *idInfo.getNumerics(), testNumStr);
ScriptSet scripts;
scripts.parseScripts(UnicodeString(test.fScripts), status);
TEST_ASSERT_MSG(scripts == *idInfo.getScripts(), testNumStr);
UnicodeString alternatesStr;
IdentifierInfo::displayAlternates(alternatesStr, idInfo.getAlternates(), status);
TEST_ASSERT_MSG(UnicodeString(test.fAlternates) == alternatesStr, testNumStr);
ScriptSet commonAlternates;
commonAlternates.parseScripts(UnicodeString(test.fCommonAlternates), status);
TEST_ASSERT_MSG(commonAlternates == *idInfo.getCommonAmongAlternates(), testNumStr);
}
// Test of getScriptCount()
// Script and/or Script Extensions for the chars used in the tests
// \\u3013 ; Bopo Hang Hani Hira Kana # So GETA MARK
// \\uA838 ; Deva Gujr Guru Kthi Takr # Sc NORTH INDIC RUPEE MARK
// \\u0951 ; Deva Latn # Mn DEVANAGARI STRESS SIGN UDATTA
//
// \\u0370 ; Greek # L GREEK CAPITAL LETTER HETA
// \\u0481 ; Cyrillic # L& CYRILLIC SMALL LETTER KOPPA
// \\u0904 ; Devanagari # Lo DEVANAGARI LETTER SHORT A
// \\u3041 ; Hiragana # Lo HIRAGANA LETTER SMALL A
// 1234 ; Common # ascii digits
// \\u0300 ; Inherited # Mn COMBINING GRAVE ACCENT
struct ScriptTest {
const char *fTestString;
int32_t fScriptCount;
} scriptTests[] = {
{"Hello", 1},
{"Hello\\u0370", 2},
{"1234", 0},
{"Hello1234\\u0300", 1}, // Common and Inherited are ignored.
{"\\u0030", 0},
{"abc\\u0951", 1},
{"abc\\u3013", 2},
{"\\uA838\\u0951", 1}, // Triggers commonAmongAlternates path.
{"\\u3013\\uA838", 2}
};
status = U_ZERO_ERROR;
IdentifierInfo identifierInfo(status);
for (testNum=0; testNum<UPRV_LENGTHOF(scriptTests); testNum++) {
ScriptTest &test = scriptTests[testNum];
char msgBuf[100];
sprintf(msgBuf, "testNum = %d ", testNum);
UnicodeString testString = UnicodeString(test.fTestString).unescape();
status = U_ZERO_ERROR;
identifierInfo.setIdentifier(testString, status);
int32_t scriptCount = identifierInfo.getScriptCount();
TEST_ASSERT_MSG(test.fScriptCount == scriptCount, msgBuf);
}
}
void IntlTestSpoof::testScriptSet() {
ScriptSet s1;
@ -600,6 +464,14 @@ void IntlTestSpoof::testScriptSet() {
s2.intersect(s1);
TEST_ASSERT(s2.countMembers() == 1);
s1.resetAll();
TEST_ASSERT(s1.isEmpty());
s1.set(USCRIPT_LATIN, status);
TEST_ASSERT(!s1.isEmpty());
s1.setAll();
TEST_ASSERT(!s1.isEmpty());
TEST_ASSERT_SUCCESS(status);
s1.resetAll();
s1.set(USCRIPT_AFAKA, status);
s1.set(USCRIPT_VAI, status);
@ -616,6 +488,39 @@ void IntlTestSpoof::testScriptSet() {
}
}
TEST_ASSERT_SUCCESS(status);
// Script extensions. Depends on data.
s1.resetAll();
s1.setScriptExtensions(0x67, status);
TEST_ASSERT(s1.countMembers() == 1);
TEST_ASSERT(s1.test(USCRIPT_LATIN, status));
TEST_ASSERT_SUCCESS(status);
s1.resetAll();
s1.setScriptExtensions(0x303C, status);
TEST_ASSERT(s1.countMembers() == 3);
TEST_ASSERT(s1.test(USCRIPT_HAN, status));
TEST_ASSERT(s1.test(USCRIPT_HIRAGANA, status));
TEST_ASSERT(s1.test(USCRIPT_KATAKANA, status));
TEST_ASSERT_SUCCESS(status);
// Additional tests
ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status);
ScriptSet bitset2; bitset2.set(USCRIPT_HANGUL, status);
TEST_ASSERT(bitset12.contains(bitset2));
TEST_ASSERT(bitset12.contains(bitset12));
TEST_ASSERT(!bitset2.contains(bitset12));
ScriptSet arabSet; arabSet.set(USCRIPT_ARABIC, status);
ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status);
UElement arabEl; arabEl.pointer = &arabSet;
UElement latinEl; latinEl.pointer = &latinSet;
TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0);
TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0);
UnicodeString scriptString;
bitset12.displayScripts(scriptString);
TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString);
}
@ -629,35 +534,40 @@ void IntlTestSpoof::testRestrictionLevel() {
{"\\u03B3", USPOOF_SINGLE_SCRIPT_RESTRICTIVE},
{"\\u0061\\u30A2\\u30FC", USPOOF_HIGHLY_RESTRICTIVE},
{"\\u0061\\u0904", USPOOF_MODERATELY_RESTRICTIVE},
{"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE}
{"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE},
{"\\u0061\\u2665", USPOOF_UNRESTRICTIVE},
{"\\u0061\\u303C", USPOOF_HIGHLY_RESTRICTIVE},
{"\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE},
{"\\u0061\\u30FC\\u303C\\u30A2", USPOOF_HIGHLY_RESTRICTIVE},
{"\\u30A2\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE},
{"\\u0061\\u0031\\u0661", USPOOF_MODERATELY_RESTRICTIVE},
{"\\u0061\\u0031\\u0661\\u06F1", USPOOF_MODERATELY_RESTRICTIVE},
{"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_MINIMALLY_RESTRICTIVE},
{"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", USPOOF_MINIMALLY_RESTRICTIVE}
};
char msgBuffer[100];
URestrictionLevel restrictionLevels[] = { USPOOF_ASCII, USPOOF_SINGLE_SCRIPT_RESTRICTIVE,
USPOOF_HIGHLY_RESTRICTIVE, USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE,
USPOOF_UNRESTRICTIVE};
USPOOF_HIGHLY_RESTRICTIVE, USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE,
USPOOF_UNRESTRICTIVE};
UErrorCode status = U_ZERO_ERROR;
IdentifierInfo idInfo(status);
TEST_ASSERT_SUCCESS(status);
idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status));
TEST_ASSERT_SUCCESS(status);
UnicodeSet allowedChars;
// Allowed Identifier Characters. In addition to the Recommended Set,
// allow u303c, which has an interesting script extension of Hani Hira Kana.
allowedChars.addAll(*uspoof_getRecommendedUnicodeSet(&status)).add(0x303C);
for (int32_t testNum=0; testNum < UPRV_LENGTHOF(tests); testNum++) {
status = U_ZERO_ERROR;
const Test &test = tests[testNum];
UnicodeString testString = UnicodeString(test.fId).unescape();
URestrictionLevel expectedLevel = test.fExpectedRestrictionLevel;
idInfo.setIdentifier(testString, status);
sprintf(msgBuffer, "testNum = %d ", testNum);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_MSG(expectedLevel == idInfo.getRestrictionLevel(status), msgBuffer);
for (int levelIndex=0; levelIndex<UPRV_LENGTHOF(restrictionLevels); levelIndex++) {
status = U_ZERO_ERROR;
URestrictionLevel levelSetInSpoofChecker = restrictionLevels[levelIndex];
USpoofChecker *sc = uspoof_open(&status);
uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status);
uspoof_setAllowedChars(sc, uspoof_getRecommendedSet(&status), &status);
uspoof_setAllowedChars(sc, allowedChars.toUSet(), &status);
uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker);
uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status);
int32_t actualValue = uspoof_checkUnicodeString(sc, testString, NULL, &status);
// we want to fail if the text is (say) MODERATE and the testLevel is ASCII
@ -665,9 +575,6 @@ void IntlTestSpoof::testRestrictionLevel() {
if (expectedLevel > levelSetInSpoofChecker) {
expectedValue |= USPOOF_RESTRICTION_LEVEL;
}
if (!uspoof_getRecommendedUnicodeSet(&status)->containsAll(testString)) {
expectedValue |= USPOOF_CHAR_LIMIT;
}
sprintf(msgBuffer, "testNum = %d, levelIndex = %d, expected = %#x, actual = %#x",
testNum, levelIndex, expectedValue, actualValue);
TEST_ASSERT_MSG(expectedValue == actualValue, msgBuffer);
@ -675,9 +582,9 @@ void IntlTestSpoof::testRestrictionLevel() {
// Run the same check again, with the Spoof Checker configured to return
// the actual restriction level.
uspoof_setChecks(sc, USPOOF_AUX_INFO | USPOOF_RESTRICTION_LEVEL, &status);
uspoof_setAllowedChars(sc, uspoof_getRecommendedSet(&status), &status);
uspoof_setAllowedChars(sc, allowedChars.toUSet(), &status);
uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker);
uspoof_setChecks(sc, USPOOF_AUX_INFO | USPOOF_RESTRICTION_LEVEL, &status);
int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status);
TEST_ASSERT_SUCCESS(status);
if (U_SUCCESS(status)) {
@ -687,8 +594,8 @@ void IntlTestSpoof::testRestrictionLevel() {
uspoof_close(sc);
}
}
}
}
void IntlTestSpoof::testMixedNumbers() {
struct Test {
@ -698,10 +605,18 @@ void IntlTestSpoof::testMixedNumbers() {
{"1", "[0]"},
{"\\u0967", "[\\u0966]"},
{"1\\u0967", "[0\\u0966]"},
{"\\u0661\\u06F1", "[\\u0660\\u06F0]"}
{"\\u0661\\u06F1", "[\\u0660\\u06F0]"},
{"\\u0061\\u2665", "[]"},
{"\\u0061\\u303C", "[]"},
{"\\u0061\\u30FC\\u303C", "[]"},
{"\\u0061\\u30FC\\u303C\\u30A2", "[]"},
{"\\u30A2\\u0061\\u30FC\\u303C", "[]"},
{"\\u0061\\u0031\\u0661", "[\\u0030\\u0660]"},
{"\\u0061\\u0031\\u0661\\u06F1", "[\\u0030\\u0660\\u06F0]"},
{"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", "[\\u0030\\u0660\\u06F0\\u0966]"},
{"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", "[\\u0030\\u0660\\u06F0\\u0966]"}
};
UErrorCode status = U_ZERO_ERROR;
IdentifierInfo idInfo(status);
for (int32_t testNum=0; testNum < UPRV_LENGTHOF(tests); testNum++) {
char msgBuf[100];
sprintf(msgBuf, "testNum = %d ", testNum);
@ -710,17 +625,16 @@ void IntlTestSpoof::testMixedNumbers() {
status = U_ZERO_ERROR;
UnicodeString testString = UnicodeString(test.fTestString).unescape();
UnicodeSet expectedSet(UnicodeString(test.fExpectedSet).unescape(), status);
idInfo.setIdentifier(testString, status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_MSG(expectedSet == *idInfo.getNumerics(), msgBuf);
status = U_ZERO_ERROR;
USpoofChecker *sc = uspoof_open(&status);
uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this
int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status);
UBool mixedNumberFailure = ((result & USPOOF_MIXED_NUMBERS) != 0);
TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf);
uspoof_close(sc);
TEST_SETUP
uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this
uspoof_check2UnicodeString(sc, testString, checkResult, &status);
UBool mixedNumberFailure = ((uspoof_getCheckResultChecks(checkResult, &status) & USPOOF_MIXED_NUMBERS) != 0);
TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf);
const UnicodeSet* actualSet = UnicodeSet::fromUSet(uspoof_getCheckResultNumerics(checkResult, &status));
TEST_ASSERT_MSG(expectedSet == *actualSet, msgBuf);
TEST_TEARDOWN
}
}

View file

@ -38,8 +38,6 @@ public:
void testBug8654();
void testIdentifierInfo();
void testScriptSet();
void testRestrictionLevel();

View file

@ -16,17 +16,20 @@
// derived from the Unicode Consortium data described in
// Unicode UAX 39.
//
// Usage: gencfu [options] -r confusables-file.txt -w whole-script-confusables.txt -o output-file.cfu
// Usage: gencfu [options] -r confusables-file.txt -o output-file.cfu
//
// options: -v verbose
// -? or -h help
//
// The input rule files are plain text files containing confusable character
// definitions in the input format defined by Unicode UAX39 for the files
// confusables.txt and confusablesWholeScript.txt. This source (.txt) format
// confusables.txt. This source (.txt) format
// is also accepted directly by ICU spoof detectors. The
// files must be encoded in utf-8 format, with or without a BOM.
//
// This tool also used to compile confusablesWholeScript.txt into the CFU file,
// until the Unicode Consortium deprecated that file.
//
//--------------------------------------------------------------------
#include "unicode/utypes.h"
@ -53,7 +56,7 @@ static UOption options[]={
UOPTION_HELP_QUESTION_MARK, /* 1 */
UOPTION_VERBOSE, /* 2 */
{ "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */
{ "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0}, /* 4 */
{ "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0}, /* 4 */ // deprecated
{ "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 5 */
UOPTION_ICUDATADIR, /* 6 */
UOPTION_DESTDIR, /* 7 */
@ -62,7 +65,7 @@ static UOption options[]={
};
void usageAndDie(int retCode) {
printf("Usage: %s [-v] [-options] -r confusablesRules.txt -w wholeScriptConfusables.txt -o output-file\n", progName);
printf("Usage: %s [-v] [-options] -r confusablesRules.txt -o output-file\n", progName);
printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
"options:\n"
"\t-h or -? or --help this usage text\n"
@ -133,7 +136,6 @@ static const char *readFile(const char *fileName, int32_t *len);
int main(int argc, char **argv) {
UErrorCode status = U_ZERO_ERROR;
const char *confFileName;
const char *confWSFileName;
const char *outFileName;
const char *outDir = NULL;
const char *copyright = NULL;
@ -156,12 +158,11 @@ int main(int argc, char **argv) {
usageAndDie(0);
}
if (!(options[3].doesOccur && options[4].doesOccur && options[5].doesOccur)) {
fprintf(stderr, "confusables file, whole script confusables file and output file must all be specified.\n");
if (!(options[3].doesOccur && options[5].doesOccur)) {
fprintf(stderr, "confusables file and output file must all be specified.\n");
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
}
confFileName = options[3].value;
confWSFileName = options[4].value;
outFileName = options[5].value;
if (options[6].doesOccur) {
@ -220,13 +221,6 @@ int main(int argc, char **argv) {
exit(-1);
}
int32_t wsConfusablesLen = 0;
const char *wsConfsables = readFile(confWSFileName, &wsConfusablesLen);
if (wsConfsables == NULL) {
printf("gencfu: error reading file \"%s\"\n", confFileName);
exit(-1);
}
//
// Create the Spoof Detector from the source confusables files.
// This will compile the data.
@ -236,13 +230,11 @@ int main(int argc, char **argv) {
parseError.offset = 0;
int32_t errType;
USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen,
wsConfsables, wsConfusablesLen,
NULL, 0,
&errType, &parseError, &status);
if (U_FAILURE(status)) {
const char *errFile =
(errType == USPOOF_WHOLE_SCRIPT_CONFUSABLE)? confWSFileName : confFileName;
fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\" at file %s, line %d, column %d\n",
u_errorName(status), errFile, (int)parseError.line, (int)parseError.offset);
u_errorName(status), confFileName, (int)parseError.line, (int)parseError.offset);
exit(status);
};
@ -297,7 +289,6 @@ int main(int argc, char **argv) {
uspoof_close(sc);
delete [] outData;
delete [] confusables;
delete [] wsConfsables;
u_cleanup();
if (!quiet) {
printf("gencfu: tool completed successfully.\n");