mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-12549 Updating SpoofChecker to latest Unicode specification.
X-SVN-Rev: 39218
This commit is contained in:
parent
85f8d034a7
commit
2ceb565df3
19 changed files with 1721 additions and 1404 deletions
|
@ -362,7 +362,7 @@ public:
|
|||
UnicodeSet();
|
||||
|
||||
/**
|
||||
* Constructs a set containing the given range. If <code>end >
|
||||
* Constructs a set containing the given range. If <code>end <
|
||||
* start</code> then an empty set is created.
|
||||
*
|
||||
* @param start first character, inclusive, of range
|
||||
|
|
|
@ -92,10 +92,10 @@ csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.
|
|||
wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o zonemeta.o \
|
||||
standardplural.o upluralrules.o plurrule.o plurfmt.o selfmt.o dtitvfmt.o dtitvinf.o udateintervalformat.o \
|
||||
tmunit.o tmutamt.o tmutfmt.o currpinf.o \
|
||||
uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o uspoof_wsconf.o decfmtst.o smpdtfst.o \
|
||||
uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o decfmtst.o smpdtfst.o \
|
||||
ztrans.o zrule.o vzone.o fphdlimp.o fpositer.o ufieldpositer.o \
|
||||
decNumber.o decContext.o alphaindex.o tznames.o tznames_impl.o tzgnames.o \
|
||||
tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o identifier_info.o \
|
||||
tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o \
|
||||
uregion.o reldatefmt.o quantityformatter.o measunit.o \
|
||||
sharedbreakiterator.o scientificnumberformatter.o digitgrouping.o \
|
||||
digitinterval.o digitformatter.o digitaffix.o valueformatter.o \
|
||||
|
|
|
@ -337,7 +337,6 @@
|
|||
<ClCompile Include="gregocal.cpp" />
|
||||
<ClCompile Include="gregoimp.cpp" />
|
||||
<ClCompile Include="hebrwcal.cpp" />
|
||||
<ClCompile Include="identifier_info.cpp" />
|
||||
<ClCompile Include="indiancal.cpp" />
|
||||
<ClCompile Include="islamcal.cpp" />
|
||||
<ClCompile Include="japancal.cpp" />
|
||||
|
@ -464,7 +463,6 @@
|
|||
<ClCompile Include="uspoof_build.cpp" />
|
||||
<ClCompile Include="uspoof_conf.cpp" />
|
||||
<ClCompile Include="uspoof_impl.cpp" />
|
||||
<ClCompile Include="uspoof_wsconf.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="unicode\alphaindex.h">
|
||||
|
@ -1686,11 +1684,9 @@
|
|||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
</CustomBuild>
|
||||
<ClInclude Include="identifier_info.h" />
|
||||
<ClInclude Include="scriptset.h" />
|
||||
<ClInclude Include="uspoof_conf.h" />
|
||||
<ClInclude Include="uspoof_impl.h" />
|
||||
<ClInclude Include="uspoof_wsconf.h" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ResourceCompile Include="i18n.rc" />
|
||||
|
|
|
@ -501,9 +501,6 @@
|
|||
<ClCompile Include="ucsdet.cpp">
|
||||
<Filter>charset detect</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="identifier_info.cpp">
|
||||
<Filter>spoof</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="scriptset.cpp">
|
||||
<Filter>spoof</Filter>
|
||||
</ClCompile>
|
||||
|
@ -519,9 +516,6 @@
|
|||
<ClCompile Include="uspoof_impl.cpp">
|
||||
<Filter>spoof</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="uspoof_wsconf.cpp">
|
||||
<Filter>spoof</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="alphaindex.cpp">
|
||||
<Filter>collation</Filter>
|
||||
</ClCompile>
|
||||
|
@ -943,9 +937,6 @@
|
|||
<ClInclude Include="inputext.h">
|
||||
<Filter>charset detect</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="identifier_info.h">
|
||||
<Filter>spoof</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="scriptset.h">
|
||||
<Filter>spoof</Filter>
|
||||
</ClInclude>
|
||||
|
@ -955,9 +946,6 @@
|
|||
<ClInclude Include="uspoof_impl.h">
|
||||
<Filter>spoof</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="uspoof_wsconf.h">
|
||||
<Filter>spoof</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="tzgnames.h">
|
||||
<Filter>formatting</Filter>
|
||||
</ClInclude>
|
||||
|
|
|
@ -193,6 +193,15 @@ int32_t ScriptSet::nextSetBit(int32_t fromIndex) const {
|
|||
return -1;
|
||||
}
|
||||
|
||||
// Reports whether this set contains no scripts at all, i.e. whether every
// word of the backing bit array is zero.
//
// @return TRUE if no bit is set in any element of bits[], FALSE otherwise.
UBool ScriptSet::isEmpty() const {
    // OR all words together; the result is zero only if each word is zero.
    uint32_t combined = 0;
    for (uint32_t word = 0; word < UPRV_LENGTHOF(bits); ++word) {
        combined |= bits[word];
    }
    return (combined == 0) ? TRUE : FALSE;
}
|
||||
|
||||
UnicodeString &ScriptSet::displayScripts(UnicodeString &dest) const {
|
||||
UBool firstTime = TRUE;
|
||||
for (int32_t i = nextSetBit(0); i >= 0; i = nextSetBit(i + 1)) {
|
||||
|
@ -240,6 +249,41 @@ ScriptSet &ScriptSet::parseScripts(const UnicodeString &scriptString, UErrorCode
|
|||
return *this;
|
||||
}
|
||||
|
||||
// Looks up the script extensions of codePoint with uscript_getScriptExtensions()
// and sets the bit for each returned script in this ScriptSet. Bits that are
// already set are left untouched (this function only calls set()).
//
// @param codePoint the code point whose script extensions are added.
// @param status    in/out error code. Nothing is done if it already indicates
//                  a failure on entry. On return it is set to
//                  U_MEMORY_ALLOCATION_ERROR if the scratch buffer cannot be
//                  grown, to the error reported by
//                  uscript_getScriptExtensions() if the lookup fails for any
//                  reason other than buffer overflow, or to whatever set()
//                  reports.
void ScriptSet::setScriptExtensions(UChar32 codePoint, UErrorCode& status) {
    if (U_FAILURE(status)) { return; }
    static const int32_t FIRST_GUESS_SCRIPT_CAPACITY = 5;
    MaybeStackArray<UScriptCode,FIRST_GUESS_SCRIPT_CAPACITY> scripts;
    // A separate status is used so that the expected, recoverable
    // U_BUFFER_OVERFLOW_ERROR never leaks into the caller's status.
    UErrorCode internalStatus = U_ZERO_ERROR;
    int32_t script_count = -1;

    while (TRUE) {
        // Pass the buffer's *current* capacity. Passing the initial guess here
        // would make every retry overflow again after the resize below, so the
        // loop would never terminate for code points with more than
        // FIRST_GUESS_SCRIPT_CAPACITY script extensions.
        script_count = uscript_getScriptExtensions(
            codePoint, scripts.getAlias(), scripts.getCapacity(), &internalStatus);
        if (internalStatus == U_BUFFER_OVERFLOW_ERROR) {
            // Need to allocate more space; script_count holds the required size.
            if (scripts.resize(script_count) == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            internalStatus = U_ZERO_ERROR;
        } else {
            break;
        }
    }

    // Check if we failed for some reason other than buffer overflow
    if (U_FAILURE(internalStatus)) {
        status = internalStatus;
        return;
    }

    // Load the scripts into the ScriptSet and return
    for (int32_t i = 0; i < script_count; i++) {
        this->set(scripts[i], status);
        if (U_FAILURE(status)) { return; }
    }
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
|
|
|
@ -58,9 +58,14 @@ class U_I18N_API ScriptSet: public UMemory {
|
|||
int32_t hashCode() const;
|
||||
int32_t nextSetBit(int32_t script) const;
|
||||
|
||||
UBool isEmpty() const;
|
||||
|
||||
UnicodeString &displayScripts(UnicodeString &dest) const; // append script names to dest string.
|
||||
ScriptSet & parseScripts(const UnicodeString &scriptsString, UErrorCode &status); // Replaces ScriptSet contents.
|
||||
|
||||
// Wraps around UScript::getScriptExtensions() and adds the corresponding scripts to this instance.
|
||||
void setScriptExtensions(UChar32 codePoint, UErrorCode& status);
|
||||
|
||||
private:
|
||||
uint32_t bits[6];
|
||||
};
|
||||
|
|
|
@ -26,8 +26,8 @@ as the functions are suppose to be called.
|
|||
It's usually best to have child dependencies called first. */
|
||||
typedef enum ECleanupI18NType {
|
||||
UCLN_I18N_START = -1,
|
||||
UCLN_I18N_IDENTIFIER_INFO,
|
||||
UCLN_I18N_SPOOF,
|
||||
UCLN_I18N_SPOOFDATA,
|
||||
UCLN_I18N_TRANSLITERATOR,
|
||||
UCLN_I18N_REGEX,
|
||||
UCLN_I18N_ISLAMIC_CALENDAR,
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -22,7 +22,6 @@
|
|||
#include "unicode/utf16.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "identifier_info.h"
|
||||
#include "mutex.h"
|
||||
#include "scriptset.h"
|
||||
#include "uassert.h"
|
||||
|
@ -42,9 +41,7 @@ U_NAMESPACE_USE
|
|||
static UnicodeSet *gInclusionSet = NULL;
|
||||
static UnicodeSet *gRecommendedSet = NULL;
|
||||
static const Normalizer2 *gNfdNormalizer = NULL;
|
||||
static SpoofData *gDefaultSpoofData = NULL;
|
||||
static UInitOnce gSpoofInitStaticsOnce = U_INITONCE_INITIALIZER;
|
||||
static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
|
||||
|
||||
static UBool U_CALLCONV
|
||||
uspoof_cleanup(void) {
|
||||
|
@ -53,83 +50,78 @@ uspoof_cleanup(void) {
|
|||
delete gRecommendedSet;
|
||||
gRecommendedSet = NULL;
|
||||
gNfdNormalizer = NULL;
|
||||
if (gDefaultSpoofData) {
|
||||
gDefaultSpoofData->removeReference(); // Will delete, assuming all user-level spoof checkers were closed.
|
||||
}
|
||||
gDefaultSpoofData = NULL;
|
||||
gSpoofInitStaticsOnce.reset();
|
||||
gSpoofInitDefaultOnce.reset();
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static void U_CALLCONV initializeStatics(UErrorCode &status) {
|
||||
static const char *inclusionPat =
|
||||
"[\\u0027\\u002D-\\u002E\\u003A\\u00B7\\u0375\\u058A\\u05F3-\\u05F4"
|
||||
"\\u06FD-\\u06FE\\u0F0B\\u200C-\\u200D\\u2010\\u2019\\u2027\\u30A0\\u30FB]";
|
||||
"['\\-.\\:\\u00B7\\u0375\\u058A\\u05F3\\u05F4\\u06FD\\u06FE\\u0F0B\\u200C\\u200D\\u2010\\u"
|
||||
"2019\\u2027\\u30A0\\u30FB]";
|
||||
gInclusionSet = new UnicodeSet(UnicodeString(inclusionPat, -1, US_INV), status);
|
||||
gInclusionSet->freeze();
|
||||
|
||||
// Note: data from http://unicode.org/Public/security/latest/xidmodifications.txt version 8.0.0
|
||||
// There is no tooling to generate this from the .txt file, hand extracted with editor macros.
|
||||
// Ultimately, data will be available as character properties, eliminating this.
|
||||
|
||||
// Note: data from http://unicode.org/Public/security/9.0.0/IdentifierStatus.txt
|
||||
// There is tooling to generate this constant in the unicodetools project:
|
||||
// org.unicode.text.tools.RecommendedSetGenerator
|
||||
// It will print the Java and C++ code to the console for easy copy-paste into this file.
|
||||
// Note: concatenated string constants do not work with UNICODE_STRING_SIMPLE on all platforms.
|
||||
static const char *recommendedPat =
|
||||
"[\\u0030-\\u0039\\u0041-\\u005A\\u005F\\u0061-\\u007A\\u00C0-\\u00D6\\u00D8-\\u00F6"
|
||||
"\\u00F8-\\u0131\\u0134-\\u013E\\u0141-\\u0148\\u014A-\\u017E\\u018F\\u01A0-\\u01A1"
|
||||
"\\u01AF-\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B"
|
||||
"\\u021E-\\u021F\\u0226-\\u0233\\u0259\\u02BB-\\u02BC\\u02EC\\u0300-\\u0304\\u0306-\\u030C"
|
||||
"\\u030F-\\u0311\\u0313-\\u0314\\u031B\\u0323-\\u0328\\u032D-\\u032E\\u0330-\\u0331"
|
||||
"\\u0335\\u0338-\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386\\u0388-\\u038A\\u038C"
|
||||
"\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-\\u0529\\u052E-\\u052F\\u0531-\\u0556"
|
||||
"\\u0559\\u0561-\\u0586\\u05B4\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0620-\\u063F\\u0641-\\u0655"
|
||||
"\\u0660-\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-\\u06D3\\u06D5\\u06E5-\\u06E6"
|
||||
"\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC\\u08B2\\u0901-\\u094D\\u094F-\\u0950"
|
||||
"\\u0956-\\u0957\\u0960-\\u0963\\u0966-\\u096F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983"
|
||||
"\\u0985-\\u098C\\u098F-\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9"
|
||||
"\\u09BC-\\u09C4\\u09C7-\\u09C8\\u09CB-\\u09CE\\u09D7\\u09E0-\\u09E3\\u09E6-\\u09F1"
|
||||
"\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F-\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32"
|
||||
"\\u0A35\\u0A38-\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47-\\u0A48\\u0A4B-\\u0A4D\\u0A5C"
|
||||
"\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0"
|
||||
"\\u0AB2-\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD\\u0AD0"
|
||||
"\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F-\\u0B10\\u0B13-\\u0B28"
|
||||
"\\u0B2A-\\u0B30\\u0B32-\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47-\\u0B48\\u0B4B-\\u0B4D"
|
||||
"\\u0B56-\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82-\\u0B83\\u0B85-\\u0B8A"
|
||||
"\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99-\\u0B9A\\u0B9C\\u0B9E-\\u0B9F\\u0BA3-\\u0BA4"
|
||||
"\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD0"
|
||||
"\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C03\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28"
|
||||
"\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55-\\u0C56"
|
||||
"\\u0C60-\\u0C61\\u0C66-\\u0C6F\\u0C82-\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8"
|
||||
"\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0CD5-\\u0CD6"
|
||||
"\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1-\\u0CF2\\u0D02-\\u0D03\\u0D05-\\u0D0C\\u0D0E-\\u0D10"
|
||||
"\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D57\\u0D60-\\u0D61"
|
||||
"\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82-\\u0D83\\u0D85-\\u0D8E\\u0D91-\\u0D96\\u0D9A-\\u0DA5"
|
||||
"\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0DD4\\u0DD6"
|
||||
"\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\u0E59"
|
||||
"\\u0E81-\\u0E82\\u0E84\\u0E87-\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F"
|
||||
"\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA-\\u0EAB\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD"
|
||||
"\\u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9\\u0EDE-\\u0EDF\\u0F00\\u0F20-\\u0F29"
|
||||
"\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F56"
|
||||
"\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C\\u0F71-\\u0F72\\u0F74\\u0F7A-\\u0F80"
|
||||
"\\u0F82-\\u0F84\\u0F86-\\u0F92\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6"
|
||||
"\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D"
|
||||
"\\u10C7\\u10CD\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D"
|
||||
"\\u1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0"
|
||||
"\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310"
|
||||
"\\u1312-\\u1315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7"
|
||||
"\\u17A9-\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1E00-\\u1E99"
|
||||
"\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D"
|
||||
"\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72\\u1F74\\u1F76\\u1F78"
|
||||
"\\u1F7A\\u1F7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8"
|
||||
"\\u1FCA\\u1FCC\\u1FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC"
|
||||
"\\u1FF2-\\u1FF4\\u1FF6-\\u1FF8\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6"
|
||||
"\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6"
|
||||
"\\u2DD8-\\u2DDE\\u3005-\\u3007\\u3041-\\u3096\\u3099-\\u309A\\u309D-\\u309E\\u30A1-\\u30FA"
|
||||
"\\u30FC-\\u30FE\\u3105-\\u312D\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-\\u9FD5\\uA660-\\uA661"
|
||||
"\\uA674-\\uA67B\\uA67F\\uA69F\\uA717-\\uA71F\\uA788\\uA78D-\\uA78E\\uA790-\\uA793"
|
||||
"\\uA7A0-\\uA7AA\\uA7FA\\uA9E7-\\uA9FE\\uAA60-\\uAA76\\uAA7A-\\uAA7F\\uAB01-\\uAB06"
|
||||
"\\uAB09-\\uAB0E\\uAB11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E-\\uFA0F"
|
||||
"\\uFA11\\uFA13-\\uFA14\\uFA1F\\uFA21\\uFA23-\\uFA24\\uFA27-\\uFA29\\U00020000-\\U0002A6D6"
|
||||
"\\U0002A700-\\U0002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1]";
|
||||
"[0-9A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u0131\\u0134-\\u013E\\u0141-\\u014"
|
||||
"8\\u014A-\\u017E\\u018F\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E"
|
||||
"6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B\\u021E\\u021F\\u0226-\\u0233\\u0259\\u02BB\\u02B"
|
||||
"C\\u02EC\\u0300-\\u0304\\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\\u03"
|
||||
"28\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386"
|
||||
"\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-\\u0529\\u05"
|
||||
"2E\\u052F\\u0531-\\u0556\\u0559\\u0561-\\u0586\\u05B4\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0"
|
||||
"620-\\u063F\\u0641-\\u0655\\u0660-\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-"
|
||||
"\\u06D3\\u06D5\\u06E5\\u06E6\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC\\u08B2"
|
||||
"\\u08B6-\\u08BD\\u0901-\\u094D\\u094F\\u0950\\u0956\\u0957\\u0960-\\u0963\\u0966-\\u096"
|
||||
"F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u0"
|
||||
"9A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BC-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CE\\u"
|
||||
"09D7\\u09E0-\\u09E3\\u09E6-\\u09F1\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-"
|
||||
"\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A35\\u0A38\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47\\u0A48\\"
|
||||
"u0A4B-\\u0A4D\\u0A5C\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A9"
|
||||
"3-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0"
|
||||
"ACB-\\u0ACD\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F\\"
|
||||
"u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47"
|
||||
"\\u0B48\\u0B4B-\\u0B4D\\u0B56\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82\\u0B83"
|
||||
"\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3"
|
||||
"\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0B"
|
||||
"D0\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C03\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u"
|
||||
"0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56"
|
||||
"\\u0C60\\u0C61\\u0C66-\\u0C6F\\u0C80\\u0C82\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92"
|
||||
"-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0"
|
||||
"CD5\\u0CD6\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1\\u0CF2\\u0D02\\u0D03\\u0D05-\\u0D0C\\u0"
|
||||
"D0E-\\u0D10\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D54-\\u0D57"
|
||||
"\\u0D60\\u0D61\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82\\u0D83\\u0D85-\\u0D8E\\u0D91-\\u0D9"
|
||||
"6\\u0D9A-\\u0DA5\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0"
|
||||
"DD4\\u0DD6\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\"
|
||||
"u0E59\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u"
|
||||
"0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD\\"
|
||||
"u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9\\u0EDE\\u0EDF\\u0F00\\u0F20-\\u0F29"
|
||||
"\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F"
|
||||
"56\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C\\u0F71\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0"
|
||||
"F82-\\u0F84\\u0F86-\\u0F92\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6"
|
||||
"\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D\\u10"
|
||||
"C7\\u10CD\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D\\u"
|
||||
"1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2"
|
||||
"-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1"
|
||||
"315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7\\u17A9-"
|
||||
"\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1C80-\\u1C88\\u1E00-\\u1E9"
|
||||
"9\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1"
|
||||
"F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72\\u1F74\\u1F76\\u1F78\\u1F7A\\u1F"
|
||||
"7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1"
|
||||
"FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-"
|
||||
"\\u1FF8\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0"
|
||||
"-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u3"
|
||||
"005-\\u3007\\u3041-\\u3096\\u3099\\u309A\\u309D\\u309E\\u30A1-\\u30FA\\u30FC-\\u30FE\\u"
|
||||
"3105-\\u312D\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-\\u9FD5\\uA660\\uA661\\uA674-\\uA67B"
|
||||
"\\uA67F\\uA69F\\uA717-\\uA71F\\uA788\\uA78D\\uA78E\\uA790-\\uA793\\uA7A0-\\uA7AA\\uA7AE"
|
||||
"\\uA7FA\\uA9E7-\\uA9FE\\uAA60-\\uAA76\\uAA7A-\\uAA7F\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB"
|
||||
"11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E\\uFA0F\\uFA11\\uFA13\\uF"
|
||||
"A14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\U00020000-\\U0002A6D6\\U0002A700-\\U0"
|
||||
"002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1]";
|
||||
|
||||
gRecommendedSet = new UnicodeSet(UnicodeString(recommendedPat, -1, US_INV), status);
|
||||
gRecommendedSet->freeze();
|
||||
|
@ -137,11 +129,6 @@ static void U_CALLCONV initializeStatics(UErrorCode &status) {
|
|||
ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup);
|
||||
}
|
||||
|
||||
static void U_CALLCONV initializeDefaultData(UErrorCode &status) {
|
||||
gDefaultSpoofData = SpoofData::getDefault(status);
|
||||
ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup);
|
||||
}
|
||||
|
||||
// Runs initializeStatics() through the gSpoofInitStaticsOnce init-once guard.
//
// @param status in/out error code, forwarded by reference to umtx_initOnce().
U_CFUNC void uspoof_internalInitStatics(UErrorCode *status) {
    UErrorCode &errorCode = *status;
    umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, errorCode);
}
|
||||
|
@ -149,14 +136,10 @@ U_CFUNC void uspoof_internalInitStatics(UErrorCode *status) {
|
|||
U_CAPI USpoofChecker * U_EXPORT2
|
||||
uspoof_open(UErrorCode *status) {
|
||||
umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, *status);
|
||||
umtx_initOnce(gSpoofInitDefaultOnce, &initializeDefaultData, *status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return NULL;
|
||||
}
|
||||
SpoofImpl *si = new SpoofImpl(gDefaultSpoofData, *status);
|
||||
if (si) {
|
||||
gDefaultSpoofData->addReference();
|
||||
}
|
||||
SpoofImpl *si = new SpoofImpl(*status);
|
||||
if (U_SUCCESS(*status) && si == NULL) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
|
@ -164,7 +147,7 @@ uspoof_open(UErrorCode *status) {
|
|||
delete si;
|
||||
si = NULL;
|
||||
}
|
||||
return reinterpret_cast<USpoofChecker *>(si);
|
||||
return si->asUSpoofChecker();
|
||||
}
|
||||
|
||||
|
||||
|
@ -190,9 +173,9 @@ uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLeng
|
|||
}
|
||||
|
||||
if (pActualLength != NULL) {
|
||||
*pActualLength = sd->fRawData->fLength;
|
||||
*pActualLength = sd->size();
|
||||
}
|
||||
return reinterpret_cast<USpoofChecker *>(si);
|
||||
return si->asUSpoofChecker();
|
||||
}
|
||||
|
||||
|
||||
|
@ -207,7 +190,7 @@ uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
|
|||
delete result;
|
||||
result = NULL;
|
||||
}
|
||||
return reinterpret_cast<USpoofChecker *>(result);
|
||||
return result->asUSpoofChecker();
|
||||
}
|
||||
|
||||
|
||||
|
@ -335,7 +318,23 @@ uspoof_check(const USpoofChecker *sc,
|
|||
const UChar *id, int32_t length,
|
||||
int32_t *position,
|
||||
UErrorCode *status) {
|
||||
|
||||
|
||||
// Backwards compatibility:
|
||||
if (position != NULL) {
|
||||
*position = 0;
|
||||
}
|
||||
|
||||
// Delegate to uspoof_check2
|
||||
return uspoof_check2(sc, id, length, NULL, status);
|
||||
}
|
||||
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uspoof_check2(const USpoofChecker *sc,
|
||||
const UChar* id, int32_t length,
|
||||
USpoofCheckResult* checkResult,
|
||||
UErrorCode *status) {
|
||||
|
||||
const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
|
||||
if (This == NULL) {
|
||||
return 0;
|
||||
|
@ -345,7 +344,7 @@ uspoof_check(const USpoofChecker *sc,
|
|||
return 0;
|
||||
}
|
||||
UnicodeString idStr((length == -1), id, length); // Aliasing constructor.
|
||||
int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
|
||||
int32_t result = uspoof_check2UnicodeString(sc, idStr, checkResult, status);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -356,11 +355,27 @@ uspoof_checkUTF8(const USpoofChecker *sc,
|
|||
int32_t *position,
|
||||
UErrorCode *status) {
|
||||
|
||||
// Backwards compatibility:
|
||||
if (position != NULL) {
|
||||
*position = 0;
|
||||
}
|
||||
|
||||
// Delegate to uspoof_check2
|
||||
return uspoof_check2UTF8(sc, id, length, NULL, status);
|
||||
}
|
||||
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uspoof_check2UTF8(const USpoofChecker *sc,
|
||||
const char *id, int32_t length,
|
||||
USpoofCheckResult* checkResult,
|
||||
UErrorCode *status) {
|
||||
|
||||
if (U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
UnicodeString idStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id)));
|
||||
int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
|
||||
int32_t result = uspoof_check2UnicodeString(sc, idStr, checkResult, status);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -414,7 +429,7 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
|
|||
if (U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
//
|
||||
//
|
||||
// See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
|
||||
// and for definitions of the types (single, whole, mixed-script) of confusables.
|
||||
|
||||
|
@ -422,125 +437,95 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
|
|||
// If no tests relevant to this function have been specified, return an error.
|
||||
// TODO: is this really the right thing to do? It's probably an error on the caller's part,
|
||||
// but logically we would just return 0 (no error).
|
||||
if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
|
||||
USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
|
||||
if ((This->fChecks & USPOOF_CONFUSABLE) == 0) {
|
||||
*status = U_INVALID_STATE_ERROR;
|
||||
return 0;
|
||||
}
|
||||
int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
|
||||
|
||||
int32_t result = 0;
|
||||
IdentifierInfo *identifierInfo = This->getIdentifierInfo(*status);
|
||||
if (U_FAILURE(*status)) {
|
||||
// Compute the skeletons and check for confusability.
|
||||
UnicodeString id1Skeleton;
|
||||
uspoof_getSkeletonUnicodeString(sc, 0 /* deprecated */, id1, id1Skeleton, status);
|
||||
UnicodeString id2Skeleton;
|
||||
uspoof_getSkeletonUnicodeString(sc, 0 /* deprecated */, id2, id2Skeleton, status);
|
||||
if (U_FAILURE(*status)) { return 0; }
|
||||
if (id1Skeleton != id2Skeleton) {
|
||||
return 0;
|
||||
}
|
||||
identifierInfo->setIdentifier(id1, *status);
|
||||
int32_t id1ScriptCount = identifierInfo->getScriptCount();
|
||||
int32_t id1FirstScript = identifierInfo->getScripts()->nextSetBit(0);
|
||||
identifierInfo->setIdentifier(id2, *status);
|
||||
int32_t id2ScriptCount = identifierInfo->getScriptCount();
|
||||
int32_t id2FirstScript = identifierInfo->getScripts()->nextSetBit(0);
|
||||
This->releaseIdentifierInfo(identifierInfo);
|
||||
identifierInfo = NULL;
|
||||
|
||||
if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
|
||||
UnicodeString id1Skeleton;
|
||||
UnicodeString id2Skeleton;
|
||||
if (id1ScriptCount <= 1 && id2ScriptCount <= 1 && id1FirstScript == id2FirstScript) {
|
||||
flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
|
||||
uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
|
||||
uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
|
||||
if (id1Skeleton == id2Skeleton) {
|
||||
result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
|
||||
}
|
||||
// If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes
|
||||
// of confusables according to UTS 39 section 4.
|
||||
// Start by computing the resolved script sets of id1 and id2.
|
||||
ScriptSet id1RSS;
|
||||
This->getResolvedScriptSet(id1, id1RSS, *status);
|
||||
ScriptSet id2RSS;
|
||||
This->getResolvedScriptSet(id2, id2RSS, *status);
|
||||
|
||||
// Turn on all applicable flags
|
||||
int32_t result = 0;
|
||||
if (id1RSS.intersects(id2RSS)) {
|
||||
result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
|
||||
} else {
|
||||
result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
|
||||
if (!id1RSS.isEmpty() && !id2RSS.isEmpty()) {
|
||||
result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
|
||||
}
|
||||
}
|
||||
|
||||
if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
|
||||
// If the two inputs are single script confusable they cannot also be
|
||||
// mixed or whole script confusable, according to the UAX39 definitions.
|
||||
// So we can skip those tests.
|
||||
return result;
|
||||
// Turn off flags that the user doesn't want
|
||||
if ((This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) == 0) {
|
||||
result &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
|
||||
}
|
||||
|
||||
// Two identifiers are whole script confusable if each is of a single script
|
||||
// and they are mixed script confusable.
|
||||
UBool possiblyWholeScriptConfusables =
|
||||
id1ScriptCount <= 1 && id2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
|
||||
|
||||
//
|
||||
// Mixed Script Check
|
||||
//
|
||||
if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
|
||||
// For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
|
||||
// the mixed script table skeleton, which is what we want.
|
||||
// The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
|
||||
UnicodeString id1Skeleton;
|
||||
UnicodeString id2Skeleton;
|
||||
flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
|
||||
uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
|
||||
uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
|
||||
if (id1Skeleton == id2Skeleton) {
|
||||
result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
|
||||
if (possiblyWholeScriptConfusables) {
|
||||
result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
|
||||
}
|
||||
}
|
||||
if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) == 0) {
|
||||
result &= ~USPOOF_MIXED_SCRIPT_CONFUSABLE;
|
||||
}
|
||||
if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) == 0) {
|
||||
result &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uspoof_checkUnicodeString(const USpoofChecker *sc,
|
||||
const icu::UnicodeString &id,
|
||||
const icu::UnicodeString &id,
|
||||
int32_t *position,
|
||||
UErrorCode *status) {
|
||||
const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
|
||||
if (This == NULL) {
|
||||
return 0;
|
||||
|
||||
// Backwards compatibility:
|
||||
if (position != NULL) {
|
||||
*position = 0;
|
||||
}
|
||||
|
||||
// Delegate to uspoof_check2
|
||||
return uspoof_check2UnicodeString(sc, id, NULL, status);
|
||||
}
|
||||
|
||||
int32_t checkImpl(const SpoofImpl* This, const UnicodeString& id, CheckResult* checkResult, UErrorCode* status) {
|
||||
U_ASSERT(This != NULL);
|
||||
U_ASSERT(checkResult != NULL);
|
||||
checkResult->clear();
|
||||
int32_t result = 0;
|
||||
|
||||
IdentifierInfo *identifierInfo = NULL;
|
||||
if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) {
|
||||
identifierInfo = This->getIdentifierInfo(*status);
|
||||
if (U_FAILURE(*status)) {
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
identifierInfo->setIdentifier(id, *status);
|
||||
identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
|
||||
}
|
||||
|
||||
|
||||
if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) {
|
||||
URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status);
|
||||
if (0 != (This->fChecks & USPOOF_RESTRICTION_LEVEL)) {
|
||||
URestrictionLevel idRestrictionLevel = This->getRestrictionLevel(id, *status);
|
||||
if (idRestrictionLevel > This->fRestrictionLevel) {
|
||||
result |= USPOOF_RESTRICTION_LEVEL;
|
||||
}
|
||||
if (This->fChecks & USPOOF_AUX_INFO) {
|
||||
result |= idRestrictionLevel;
|
||||
}
|
||||
checkResult->fRestrictionLevel = idRestrictionLevel;
|
||||
}
|
||||
|
||||
if ((This->fChecks) & USPOOF_MIXED_NUMBERS) {
|
||||
const UnicodeSet *numerics = identifierInfo->getNumerics();
|
||||
if (numerics->size() > 1) {
|
||||
if (0 != (This->fChecks & USPOOF_MIXED_NUMBERS)) {
|
||||
UnicodeSet numerics;
|
||||
This->getNumerics(id, numerics, *status);
|
||||
if (numerics.size() > 1) {
|
||||
result |= USPOOF_MIXED_NUMBERS;
|
||||
}
|
||||
|
||||
// TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
|
||||
// We have no easy way to do the same in C.
|
||||
// if (checkResult != null) {
|
||||
// checkResult.numerics = numerics;
|
||||
// }
|
||||
checkResult->fNumerics = numerics; // UnicodeSet::operator=
|
||||
}
|
||||
|
||||
|
||||
if (This->fChecks & (USPOOF_CHAR_LIMIT)) {
|
||||
if (0 != (This->fChecks & USPOOF_CHAR_LIMIT)) {
|
||||
int32_t i;
|
||||
UChar32 c;
|
||||
int32_t length = id.length();
|
||||
|
@ -554,103 +539,74 @@ uspoof_checkUnicodeString(const USpoofChecker *sc,
|
|||
}
|
||||
}
|
||||
|
||||
if (This->fChecks &
|
||||
(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
|
||||
// These are the checks that need to be done on NFD input
|
||||
if (0 != (This->fChecks & USPOOF_INVISIBLE)) {
|
||||
// This check needs to be done on NFD input
|
||||
UnicodeString nfdText;
|
||||
gNfdNormalizer->normalize(id, nfdText, *status);
|
||||
int32_t nfdLength = nfdText.length();
|
||||
|
||||
if (This->fChecks & USPOOF_INVISIBLE) {
|
||||
|
||||
// scan for more than one occurrence of the same non-spacing mark
|
||||
// in a sequence of non-spacing marks.
|
||||
int32_t i;
|
||||
UChar32 c;
|
||||
UChar32 firstNonspacingMark = 0;
|
||||
UBool haveMultipleMarks = FALSE;
|
||||
UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
|
||||
|
||||
for (i=0; i<nfdLength ;) {
|
||||
c = nfdText.char32At(i);
|
||||
i += U16_LENGTH(c);
|
||||
if (u_charType(c) != U_NON_SPACING_MARK) {
|
||||
firstNonspacingMark = 0;
|
||||
if (haveMultipleMarks) {
|
||||
marksSeenSoFar.clear();
|
||||
haveMultipleMarks = FALSE;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (firstNonspacingMark == 0) {
|
||||
firstNonspacingMark = c;
|
||||
continue;
|
||||
}
|
||||
if (!haveMultipleMarks) {
|
||||
marksSeenSoFar.add(firstNonspacingMark);
|
||||
haveMultipleMarks = TRUE;
|
||||
}
|
||||
if (marksSeenSoFar.contains(c)) {
|
||||
// report the error, and stop scanning.
|
||||
// No need to find more than the first failure.
|
||||
result |= USPOOF_INVISIBLE;
|
||||
break;
|
||||
}
|
||||
marksSeenSoFar.add(c);
|
||||
}
|
||||
}
|
||||
|
||||
// scan for more than one occurrence of the same non-spacing mark
|
||||
// in a sequence of non-spacing marks.
|
||||
int32_t i;
|
||||
UChar32 c;
|
||||
UChar32 firstNonspacingMark = 0;
|
||||
UBool haveMultipleMarks = FALSE;
|
||||
UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
|
||||
|
||||
if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
|
||||
// The basic test is the same for both whole and mixed script confusables.
|
||||
// Compute the set of scripts that every input character has a confusable in.
|
||||
// For this computation an input character is always considered to be
|
||||
// confusable with itself in its own script.
|
||||
//
|
||||
// If the number of such scripts is two or more, and the input consisted of
|
||||
// characters all from a single script, we have a whole script confusable.
|
||||
// (The two scripts will be the original script and the one that is confusable)
|
||||
//
|
||||
// If the number of such scripts >= one, and the original input contained characters from
|
||||
// more than one script, we have a mixed script confusable. (We can transform
|
||||
// some of the characters, and end up with a visually similar string all in
|
||||
// one script.)
|
||||
|
||||
if (identifierInfo == NULL) {
|
||||
identifierInfo = This->getIdentifierInfo(*status);
|
||||
if (U_FAILURE(*status)) {
|
||||
goto cleanupAndReturn;
|
||||
for (i=0; i<nfdLength ;) {
|
||||
c = nfdText.char32At(i);
|
||||
i += U16_LENGTH(c);
|
||||
if (u_charType(c) != U_NON_SPACING_MARK) {
|
||||
firstNonspacingMark = 0;
|
||||
if (haveMultipleMarks) {
|
||||
marksSeenSoFar.clear();
|
||||
haveMultipleMarks = FALSE;
|
||||
}
|
||||
identifierInfo->setIdentifier(id, *status);
|
||||
continue;
|
||||
}
|
||||
|
||||
int32_t scriptCount = identifierInfo->getScriptCount();
|
||||
|
||||
ScriptSet scripts;
|
||||
This->wholeScriptCheck(nfdText, &scripts, *status);
|
||||
int32_t confusableScriptCount = scripts.countMembers();
|
||||
//printf("confusableScriptCount = %d\n", confusableScriptCount);
|
||||
|
||||
if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
|
||||
confusableScriptCount >= 2 &&
|
||||
scriptCount == 1) {
|
||||
result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
|
||||
if (firstNonspacingMark == 0) {
|
||||
firstNonspacingMark = c;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
|
||||
confusableScriptCount >= 1 &&
|
||||
scriptCount > 1) {
|
||||
result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
|
||||
if (!haveMultipleMarks) {
|
||||
marksSeenSoFar.add(firstNonspacingMark);
|
||||
haveMultipleMarks = TRUE;
|
||||
}
|
||||
if (marksSeenSoFar.contains(c)) {
|
||||
// report the error, and stop scanning.
|
||||
// No need to find more than the first failure.
|
||||
result |= USPOOF_INVISIBLE;
|
||||
break;
|
||||
}
|
||||
marksSeenSoFar.add(c);
|
||||
}
|
||||
}
|
||||
|
||||
cleanupAndReturn:
|
||||
This->releaseIdentifierInfo(identifierInfo);
|
||||
if (position != NULL) {
|
||||
*position = 0;
|
||||
checkResult->fChecks = result;
|
||||
return checkResult->toCombinedBitmask(This->fChecks);
|
||||
}
|
||||
|
||||
// Runs the spoof checks configured on `sc` against `id`.
//
//   sc           The spoof checker; validated with SpoofImpl::validateThis().
//   id           The identifier to be checked.
//   checkResult  Optional. When non-NULL it is validated with
//                CheckResult::validateThis() and handed to checkImpl() to be
//                filled in. When NULL a temporary CheckResult is used instead.
//   status       ICU error code, passed through to the validators and checkImpl().
//
// Returns the value produced by checkImpl(), or 0 when either `sc` or a
// non-NULL `checkResult` fails validation.
U_CAPI int32_t U_EXPORT2
uspoof_check2UnicodeString(const USpoofChecker *sc,
                           const icu::UnicodeString &id,
                           USpoofCheckResult* checkResult,
                           UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        // The return type is an int32_t, so report "nothing" as 0
        // rather than the boolean macro FALSE.
        return 0;
    }

    if (checkResult != NULL) {
        CheckResult* ThisCheckResult = CheckResult::validateThis(checkResult, *status);
        if (ThisCheckResult == NULL) {
            return 0;
        }
        return checkImpl(This, id, ThisCheckResult, status);
    } else {
        // Stack-allocate the checkResult since this method doesn't return it
        CheckResult stackCheckResult;
        return checkImpl(This, id, &stackCheckResult, status);
    }
    // Both branches above return; no fall-through statement is needed.
}
|
||||
|
||||
|
||||
|
@ -681,7 +637,7 @@ uspoof_getSkeleton(const USpoofChecker *sc,
|
|||
|
||||
U_I18N_API UnicodeString & U_EXPORT2
|
||||
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
|
||||
uint32_t type,
|
||||
uint32_t /*type*/,
|
||||
const UnicodeString &id,
|
||||
UnicodeString &dest,
|
||||
UErrorCode *status) {
|
||||
|
@ -690,21 +646,9 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
|
|||
return dest;
|
||||
}
|
||||
|
||||
int32_t tableMask = 0;
|
||||
switch (type) {
|
||||
case 0:
|
||||
tableMask = USPOOF_ML_TABLE_FLAG;
|
||||
break;
|
||||
case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
|
||||
tableMask = USPOOF_SL_TABLE_FLAG;
|
||||
break;
|
||||
case USPOOF_ANY_CASE:
|
||||
tableMask = USPOOF_MA_TABLE_FLAG;
|
||||
break;
|
||||
case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
|
||||
tableMask = USPOOF_SA_TABLE_FLAG;
|
||||
break;
|
||||
default:
|
||||
// Check that at least one of the CONFUSABLE flags is turned on. If not,
|
||||
// return an error.
|
||||
if ((This->fChecks & USPOOF_CONFUSABLE) == 0) {
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return dest;
|
||||
}
|
||||
|
@ -720,7 +664,7 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
|
|||
for (inputIndex=0; inputIndex < normalizedLen; ) {
|
||||
UChar32 c = nfdId.char32At(inputIndex);
|
||||
inputIndex += U16_LENGTH(c);
|
||||
This->confusableLookup(c, tableMask, skelStr);
|
||||
This->fSpoofData->confusableLookup(c, skelStr);
|
||||
}
|
||||
|
||||
gNfdNormalizer->normalize(skelStr, dest, *status);
|
||||
|
@ -764,13 +708,8 @@ uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *stat
|
|||
U_ASSERT(U_FAILURE(*status));
|
||||
return 0;
|
||||
}
|
||||
int32_t dataSize = This->fSpoofData->fRawData->fLength;
|
||||
if (capacity < dataSize) {
|
||||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
return dataSize;
|
||||
}
|
||||
uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
|
||||
return dataSize;
|
||||
|
||||
return This->fSpoofData->serialize(buf, capacity, *status);
|
||||
}
|
||||
|
||||
U_CAPI const USet * U_EXPORT2
|
||||
|
@ -797,6 +736,48 @@ uspoof_getRecommendedUnicodeSet(UErrorCode *status) {
|
|||
return gRecommendedSet;
|
||||
}
|
||||
|
||||
//------------------
|
||||
// CheckResult APIs
|
||||
//------------------
|
||||
|
||||
// Allocates a fresh CheckResult and returns it as the opaque C handle.
// If the allocation yields NULL, *status is set to U_MEMORY_ALLOCATION_ERROR
// and NULL is returned.
U_CAPI USpoofCheckResult* U_EXPORT2
uspoof_openCheckResult(UErrorCode *status) {
    CheckResult* created = new CheckResult();
    if (created != NULL) {
        return created->asUSpoofCheckResult();
    }
    *status = U_MEMORY_ALLOCATION_ERROR;
    return NULL;
}
|
||||
|
||||
// Destroys a check result previously handed out through the C API.
// The handle is first passed through CheckResult::validateThis(); the error
// code used for that validation is local to this function and is not reported
// to the caller.
U_CAPI void U_EXPORT2
uspoof_closeCheckResult(USpoofCheckResult* checkResult) {
    UErrorCode localStatus = U_ZERO_ERROR;
    CheckResult* validated = CheckResult::validateThis(checkResult, localStatus);
    delete validated;
}
|
||||
|
||||
// Returns the fChecks field stored in the given check result.
// Yields 0 if *status indicates failure after the handle has been validated.
U_CAPI int32_t U_EXPORT2
uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status) {
    const CheckResult* validated = CheckResult::validateThis(checkResult, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }
    return validated->fChecks;
}
|
||||
|
||||
// Returns the fRestrictionLevel field stored in the given check result.
// Yields USPOOF_UNRESTRICTIVE if *status indicates failure after the handle
// has been validated.
U_CAPI URestrictionLevel U_EXPORT2
uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status) {
    const CheckResult* validated = CheckResult::validateThis(checkResult, *status);
    if (U_FAILURE(*status)) {
        return USPOOF_UNRESTRICTIVE;
    }
    return validated->fRestrictionLevel;
}
|
||||
|
||||
// Returns the fNumerics set of the given check result, viewed as a C USet.
// Yields NULL if *status indicates failure after the handle has been validated.
U_CAPI const USet* U_EXPORT2
uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status) {
    const CheckResult* validated = CheckResult::validateThis(checkResult, *status);
    if (U_FAILURE(*status)) {
        return NULL;
    }
    return validated->fNumerics.toUSet();
}
|
||||
|
||||
|
||||
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
|
|
|
@ -37,7 +37,6 @@
|
|||
#include "uassert.h"
|
||||
#include "uarrsort.h"
|
||||
#include "uspoof_conf.h"
|
||||
#include "uspoof_wsconf.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
|
@ -50,7 +49,7 @@ U_CFUNC void uspoof_internalInitStatics(UErrorCode *status);
|
|||
|
||||
U_CAPI USpoofChecker * U_EXPORT2
|
||||
uspoof_openFromSource(const char *confusables, int32_t confusablesLen,
|
||||
const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
|
||||
const char* /*confusablesWholeScript*/, int32_t /*confusablesWholeScriptLen*/,
|
||||
int32_t *errorType, UParseError *pe, UErrorCode *status) {
|
||||
uspoof_internalInitStatics(status);
|
||||
if (U_FAILURE(*status)) {
|
||||
|
@ -76,7 +75,6 @@ uspoof_openFromSource(const char *confusables, int32_t confusablesLen,
|
|||
|
||||
// Compile the binary data from the source (text) format.
|
||||
ConfusabledataBuilder::buildConfusableData(This, confusables, confusablesLen, errorType, pe, *status);
|
||||
buildWSConfusableData(This, confusablesWholeScript, confusablesWholeScriptLen, pe, *status);
|
||||
|
||||
if (U_FAILURE(*status)) {
|
||||
delete This;
|
||||
|
|
|
@ -45,8 +45,7 @@ U_NAMESPACE_USE
|
|||
//
|
||||
// The binary structures are described in uspoof_impl.h
|
||||
//
|
||||
// 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA
|
||||
// tables. Each maps from a UChar32 to a String.
|
||||
// 1. Parse the data, making a hash table mapping from a UChar32 to a String.
|
||||
//
|
||||
// 2. Sort all of the strings encountered by length, since they will need to
|
||||
// be stored in that order in the final string table.
|
||||
|
@ -63,7 +62,7 @@ U_NAMESPACE_USE
|
|||
|
||||
SPUString::SPUString(UnicodeString *s) {
|
||||
fStr = s;
|
||||
fStrTableIndex = 0;
|
||||
fCharOrStrTableIndex = 0;
|
||||
}
|
||||
|
||||
|
||||
|
@ -145,15 +144,11 @@ SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
|
|||
ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
|
||||
fSpoofImpl(spImpl),
|
||||
fInput(NULL),
|
||||
fSLTable(NULL),
|
||||
fSATable(NULL),
|
||||
fMLTable(NULL),
|
||||
fMATable(NULL),
|
||||
fTable(NULL),
|
||||
fKeySet(NULL),
|
||||
fKeyVec(NULL),
|
||||
fValueVec(NULL),
|
||||
fStringTable(NULL),
|
||||
fStringLengthsTable(NULL),
|
||||
stringPool(NULL),
|
||||
fParseLine(NULL),
|
||||
fParseHexNum(NULL),
|
||||
|
@ -162,10 +157,7 @@ ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &stat
|
|||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
fSLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
|
||||
fSATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
|
||||
fMLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
|
||||
fMATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
|
||||
fTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
|
||||
fKeySet = new UnicodeSet();
|
||||
fKeyVec = new UVector(status);
|
||||
fValueVec = new UVector(status);
|
||||
|
@ -177,14 +169,10 @@ ConfusabledataBuilder::~ConfusabledataBuilder() {
|
|||
uprv_free(fInput);
|
||||
uregex_close(fParseLine);
|
||||
uregex_close(fParseHexNum);
|
||||
uhash_close(fSLTable);
|
||||
uhash_close(fSATable);
|
||||
uhash_close(fMLTable);
|
||||
uhash_close(fMATable);
|
||||
uhash_close(fTable);
|
||||
delete fKeySet;
|
||||
delete fKeyVec;
|
||||
delete fStringTable;
|
||||
delete fStringLengthsTable;
|
||||
delete fValueVec;
|
||||
delete stringPool;
|
||||
}
|
||||
|
@ -230,7 +218,7 @@ void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesL
|
|||
// any line. What was matched is determined by examining which capture groups have a match.
|
||||
// Capture Group 1: the source char
|
||||
// Capture Group 2: the replacement chars
|
||||
// Capture Group 3-6 the table type, SL, SA, ML, or MA
|
||||
// Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
|
||||
// Capture Group 7: A blank or comment only line.
|
||||
// Capture Group 8: A syntactically invalid line. Anything that didn't match before.
|
||||
// Example Line from the confusables.txt source file:
|
||||
|
@ -296,41 +284,12 @@ void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesL
|
|||
// This a little like a Java intern() - any duplicates will be eliminated.
|
||||
SPUString *smapString = stringPool->addString(mapString, status);
|
||||
|
||||
// Add the UChar32 -> string mapping to the appropriate table.
|
||||
UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable :
|
||||
uregex_start(fParseLine, 4, &status) >= 0 ? fSATable :
|
||||
uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable :
|
||||
uregex_start(fParseLine, 6, &status) >= 0 ? fMATable :
|
||||
NULL;
|
||||
if (U_SUCCESS(status) && table == NULL) {
|
||||
status = U_PARSE_ERROR;
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Add the UChar32 -> string mapping to the table.
|
||||
// For Unicode 8, the SL, SA and ML tables have been discontinued.
|
||||
// All input data from confusables.txt is tagged MA.
|
||||
// ICU spoof check functions should ignore the specified table and always
|
||||
// use this MA Data.
|
||||
// For now, implement by populating the MA data into all four tables, and
|
||||
// keep the multiple table implementation in place, in case it comes back
|
||||
// at some time in the future.
|
||||
// There is no run time size penalty to keeping the four table implementation -
|
||||
// the data is shared when it's the same between tables.
|
||||
if (table != fMATable) {
|
||||
status = U_PARSE_ERROR;
|
||||
return;
|
||||
};
|
||||
// uhash_iput(table, keyChar, smapString, &status);
|
||||
uhash_iput(fSLTable, keyChar, smapString, &status);
|
||||
uhash_iput(fSATable, keyChar, smapString, &status);
|
||||
uhash_iput(fMLTable, keyChar, smapString, &status);
|
||||
uhash_iput(fMATable, keyChar, smapString, &status);
|
||||
uhash_iput(fTable, keyChar, smapString, &status);
|
||||
if (U_FAILURE(status)) { return; }
|
||||
fKeySet->add(keyChar);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Input data is now all parsed and collected.
|
||||
|
@ -343,43 +302,24 @@ void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesL
|
|||
// Build up the string array, and record the index of each string therein
|
||||
// in the (build time only) string pool.
|
||||
// Strings of length one are not entered into the strings array.
|
||||
// At the same time, build up the string lengths table, which records the
|
||||
// position in the string table of the first string of each length >= 4.
|
||||
// (Strings in the table are sorted by length)
|
||||
stringPool->sort(status);
|
||||
fStringTable = new UnicodeString();
|
||||
fStringLengthsTable = new UVector(status);
|
||||
int32_t previousStringLength = 0;
|
||||
int32_t previousStringIndex = 0;
|
||||
int32_t poolSize = stringPool->size();
|
||||
int32_t i;
|
||||
for (i=0; i<poolSize; i++) {
|
||||
SPUString *s = stringPool->getByIndex(i);
|
||||
int32_t strLen = s->fStr->length();
|
||||
int32_t strIndex = fStringTable->length();
|
||||
U_ASSERT(strLen >= previousStringLength);
|
||||
if (strLen == 1) {
|
||||
// strings of length one do not get an entry in the string table.
|
||||
// Keep the single string character itself here, which is the same
|
||||
// convention that is used in the final run-time string table index.
|
||||
s->fStrTableIndex = s->fStr->charAt(0);
|
||||
s->fCharOrStrTableIndex = s->fStr->charAt(0);
|
||||
} else {
|
||||
if ((strLen > previousStringLength) && (previousStringLength >= 4)) {
|
||||
fStringLengthsTable->addElement(previousStringIndex, status);
|
||||
fStringLengthsTable->addElement(previousStringLength, status);
|
||||
}
|
||||
s->fStrTableIndex = strIndex;
|
||||
s->fCharOrStrTableIndex = strIndex;
|
||||
fStringTable->append(*(s->fStr));
|
||||
}
|
||||
previousStringLength = strLen;
|
||||
previousStringIndex = strIndex;
|
||||
}
|
||||
// Make the final entry to the string lengths table.
|
||||
// (it holds an entry for the _last_ string of each length, so adding the
|
||||
// final one doesn't happen in the main loop because no longer string was encountered.)
|
||||
if (previousStringLength >= 4) {
|
||||
fStringLengthsTable->addElement(previousStringIndex, status);
|
||||
fStringLengthsTable->addElement(previousStringLength, status);
|
||||
}
|
||||
|
||||
// Construct the compile-time Key and Value tables
|
||||
|
@ -398,10 +338,15 @@ void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesL
|
|||
// code points requires a nested loop.
|
||||
for (UChar32 keyChar=fKeySet->getRangeStart(range);
|
||||
keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
|
||||
addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status);
|
||||
addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status);
|
||||
addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status);
|
||||
addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status);
|
||||
SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar));
|
||||
U_ASSERT(targetMapping != NULL);
|
||||
|
||||
int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar,
|
||||
targetMapping->fStr->length());
|
||||
int32_t value = targetMapping->fCharOrStrTableIndex;
|
||||
|
||||
fKeyVec->addElement(key, status);
|
||||
fValueVec->addElement(value, status);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -437,14 +382,14 @@ void ConfusabledataBuilder::outputData(UErrorCode &status) {
|
|||
return;
|
||||
}
|
||||
int i;
|
||||
int32_t previousKey = 0;
|
||||
UChar32 previousCodePoint = 0;
|
||||
for (i=0; i<numKeys; i++) {
|
||||
int32_t key = fKeyVec->elementAti(i);
|
||||
(void)previousKey; // Suppress unused variable warning on gcc.
|
||||
U_ASSERT((key & 0x00ffffff) >= (previousKey & 0x00ffffff));
|
||||
U_ASSERT((key & 0xff000000) != 0);
|
||||
UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key);
|
||||
// strictly greater because there can be only one entry per code point
|
||||
U_ASSERT(codePoint > previousCodePoint);
|
||||
keys[i] = key;
|
||||
previousKey = key;
|
||||
previousCodePoint = codePoint;
|
||||
}
|
||||
SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
|
||||
rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData);
|
||||
|
@ -486,143 +431,6 @@ void ConfusabledataBuilder::outputData(UErrorCode &status) {
|
|||
rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData);
|
||||
rawData->fCFUStringTableLen = stringsLength;
|
||||
fSpoofImpl->fSpoofData->fCFUStrings = strings;
|
||||
|
||||
// The String Lengths Table
|
||||
// While copying into the runtime array do some sanity checks on the values
|
||||
// Each complete entry contains two fields, an index and an offset.
|
||||
// Lengths should increase with each entry.
|
||||
// Offsets should be less than the size of the string table.
|
||||
int32_t lengthTableLength = fStringLengthsTable->size();
|
||||
uint16_t *stringLengths =
|
||||
static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(lengthTableLength*sizeof(uint16_t), status));
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
int32_t destIndex = 0;
|
||||
uint32_t previousLength = 0;
|
||||
for (i=0; i<lengthTableLength; i+=2) {
|
||||
uint32_t offset = static_cast<uint32_t>(fStringLengthsTable->elementAti(i));
|
||||
uint32_t length = static_cast<uint32_t>(fStringLengthsTable->elementAti(i+1));
|
||||
U_ASSERT(offset < stringsLength);
|
||||
U_ASSERT(length < 40);
|
||||
(void)previousLength; // Suppress unused variable warning on gcc.
|
||||
U_ASSERT(length > previousLength);
|
||||
stringLengths[destIndex++] = static_cast<uint16_t>(offset);
|
||||
stringLengths[destIndex++] = static_cast<uint16_t>(length);
|
||||
previousLength = length;
|
||||
}
|
||||
rawData = fSpoofImpl->fSpoofData->fRawData;
|
||||
rawData->fCFUStringLengths = (int32_t)((char *)stringLengths - (char *)rawData);
|
||||
// Note: StringLengthsSize in the raw data is the number of complete entries,
|
||||
// each consisting of a pair of 16 bit values, hence the divide by 2.
|
||||
rawData->fCFUStringLengthsSize = lengthTableLength / 2;
|
||||
fSpoofImpl->fSpoofData->fCFUStringLengths =
|
||||
reinterpret_cast<SpoofStringLengthsElement *>(stringLengths);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// addKeyEntry Construction of the confusable Key and Mapping Values tables.
|
||||
// This is an intermediate point in the building process.
|
||||
// We already have the mappings in the hash tables fSLTable, etc.
|
||||
// This function builds corresponding run-time style table entries into
|
||||
// fKeyVec and fValueVec
|
||||
|
||||
void ConfusabledataBuilder::addKeyEntry(
|
||||
UChar32 keyChar, // The key character
|
||||
UHashtable *table, // The table, one of SATable, MATable, etc.
|
||||
int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc.
|
||||
UErrorCode &status) {
|
||||
|
||||
SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(table, keyChar));
|
||||
if (targetMapping == NULL) {
|
||||
// No mapping for this key character.
|
||||
// (This function is called for all four tables for each key char that
|
||||
// is seen anywhere, so this no entry cases are very much expected.)
|
||||
return;
|
||||
}
|
||||
|
||||
// Check whether there is already an entry with the correct mapping.
|
||||
// If so, simply set the flag in the keyTable saying that the existing entry
|
||||
// applies to the table that we're doing now.
|
||||
|
||||
UBool keyHasMultipleValues = FALSE;
|
||||
int32_t i;
|
||||
for (i=fKeyVec->size()-1; i>=0 ; i--) {
|
||||
int32_t key = fKeyVec->elementAti(i);
|
||||
if ((key & 0x0ffffff) != keyChar) {
|
||||
// We have now checked all existing key entries for this key char (if any)
|
||||
// without finding one with the same mapping.
|
||||
break;
|
||||
}
|
||||
UnicodeString mapping = getMapping(i);
|
||||
if (mapping == *(targetMapping->fStr)) {
|
||||
// The run time entry we are currently testing has the correct mapping.
|
||||
// Set the flag in it indicating that it applies to the new table also.
|
||||
key |= tableFlag;
|
||||
fKeyVec->setElementAt(key, i);
|
||||
return;
|
||||
}
|
||||
keyHasMultipleValues = TRUE;
|
||||
}
|
||||
|
||||
// Need to add a new entry to the binary data being built for this mapping.
|
||||
// Includes adding entries to both the key table and the parallel values table.
|
||||
|
||||
int32_t newKey = keyChar | tableFlag;
|
||||
if (keyHasMultipleValues) {
|
||||
newKey |= USPOOF_KEY_MULTIPLE_VALUES;
|
||||
}
|
||||
int32_t adjustedMappingLength = targetMapping->fStr->length() - 1;
|
||||
if (adjustedMappingLength>3) {
|
||||
adjustedMappingLength = 3;
|
||||
}
|
||||
newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT;
|
||||
|
||||
int32_t newData = targetMapping->fStrTableIndex;
|
||||
|
||||
fKeyVec->addElement(newKey, status);
|
||||
fValueVec->addElement(newData, status);
|
||||
|
||||
// If the preceding key entry is for the same key character (but with a different mapping)
|
||||
// set the multiple-values flag on it.
|
||||
if (keyHasMultipleValues) {
|
||||
int32_t previousKeyIndex = fKeyVec->size() - 2;
|
||||
int32_t previousKey = fKeyVec->elementAti(previousKeyIndex);
|
||||
previousKey |= USPOOF_KEY_MULTIPLE_VALUES;
|
||||
fKeyVec->setElementAt(previousKey, previousKeyIndex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Reconstructs the mapping string for the build-time entry at `index`,
// using the key's length field and the matching element of fValueVec.
UnicodeString ConfusabledataBuilder::getMapping(int32_t index) {
    int32_t entryKey = fKeyVec->elementAti(index);
    int32_t entryValue = fValueVec->elementAti(index);
    int32_t lengthField = USPOOF_KEY_LENGTH_FIELD(entryKey);

    if (lengthField == 0) {
        // The value is used directly as the single character of the mapping.
        return UnicodeString(static_cast<UChar>(entryValue));
    }

    if (lengthField == 1 || lengthField == 2) {
        // The value is an offset into the string table; length is field + 1.
        return UnicodeString(*fStringTable, entryValue, lengthField + 1);
    }

    if (lengthField == 3) {
        // The real length comes from the string lengths table, which holds
        // (index, length) pairs; take the first pair whose index is not
        // below the value.
        int32_t resolvedLength = 0;
        for (int32_t pos = 0; pos < fStringLengthsTable->size(); pos += 2) {
            int32_t lastIndexWithLen = fStringLengthsTable->elementAti(pos);
            if (entryValue <= lastIndexWithLen) {
                resolvedLength = fStringLengthsTable->elementAti(pos + 1);
                break;
            }
        }
        U_ASSERT(resolvedLength>=3);
        return UnicodeString(*fStringTable, entryValue, resolvedLength);
    }

    // Any other length field value is not handled above.
    U_ASSERT(FALSE);
    return UnicodeString();
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -38,9 +38,9 @@ U_NAMESPACE_BEGIN
|
|||
|
||||
struct SPUString : public UMemory {
|
||||
UnicodeString *fStr; // The actual string.
|
||||
int32_t fStrTableIndex; // Index into the final runtime data for this string.
|
||||
// (or, for length 1, the single string char itself,
|
||||
// there being no string table entry for it.)
|
||||
int32_t fCharOrStrTableIndex; // Index into the final runtime data for this
|
||||
// string (or, for length 1, the single string char
|
||||
// itself, there being no string table entry for it.)
|
||||
SPUString(UnicodeString *s);
|
||||
~SPUString();
|
||||
};
|
||||
|
@ -88,10 +88,7 @@ class ConfusabledataBuilder : public UMemory {
|
|||
private:
|
||||
SpoofImpl *fSpoofImpl;
|
||||
UChar *fInput;
|
||||
UHashtable *fSLTable;
|
||||
UHashtable *fSATable;
|
||||
UHashtable *fMLTable;
|
||||
UHashtable *fMATable;
|
||||
UHashtable *fTable;
|
||||
UnicodeSet *fKeySet; // A set of all keys (UChar32s) that go into the four mapping tables.
|
||||
|
||||
// The binary data is first assembled into the following four collections, then
|
||||
|
@ -99,7 +96,6 @@ class ConfusabledataBuilder : public UMemory {
|
|||
UVector *fKeyVec;
|
||||
UVector *fValueVec;
|
||||
UnicodeString *fStringTable;
|
||||
UVector *fStringLengthsTable;
|
||||
|
||||
SPUStringPool *stringPool;
|
||||
URegularExpression *fParseLine;
|
||||
|
|
|
@ -15,11 +15,11 @@
|
|||
#include "utrie2.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "identifier_info.h"
|
||||
#include "scriptset.h"
|
||||
#include "umutex.h"
|
||||
#include "udataswp.h"
|
||||
#include "uassert.h"
|
||||
#include "ucln_in.h"
|
||||
#include "uspoof_impl.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
@ -29,14 +29,38 @@ U_NAMESPACE_BEGIN
|
|||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
|
||||
|
||||
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
|
||||
fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(data), fAllowedCharsSet(NULL) ,
|
||||
fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
// Builds a checker around caller-supplied spoof data.
// construct() sets up the common fields first; fSpoofData is then set to `data`.
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
    this->construct(status);
    this->fSpoofData = data;
}
|
||||
|
||||
// Builds a checker that uses the default spoof data, reporting errors
// through `status`.
SpoofImpl::SpoofImpl(UErrorCode& status) {
    this->construct(status);

    // TODO: Fetch the default data at the point of first use rather than in
    //       the constructor, so that loading can be lazy. See #12696.
    this->fSpoofData = SpoofData::getDefault(status);
}
|
||||
|
||||
// Builds a checker that uses the default spoof data.
// The error code is local to this constructor and is not reported to the caller.
SpoofImpl::SpoofImpl() {
    UErrorCode localStatus = U_ZERO_ERROR;
    this->construct(localStatus);

    // TODO: Fetch the default data at the point of first use rather than in
    //       the constructor, so that loading can be lazy. See #12696.
    this->fSpoofData = SpoofData::getDefault(localStatus);
}
|
||||
|
||||
void SpoofImpl::construct(UErrorCode& status) {
|
||||
fMagic = USPOOF_MAGIC;
|
||||
fChecks = USPOOF_ALL_CHECKS;
|
||||
fSpoofData = NULL;
|
||||
fAllowedCharsSet = NULL;
|
||||
fAllowedLocales = NULL;
|
||||
fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
|
||||
|
||||
if (U_FAILURE(status)) { return; }
|
||||
|
||||
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
|
||||
allowedCharsSet->freeze();
|
||||
fAllowedCharsSet = allowedCharsSet;
|
||||
|
@ -45,25 +69,13 @@ SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
|
|||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
fMagic = USPOOF_MAGIC;
|
||||
}
|
||||
|
||||
|
||||
SpoofImpl::SpoofImpl() :
|
||||
fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
|
||||
fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
|
||||
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
|
||||
allowedCharsSet->freeze();
|
||||
fAllowedCharsSet = allowedCharsSet;
|
||||
fAllowedLocales = uprv_strdup("");
|
||||
fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
|
||||
}
|
||||
|
||||
|
||||
// Copy Constructor, used by the user level clone() function.
|
||||
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
|
||||
fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
|
||||
fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
|
||||
fAllowedLocales(NULL) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
@ -88,7 +100,11 @@ SpoofImpl::~SpoofImpl() {
|
|||
}
|
||||
delete fAllowedCharsSet;
|
||||
uprv_free((void *)fAllowedLocales);
|
||||
delete fCachedIdentifierInfo;
|
||||
}
|
||||
|
||||
// Cast this instance as a USpoofChecker for the C API.
|
||||
USpoofChecker *SpoofImpl::asUSpoofChecker() {
|
||||
return reinterpret_cast<USpoofChecker*>(this);
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -104,12 +120,11 @@ const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &st
|
|||
return NULL;
|
||||
}
|
||||
SpoofImpl *This = (SpoofImpl *)sc;
|
||||
if (This->fMagic != USPOOF_MAGIC ||
|
||||
This->fSpoofData == NULL) {
|
||||
if (This->fMagic != USPOOF_MAGIC) {
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
|
||||
if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
|
||||
return NULL;
|
||||
}
|
||||
return This;
|
||||
|
@ -121,148 +136,6 @@ SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
|
|||
}
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------------
|
||||
//
|
||||
// confusableLookup() This is the heart of the confusable skeleton generation
|
||||
// implementation.
|
||||
//
|
||||
// Given a source character, produce the corresponding
|
||||
// replacement character(s), appending them to the dest string.
|
||||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
|
||||
|
||||
// Binary search the spoof data key table for the inChar
|
||||
int32_t *low = fSpoofData->fCFUKeys;
|
||||
int32_t *mid = NULL;
|
||||
int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
|
||||
UChar32 midc;
|
||||
do {
|
||||
int32_t delta = ((int32_t)(limit-low))/2;
|
||||
mid = low + delta;
|
||||
midc = *mid & 0x1fffff;
|
||||
if (inChar == midc) {
|
||||
goto foundChar;
|
||||
} else if (inChar < midc) {
|
||||
limit = mid;
|
||||
} else {
|
||||
low = mid;
|
||||
}
|
||||
} while (low < limit-1);
|
||||
mid = low;
|
||||
midc = *mid & 0x1fffff;
|
||||
if (inChar != midc) {
|
||||
// Char not found. It maps to itself.
|
||||
int i = 0;
|
||||
dest.append(inChar);
|
||||
return i;
|
||||
}
|
||||
foundChar:
|
||||
int32_t keyFlags = *mid & 0xff000000;
|
||||
if ((keyFlags & tableMask) == 0) {
|
||||
// We found the right key char, but the entry doesn't pertain to the
|
||||
// table we need. See if there is an adjacent key that does
|
||||
if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
|
||||
int32_t *altMid;
|
||||
for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
|
||||
keyFlags = *altMid & 0xff000000;
|
||||
if (keyFlags & tableMask) {
|
||||
mid = altMid;
|
||||
goto foundKey;
|
||||
}
|
||||
}
|
||||
for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
|
||||
keyFlags = *altMid & 0xff000000;
|
||||
if (keyFlags & tableMask) {
|
||||
mid = altMid;
|
||||
goto foundKey;
|
||||
}
|
||||
}
|
||||
}
|
||||
// No key entry for this char & table.
|
||||
// The input char maps to itself.
|
||||
int i = 0;
|
||||
dest.append(inChar);
|
||||
return i;
|
||||
}
|
||||
|
||||
foundKey:
|
||||
int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
|
||||
int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
|
||||
|
||||
// Value is either a UChar (for strings of length 1) or
|
||||
// an index into the string table (for longer strings)
|
||||
uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
|
||||
if (stringLen == 1) {
|
||||
dest.append((UChar)value);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// String length of 4 from the above lookup is used for all strings of length >= 4.
|
||||
// For these, get the real length from the string lengths table,
|
||||
// which maps string table indexes to lengths.
|
||||
// All strings of the same length are stored contiguously in the string table.
|
||||
// 'value' from the lookup above is the starting index for the desired string.
|
||||
|
||||
int32_t ix;
|
||||
if (stringLen == 4) {
|
||||
int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
|
||||
for (ix = 0; ix < stringLengthsLimit; ix++) {
|
||||
if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
|
||||
stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
|
||||
break;
|
||||
}
|
||||
}
|
||||
U_ASSERT(ix < stringLengthsLimit);
|
||||
}
|
||||
|
||||
U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
|
||||
UChar *src = &fSpoofData->fCFUStrings[value];
|
||||
dest.append(src, stringLen);
|
||||
return stringLen;
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------
|
||||
//
|
||||
// wholeScriptCheck()
|
||||
//
|
||||
// Input text is already normalized to NFD
|
||||
// Return the set of scripts, each of which can represent something that is
|
||||
// confusable with the input text. The script of the input text
|
||||
// is included; input consisting of characters from a single script will
|
||||
// always produce a result consisting of a set containing that script.
|
||||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
void SpoofImpl::wholeScriptCheck(
|
||||
const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
|
||||
|
||||
UTrie2 *table =
|
||||
(fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
|
||||
result->setAll();
|
||||
int32_t length = text.length();
|
||||
for (int32_t inputIdx=0; inputIdx < length;) {
|
||||
UChar32 c = text.char32At(inputIdx);
|
||||
inputIdx += U16_LENGTH(c);
|
||||
uint32_t index = utrie2_get32(table, c);
|
||||
if (index == 0) {
|
||||
// No confusables in another script for this char.
|
||||
// TODO: we should change the data to have sets with just the single script
|
||||
// bit for the script of this char. Gets rid of this special case.
|
||||
// Until then, grab the script from the char and intersect it with the set.
|
||||
UScriptCode cpScript = uscript_getScript(c, &status);
|
||||
U_ASSERT(cpScript > USCRIPT_INHERITED);
|
||||
result->intersect(cpScript, status);
|
||||
} else if (index == 1) {
|
||||
// Script == Common or Inherited. Nothing to do.
|
||||
} else {
|
||||
result->intersect(fSpoofData->fScriptSets[index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
|
||||
UnicodeSet allowedChars;
|
||||
UnicodeSet *tmpSet = NULL;
|
||||
|
@ -374,6 +247,137 @@ void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UEr
|
|||
}
|
||||
}
|
||||
|
||||
// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
|
||||
void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
|
||||
result.resetAll();
|
||||
result.setScriptExtensions(codePoint, status);
|
||||
if (U_FAILURE(status)) { return; }
|
||||
|
||||
// Section 5.1 step 1
|
||||
if (result.test(USCRIPT_HAN, status)) {
|
||||
result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
|
||||
result.set(USCRIPT_JAPANESE, status);
|
||||
result.set(USCRIPT_KOREAN, status);
|
||||
}
|
||||
if (result.test(USCRIPT_HIRAGANA, status)) {
|
||||
result.set(USCRIPT_JAPANESE, status);
|
||||
}
|
||||
if (result.test(USCRIPT_KATAKANA, status)) {
|
||||
result.set(USCRIPT_JAPANESE, status);
|
||||
}
|
||||
if (result.test(USCRIPT_HANGUL, status)) {
|
||||
result.set(USCRIPT_KOREAN, status);
|
||||
}
|
||||
if (result.test(USCRIPT_BOPOMOFO, status)) {
|
||||
result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
|
||||
}
|
||||
|
||||
// Section 5.1 step 2
|
||||
if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
|
||||
result.setAll();
|
||||
}
|
||||
}
|
||||
|
||||
// Computes the resolved script set for a string, according to UTS 39 section 5.1.
|
||||
void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
|
||||
getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
|
||||
}
|
||||
|
||||
// Computes the resolved script set for a string, omitting characters having the specified script.
|
||||
// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
|
||||
void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
|
||||
result.setAll();
|
||||
|
||||
ScriptSet temp;
|
||||
UChar32 codePoint;
|
||||
for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
|
||||
codePoint = input.char32At(i);
|
||||
|
||||
// Compute the augmented script set for the character
|
||||
getAugmentedScriptSet(codePoint, temp, status);
|
||||
if (U_FAILURE(status)) { return; }
|
||||
|
||||
// Intersect the augmented script set with the resolved script set, but only if the character doesn't
|
||||
// have the script specified in the function call
|
||||
if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
|
||||
result.intersect(temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Computes the set of numerics for a string, according to UTS 39 section 5.3.
|
||||
void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
|
||||
result.clear();
|
||||
|
||||
UChar32 codePoint;
|
||||
for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
|
||||
codePoint = input.char32At(i);
|
||||
|
||||
// Store a representative character for each kind of decimal digit
|
||||
if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
|
||||
// Store the zero character as a representative for comparison.
|
||||
// Unicode guarantees it is codePoint - value
|
||||
result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Computes the restriction level of a string, according to UTS 39 section 5.2.
|
||||
URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
|
||||
// Section 5.2 step 1:
|
||||
if (!fAllowedCharsSet->containsAll(input)) {
|
||||
return USPOOF_UNRESTRICTIVE;
|
||||
}
|
||||
|
||||
// Section 5.2 step 2
|
||||
// Java use a static UnicodeSet for this test. In C++, avoid the static variable
|
||||
// and just do a simple for loop.
|
||||
UBool allASCII = TRUE;
|
||||
for (int32_t i=0, length=input.length(); i<length; i++) {
|
||||
if (input.charAt(i) > 0x7f) {
|
||||
allASCII = FALSE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (allASCII) {
|
||||
return USPOOF_ASCII;
|
||||
}
|
||||
|
||||
// Section 5.2 steps 3:
|
||||
ScriptSet resolvedScriptSet;
|
||||
getResolvedScriptSet(input, resolvedScriptSet, status);
|
||||
if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
|
||||
|
||||
// Section 5.2 step 4:
|
||||
if (!resolvedScriptSet.isEmpty()) {
|
||||
return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
|
||||
}
|
||||
|
||||
// Section 5.2 step 5:
|
||||
ScriptSet resolvedNoLatn;
|
||||
getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
|
||||
if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
|
||||
|
||||
// Section 5.2 step 6:
|
||||
if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
|
||||
|| resolvedNoLatn.test(USCRIPT_JAPANESE, status)
|
||||
|| resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
|
||||
return USPOOF_HIGHLY_RESTRICTIVE;
|
||||
}
|
||||
|
||||
// Section 5.2 step 7:
|
||||
if (!resolvedNoLatn.isEmpty()
|
||||
&& !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
|
||||
&& !resolvedNoLatn.test(USCRIPT_GREEK, status)
|
||||
&& !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
|
||||
return USPOOF_MODERATELY_RESTRICTIVE;
|
||||
}
|
||||
|
||||
// Section 5.2 step 8:
|
||||
return USPOOF_MINIMALLY_RESTRICTIVE;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Convert a text format hex number. Utility function used by builder code. Static.
|
||||
// Input: UChar *string text. Output: a UChar32
|
||||
|
@ -406,55 +410,60 @@ UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorC
|
|||
return (UChar32)val;
|
||||
}
|
||||
|
||||
// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
|
||||
// Maintain a one-element cache, which is sufficient to avoid repeatedly
|
||||
// creating new ones unless we get multi-thread concurrency in spoof
|
||||
// check operations, which should be statistically uncommon.
|
||||
|
||||
// These functions are used in place of new & delete of an IdentifierInfo.
|
||||
// They will recycle the IdentifierInfo when possible.
|
||||
// They are logically const, and used within const functions that must be thread safe.
|
||||
IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
|
||||
IdentifierInfo *returnIdInfo = NULL;
|
||||
if (U_FAILURE(status)) {
|
||||
return returnIdInfo;
|
||||
}
|
||||
SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
|
||||
{
|
||||
Mutex m;
|
||||
returnIdInfo = nonConstThis->fCachedIdentifierInfo;
|
||||
nonConstThis->fCachedIdentifierInfo = NULL;
|
||||
}
|
||||
if (returnIdInfo == NULL) {
|
||||
returnIdInfo = new IdentifierInfo(status);
|
||||
if (U_SUCCESS(status) && returnIdInfo == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
if (U_FAILURE(status) && returnIdInfo != NULL) {
|
||||
delete returnIdInfo;
|
||||
returnIdInfo = NULL;
|
||||
}
|
||||
}
|
||||
return returnIdInfo;
|
||||
//-----------------------------------------
|
||||
//
|
||||
// class CheckResult Implementation
|
||||
//
|
||||
//-----------------------------------------
|
||||
|
||||
// Stamps the object with the CheckResult magic number and puts every result
// field into its cleared state.
CheckResult::CheckResult()
        : fMagic(USPOOF_CHECK_MAGIC) {
    this->clear();
}
|
||||
|
||||
// View this object through the opaque USpoofCheckResult handle type of the C API.
USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
    USpoofCheckResult *handle = reinterpret_cast<USpoofCheckResult*>(this);
    return handle;
}
|
||||
|
||||
void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
|
||||
if (idInfo != NULL) {
|
||||
SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
|
||||
{
|
||||
Mutex m;
|
||||
if (nonConstThis->fCachedIdentifierInfo == NULL) {
|
||||
nonConstThis->fCachedIdentifierInfo = idInfo;
|
||||
idInfo = NULL;
|
||||
}
|
||||
}
|
||||
delete idInfo;
|
||||
//
|
||||
// Incoming parameter check on Status and the CheckResult object
|
||||
// received from the C API.
|
||||
//
|
||||
const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) { return NULL; }
|
||||
if (ptr == NULL) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
CheckResult *This = (CheckResult*) ptr;
|
||||
if (This->fMagic != USPOOF_CHECK_MAGIC) {
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
return This;
|
||||
}
|
||||
|
||||
// Non-const flavor: runs the const validation and strips constness from the result.
CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
    const USpoofCheckResult *constPtr = ptr;
    const CheckResult *checked = CheckResult::validateThis(constPtr, status);
    return const_cast<CheckResult *>(checked);
}
|
||||
|
||||
// Returns the result fields to their empty state: no failed checks, no
// numerics, and an undefined restriction level. fMagic is left alone.
void CheckResult::clear() {
    fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
    fNumerics.clear();
    fChecks = 0;
}
|
||||
|
||||
// Folds this result into a single int32_t bit mask.
// The restriction level is OR-ed into the failed-check bits only when
// USPOOF_AUX_INFO is among enabledChecks and a restriction level was recorded.
int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
    if ((enabledChecks & USPOOF_AUX_INFO) == 0) {
        return fChecks;
    }
    if (fRestrictionLevel == USPOOF_UNDEFINED_RESTRICTIVE) {
        return fChecks;
    }
    return fChecks | fRestrictionLevel;
}
|
||||
|
||||
|
||||
|
||||
// Destructor. The body is intentionally empty: it performs no explicit cleanup.
CheckResult::~CheckResult() {
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -463,12 +472,14 @@ void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
|
|||
//----------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
|
||||
UBool SpoofData::validateDataVersion(UErrorCode &status) const {
|
||||
if (U_FAILURE(status) ||
|
||||
rawData == NULL ||
|
||||
rawData->fMagic != USPOOF_MAGIC ||
|
||||
rawData->fFormatVersion[0] > 1 ||
|
||||
rawData->fFormatVersion[1] > 0) {
|
||||
fRawData == NULL ||
|
||||
fRawData->fMagic != USPOOF_MAGIC ||
|
||||
fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
|
||||
fRawData->fFormatVersion[1] != 0 ||
|
||||
fRawData->fFormatVersion[2] != 0 ||
|
||||
fRawData->fFormatVersion[3] != 0) {
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -487,7 +498,7 @@ spoofDataIsAcceptable(void *context,
|
|||
pInfo->dataFormat[1] == 0x66 &&
|
||||
pInfo->dataFormat[2] == 0x75 &&
|
||||
pInfo->dataFormat[3] == 0x20 &&
|
||||
pInfo->formatVersion[0] == 1
|
||||
pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
|
||||
) {
|
||||
UVersionInfo *version = static_cast<UVersionInfo *>(context);
|
||||
if(version != NULL) {
|
||||
|
@ -499,32 +510,61 @@ spoofDataIsAcceptable(void *context,
|
|||
}
|
||||
}
|
||||
|
||||
// Methods for the loading of the default confusables data file. The confusable
|
||||
// data is loaded only when it is needed.
|
||||
//
|
||||
// SpoofData::getDefault() - return a wrapper around the spoof data that is
|
||||
// baked into the default ICU data.
|
||||
// SpoofData::getDefault() - Return the default confusables data, and call the
|
||||
// initOnce() if it is not available. Adds a reference
|
||||
// to the SpoofData that the caller is responsible for
|
||||
// decrementing when they are done with the data.
|
||||
//
|
||||
// Called once, from the initOnce() function in uspoof_impl.cpp; the resulting
|
||||
// SpoofData is shared by all spoof checkers using the default data.
|
||||
// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
|
||||
// is shared by all spoof checkers using the default data.
|
||||
//
|
||||
SpoofData *SpoofData::getDefault(UErrorCode &status) {
|
||||
// uspoof_cleanupDefaultData - Called during cleanup.
|
||||
//
|
||||
|
||||
static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
|
||||
static SpoofData* gDefaultSpoofData;
|
||||
|
||||
// Cleanup hook for the shared default confusables data.
// Drops this file's reference to the default SpoofData, forgets the pointer,
// and resets the init-once guard. Always returns TRUE.
static UBool U_CALLCONV
uspoof_cleanupDefaultData(void) {
    if (gDefaultSpoofData == NULL) {
        return TRUE;
    }
    // Will delete, assuming all user-level spoof checkers were closed.
    gDefaultSpoofData->removeReference();
    gDefaultSpoofData = NULL;
    gSpoofInitDefaultOnce.reset();
    return TRUE;
}
|
||||
|
||||
static void uspoof_loadDefaultData(UErrorCode& status) {
|
||||
UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
|
||||
spoofDataIsAcceptable,
|
||||
NULL, // context, would receive dataVersion if supplied.
|
||||
&status);
|
||||
if (U_FAILURE(status)) { return; }
|
||||
gDefaultSpoofData = new SpoofData(udm, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
delete gDefaultSpoofData;
|
||||
return;
|
||||
}
|
||||
SpoofData *This = new SpoofData(udm, status);
|
||||
if (U_FAILURE(status)) {
|
||||
delete This;
|
||||
return NULL;
|
||||
}
|
||||
if (This == NULL) {
|
||||
if (gDefaultSpoofData == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
return This;
|
||||
ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
|
||||
}
|
||||
|
||||
// Returns the shared default confusables data, running the one-time loader
// first. On success a reference is added on behalf of the caller; on failure
// NULL is returned and status holds the error.
SpoofData* SpoofData::getDefault(UErrorCode& status) {
    umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);

    SpoofData *shared = NULL;
    if (U_SUCCESS(status)) {
        gDefaultSpoofData->addReference();
        shared = gDefaultSpoofData;
    }
    return shared;
}
|
||||
|
||||
|
||||
|
||||
SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
|
||||
{
|
||||
reset();
|
||||
|
@ -535,7 +575,7 @@ SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
|
|||
// fRawData is non-const because it may be constructed by the data builder.
|
||||
fRawData = reinterpret_cast<SpoofDataHeader *>(
|
||||
const_cast<void *>(udata_getMemory(udm)));
|
||||
validateDataVersion(fRawData, status);
|
||||
validateDataVersion(status);
|
||||
initPtrs(status);
|
||||
}
|
||||
|
||||
|
@ -556,7 +596,7 @@ SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
|
|||
status = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
validateDataVersion(fRawData, status);
|
||||
validateDataVersion(status);
|
||||
initPtrs(status);
|
||||
}
|
||||
|
||||
|
@ -584,7 +624,7 @@ SpoofData::SpoofData(UErrorCode &status) {
|
|||
uprv_memset(fRawData, 0, initialSize);
|
||||
|
||||
fRawData->fMagic = USPOOF_MAGIC;
|
||||
fRawData->fFormatVersion[0] = 1;
|
||||
fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
|
||||
fRawData->fFormatVersion[1] = 0;
|
||||
fRawData->fFormatVersion[2] = 0;
|
||||
fRawData->fFormatVersion[3] = 0;
|
||||
|
@ -602,11 +642,7 @@ void SpoofData::reset() {
|
|||
fRefCount = 1;
|
||||
fCFUKeys = NULL;
|
||||
fCFUValues = NULL;
|
||||
fCFUStringLengths = NULL;
|
||||
fCFUStrings = NULL;
|
||||
fAnyCaseTrie = NULL;
|
||||
fLowerCaseTrie = NULL;
|
||||
fScriptSets = NULL;
|
||||
}
|
||||
|
||||
|
||||
|
@ -628,7 +664,6 @@ void SpoofData::reset() {
|
|||
void SpoofData::initPtrs(UErrorCode &status) {
|
||||
fCFUKeys = NULL;
|
||||
fCFUValues = NULL;
|
||||
fCFUStringLengths = NULL;
|
||||
fCFUStrings = NULL;
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
|
@ -639,33 +674,13 @@ void SpoofData::initPtrs(UErrorCode &status) {
|
|||
if (fRawData->fCFUStringIndex != 0) {
|
||||
fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
|
||||
}
|
||||
if (fRawData->fCFUStringLengths != 0) {
|
||||
fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
|
||||
}
|
||||
if (fRawData->fCFUStringTable != 0) {
|
||||
fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
|
||||
}
|
||||
|
||||
if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {
|
||||
fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
|
||||
(char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
|
||||
}
|
||||
if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {
|
||||
fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
|
||||
(char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
|
||||
}
|
||||
|
||||
if (fRawData->fScriptSets != 0) {
|
||||
fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
SpoofData::~SpoofData() {
|
||||
utrie2_close(fAnyCaseTrie);
|
||||
fAnyCaseTrie = NULL;
|
||||
utrie2_close(fLowerCaseTrie);
|
||||
fLowerCaseTrie = NULL;
|
||||
if (fDataOwned) {
|
||||
uprv_free(fRawData);
|
||||
}
|
||||
|
@ -710,6 +725,78 @@ void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
|
|||
return (char *)fRawData + returnOffset;
|
||||
}
|
||||
|
||||
// Copies the raw spoof data into buf.
// Returns the number of bytes the data occupies. If capacity is too small,
// nothing is copied and status is set to U_BUFFER_OVERFLOW_ERROR.
int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
    const int32_t needed = fRawData->fLength;
    if (needed <= capacity) {
        uprv_memcpy(buf, fRawData, needed);
    } else {
        status = U_BUFFER_OVERFLOW_ERROR;
    }
    return needed;
}
|
||||
|
||||
// Size of the raw spoof data, as recorded in the fLength field of its header.
int32_t SpoofData::size() const {
    const int32_t byteLength = fRawData->fLength;
    return byteLength;
}
|
||||
|
||||
//-------------------------------
|
||||
//
|
||||
// Front-end APIs for SpoofData
|
||||
//
|
||||
//-------------------------------
|
||||
|
||||
// Looks up inChar in the confusable key table and appends its replacement
// to dest. A character with no entry maps to itself.
//
// @param inChar  the code point to look up.
// @param dest    receives the appended replacement text.
// @return        the value returned by appendValueTo() for a found entry,
//                or 1 when inChar had no entry and was appended unchanged.
int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
    // Perform a binary search.
    // [lo, hi), i.e., lo is inclusive, hi is exclusive.
    // The result after the loop will be in lo.
    int32_t lo = 0;
    int32_t hi = length();

    // An empty key table has no entry to compare against; without this guard
    // the loop below would read the key at index 0.
    if (hi <= 0) {
        dest.append(inChar);
        return 1;
    }

    do {
        int32_t mid = (lo + hi) / 2;
        UChar32 midc = codePointAt(mid);
        if (midc > inChar) {
            hi = mid;
        } else if (midc < inChar) {
            lo = mid;
        } else {
            // Found result.  Break early.
            lo = mid;
            break;
        }
    } while (hi - lo > 1);

    // Did we find an entry?  If not, the char maps to itself.
    if (codePointAt(lo) != inChar) {
        dest.append(inChar);
        return 1;
    }

    // Add the element to the string builder and return.
    return appendValueTo(lo, dest);
}
|
||||
|
||||
// Number of entries in the confusable key table (the header's fCFUKeysSize).
int32_t SpoofData::length() const {
    const int32_t keyCount = fRawData->fCFUKeysSize;
    return keyCount;
}
|
||||
|
||||
// Code point stored in the key table entry at the given index.
UChar32 SpoofData::codePointAt(int32_t index) const {
    const UChar32 cp = ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
    return cp;
}
|
||||
|
||||
// Appends the replacement string of the key table entry at 'index' to dest
// and returns that string's length as encoded in the key.
int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
    const int32_t valueLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);

    // The value slot holds the character itself for a one-unit string, and
    // otherwise an offset into the string table.
    const uint16_t slot = fCFUValues[index];
    if (valueLength != 1) {
        dest.append(fCFUStrings + slot, valueLength);
    } else {
        dest.append((UChar)slot);
    }

    return valueLength;
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
@ -741,7 +828,10 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou
|
|||
pInfo->dataFormat[1]==0x66 &&
|
||||
pInfo->dataFormat[2]==0x75 &&
|
||||
pInfo->dataFormat[3]==0x20 &&
|
||||
pInfo->formatVersion[0]==1 )) {
|
||||
pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
|
||||
pInfo->formatVersion[1]==0 &&
|
||||
pInfo->formatVersion[2]==0 &&
|
||||
pInfo->formatVersion[3]==0 )) {
|
||||
udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
|
||||
"(format version %02x %02x %02x %02x) is not recognized\n",
|
||||
pInfo->dataFormat[0], pInfo->dataFormat[1],
|
||||
|
@ -830,26 +920,6 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou
|
|||
sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
|
||||
ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
|
||||
|
||||
// String Lengths Section
|
||||
sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths);
|
||||
sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
|
||||
ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
|
||||
|
||||
// Any Case Trie
|
||||
sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie);
|
||||
sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
|
||||
utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
|
||||
|
||||
// Lower Case Trie
|
||||
sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie);
|
||||
sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
|
||||
utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
|
||||
|
||||
// Script Sets. The data is an array of int32_t
|
||||
sectionStart = ds->readUInt32(spoofDH->fScriptSets);
|
||||
sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
|
||||
ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
|
||||
|
||||
// And, last, swap the header itself.
|
||||
// int32_t fMagic // swap this
|
||||
// uint8_t fFormatVersion[4] // Do not swap this, just copy
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#ifndef USPOOFIM_H
|
||||
#define USPOOFIM_H
|
||||
|
||||
#include "uassert.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "unicode/uscript.h"
|
||||
|
@ -39,11 +40,13 @@ U_NAMESPACE_BEGIN
|
|||
// Magic number for sanity checking spoof data.
|
||||
#define USPOOF_MAGIC 0x3845fdef
|
||||
|
||||
class IdentifierInfo;
|
||||
// Magic number for sanity checking spoof checkers.
|
||||
#define USPOOF_CHECK_MAGIC 0x2734ecde
|
||||
|
||||
class ScriptSet;
|
||||
class SpoofData;
|
||||
struct SpoofDataHeader;
|
||||
struct SpoofStringLengthsElement;
|
||||
class ConfusableDataUtils;
|
||||
|
||||
/**
|
||||
* Class SpoofImpl corresponds directly to the plain C API opaque type
|
||||
|
@ -51,25 +54,20 @@ struct SpoofStringLengthsElement;
|
|||
*/
|
||||
class SpoofImpl : public UObject {
|
||||
public:
|
||||
SpoofImpl(SpoofData *data, UErrorCode &status);
|
||||
SpoofImpl();
|
||||
virtual ~SpoofImpl();
|
||||
SpoofImpl(SpoofData *data, UErrorCode& status);
|
||||
SpoofImpl(UErrorCode& status);
|
||||
SpoofImpl();
|
||||
void construct(UErrorCode& status);
|
||||
virtual ~SpoofImpl();
|
||||
|
||||
/** Copy constructor, used by the user level uspoof_clone() function.
|
||||
*/
|
||||
SpoofImpl(const SpoofImpl &src, UErrorCode &status);
|
||||
|
||||
USpoofChecker *asUSpoofChecker();
|
||||
static SpoofImpl *validateThis(USpoofChecker *sc, UErrorCode &status);
|
||||
static const SpoofImpl *validateThis(const USpoofChecker *sc, UErrorCode &status);
|
||||
|
||||
/** Get the confusable skeleton transform for a single code point.
|
||||
* The result is a string with a length between 1 and 18.
|
||||
* @param tableMask bit flag specifying which confusable table to use.
|
||||
* One of USPOOF_SL_TABLE_FLAG, USPOOF_MA_TABLE_FLAG, etc.
|
||||
* @return The length in UTF-16 code units of the substitution string.
|
||||
*/
|
||||
int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &destBuf) const;
|
||||
|
||||
/** Set and Get AllowedLocales, implementations of the corresponding API */
|
||||
void setAllowedLocales(const char *localesList, UErrorCode &status);
|
||||
const char * getAllowedLocales(UErrorCode &status);
|
||||
|
@ -78,26 +76,19 @@ public:
|
|||
// the specified locale. Part of the implementation of setAllowedLocales.
|
||||
void addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status);
|
||||
|
||||
// Functions implementing the features of UTS 39 section 5.
|
||||
static void getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status);
|
||||
void getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const;
|
||||
void getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const;
|
||||
void getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& status) const;
|
||||
URestrictionLevel getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const;
|
||||
|
||||
/** Parse a hex number. Utility used by the builders. */
|
||||
static UChar32 ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status);
|
||||
|
||||
// Implementation for Whole Script tests.
|
||||
// Return the test bit flag to be ORed into the eventual user return value
|
||||
// if a Spoof opportunity is detected.
|
||||
void wholeScriptCheck(
|
||||
const UnicodeString &text, ScriptSet *result, UErrorCode &status) const;
|
||||
|
||||
static UClassID U_EXPORT2 getStaticClassID(void);
|
||||
virtual UClassID getDynamicClassID(void) const;
|
||||
|
||||
// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
|
||||
// Maintain a one-element cache, which is sufficient to avoid repeatedly
|
||||
// creating new ones unless we get multi-thread concurrency in spoof
|
||||
// check operations, which should be statistically uncommon.
|
||||
IdentifierInfo *getIdentifierInfo(UErrorCode &status) const;
|
||||
void releaseIdentifierInfo(IdentifierInfo *idInfo) const;
|
||||
|
||||
//
|
||||
// Data Members
|
||||
//
|
||||
|
@ -108,14 +99,36 @@ public:
|
|||
SpoofData *fSpoofData;
|
||||
|
||||
const UnicodeSet *fAllowedCharsSet; // The UnicodeSet of allowed characters.
|
||||
// for this Spoof Checker. Defaults to all chars.
|
||||
// for this Spoof Checker. Defaults to all chars.
|
||||
|
||||
const char *fAllowedLocales; // The list of allowed locales.
|
||||
URestrictionLevel fRestrictionLevel; // The maximum restriction level for an acceptable identifier.
|
||||
|
||||
IdentifierInfo *fCachedIdentifierInfo; // Do not use directly. See getIdentifierInfo().:w
|
||||
};
|
||||
|
||||
/**
|
||||
* Class CheckResult corresponds directly to the plain C API opaque type
|
||||
* USpoofCheckResult. One can be cast to the other.
|
||||
*/
|
||||
class CheckResult : public UObject {
|
||||
public:
|
||||
// Construction and destruction.
CheckResult();
|
||||
virtual ~CheckResult();
|
||||
|
||||
// Conversions between this class and the opaque C API handle.
// The validateThis() overloads take a UErrorCode and return a CheckResult pointer.
USpoofCheckResult *asUSpoofCheckResult();
|
||||
static CheckResult *validateThis(USpoofCheckResult *ptr, UErrorCode &status);
|
||||
static const CheckResult *validateThis(const USpoofCheckResult *ptr, UErrorCode &status);
|
||||
|
||||
// Resets the result fields.
void clear();
|
||||
|
||||
// Used to convert this CheckResult to the older int32_t return value API
|
||||
// NOTE(review): the parameter is named expectedChecks here; confirm it matches the definition.
int32_t toCombinedBitmask(int32_t expectedChecks);
|
||||
|
||||
// Data Members (all stack-allocated)
|
||||
int32_t fMagic; // Internal sanity check.
|
||||
int32_t fChecks; // Bit vector of checks that were failed.
|
||||
UnicodeSet fNumerics; // Set of numerics found in the string.
|
||||
URestrictionLevel fRestrictionLevel; // The restriction level of the string.
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
|
@ -127,14 +140,7 @@ public:
|
|||
//
|
||||
// The keys are stored as a sorted array of 32 bit ints.
|
||||
// bits 0-23 a code point value
|
||||
// bits 24-31 flags
|
||||
// 24: 1 if entry applies to SL table
|
||||
// 25: 1 if entry applies to SA table
|
||||
// 26: 1 if entry applies to ML table
|
||||
// 27: 1 if entry applies to MA table
|
||||
// 28: 1 if there are multiple entries for this code point.
|
||||
// 29-30: length of value string, in UChars.
|
||||
// values are (1, 2, 3, other)
|
||||
// bits 24-31 length of value string, in UChars (between 1 and 256 UChars).
|
||||
// The key table is sorted in ascending code point order. (not on the
|
||||
// 32 bit int value, the flag bits do not participate in the sorting.)
|
||||
//
|
||||
|
@ -154,33 +160,25 @@ public:
|
|||
//
|
||||
// There is no nul character or other mark between adjacent strings.
|
||||
//
|
||||
// String Lengths table
|
||||
// The length of strings from 1 to 3 is flagged in the key table.
|
||||
// For strings of length 4 or longer, the string length table provides a
|
||||
// mapping between an index into the string table and the corresponding length.
|
||||
// Strings of these lengths are rare, so lookup time is not an issue.
|
||||
// Each entry consists of
|
||||
// uint16_t index of the _last_ string with this length
|
||||
// uint16_t the length
|
||||
//
|
||||
|
||||
// Flag bits in the Key entries
|
||||
#define USPOOF_SL_TABLE_FLAG (1<<24)
|
||||
#define USPOOF_SA_TABLE_FLAG (1<<25)
|
||||
#define USPOOF_ML_TABLE_FLAG (1<<26)
|
||||
#define USPOOF_MA_TABLE_FLAG (1<<27)
|
||||
#define USPOOF_KEY_MULTIPLE_VALUES (1<<28)
|
||||
#define USPOOF_KEY_LENGTH_SHIFT 29
|
||||
#define USPOOF_KEY_LENGTH_FIELD(x) (((x)>>29) & 3)
|
||||
|
||||
|
||||
struct SpoofStringLengthsElement {
|
||||
uint16_t fLastString; // index in string table of last string with this length
|
||||
uint16_t fStrLength; // Length of strings
|
||||
// Internal functions for manipulating confusable data table keys
|
||||
#define USPOOF_CONFUSABLE_DATA_FORMAT_VERSION 2 // version for ICU 58
|
||||
class ConfusableDataUtils {
|
||||
public:
|
||||
inline static UChar32 keyToCodePoint(int32_t key) {
|
||||
return key & 0x00ffffff;
|
||||
}
|
||||
inline static int32_t keyToLength(int32_t key) {
|
||||
return ((key & 0xff000000) >> 24) + 1;
|
||||
}
|
||||
inline static int32_t codePointAndLengthToKey(UChar32 codePoint, int32_t length) {
|
||||
U_ASSERT((codePoint & 0x00ffffff) == codePoint);
|
||||
U_ASSERT(length <= 256);
|
||||
return codePoint | ((length - 1) << 24);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
//
|
||||
// SpoofData
|
||||
|
@ -197,7 +195,9 @@ struct SpoofStringLengthsElement {
|
|||
//---------------------------------------------------------------------------------------
|
||||
class SpoofData: public UMemory {
|
||||
public:
|
||||
static SpoofData *getDefault(UErrorCode &status); // Load standard ICU spoof data.
|
||||
static SpoofData* getDefault(UErrorCode &status); // Get standard ICU spoof data.
|
||||
static void releaseDefault(); // Cleanup reference to default spoof data.
|
||||
|
||||
SpoofData(UErrorCode &status); // Create new spoof data wrapper.
|
||||
// Only used when building new data from rules.
|
||||
|
||||
|
@ -212,7 +212,8 @@ class SpoofData: public UMemory {
|
|||
|
||||
// Check raw Spoof Data Version compatibility.
|
||||
// Return TRUE if it looks good.
|
||||
static UBool validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status);
|
||||
UBool validateDataVersion(UErrorCode &status) const;
|
||||
|
||||
~SpoofData(); // Destructor not normally used.
|
||||
// Use removeReference() instead.
|
||||
// Reference Counting functions.
|
||||
|
@ -222,6 +223,35 @@ class SpoofData: public UMemory {
|
|||
SpoofData *addReference();
|
||||
void removeReference();
|
||||
|
||||
// Reset all fields to an initial state.
|
||||
// Called from the top of all constructors.
|
||||
void reset();
|
||||
|
||||
// Copy this instance's raw data buffer to the specified address.
|
||||
int32_t serialize(void *buf, int32_t capacity, UErrorCode &status) const;
|
||||
|
||||
// Get the total number of bytes of data backed by this SpoofData.
|
||||
// Not to be confused with length, which returns the number of confusable entries.
|
||||
int32_t size() const;
|
||||
|
||||
// Get the confusable skeleton transform for a single code point.
|
||||
// The result is a string with a length between 1 and 18 as of Unicode 9.
|
||||
// This is the main public endpoint for this class.
|
||||
// @return The length in UTF-16 code units of the substitution string.
|
||||
int32_t confusableLookup(UChar32 inChar, UnicodeString &dest) const;
|
||||
|
||||
// Get the number of confusable entries in this SpoofData.
|
||||
int32_t length() const;
|
||||
|
||||
// Get the code point (key) at the specified index.
|
||||
UChar32 codePointAt(int32_t index) const;
|
||||
|
||||
// Get the confusable skeleton (value) at the specified index.
|
||||
// Append it to the specified UnicodeString&.
|
||||
// @return The length in UTF-16 code units of the skeleton string.
|
||||
int32_t appendValueTo(int32_t index, UnicodeString& dest) const;
|
||||
|
||||
private:
|
||||
// Reserve space in the raw data. For use by builder when putting together a
|
||||
// new set of data. Init the new storage to zero, to prevent inconsistent
|
||||
// results if it is not all otherwise set by the requester.
|
||||
|
@ -232,10 +262,6 @@ class SpoofData: public UMemory {
|
|||
// initialize the pointers from this object to the raw data.
|
||||
void initPtrs(UErrorCode &status);
|
||||
|
||||
// Reset all fields to an initial state.
|
||||
// Called from the top of all constructors.
|
||||
void reset();
|
||||
|
||||
SpoofDataHeader *fRawData; // Ptr to the raw memory-mapped data
|
||||
UBool fDataOwned; // True if the raw data is owned, and needs
|
||||
// to be deleted when refcount goes to zero.
|
||||
|
@ -249,15 +275,10 @@ class SpoofData: public UMemory {
|
|||
// Confusable data
|
||||
int32_t *fCFUKeys;
|
||||
uint16_t *fCFUValues;
|
||||
SpoofStringLengthsElement *fCFUStringLengths;
|
||||
UChar *fCFUStrings;
|
||||
|
||||
// Whole Script Confusable Data
|
||||
UTrie2 *fAnyCaseTrie;
|
||||
UTrie2 *fLowerCaseTrie;
|
||||
ScriptSet *fScriptSets;
|
||||
};
|
||||
|
||||
friend class ConfusabledataBuilder;
|
||||
};
|
||||
|
||||
//---------------------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -286,49 +307,13 @@ struct SpoofDataHeader {
|
|||
int32_t fCFUStringTable; // byte offset of String table
|
||||
int32_t fCFUStringTableLen; // length of string table (in 16 bit UChars)
|
||||
|
||||
int32_t fCFUStringLengths; // byte offset to String Lengths table
|
||||
int32_t fCFUStringLengthsSize; // number of entries in lengths table. (2 x 16 bits each)
|
||||
|
||||
|
||||
// The following sections are for data from confusablesWholeScript.txt
|
||||
|
||||
int32_t fAnyCaseTrie; // byte offset to the serialized Any Case Trie
|
||||
int32_t fAnyCaseTrieLength; // Length (bytes) of the serialized Any Case Trie
|
||||
|
||||
int32_t fLowerCaseTrie; // byte offset to the serialized Lower Case Trie
|
||||
int32_t fLowerCaseTrieLength; // Length (bytes) of the serialized Lower Case Trie
|
||||
|
||||
int32_t fScriptSets; // byte offset to array of ScriptSets
|
||||
int32_t fScriptSetsLength; // Number of ScriptSets (24 bytes each)
|
||||
|
||||
|
||||
// The following sections are for data from xidmodifications.txt
|
||||
|
||||
|
||||
|
||||
int32_t unused[15]; // Padding, Room for Expansion
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Structure for the Whole Script Confusable Data
|
||||
// See Unicode UAX-39, Unicode Security Mechanisms, for a description of the
|
||||
// Whole Script confusable data
|
||||
//
|
||||
// The data provides mappings from code points to a set of scripts
|
||||
// that contain characters that might be confused with the code point.
|
||||
// There are two mappings, one for lower case only, and one for characters
|
||||
// of any case.
|
||||
//
|
||||
// The actual data consists of a utrie2 to map from a code point to an offset,
|
||||
// and an array of UScriptSets (essentially bit maps) that is indexed
|
||||
// by the offsets obtained from the Trie.
|
||||
//
|
||||
//
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif /* __cplusplus */
|
||||
|
||||
|
|
|
@ -222,7 +222,7 @@ static void TestUSpoofCAPI(void) {
|
|||
|
||||
checkResults = uspoof_check(sc2, scMixed, -1, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT, checkResults);
|
||||
|
||||
uspoof_close(sc2);
|
||||
free(buf);
|
||||
|
@ -299,7 +299,7 @@ static void TestUSpoofCAPI(void) {
|
|||
|
||||
checkResults = uspoof_check(clone2, scMixed, -1, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT, checkResults);
|
||||
uspoof_close(clone2);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
|
@ -318,7 +318,7 @@ static void TestUSpoofCAPI(void) {
|
|||
|
||||
result = uspoof_check(sc, scMixed, -1, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, result);
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT, result);
|
||||
TEST_TEARDOWN
|
||||
|
||||
|
||||
|
@ -428,7 +428,7 @@ static void TestUSpoofCAPI(void) {
|
|||
|
||||
checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
|
||||
TEST_ASSERT_EQ(0, checkResults);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/*
|
||||
|
@ -436,7 +436,7 @@ static void TestUSpoofCAPI(void) {
|
|||
*/
|
||||
TEST_SETUP
|
||||
char utf8buf[200];
|
||||
int32_t checkResults;
|
||||
int32_t checkResults, checkResults2;
|
||||
int32_t position;
|
||||
|
||||
u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodLatin, -1, &status);
|
||||
|
@ -457,12 +457,61 @@ static void TestUSpoofCAPI(void) {
|
|||
TEST_ASSERT_SUCCESS(status);
|
||||
position = 666;
|
||||
checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
|
||||
checkResults2 = uspoof_check(sc, scMixed, -1, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults);
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT , checkResults);
|
||||
TEST_ASSERT_EQ(0, position);
|
||||
TEST_ASSERT_EQ(checkResults , checkResults2);
|
||||
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/*
|
||||
* uspoof_check2 variants
|
||||
*/
|
||||
TEST_SETUP
|
||||
int32_t result1, result2;
|
||||
char utf8buf[200];
|
||||
uspoof_setChecks(sc, USPOOF_ALL_CHECKS | USPOOF_AUX_INFO, &status);
|
||||
USpoofCheckResult* checkResult = uspoof_openCheckResult(&status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
const UChar* tests[] = { goodLatin, scMixed, scLatin,
|
||||
goodCyrl, goodGreek, lll_Latin_a, lll_Latin_b, han_Hiragana };
|
||||
|
||||
for (int32_t i=0; i<sizeof(tests)/sizeof(UChar*); i++) {
|
||||
const UChar* str = tests[i];
|
||||
|
||||
// Basic test
|
||||
result1 = uspoof_check(sc, str, -1, NULL, &status);
|
||||
result2 = uspoof_check2(sc, str, -1, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(result1, result2);
|
||||
|
||||
// With check result parameter
|
||||
result1 = uspoof_check(sc, str, -1, NULL, &status);
|
||||
result2 = uspoof_check2(sc, str, -1, checkResult, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(result1, result2);
|
||||
|
||||
// Checks from checkResult should be same as those from bitmask
|
||||
TEST_ASSERT_EQ(result1 & USPOOF_ALL_CHECKS, uspoof_getCheckResultChecks(checkResult, &status));
|
||||
|
||||
// Restriction level from checkResult should be same as that from bitmask
|
||||
URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult, &status);
|
||||
TEST_ASSERT_EQ(result1 & restrictionLevel, restrictionLevel);
|
||||
|
||||
// UTF8 endpoint
|
||||
u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodLatin, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
result1 = uspoof_checkUTF8(sc, utf8buf, -1, NULL, &status);
|
||||
result2 = uspoof_check2UTF8(sc, utf8buf, -1, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(result1, result2);
|
||||
}
|
||||
|
||||
uspoof_closeCheckResult(checkResult);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/*
|
||||
* uspoof_areConfusable()
|
||||
*/
|
||||
|
|
|
@ -803,8 +803,7 @@ group: charset_detector
|
|||
uclean_i18n
|
||||
|
||||
group: spoof_detection
|
||||
uspoof.o uspoof_build.o uspoof_conf.o uspoof_impl.o uspoof_wsconf.o
|
||||
identifier_info.o scriptset.o
|
||||
uspoof.o uspoof_build.o uspoof_conf.o uspoof_impl.o scriptset.o
|
||||
deps
|
||||
uniset_props regex unorm uscript
|
||||
|
||||
|
|
|
@ -23,7 +23,6 @@
|
|||
#include "unicode/uspoof.h"
|
||||
|
||||
#include "cstring.h"
|
||||
#include "identifier_info.h"
|
||||
#include "scriptset.h"
|
||||
#include "uhash.h"
|
||||
|
||||
|
@ -58,11 +57,15 @@
|
|||
USpoofChecker *sc; \
|
||||
sc = uspoof_open(&status); \
|
||||
TEST_ASSERT_SUCCESS(status); \
|
||||
USpoofCheckResult *checkResult; \
|
||||
checkResult = uspoof_openCheckResult(&status); \
|
||||
TEST_ASSERT_SUCCESS(status); \
|
||||
if (U_SUCCESS(status)){
|
||||
|
||||
#define TEST_TEARDOWN \
|
||||
} \
|
||||
TEST_ASSERT_SUCCESS(status); \
|
||||
uspoof_closeCheckResult(checkResult); \
|
||||
uspoof_close(sc); \
|
||||
}
|
||||
|
||||
|
@ -81,7 +84,6 @@ void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
|
|||
TESTCASE_AUTO(testInvisible);
|
||||
TESTCASE_AUTO(testConfData);
|
||||
TESTCASE_AUTO(testBug8654);
|
||||
TESTCASE_AUTO(testIdentifierInfo);
|
||||
TESTCASE_AUTO(testScriptSet);
|
||||
TESTCASE_AUTO(testRestrictionLevel);
|
||||
TESTCASE_AUTO(testMixedNumbers);
|
||||
|
@ -105,6 +107,7 @@ void IntlTestSpoof::testSpoofAPI() {
|
|||
UnicodeString s1("cxs");
|
||||
UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs"
|
||||
int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
|
||||
|
||||
TEST_TEARDOWN;
|
||||
|
@ -223,8 +226,9 @@ void IntlTestSpoof::testAreConfusable() {
|
|||
"A long string that will overflow stack buffers. A long string that will overflow stack buffers. ");
|
||||
UnicodeString s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "
|
||||
"A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. ");
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));
|
||||
int32_t result = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, result);
|
||||
|
||||
TEST_TEARDOWN;
|
||||
}
|
||||
|
@ -398,146 +402,6 @@ void IntlTestSpoof::testConfData() {
|
|||
}
|
||||
}
|
||||
|
||||
// testIdentifierInfo. Note that IdentifierInfo is not public ICU API at this time
|
||||
void IntlTestSpoof::testIdentifierInfo() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status);
|
||||
ScriptSet bitset2; bitset2.set(USCRIPT_HANGUL, status);
|
||||
TEST_ASSERT(bitset12.contains(bitset2));
|
||||
TEST_ASSERT(bitset12.contains(bitset12));
|
||||
TEST_ASSERT(!bitset2.contains(bitset12));
|
||||
|
||||
ScriptSet arabSet; arabSet.set(USCRIPT_ARABIC, status);
|
||||
ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status);
|
||||
UElement arabEl; arabEl.pointer = &arabSet;
|
||||
UElement latinEl; latinEl.pointer = &latinSet;
|
||||
TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0);
|
||||
TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0);
|
||||
|
||||
UnicodeString scriptString;
|
||||
bitset12.displayScripts(scriptString);
|
||||
TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
UHashtable *alternates = uhash_open(uhash_hashScriptSet ,uhash_compareScriptSet, NULL, &status);
|
||||
uhash_puti(alternates, &bitset12, 1, &status);
|
||||
uhash_puti(alternates, &bitset2, 1, &status);
|
||||
UnicodeString alternatesString;
|
||||
IdentifierInfo::displayAlternates(alternatesString, alternates, status);
|
||||
TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang; Hang Latn") == alternatesString);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
ScriptSet tScriptSet;
|
||||
tScriptSet.parseScripts(scriptString, status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(bitset12 == tScriptSet);
|
||||
UnicodeString ss;
|
||||
ss.remove();
|
||||
uhash_close(alternates);
|
||||
|
||||
struct Test {
|
||||
const char *fTestString;
|
||||
URestrictionLevel fRestrictionLevel;
|
||||
const char *fNumerics;
|
||||
const char *fScripts;
|
||||
const char *fAlternates;
|
||||
const char *fCommonAlternates;
|
||||
} tests[] = {
|
||||
{"\\u0061\\u2665", USPOOF_UNRESTRICTIVE, "[]", "Latn", "", ""},
|
||||
{"\\u0061\\u303C", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"},
|
||||
{"\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hira Kana", "Hira Kana"},
|
||||
{"\\u0061\\u30FC\\u303C\\u30A2", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
|
||||
{"\\u30A2\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
|
||||
{"\\u0061\\u0031\\u0661", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660]", "Latn", "Arab Thaa", "Arab Thaa"},
|
||||
{"\\u0061\\u0031\\u0661\\u06F1", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660\\u06F0]", "Latn Arab", "", ""},
|
||||
{"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_UNRESTRICTIVE,
|
||||
"[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
|
||||
{"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", USPOOF_UNRESTRICTIVE,
|
||||
"[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}
|
||||
};
|
||||
|
||||
int testNum;
|
||||
for (testNum = 0; testNum < UPRV_LENGTHOF(tests); testNum++) {
|
||||
char testNumStr[40];
|
||||
sprintf(testNumStr, "testNum = %d", testNum);
|
||||
Test &test = tests[testNum];
|
||||
status = U_ZERO_ERROR;
|
||||
UnicodeString testString(test.fTestString); // Note: may do charset conversion.
|
||||
testString = testString.unescape();
|
||||
IdentifierInfo idInfo(status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
UnicodeSet allowedChars;
|
||||
// Allowed Identifier Characters. In addition to the Recommended Set,
|
||||
// allow u303c, which has an interesting script extension of Hani Hira Kana.
|
||||
allowedChars.addAll(*uspoof_getRecommendedUnicodeSet(&status)).add(0x303C);
|
||||
idInfo.setIdentifierProfile(allowedChars);
|
||||
idInfo.setIdentifier(testString, status);
|
||||
TEST_ASSERT_MSG(*idInfo.getIdentifier() == testString, testNumStr);
|
||||
|
||||
URestrictionLevel restrictionLevel = test.fRestrictionLevel;
|
||||
TEST_ASSERT_MSG(restrictionLevel == idInfo.getRestrictionLevel(status), testNumStr);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
UnicodeSet numerics(UnicodeString(test.fNumerics).unescape(), status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_MSG(numerics == *idInfo.getNumerics(), testNumStr);
|
||||
|
||||
ScriptSet scripts;
|
||||
scripts.parseScripts(UnicodeString(test.fScripts), status);
|
||||
TEST_ASSERT_MSG(scripts == *idInfo.getScripts(), testNumStr);
|
||||
|
||||
UnicodeString alternatesStr;
|
||||
IdentifierInfo::displayAlternates(alternatesStr, idInfo.getAlternates(), status);
|
||||
TEST_ASSERT_MSG(UnicodeString(test.fAlternates) == alternatesStr, testNumStr);
|
||||
|
||||
ScriptSet commonAlternates;
|
||||
commonAlternates.parseScripts(UnicodeString(test.fCommonAlternates), status);
|
||||
TEST_ASSERT_MSG(commonAlternates == *idInfo.getCommonAmongAlternates(), testNumStr);
|
||||
}
|
||||
|
||||
// Test of getScriptCount()
|
||||
// Script and or Script Extension for chars used in the tests
|
||||
// \\u3013 ; Bopo Hang Hani Hira Kana # So GETA MARK
|
||||
// \\uA838 ; Deva Gujr Guru Kthi Takr # Sc NORTH INDIC RUPEE MARK
|
||||
// \\u0951 ; Deva Latn # Mn DEVANAGARI STRESS SIGN UDATTA
|
||||
//
|
||||
// \\u0370 ; Greek # L GREEK CAPITAL LETTER HETA
|
||||
// \\u0481 ; Cyrillic # L& CYRILLIC SMALL LETTER KOPPA
|
||||
// \\u0904 ; Devanagari # Lo DEVANAGARI LETTER SHORT A
|
||||
// \\u3041 ; Hiragana # Lo HIRAGANA LETTER SMALL A
|
||||
// 1234 ; Common # ascii digits
|
||||
// \\u0300 ; Inherited # Mn COMBINING GRAVE ACCENT
|
||||
|
||||
struct ScriptTest {
|
||||
const char *fTestString;
|
||||
int32_t fScriptCount;
|
||||
} scriptTests[] = {
|
||||
{"Hello", 1},
|
||||
{"Hello\\u0370", 2},
|
||||
{"1234", 0},
|
||||
{"Hello1234\\u0300", 1}, // Common and Inherited are ignored.
|
||||
{"\\u0030", 0},
|
||||
{"abc\\u0951", 1},
|
||||
{"abc\\u3013", 2},
|
||||
{"\\uA838\\u0951", 1}, // Triggers commonAmongAlternates path.
|
||||
{"\\u3013\\uA838", 2}
|
||||
};
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
IdentifierInfo identifierInfo(status);
|
||||
for (testNum=0; testNum<UPRV_LENGTHOF(scriptTests); testNum++) {
|
||||
ScriptTest &test = scriptTests[testNum];
|
||||
char msgBuf[100];
|
||||
sprintf(msgBuf, "testNum = %d ", testNum);
|
||||
UnicodeString testString = UnicodeString(test.fTestString).unescape();
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
identifierInfo.setIdentifier(testString, status);
|
||||
int32_t scriptCount = identifierInfo.getScriptCount();
|
||||
TEST_ASSERT_MSG(test.fScriptCount == scriptCount, msgBuf);
|
||||
}
|
||||
}
|
||||
|
||||
void IntlTestSpoof::testScriptSet() {
|
||||
ScriptSet s1;
|
||||
|
@ -600,6 +464,14 @@ void IntlTestSpoof::testScriptSet() {
|
|||
s2.intersect(s1);
|
||||
TEST_ASSERT(s2.countMembers() == 1);
|
||||
|
||||
s1.resetAll();
|
||||
TEST_ASSERT(s1.isEmpty());
|
||||
s1.set(USCRIPT_LATIN, status);
|
||||
TEST_ASSERT(!s1.isEmpty());
|
||||
s1.setAll();
|
||||
TEST_ASSERT(!s1.isEmpty());
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
s1.resetAll();
|
||||
s1.set(USCRIPT_AFAKA, status);
|
||||
s1.set(USCRIPT_VAI, status);
|
||||
|
@ -616,6 +488,39 @@ void IntlTestSpoof::testScriptSet() {
|
|||
}
|
||||
}
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
// Script extensions. Depends on data.
|
||||
s1.resetAll();
|
||||
s1.setScriptExtensions(0x67, status);
|
||||
TEST_ASSERT(s1.countMembers() == 1);
|
||||
TEST_ASSERT(s1.test(USCRIPT_LATIN, status));
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
s1.resetAll();
|
||||
s1.setScriptExtensions(0x303C, status);
|
||||
TEST_ASSERT(s1.countMembers() == 3);
|
||||
TEST_ASSERT(s1.test(USCRIPT_HAN, status));
|
||||
TEST_ASSERT(s1.test(USCRIPT_HIRAGANA, status));
|
||||
TEST_ASSERT(s1.test(USCRIPT_KATAKANA, status));
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
// Additional tests
|
||||
ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status);
|
||||
ScriptSet bitset2; bitset2.set(USCRIPT_HANGUL, status);
|
||||
TEST_ASSERT(bitset12.contains(bitset2));
|
||||
TEST_ASSERT(bitset12.contains(bitset12));
|
||||
TEST_ASSERT(!bitset2.contains(bitset12));
|
||||
|
||||
ScriptSet arabSet; arabSet.set(USCRIPT_ARABIC, status);
|
||||
ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status);
|
||||
UElement arabEl; arabEl.pointer = &arabSet;
|
||||
UElement latinEl; latinEl.pointer = &latinSet;
|
||||
TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0);
|
||||
TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0);
|
||||
|
||||
UnicodeString scriptString;
|
||||
bitset12.displayScripts(scriptString);
|
||||
TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString);
|
||||
}
|
||||
|
||||
|
||||
|
@ -629,35 +534,40 @@ void IntlTestSpoof::testRestrictionLevel() {
|
|||
{"\\u03B3", USPOOF_SINGLE_SCRIPT_RESTRICTIVE},
|
||||
{"\\u0061\\u30A2\\u30FC", USPOOF_HIGHLY_RESTRICTIVE},
|
||||
{"\\u0061\\u0904", USPOOF_MODERATELY_RESTRICTIVE},
|
||||
{"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE}
|
||||
{"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE},
|
||||
{"\\u0061\\u2665", USPOOF_UNRESTRICTIVE},
|
||||
{"\\u0061\\u303C", USPOOF_HIGHLY_RESTRICTIVE},
|
||||
{"\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE},
|
||||
{"\\u0061\\u30FC\\u303C\\u30A2", USPOOF_HIGHLY_RESTRICTIVE},
|
||||
{"\\u30A2\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE},
|
||||
{"\\u0061\\u0031\\u0661", USPOOF_MODERATELY_RESTRICTIVE},
|
||||
{"\\u0061\\u0031\\u0661\\u06F1", USPOOF_MODERATELY_RESTRICTIVE},
|
||||
{"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_MINIMALLY_RESTRICTIVE},
|
||||
{"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", USPOOF_MINIMALLY_RESTRICTIVE}
|
||||
};
|
||||
char msgBuffer[100];
|
||||
|
||||
URestrictionLevel restrictionLevels[] = { USPOOF_ASCII, USPOOF_SINGLE_SCRIPT_RESTRICTIVE,
|
||||
USPOOF_HIGHLY_RESTRICTIVE, USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE,
|
||||
USPOOF_UNRESTRICTIVE};
|
||||
|
||||
USPOOF_HIGHLY_RESTRICTIVE, USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE,
|
||||
USPOOF_UNRESTRICTIVE};
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
IdentifierInfo idInfo(status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status));
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
UnicodeSet allowedChars;
|
||||
// Allowed Identifier Characters. In addition to the Recommended Set,
|
||||
// allow u303c, which has an interesting script extension of Hani Hira Kana.
|
||||
allowedChars.addAll(*uspoof_getRecommendedUnicodeSet(&status)).add(0x303C);
|
||||
|
||||
for (int32_t testNum=0; testNum < UPRV_LENGTHOF(tests); testNum++) {
|
||||
status = U_ZERO_ERROR;
|
||||
const Test &test = tests[testNum];
|
||||
UnicodeString testString = UnicodeString(test.fId).unescape();
|
||||
URestrictionLevel expectedLevel = test.fExpectedRestrictionLevel;
|
||||
idInfo.setIdentifier(testString, status);
|
||||
sprintf(msgBuffer, "testNum = %d ", testNum);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_MSG(expectedLevel == idInfo.getRestrictionLevel(status), msgBuffer);
|
||||
for (int levelIndex=0; levelIndex<UPRV_LENGTHOF(restrictionLevels); levelIndex++) {
|
||||
status = U_ZERO_ERROR;
|
||||
URestrictionLevel levelSetInSpoofChecker = restrictionLevels[levelIndex];
|
||||
USpoofChecker *sc = uspoof_open(&status);
|
||||
uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status);
|
||||
uspoof_setAllowedChars(sc, uspoof_getRecommendedSet(&status), &status);
|
||||
uspoof_setAllowedChars(sc, allowedChars.toUSet(), &status);
|
||||
uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker);
|
||||
uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status);
|
||||
int32_t actualValue = uspoof_checkUnicodeString(sc, testString, NULL, &status);
|
||||
|
||||
// we want to fail if the text is (say) MODERATE and the testLevel is ASCII
|
||||
|
@ -665,9 +575,6 @@ void IntlTestSpoof::testRestrictionLevel() {
|
|||
if (expectedLevel > levelSetInSpoofChecker) {
|
||||
expectedValue |= USPOOF_RESTRICTION_LEVEL;
|
||||
}
|
||||
if (!uspoof_getRecommendedUnicodeSet(&status)->containsAll(testString)) {
|
||||
expectedValue |= USPOOF_CHAR_LIMIT;
|
||||
}
|
||||
sprintf(msgBuffer, "testNum = %d, levelIndex = %d, expected = %#x, actual = %#x",
|
||||
testNum, levelIndex, expectedValue, actualValue);
|
||||
TEST_ASSERT_MSG(expectedValue == actualValue, msgBuffer);
|
||||
|
@ -675,9 +582,9 @@ void IntlTestSpoof::testRestrictionLevel() {
|
|||
|
||||
// Run the same check again, with the Spoof Checker configured to return
|
||||
// the actual restriction level.
|
||||
uspoof_setChecks(sc, USPOOF_AUX_INFO | USPOOF_RESTRICTION_LEVEL, &status);
|
||||
uspoof_setAllowedChars(sc, uspoof_getRecommendedSet(&status), &status);
|
||||
uspoof_setAllowedChars(sc, allowedChars.toUSet(), &status);
|
||||
uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker);
|
||||
uspoof_setChecks(sc, USPOOF_AUX_INFO | USPOOF_RESTRICTION_LEVEL, &status);
|
||||
int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
if (U_SUCCESS(status)) {
|
||||
|
@ -687,8 +594,8 @@ void IntlTestSpoof::testRestrictionLevel() {
|
|||
uspoof_close(sc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void IntlTestSpoof::testMixedNumbers() {
|
||||
struct Test {
|
||||
|
@ -698,10 +605,18 @@ void IntlTestSpoof::testMixedNumbers() {
|
|||
{"1", "[0]"},
|
||||
{"\\u0967", "[\\u0966]"},
|
||||
{"1\\u0967", "[0\\u0966]"},
|
||||
{"\\u0661\\u06F1", "[\\u0660\\u06F0]"}
|
||||
{"\\u0661\\u06F1", "[\\u0660\\u06F0]"},
|
||||
{"\\u0061\\u2665", "[]"},
|
||||
{"\\u0061\\u303C", "[]"},
|
||||
{"\\u0061\\u30FC\\u303C", "[]"},
|
||||
{"\\u0061\\u30FC\\u303C\\u30A2", "[]"},
|
||||
{"\\u30A2\\u0061\\u30FC\\u303C", "[]"},
|
||||
{"\\u0061\\u0031\\u0661", "[\\u0030\\u0660]"},
|
||||
{"\\u0061\\u0031\\u0661\\u06F1", "[\\u0030\\u0660\\u06F0]"},
|
||||
{"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", "[\\u0030\\u0660\\u06F0\\u0966]"},
|
||||
{"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", "[\\u0030\\u0660\\u06F0\\u0966]"}
|
||||
};
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
IdentifierInfo idInfo(status);
|
||||
for (int32_t testNum=0; testNum < UPRV_LENGTHOF(tests); testNum++) {
|
||||
char msgBuf[100];
|
||||
sprintf(msgBuf, "testNum = %d ", testNum);
|
||||
|
@ -710,17 +625,16 @@ void IntlTestSpoof::testMixedNumbers() {
|
|||
status = U_ZERO_ERROR;
|
||||
UnicodeString testString = UnicodeString(test.fTestString).unescape();
|
||||
UnicodeSet expectedSet(UnicodeString(test.fExpectedSet).unescape(), status);
|
||||
idInfo.setIdentifier(testString, status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_MSG(expectedSet == *idInfo.getNumerics(), msgBuf);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
USpoofChecker *sc = uspoof_open(&status);
|
||||
uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this
|
||||
int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status);
|
||||
UBool mixedNumberFailure = ((result & USPOOF_MIXED_NUMBERS) != 0);
|
||||
TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf);
|
||||
uspoof_close(sc);
|
||||
TEST_SETUP
|
||||
uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this
|
||||
uspoof_check2UnicodeString(sc, testString, checkResult, &status);
|
||||
UBool mixedNumberFailure = ((uspoof_getCheckResultChecks(checkResult, &status) & USPOOF_MIXED_NUMBERS) != 0);
|
||||
TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf);
|
||||
const UnicodeSet* actualSet = UnicodeSet::fromUSet(uspoof_getCheckResultNumerics(checkResult, &status));
|
||||
TEST_ASSERT_MSG(expectedSet == *actualSet, msgBuf);
|
||||
TEST_TEARDOWN
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -38,8 +38,6 @@ public:
|
|||
|
||||
void testBug8654();
|
||||
|
||||
void testIdentifierInfo();
|
||||
|
||||
void testScriptSet();
|
||||
|
||||
void testRestrictionLevel();
|
||||
|
|
|
@ -16,17 +16,20 @@
|
|||
// derived from the Unicode Consortium data described in
|
||||
// Unicode UAX 39.
|
||||
//
|
||||
// Usage: gencfu [options] -r confusables-file.txt -w whole-script-confusables.txt -o output-file.cfu
|
||||
// Usage: gencfu [options] -r confusables-file.txt -o output-file.cfu
|
||||
//
|
||||
// options: -v verbose
|
||||
// -? or -h help
|
||||
//
|
||||
//   The input rule files are plain text files containing confusable character
|
||||
// definitions in the input format defined by Unicode UAX39 for the files
|
||||
// confusables.txt and confusablesWholeScript.txt. This source (.txt) format
|
||||
// confusables.txt. This source (.txt) format
|
||||
//    is also accepted directly by ICU spoof detectors.  The
|
||||
// files must be encoded in utf-8 format, with or without a BOM.
|
||||
//
|
||||
//   This tool formerly also compiled confusablesWholeScript.txt into the CFU file
|
||||
// until the Unicode consortium deprecated it.
|
||||
//
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
@ -53,7 +56,7 @@ static UOption options[]={
|
|||
UOPTION_HELP_QUESTION_MARK, /* 1 */
|
||||
UOPTION_VERBOSE, /* 2 */
|
||||
{ "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */
|
||||
{ "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0}, /* 4 */
|
||||
{ "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0}, /* 4 */ // deprecated
|
||||
{ "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 5 */
|
||||
UOPTION_ICUDATADIR, /* 6 */
|
||||
UOPTION_DESTDIR, /* 7 */
|
||||
|
@ -62,7 +65,7 @@ static UOption options[]={
|
|||
};
|
||||
|
||||
void usageAndDie(int retCode) {
|
||||
printf("Usage: %s [-v] [-options] -r confusablesRules.txt -w wholeScriptConfusables.txt -o output-file\n", progName);
|
||||
printf("Usage: %s [-v] [-options] -r confusablesRules.txt -o output-file\n", progName);
|
||||
printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
|
||||
"options:\n"
|
||||
"\t-h or -? or --help this usage text\n"
|
||||
|
@ -133,7 +136,6 @@ static const char *readFile(const char *fileName, int32_t *len);
|
|||
int main(int argc, char **argv) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
const char *confFileName;
|
||||
const char *confWSFileName;
|
||||
const char *outFileName;
|
||||
const char *outDir = NULL;
|
||||
const char *copyright = NULL;
|
||||
|
@ -156,12 +158,11 @@ int main(int argc, char **argv) {
|
|||
usageAndDie(0);
|
||||
}
|
||||
|
||||
if (!(options[3].doesOccur && options[4].doesOccur && options[5].doesOccur)) {
|
||||
fprintf(stderr, "confusables file, whole script confusables file and output file must all be specified.\n");
|
||||
if (!(options[3].doesOccur && options[5].doesOccur)) {
|
||||
fprintf(stderr, "confusables file and output file must all be specified.\n");
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
confFileName = options[3].value;
|
||||
confWSFileName = options[4].value;
|
||||
outFileName = options[5].value;
|
||||
|
||||
if (options[6].doesOccur) {
|
||||
|
@ -220,13 +221,6 @@ int main(int argc, char **argv) {
|
|||
exit(-1);
|
||||
}
|
||||
|
||||
int32_t wsConfusablesLen = 0;
|
||||
const char *wsConfsables = readFile(confWSFileName, &wsConfusablesLen);
|
||||
if (wsConfsables == NULL) {
|
||||
printf("gencfu: error reading file \"%s\"\n", confFileName);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
//
|
||||
// Create the Spoof Detector from the source confusables files.
|
||||
// This will compile the data.
|
||||
|
@ -236,13 +230,11 @@ int main(int argc, char **argv) {
|
|||
parseError.offset = 0;
|
||||
int32_t errType;
|
||||
USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen,
|
||||
wsConfsables, wsConfusablesLen,
|
||||
NULL, 0,
|
||||
&errType, &parseError, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
const char *errFile =
|
||||
(errType == USPOOF_WHOLE_SCRIPT_CONFUSABLE)? confWSFileName : confFileName;
|
||||
fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\" at file %s, line %d, column %d\n",
|
||||
u_errorName(status), errFile, (int)parseError.line, (int)parseError.offset);
|
||||
u_errorName(status), confFileName, (int)parseError.line, (int)parseError.offset);
|
||||
exit(status);
|
||||
};
|
||||
|
||||
|
@ -297,7 +289,6 @@ int main(int argc, char **argv) {
|
|||
uspoof_close(sc);
|
||||
delete [] outData;
|
||||
delete [] confusables;
|
||||
delete [] wsConfsables;
|
||||
u_cleanup();
|
||||
if (!quiet) {
|
||||
printf("gencfu: tool completed successfully.\n");
|
||||
|
|
Loading…
Add table
Reference in a new issue