ICU-11446 Spoof Checker data update: use MA table only.

X-SVN-Rev: 37072
This commit is contained in:
Andy Heninger 2015-02-26 02:04:11 +00:00
parent 74157ec338
commit 56459a99d9
5 changed files with 8253 additions and 6757 deletions

View file

@ -1,6 +1,6 @@
/*
***************************************************************************
* Copyright (C) 2008-2014 International Business Machines Corporation
* Copyright (C) 2008-2015 International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
*
@ -1809,6 +1809,15 @@ public class SpoofChecker {
* Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some
* large set of existing identifiers, by creating an efficiently searchable collection of the skeletons.
*
* Skeletons are computed using the algorithm and data described in Unicode UAX 39.
* The latest proposed update, UAX 39 Version 8 draft 1, says "the tables SL, SA, and ML
* were still problematic, and discouraged from use in [Unicode] 7.0.
* They were thus removed from version 8.0"
*
* In light of this, the default mapping data included with ICU 55 uses the
* Unicode 7 MA (Multi Script, Any Case) table data for the other type options
* (Single Script, Any Case), (Single Script, Lower Case) and (Multi Script, Lower Case).
*
* @param type
* The type of skeleton, corresponding to which of the Unicode confusable data tables to use. The default
* is Mixed-Script, Lowercase. Allowed options are SINGLE_SCRIPT_CONFUSABLE and ANY_CASE_CONFUSABLE. The

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:847e0ccaa347c084b4f8a52871942bd2493d12e2675e831ee206e86a176da7ac
size 11876902
oid sha256:0d15d27af09b6d207302e051d429e949e7b137054e0fc2c7db5be89b3a43424e
size 11868952

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c986657e2b3c0c646d4f51cc0332cbede2c984deab3864a42749f7ee76a7a95f
oid sha256:bba5e69e2602c2a121977ede7224e4126ce04905831fb048bef10cb59ec822f2
size 90574

View file

@ -407,39 +407,42 @@ public class SpoofCheckerTest extends TestFmwk {
checkSkeleton(sc, MA, "\\u02b9identifier'", "'identifier'", testName);
checkSkeleton(sc, SL, "nochange", "\\u213C\\u2134\\U0001D41C\\u210E\\u237A\\u213C\\u210A\\u212E", testName);
checkSkeleton(sc, SA, "nochange", "\\u213C\\u2134\\U0001D41C\\u210E\\u237A\\u213C\\u210A\\u212E", testName);
checkSkeleton(sc, ML, "nochange", "\\u213C\\u2134\\U0001D41C\\u210E\\u237A\\u213C\\u210A\\u212E", testName);
checkSkeleton(sc, SL, "nochange", "nochange", testName);
checkSkeleton(sc, SA, "nochange", "nochange", testName);
checkSkeleton(sc, ML, "nochange", "nochange", testName);
checkSkeleton(sc, MA, "nochange", "nochange", testName);
checkSkeleton(sc, MA, "love", "love", testName);
checkSkeleton(sc, MA, "1ove", "love", testName); // Digit 1 to letter l
checkSkeleton(sc, ML, "OOPS", "OOPS", testName);
checkSkeleton(sc, ML, "00PS", "00PS", testName); // Digit 0 unchanged in lower case mode.
checkSkeleton(sc, ML, "00PS", "OOPS", testName);
checkSkeleton(sc, MA, "OOPS", "OOPS", testName);
checkSkeleton(sc, MA, "00PS", "OOPS", testName); // Digit 0 to letter O in any case mode only
checkSkeleton(sc, MA, "00PS", "OOPS", testName); // Digit 0 to letter O
checkSkeleton(sc, SL, "\\u059c", "\\u0301", testName);
checkSkeleton(sc, SL, "\\u2A74", "\\u003A\\u003A\\u003D", testName);
checkSkeleton(sc, SL, "\\u247E", "\\u0028\\u0031\\u0031\\u0029", testName); // "(11)"
checkSkeleton(sc, SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0031\\u0644\\u2134", testName);
checkSkeleton(sc, SL, "\\u247E", "(ll)", testName);
checkSkeleton(sc, SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u006c\\u0644\\u006f", testName);
// This mapping exists in the ML and MA tables, does not exist in SL, SA
// 0C83 mapping existed in the ML and MA tables, did not exist in SL, SA (Original Unicode 7)
// mapping exists in all tables (ICU 55).
// 0C83 ; 0983 ; ML # KANNADA SIGN VISARGA to
checkSkeleton(sc, SL, "\\u0C83", "\\u0C83", testName);
checkSkeleton(sc, SA, "\\u0C83", "\\u0C83", testName);
checkSkeleton(sc, SL, "\\u0C83", "\\u0983", testName);
checkSkeleton(sc, SA, "\\u0C83", "\\u0983", testName);
checkSkeleton(sc, ML, "\\u0C83", "\\u0983", testName);
checkSkeleton(sc, MA, "\\u0C83", "\\u0983", testName);
// 0391 mappings exist only in MA and SA tables.
// 0391 mappings existed only in MA and SA tables (Original Unicode 7).
// mappings exist in all tables (ICU 55)
checkSkeleton(sc, MA, "\\u0391", "A", testName);
checkSkeleton(sc, SA, "\\u0391", "\\U0001D400", testName);
checkSkeleton(sc, ML, "\\u0391", "\\u0391", testName);
checkSkeleton(sc, SL, "\\u0391", "\\u0391", testName);
checkSkeleton(sc, SA, "\\u0391", "A", testName);
checkSkeleton(sc, ML, "\\u0391", "A", testName);
checkSkeleton(sc, SL, "\\u0391", "A", testName);
// 13CF Mappings in all four tables, different in MA.
checkSkeleton(sc, ML, "\\u13CF", "\\U0001D41B", testName);
// 13CF Mappings in all four tables, different in MA (Original Unicode 7).
// Mapping same in all tables (ICU 55)
checkSkeleton(sc, ML, "\\u13CF", "b", testName);
checkSkeleton(sc, MA, "\\u13CF", "b", testName);
checkSkeleton(sc, SL, "\\u13CF", "\\U0001D41B", testName);
checkSkeleton(sc, SA, "\\u13CF", "\\U0001D41B", testName);
checkSkeleton(sc, SL, "\\u13CF", "b", testName);
checkSkeleton(sc, SA, "\\u13CF", "b", testName);
// 0022 ; 0027 0027 ;
// all tables