ICU-13630 switch from IdnaTest.txt to IdnaTestV2.txt (new in Unicode 11); see Unicode PRI 375

X-SVN-Rev: 41294
This commit is contained in:
Markus Scherer 2018-04-30 03:17:11 +00:00
parent c9680037cc
commit a4e66ded6d
8 changed files with 12773 additions and 15782 deletions

View file

@ -44,7 +44,7 @@ public:
void checkIdnaTestResult(const char *line, const char *type,
const UnicodeString &expected, const UnicodeString &result,
const IDNAInfo &info);
const char *status, const IDNAInfo &info);
void idnaTestOneLine(char *fields[][2], UErrorCode &errorCode);
private:
@ -896,7 +896,7 @@ void UTS46Test::TestSomeCases() {
namespace {
const int32_t kNumFields = 4; // Will need 5 when we read NV8 from the optional fifth column.
const int32_t kNumFields = 7;
void U_CALLCONV
idnaTestLineFn(void *context,
@ -905,17 +905,43 @@ idnaTestLineFn(void *context,
reinterpret_cast<UTS46Test *>(context)->idnaTestOneLine(fields, *pErrorCode);
}
// Converts one parsed column into a UnicodeString.
// field[0] is the start of the column's bytes and field[1] is its limit.
// The bytes are decoded as UTF-8, the decoded text is trimmed, and the
// trimmed text is then passed through unescape().
UnicodeString s16FromField(char *(&field)[2]) {
    const char *begin = field[0];
    StringPiece utf8(begin, (int32_t)(field[1] - begin));
    UnicodeString text = UnicodeString::fromUTF8(utf8);
    text.trim();  // trims in place; must happen before unescape()
    return text.unescape();
}
// Extracts a status column (a bracketed code list such as "[B5 B6]") as a
// std::string. field[0] is the start of the column and field[1] is its limit.
// Leading whitespace is skipped via u_skipWhitespace(); trailing spaces and
// tabs are dropped. Returns an empty string when nothing remains after the
// leading whitespace.
std::string statusFromField(char *(&field)[2]) {
    const char *first = u_skipWhitespace(field[0]);
    const char *last = field[1];
    if (first == last) {
        return std::string();
    }
    // Walk the limit backwards over trailing spaces/tabs only.
    while (last > first && (last[-1] == ' ' || last[-1] == '\t')) {
        --last;
    }
    return std::string(first, last - first);
}
} // namespace
void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
const UnicodeString &expected, const UnicodeString &result,
const IDNAInfo &info) {
const char *status, const IDNAInfo &info) {
// An error in toUnicode or toASCII is indicated by a value in square brackets,
// such as "[B5 B6]".
UBool expectedHasErrors = !expected.isEmpty() && expected[0] == u'[';
UBool expectedHasErrors = FALSE;
if (*status != 0) {
if (*status != u'[') {
errln("%s status field does not start with '[': %s\n %s", type, status, line);
}
if (strcmp(status, u8"[]") != 0) {
expectedHasErrors = TRUE;
}
}
if (expectedHasErrors != info.hasErrors()) {
errln("%s expected errors %d != %d = actual has errors: %04lx\n %s",
type, expectedHasErrors, info.hasErrors(), (long)info.getErrors(), line);
errln("%s expected errors %s %d != %d = actual has errors: %04lx\n %s",
type, status, expectedHasErrors, info.hasErrors(), (long)info.getErrors(), line);
}
if (!expectedHasErrors && expected != result) {
errln("%s expected != actual\n %s", type, line);
@ -925,57 +951,68 @@ void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
}
void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
// Column 1: type - T for transitional, N for nontransitional, B for both
const char *typePtr = u_skipWhitespace(fields[0][0]);
const char *limit;
char typeChar;
if (typePtr == fields[0][1] ||
((typeChar = *typePtr) != 'B' && typeChar != 'N' && typeChar != 'T') ||
(limit = u_skipWhitespace(typePtr + 1)) != fields[0][1]) {
errln("empty or unknown type field: %s", fields[0][0]);
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
// IdnaTestV2.txt (since Unicode 11)
// Column 1: source
// The source string to be tested
UnicodeString source = s16FromField(fields[0]);
// Column 2: source - the source string to be tested
int32_t length = (int32_t)(fields[1][1] - fields[1][0]);
UnicodeString source16 = UnicodeString::fromUTF8(StringPiece(fields[1][0], length)).
trim().unescape();
// Column 3: toUnicode - the result of applying toUnicode to the source.
// Column 2: toUnicode
// The result of applying toUnicode to the source, with Transitional_Processing=false.
// A blank value means the same as the source value.
length = (int32_t)(fields[2][1] - fields[2][0]);
UnicodeString unicode16 = UnicodeString::fromUTF8(StringPiece(fields[2][0], length)).
trim().unescape();
if (unicode16.isEmpty()) {
unicode16 = source16;
UnicodeString toUnicode = s16FromField(fields[1]);
if (toUnicode.isEmpty()) {
toUnicode = source;
}
// Column 4: toASCII - the result of applying toASCII to the source, using the specified type.
// Column 3: toUnicodeStatus
// A set of status codes, each corresponding to a particular test.
// A blank value means [].
std::string toUnicodeStatus = statusFromField(fields[2]);
// Column 4: toAsciiN
// The result of applying toASCII to the source, with Transitional_Processing=false.
// A blank value means the same as the toUnicode value.
length = (int32_t)(fields[3][1] - fields[3][0]);
UnicodeString ascii16 = UnicodeString::fromUTF8(StringPiece(fields[3][0], length)).
trim().unescape();
if (ascii16.isEmpty()) {
ascii16 = unicode16;
UnicodeString toAsciiN = s16FromField(fields[3]);
if (toAsciiN.isEmpty()) {
toAsciiN = toUnicode;
}
// Column 5: NV8 - present if the toUnicode value would not be a valid domain name under IDNA2008. Not a normative field.
// Ignored as long as we do not implement and test vanilla IDNA2008.
// Column 5: toAsciiNStatus
// A set of status codes, each corresponding to a particular test.
// A blank value means the same as the toUnicodeStatus value.
std::string toAsciiNStatus = statusFromField(fields[4]);
if (toAsciiNStatus.empty()) {
toAsciiNStatus = toUnicodeStatus;
}
// Column 6: toAsciiT
// The result of applying toASCII to the source, with Transitional_Processing=true.
// A blank value means the same as the toAsciiN value.
UnicodeString toAsciiT = s16FromField(fields[5]);
if (toAsciiT.isEmpty()) {
toAsciiT = toAsciiN;
}
// Column 7: toAsciiTStatus
// A set of status codes, each corresponding to a particular test.
// A blank value means the same as the toAsciiNStatus value.
std::string toAsciiTStatus = statusFromField(fields[6]);
if (toAsciiTStatus.empty()) {
toAsciiTStatus = toAsciiNStatus;
}
// ToASCII/ToUnicode, transitional/nontransitional
UnicodeString uN, aN, aT;
IDNAInfo uNInfo, aNInfo, aTInfo;
nontrans->nameToUnicode(source16, uN, uNInfo, errorCode);
checkIdnaTestResult(fields[0][0], "toUnicodeNontrans", unicode16, uN, uNInfo);
if (typeChar == 'T' || typeChar == 'B') {
trans->nameToASCII(source16, aT, aTInfo, errorCode);
checkIdnaTestResult(fields[0][0], "toASCIITrans", ascii16, aT, aTInfo);
}
if (typeChar == 'N' || typeChar == 'B') {
nontrans->nameToASCII(source16, aN, aNInfo, errorCode);
checkIdnaTestResult(fields[0][0], "toASCIINontrans", ascii16, aN, aNInfo);
}
nontrans->nameToUnicode(source, uN, uNInfo, errorCode);
checkIdnaTestResult(fields[0][0], "toUnicodeNontrans", toUnicode, uN,
toUnicodeStatus.c_str(), uNInfo);
nontrans->nameToASCII(source, aN, aNInfo, errorCode);
checkIdnaTestResult(fields[0][0], "toASCIINontrans", toAsciiN, aN,
toAsciiNStatus.c_str(), aNInfo);
trans->nameToASCII(source, aT, aTInfo, errorCode);
checkIdnaTestResult(fields[0][0], "toASCIITrans", toAsciiT, aT,
toAsciiTStatus.c_str(), aTInfo);
}
// TODO: de-duplicate
@ -990,7 +1027,7 @@ void UTS46Test::IdnaTest() {
return;
}
CharString path(sourceTestDataPath, errorCode);
path.appendPathPart("IdnaTest.txt", errorCode);
path.appendPathPart("IdnaTestV2.txt", errorCode);
LocalStdioFilePointer idnaTestFile(fopen(path.data(), "r"));
if (idnaTestFile.isNull()) {
errln("unable to open %s", path.data());

File diff suppressed because it is too large Load diff

6310
icu4c/source/test/testdata/IdnaTestV2.txt vendored Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -728,14 +728,23 @@ public class UTS46Test extends TestFmwk {
}
private void checkIdnaTestResult(String line, String type,
String expected, CharSequence result, IDNA.Info info) {
String expected, CharSequence result, String status, IDNA.Info info) {
// An error in toUnicode or toASCII is indicated by a value in square brackets,
// such as "[B5 B6]".
boolean expectedHasErrors = !expected.isEmpty() && expected.charAt(0) == '[';
boolean expectedHasErrors = false;
if (!status.isEmpty()) {
if (status.charAt(0) != '[') {
errln(String.format("%s status field does not start with '[': %s\n %s",
type, status, line));
}
if (!status.equals("[]")) {
expectedHasErrors = true;
}
}
if (expectedHasErrors != info.hasErrors()) {
errln(String.format(
"%s expected errors %b != %b = actual has errors: %s\n %s",
type, expectedHasErrors, info.hasErrors(), info.getErrors(), line));
"%s expected errors %s %b != %b = actual has errors: %s\n %s",
type, status, expectedHasErrors, info.hasErrors(), info.getErrors(), line));
}
if (!expectedHasErrors && !UTF16Plus.equal(expected, result)) {
errln(String.format("%s expected != actual\n %s", type, line));
@ -746,7 +755,7 @@ public class UTS46Test extends TestFmwk {
@Test
public void IdnaTest() throws IOException {
BufferedReader idnaTestFile = TestUtil.getDataReader("unicode/IdnaTest.txt", "UTF-8");
BufferedReader idnaTestFile = TestUtil.getDataReader("unicode/IdnaTestV2.txt", "UTF-8");
Pattern semi = Pattern.compile(";");
try {
String line;
@ -761,48 +770,65 @@ public class UTS46Test extends TestFmwk {
continue; // Skip empty and comment-only lines.
}
// Column 1: type - T for transitional, N for nontransitional, B for both
String type = fields[0].trim();
char typeChar;
if (type.length() != 1 ||
((typeChar = type.charAt(0)) != 'B' && typeChar != 'N' && typeChar != 'T')) {
errln("empty or unknown type field: " + line);
return;
}
// IdnaTestV2.txt (since Unicode 11)
// Column 1: source
// The source string to be tested
String source = Utility.unescape(fields[0].trim());
// Column 2: source - the source string to be tested
String source16 = Utility.unescape(fields[1].trim());
// Column 3: toUnicode - the result of applying toUnicode to the source.
// Column 2: toUnicode
// The result of applying toUnicode to the source, with Transitional_Processing=false.
// A blank value means the same as the source value.
String unicode16 = Utility.unescape(fields[2].trim());
if (unicode16.isEmpty()) {
unicode16 = source16;
String toUnicode = Utility.unescape(fields[1].trim());
if (toUnicode.isEmpty()) {
toUnicode = source;
}
// Column 4: toASCII - the result of applying toASCII to the source, using the specified type.
// Column 3: toUnicodeStatus
// A set of status codes, each corresponding to a particular test.
// A blank value means [].
String toUnicodeStatus = fields[2].trim();
// Column 4: toAsciiN
// The result of applying toASCII to the source, with Transitional_Processing=false.
// A blank value means the same as the toUnicode value.
String ascii16 = Utility.unescape(fields[3].trim());
if (ascii16.isEmpty()) {
ascii16 = unicode16;
String toAsciiN = Utility.unescape(fields[3].trim());
if (toAsciiN.isEmpty()) {
toAsciiN = toUnicode;
}
// Column 5: NV8 - present if the toUnicode value would not be a valid domain name under IDNA2008. Not a normative field.
// Ignored as long as we do not implement and test vanilla IDNA2008.
// Column 5: toAsciiNStatus
// A set of status codes, each corresponding to a particular test.
// A blank value means the same as the toUnicodeStatus value.
String toAsciiNStatus = fields[4].trim();
if (toAsciiNStatus.isEmpty()) {
toAsciiNStatus = toUnicodeStatus;
}
// Column 6: toAsciiT
// The result of applying toASCII to the source, with Transitional_Processing=true.
// A blank value means the same as the toAsciiN value.
String toAsciiT = Utility.unescape(fields[5].trim());
if (toAsciiT.isEmpty()) {
toAsciiT = toAsciiN;
}
// Column 7: toAsciiTStatus
// A set of status codes, each corresponding to a particular test.
// A blank value means the same as the toAsciiNStatus value.
String toAsciiTStatus = fields[6].trim();
if (toAsciiTStatus.isEmpty()) {
toAsciiTStatus = toAsciiNStatus;
}
// ToASCII/ToUnicode, transitional/nontransitional
StringBuilder uN, aN, aT;
IDNA.Info uNInfo, aNInfo, aTInfo;
nontrans.nameToUnicode(source16, uN = new StringBuilder(), uNInfo = new IDNA.Info());
checkIdnaTestResult(line, "toUnicodeNontrans", unicode16, uN, uNInfo);
if (typeChar == 'T' || typeChar == 'B') {
trans.nameToASCII(source16, aT = new StringBuilder(), aTInfo = new IDNA.Info());
checkIdnaTestResult(line, "toASCIITrans", ascii16, aT, aTInfo);
}
if (typeChar == 'N' || typeChar == 'B') {
nontrans.nameToASCII(source16, aN = new StringBuilder(), aNInfo = new IDNA.Info());
checkIdnaTestResult(line, "toASCIINontrans", ascii16, aN, aNInfo);
}
nontrans.nameToUnicode(source, uN = new StringBuilder(), uNInfo = new IDNA.Info());
checkIdnaTestResult(line, "toUnicodeNontrans", toUnicode, uN, toUnicodeStatus, uNInfo);
nontrans.nameToASCII(source, aN = new StringBuilder(), aNInfo = new IDNA.Info());
checkIdnaTestResult(line, "toASCIINontrans", toAsciiN, aN, toAsciiNStatus, aNInfo);
trans.nameToASCII(source, aT = new StringBuilder(), aTInfo = new IDNA.Info());
checkIdnaTestResult(line, "toASCIITrans", toAsciiT, aT, toAsciiTStatus, aTInfo);
}
} finally {
idnaTestFile.close();

View file

@ -26,15 +26,19 @@ public final class TestUtil {
* Return an input stream on the data file at path 'name' rooted at the data path
*/
public static final InputStream getDataStream(String name) throws IOException {
String path = DATA_PATH + name;
InputStream is = null;
try {
is = TestUtil.class.getResourceAsStream(DATA_PATH + name);
is = TestUtil.class.getResourceAsStream(path);
} catch (Throwable t) {
IOException ex =
new IOException("data resource '" + name + "' not found");
new IOException("data resource '" + path + "' not found");
ex.initCause(t);
throw ex;
}
if (is == null) {
throw new IOException("data resource '" + path + "' not found");
}
return is;
}

View file

@ -1603,7 +1603,7 @@ _files = {
"emoji-data.txt": (DontCopy, ParseNamedProperties),
"GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
"GraphemeBreakTest.txt": (CopyOnly, "testdata"),
"IdnaTest.txt": (CopyOnly, "testdata"),
"IdnaTestV2.txt": (CopyOnly, "testdata"),
"IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
"IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
"LineBreak.txt": (DontCopy, ParseLineBreak),