ICU-22707 adjust UTS46 for Unicode 16

See #3130
This commit is contained in:
Markus Scherer 2024-09-05 23:38:32 +00:00
parent 6d67afcf01
commit 415a7accc5
4 changed files with 102 additions and 42 deletions

View file

@ -756,7 +756,12 @@ UTS46::processLabel(UnicodeString &dest,
if(U_FAILURE(errorCode)) {
return labelLength;
}
if(!isValid) {
// Unicode 15.1 UTS #46:
// Added an additional condition in 4.1 Validity Criteria to
// disallow labels such as xn--xn---epa., which do not round-trip.
// --> Validity Criteria new criterion 4:
// If not CheckHyphens, the label must not begin with “xn--”.
if(!isValid || fromPunycode.startsWith(UnicodeString::readOnlyAlias(u"xn--"))) {
info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
}

View file

@ -340,6 +340,18 @@ void UTS46Test::TestACELabelEdgeCases() {
idna->labelToUnicode(u"Xn---", result, info, errorCode);
assertTrue("empty Xn---", (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
}
{
// Unicode 15.1 UTS #46:
// Added an additional condition in 4.1 Validity Criteria to
// disallow labels such as xn--xn---epa., which do not round-trip.
// --> Validity Criteria new criterion 4:
// If not CheckHyphens, the label must not begin with “xn--”.
IDNAInfo info;
idna->labelToUnicode("xn--xn---epa", result, info, errorCode);
assertTrue("error for xn--xn---epa",
(info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
}
}
void UTS46Test::TestTooLong() {
@ -1016,9 +1028,15 @@ idnaTestLineFn(void *context,
reinterpret_cast<UTS46Test *>(context)->idnaTestOneLine(fields, *pErrorCode);
}
// Converts one field of a parsed test-data line into a UnicodeString.
// field[0] points at the first byte of the field and field[1] just past its end;
// the bytes are read as UTF-8, then trimmed and unescaped.
// In the two-parameter form, a blank field yields a copy of sameAs, and
// a field consisting of exactly two double quotes ("") yields the empty string.
// NOTE(review): this listing appears to interleave the pre-change lines
// (one-parameter signature, direct return) with the post-change lines
// (sameAs parameter, blank/"" handling) -- confirm against the actual file.
UnicodeString s16FromField(char *(&field)[2]) {
UnicodeString s16FromField(char *(&field)[2], const UnicodeString &sameAs) {
// Byte length of the field, taken from the start/limit pointer pair.
int32_t length = static_cast<int32_t>(field[1] - field[0]);
return UnicodeString::fromUTF8(StringPiece(field[0], length)).trim().unescape();
UnicodeString s = UnicodeString::fromUTF8(StringPiece(field[0], length)).trim().unescape();
if (s.isEmpty()) {
s = sameAs; // blank means same as another string
} else if (s == u"\"\"") {
s.remove(); // explicit empty string (new in Unicode 16)
}
return s;
}
std::string statusFromField(char *(&field)[2]) {
@ -1049,6 +1067,20 @@ void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
if (strcmp(status, reinterpret_cast<const char*>(u8"[]")) != 0) {
expectedHasErrors = true;
}
// ICU workaround:
// We do effectively VerifyDnsLength (we always check for lengths), except,
// based on past bug reports, we do not do the following in UTS #46 ToASCII:
// When VerifyDnsLength is true, the empty root label is disallowed.
// Ignore the expected error if it is the only one.
// TODO: ICU-22882 - Report the empty root label separately from empty non-root labels.
if (strncmp(type, "toASCII", 7) == 0 && // startsWith
strcmp(status, "[A4_2]") == 0 && !info.hasErrors()) {
if (result.endsWith(UnicodeString::readOnlyAlias(u".")) &&
// !contains
result.indexOf(UnicodeString::readOnlyAlias(u"..")) < 0) {
expectedHasErrors = false;
}
}
}
if (expectedHasErrors != info.hasErrors()) {
errln("%s expected errors %s %d != %d = actual has errors: %04lx\n %s",
@ -1064,16 +1096,15 @@ void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
// IdnaTestV2.txt (since Unicode 11)
// Column 1: source
// The source string to be tested
UnicodeString source = s16FromField(fields[0]);
// The source string to be tested.
// "" means the empty string.
UnicodeString source = s16FromField(fields[0], UnicodeString());
// Column 2: toUnicode
// The result of applying toUnicode to the source, with Transitional_Processing=false.
// A blank value means the same as the source value.
UnicodeString toUnicode = s16FromField(fields[1]);
if (toUnicode.isEmpty()) {
toUnicode = source;
}
// "" means the empty string.
UnicodeString toUnicode = s16FromField(fields[1], source);
// Column 3: toUnicodeStatus
// A set of status codes, each corresponding to a particular test.
@ -1083,10 +1114,8 @@ void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
// Column 4: toAsciiN
// The result of applying toASCII to the source, with Transitional_Processing=false.
// A blank value means the same as the toUnicode value.
UnicodeString toAsciiN = s16FromField(fields[3]);
if (toAsciiN.isEmpty()) {
toAsciiN = toUnicode;
}
// "" means the empty string.
UnicodeString toAsciiN = s16FromField(fields[3], toUnicode);
// Column 5: toAsciiNStatus
// A set of status codes, each corresponding to a particular test.
@ -1099,10 +1128,8 @@ void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
// Column 6: toAsciiT
// The result of applying toASCII to the source, with Transitional_Processing=true.
// A blank value means the same as the toAsciiN value.
UnicodeString toAsciiT = s16FromField(fields[5]);
if (toAsciiT.isEmpty()) {
toAsciiT = toAsciiN;
}
// "" means the empty string.
UnicodeString toAsciiT = s16FromField(fields[5], toAsciiN);
// Column 7: toAsciiTStatus
// A set of status codes, each corresponding to a particular test.
@ -1133,12 +1160,7 @@ U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
} // namespace
// http://www.unicode.org/Public/idna/latest/IdnaTestV2.txt
void UTS46Test::IdnaTest() {
if (logKnownIssue("ICU-22707",
"The UTS #46 spec is changing for Unicode 16; need to adjust ICU impl")) {
return;
}
IcuTestErrorCode errorCode(*this, "IdnaTest");
const char *sourceTestDataPath = getSourceTestData(errorCode);
if (errorCode.errIfFailureAndReset("unable to find the source/test/testdata "
@ -1158,7 +1180,7 @@ void UTS46Test::IdnaTest() {
// Comments are indicated with hash marks.
char *fields[kNumFields][2];
u_parseDelimitedFile(path.data(), ';', fields, kNumFields, idnaTestLineFn, this, errorCode);
if (errorCode.errIfFailureAndReset("error parsing IdnaTest.txt")) {
if (errorCode.errIfFailureAndReset("error parsing IdnaTestV2.txt")) {
return;
}
}

View file

@ -358,7 +358,12 @@ public final class UTS46 extends IDNA {
// In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
// then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
boolean isValid=uts46Norm2.isNormalized(fromPunycode);
if(!isValid) {
// Unicode 15.1 UTS #46:
// Added an additional condition in 4.1 Validity Criteria to
// disallow labels such as xn--xn---epa., which do not round-trip.
// --> Validity Criteria new criterion 4:
// If not CheckHyphens, the label must not begin with xn--.
if(!isValid || startsWithXNDashDash(fromPunycode)) {
addLabelError(info, Error.INVALID_ACE_LABEL);
return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
}
@ -488,6 +493,12 @@ public final class UTS46 extends IDNA {
}
return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
}
/**
 * Tells whether the given text begins with the lowercase ACE prefix "xn--".
 * The comparison is case-sensitive and inspects only the first four chars.
 *
 * @param s the text to inspect
 * @return true if s has at least four chars and the first four are 'x', 'n', '-', '-'
 */
private static boolean startsWithXNDashDash(CharSequence s) {
    final String prefix = "xn--";
    if (s.length() < prefix.length()) {
        return false;  // too short to carry the prefix
    }
    for (int i = 0; i < prefix.length(); ++i) {
        if (s.charAt(i) != prefix.charAt(i)) {
            return false;
        }
    }
    return true;
}
private int
markBadACELabel(StringBuilder dest,
int labelStart, int labelLength,

View file

@ -168,6 +168,15 @@ public class UTS46Test extends CoreTestFmwk {
info=new IDNA.Info();
idna.labelToUnicode("Xn---", result, info);
assertTrue("empty Xn---", info.getErrors().contains(IDNA.Error.PUNYCODE));
// Unicode 15.1 UTS #46:
// Added an additional condition in 4.1 Validity Criteria to
// disallow labels such as xn--xn---epa., which do not round-trip.
// --> Validity Criteria new criterion 4:
// If not CheckHyphens, the label must not begin with xn--.
idna.labelToUnicode("xn--xn---epa", result, info);
assertTrue("error for xn--xn---epa",
info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL));
}
@Test
@ -813,6 +822,16 @@ public class UTS46Test extends CoreTestFmwk {
}
}
/**
 * Decodes one string column of a test data line.
 * The raw field is trimmed and unescaped first. A blank result means
 * "same as another string", so sameAs is returned; a result consisting of
 * exactly two double quotes is the explicit empty string (new in Unicode 16).
 *
 * @param s the raw field text
 * @param sameAs the value to use when the field is blank
 * @return the decoded field value
 */
private static String escapeTestString(String s, String sameAs) {
    String unescaped = Utility.unescape(s.trim());
    if (unescaped.isEmpty()) {
        return sameAs;  // blank means same as another string
    }
    // "" spells out an explicit empty string; anything else is taken as is.
    return "\"\"".equals(unescaped) ? "" : unescaped;
}
private void checkIdnaTestResult(String line, String type,
String expected, CharSequence result, String status, IDNA.Info info) {
// An error in toUnicode or toASCII is indicated by a value in square brackets,
@ -826,6 +845,18 @@ public class UTS46Test extends CoreTestFmwk {
if (!status.equals("[]")) {
expectedHasErrors = true;
}
// ICU workaround:
// We do effectively VerifyDnsLength (we always check for lengths), except,
// based on past bug reports, we do not do the following in UTS #46 ToASCII:
// When VerifyDnsLength is true, the empty root label is disallowed.
// Ignore the expected error if it is the only one.
// TODO: ICU-22882 - Report the empty root label separately from empty non-root labels.
if (type.startsWith("toASCII") && status.equals("[A4_2]") && !info.hasErrors()) {
String a = result.toString();
if (a.endsWith(".") && !a.contains("..")) {
expectedHasErrors = false;
}
}
}
if (expectedHasErrors != info.hasErrors()) {
errln(String.format(
@ -841,10 +872,6 @@ public class UTS46Test extends CoreTestFmwk {
@Test
public void IdnaTest() throws IOException {
if (logKnownIssue("ICU-22707",
"The UTS #46 spec is changing for Unicode 16; need to adjust ICU impl")) {
return;
}
BufferedReader idnaTestFile = TestUtil.getDataReader("unicode/IdnaTestV2.txt", "UTF-8");
Pattern semi = Pattern.compile(";");
try {
@ -862,16 +889,15 @@ public class UTS46Test extends CoreTestFmwk {
// IdnaTestV2.txt (since Unicode 11)
// Column 1: source
// The source string to be tested
String source = Utility.unescape(fields[0].trim());
// The source string to be tested.
// "" means the empty string.
String source = escapeTestString(fields[0], "");
// Column 2: toUnicode
// The result of applying toUnicode to the source, with Transitional_Processing=false.
// A blank value means the same as the source value.
String toUnicode = Utility.unescape(fields[1].trim());
if (toUnicode.isEmpty()) {
toUnicode = source;
}
// "" means the empty string.
String toUnicode = escapeTestString(fields[1], source);
// Column 3: toUnicodeStatus
// A set of status codes, each corresponding to a particular test.
@ -881,10 +907,8 @@ public class UTS46Test extends CoreTestFmwk {
// Column 4: toAsciiN
// The result of applying toASCII to the source, with Transitional_Processing=false.
// A blank value means the same as the toUnicode value.
String toAsciiN = Utility.unescape(fields[3].trim());
if (toAsciiN.isEmpty()) {
toAsciiN = toUnicode;
}
// "" means the empty string.
String toAsciiN = escapeTestString(fields[3], toUnicode);
// Column 5: toAsciiNStatus
// A set of status codes, each corresponding to a particular test.
@ -897,10 +921,8 @@ public class UTS46Test extends CoreTestFmwk {
// Column 6: toAsciiT
// The result of applying toASCII to the source, with Transitional_Processing=true.
// A blank value means the same as the toAsciiN value.
String toAsciiT = Utility.unescape(fields[5].trim());
if (toAsciiT.isEmpty()) {
toAsciiT = toAsciiN;
}
// "" means the empty string.
String toAsciiT = escapeTestString(fields[5], toAsciiN);
// Column 7: toAsciiTStatus
// A set of status codes, each corresponding to a particular test.