ICU-22707 adjust UTS46 for Unicode 16

See #3130
2025-04-10 15:42:14 +00:00 · 2024-09-05 23:38:32 +00:00 · 2024-09-05 23:38:32 +00:00 · 415a7accc5
commit 415a7accc5
parent 6d67afcf01
4 changed files with 102 additions and 42 deletions
--- a/icu4c/source/common/uts46.cpp
+++ b/icu4c/source/common/uts46.cpp
@@ -756,7 +756,12 @@ UTS46::processLabel(UnicodeString &dest,
        if(U_FAILURE(errorCode)) {
            return labelLength;
        }
-        if(!isValid) {
+        // Unicode 15.1 UTS #46:
+        // Added an additional condition in 4.1 Validity Criteria to
+        // disallow labels such as xn--xn---epa., which do not round-trip.
+        // --> Validity Criteria new criterion 4:
+        // If not CheckHyphens, the label must not begin with “xn--”.
+        if(!isValid || fromPunycode.startsWith(UnicodeString::readOnlyAlias(u"xn--"))) {
            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
            return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
        }
--- a/icu4c/source/test/intltest/uts46test.cpp
+++ b/icu4c/source/test/intltest/uts46test.cpp
@@ -340,6 +340,18 @@ void UTS46Test::TestACELabelEdgeCases() {
        idna->labelToUnicode(u"Xn---", result, info, errorCode);
        assertTrue("empty Xn---", (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
    }
+
+    {
+        // Unicode 15.1 UTS #46:
+        // Added an additional condition in 4.1 Validity Criteria to
+        // disallow labels such as xn--xn---epa., which do not round-trip.
+        // --> Validity Criteria new criterion 4:
+        // If not CheckHyphens, the label must not begin with “xn--”.
+        IDNAInfo info;
+        idna->labelToUnicode("xn--xn---epa", result, info, errorCode);
+        assertTrue("error for xn--xn---epa",
+                (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
+    }
 }

 void UTS46Test::TestTooLong() {
@@ -1016,9 +1028,15 @@ idnaTestLineFn(void *context,
    reinterpret_cast<UTS46Test *>(context)->idnaTestOneLine(fields, *pErrorCode);
 }

-UnicodeString s16FromField(char *(&field)[2]) {
+UnicodeString s16FromField(char *(&field)[2], const UnicodeString &sameAs) {
    int32_t length = static_cast<int32_t>(field[1] - field[0]);
-    return UnicodeString::fromUTF8(StringPiece(field[0], length)).trim().unescape();
+    UnicodeString s = UnicodeString::fromUTF8(StringPiece(field[0], length)).trim().unescape();
+    if (s.isEmpty()) {
+        s = sameAs;  // blank means same as another string
+    } else if (s == u"\"\"") {
+        s.remove();  // explicit empty string (new in Unicode 16)
+    }
+    return s;
 }

 std::string statusFromField(char *(&field)[2]) {
@@ -1049,6 +1067,20 @@ void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
        if (strcmp(status, reinterpret_cast<const char*>(u8"[]")) != 0) {
            expectedHasErrors = true;
        }
+        // ICU workaround:
+        // We do effectively VerifyDnsLength (we always check for lengths), except,
+        // based on past bug reports, we do not do the following in UTS #46 ToASCII:
+        // When VerifyDnsLength is true, the empty root label is disallowed.
+        // Ignore the expected error if it is the only one.
+        // TODO: ICU-22882 - Report the empty root label separately from empty non-root labels.
+        if (strncmp(type, "toASCII", 7) == 0 &&  // startsWith
+                strcmp(status, "[A4_2]") == 0 && !info.hasErrors()) {
+            if (result.endsWith(UnicodeString::readOnlyAlias(u".")) &&
+                    // !contains
+                    result.indexOf(UnicodeString::readOnlyAlias(u"..")) < 0) {
+                expectedHasErrors = false;
+            }
+        }
    }
    if (expectedHasErrors != info.hasErrors()) {
        errln("%s  expected errors %s %d != %d = actual has errors: %04lx\n    %s",
@@ -1064,16 +1096,15 @@ void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
 void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
    // IdnaTestV2.txt (since Unicode 11)
    // Column 1: source
-    // The source string to be tested
-    UnicodeString source = s16FromField(fields[0]);
+    // The source string to be tested.
+    // "" means the empty string.
+    UnicodeString source = s16FromField(fields[0], UnicodeString());

    // Column 2: toUnicode
    // The result of applying toUnicode to the source, with Transitional_Processing=false.
    // A blank value means the same as the source value.
-    UnicodeString toUnicode = s16FromField(fields[1]);
-    if (toUnicode.isEmpty()) {
-        toUnicode = source;
-    }
+    // "" means the empty string.
+    UnicodeString toUnicode = s16FromField(fields[1], source);

    // Column 3: toUnicodeStatus
    // A set of status codes, each corresponding to a particular test.
@@ -1083,10 +1114,8 @@ void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
    // Column 4: toAsciiN
    // The result of applying toASCII to the source, with Transitional_Processing=false.
    // A blank value means the same as the toUnicode value.
-    UnicodeString toAsciiN = s16FromField(fields[3]);
-    if (toAsciiN.isEmpty()) {
-        toAsciiN = toUnicode;
-    }
+    // "" means the empty string.
+    UnicodeString toAsciiN = s16FromField(fields[3], toUnicode);

    // Column 5: toAsciiNStatus
    // A set of status codes, each corresponding to a particular test.
@@ -1099,10 +1128,8 @@ void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
    // Column 6: toAsciiT
    // The result of applying toASCII to the source, with Transitional_Processing=true.
    // A blank value means the same as the toAsciiN value.
-    UnicodeString toAsciiT = s16FromField(fields[5]);
-    if (toAsciiT.isEmpty()) {
-        toAsciiT = toAsciiN;
-    }
+    // "" means the empty string.
+    UnicodeString toAsciiT = s16FromField(fields[5], toAsciiN);

    // Column 7: toAsciiTStatus
    // A set of status codes, each corresponding to a particular test.
@@ -1133,12 +1160,7 @@ U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);

 }  // namespace

-// http://www.unicode.org/Public/idna/latest/IdnaTest.txt
 void UTS46Test::IdnaTest() {
-    if (logKnownIssue("ICU-22707",
-                      "The UTS #46 spec is changing for Unicode 16; need to adjust ICU impl")) {
-        return;
-    }
    IcuTestErrorCode errorCode(*this, "IdnaTest");
    const char *sourceTestDataPath = getSourceTestData(errorCode);
    if (errorCode.errIfFailureAndReset("unable to find the source/test/testdata "
@@ -1158,7 +1180,7 @@ void UTS46Test::IdnaTest() {
    // Comments are indicated with hash marks.
    char *fields[kNumFields][2];
    u_parseDelimitedFile(path.data(), ';', fields, kNumFields, idnaTestLineFn, this, errorCode);
-    if (errorCode.errIfFailureAndReset("error parsing IdnaTest.txt")) {
+    if (errorCode.errIfFailureAndReset("error parsing IdnaTestV2.txt")) {
        return;
    }
 }
--- a/icu4j/main/core/src/main/java/com/ibm/icu/impl/UTS46.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/impl/UTS46.java
@@ -358,7 +358,12 @@ public final class UTS46 extends IDNA {
            // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
            // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
            boolean isValid=uts46Norm2.isNormalized(fromPunycode);
-            if(!isValid) {
+            // Unicode 15.1 UTS #46:
+            // Added an additional condition in 4.1 Validity Criteria to
+            // disallow labels such as xn--xn---epa., which do not round-trip.
+            // --> Validity Criteria new criterion 4:
+            // If not CheckHyphens, the label must not begin with “xn--”.
+            if(!isValid || startsWithXNDashDash(fromPunycode)) {
                addLabelError(info, Error.INVALID_ACE_LABEL);
                return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
            }
@@ -488,6 +493,12 @@ public final class UTS46 extends IDNA {
        }
        return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
    }
+
+    private static boolean startsWithXNDashDash(CharSequence s) {
+        return s.length()>=4 &&
+                s.charAt(0)=='x' && s.charAt(1)=='n' && s.charAt(2)=='-' && s.charAt(3)=='-';
+    }
+
    private int
    markBadACELabel(StringBuilder dest,
                    int labelStart, int labelLength,
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/normalizer/UTS46Test.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/normalizer/UTS46Test.java
@@ -168,6 +168,15 @@ public class UTS46Test extends CoreTestFmwk {
        info=new IDNA.Info();
        idna.labelToUnicode("Xn---", result, info);
        assertTrue("empty Xn---", info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        // Unicode 15.1 UTS #46:
+        // Added an additional condition in 4.1 Validity Criteria to
+        // disallow labels such as xn--xn---epa., which do not round-trip.
+        // --> Validity Criteria new criterion 4:
+        // If not CheckHyphens, the label must not begin with “xn--”.
+        idna.labelToUnicode("xn--xn---epa", result, info);
+        assertTrue("error for xn--xn---epa",
+                info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL));
    }

    @Test
@@ -813,6 +822,16 @@ public class UTS46Test extends CoreTestFmwk {
        }
    }

+    private static String escapeTestString(String s, String sameAs) {
+        s = Utility.unescape(s.trim());
+        if (s.isEmpty()) {
+            s = sameAs;  // blank means same as another string
+        } else if (s.equals("\"\"")) {
+            s = "";  // explicit empty string (new in Unicode 16)
+        }
+        return s;
+    }
+
    private void checkIdnaTestResult(String line, String type,
            String expected, CharSequence result, String status, IDNA.Info info) {
        // An error in toUnicode or toASCII is indicated by a value in square brackets,
@@ -826,6 +845,18 @@ public class UTS46Test extends CoreTestFmwk {
            if (!status.equals("[]")) {
                expectedHasErrors = true;
            }
+            // ICU workaround:
+            // We do effectively VerifyDnsLength (we always check for lengths), except,
+            // based on past bug reports, we do not do the following in UTS #46 ToASCII:
+            // When VerifyDnsLength is true, the empty root label is disallowed.
+            // Ignore the expected error if it is the only one.
+            // TODO: ICU-22882 - Report the empty root label separately from empty non-root labels.
+            if (type.startsWith("toASCII") && status.equals("[A4_2]") && !info.hasErrors()) {
+                String a = result.toString();
+                if (a.endsWith(".") && !a.contains("..")) {
+                    expectedHasErrors = false;
+                }
+            }
        }
        if (expectedHasErrors != info.hasErrors()) {
            errln(String.format(
@@ -841,10 +872,6 @@ public class UTS46Test extends CoreTestFmwk {

    @Test
    public void IdnaTest() throws IOException {
-        if (logKnownIssue("ICU-22707",
-                "The UTS #46 spec is changing for Unicode 16; need to adjust ICU impl")) {
-            return;
-        }
        BufferedReader idnaTestFile = TestUtil.getDataReader("unicode/IdnaTestV2.txt", "UTF-8");
        Pattern semi = Pattern.compile(";");
        try {
@@ -862,16 +889,15 @@ public class UTS46Test extends CoreTestFmwk {

                // IdnaTestV2.txt (since Unicode 11)
                // Column 1: source
-                // The source string to be tested
-                String source = Utility.unescape(fields[0].trim());
+                // The source string to be tested.
+                // "" means the empty string.
+                String source = escapeTestString(fields[0], "");

                // Column 2: toUnicode
                // The result of applying toUnicode to the source, with Transitional_Processing=false.
                // A blank value means the same as the source value.
-                String toUnicode = Utility.unescape(fields[1].trim());
-                if (toUnicode.isEmpty()) {
-                    toUnicode = source;
-                }
+                // "" means the empty string.
+                String toUnicode = escapeTestString(fields[1], source);

                // Column 3: toUnicodeStatus
                // A set of status codes, each corresponding to a particular test.
@@ -881,10 +907,8 @@ public class UTS46Test extends CoreTestFmwk {
                // Column 4: toAsciiN
                // The result of applying toASCII to the source, with Transitional_Processing=false.
                // A blank value means the same as the toUnicode value.
-                String toAsciiN = Utility.unescape(fields[3].trim());
-                if (toAsciiN.isEmpty()) {
-                    toAsciiN = toUnicode;
-                }
+                // "" means the empty string.
+                String toAsciiN = escapeTestString(fields[3], toUnicode);

                // Column 5: toAsciiNStatus
                // A set of status codes, each corresponding to a particular test.
@@ -897,10 +921,8 @@ public class UTS46Test extends CoreTestFmwk {
                // Column 6: toAsciiT
                // The result of applying toASCII to the source, with Transitional_Processing=true.
                // A blank value means the same as the toAsciiN value.
-                String toAsciiT = Utility.unescape(fields[5].trim());
-                if (toAsciiT.isEmpty()) {
-                    toAsciiT = toAsciiN;
-                }
+                // "" means the empty string.
+                String toAsciiT = escapeTestString(fields[5], toAsciiN);

                // Column 7: toAsciiTStatus
                // A set of status codes, each corresponding to a particular test.