From 9279e49d2fdf5e0d554c7de03ef6d5f171dff147 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 14 Aug 2020 13:33:34 -0700 Subject: [PATCH] ICU-21030 validate ACE label edge cases --- icu4c/source/common/uts46.cpp | 18 +++++++--- icu4c/source/test/intltest/uts46test.cpp | 35 +++++++++++++++++-- .../core/src/com/ibm/icu/impl/UTS46.java | 16 +++++++-- .../icu/dev/test/normalizer/UTS46Test.java | 26 ++++++++++++-- 4 files changed, 84 insertions(+), 11 deletions(-) diff --git a/icu4c/source/common/uts46.cpp b/icu4c/source/common/uts46.cpp index b9e6cb023bb..f25b4e12f12 100644 --- a/icu4c/source/common/uts46.cpp +++ b/icu4c/source/common/uts46.cpp @@ -714,6 +714,16 @@ UTS46::processLabel(UnicodeString &dest, UBool wasPunycode; if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) { // Label starts with "xn--", try to un-Punycode it. + // In IDNA2008, labels like "xn--" (decodes to an empty string) and + // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from + // comparing the ToUnicode input with the back-to-ToASCII output. + // They are alternate encodings of the respective ASCII labels. + // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before + // the round-trip verification. + if(labelLength==4 || (labelLength>5 && label[labelLength-1]==u'-')) { + info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL; + return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode); + } wasPunycode=TRUE; UChar *unicodeBuffer=fromPunycode.getBuffer(-1); // capacity==-1: most labels should fit if(unicodeBuffer==NULL) { @@ -925,10 +935,10 @@ UTS46::markBadACELabel(UnicodeString &dest, UBool isASCII=TRUE; UBool onlyLDH=TRUE; const UChar *label=dest.getBuffer()+labelStart; - // Ok to cast away const because we own the UnicodeString. - UChar *s=(UChar *)label+4; // After the initial "xn--". const UChar *limit=label+labelLength; - do { + // Start after the initial "xn--". + // Ok to cast away const because we own the UnicodeString. + for(UChar *s=const_cast(label+4); s idna(IDNA::createUTS46Instance(0, errorCode)); + if(errorCode.isFailure()) { + return; + } + UnicodeString result; + { + IDNAInfo info; + idna->labelToUnicode(u"xn--", result, info, errorCode); + assertTrue("empty xn--", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0); + } + { + IDNAInfo info; + idna->labelToUnicode(u"xN--ASCII-", result, info, errorCode); + assertTrue("nothing but ASCII", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0); + } + { + // Different error: The Punycode decoding procedure does not consume the last delimiter + // if it is right after the xn-- so the main decoding loop fails because the hyphen + // is not a valid Punycode digit. + IDNAInfo info; + idna->labelToUnicode(u"Xn---", result, info, errorCode); + assertTrue("empty Xn---", (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0); + } +} + struct TestCase { // Input string and options string (Nontransitional/Transitional/Both). const char *s, *o; @@ -558,13 +589,13 @@ static const TestCase testCases[]={ UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN| UIDNA_ERROR_HYPHEN_3_4 }, { "a..c", "B", "a..c", UIDNA_ERROR_EMPTY_LABEL }, - { "a.xn--.c", "B", "a..c", UIDNA_ERROR_EMPTY_LABEL }, + { "a.xn--.c", "B", "a.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL }, { "a.-b.", "B", "a.-b.", UIDNA_ERROR_LEADING_HYPHEN }, { "a.b-.c", "B", "a.b-.c", UIDNA_ERROR_TRAILING_HYPHEN }, { "a.-.c", "B", "a.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN }, { "a.bc--de.f", "B", "a.bc--de.f", UIDNA_ERROR_HYPHEN_3_4 }, { "\\u00E4.\\u00AD.c", "B", "\\u00E4..c", UIDNA_ERROR_EMPTY_LABEL }, - { "\\u00E4.xn--.c", "B", "\\u00E4..c", UIDNA_ERROR_EMPTY_LABEL }, + { "\\u00E4.xn--.c", "B", "\\u00E4.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL }, { "\\u00E4.-b.", "B", "\\u00E4.-b.", UIDNA_ERROR_LEADING_HYPHEN }, { "\\u00E4.b-.c", "B", "\\u00E4.b-.c", UIDNA_ERROR_TRAILING_HYPHEN }, { "\\u00E4.-.c", "B", "\\u00E4.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN }, diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java index c830bc3c75e..1e52f8f91e5 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java @@ -341,6 +341,16 @@ public final class UTS46 extends IDNA { dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-' ) { // Label starts with "xn--", try to un-Punycode it. + // In IDNA2008, labels like "xn--" (decodes to an empty string) and + // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from + // comparing the ToUnicode input with the back-to-ToASCII output. + // They are alternate encodings of the respective ASCII labels. + // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before + // the round-trip verification. + if(labelLength==4 || (labelLength>5 && dest.charAt(labelStart+labelLength-1)=='-')) { + addLabelError(info, Error.INVALID_ACE_LABEL); + return markBadACELabel(dest, labelStart, labelLength, toASCII, info); + } wasPunycode=true; try { fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null); @@ -496,9 +506,9 @@ public final class UTS46 extends IDNA { boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; boolean isASCII=true; boolean onlyLDH=true; - int i=labelStart+4; // After the initial "xn--". int limit=labelStart+labelLength; - do { + // Start after the initial "xn--". + for(int i=labelStart+4; i errorNamesToErrors; static { errorNamesToErrors=new TreeMap<>(); @@ -432,13 +454,13 @@ public class UTS46Test extends TestFmwk { "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+ "UIDNA_ERROR_HYPHEN_3_4" }, { "a..c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" }, - { "a.xn--.c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" }, + { "a.xn--.c", "B", "a.xn--\uFFFD.c", "UIDNA_ERROR_INVALID_ACE_LABEL" }, { "a.-b.", "B", "a.-b.", "UIDNA_ERROR_LEADING_HYPHEN" }, { "a.b-.c", "B", "a.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" }, { "a.-.c", "B", "a.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" }, { "a.bc--de.f", "B", "a.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" }, { "\u00E4.\u00AD.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" }, - { "\u00E4.xn--.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" }, + { "\u00E4.xn--.c", "B", "\u00E4.xn--\uFFFD.c", "UIDNA_ERROR_INVALID_ACE_LABEL" }, { "\u00E4.-b.", "B", "\u00E4.-b.", "UIDNA_ERROR_LEADING_HYPHEN" }, { "\u00E4.b-.c", "B", "\u00E4.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" }, { "\u00E4.-.c", "B", "\u00E4.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },