ICU-21030 validate ACE label edge cases

This commit is contained in:
Markus Scherer 2020-08-14 13:33:34 -07:00
parent e19d12997b
commit 9279e49d2f
4 changed files with 84 additions and 11 deletions

View file

@ -714,6 +714,16 @@ UTS46::processLabel(UnicodeString &dest,
UBool wasPunycode;
if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
// Label starts with "xn--", try to un-Punycode it.
// In IDNA2008, labels like "xn--" (decodes to an empty string) and
// "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
// comparing the ToUnicode input with the back-to-ToASCII output.
// They are alternate encodings of the respective ASCII labels.
// Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
// the round-trip verification.
if(labelLength==4 || (labelLength>5 && label[labelLength-1]==u'-')) {
info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
}
wasPunycode=TRUE;
UChar *unicodeBuffer=fromPunycode.getBuffer(-1); // capacity==-1: most labels should fit
if(unicodeBuffer==NULL) {
@ -925,10 +935,10 @@ UTS46::markBadACELabel(UnicodeString &dest,
UBool isASCII=TRUE;
UBool onlyLDH=TRUE;
const UChar *label=dest.getBuffer()+labelStart;
// Ok to cast away const because we own the UnicodeString.
UChar *s=(UChar *)label+4; // After the initial "xn--".
const UChar *limit=label+labelLength;
do {
// Start after the initial "xn--".
// Ok to cast away const because we own the UnicodeString.
for(UChar *s=const_cast<UChar *>(label+4); s<limit; ++s) {
UChar c=*s;
if(c<=0x7f) {
if(c==0x2e) {
@ -945,7 +955,7 @@ UTS46::markBadACELabel(UnicodeString &dest,
} else {
isASCII=onlyLDH=FALSE;
}
} while(++s<limit);
}
if(onlyLDH) {
dest.insert(labelStart+labelLength, (UChar)0xfffd);
if(dest.isBogus()) {

View file

@ -40,6 +40,7 @@ public:
void TestAPI();
void TestNotSTD3();
void TestInvalidPunycodeDigits();
void TestACELabelEdgeCases();
void TestSomeCases();
void IdnaTest();
@ -84,6 +85,7 @@ void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
TESTCASE_AUTO(TestAPI);
TESTCASE_AUTO(TestNotSTD3);
TESTCASE_AUTO(TestInvalidPunycodeDigits);
TESTCASE_AUTO(TestACELabelEdgeCases);
TESTCASE_AUTO(TestSomeCases);
TESTCASE_AUTO(IdnaTest);
TESTCASE_AUTO_END;
@ -312,6 +314,35 @@ void UTS46Test::TestInvalidPunycodeDigits() {
}
}
void UTS46Test::TestACELabelEdgeCases() {
// In IDNA2008, these labels fail the round-trip validation from comparing
// the ToUnicode input with the back-to-ToASCII output.
IcuTestErrorCode errorCode(*this, "TestACELabelEdgeCases()");
LocalPointer<IDNA> idna(IDNA::createUTS46Instance(0, errorCode));
if(errorCode.isFailure()) {
return;
}
UnicodeString result;
{
IDNAInfo info;
idna->labelToUnicode(u"xn--", result, info, errorCode);
assertTrue("empty xn--", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
}
{
IDNAInfo info;
idna->labelToUnicode(u"xN--ASCII-", result, info, errorCode);
assertTrue("nothing but ASCII", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
}
{
// Different error: The Punycode decoding procedure does not consume the last delimiter
// if it is right after the xn-- so the main decoding loop fails because the hyphen
// is not a valid Punycode digit.
IDNAInfo info;
idna->labelToUnicode(u"Xn---", result, info, errorCode);
assertTrue("empty Xn---", (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
}
}
struct TestCase {
// Input string and options string (Nontransitional/Transitional/Both).
const char *s, *o;
@ -558,13 +589,13 @@ static const TestCase testCases[]={
UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|
UIDNA_ERROR_HYPHEN_3_4 },
{ "a..c", "B", "a..c", UIDNA_ERROR_EMPTY_LABEL },
{ "a.xn--.c", "B", "a..c", UIDNA_ERROR_EMPTY_LABEL },
{ "a.xn--.c", "B", "a.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL },
{ "a.-b.", "B", "a.-b.", UIDNA_ERROR_LEADING_HYPHEN },
{ "a.b-.c", "B", "a.b-.c", UIDNA_ERROR_TRAILING_HYPHEN },
{ "a.-.c", "B", "a.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN },
{ "a.bc--de.f", "B", "a.bc--de.f", UIDNA_ERROR_HYPHEN_3_4 },
{ "\\u00E4.\\u00AD.c", "B", "\\u00E4..c", UIDNA_ERROR_EMPTY_LABEL },
{ "\\u00E4.xn--.c", "B", "\\u00E4..c", UIDNA_ERROR_EMPTY_LABEL },
{ "\\u00E4.xn--.c", "B", "\\u00E4.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL },
{ "\\u00E4.-b.", "B", "\\u00E4.-b.", UIDNA_ERROR_LEADING_HYPHEN },
{ "\\u00E4.b-.c", "B", "\\u00E4.b-.c", UIDNA_ERROR_TRAILING_HYPHEN },
{ "\\u00E4.-.c", "B", "\\u00E4.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN },

View file

@ -341,6 +341,16 @@ public final class UTS46 extends IDNA {
dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-'
) {
// Label starts with "xn--", try to un-Punycode it.
// In IDNA2008, labels like "xn--" (decodes to an empty string) and
// "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
// comparing the ToUnicode input with the back-to-ToASCII output.
// They are alternate encodings of the respective ASCII labels.
// Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
// the round-trip verification.
if(labelLength==4 || (labelLength>5 && dest.charAt(labelStart+labelLength-1)=='-')) {
addLabelError(info, Error.INVALID_ACE_LABEL);
return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
}
wasPunycode=true;
try {
fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null);
@ -496,9 +506,9 @@ public final class UTS46 extends IDNA {
boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
boolean isASCII=true;
boolean onlyLDH=true;
int i=labelStart+4; // After the initial "xn--".
int limit=labelStart+labelLength;
do {
// Start after the initial "xn--".
for(int i=labelStart+4; i<limit; ++i) {
char c=dest.charAt(i);
if(c<=0x7f) {
if(c=='.') {
@ -515,7 +525,7 @@ public final class UTS46 extends IDNA {
} else {
isASCII=onlyLDH=false;
}
} while(++i<limit);
}
if(onlyLDH) {
dest.insert(labelStart+labelLength, '\ufffd');
++labelLength;

View file

@ -152,6 +152,28 @@ public class UTS46Test extends TestFmwk {
info.getErrors().contains(IDNA.Error.PUNYCODE));
}
@Test
public void TestACELabelEdgeCases() {
// In IDNA2008, these labels fail the round-trip validation from comparing
// the ToUnicode input with the back-to-ToASCII output.
IDNA idna=IDNA.getUTS46Instance(0);
StringBuilder result=new StringBuilder();
IDNA.Info info=new IDNA.Info();
idna.labelToUnicode("xn--", result, info);
assertTrue("empty xn--", info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL));
info=new IDNA.Info();
idna.labelToUnicode("xN--ASCII-", result, info);
assertTrue("nothing but ASCII", info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL));
// Different error: The Punycode decoding procedure does not consume the last delimiter
// if it is right after the xn-- so the main decoding loop fails because the hyphen
// is not a valid Punycode digit.
info=new IDNA.Info();
idna.labelToUnicode("Xn---", result, info);
assertTrue("empty Xn---", info.getErrors().contains(IDNA.Error.PUNYCODE));
}
private static final Map<String, IDNA.Error> errorNamesToErrors;
static {
errorNamesToErrors=new TreeMap<>();
@ -432,13 +454,13 @@ public class UTS46Test extends TestFmwk {
"UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
"UIDNA_ERROR_HYPHEN_3_4" },
{ "a..c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" },
{ "a.xn--.c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" },
{ "a.xn--.c", "B", "a.xn--\uFFFD.c", "UIDNA_ERROR_INVALID_ACE_LABEL" },
{ "a.-b.", "B", "a.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
{ "a.b-.c", "B", "a.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
{ "a.-.c", "B", "a.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },
{ "a.bc--de.f", "B", "a.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" },
{ "\u00E4.\u00AD.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" },
{ "\u00E4.xn--.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" },
{ "\u00E4.xn--.c", "B", "\u00E4.xn--\uFFFD.c", "UIDNA_ERROR_INVALID_ACE_LABEL" },
{ "\u00E4.-b.", "B", "\u00E4.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
{ "\u00E4.b-.c", "B", "\u00E4.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
{ "\u00E4.-.c", "B", "\u00E4.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },