diff --git a/icu4c/source/common/uidna.cpp b/icu4c/source/common/uidna.cpp index ad42b454129..f1740a236e3 100644 --- a/icu4c/source/common/uidna.cpp +++ b/icu4c/source/common/uidna.cpp @@ -24,6 +24,7 @@ #include "punycode.h" #include "ustr_imp.h" #include "cmemory.h" +#include "uassert.h" #include "sprpimpl.h" /* it is official IDNA ACE Prefix is "xn--" */ @@ -395,10 +396,11 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, int32_t options, UStringPrepProfile* nameprep, UParseError* parseError, - UErrorCode* status){ + UErrorCode* status) +{ //get the options - UBool useSTD3ASCIIRules = (UBool)((options & UIDNA_USE_STD3_RULES) != 0); + //UBool useSTD3ASCIIRules = (UBool)((options & UIDNA_USE_STD3_RULES) != 0); int32_t namePrepOptions = ((options & UIDNA_ALLOW_UNASSIGNED) != 0) ? USPREP_ALLOW_UNASSIGNED: 0; UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE], b3Stack[MAX_LABEL_BUFFER_SIZE]; @@ -415,8 +417,8 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, UBool* caseFlags = NULL; UBool srcIsASCII = TRUE; - UBool srcIsLDH = TRUE; - int32_t failPos =0; + /*UBool srcIsLDH = TRUE; + int32_t failPos =0;*/ // step 1: find out if all the codepoints in src are ASCII if(srcLength==-1){ @@ -424,26 +426,26 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, for(;src[srcLength]!=0;){ if(src[srcLength]> 0x7f){ srcIsASCII = FALSE; - }else if(isLDHChar(src[srcLength])==FALSE){ + }/*else if(isLDHChar(src[srcLength])==FALSE){ // here we do not assemble surrogates // since we know that LDH code points // are in the ASCII range only srcIsLDH = FALSE; failPos = srcLength; - } + }*/ srcLength++; } }else if(srcLength > 0){ for(int32_t j=0; j 0x7f){ srcIsASCII = FALSE; - }else if(isLDHChar(src[j])==FALSE){ + }/*else if(isLDHChar(src[j])==FALSE){ // here we do not assemble surrogates // since we know that LDH code points // are in the ASCII range only srcIsLDH = FALSE; failPos = j; - } + }*/ } }else{ return 0; @@ -476,6 +478,12 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, b1Len = srcLength; } + // The RFC states that + // + // ToUnicode never fails. If any step fails, then the original input + // is returned immediately in that step. + // + //step 3: verify ACE Prefix if(startsWithPrefix(b1,b1Len)){ @@ -485,7 +493,7 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, //step 5: Decode using punycode b2Len = u_strFromPunycode(b1Prime, b1PrimeLen, b2, b2Capacity, caseFlags,status); - + if(*status == U_BUFFER_OVERFLOW_ERROR){ // redo processing of string /* we do not have enough room so grow the buffer*/ @@ -496,14 +504,13 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, } *status = U_ZERO_ERROR; // reset error - + b2Len = u_strFromPunycode(b1Prime, b1PrimeLen, b2, b2Len, caseFlags, status); - } - - + + //step 6:Apply toASCII - b3Len = uidna_toASCII(b2, b2Len, b3, b3Capacity,options,parseError, status); + b3Len = uidna_toASCII(b2, b2Len, b3, b3Capacity, options, parseError, status); if(*status == U_BUFFER_OVERFLOW_ERROR){ // redo processing of string @@ -515,9 +522,9 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, } *status = U_ZERO_ERROR; // reset error - + b3Len = uidna_toASCII(b2,b2Len,b3,b3Len,options,parseError, status); - + } //bail out on error if(U_FAILURE(*status)){ @@ -526,7 +533,8 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, //step 7: verify if(compareCaseInsensitiveASCII(b1, b1Len, b3, b3Len) !=0){ - *status = U_IDNA_VERIFICATION_ERROR; + // Cause the original to be returned. + *status = U_IDNA_VERIFICATION_ERROR; goto CLEANUP; } @@ -535,14 +543,16 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, if(b2Len <= destCapacity) { uprv_memmove(dest, b2, b2Len * U_SIZEOF_UCHAR); } - }else{ + } + else{ + // See the start of this if statement for why this is commented out. // verify that STD3 ASCII rules are satisfied - if(useSTD3ASCIIRules == TRUE){ - if( srcIsLDH == FALSE /* source contains some non-LDH characters */ + /*if(useSTD3ASCIIRules == TRUE){ + if( srcIsLDH == FALSE // source contains some non-LDH characters || src[0] == HYPHEN || src[srcLength-1] == HYPHEN){ *status = U_IDNA_STD3_ASCII_RULES_ERROR; - /* populate the parseError struct */ + // populate the parseError struct if(srcIsLDH==FALSE){ // failPos is always set the index of failure uprv_syntaxError(src,failPos, srcLength,parseError); @@ -556,7 +566,8 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, goto CLEANUP; } - } + }*/ + // just return the source //copy the source to destination if(srcLength <= destCapacity){ uprv_memmove(dest,src,srcLength * U_SIZEOF_UCHAR); @@ -564,6 +575,7 @@ _internal_toUnicode(const UChar* src, int32_t srcLength, reqLength = srcLength; } + CLEANUP: if(b1 != b1Stack && b1!=src){ @@ -573,8 +585,7 @@ CLEANUP: uprv_free(b2); } uprv_free(caseFlags); - - + // The RFC states that // // ToUnicode never fails. If any step fails, then the original input @@ -584,11 +595,9 @@ CLEANUP: if(U_FAILURE(*status)){ //copy the source to destination if(dest && srcLength <= destCapacity){ - if(srcLength == -1) { - uprv_memmove(dest,src,u_strlen(src)* U_SIZEOF_UCHAR); - } else { + // srcLength should have already been set earlier. + U_ASSERT(srcLength >= 0); uprv_memmove(dest,src,srcLength * U_SIZEOF_UCHAR); - } } reqLength = srcLength; *status = U_ZERO_ERROR; @@ -720,6 +729,7 @@ uidna_IDNToASCII( const UChar *src, int32_t srcLength, // should never occur remainingDestCapacity = 0; } + if(done == TRUE){ break; } @@ -729,7 +739,7 @@ uidna_IDNToASCII( const UChar *src, int32_t srcLength, *currentDest++ = FULL_STOP; remainingDestCapacity--; } - reqLength++; + reqLength++; labelStart = delimiter; if(remainingLen >0 ){ @@ -737,7 +747,7 @@ uidna_IDNToASCII( const UChar *src, int32_t srcLength, } } - + if(reqLength > MAX_DOMAIN_NAME_LENGTH){ *status = U_IDNA_DOMAIN_NAME_TOO_LONG_ERROR; } @@ -779,14 +789,20 @@ uidna_IDNToUnicode( const UChar* src, int32_t srcLength, int32_t labelLen = 0, labelReqLength = 0; UBool done = FALSE; - for(;;){ labelLen = getNextSeparator(labelStart,remainingLen, &delimiter,&done); - if(labelLen==0 && done==FALSE){ + // The RFC states that + // + // ToUnicode never fails. If any step fails, then the original input + // is returned immediately in that step. + // + // _internal_toUnicode will copy the label. + /*if(labelLen==0 && done==FALSE){ *status = U_IDNA_ZERO_LENGTH_LABEL_ERROR; - } + break; + }*/ labelReqLength = _internal_toUnicode(labelStart, labelLen, currentDest, remainingDestCapacity, @@ -794,7 +810,6 @@ uidna_IDNToUnicode( const UChar* src, int32_t srcLength, parseError, status); if(*status == U_BUFFER_OVERFLOW_ERROR){ - *status = U_ZERO_ERROR; // reset error remainingDestCapacity = 0; } @@ -818,11 +833,12 @@ uidna_IDNToUnicode( const UChar* src, int32_t srcLength, } // add the label separator + // Unlike the ToASCII operation we don't normalize the label separators if(remainingDestCapacity > 0){ - *currentDest++ = FULL_STOP; + *currentDest++ = *(labelStart + labelLen); remainingDestCapacity--; } - reqLength++; + reqLength++; labelStart = delimiter; if(remainingLen >0 ){ @@ -830,7 +846,7 @@ uidna_IDNToUnicode( const UChar* src, int32_t srcLength, } } - + if(reqLength > MAX_DOMAIN_NAME_LENGTH){ *status = U_IDNA_DOMAIN_NAME_TOO_LONG_ERROR; }