diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index 11da50c2340..667eef14d0d 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -66,7 +66,8 @@ brkiter.o brkdict.o ubrk.o dbbi.o dbbi_tbl.o \ rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \ utrie.o uset.o cmemory.o caniter.o \ unifilt.o unifunct.o uniset.o usetiter.o util.o uenum.o \ -icuserv.o iculserv.o icunotif.o ustrenum.o +icuserv.o iculserv.o icunotif.o ustrenum.o \ +uidna.o strprep.o nameprep.o punycode.o STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O)) diff --git a/icu4c/source/common/common.dsp b/icu4c/source/common/common.dsp index 5e10453bdfa..15fa0ea1dfe 100644 --- a/icu4c/source/common/common.dsp +++ b/icu4c/source/common/common.dsp @@ -3244,6 +3244,65 @@ InputPath=.\unicode\utf_old.h !ENDIF +# End Source File +# End Group +# Begin Group "idna" + +# PROP Default_Filter "*.c,*.h" +# Begin Source File + +SOURCE=.\nameprep.cpp +# End Source File +# Begin Source File + +SOURCE=.\nameprep.h +# End Source File +# Begin Source File + +SOURCE=.\punycode.c +# End Source File +# Begin Source File + +SOURCE=.\punycode.h +# End Source File +# Begin Source File + +SOURCE=.\sprpimpl.h +# End Source File +# Begin Source File + +SOURCE=.\strprep.cpp +# End Source File +# Begin Source File + +SOURCE=.\strprep.h +# End Source File +# Begin Source File + +SOURCE=.\uidna.cpp +# End Source File +# Begin Source File + +SOURCE=.\unicode\uidna.h + +!IF "$(CFG)" == "common - Win32 Release" + +!ELSEIF "$(CFG)" == "common - Win32 Debug" + +# Begin Custom Build +InputPath=.\unicode\uidna.h + +"..\..\include\unicode\uidna.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy $(InputPath) ..\..\include\unicode + +# End Custom Build + +!ELSEIF "$(CFG)" == "common - Win64 Release" + +!ELSEIF "$(CFG)" == "common - Win64 Debug" + +!ENDIF + # End Source File # End Group # End Target diff --git a/icu4c/source/common/nameprep.cpp 
b/icu4c/source/common/nameprep.cpp new file mode 100644 index 00000000000..9e175be953b --- /dev/null +++ b/icu4c/source/common/nameprep.cpp @@ -0,0 +1,41 @@ +/* + ******************************************************************************* + * + * Copyright (C) 2002, International Business Machines + * Corporation and others. All Rights Reserved. + * + ******************************************************************************* + * file name: strprep.cpp + * encoding: US-ASCII + * tab size: 8 (not used) + * indentation:4 + * + * created on: 2003feb1 + * created by: Ram Viswanadha + */ + +#include "nameprep.h" +// ***************************************************************************** +// class NamePrep +// ***************************************************************************** +static const UChar ASCII_SPACE = 0x0020; + + +U_NAMESPACE_BEGIN + +const char NamePrep::fgClassID=0; + +// default constructor +NamePrep::NamePrep(UErrorCode& status){ + bidiCheck = TRUE; + doNFKC = TRUE; +} + +UBool NamePrep::isNotProhibited(UChar32 ch){ + return (UBool)(ch == ASCII_SPACE); +} + +U_NAMESPACE_END + + + diff --git a/icu4c/source/common/nameprep.h b/icu4c/source/common/nameprep.h new file mode 100644 index 00000000000..c3304b0a8b3 --- /dev/null +++ b/icu4c/source/common/nameprep.h @@ -0,0 +1,97 @@ +/* + ******************************************************************************* + * + * Copyright (C) 2002, International Business Machines + * Corporation and others. All Rights Reserved. 
+ * + ******************************************************************************* + * file name: nameprep.h + * encoding: US-ASCII + * tab size: 8 (not used) + * indentation:4 + * + * created on: 2003feb1 + * created by: Ram Viswanadha + */ + +#ifndef NAMEPREP_H +#define NAMEPREP_H + +#include "unicode/utypes.h" +#include "strprep.h" +#include "unicode/uniset.h" + + +U_NAMESPACE_BEGIN +/* + A profile of stringprep MUST include all of the following: + + - The intended applicability of the profile + + - The character repertoire that is the input and output to stringprep + (which is Unicode 3.2 for this version of stringprep) + + - The mapping tables from this document used (as described in section + 3) + + - Any additional mapping tables specific to the profile + + - The Unicode normalization used, if any (as described in section 4) + + - The tables from this document of characters that are prohibited as + output (as described in section 5) + + - The bidirectional string testing used, if any (as described in + section 6) + + - Any additional characters that are prohibited as output specific to + the profile +*/ + + +class NamePrep: public StringPrep { +public : + NamePrep(UErrorCode& status); + + virtual inline ~NamePrep(){}; + + virtual inline UBool isNotProhibited(UChar32 ch); + + /** + * ICU "poor man's RTTI", returns a UClassID for the actual class. + * + * @draft ICU 2.6 + */ + virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); } + + /** + * ICU "poor man's RTTI", returns a UClassID for this class. + * + * @draft ICU 2.6 + */ + static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; } + +private: + /** + * The address of this static class variable serves as this class's ID + * for ICU "poor man's RTTI". 
+ */ + static const char fgClassID; +}; + + + +U_NAMESPACE_END + +#endif + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ + + diff --git a/icu4c/source/common/punycode.c b/icu4c/source/common/punycode.c new file mode 100644 index 00000000000..29264177789 --- /dev/null +++ b/icu4c/source/common/punycode.c @@ -0,0 +1,563 @@ +/* +******************************************************************************* +* +* Copyright (C) 2002, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: punycode.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002jan31 +* created by: Markus W. Scherer +*/ + + +/* This ICU code derived from: */ +/* +punycode.c 0.4.0 (2001-Nov-17-Sat) +http://www.cs.berkeley.edu/~amc/idn/ +Adam M. Costello +http://www.nicemice.net/amc/ +*/ +/* + * ICU modifications: + * - ICU data types and coding conventions + * - ICU string buffer handling with implicit source lengths + * and destination preflighting + * - UTF-16 handling + */ + +#include "unicode/utypes.h" +#include "ustr_imp.h" +#include "cstring.h" +#include "cmemory.h" +#include "punycode.h" +#include "unicode/ustring.h" + + +/* Punycode ----------------------------------------------------------------- */ + +/* Punycode parameters for Bootstring */ +#define BASE 36 +#define TMIN 1 +#define TMAX 26 +#define SKEW 38 +#define DAMP 700 +#define INITIAL_BIAS 72 +#define INITIAL_N 0x80 + +/* "Basic" Unicode/ASCII code points */ +#define _HYPHEN 0X2d +#define DELIMITER _HYPHEN + +#define _ZERO 0X30 +#define _NINE 0x39 + +#define _SMALL_A 0X61 +#define _SMALL_Z 0X7a + +#define _CAPITAL_A 0X41 +#define _CAPITAL_Z 0X5a + +#define IS_BASIC(c) ((c)<0x80) +#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z) + +/** + * digitToBasic() returns the basic code point whose 
value + * (when used for representing integers) is d, which must be in the + * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is + * nonzero, in which case the uppercase form is used. + */ +U_INLINE char +digitToBasic(int32_t digit, UBool uppercase) { + /* 0..25 map to ASCII a..z or A..Z */ + /* 26..35 map to ASCII 0..9 */ + if(digit<26) { + if(uppercase) { + return (char)(_CAPITAL_A+digit); + } else { + return (char)(_SMALL_A+digit); + } + } else { + return (char)((_ZERO-26)+digit); + } +} + +/** + * basicToDigit[] contains the numeric value of a basic code + * point (for use in representing integers) in the range 0 to + * BASE-1, or -1 if b is does not represent a value. + */ +static int8_t +basicToDigit[256]={ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; + +U_INLINE char +asciiCaseMap(char b, UBool uppercase) { + if(uppercase) { + if(_SMALL_A<=b && b<=_SMALL_Z) { + b-=(_SMALL_A-_CAPITAL_A); + } + } else { + if(_CAPITAL_A<=b && b<=_CAPITAL_Z) { + 
b+=(_SMALL_A-_CAPITAL_A); + } + } + return b; +} + +/* Punycode-specific Bootstring code ---------------------------------------- */ + +/* + * The following code omits the {parts} of the pseudo-algorithm in the spec + * that are not used with the Punycode parameter set. + */ + +/* Bias adaptation function. */ +static int32_t +adaptBias(int32_t delta, int32_t length, UBool firstTime) { + int32_t count; + + if(firstTime) { + delta/=DAMP; + } else { + delta/=2; + } + + delta+=delta/length; + for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { + delta/=(BASE-TMIN); + } + + return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); +} + +#define MAX_CP_COUNT 200 + +U_CFUNC int32_t +u_strToPunycode(const UChar *src, int32_t srcLength, + UChar *dest, int32_t destCapacity, + const UBool *caseFlags, + UErrorCode *pErrorCode) { + + int32_t cpBuffer[MAX_CP_COUNT]; + int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount; + UChar c, c2; + + /* argument checking */ + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + + if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* + * Handle the basic code points and + * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): + */ + srcCPCount=destLength=0; + if(srcLength==-1) { + /* NUL-terminated input */ + for(j=0; /* no condition */; ++j) { + if((c=src[j])==0) { + break; + } + if(srcCPCount==MAX_CP_COUNT) { + /* too many input code points */ + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + if(IS_BASIC(c)) { + cpBuffer[srcCPCount++]=0; + if(destLength0) { + if(destLength state to , but guard against overflow: + */ + if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) { + *pErrorCode=U_INTERNAL_PROGRAM_ERROR; + return 0; + } + delta+=(m-n)*(handledCPCount+1); + n=m; + + /* Encode a sequence of same code points n */ + for(j=0; jTMAX) { + t=TMAX; + } + */ + + t=k-bias; + 
if(t=(bias+TMAX)) { + t=TMAX; + } + + if(q0;) { + if(src[--j]==DELIMITER) { + break; + } + } + destLength=basicLength=destCPCount=j; + + while(j>0) { + b=src[--j]; + if(!IS_BASIC(b)) { + *pErrorCode=U_INVALID_CHAR_FOUND; + return 0; + } + + if(j0 ? basicLength+1 : 0; in=srcLength) { + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + return 0; + } + + digit=basicToDigit[(uint8_t)src[in++]]; + if(digit<0) { + *pErrorCode=U_INVALID_CHAR_FOUND; + return 0; + } + if(digit>(0x7fffffff-i)/w) { + /* integer overflow */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + return 0; + } + + i+=digit*w; + /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt + t=k-bias; + if(tTMAX) { + t=TMAX; + } + */ + t=k-bias; + if(t=(bias+TMAX)) { + t=TMAX; + } + if(digit0x7fffffff/(BASE-t)) { + /* integer overflow */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + return 0; + } + w*=BASE-t; + } + + /* + * Modification from sample code: + * Increments destCPCount here, + * where needed instead of in for() loop tail. 
+ */ + ++destCPCount; + bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0)); + + /* + * i was supposed to wrap around from (incremented) destCPCount to 0, + * incrementing n each time, so we'll fix that now: + */ + if(i/destCPCount>(0x7fffffff-n)) { + /* integer overflow */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + return 0; + } + + n+=i/destCPCount; + i%=destCPCount; + /* not needed for Punycode: */ + /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ + + if(n>0x10ffff || UTF_IS_SURROGATE(n)) { + /* Unicode code point overflow */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + return 0; + } + + /* Insert n at position i of the output: */ + cpLength=UTF_CHAR_LENGTH(n); + if((destLength+cpLength)1) { + firstSupplementaryIndex=codeUnitIndex; + } else { + ++firstSupplementaryIndex; + } + } else { + codeUnitIndex=firstSupplementaryIndex; + UTF_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex); + } + + /* use the UChar index codeUnitIndex instead of the code point index i */ + if(codeUnitIndex=64). + * U_INDEX_OUTOFBOUNDS_ERROR is set if the limit is exceeded. + * @param srcLength Number of UChars in src, or -1 if NUL-terminated. + * @param dest Output Punycode array. + * @param destCapacity Size of dest. + * @param caseFlags Vector of boolean values, one per input UChar, + * indicating that the corresponding character is to be + * marked for the decoder optionally + * uppercasing (TRUE) or lowercasing (FALSE) + * the character. + * ASCII characters are output directly in the case as marked. + * Flags corresponding to trail surrogates are ignored. + * If caseFlags==NULL then input characters are not + * case-mapped. + * @param pErrorCode ICU in/out error code parameter. + * U_INVALID_CHAR_FOUND if src contains + * unmatched single surrogates. + * U_INDEX_OUTOFBOUNDS_ERROR if src contains + * too many code points. + * @return Number of ASCII characters in puny. 
+ * + * @see u_strFromPunycode + */ +U_CFUNC int32_t +u_strToPunycode(const UChar *src, int32_t srcLength, + UChar *dest, int32_t destCapacity, + const UBool *caseFlags, + UErrorCode *pErrorCode); + +/** + * u_strFromPunycode() converts Punycode to Unicode. + * The Unicode string will be at most as long (in UChars) + * than the Punycode string (in chars). + * + * @param src Input Punycode string. + * @param srcLength Length of puny, or -1 if NUL-terminated + * @param dest Output Unicode string buffer. + * @param destCapacity Size of dest in number of UChars, + * and of caseFlags in numbers of UBools. + * @param caseFlags Output array for case flags as + * defined by the Punycode string. + * The caller should uppercase (TRUE) or lowercase (FASLE) + * the corresponding character in dest. + * For supplementary characters, only the lead surrogate + * is marked, and FALSE is stored for the trail surrogate. + * This is redundant and not necessary for ASCII characters + * because they are already in the case indicated. + * Can be NULL if the case flags are not needed. + * @param pErrorCode ICU in/out error code parameter. + * U_INVALID_CHAR_FOUND if a non-ASCII character + * precedes the last delimiter ('-'), + * or if an invalid character (not a-zA-Z0-9) is found + * after the last delimiter. + * U_ILLEGAL_CHAR_FOUND if the delta sequence is ill-formed. + * @return Number of UChars written to dest. 
+ * + * @see u_strToPunycode + */ +U_CFUNC int32_t +u_strFromPunycode(const UChar *src, int32_t srcLength, + UChar *dest, int32_t destCapacity, + UBool *caseFlags, + UErrorCode *pErrorCode); + +#endif + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ + diff --git a/icu4c/source/common/putil.c b/icu4c/source/common/putil.c index c155fd1e4d8..f99bbd7297e 100644 --- a/icu4c/source/common/putil.c +++ b/icu4c/source/common/putil.c @@ -1761,6 +1761,18 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = { "U_REGEX_INVALID_FLAG" }; +static const char * const +_uIDNAErrorName[U_IDNA_ERROR_LIMIT - U_IDNA_ERROR_START] = { + "U_IDNA_ERROR_START", + "U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR", + "U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR", + "U_IDNA_CHECK_BIDI_ERROR", + "U_IDNA_STD3_ASCII_RULES_ERROR", + "U_IDNA_ACE_PREFIX_ERROR", + "U_IDNA_VERIFICATION_ERROR", + "U_IDNA_LABEL_TOO_LONG_ERROR" +}; + U_CAPI const char * U_EXPORT2 u_errorName(UErrorCode code) { if(U_ZERO_ERROR <= code && code < U_STANDARD_ERROR_LIMIT) { @@ -1775,6 +1787,8 @@ u_errorName(UErrorCode code) { return _uBrkErrorName[code - U_BRK_ERROR_START]; } else if (U_REGEX_ERROR_START <= code && code < U_REGEX_ERROR_LIMIT) { return _uRegexErrorName[code - U_REGEX_ERROR_START]; + } else if( U_IDNA_ERROR_START <= code && code < U_IDNA_ERROR_LIMIT) { /* LIMIT is exclusive: the table has LIMIT-START entries */ + return _uIDNAErrorName[code - U_IDNA_ERROR_START]; } else { return "[BOGUS UErrorCode]"; } diff --git a/icu4c/source/common/sprpimpl.h b/icu4c/source/common/sprpimpl.h new file mode 100644 index 00000000000..2589d934883 --- /dev/null +++ b/icu4c/source/common/sprpimpl.h @@ -0,0 +1,65 @@ +/* + ******************************************************************************* + * + * Copyright (C) 2002, International Business Machines + * Corporation and others. All Rights Reserved. 
+ * + ******************************************************************************* + * file name: strprep.h + * encoding: US-ASCII + * tab size: 8 (not used) + * indentation:4 + * + * created on: 2003feb1 + * created by: Ram Viswanadha + */ + +#ifndef SPRPIMPL_H +#define SPRPIMPL_H + +enum{ + UIDNA_NO_VALUE = 0x0000 , + UIDNA_UNASSIGNED = 0x0001 , + UIDNA_PROHIBITED = 0x0002 , + UIDNA_MAP_NFKC = 0x0003 , + UIDNA_LABEL_SEPARATOR = 0x0004 , +}; +enum{ + _IDNA_LENGTH_IN_MAPPING_TABLE = 0x0003 /*11*/ +}; +/* indexes[] value names */ +enum { + _IDNA_INDEX_TRIE_SIZE, /* number of bytes in normalization trie */ + _IDNA_INDEX_MAPPING_DATA_SIZE, /* The array that contains the mapping */ + _IDNA_INDEX_TOP=3 /* changing this requires a new formatVersion */ +}; + +enum { + _IDNA_MAPPING_DATA_SIZE = 1700, + _IDNA_MAP_TO_NOTHING = 0xFFF +}; + +U_CFUNC UBool U_EXPORT2 +ustrprep_cleanup(); + +/* error codes for prototyping +#define U_IDNA_ERROR_START U_ERROR_LIMIT +#define U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 1)) +#define U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 2)) +#define U_IDNA_CHECK_BIDI_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 3)) +#define U_IDNA_STD3_ASCII_RULES_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 4)) +#define U_IDNA_ACE_PREFIX_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 5)) +#define U_IDNA_VERIFICATION_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 6)) +#define U_IDNA_LABEL_TOO_LONG_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 8)) +*/ +#endif + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ + diff --git a/icu4c/source/common/strprep.cpp b/icu4c/source/common/strprep.cpp new file mode 100644 index 00000000000..861cc7c4702 --- /dev/null +++ b/icu4c/source/common/strprep.cpp @@ -0,0 +1,530 @@ +/* + ******************************************************************************* + * + * Copyright (C) 2002, International Business Machines + * 
Corporation and others. All Rights Reserved. + * + ******************************************************************************* + * file name: strprep.cpp + * encoding: US-ASCII + * tab size: 8 (not used) + * indentation:4 + * + * created on: 2003feb1 + * created by: Ram Viswanadha + */ + +#include "strprep.h" +#include "utrie.h" +#include "umutex.h" +#include "cmemory.h" +#include "sprpimpl.h" +#include "nameprep.h" +#include "ustr_imp.h" +#include "unormimp.h" +#include "unicode/unorm.h" +#include "unicode/udata.h" +#include "unicode/ustring.h" + +static const uint16_t* mappingData = NULL; +static int32_t indexes[_IDNA_INDEX_TOP]={ 0 }; +static UBool _isDataLoaded = FALSE; +static UTrie idnTrie={ 0,0,0,0,0,0,0 }; +static UDataMemory* idnData=NULL; +static UErrorCode dataErrorCode =U_ZERO_ERROR; +/* file definitions */ +static const char* DATA_NAME = "uidna"; +static const char* DATA_TYPE = "icu"; + +U_CFUNC UBool U_EXPORT2 +ustrprep_cleanup() { + if(idnData!=NULL) { + udata_close(idnData); + idnData=NULL; + } + dataErrorCode=U_ZERO_ERROR; + _isDataLoaded=FALSE; + + return TRUE; +} + +static UBool U_CALLCONV +isAcceptable(void * /* context */, + const char * /* type */, + const char * /* name */, + const UDataInfo *pInfo) { + if( + pInfo->size>=20 && + pInfo->isBigEndian==U_IS_BIG_ENDIAN && + pInfo->charsetFamily==U_CHARSET_FAMILY && + pInfo->dataFormat[0]==0x49 && /* dataFormat="IDNA" 0x49, 0x44, 0x4e, 0x41 */ + pInfo->dataFormat[1]==0x44 && + pInfo->dataFormat[2]==0x4e && + pInfo->dataFormat[3]==0x41 && + pInfo->formatVersion[0]==2 && + pInfo->formatVersion[2]==UTRIE_SHIFT && + pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT + ) { + return TRUE; + } else { + return FALSE; + } +} + +static int32_t U_CALLCONV +getFoldingOffset(uint32_t data) { + if(data&0x8000) { + return (int32_t)(data&0x7fff); + } else { + return 0; + } +} + +static UBool U_CALLCONV +loadData(UErrorCode &errorCode) { + /* load Unicode IDNA data from file */ + + if(_isDataLoaded==FALSE) { + UTrie 
_idnTrie={ 0,0,0,0,0,0,0 }; + UDataMemory *data; + const int32_t *p=NULL; + const uint8_t *pb; + + if(&errorCode==NULL || U_FAILURE(errorCode)) { + return 0; + } + + /* open the data outside the mutex block */ + //TODO: change the path + data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode); + dataErrorCode=errorCode; + if(U_FAILURE(errorCode)) { + return _isDataLoaded=FALSE; + } + + p=(const int32_t *)udata_getMemory(data); + pb=(const uint8_t *)(p+_IDNA_INDEX_TOP); + utrie_unserialize(&_idnTrie, pb, p[_IDNA_INDEX_TRIE_SIZE], &errorCode); + _idnTrie.getFoldingOffset=getFoldingOffset; + + + if(U_FAILURE(errorCode)) { + dataErrorCode=errorCode; + udata_close(data); + return _isDataLoaded=FALSE; + } + + /* in the mutex block, set the data for this process */ + umtx_lock(NULL); + if(idnData==NULL) { + idnData=data; + data=NULL; + uprv_memcpy(&indexes, p, sizeof(indexes)); + uprv_memcpy(&idnTrie, &_idnTrie, sizeof(UTrie)); + } else { + p=(const int32_t *)udata_getMemory(idnData); + } + umtx_unlock(NULL); + /* initialize some variables */ + mappingData=(uint16_t *)((uint8_t *)(p+_IDNA_INDEX_TOP)+indexes[_IDNA_INDEX_TRIE_SIZE]); + + _isDataLoaded = TRUE; + + /* if a different thread set it first, then close the extra data */ + if(data!=NULL) { + udata_close(data); /* NULL if it was set correctly */ + } + } + + return _isDataLoaded; +} + + +static inline +void syntaxError(const UChar* rules, + int32_t pos, + int32_t rulesLen, + UParseError* parseError) { + + if(parseError == NULL){ + return; + } + if(pos == rulesLen && rulesLen >0){ + pos--; + } + parseError->offset = pos; + parseError->line = 0 ; // we are not using line numbers + + // for pre-context: copy at most U_PARSE_CONTEXT_LEN-1 UChars so the terminating NUL still fits + int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 
0 : (pos - (U_PARSE_CONTEXT_LEN-1)); + int32_t stop = pos; + + u_memcpy(parseError->preContext,rules+start,stop-start); + //null terminate the buffer + parseError->preContext[stop-start] = 0; + + //for post-context + start = pos+1; + stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) : + rulesLen; + + u_memcpy(parseError->postContext,rules+start,stop-start); + //null terminate the buffer + parseError->postContext[stop-start]= 0; + +} + +// ***************************************************************************** +// class StringPrep +// ***************************************************************************** + +U_NAMESPACE_BEGIN + +const char StringPrep::fgClassID=0; + +UBool StringPrep::isDataLoaded(UErrorCode& status){ + if(U_FAILURE(status)){ + return FALSE; + } + if(_isDataLoaded==FALSE && U_FAILURE(dataErrorCode)){ + status = dataErrorCode; + return FALSE; + } + loadData(dataErrorCode); + if(U_FAILURE(dataErrorCode)){ + status = dataErrorCode; + return FALSE; + } + return TRUE; +} + + +StringPrep* StringPrep::createDefaultInstance(UErrorCode& status){ + StringPrep* strprep = new StringPrep(); + if(!isDataLoaded(status)){ + delete strprep; + return NULL; + } + return strprep; +} + +StringPrep* StringPrep::createNameprepInstance(UErrorCode& status){ + StringPrep* strprep = new NamePrep(status); + if(!isDataLoaded(status)){ + delete strprep; + return NULL; + } + return strprep; +} + +UBool StringPrep::isNotProhibited(UChar32 ch){ + return FALSE; +} +UBool StringPrep::isUnassigned(UChar32 ch){ + + uint32_t result; + UTRIE_GET16(&idnTrie,ch,result); + return (result == UIDNA_UNASSIGNED); + +} + + +static inline void getValues(uint32_t result, int8_t& flag, + int8_t& length, int32_t& index){ + /* first 3 bits contain the flag */ + flag = (int8_t) (result & 0x07); + /* next 2 bits contain the length */ + length = (int8_t) ((result>>3) & 0x03); + /* next 10 bits contain the index */ + index = (result>> 5); +} + + +int32_t 
StringPrep::map(const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + UBool allowUnassigned, + UParseError* parseError, + UErrorCode& status ){ + + uint32_t result; + int8_t flag; + int8_t length; + int32_t index; + int32_t destIndex=0; + int32_t srcIndex=0; + + // check error status + if(U_FAILURE(status)){ + return 0; + } + + //check arguments + if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { + status=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + if(srcLength == -1){ + srcLength = u_strlen(src); + } + + for(;srcIndex through ) are examples of + this because they have bidirectional category "EN". + + In any profile that specifies bidirectional character handling, all + three of the following requirements MUST be met: + + 1) The characters in section 5.8 MUST be prohibited. + + 2) If a string contains any RandALCat character, the string MUST NOT + contain any LCat character. + + 3) If a string contains any RandALCat character, a RandALCat + character MUST be the first character of the string, and a + RandALCat character MUST be the last character of the string. 
+*/ + +#define MAX_STACK_BUFFER_SIZE 300 + +int32_t StringPrep::process(const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + UBool allowUnassigned, + UParseError* parseError, + UErrorCode& status ){ + // check error status + if(U_FAILURE(status)){ + return 0; + } + + //check arguments + if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { + status=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + UChar b1Stack[MAX_STACK_BUFFER_SIZE], b2Stack[MAX_STACK_BUFFER_SIZE]; + UChar *b1 = b1Stack, *b2 = b2Stack; + int32_t b1Len, b2Len = 0, + b1Capacity = MAX_STACK_BUFFER_SIZE , + b2Capacity = MAX_STACK_BUFFER_SIZE; + uint32_t result; + int32_t b2Index = 0; + int8_t flag; + int8_t length; + int32_t index; + UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT; + UBool leftToRight=FALSE, rightToLeft=FALSE; + int32_t rtlPos =-1, ltrPos =-1; + + b1Len = map(src,srcLength, b1, b1Capacity,allowUnassigned, parseError, status); + + if(status == U_BUFFER_OVERFLOW_ERROR){ + // redo processing of string + /* we do not have enough room so grow the buffer*/ + b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR); + if(b1==NULL){ + status = U_MEMORY_ALLOCATION_ERROR; + goto CLEANUP; + } + + status = U_ZERO_ERROR; // reset error + + b1Len = map(src,srcLength, b1, b1Len,allowUnassigned, parseError, status); + + } + + b2Len = normalize(b1,b1Len, b2,b2Capacity,status); + + if(status == U_BUFFER_OVERFLOW_ERROR){ + // redo processing of string + /* we do not have enough room so grow the buffer*/ + b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR); + if(b2==NULL){ + status = U_MEMORY_ALLOCATION_ERROR; + goto CLEANUP; + } + + status = U_ZERO_ERROR; // reset error + + b2Len = normalize(b1,b1Len, b2,b2Len,status); // retry from the mapped text in b1; the new b2 is still uninitialized + + } + + if(U_FAILURE(status)){ + goto CLEANUP; + } + + UChar32 ch; + + for(; b2IndexltrPos) ? 
rtlPos : ltrPos, b2Len, parseError); + goto CLEANUP; + } + + //satisfy 3 + if(rightToLeft == TRUE && firstCharDir != direction ){ + status = U_IDNA_CHECK_BIDI_ERROR; + syntaxError(b2,b2Index-1,b2Len,parseError); + goto CLEANUP; // release b1/b2 if they were heap-allocated + } + + if(b2Len <= destCapacity){ + uprv_memmove(dest,b2, b2Len*U_SIZEOF_UCHAR); + } + +CLEANUP: + if(b1!=b1Stack){ + uprv_free(b1); + } + if(b2!=b2Stack){ + uprv_free(b2); + } + return u_terminateUChars(dest, destCapacity, b2Len, &status); +} + + +UBool StringPrep::isLabelSeparator(UChar32 ch, UErrorCode& status){ + // check error status + if(U_FAILURE(status)){ + return FALSE; + } + + if(isDataLoaded(status)){ + int32_t result; + UTRIE_GET16(&idnTrie,ch, result); + if( (result & 0x07) == UIDNA_LABEL_SEPARATOR){ + return TRUE; + } + } + return FALSE; +} + +U_NAMESPACE_END + diff --git a/icu4c/source/common/strprep.h b/icu4c/source/common/strprep.h new file mode 100644 index 00000000000..1b64ae8d3ed --- /dev/null +++ b/icu4c/source/common/strprep.h @@ -0,0 +1,360 @@ +/* + ******************************************************************************* + * + * Copyright (C) 2003, International Business Machines + * Corporation and others. All Rights Reserved. + * + ******************************************************************************* + * file name: strprep.h + * encoding: US-ASCII + * tab size: 8 (not used) + * indentation:4 + * + * created on: 2003feb1 + * created by: Ram Viswanadha + */ + +#ifndef STRPREP_H +#define STRPREP_H + +#include "unicode/uobject.h" +#include "unicode/uniset.h" +#include "unicode/parseerr.h" + +U_NAMESPACE_BEGIN + +/**\file + * + * This API implements RFC 3454 StringPrep standard. + * + * The steps for preparing strings are: + * + * 1) Map -- For each character in the input, check if it has a mapping + * and, if so, replace it with its mapping. + *
    + *
  • Delete certain codepoints from the input because their + * presence or absence in the protocol identifiers should not + * make two strings different
  • + *
  • Case Mappings + *
    If Normalization is turned off + *
    Get mappings from case map tables + *
    else + *
    Get mappings from case map tables for normalization + *
    Use u_getFC_NFKC_Closure for obtaining extra mappings + *
  • + *
+ * 2) Normalize -- Possibly normalize the result of step 1 using Unicode + * normalization NFKC. + * + * 3) Prohibit -- Check for any characters that are not allowed in the + * output. If any are found, return an error. + * + * 4) Check bidi -- Possibly check for right-to-left characters, and if + * any are found, make sure that the whole string satisfies the + * requirements for bidirectional strings. If the string does not + * satisfy the requirements for bidirectional strings, return an + * error. + * + * Some StringPrep profiles: + * IDN: "Nameprep" http://www.ietf.org/internet-drafts/draft-ietf-idn-nameprep-11.txt + * XMPP Node Identifiers: "Nodeprep" http://www.ietf.org/internet-drafts/draft-ietf-xmpp-nodeprep-01.txt + * XMPP Resource Identifiers: "Resourceprep" http://www.ietf.org/internet-drafts/draft-ietf-xmpp-resourceprep-01.txt + * ANONYMOUS SASL tokens: "plain" http://www.ietf.org/internet-drafts/draft-ietf-sasl-anon-00.txt + * iSCSI http://www.ietf.org/internet-drafts/draft-ietf-ips-iscsi-string-prep-03.txt + */ +class StringPrep : public UObject{ + +protected: + UVersionInfo unicodeVersion; /** The Character repertoire version of this profile */ + UBool bidiCheck; /** Option to turn BiDi checking on */ + UBool doNFKC; /** Option to turn NFKC on */ + + /** + * Protected default constructor sub classes + */ + StringPrep(){}; + +public: + /** + * Destructor + */ + virtual inline ~StringPrep(){}; + + /** + * Map every character in input stream with mapping character + * in the mapping table and populate the output stream. + * For any individual character the mapping table may specify + * that that a character be mapped to nothing, mapped to one + * other character or to a string of other characters. 
+ * + * @param src Pointer to UChar buffer containing a single label + * @param srcLength Number of characters in the source label + * @param dest Pointer to the destination buffer to receive the output + * @param destCapacity The capacity of destination array + * @param allowUnassigned Unassigned values can be converted to ASCII for query operations + * If TRUE unassigned values are treated as normal Unicode code point. + * If FALSE the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code. + * @param status ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return The number of UChars in the destination buffer + * + */ + virtual int32_t map(const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + UBool allowUnassigned, + UParseError* parseError, + UErrorCode& status ); + + /** + * Normalize the input stream using Normalization Form KC (NFKC) + * + * @param src Pointer to UChar buffer containing a single label + * @param srcLength Number of characters in the source label + * @param dest Pointer to the destination buffer to receive the output + * @param destCapacity The capacity of destination array + * @param status ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return The number of UChars in the destination buffer + * + * + */ + virtual int32_t normalize( const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + UErrorCode& status ); + + + /** + * Prepare the input stream with for use. 
This operation maps, normalizes(NFKC), + * checks for prohited and BiDi characters in the order defined by RFC 3454 + * + * @param src Pointer to UChar buffer containing a single label + * @param srcLength Number of characters in the source label + * @param dest Pointer to the destination buffer to receive the output + * @param destCapacity The capacity of destination array + * @param allowUnassigned Unassigned values can be converted to ASCII for query operations + * If TRUE unassigned values are treated as normal Unicode code point. + * If FALSE the operation fails with U_UNASSIGNED_CODE_POINT error code. + * @param status ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return The number of UChars in the destination buffer + * + * + */ + virtual int32_t process(const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + UBool allowUnassigned, + UParseError* parseError, + UErrorCode& status ); + + /** + * Create a profile from prebuilt default Nameprep profile conforming to + * nameprep internet draft (http://www.ietf.org/html.charters/idn-charter.html). + * This is a built-in/unmodifiable profile. + * + * @param status ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return Pointer to StringPrep object that is created. Should be deleted by + * by caller + * + * + */ + static StringPrep* createNameprepInstance(UErrorCode& status); + + /** + * Create a profile from prebuilt default StringPrep profile conforming to + * RFC 3454 (ftp://ftp.rfc-editor.org/in-notes/rfc3454.txt). + * User defined profiles can be created by getting the default profile and + * adding mappings, removing mappings, turning options ON/OFF and prohibiting + * characters from the output. + * + * @param status ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return Pointer to StringPrep object that is created. Should be deleted by + * the caller. 
+ * + * + */ + static StringPrep* createDefaultInstance(UErrorCode& status); + + /** + * Ascertain if the given code point is a Letter/Digit/Hyphen in the ASCII range + * + * @return TRUE is the code point is a Letter/Digit/Hyphen + * + * + */ + static inline UBool isLDHChar(UChar32 ch); + + /** + * Ascertain if the given code point is a label separator as specified by IDNA + * + * @return TRUE is the code point is a label separator + * + * + */ + virtual UBool isLabelSeparator(UChar32 ch, UErrorCode& status); + + /** + * Get the BiDi option of this profile + * + * + */ + inline UBool getCheckBiDi(); + + /** + * Get the normalization (NFKC) option of this profile + * + * @return The normalization option + * + * + */ + inline UBool getNormalization(); + + /** + * Get the Unicode version which this profile + * conforms to + * + * + */ + inline void getUnicodeVersion(UVersionInfo& info); + +private: + // Boiler plate + + /** + * Copy constructor. + * + */ + StringPrep(const StringPrep&); + + /** + * Assignment operator. + * + */ + StringPrep& operator=(const StringPrep&); + + /** + * Return true if another object is semantically equal to this one. + * + * @param other the object to be compared with. + * @return true if another object is semantically equal to this one. + * + */ + UBool operator==(const StringPrep& other) const {return FALSE;}; + + /** + * Return true if another object is semantically unequal to this one. + * + * @param other the object to be compared with. + * @return true if another object is semantically unequal to this one. + * + */ + UBool operator!=(const StringPrep& other) const { return !operator==(other); } + +public: + + /** + * ICU "poor man's RTTI", returns a UClassID for this class. + * + * + */ + static inline UClassID getStaticClassID(); + + /** + * ICU "poor man's RTTI", returns a UClassID for the actual class. 
+ * + * + */ + virtual inline UClassID getDynamicClassID() const; + +protected: + + /** + * Sub classes that slightly modify the default profile + * implement this method to remove characters to + * the prohibited list. The default implementation does not + * check if the data is loaded or not. The caller is responsible + * for checking for data. + * + */ + virtual UBool isNotProhibited(UChar32 ch); + + /** + * Sub classes that slightly modify the default profile + * implement this method to remove characters to + * the unassigned list. The default implementation does not + * check if the data is loaded or not. The caller is responsible + * for checking for data. + */ + virtual UBool isUnassigned(UChar32 ch); + + /** + * Ascertains if uidna.icu data file is loaded. + * If data is not loaded, loads the data file. + * + * + */ + static UBool isDataLoaded(UErrorCode& status); + +private: + + /** + * The address of this static class variable serves as this class's ID + * for ICU "poor man's RTTI". 
+ */ + static const char fgClassID; + +}; + +inline UBool StringPrep::getCheckBiDi(){ + return bidiCheck; +} + + +inline UBool StringPrep::getNormalization(){ + return doNFKC; +} + +inline void StringPrep::getUnicodeVersion(UVersionInfo& info){ + for(int32_t i=0; i< (sizeof(info)/sizeof(info[0])); i++){ + info[i] = unicodeVersion[i]; + } +} + +inline UClassID StringPrep::getStaticClassID() { + return (UClassID)&fgClassID; +} + +inline UClassID StringPrep::getDynamicClassID() const { + return getStaticClassID(); +} + +inline UBool StringPrep::isLDHChar(UChar32 ch){ + // high runner case + if(ch>0x007A){ + return FALSE; + } + //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A] + if( (ch==0x002D) || + (0x0030 <= ch && ch <= 0x0039) || + (0x0041 <= ch && ch <= 0x005A) || + (0x0061 <= ch && ch <= 0x007A) + ){ + return TRUE; + } + return FALSE; +} + +U_NAMESPACE_END + +#endif + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ + diff --git a/icu4c/source/common/uidna.cpp b/icu4c/source/common/uidna.cpp new file mode 100644 index 00000000000..50c0ea33c69 --- /dev/null +++ b/icu4c/source/common/uidna.cpp @@ -0,0 +1,735 @@ +/* + ******************************************************************************* + * + * Copyright (C) 2002, International Business Machines + * Corporation and others. All Rights Reserved. 
+ * + ******************************************************************************* + * file name: strprep.cpp + * encoding: US-ASCII + * tab size: 8 (not used) + * indentation:4 + * + * created on: 2003feb1 + * created by: Ram Viswanadha + */ +#include "unicode/uidna.h" +#include "unicode/ustring.h" +#include "strprep.h" +#include "punycode.h" +#include "ustr_imp.h" +#include "cmemory.h" +#include "sprpimpl.h" + +/* it is official IDNA ACE Prefix is "xn--" */ +static const UChar ACE_PREFIX[] ={ 0x0078,0x006E,0x002d,0x002d } ; +#define ACE_PREFIX_LENGTH 4 + +#define MAX_LABEL_LENGTH 63 +#define HYPHEN 0x002D +/* The Max length of the labels should not be more than 64 */ +#define MAX_LABEL_BUFFER_SIZE 100 +#define MAX_IDN_BUFFER_SIZE 300 + +#define CAPITAL_A 0x0041 +#define CAPITAL_Z 0x005A +#define LOWER_CASE_DELTA 0x0020 +#define FULL_STOP 0x002E + +inline static UBool +startsWithPrefix(const UChar* src , int32_t srcLength){ + UBool startsWithPrefix = TRUE; + + if(srcLength < ACE_PREFIX_LENGTH){ + return FALSE; + } + + for(int8_t i=0; i< ACE_PREFIX_LENGTH; i++){ + if(u_tolower(src[i]) != ACE_PREFIX[i]){ + startsWithPrefix = FALSE; + } + } + return startsWithPrefix; +} + +inline static UChar +toASCIILower(UChar ch){ + if(CAPITAL_A <= ch && ch <= CAPITAL_Z){ + return ch + LOWER_CASE_DELTA; + } + return ch; +} + +inline static int32_t +compareCaseInsensitiveASCII(const UChar* s1, int32_t s1Len, + const UChar* s2, int32_t s2Len){ + if(s1Len != s2Len){ + return (s1Len > s2Len) ? 
s1Len : s2Len; + } + UChar c1,c2; + int32_t rc; + + for(int32_t i =0;/* no condition */;i++) { + /* If we reach the ends of both strings then they match */ + if(i == s1Len) { + return 0; + } + + c1 = s1[i]; + c2 = s2[i]; + + /* Case-insensitive comparison */ + if(c1!=c2) { + rc=(int32_t)toASCIILower(c1)-(int32_t)toASCIILower(c2); + if(rc!=0) { + return rc; + } + } + } + +} + +static inline +void syntaxError(const UChar* rules, + int32_t pos, + int32_t rulesLen, + UParseError* parseError) { + + if(parseError == NULL){ + return; + } + if(pos == rulesLen && rulesLen >0){ + pos--; + } + parseError->offset = pos; + parseError->line = 0 ; // we are not using line numbers + + // for pre-context + int32_t start = (pos <=U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1)); + int32_t stop = pos; + + u_memcpy(parseError->preContext,rules+start,stop-start); + //null terminate the buffer + parseError->preContext[stop-start] = 0; + + //for post-context + start = pos+1; + stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? 
(pos+(U_PARSE_CONTEXT_LEN-1)) : + rulesLen; + + u_memcpy(parseError->postContext,rules+start,stop-start); + //null terminate the buffer + parseError->postContext[stop-start]= 0; + +} + +U_CAPI int32_t U_EXPORT2 +uidna_toASCII(const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + int32_t options, + UParseError* parseError, + UErrorCode* status){ + + if(status == NULL || U_FAILURE(*status)){ + return 0; + } + if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE]; + //initialize pointers to stack buffers + UChar *b1 = b1Stack, *b2 = b2Stack; + int32_t b1Len, b2Len, + b1Capacity = MAX_LABEL_BUFFER_SIZE, + b2Capacity = MAX_LABEL_BUFFER_SIZE , + reqLength=0; + + + UBool* caseFlags = NULL; + + // the source contains all ascii codepoints + UBool srcIsASCII = TRUE; + // assume the source contains all LDH codepoints + UBool srcIsLDH = TRUE; + + int32_t j=0; + + //get the options + UBool allowUnassigned = options & UIDNA_ALLOW_UNASSIGNED; + UBool useSTD3ASCIIRules = (options & UIDNA_USE_STD3_RULES) >>1; + + int32_t failPos = -1; + // step 2 + StringPrep* prep = StringPrep::createNameprepInstance(*status); + + if(U_FAILURE(*status)){ + goto CLEANUP; + } + + b1Len = prep->process(src,srcLength,b1, b1Capacity,allowUnassigned, parseError, *status); + + if(*status == U_BUFFER_OVERFLOW_ERROR){ + // redo processing of string + // we do not have enough room so grow the buffer + b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR); + if(b1==NULL){ + *status = U_MEMORY_ALLOCATION_ERROR; + goto CLEANUP; + } + + *status = U_ZERO_ERROR; // reset error + + b1Len = prep->process(src,srcLength,b1, b1Len,allowUnassigned, parseError, *status); + } + // error bail out + if(U_FAILURE(*status)){ + goto CLEANUP; + } + + // step 3 & 4 + for( j=0;j 0x7F){ + srcIsASCII = FALSE; + } + // here we do not assemble surrogates + // since we know 
that LDH code points + // are in the ASCII range only + if(prep->isLDHChar(b1[j])==FALSE){ + srcIsLDH = FALSE; + failPos = j; + } + } + + if(useSTD3ASCIIRules == TRUE){ + // verify 3a and 3b + if( srcIsLDH == FALSE /* source contains some non-LDH characters */ + || b1[0] == HYPHEN || b1[b1Len-1] == HYPHEN){ + *status = U_IDNA_STD3_ASCII_RULES_ERROR; + + /* populate the parseError struct */ + if(srcIsLDH==FALSE){ + syntaxError(b1,failPos-1,b1Len,parseError); + }else if(b1[0] == HYPHEN){ + syntaxError(b1,0,b1Len,parseError); + }else{ + syntaxError(b1,b1Len-1,b1Len,parseError); + } + + goto CLEANUP; + } + } + if(srcIsASCII){ + if(b1Len <= destCapacity){ + uprv_memmove(dest, b1, b1Len * U_SIZEOF_UCHAR); + reqLength = b1Len; + }else{ + reqLength = b1Len; + goto CLEANUP; + } + }else{ + // step 5 : verify the sequence does not begin with ACE prefix + if(!startsWithPrefix(b1,b1Len)){ + + //step 6: encode the sequence with punycode + caseFlags = (UBool*) uprv_malloc(b1Len * sizeof(UBool)); + + b2Len = u_strToPunycode(b1,b1Len,b2,b2Capacity,caseFlags, status); + + if(*status == U_BUFFER_OVERFLOW_ERROR){ + // redo processing of string + /* we do not have enough room so grow the buffer*/ + b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR); + if(b2 == NULL){ + *status = U_MEMORY_ALLOCATION_ERROR; + goto CLEANUP; + } + + *status = U_ZERO_ERROR; // reset error + + b2Len = u_strToPunycode(b1,b1Len,b2,b2Len,caseFlags, status); + } + //error bail out + if(U_FAILURE(*status)){ + goto CLEANUP; + } + reqLength = b2Len+ACE_PREFIX_LENGTH; + + if(reqLength > destCapacity){ + *status = U_BUFFER_OVERFLOW_ERROR; + goto CLEANUP; + } + //Step 7: prepend the ACE prefix + uprv_memcpy(dest,ACE_PREFIX,ACE_PREFIX_LENGTH * U_SIZEOF_UCHAR); + //Step 6: copy the contents in b2 into dest + uprv_memcpy(dest+ACE_PREFIX_LENGTH, b2, b2Len * U_SIZEOF_UCHAR); + + }else{ + *status = U_IDNA_ACE_PREFIX_ERROR; + syntaxError(b1,0,b1Len,parseError); + goto CLEANUP; + } + } + + if(reqLength > MAX_LABEL_LENGTH){ + 
*status = U_IDNA_LABEL_TOO_LONG_ERROR; + } + +CLEANUP: + if(b1 != b1Stack){ + uprv_free(b1); + } + if(b2 != b2Stack){ + uprv_free(b2); + } + uprv_free(caseFlags); + + delete prep; + + return u_terminateUChars(dest, destCapacity, reqLength, status); +} + + +U_CAPI int32_t U_EXPORT2 +uidna_toUnicode(const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + int32_t options, + UParseError* parseError, + UErrorCode* status){ + + if(status == NULL || U_FAILURE(*status)){ + return 0; + } + if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + //get the options + UBool allowUnassigned = options & UIDNA_ALLOW_UNASSIGNED; + UBool useSTD3ASCIIRules = (options & UIDNA_USE_STD3_RULES) >>1; + + UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE], b3Stack[MAX_LABEL_BUFFER_SIZE]; + + //initialize pointers to stack buffers + UChar *b1 = b1Stack, *b2 = b2Stack, *b1Prime=NULL, *b3=b3Stack; + int32_t b1Len, b2Len, b1PrimeLen, b3Len, + b1Capacity = MAX_LABEL_BUFFER_SIZE, + b2Capacity = MAX_LABEL_BUFFER_SIZE, + b3Capacity = MAX_LABEL_BUFFER_SIZE, + reqLength=0; + + StringPrep* prep = StringPrep::createNameprepInstance(*status); + b1Len = 0; + UBool* caseFlags = NULL; + + UBool srcIsASCII = TRUE; + + if(U_FAILURE(*status)){ + goto CLEANUP; + } + // step 1: find out if all the codepoints in src are ASCII + if(srcLength==-1){ + srcLength = 0; + for(;src[srcLength]!=0;){ + if(src[srcLength]> 0x7f){ + srcIsASCII = FALSE; + } + srcLength++; + } + }else{ + for(int32_t j=0; j 0x7f){ + srcIsASCII = FALSE; + } + } + } + + if(srcIsASCII == FALSE){ + // step 2: process the string + b1Len = prep->process(src,srcLength,b1,b1Capacity,allowUnassigned, parseError, *status); + if(*status == U_BUFFER_OVERFLOW_ERROR){ + // redo processing of string + /* we do not have enough room so grow the buffer*/ + b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR); + if(b1==NULL){ + *status = 
U_MEMORY_ALLOCATION_ERROR; + goto CLEANUP; + } + + *status = U_ZERO_ERROR; // reset error + + b1Len = prep->process(src,srcLength,b1, b1Len,allowUnassigned, parseError, *status); + } + //bail out on error + if(U_FAILURE(*status)){ + goto CLEANUP; + } + }else{ + + //just point src to b1 + b1 = (UChar*) src; + b1Len = srcLength; + } + //step 3: verify ACE Prefix + if(startsWithPrefix(src,srcLength)){ + + //step 4: Remove the ACE Prefix + b1Prime = b1 + ACE_PREFIX_LENGTH; + b1PrimeLen = b1Len - ACE_PREFIX_LENGTH; + + //step 5: Decode using punycode + b2Len = u_strFromPunycode(b1Prime, b1PrimeLen, b2, b2Capacity, caseFlags,status); + + if(*status == U_BUFFER_OVERFLOW_ERROR){ + // redo processing of string + /* we do not have enough room so grow the buffer*/ + b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR); + if(b2==NULL){ + *status = U_MEMORY_ALLOCATION_ERROR; + goto CLEANUP; + } + + *status = U_ZERO_ERROR; // reset error + + b2Len = u_strFromPunycode(b1Prime, b1PrimeLen, b2, b2Len, caseFlags, status); + + } + + + //step 6:Apply toASCII + b3Len = uidna_toASCII(b2, b2Len, b3, b3Capacity,options,parseError, status); + + if(*status == U_BUFFER_OVERFLOW_ERROR){ + // redo processing of string + /* we do not have enough room so grow the buffer*/ + b3 = (UChar*) uprv_malloc(b3Len * U_SIZEOF_UCHAR); + if(b3==NULL){ + *status = U_MEMORY_ALLOCATION_ERROR; + goto CLEANUP; + } + + *status = U_ZERO_ERROR; // reset error + + b3Len = uidna_toASCII(b2,b2Len,b3,b3Len,options,parseError, status); + + } + //bail out on error + if(U_FAILURE(*status)){ + goto CLEANUP; + } + + //step 7: verify + if(compareCaseInsensitiveASCII(b1, b1Len, b3, b3Len) !=0){ + *status = U_IDNA_VERIFICATION_ERROR; + goto CLEANUP; + } + + //step 8: return output of step 5 + reqLength = b2Len; + if(b2Len <= destCapacity) { + uprv_memmove(dest, b2, b2Len * U_SIZEOF_UCHAR); + } + }else{ + //copy the source to destination + if(srcLength <= destCapacity){ + uprv_memmove(dest,src,srcLength * U_SIZEOF_UCHAR); + } + 
reqLength = srcLength; + } + +CLEANUP: + + if(b1 != b1Stack && b1!=src){ + uprv_free(b1); + } + if(b2 != b2Stack){ + uprv_free(b2); + } + uprv_free(caseFlags); + + delete prep; + + return u_terminateUChars(dest, destCapacity, reqLength, status); +} + +static int32_t +getNextSeparator(UChar *src,int32_t srcLength,StringPrep* prep, + UChar **limit, + UBool *done, + UErrorCode *status){ + if(srcLength == -1){ + int32_t i; + for(i=0 ; ;i++){ + if(src[i] == 0){ + *limit = src + i; // point to null + *done = TRUE; + return i; + } + if(prep->isLabelSeparator(src[i],*status)){ + *limit = src + (i+1); // go past the delimiter + return i; + + } + } + // we have not found the delimiter + if(i==srcLength){ + *limit = src+srcLength; + *done = TRUE; + } + return i; + }else{ + int32_t i; + for(i=0;iisLabelSeparator(src[i],*status)){ + *limit = src + (i+1); // go past the delimiter + return i; + } + } + // we have not found the delimiter + if(i==srcLength){ + *limit = src+srcLength; + *done = TRUE; + } + return i; + } +} + +U_CAPI int32_t U_EXPORT2 +uidna_IDNToASCII( const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + int32_t options, + UParseError* parseError, + UErrorCode* status){ + + if(status == NULL || U_FAILURE(*status)){ + return 0; + } + if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + //get the options + UBool allowUnassigned = options & UIDNA_ALLOW_UNASSIGNED; + UBool useSTD3ASCIIRules = (options & UIDNA_USE_STD3_RULES) >>1; + + UChar *start=NULL, *limit=NULL; + + int32_t reqLength = 0; + + StringPrep* prep = StringPrep::createNameprepInstance(*status); + + if(U_FAILURE(*status)){ + return 0; + } + + //initialize pointers to stack buffers + UChar b1Stack[MAX_LABEL_BUFFER_SIZE]; + UChar *b1 = b1Stack; + int32_t b1Len, labelLen; + UChar* delimiter = (UChar*)src; + UChar* labelStart = (UChar*)src; + int32_t remainingLen = srcLength; + int32_t b1Capacity = 
MAX_LABEL_BUFFER_SIZE; + UBool done = FALSE; + + + for(;;){ + + labelLen = getNextSeparator(labelStart, -1, prep, &delimiter,&done, status); + + b1Len = uidna_toASCII(labelStart, labelLen, b1, b1Capacity, + options, parseError, status); + + if(*status == U_BUFFER_OVERFLOW_ERROR){ + // for pre-flighting we already know the return length + // do not re-process the string just save the length + // and reset error code + + *status = U_ZERO_ERROR; // reset error + } + + if(U_FAILURE(*status)){ + break; + } + int32_t tempLen = (reqLength + b1Len ); + + // copy to dest + if( tempLen <= destCapacity){ + uprv_memmove(dest+reqLength, b1, b1Len * U_SIZEOF_UCHAR); + } + + reqLength = tempLen; + + // add the label separator + if(done==FALSE){ + if(reqLength < destCapacity){ + dest[reqLength] = FULL_STOP; + } + reqLength++; + } + + labelStart = delimiter; + + if(done == TRUE){ + break; + } + + } + + if(b1 != b1Stack){ + uprv_free(b1); + } + + delete prep; + + return u_terminateUChars(dest, destCapacity, reqLength, status); +} + +U_CAPI int32_t U_EXPORT2 +uidna_IDNToUnicode( const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + int32_t options, + UParseError* parseError, + UErrorCode* status){ + + if(status == NULL || U_FAILURE(*status)){ + return 0; + } + if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + UChar *start=NULL, *limit=NULL; + + int32_t reqLength = 0; + + StringPrep* prep = StringPrep::createNameprepInstance(*status); + + if(U_FAILURE(*status)){ + return 0; + } + + //initialize pointers to stack buffers + UChar b1Stack[MAX_LABEL_BUFFER_SIZE]; + UChar *b1 = b1Stack; + int32_t b1Len, labelLen; + UChar* delimiter = (UChar*)src; + UChar* labelStart = (UChar*)src; + int32_t remainingLen = srcLength; + int32_t b1Capacity = MAX_LABEL_BUFFER_SIZE; + UBool done = FALSE; + + for(;;){ + + labelLen = getNextSeparator(labelStart, -1, prep, &delimiter, &done, status); + + + b1Len 
= uidna_toUnicode( labelStart,labelLen, b1, b1Capacity, + options, parseError, status); + + if(*status == U_BUFFER_OVERFLOW_ERROR){ + // for pre-flighting we already know the return length + // do not re-process the string just save the length + // and reset error code + *status = U_ZERO_ERROR; // reset error + } + + if(U_FAILURE(*status)){ + break; + } + int32_t tempLen = (reqLength + b1Len ); + // copy to dest + if( tempLen <= destCapacity){ + uprv_memmove(dest+reqLength, b1, b1Len * U_SIZEOF_UCHAR); + } + + reqLength = tempLen; + // add the label separator + if(done==FALSE){ + if(reqLength < destCapacity){ + dest[reqLength] = FULL_STOP; + } + reqLength++; + } + + labelStart = delimiter; + + if(done==TRUE){ + break; + } + + } + + if(b1 != b1Stack){ + uprv_free(b1); + } + + delete prep; + + return u_terminateUChars(dest, destCapacity, reqLength, status); +} + +U_CAPI int32_t U_EXPORT2 +uidna_compare( const UChar *s1, int32_t length1, + const UChar *s2, int32_t length2, + int32_t options, + UErrorCode* status){ + + if(status == NULL || U_FAILURE(*status)){ + return -1; + } + + UChar b1Stack[MAX_IDN_BUFFER_SIZE], b2Stack[MAX_IDN_BUFFER_SIZE]; + UChar *b1 = b1Stack, *b2 = b2Stack; + int32_t b1Len, b2Len, b1Capacity = MAX_IDN_BUFFER_SIZE, b2Capacity = MAX_IDN_BUFFER_SIZE; + int32_t result; + + UParseError parseError; + + b1Len = uidna_IDNToASCII(s1, length1, b1, b1Capacity, options, &parseError, status); + if(*status == U_BUFFER_OVERFLOW_ERROR){ + // redo processing of string + b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR); + if(b1==NULL){ + *status = U_MEMORY_ALLOCATION_ERROR; + goto CLEANUP; + } + + *status = U_ZERO_ERROR; // reset error + + b1Len = uidna_IDNToASCII(s1,length1,b1,b1Len, options, &parseError, status); + + } + + b2Len = uidna_IDNToASCII(s2,length2, b2,b2Capacity, options, &parseError, status); + if(*status == U_BUFFER_OVERFLOW_ERROR){ + // redo processing of string + b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR); + if(b2==NULL){ + *status = 
U_MEMORY_ALLOCATION_ERROR; + goto CLEANUP; + } + + *status = U_ZERO_ERROR; // reset error + + b2Len = uidna_IDNToASCII(s2, length2, b2, b2Len, options, &parseError, status); + + } + // when toASCII is applied all label separators are replaced with FULL_STOP + result = compareCaseInsensitiveASCII(b1,b1Len,b2,b2Len); + +CLEANUP: + if(b1 != b1Stack){ + uprv_free(b1); + } + + if(b2 != b2Stack){ + uprv_free(b2); + } + + return result; +} + diff --git a/icu4c/source/common/unicode/uidna.h b/icu4c/source/common/unicode/uidna.h new file mode 100644 index 00000000000..eaec021ded4 --- /dev/null +++ b/icu4c/source/common/unicode/uidna.h @@ -0,0 +1,282 @@ +/* + ******************************************************************************* + * + * Copyright (C) 2003, International Business Machines + * Corporation and others. All Rights Reserved. + * + ******************************************************************************* + * file name: uidna.h + * encoding: US-ASCII + * tab size: 8 (not used) + * indentation:4 + * + * created on: 2003feb1 + * created by: Ram Viswanadha + */ + +#ifndef __UIDNA_H__ +#define __UIDNA_H__ + +#include "unicode/utypes.h" +#include "unicode/parseerr.h" + +/** + *\file + * UIDNA API implements the IDNA protocol as defined in the IDNA draft + * (http://www.ietf.org/internet-drafts/draft-ietf-idn-idna-14.txt). + * The draft defines 2 operations: ToASCII and ToUnicode. Domain labels + * containing non-ASCII code points are required to be processed by + * ToASCII operation before passing it to resolver libraries. Domain names + * that are obtained from resolver libraries are required to be processed by + * ToUnicode operation before displaying the domain name to the user. 
+ * IDNA requires that implementations process input strings with Nameprep + * (http://www.ietf.org/internet-drafts/draft-ietf-idn-nameprep-11.txt), + * which is a profile of Stringprep (http://www.ietf.org/rfc/rfc3454.txt), + * and then with Punycode (http://www.ietf.org/internet-drafts/draft-ietf-idn-punycode-03.txt). + * Implementations of IDNA MUST fully implement Nameprep and Punycode; + * neither Nameprep nor Punycode are optional. + * The input and output of ToASCII and ToUnicode operations are Unicode + * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations + * multiple times to an input string will yield the same result as applying the operation + * once. + * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string) + * ToASCII(ToASCII(ToASCII...(ToASCII(string))) == ToASCII(string). + *\end_file + */ + +#define UIDNA_DEFAULT 0x0000 +#define UIDNA_ALLOW_UNASSIGNED 0x0001 +#define UIDNA_USE_STD3_RULES 0x0002 + +/** + * This function implements the ToASCII operation as defined in the IDNA draft. + * This operation is done on single labels before sending it to something that expects + * ASCII names. A label is an individual part of a domain name. Labels are usually + * separated by dots; for e.g." "www.example.com" is composed of 3 labels + * "www","example", and "com". + * + * + * @param src Input UChar array containing label in Unicode. + * @param srcLength Number of UChars in src, or -1 if NUL-terminated. + * @param dest Output UChar array with ASCII (ACE encoded) label. + * @param destCapacity Size of dest. + * @param options A bit set of options: + * + * - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * U_UNASSIGNED_CODE_POINT_FOUND error code. 
+ * + * - UIDNA_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points are in the input + * are treated as normal Unicode code points. + * + * - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR + * + * @param parseError Pointer to UParseError struct to receive information on position + * of error if an error is encountered. Can be NULL. + * @param status ICU in/out error code parameter. + * U_INVALID_CHAR_FOUND if src contains + * unmatched single surrogates. + * U_INDEX_OUTOFBOUNDS_ERROR if src contains + * too many code points. + * U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough + * @return Number of ASCII characters converted. + * @draft ICU 2.6 + */ +U_CAPI int32_t U_EXPORT2 +uidna_toASCII(const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + int32_t options, + UParseError* parseError, + UErrorCode* status); + + +/** + * This function implements the ToUnicode operation as defined in the IDNA draft. + * This operation is done on single labels before sending it to something that expects + * Unicode names. A label is an individual part of a domain name. Labels are usually + * separated by dots; for e.g." "www.example.com" is composed of 3 labels + * "www","example", and "com". + * + * @param src Input UChar array containing ASCII (ACE encoded) label. + * @param srcLength Number of UChars in src, or -1 if NUL-terminated. + * @param dest Output Converted UChar array containing Unicode equivalent of label. + * @param destCapacity Size of dest. 
+ * @param options A bit set of options: + * + * - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * U_UNASSIGNED_CODE_POINT_FOUND error code. + * + * - UIDNA_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points are in the input + * are treated as normal Unicode code points. Note: This option is + * required on toUnicode operation because the draft mandates + * verification of decoded ACE input by applying toASCII and comparing + * its output with source + * + * + * + * - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR + * + * @param parseError Pointer to UParseError struct to receive information on position + * of error if an error is encountered. Can be NULL. + * @param status ICU in/out error code parameter. + * U_INVALID_CHAR_FOUND if src contains + * unmatched single surrogates. + * U_INDEX_OUTOFBOUNDS_ERROR if src contains + * too many code points. + * U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough + * @return Number of Unicode characters converted. + * @draft ICU 2.6 + */ +U_CAPI int32_t U_EXPORT2 +uidna_toUnicode(const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + int32_t options, + UParseError* parseError, + UErrorCode* status); + + +/** + * Convenience function that implements the IDNToASCII operation as defined in the IDNA draft. + * This operation is done on complete domain names, e.g: "www.example.com". + * It is important to note that this operation can fail. If it fails, then the input + * domain name cannot be used as an Internationalized Domain Name and the application + * should have methods defined to deal with the failure. 
+ * + * Note: IDNA draft specifies that a conformant application should divide a domain name + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. This function does not offer that level of granularity. The options once + * set will apply to all labels in the domain name + * + * @param src Input UChar array containing IDN in Unicode. + * @param srcLength Number of UChars in src, or -1 if NUL-terminated. + * @param dest Output UChar array with ASCII (ACE encoded) IDN. + * @param destCapacity Size of dest. + * @param options A bit set of options: + * + * - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * U_UNASSIGNED_CODE_POINT_FOUND error code. + * + * - UIDNA_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points are in the input + * are treated as normal Unicode code points. + * + * - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR + * + * @param parseError Pointer to UParseError struct to receive information on position + * of error if an error is encountered. Can be NULL. + * @param status ICU in/out error code parameter. + * U_INVALID_CHAR_FOUND if src contains + * unmatched single surrogates. + * U_INDEX_OUTOFBOUNDS_ERROR if src contains + * too many code points. + * U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough + * @return Number of ASCII characters converted. 
+ * @draft ICU 2.6 + */ +U_CAPI int32_t U_EXPORT2 +uidna_IDNToASCII( const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + int32_t options, + UParseError* parseError, + UErrorCode* status); + +/** + * Convenience function that implements the IDNToUnicode operation as defined in the IDNA draft. + * This operation is done on complete domain names, e.g: "www.example.com". + * + * Note: IDNA draft specifies that a conformant application should divide a domain name + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. This function does not offer that level of granularity. The options once + * set will apply to all labels in the domain name. + * + * @param src Input UChar array containing IDN in ASCII (ACE encoded) form. + * @param srcLength Number of UChars in src, or -1 if NUL-terminated. + * @param dest Output UChar array containing Unicode equivalent of source IDN. + * @param destCapacity Size of dest. + * @param options A bit set of options: + * + * - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR error code. + * + * - UIDNA_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points in the input + * are treated as normal Unicode code points. + * + * - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR + * + * @param parseError Pointer to UParseError struct to receive information on position + * of error if an error is encountered. Can be NULL. + * @param status ICU in/out error code parameter. + * U_INVALID_CHAR_FOUND if src contains + * unmatched single surrogates. 
+ * U_INDEX_OUTOFBOUNDS_ERROR if src contains + * too many code points. + * U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough + * @return Number of Unicode characters converted. + * @draft ICU 2.6 + */ +U_CAPI int32_t U_EXPORT2 +uidna_IDNToUnicode( const UChar* src, int32_t srcLength, + UChar* dest, int32_t destCapacity, + int32_t options, + UParseError* parseError, + UErrorCode* status); + +/** + * Compare two IDN strings for equivalence. + * This function splits the domain names into labels and compares them. + * According to IDN draft, whenever two labels are compared, they are + * considered equal if and only if their ASCII forms (obtained by + * applying toASCII) match using a case-insensitive ASCII comparison. + * Two domain names are considered a match if and only if all labels + * match regardless of whether label separators match. + * + * @param s1 First source string. + * @param length1 Length of first source string, or -1 if NUL-terminated. + * + * @param s2 Second source string. + * @param length2 Length of second source string, or -1 if NUL-terminated. + * @param options A bit set of options: + * + * - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR error code. + * + * - UIDNA_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points in the input + * are treated as normal Unicode code points. + * + * - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR + * + * @param status ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. 
+ * @return <0 or 0 or >0 as usual for string comparisons + * @draft ICU 2.6 + */ +U_CAPI int32_t U_EXPORT2 +uidna_compare( const UChar *s1, int32_t length1, + const UChar *s2, int32_t length2, + int32_t options, + UErrorCode* status); + +#endif diff --git a/icu4c/source/common/unicode/utypes.h b/icu4c/source/common/unicode/utypes.h index 1da09f6adbe..4ae7a912ef1 100644 --- a/icu4c/source/common/unicode/utypes.h +++ b/icu4c/source/common/unicode/utypes.h @@ -631,8 +631,21 @@ typedef enum UErrorCode { U_REGEX_INVALID_BACK_REF, /**< Back-reference to a non-existent capture group. */ U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */ U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */ + + /* + * The error codes in the range 0x10400-0x104ff are reserved for IDNA related error codes + */ + U_IDNA_ERROR_START=0x10400, + U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR, + U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR, + U_IDNA_CHECK_BIDI_ERROR, + U_IDNA_STD3_ASCII_RULES_ERROR, + U_IDNA_ACE_PREFIX_ERROR, + U_IDNA_VERIFICATION_ERROR, + U_IDNA_LABEL_TOO_LONG_ERROR, + U_IDNA_ERROR_LIMIT, - U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */ + U_ERROR_LIMIT=U_IDNA_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */ } UErrorCode; /* Use the following to determine if an UErrorCode represents */