ICU-2194 IDNA API

X-SVN-Rev: 11194
This commit is contained in:
Ram Viswanadha 2003-02-28 21:35:25 +00:00
parent 78f36c9a5a
commit 7da935c904
13 changed files with 2877 additions and 2 deletions

View file

@ -66,7 +66,8 @@ brkiter.o brkdict.o ubrk.o dbbi.o dbbi_tbl.o \
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
utrie.o uset.o cmemory.o caniter.o \
unifilt.o unifunct.o uniset.o usetiter.o util.o uenum.o \
icuserv.o iculserv.o icunotif.o ustrenum.o
icuserv.o iculserv.o icunotif.o ustrenum.o \
uidna.o strprep.o nameprep.o punycode.o
STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))

View file

@ -3244,6 +3244,65 @@ InputPath=.\unicode\utf_old.h
!ENDIF
# End Source File
# End Group
# Begin Group "idna"
# PROP Default_Filter "*.c,*.h"
# Begin Source File
SOURCE=.\nameprep.cpp
# End Source File
# Begin Source File
SOURCE=.\nameprep.h
# End Source File
# Begin Source File
SOURCE=.\punycode.c
# End Source File
# Begin Source File
SOURCE=.\punycode.h
# End Source File
# Begin Source File
SOURCE=.\sprpimpl.h
# End Source File
# Begin Source File
SOURCE=.\strprep.cpp
# End Source File
# Begin Source File
SOURCE=.\strprep.h
# End Source File
# Begin Source File
SOURCE=.\uidna.cpp
# End Source File
# Begin Source File
SOURCE=.\unicode\uidna.h
!IF "$(CFG)" == "common - Win32 Release"
!ELSEIF "$(CFG)" == "common - Win32 Debug"
# Begin Custom Build
InputPath=.\unicode\uidna.h
"..\..\include\unicode\uidna.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy $(InputPath) ..\..\include\unicode
# End Custom Build
!ELSEIF "$(CFG)" == "common - Win64 Release"
!ELSEIF "$(CFG)" == "common - Win64 Debug"
!ENDIF
# End Source File
# End Group
# End Target

View file

@ -0,0 +1,41 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: nameprep.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#include "nameprep.h"
// *****************************************************************************
// class NamePrep
// *****************************************************************************
// U+0020 SPACE: the single code point that NamePrep::isNotProhibited() answers TRUE for.
static const UChar ASCII_SPACE = 0x0020;
U_NAMESPACE_BEGIN
// Storage for the class-ID marker; getStaticClassID() hands out its address.
const char NamePrep::fgClassID=0;
// default constructor
// Sets up the Nameprep profile by switching on both the NFKC step (doNFKC)
// and the bidi check (bidiCheck). The status argument is neither read nor
// written by this constructor.
NamePrep::NamePrep(UErrorCode& status){
    doNFKC    = TRUE;
    bidiCheck = TRUE;
}
// Profile-specific exemption hook: answers TRUE only for U+0020 SPACE and
// FALSE for every other code point.
UBool NamePrep::isNotProhibited(UChar32 ch){
    if(ch == ASCII_SPACE){
        return TRUE;
    }
    return FALSE;
}
U_NAMESPACE_END

View file

@ -0,0 +1,97 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: nameprep.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#ifndef NAMEPREP_H
#define NAMEPREP_H
#include "unicode/utypes.h"
#include "strprep.h"
#include "unicode/uniset.h"
U_NAMESPACE_BEGIN
/*
A profile of stringprep MUST include all of the following:
- The intended applicability of the profile
- The character repertoire that is the input and output to stringprep
(which is Unicode 3.2 for this version of stringprep)
- The mapping tables from this document used (as described in section
3)
- Any additional mapping tables specific to the profile
- The Unicode normalization used, if any (as described in section 4)
- The tables from this document of characters that are prohibited as
output (as described in section 5)
- The bidirectional string testing used, if any (as described in
section 6)
- Any additional characters that are prohibited as output specific to
the profile
*/
/**
 * StringPrep subclass implementing the Nameprep profile.
 * The constructor and isNotProhibited() are implemented in nameprep.cpp.
 */
class NamePrep: public StringPrep {
public :
    /**
     * Constructs a Nameprep profile object.
     * @param status ICU error code parameter.
     */
    NamePrep(UErrorCode& status);

    virtual inline ~NamePrep(){}

    /**
     * Profile-specific exemption from the prohibited-character check.
     * Not declared inline: the definition lives in nameprep.cpp, and an
     * inline function must be defined in every translation unit that uses it.
     *
     * @param ch code point to test
     * @return TRUE if ch must not be treated as prohibited
     */
    virtual UBool isNotProhibited(UChar32 ch);

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.6
     */
    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.6
     */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }

private:
    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;
};
U_NAMESPACE_END
#endif
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/

View file

@ -0,0 +1,563 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: punycode.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jan31
* created by: Markus W. Scherer
*/
/* This ICU code derived from: */
/*
punycode.c 0.4.0 (2001-Nov-17-Sat)
http://www.cs.berkeley.edu/~amc/idn/
Adam M. Costello
http://www.nicemice.net/amc/
*/
/*
* ICU modifications:
* - ICU data types and coding conventions
* - ICU string buffer handling with implicit source lengths
* and destination preflighting
* - UTF-16 handling
*/
#include "unicode/utypes.h"
#include "ustr_imp.h"
#include "cstring.h"
#include "cmemory.h"
#include "punycode.h"
#include "unicode/ustring.h"
/* Punycode ----------------------------------------------------------------- */
/* Punycode parameters for Bootstring */
#define BASE 36
#define TMIN 1
#define TMAX 26
#define SKEW 38
#define DAMP 700
#define INITIAL_BIAS 72
#define INITIAL_N 0x80
/* "Basic" Unicode/ASCII code points */
#define _HYPHEN 0X2d
#define DELIMITER _HYPHEN
#define _ZERO 0X30
#define _NINE 0x39
#define _SMALL_A 0X61
#define _SMALL_Z 0X7a
#define _CAPITAL_A 0X41
#define _CAPITAL_Z 0X5a
#define IS_BASIC(c) ((c)<0x80)
#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
/**
 * digitToBasic() converts a digit value into the basic code point that
 * represents it: 0..25 become a letter, 26..35 become '0'..'9'.
 * The caller must pass a digit in the range 0 to BASE-1.
 *
 * @param digit     digit value to convert
 * @param uppercase if nonzero, letters are produced as 'A'..'Z',
 *                  otherwise as 'a'..'z'; ignored for 26..35
 * @return the ASCII character for the digit
 */
U_INLINE char
digitToBasic(int32_t digit, UBool uppercase) {
    if(digit>=26) {
        /* 26..35 map to ASCII 0..9 */
        return (char)((_ZERO-26)+digit);
    }
    /* 0..25 map to ASCII a..z or A..Z */
    return (char)((uppercase ? _CAPITAL_A : _SMALL_A)+digit);
}
/**
 * basicToDigit[] contains the numeric value of a basic code
 * point (for use in representing integers) in the range 0 to
 * BASE-1, or -1 if the code point does not represent a value.
 *
 * 'A'..'Z' and 'a'..'z' both map to 0..25, '0'..'9' map to 26..35.
 * The table is never modified, so it is declared const.
 */
static const int8_t
basicToDigit[256]={
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
/**
 * Case-maps one ASCII character.
 *
 * @param b         the character to map
 * @param uppercase nonzero: 'a'..'z' are turned into 'A'..'Z';
 *                  zero: 'A'..'Z' are turned into 'a'..'z'
 * @return the mapped character; anything outside the affected letter
 *         range is returned unchanged
 */
U_INLINE char
asciiCaseMap(char b, UBool uppercase) {
    if(uppercase && _SMALL_A<=b && b<=_SMALL_Z) {
        b-=(_SMALL_A-_CAPITAL_A);
        return b;
    }
    if(!uppercase && _CAPITAL_A<=b && b<=_CAPITAL_Z) {
        b+=(_SMALL_A-_CAPITAL_A);
        return b;
    }
    return b;
}
/* Punycode-specific Bootstring code ---------------------------------------- */
/*
* The following code omits the {parts} of the pseudo-algorithm in the spec
* that are not used with the Punycode parameter set.
*/
/* Bias adaptation function. */
/**
 * Computes the next bias from the delta that was just coded.
 *
 * @param delta     the delta just encoded or decoded
 * @param length    number of code points handled so far, including the
 *                  current one; used as a divisor, so it must not be 0
 * @param firstTime nonzero for the first delta, which is scaled down by
 *                  DAMP instead of by 2
 * @return the new bias
 */
static int32_t
adaptBias(int32_t delta, int32_t length, UBool firstTime) {
    int32_t count=0;

    delta/=(firstTime ? DAMP : 2);
    delta+=delta/length;

    /* divide delta down until it falls below the threshold, counting BASE per step */
    while(delta>((BASE-TMIN)*TMAX)/2) {
        delta/=(BASE-TMIN);
        count+=BASE;
    }

    return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
}
#define MAX_CP_COUNT 200
/*
 * Converts a UTF-16 string to Punycode.
 *
 * Basic (ASCII) code points are copied to dest first, followed by a
 * delimiter if there were any; the remaining code points are then encoded
 * as generalized variable-length integers.
 *
 * destLength is always advanced, whether or not a unit fits into dest, so
 * that the returned length is the full required length (preflighting).
 *
 * src          input string
 * srcLength    number of UChars in src, or -1 if NUL-terminated
 * dest         output buffer, may be NULL if destCapacity==0
 * destCapacity capacity of dest in UChars
 * caseFlags    optional per-input-UChar case flags, may be NULL
 * pErrorCode   in/out error code:
 *              U_ILLEGAL_ARGUMENT_ERROR  for bad arguments
 *              U_INDEX_OUTOFBOUNDS_ERROR for more than MAX_CP_COUNT code points
 *              U_INVALID_CHAR_FOUND      for an unpaired surrogate
 *              U_INTERNAL_PROGRAM_ERROR  if delta would overflow
 * returns      the output length as computed by u_terminateUChars(),
 *              or 0 on error
 */
U_CFUNC int32_t
u_strToPunycode(const UChar *src, int32_t srcLength,
                UChar *dest, int32_t destCapacity,
                const UBool *caseFlags,
                UErrorCode *pErrorCode) {

    int32_t cpBuffer[MAX_CP_COUNT];
    int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    UChar c, c2;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /*
     * Handle the basic code points and
     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
     */
    srcCPCount=destLength=0;
    if(srcLength==-1) {
        /* NUL-terminated input */
        for(j=0; /* no condition */; ++j) {
            if((c=src[j])==0) {
                break;
            }
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            if(IS_BASIC(c)) {
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(UTF_IS_SINGLE(c)) {
                    n|=c;
                } else if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(c2=src[j+1])) {
                    ++j;
                    n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    } else {
        /* length-specified input */
        for(j=0; j<srcLength; ++j) {
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            c=src[j];
            if(IS_BASIC(c)) {
                /*
                 * Record the code point regardless of the output capacity;
                 * otherwise srcCPCount and basicLength disagree whenever
                 * dest is too small, and preflighting yields a wrong length.
                 */
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(UTF_IS_SINGLE(c)) {
                    n|=c;
                } else if(UTF_IS_LEAD(c) && (j+1)<srcLength && UTF_IS_TRAIL(c2=src[j+1])) {
                    ++j;
                    n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    }

    /* Finish the basic string - if it is not empty - with a delimiter. */
    basicLength=destLength;
    if(basicLength>0) {
        if(destLength<destCapacity) {
            dest[destLength]=DELIMITER;
        }
        ++destLength;
    }

    /*
     * handledCPCount is the number of code points that have been handled
     * basicLength is the number of basic code points
     * destLength is the number of chars that have been output
     */

    /* Initialize the state: */
    n=INITIAL_N;
    delta=0;
    bias=INITIAL_BIAS;

    /* Main encoding loop: */
    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
        /*
         * All non-basic code points < n have been handled already.
         * Find the next larger one:
         */
        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(n<=q && q<m) {
                m=q;
            }
        }

        /*
         * Increase delta enough to advance the decoder's
         * <n,i> state to <m,0>, but guard against overflow:
         */
        if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
            return 0;
        }
        delta+=(m-n)*(handledCPCount+1);
        n=m;

        /* Encode a sequence of same code points n */
        for(j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(q<n) {
                ++delta;
            } else if(q==n) {
                /* Represent delta as a generalized variable-length integer: */
                for(q=delta, k=BASE; /* no condition */; k+=BASE) {
                    /* threshold form per draft-ietf-idn-punycode-03.txt */
                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(k>=(bias+TMAX)) {
                        t=TMAX;
                    }

                    if(q<t) {
                        break;
                    }

                    /* count the digit even if it does not fit (preflighting) */
                    if(destLength<destCapacity) {
                        dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
                    }
                    ++destLength;
                    q=(q-t)/(BASE-t);
                }

                /* final digit carries the case flag; counted even if it does not fit */
                if(destLength<destCapacity) {
                    dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
                }
                ++destLength;
                bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
                delta=0;
                ++handledCPCount;
            }
        }

        ++delta;
        ++n;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
/*
 * Converts a Punycode string back to UTF-16.
 *
 * The code points before the last delimiter are copied verbatim; the rest
 * of the input is decoded as a sequence of generalized variable-length
 * integers, each of which inserts one code point into the output.
 *
 * destLength is always advanced, whether or not a code point fits into
 * dest, so that the returned length is the full required length.
 *
 * src          input Punycode string
 * srcLength    number of UChars in src, or -1 if NUL-terminated
 * dest         output buffer, may be NULL if destCapacity==0
 * destCapacity capacity of dest (and of caseFlags, if not NULL)
 * caseFlags    optional output array of case flags, may be NULL
 * pErrorCode   in/out error code:
 *              U_ILLEGAL_ARGUMENT_ERROR for bad arguments
 *              U_INVALID_CHAR_FOUND     for a non-basic character before the
 *                                       last delimiter or a non-digit after it
 *              U_ILLEGAL_CHAR_FOUND     for a truncated or overflowing delta
 *                                       or an invalid resulting code point
 * returns      the output length as computed by u_terminateUChars(),
 *              or 0 on error
 */
U_CFUNC int32_t
u_strFromPunycode(const UChar *src, int32_t srcLength,
                  UChar *dest, int32_t destCapacity,
                  UBool *caseFlags,
                  UErrorCode *pErrorCode) {
    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
            destCPCount, firstSupplementaryIndex, cpLength;
    UChar b;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(srcLength==-1) {
        srcLength=u_strlen(src);
    }

    /*
     * Handle the basic code points:
     * Let basicLength be the number of input code points
     * before the last delimiter, or 0 if there is none,
     * then copy the first basicLength code points to the output.
     *
     * The two following loops iterate backward.
     */
    for(j=srcLength; j>0;) {
        if(src[--j]==DELIMITER) {
            break;
        }
    }
    destLength=basicLength=destCPCount=j;

    while(j>0) {
        b=src[--j];
        if(!IS_BASIC(b)) {
            *pErrorCode=U_INVALID_CHAR_FOUND;
            return 0;
        }

        if(j<destCapacity) {
            dest[j]=(UChar)b;

            if(caseFlags!=NULL) {
                caseFlags[j]=IS_BASIC_UPPERCASE(b);
            }
        }
    }

    /* Initialize the state: */
    n=INITIAL_N;
    i=0;
    bias=INITIAL_BIAS;
    firstSupplementaryIndex=1000000000;

    /*
     * Main decoding loop:
     * Start just after the last delimiter if any
     * basic code points were copied; start at the beginning otherwise.
     */
    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
        /*
         * in is the index of the next character to be consumed, and
         * destCPCount is the number of code points in the output array.
         *
         * Decode a generalized variable-length integer into delta,
         * which gets added to i.  The overflow checking is easier
         * if we increase i as we go, then subtract off its starting
         * value at the end to obtain delta.
         */
        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
            if(in>=srcLength) {
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            /*
             * Look up the digit only for basic code points: truncating a
             * 16-bit UChar to 8 bits would let e.g. U+0161 pass as 'a'.
             */
            b=src[in++];
            digit=IS_BASIC(b) ? basicToDigit[b] : -1;
            if(digit<0) {
                *pErrorCode=U_INVALID_CHAR_FOUND;
                return 0;
            }
            if(digit>(0x7fffffff-i)/w) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            i+=digit*w;
            /* threshold form per draft-ietf-idn-punycode-03.txt */
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(k>=(bias+TMAX)) {
                t=TMAX;
            }
            if(digit<t) {
                break;
            }

            if(w>0x7fffffff/(BASE-t)) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }
            w*=BASE-t;
        }

        /*
         * Modification from sample code:
         * Increments destCPCount here,
         * where needed instead of in for() loop tail.
         */
        ++destCPCount;
        bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));

        /*
         * i was supposed to wrap around from (incremented) destCPCount to 0,
         * incrementing n each time, so we'll fix that now:
         */
        if(i/destCPCount>(0x7fffffff-n)) {
            /* integer overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        n+=i/destCPCount;
        i%=destCPCount;
        /* not needed for Punycode: */
        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

        if(n>0x10ffff || UTF_IS_SURROGATE(n)) {
            /* Unicode code point overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        /* Insert n at position i of the output: */
        cpLength=UTF_CHAR_LENGTH(n);
        /*
         * "<=": a code point that exactly fills the buffer is still written;
         * only the terminating NUL is then omitted by u_terminateUChars().
         */
        if((destLength+cpLength)<=destCapacity) {
            int32_t codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string.
             * This is handled with the same efficiency as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled
             * more slowly - but not too bad since this is an insertion anyway.
             */
            if(i<=firstSupplementaryIndex) {
                codeUnitIndex=i;
                if(cpLength>1) {
                    firstSupplementaryIndex=codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                codeUnitIndex=firstSupplementaryIndex;
                UTF_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
            }

            /* use the UChar index codeUnitIndex instead of the code point index i */
            if(codeUnitIndex<destLength) {
                uprv_memmove(dest+codeUnitIndex+cpLength,
                             dest+codeUnitIndex,
                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
                if(caseFlags!=NULL) {
                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
                                 caseFlags+codeUnitIndex,
                                 destLength-codeUnitIndex);
                }
            }
            if(cpLength==1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex]=(UChar)n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex]=UTF16_LEAD(n);
                dest[codeUnitIndex+1]=UTF16_TRAIL(n);
            }
            if(caseFlags!=NULL) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
                if(cpLength==2) {
                    caseFlags[codeUnitIndex+1]=FALSE;
                }
            }
        }
        destLength+=cpLength;
        ++i;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */

View file

@ -0,0 +1,115 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: punycode.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jan31
* created by: Markus W. Scherer
*/
/* This ICU code derived from: */
/*
punycode.c 0.4.0 (2001-Nov-17-Sat)
http://www.cs.berkeley.edu/~amc/idn/
Adam M. Costello
http://www.nicemice.net/amc/
*/
#ifndef __PUNYCODE_H__
#define __PUNYCODE_H__
#include "unicode/utypes.h"
/**
* u_strToPunycode() converts Unicode to Punycode.
*
* The input string must not contain single, unpaired surrogates.
* The output will be represented as an array of ASCII code points.
*
* The output string is NUL-terminated according to normal ICU
* string output rules.
*
* @param src Input Unicode string.
* This function handles a limited amount of code points
* (the limit is >=64).
* U_INDEX_OUTOFBOUNDS_ERROR is set if the limit is exceeded.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output Punycode array.
* @param destCapacity Size of dest.
* @param caseFlags Vector of boolean values, one per input UChar,
* indicating that the corresponding character is to be
* marked for the decoder optionally
* uppercasing (TRUE) or lowercasing (FALSE)
* the character.
* ASCII characters are output directly in the case as marked.
* Flags corresponding to trail surrogates are ignored.
* If caseFlags==NULL then input characters are not
* case-mapped.
* @param pErrorCode ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* @return Number of ASCII characters in puny.
*
* @see u_strFromPunycode
*/
U_CFUNC int32_t
u_strToPunycode(const UChar *src, int32_t srcLength,
UChar *dest, int32_t destCapacity,
const UBool *caseFlags,
UErrorCode *pErrorCode);
/**
* u_strFromPunycode() converts Punycode to Unicode.
* The Unicode string will be at most as long (in UChars)
* as the Punycode string (in chars).
*
* @param src Input Punycode string.
* @param srcLength Length of puny, or -1 if NUL-terminated
* @param dest Output Unicode string buffer.
* @param destCapacity Size of dest in number of UChars,
* and of caseFlags in numbers of UBools.
* @param caseFlags Output array for case flags as
* defined by the Punycode string.
* The caller should uppercase (TRUE) or lowercase (FALSE)
* the corresponding character in dest.
* For supplementary characters, only the lead surrogate
* is marked, and FALSE is stored for the trail surrogate.
* This is redundant and not necessary for ASCII characters
* because they are already in the case indicated.
* Can be NULL if the case flags are not needed.
* @param pErrorCode ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if a non-ASCII character
* precedes the last delimiter ('-'),
* or if an invalid character (not a-zA-Z0-9) is found
* after the last delimiter.
* U_ILLEGAL_CHAR_FOUND if the delta sequence is ill-formed.
* @return Number of UChars written to dest.
*
* @see u_strToPunycode
*/
U_CFUNC int32_t
u_strFromPunycode(const UChar *src, int32_t srcLength,
UChar *dest, int32_t destCapacity,
UBool *caseFlags,
UErrorCode *pErrorCode);
#endif
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/

View file

@ -1761,6 +1761,18 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_INVALID_FLAG"
};
/*
 * Names of the IDNA error codes. u_errorName() indexes this table with
 * (code - U_IDNA_ERROR_START).
 * NOTE(review): the entries presumably have to stay in the same order as the
 * U_IDNA_* enum constants, one per value below U_IDNA_ERROR_LIMIT - confirm
 * against the UErrorCode declaration.
 */
static const char * const
_uIDNAErrorName[U_IDNA_ERROR_LIMIT - U_IDNA_ERROR_START] = {
"U_IDNA_ERROR_START",
"U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR",
"U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR",
"U_IDNA_CHECK_BIDI_ERROR",
"U_IDNA_STD3_ASCII_RULES_ERROR",
"U_IDNA_ACE_PREFIX_ERROR",
"U_IDNA_VERIFICATION_ERROR",
"U_IDNA_LABEL_TOO_LONG_ERROR"
};
U_CAPI const char * U_EXPORT2
u_errorName(UErrorCode code) {
if(U_ZERO_ERROR <= code && code < U_STANDARD_ERROR_LIMIT) {
@ -1775,6 +1787,8 @@ u_errorName(UErrorCode code) {
return _uBrkErrorName[code - U_BRK_ERROR_START];
} else if (U_REGEX_ERROR_START <= code && code < U_REGEX_ERROR_LIMIT) {
return _uRegexErrorName[code - U_REGEX_ERROR_START];
} else if(U_IDNA_ERROR_START <= code && code < U_IDNA_ERROR_LIMIT) {
    /*
     * The limit is exclusive: _uIDNAErrorName has exactly
     * (U_IDNA_ERROR_LIMIT - U_IDNA_ERROR_START) entries, so accepting
     * code == U_IDNA_ERROR_LIMIT would read one element past its end.
     */
    return _uIDNAErrorName[code - U_IDNA_ERROR_START];
} else {
return "[BOGUS UErrorCode]";
}

View file

@ -0,0 +1,65 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: sprpimpl.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#ifndef SPRPIMPL_H
#define SPRPIMPL_H
/*
 * Classification values that the string preparation code compares against
 * the flag field of a trie value.
 * No comma follows the last enumerator: C89 and C++98 compilers reject one.
 */
enum{
    UIDNA_NO_VALUE          = 0x0000 ,
    UIDNA_UNASSIGNED        = 0x0001 ,
    UIDNA_PROHIBITED        = 0x0002 ,
    UIDNA_MAP_NFKC          = 0x0003 ,
    UIDNA_LABEL_SEPARATOR   = 0x0004
};
/*
 * Marker value of the length field of a trie value: when the length field
 * equals this, the actual mapping length is read from the mapping table.
 */
enum{
    _IDNA_LENGTH_IN_MAPPING_TABLE = 3 /* binary 11 */
};
/* indexes[] value names */
/* Positions inside the indexes[] array at the start of the data. */
enum {
    _IDNA_INDEX_TRIE_SIZE         = 0, /* number of bytes in normalization trie */
    _IDNA_INDEX_MAPPING_DATA_SIZE = 1, /* The array that contains the mapping */
    _IDNA_INDEX_TOP               = 3  /* changing this requires a new formatVersion */
};
/*
 * _IDNA_MAPPING_DATA_SIZE: size constant for the mapping data.
 * _IDNA_MAP_TO_NOTHING:    index value that StringPrep::map() tests for
 *                          when NFKC mapping is switched off.
 */
enum {
    _IDNA_MAPPING_DATA_SIZE = 1700,
    _IDNA_MAP_TO_NOTHING    = 0x0FFF
};
U_CFUNC UBool U_EXPORT2
ustrprep_cleanup();
/* error codes for prototyping
#define U_IDNA_ERROR_START U_ERROR_LIMIT
#define U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 1))
#define U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 2))
#define U_IDNA_CHECK_BIDI_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 3))
#define U_IDNA_STD3_ASCII_RULES_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 4))
#define U_IDNA_ACE_PREFIX_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 5))
#define U_IDNA_VERIFICATION_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 6))
#define U_IDNA_LABEL_TOO_LONG_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 8))
*/
#endif
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/

View file

@ -0,0 +1,530 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: strprep.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#include "strprep.h"
#include "utrie.h"
#include "umutex.h"
#include "cmemory.h"
#include "sprpimpl.h"
#include "nameprep.h"
#include "ustr_imp.h"
#include "unormimp.h"
#include "unicode/unorm.h"
#include "unicode/udata.h"
#include "unicode/ustring.h"
/* Start of the 16-bit mapping table inside the loaded data; set by loadData(). */
static const uint16_t* mappingData = NULL;
/* Copy of the leading int32_t values of the data, addressed with the _IDNA_INDEX_* names. */
static int32_t indexes[_IDNA_INDEX_TOP]={ 0 };
/* Set to TRUE at the end of a successful loadData(); reset by ustrprep_cleanup(). */
static UBool _isDataLoaded = FALSE;
/* Trie unserialized from the data by loadData(). */
static UTrie idnTrie={ 0,0,0,0,0,0,0 };
/* Handle of the opened data; closed and reset by ustrprep_cleanup(). */
static UDataMemory* idnData=NULL;
/* Error code recorded by loadData(); reported by StringPrep::isDataLoaded(). */
static UErrorCode dataErrorCode =U_ZERO_ERROR;
/* file definitions */
static const char* DATA_NAME = "uidna";
static const char* DATA_TYPE = "icu";
/*
 * Releases the loaded data, if any, and resets the load state so that a
 * later loadData() call starts from scratch.
 * Always returns TRUE.
 */
U_CFUNC UBool U_EXPORT2
ustrprep_cleanup() {
    _isDataLoaded = FALSE;
    dataErrorCode = U_ZERO_ERROR;

    if(idnData != NULL) {
        udata_close(idnData);
        idnData = NULL;
    }
    return TRUE;
}
/*
 * Callback handed to udata_openChoice(): decides from the data header
 * whether a data item can be used.
 * The item is accepted only if the header is at least 20 bytes, endianness
 * and charset family match this platform, dataFormat is "IDNA", and the
 * format version is 2 with the trie shift values this code was compiled with.
 */
static UBool U_CALLCONV
isAcceptable(void * /* context */,
             const char * /* type */,
             const char * /* name */,
             const UDataInfo *pInfo) {
    if(pInfo->size<20) {
        return FALSE;
    }
    if(pInfo->isBigEndian!=U_IS_BIG_ENDIAN ||
       pInfo->charsetFamily!=U_CHARSET_FAMILY) {
        return FALSE;
    }
    /* dataFormat="IDNA" 0x49, 0x44, 0x4e, 0x41 */
    if(pInfo->dataFormat[0]!=0x49 ||
       pInfo->dataFormat[1]!=0x44 ||
       pInfo->dataFormat[2]!=0x4e ||
       pInfo->dataFormat[3]!=0x41) {
        return FALSE;
    }
    if(pInfo->formatVersion[0]!=2 ||
       pInfo->formatVersion[2]!=UTRIE_SHIFT ||
       pInfo->formatVersion[3]!=UTRIE_INDEX_SHIFT) {
        return FALSE;
    }
    return TRUE;
}
/*
 * Folding-offset callback installed on the trie by loadData().
 * If bit 15 of data is set, the low 15 bits are returned;
 * otherwise the result is 0.
 */
static int32_t U_CALLCONV
getFoldingOffset(uint32_t data) {
    return (data&0x8000) ? (int32_t)(data&0x7fff) : 0;
}
/*
 * Loads the IDNA data on first use and initializes the file-level state
 * (idnData, indexes, idnTrie, mappingData, _isDataLoaded).
 *
 * @param errorCode in/out ICU error code; nothing is loaded if it already
 *                  indicates a failure. Failures of opening the data or
 *                  unserializing the trie are also stored in dataErrorCode.
 * @return TRUE if the data is available, FALSE otherwise
 */
static UBool U_CALLCONV
loadData(UErrorCode &errorCode) {
    /* nothing to do if a previous call already succeeded */
    if(_isDataLoaded!=FALSE) {
        return _isDataLoaded;
    }

    /*
     * errorCode is a reference, so there is no null pointer to test for;
     * only the incoming error state matters.
     */
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }

    /* load Unicode IDNA data from file */
    UTrie _idnTrie={ 0,0,0,0,0,0,0 };
    UDataMemory *data;
    const int32_t *p=NULL;
    const uint8_t *pb;

    /* open the data outside the mutex block */
    //TODO: change the path
    data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
    dataErrorCode=errorCode;
    if(U_FAILURE(errorCode)) {
        return _isDataLoaded=FALSE;
    }

    p=(const int32_t *)udata_getMemory(data);
    pb=(const uint8_t *)(p+_IDNA_INDEX_TOP);
    utrie_unserialize(&_idnTrie, pb, p[_IDNA_INDEX_TRIE_SIZE], &errorCode);
    _idnTrie.getFoldingOffset=getFoldingOffset;

    if(U_FAILURE(errorCode)) {
        dataErrorCode=errorCode;
        udata_close(data);
        return _isDataLoaded=FALSE;
    }

    /* in the mutex block, set the data for this process */
    umtx_lock(NULL);
    if(idnData==NULL) {
        idnData=data;
        data=NULL;
        uprv_memcpy(&indexes, p, sizeof(indexes));
        uprv_memcpy(&idnTrie, &_idnTrie, sizeof(UTrie));
    } else {
        /* another caller installed its copy first: use that one */
        p=(const int32_t *)udata_getMemory(idnData);
    }
    umtx_unlock(NULL);

    /* initialize some variables: the mapping table follows the serialized trie */
    mappingData=(const uint16_t *)((const uint8_t *)(p+_IDNA_INDEX_TOP)+indexes[_IDNA_INDEX_TRIE_SIZE]);
    _isDataLoaded = TRUE;

    /* if a different thread set it first, then close the extra data */
    if(data!=NULL) {
        udata_close(data); /* NULL if it was set correctly */
    }

    return _isDataLoaded;
}
static inline
void syntaxError(const UChar* rules,
int32_t pos,
int32_t rulesLen,
UParseError* parseError) {
if(parseError == NULL){
return;
}
if(pos == rulesLen && rulesLen >0){
pos--;
}
parseError->offset = pos;
parseError->line = 0 ; // we are not using line numbers
// for pre-context
int32_t start = (pos <=U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
int32_t stop = pos;
u_memcpy(parseError->preContext,rules+start,stop-start);
//null terminate the buffer
parseError->preContext[stop-start] = 0;
//for post-context
start = pos+1;
stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
rulesLen;
u_memcpy(parseError->postContext,rules+start,stop-start);
//null terminate the buffer
parseError->postContext[stop-start]= 0;
}
// *****************************************************************************
// class StringPrep
// *****************************************************************************
U_NAMESPACE_BEGIN
// Definition of the static class-ID member; initialized to 0.
const char StringPrep::fgClassID=0;
// Makes sure the IDNA data is available.
// Returns FALSE immediately if status already indicates a failure.
// A failure remembered from an earlier load attempt is reported without
// retrying; otherwise loadData() is called. Any load failure is copied
// into status.
UBool StringPrep::isDataLoaded(UErrorCode& status){
    if(U_FAILURE(status)){
        return FALSE;
    }

    const UBool failedEarlier = (UBool)(_isDataLoaded==FALSE && U_FAILURE(dataErrorCode));
    if(!failedEarlier){
        loadData(dataErrorCode);
    }

    if(U_FAILURE(dataErrorCode)){
        status = dataErrorCode;
        return FALSE;
    }
    return TRUE;
}
// Creates a plain StringPrep object.
//
// @param status in/out ICU error code. Set to the data loading error if the
//               IDNA data cannot be loaded, or to U_MEMORY_ALLOCATION_ERROR
//               if the object cannot be allocated.
// @return the new object, owned by the caller, or NULL on failure
StringPrep* StringPrep::createDefaultInstance(UErrorCode& status){
    // check the data first so that nothing is allocated on failure
    if(!isDataLoaded(status)){
        return NULL;
    }
    StringPrep* strprep = new StringPrep();
    if(strprep == NULL){
        // do not hand back NULL together with a success code
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    return strprep;
}
// Creates a NamePrep object and returns it through the StringPrep interface.
//
// @param status in/out ICU error code. Set to the data loading error if the
//               IDNA data cannot be loaded, or to U_MEMORY_ALLOCATION_ERROR
//               if the object cannot be allocated.
// @return the new object, owned by the caller, or NULL on failure
StringPrep* StringPrep::createNameprepInstance(UErrorCode& status){
    // check the data first so that nothing is allocated on failure
    if(!isDataLoaded(status)){
        return NULL;
    }
    StringPrep* strprep = new NamePrep(status);
    if(strprep == NULL){
        // do not hand back NULL together with a success code
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    return strprep;
}
// Base-class version of the exemption hook: never exempts a code point,
// i.e. always returns FALSE regardless of ch.
UBool StringPrep::isNotProhibited(UChar32 ch){
return FALSE;
}
// Looks ch up in the IDNA trie and answers TRUE if the stored value equals
// UIDNA_UNASSIGNED.
// NOTE(review): the complete trie value is compared here, whereas map() only
// compares the low 3 flag bits - confirm that this difference is intended.
UBool StringPrep::isUnassigned(UChar32 ch){
    uint32_t trieValue;
    UTRIE_GET16(&idnTrie,ch,trieValue);
    if(trieValue == UIDNA_UNASSIGNED){
        return TRUE;
    }
    return FALSE;
}
// Splits a trie value into its three fields.
//
// @param result the value read from the trie
// @param flag   receives bits 0..2
// @param length receives bits 3..4
// @param index  receives everything from bit 5 upward
static inline void getValues(uint32_t result, int8_t& flag,
                             int8_t& length, int32_t& index){
    const uint32_t afterFlag = result >> 3;

    /* everything above the flag and length fields is the index */
    index  = (afterFlag >> 2);
    /* 2-bit length field */
    length = (int8_t) (afterFlag & 0x03);
    /* 3-bit flag field */
    flag   = (int8_t) (result & 0x07);
}
// Mapping step: walks src code point by code point and writes either the
// code point itself or its replacement from the mapping table to dest.
// Output that does not fit is still counted, so the result can be used for
// preflighting.
//
// @param src             input text
// @param srcLength       length of src, or -1 if NUL-terminated
// @param dest            output buffer, may be NULL if destCapacity is 0
// @param destCapacity    capacity of dest in UChars
// @param allowUnassigned TRUE: unassigned code points are copied through;
//                        otherwise they cause
//                        U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR
// @param parseError      optional; receives the error context
// @param status          in/out ICU error code
// @return the output length as computed by u_terminateUChars(), 0 on error
int32_t StringPrep::map(const UChar* src, int32_t srcLength,
                        UChar* dest, int32_t destCapacity,
                        UBool allowUnassigned,
                        UParseError* parseError,
                        UErrorCode& status ){
    // check error status
    if(U_FAILURE(status)){
        return 0;
    }
    //check arguments
    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if(srcLength == -1){
        srcLength = u_strlen(src);
    }

    uint32_t trieValue;
    int8_t   flag;
    int8_t   mapLength;
    int32_t  mapIndex;
    int32_t  written = 0;
    int32_t  readPos = 0;

    while(readPos < srcLength){
        UChar32 ch;
        U16_NEXT(src,readPos,srcLength,ch);
        UTRIE_GET16(&idnTrie,ch,trieValue);
        getValues(trieValue,flag,mapLength,mapIndex);

        const UBool unassigned = (UBool)(flag == UIDNA_UNASSIGNED);

        // an unassigned code point is an error unless explicitly allowed
        if(unassigned && allowUnassigned != TRUE){
            syntaxError(src, readPos-1, srcLength,parseError);
            status = U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR;
            return 0;
        }

        // replace the code point with its entry in the mapping table
        if(!unassigned &&
           ((flag == UIDNA_MAP_NFKC && doNFKC == TRUE) ||
            (mapIndex == _IDNA_MAP_TO_NOTHING && doNFKC == FALSE))){
            if(mapLength == _IDNA_LENGTH_IN_MAPPING_TABLE){
                // the real length is stored in front of the mapping
                mapLength = (int8_t) mappingData[mapIndex++];
            }
            for(int8_t i =0; i< mapLength; i++){
                if(written < destCapacity ){
                    dest[written] = mappingData[mapIndex+i];
                }
                written++; /* for pre-flighting */
            }
            continue;
        }

        // everything else is copied through unchanged
        if(ch <= 0xFFFF){
            if(written < destCapacity ){
                dest[written] = (UChar)ch;
            }
            written++;
        }else{
            if(written+1 < destCapacity ){
                dest[written]   = U16_LEAD(ch);
                dest[written+1] = U16_TRAIL(ch);
            }
            written +=2;
        }
    }

    return u_terminateUChars(dest, destCapacity, written, &status);
}
// Normalization step: forwards to unorm_normalize() with the UNORM_NFKC mode
// and the UNORM_UNICODE_3_2 option, and returns its result unchanged.
int32_t StringPrep::normalize(  const UChar* src, int32_t srcLength,
                                UChar* dest, int32_t destCapacity,
                                UErrorCode& status ){
    const int32_t normalizedLength =
        unorm_normalize(src,srcLength,UNORM_NFKC,UNORM_UNICODE_3_2,dest,destCapacity,&status);
    return normalizedLength;
}
/*
1) Map -- For each character in the input, check if it has a mapping
and, if so, replace it with its mapping.
2) Normalize -- Possibly normalize the result of step 1 using Unicode
normalization.
3) Prohibit -- Check for any characters that are not allowed in the
output. If any are found, return an error.
4) Check bidi -- Possibly check for right-to-left characters, and if
any are found, make sure that the whole string satisfies the
requirements for bidirectional strings. If the string does not
satisfy the requirements for bidirectional strings, return an
error.
[Unicode3.2] defines several bidirectional categories; each character
has one bidirectional category assigned to it. For the purposes of
the requirements below, an "RandALCat character" is a character that
has Unicode bidirectional categories "R" or "AL"; an "LCat character"
is a character that has Unicode bidirectional category "L". Note
that there are many characters which fall in neither of the above
definitions; Latin digits (<U+0030> through <U+0039>) are examples of
this because they have bidirectional category "EN".
In any profile that specifies bidirectional character handling, all
three of the following requirements MUST be met:
1) The characters in section 5.8 MUST be prohibited.
2) If a string contains any RandALCat character, the string MUST NOT
contain any LCat character.
3) If a string contains any RandALCat character, a RandALCat
character MUST be the first character of the string, and a
RandALCat character MUST be the last character of the string.
*/
#define MAX_STACK_BUFFER_SIZE 300
/**
 * Runs the complete stringprep pipeline on src: map, normalize, check for
 * prohibited code points and check the bidirectional requirements.
 *
 * @param src             text to prepare; must not be NULL
 * @param srcLength       number of UChars in src, or -1
 * @param dest            buffer receiving the prepared text; may be NULL only
 *                        when destCapacity is 0
 * @param destCapacity    capacity of dest in UChars
 * @param allowUnassigned passed through to map()
 * @param parseError      receives the error position/context; may be NULL
 * @param status          ICU error code in/out parameter. Set to
 *                        U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR or
 *                        U_IDNA_CHECK_BIDI_ERROR when a check fails.
 * @return                length of the prepared text (as passed through
 *                        u_terminateUChars()); 0 on argument errors
 */
int32_t StringPrep::process(const UChar* src, int32_t srcLength,
                            UChar* dest, int32_t destCapacity,
                            UBool allowUnassigned,
                            UParseError* parseError,
                            UErrorCode& status ){
    // check error status
    if(U_FAILURE(status)){
        return 0;
    }

    //check arguments
    if(src==NULL || srcLength<-1 || destCapacity<0 || (dest==NULL && destCapacity!=0)) {
        status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // All locals are declared (and the lengths initialized) up front so that
    // every "goto CLEANUP" below sees well-defined values.
    UChar b1Stack[MAX_STACK_BUFFER_SIZE], b2Stack[MAX_STACK_BUFFER_SIZE];
    UChar *b1 = b1Stack, *b2 = b2Stack;
    int32_t b1Len = 0, b2Len = 0,
            b1Capacity = MAX_STACK_BUFFER_SIZE ,
            b2Capacity = MAX_STACK_BUFFER_SIZE;
    uint32_t result;
    int32_t b2Index = 0;
    int8_t flag;
    int8_t length;
    int32_t index;
    UChar32 ch;
    UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT;
    UBool leftToRight=FALSE, rightToLeft=FALSE;
    int32_t rtlPos =-1, ltrPos =-1;

    // step 1: map
    b1Len = map(src,srcLength, b1, b1Capacity,allowUnassigned, parseError, status);

    if(status == U_BUFFER_OVERFLOW_ERROR){
        // the stack buffer is too small: allocate the reported length and redo
        b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
        if(b1==NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        status = U_ZERO_ERROR; // reset error

        b1Len = map(src,srcLength, b1, b1Len,allowUnassigned, parseError, status);
    }
    // do not normalize the output of a failed mapping
    if(U_FAILURE(status)){
        goto CLEANUP;
    }

    // step 2: normalize
    b2Len = normalize(b1,b1Len, b2,b2Capacity,status);

    if(status == U_BUFFER_OVERFLOW_ERROR){
        // the stack buffer is too small: allocate the reported length and redo
        b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
        if(b2==NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        status = U_ZERO_ERROR; // reset error

        // the input of the retry is still the mapped text in b1
        b2Len = normalize(b1,b1Len, b2,b2Len,status);
    }

    if(U_FAILURE(status)){
        goto CLEANUP;
    }

    // steps 3 and 4: walk the normalized text once, rejecting prohibited
    // code points and collecting the directionality information
    for(; b2Index<b2Len;){

        ch = 0;

        U16_NEXT(b2, b2Index, b2Len, ch);

        UTRIE_GET16(&idnTrie,ch,result);

        getValues(result,flag,length,index);

        if(flag == UIDNA_PROHIBITED
            && isNotProhibited(ch) == FALSE){
            status = U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR;
            // b2Index indexes b2, so the context must come from b2 as well
            syntaxError(b2,b2Index-1,b2Len, parseError);
            goto CLEANUP;
        }

        direction = u_charDirection(ch);
        if(firstCharDir == U_CHAR_DIRECTION_COUNT){
            firstCharDir = direction;
        }
        if(direction == U_LEFT_TO_RIGHT){
            leftToRight = TRUE;
            ltrPos = b2Index-1;
        }
        if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){
            rightToLeft = TRUE;
            rtlPos = b2Index-1;
        }
    }

    // satisfy 2: a string with RandALCat characters must not contain LCat ones
    if( leftToRight == TRUE && rightToLeft == TRUE){
        status = U_IDNA_CHECK_BIDI_ERROR;
        syntaxError(b2,(rtlPos>ltrPos) ? rtlPos : ltrPos, b2Len, parseError);
        goto CLEANUP;
    }

    // satisfy 3: if there is any RandALCat character, both the first and the
    // last character must be RandALCat (R or AL); after the loop "direction"
    // holds the direction of the last character
    if( rightToLeft == TRUE &&
        !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) &&
          (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC))
      ){
        status = U_IDNA_CHECK_BIDI_ERROR;
        syntaxError(b2,b2Index-1,b2Len,parseError);
        // leave through CLEANUP so that heap buffers are released
        goto CLEANUP;
    }

    if(b2Len > 0 && b2Len <= destCapacity){
        uprv_memmove(dest,b2, b2Len*U_SIZEOF_UCHAR);
    }

CLEANUP:
    if(b1!=b1Stack){
        uprv_free(b1);
    }
    if(b2!=b2Stack){
        uprv_free(b2);
    }
    return u_terminateUChars(dest, destCapacity, b2Len, &status);
}
/**
 * Tells whether ch is marked as a label separator in the IDNA data trie.
 *
 * @param ch      code point to test
 * @param status  ICU error code in/out parameter; passed to isDataLoaded()
 * @return TRUE if the low three bits of the trie value for ch equal
 *         UIDNA_LABEL_SEPARATOR; FALSE otherwise, including when status
 *         already indicates a failure or isDataLoaded() returns FALSE
 */
UBool StringPrep::isLabelSeparator(UChar32 ch, UErrorCode& status){
    // no lookup on a prior failure; isDataLoaded() is only consulted when
    // status is still good
    if(U_FAILURE(status) || !isDataLoaded(status)){
        return FALSE;
    }

    int32_t trieValue;
    UTRIE_GET16(&idnTrie, ch, trieValue);

    // the type flag lives in the low three bits of the trie value
    return (UBool)((trieValue & 0x07) == UIDNA_LABEL_SEPARATOR);
}
U_NAMESPACE_END

View file

@ -0,0 +1,360 @@
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: strprep.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#ifndef STRPREP_H
#define STRPREP_H
#include "unicode/uobject.h"
#include "unicode/uniset.h"
#include "unicode/parseerr.h"
U_NAMESPACE_BEGIN
/**\file
*
* This API implements the RFC 3454 StringPrep standard.
*
* The steps for preparing strings are:
*
* 1) Map -- For each character in the input, check if it has a mapping
* and, if so, replace it with its mapping.
* <ul>
* <li>Delete certain codepoints from the input because their
* presence or absence in the protocol identifiers should not
* make two strings different</li>
* <li>Case Mappings
* <br>If Normalization is turned off
* <br> Get mappings from case map tables
* <br>else
* <br> Get mappings from case map tables for normalization
* <br> Use u_getFC_NFKC_Closure for obtaining extra mappings
* </li>
* </ul>
* 2) Normalize -- Possibly normalize the result of step 1 using Unicode
* normalization NFKC.
*
* 3) Prohibit -- Check for any characters that are not allowed in the
* output. If any are found, return an error.
*
* 4) Check bidi -- Possibly check for right-to-left characters, and if
* any are found, make sure that the whole string satisfies the
* requirements for bidirectional strings. If the string does not
* satisfy the requirements for bidirectional strings, return an
* error.
*
* Some StringPrep profiles:
* IDN: "Nameprep" http://www.ietf.org/internet-drafts/draft-ietf-idn-nameprep-11.txt
* XMPP Node Identifiers: "Nodeprep" http://www.ietf.org/internet-drafts/draft-ietf-xmpp-nodeprep-01.txt
* XMPP Resource Identifiers: "Resourceprep" http://www.ietf.org/internet-drafts/draft-ietf-xmpp-resourceprep-01.txt
* ANONYMOUS SASL tokens: "plain" http://www.ietf.org/internet-drafts/draft-ietf-sasl-anon-00.txt
* iSCSI http://www.ietf.org/internet-drafts/draft-ietf-ips-iscsi-string-prep-03.txt
*/
class StringPrep : public UObject{
protected:
UVersionInfo unicodeVersion; /** The Character repertoire version of this profile */
UBool bidiCheck; /** Option to turn BiDi checking on */
UBool doNFKC; /** Option to turn NFKC on */
/**
* Protected default constructor for use by subclasses.
* Note that it does not initialize unicodeVersion, bidiCheck or doNFKC.
*/
StringPrep(){};
public:
/**
* Destructor
*/
virtual inline ~StringPrep(){};
/**
* Map every character in the input stream with the mapping character
* in the mapping table and populate the output stream.
* For any individual character the mapping table may specify
* that a character be mapped to nothing, mapped to one
* other character or to a string of other characters.
*
* @param src Pointer to UChar buffer containing a single label
* @param srcLength Number of characters in the source label
* @param dest Pointer to the destination buffer to receive the output
* @param destCapacity The capacity of destination array
* @param allowUnassigned Unassigned values can be converted to ASCII for query operations
* If TRUE unassigned values are treated as normal Unicode code point.
* If FALSE the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
* @param parseError Pointer to UParseError struct to receive information on the
* position of an error, if one is encountered.
* @param status ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return The number of UChars in the destination buffer
*
*/
virtual int32_t map(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
UBool allowUnassigned,
UParseError* parseError,
UErrorCode& status );
/**
* Normalize the input stream using Normalization Form KC (NFKC)
*
* @param src Pointer to UChar buffer containing a single label
* @param srcLength Number of characters in the source label
* @param dest Pointer to the destination buffer to receive the output
* @param destCapacity The capacity of destination array
* @param status ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return The number of UChars in the destination buffer
*
*
*/
virtual int32_t normalize( const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
UErrorCode& status );
/**
* Prepare the input stream for use. This operation maps, normalizes (NFKC),
* and checks for prohibited and BiDi characters in the order defined by RFC 3454
*
* @param src Pointer to UChar buffer containing a single label
* @param srcLength Number of characters in the source label
* @param dest Pointer to the destination buffer to receive the output
* @param destCapacity The capacity of destination array
* @param allowUnassigned Unassigned values can be converted to ASCII for query operations
* If TRUE unassigned values are treated as normal Unicode code point.
* If FALSE the operation fails with U_UNASSIGNED_CODE_POINT error code.
* @param parseError Pointer to UParseError struct to receive information on the
* position of an error, if one is encountered. Can be NULL.
* @param status ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return The number of UChars in the destination buffer
*
*
*/
virtual int32_t process(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
UBool allowUnassigned,
UParseError* parseError,
UErrorCode& status );
/**
* Create a profile from prebuilt default Nameprep profile conforming to
* nameprep internet draft (http://www.ietf.org/html.charters/idn-charter.html).
* This is a built-in/unmodifiable profile.
*
* @param status ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return Pointer to StringPrep object that is created. Should be deleted
* by the caller.
*
*
*/
static StringPrep* createNameprepInstance(UErrorCode& status);
/**
* Create a profile from prebuilt default StringPrep profile conforming to
* RFC 3454 (ftp://ftp.rfc-editor.org/in-notes/rfc3454.txt).
* User defined profiles can be created by getting the default profile and
* adding mappings, removing mappings, turning options ON/OFF and prohibiting
* characters from the output.
*
* @param status ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return Pointer to StringPrep object that is created. Should be deleted by
* the caller.
*
*
*/
static StringPrep* createDefaultInstance(UErrorCode& status);
/**
* Ascertain if the given code point is a Letter/Digit/Hyphen in the ASCII range
*
* @param ch The code point to test
* @return TRUE if the code point is a Letter/Digit/Hyphen
*
*
*/
static inline UBool isLDHChar(UChar32 ch);
/**
* Ascertain if the given code point is a label separator as specified by IDNA
*
* @param ch The code point to test
* @param status ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return TRUE if the code point is a label separator
*
*
*/
virtual UBool isLabelSeparator(UChar32 ch, UErrorCode& status);
/**
* Get the BiDi option of this profile
*
* @return The BiDi checking option
*
*/
inline UBool getCheckBiDi();
/**
* Get the normalization (NFKC) option of this profile
*
* @return The normalization option
*
*
*/
inline UBool getNormalization();
/**
* Get the Unicode version which this profile
* conforms to
*
* @param info Receives a copy of the Unicode version of this profile
*
*/
inline void getUnicodeVersion(UVersionInfo& info);
private:
// Boiler plate
/**
* Copy constructor. Private and only declared here.
*
*/
StringPrep(const StringPrep&);
/**
* Assignment operator. Private and only declared here.
*
*/
StringPrep& operator=(const StringPrep&);
/**
* Equality operator. Private; this implementation does not compare
* anything and always returns FALSE.
*
* @param other the object to be compared with (ignored).
* @return FALSE
*
*/
UBool operator==(const StringPrep& other) const {return FALSE;};
/**
* Inequality operator. Private; returns the negation of operator==.
*
* @param other the object to be compared with.
* @return the negation of operator==(other).
*
*/
UBool operator!=(const StringPrep& other) const { return !operator==(other); }
public:
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
*
*/
static inline UClassID getStaticClassID();
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
*
*/
virtual inline UClassID getDynamicClassID() const;
protected:
/**
* Sub classes that slightly modify the default profile
* implement this method to remove characters from
* the prohibited list. The default implementation does not
* check if the data is loaded or not. The caller is responsible
* for checking for data.
*
*/
virtual UBool isNotProhibited(UChar32 ch);
/**
* Sub classes that slightly modify the default profile
* implement this method to remove characters from
* the unassigned list. The default implementation does not
* check if the data is loaded or not. The caller is responsible
* for checking for data.
*/
virtual UBool isUnassigned(UChar32 ch);
/**
* Ascertains if uidna.icu data file is loaded.
* If data is not loaded, loads the data file.
*
*
*/
static UBool isDataLoaded(UErrorCode& status);
private:
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".
*/
static const char fgClassID;
};
/** Returns the value of the bidiCheck option member of this profile. */
inline UBool StringPrep::getCheckBiDi(){
    const UBool checkBiDi = this->bidiCheck;
    return checkBiDi;
}
/** Returns the value of the doNFKC option member of this profile. */
inline UBool StringPrep::getNormalization(){
    const UBool normalizationOn = this->doNFKC;
    return normalizationOn;
}
/**
 * Copies the unicodeVersion member, element by element, into info.
 *
 * @param info receives the version fields of this profile
 */
inline void StringPrep::getUnicodeVersion(UVersionInfo& info){
    // info is a reference to the array, so sizeof yields the whole array size
    int32_t field = 0;
    while(field < (sizeof(info)/sizeof(info[0]))){
        info[field] = unicodeVersion[field];
        ++field;
    }
}
/** Returns the address of the static fgClassID member as the class ID. */
inline UClassID StringPrep::getStaticClassID() {
    const char* idAddress = &StringPrep::fgClassID;
    return (UClassID)idAddress;
}
/** Returns the class ID of StringPrep, i.e. the result of getStaticClassID(). */
inline UClassID StringPrep::getDynamicClassID() const {
    return StringPrep::getStaticClassID();
}
/**
 * Tests for an ASCII letter, digit or hyphen, i.e. membership in
 * [\u002D \u0030-\u0039 \u0041-\u005A \u0061-\u007A].
 *
 * @param ch code point to test
 * @return TRUE if ch is in the set above, FALSE otherwise
 */
inline UBool StringPrep::isLDHChar(UChar32 ch){
    // quick exit: nothing above 'z' can be a letter, digit or hyphen
    if(ch > 0x007A){
        return FALSE;
    }

    const UBool isHyphen    = (UBool)(ch == 0x002D);
    const UBool isDigit     = (UBool)(0x0030 <= ch && ch <= 0x0039);
    const UBool isUpperCase = (UBool)(0x0041 <= ch && ch <= 0x005A);
    const UBool isLowerCase = (UBool)(0x0061 <= ch && ch <= 0x007A);

    return (UBool)(isHyphen || isDigit || isUpperCase || isLowerCase);
}
U_NAMESPACE_END
#endif
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/

View file

@ -0,0 +1,735 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uidna.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#include "unicode/uidna.h"
#include "unicode/ustring.h"
#include "strprep.h"
#include "punycode.h"
#include "ustr_imp.h"
#include "cmemory.h"
#include "sprpimpl.h"
/* it is official IDNA ACE Prefix is "xn--" */
static const UChar ACE_PREFIX[] ={ 0x0078,0x006E,0x002d,0x002d } ;
#define ACE_PREFIX_LENGTH 4
#define MAX_LABEL_LENGTH 63
#define HYPHEN 0x002D
/* The Max length of the labels should not be more than 64 */
#define MAX_LABEL_BUFFER_SIZE 100
#define MAX_IDN_BUFFER_SIZE 300
#define CAPITAL_A 0x0041
#define CAPITAL_Z 0x005A
#define LOWER_CASE_DELTA 0x0020
#define FULL_STOP 0x002E
/**
 * Checks whether src begins with the ACE prefix, comparing the u_tolower()
 * value of each of the first ACE_PREFIX_LENGTH UChars with ACE_PREFIX.
 *
 * @param src        text to inspect
 * @param srcLength  number of UChars in src; anything shorter than
 *                   ACE_PREFIX_LENGTH yields FALSE
 * @return TRUE if all prefix positions match, FALSE otherwise
 */
inline static UBool
startsWithPrefix(const UChar* src , int32_t srcLength){
    if(srcLength < ACE_PREFIX_LENGTH){
        return FALSE;
    }

    // count the leading positions that agree with the prefix
    int32_t matched = 0;
    while(matched < ACE_PREFIX_LENGTH &&
          u_tolower(src[matched]) == ACE_PREFIX[matched]){
        matched++;
    }
    return (UBool)(matched == ACE_PREFIX_LENGTH);
}
/**
 * Lowercases ch if it lies in CAPITAL_A..CAPITAL_Z by adding
 * LOWER_CASE_DELTA; every other value is returned unchanged.
 */
inline static UChar
toASCIILower(UChar ch){
    const UBool isCapital = (UBool)(CAPITAL_A <= ch && ch <= CAPITAL_Z);
    return (UChar)(isCapital ? ch + LOWER_CASE_DELTA : ch);
}
/**
 * Compares two strings, ignoring the case of ASCII letters A-Z.
 *
 * The common prefix is compared first; if it is equal, the shorter string
 * orders first. This makes the result antisymmetric: swapping the arguments
 * flips the sign (previously strings of different length always compared
 * as "greater", whatever the argument order).
 *
 * @param s1     first string
 * @param s1Len  number of UChars in s1 (must be >= 0)
 * @param s2     second string
 * @param s2Len  number of UChars in s2 (must be >= 0)
 * @return 0 if the strings are equal ignoring ASCII case, a negative value
 *         if s1 orders before s2, a positive value otherwise
 */
inline static int32_t
compareCaseInsensitiveASCII(const UChar* s1, int32_t s1Len,
                            const UChar* s2, int32_t s2Len){
    const int32_t minLength = (s1Len < s2Len) ? s1Len : s2Len;

    for(int32_t i = 0; i < minLength; i++){
        const UChar c1 = s1[i];
        const UChar c2 = s2[i];

        /* Case-insensitive comparison */
        if(c1 != c2){
            const int32_t rc = (int32_t)toASCIILower(c1) - (int32_t)toASCIILower(c2);
            if(rc != 0){
                return rc;
            }
        }
    }

    // the common prefix is equal: the lengths decide
    if(s1Len == s2Len){
        return 0;
    }
    return (s1Len < s2Len) ? -1 : 1;
}
static inline
void syntaxError(const UChar* rules,
int32_t pos,
int32_t rulesLen,
UParseError* parseError) {
if(parseError == NULL){
return;
}
if(pos == rulesLen && rulesLen >0){
pos--;
}
parseError->offset = pos;
parseError->line = 0 ; // we are not using line numbers
// for pre-context
int32_t start = (pos <=U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
int32_t stop = pos;
u_memcpy(parseError->preContext,rules+start,stop-start);
//null terminate the buffer
parseError->preContext[stop-start] = 0;
//for post-context
start = pos+1;
stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
rulesLen;
u_memcpy(parseError->postContext,rules+start,stop-start);
//null terminate the buffer
parseError->postContext[stop-start]= 0;
}
/**
 * ToASCII operation on a single label: nameprep the label, optionally apply
 * the STD3 ASCII rules, and, if the result is not all ASCII, encode it with
 * Punycode and prepend the ACE prefix.
 *
 * @param src           label to convert; must not be NULL
 * @param srcLength     number of UChars in src, or -1
 * @param dest          buffer receiving the result; may be NULL only if
 *                      destCapacity is 0
 * @param destCapacity  capacity of dest in UChars
 * @param options       bit set of UIDNA_ALLOW_UNASSIGNED / UIDNA_USE_STD3_RULES
 * @param parseError    receives the error position/context; may be NULL
 * @param status        ICU error code in/out parameter. Set to
 *                      U_IDNA_STD3_ASCII_RULES_ERROR, U_IDNA_ACE_PREFIX_ERROR,
 *                      U_IDNA_LABEL_TOO_LONG_ERROR or U_BUFFER_OVERFLOW_ERROR
 *                      by this function, in addition to errors from the
 *                      nameprep and Punycode steps.
 * @return              length of the result as passed through
 *                      u_terminateUChars()
 */
U_CAPI int32_t U_EXPORT2
uidna_toASCII(const UChar* src, int32_t srcLength,
              UChar* dest, int32_t destCapacity,
              int32_t options,
              UParseError* parseError,
              UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if((src == NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE];
    //initialize pointers to stack buffers
    UChar  *b1 = b1Stack, *b2 = b2Stack;
    int32_t b1Len = 0, b2Len = 0,
            b1Capacity = MAX_LABEL_BUFFER_SIZE,
            b2Capacity = MAX_LABEL_BUFFER_SIZE ,
            reqLength=0;

    // the source contains all ascii codepoints
    UBool srcIsASCII  = TRUE;
    // assume the source contains all LDH codepoints
    UBool srcIsLDH = TRUE;
    int32_t j=0;

    //get the options
    UBool allowUnassigned   = (UBool)((options & UIDNA_ALLOW_UNASSIGNED) != 0);
    UBool useSTD3ASCIIRules = (UBool)((options & UIDNA_USE_STD3_RULES) != 0);

    // index of the last non-LDH code unit found in b1, -1 if there is none
    int32_t failPos = -1;

    // step 2
    StringPrep* prep = StringPrep::createNameprepInstance(*status);

    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    b1Len = prep->process(src,srcLength,b1, b1Capacity,allowUnassigned, parseError, *status);

    if(*status == U_BUFFER_OVERFLOW_ERROR){
        // redo processing of string
        // we do not have enough room so grow the buffer
        b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
        if(b1==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        *status = U_ZERO_ERROR; // reset error

        b1Len = prep->process(src,srcLength,b1, b1Len,allowUnassigned, parseError, *status);
    }
    // error bail out
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    // step 3 & 4
    for( j=0;j<b1Len;j++){
        if(b1[j] > 0x7F){
            srcIsASCII = FALSE;
        }
        // here we do not assemble surrogates
        // since we know that LDH code points
        // are in the ASCII range only
        if(prep->isLDHChar(b1[j])==FALSE){
            srcIsLDH = FALSE;
            failPos = j;
        }
    }

    if(useSTD3ASCIIRules == TRUE){
        // verify 3a and 3b
        // the hyphen tests look at b1[0] and b1[b1Len-1], so they are only
        // meaningful (and safe) for a non-empty label
        const UBool leadingHyphen  = (UBool)(b1Len > 0 && b1[0] == HYPHEN);
        const UBool trailingHyphen = (UBool)(b1Len > 0 && b1[b1Len-1] == HYPHEN);

        if( srcIsLDH == FALSE /* source contains some non-LDH characters */
            || leadingHyphen || trailingHyphen){
            *status = U_IDNA_STD3_ASCII_RULES_ERROR;

            /* populate the parseError struct */
            if(srcIsLDH==FALSE){
                // failPos is the index of the offending code unit itself
                syntaxError(b1,failPos,b1Len,parseError);
            }else if(leadingHyphen){
                syntaxError(b1,0,b1Len,parseError);
            }else{
                syntaxError(b1,b1Len-1,b1Len,parseError);
            }
            goto CLEANUP;
        }
    }

    if(srcIsASCII){
        reqLength = b1Len;
        if(b1Len <= destCapacity){
            uprv_memmove(dest, b1, b1Len * U_SIZEOF_UCHAR);
        }else{
            // u_terminateUChars() below reports the overflow
            goto CLEANUP;
        }
    }else{
        // step 5 : verify the sequence does not begin with ACE prefix
        if(!startsWithPrefix(b1,b1Len)){

            //step 6: encode the sequence with punycode
            // No case flags are passed: the previously allocated flag array
            // was handed over uninitialized. NULL leaves the (already
            // nameprep'ed) ASCII characters as they are.
            b2Len = u_strToPunycode(b1,b1Len,b2,b2Capacity,NULL, status);

            if(*status == U_BUFFER_OVERFLOW_ERROR){
                // redo processing of string
                /* we do not have enough room so grow the buffer*/
                b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
                if(b2 == NULL){
                    *status = U_MEMORY_ALLOCATION_ERROR;
                    goto CLEANUP;
                }

                *status = U_ZERO_ERROR; // reset error

                b2Len = u_strToPunycode(b1,b1Len,b2,b2Len,NULL, status);
            }
            //error bail out
            if(U_FAILURE(*status)){
                goto CLEANUP;
            }
            reqLength = b2Len+ACE_PREFIX_LENGTH;

            if(reqLength > destCapacity){
                *status = U_BUFFER_OVERFLOW_ERROR;
                goto CLEANUP;
            }
            //Step 7: prepend the ACE prefix
            uprv_memcpy(dest,ACE_PREFIX,ACE_PREFIX_LENGTH * U_SIZEOF_UCHAR);
            //Step 6: copy the contents in b2 into dest
            uprv_memcpy(dest+ACE_PREFIX_LENGTH, b2, b2Len * U_SIZEOF_UCHAR);

        }else{
            *status = U_IDNA_ACE_PREFIX_ERROR;
            syntaxError(b1,0,b1Len,parseError);
            goto CLEANUP;
        }
    }

    // step 8: the result must not be longer than a DNS label may be
    if(reqLength > MAX_LABEL_LENGTH){
        *status = U_IDNA_LABEL_TOO_LONG_ERROR;
    }

CLEANUP:
    if(b1 != b1Stack){
        uprv_free(b1);
    }
    if(b2 != b2Stack){
        uprv_free(b2);
    }

    delete prep;

    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
/**
 * ToUnicode operation on a single label: if the (nameprep'ed) label starts
 * with the ACE prefix, decode it with Punycode and verify the result by
 * converting it back with uidna_toASCII(); otherwise copy the source to
 * the destination unchanged.
 *
 * @param src           label to convert; must not be NULL
 * @param srcLength     number of UChars in src, or -1 if NUL-terminated
 * @param dest          buffer receiving the result; may be NULL only if
 *                      destCapacity is 0
 * @param destCapacity  capacity of dest in UChars
 * @param options       bit set of UIDNA_ALLOW_UNASSIGNED / UIDNA_USE_STD3_RULES
 * @param parseError    receives the error position/context; may be NULL
 * @param status        ICU error code in/out parameter. Set to
 *                      U_IDNA_VERIFICATION_ERROR if the round trip through
 *                      uidna_toASCII() does not reproduce the input.
 * @return              length of the result as passed through
 *                      u_terminateUChars()
 */
U_CAPI int32_t U_EXPORT2
uidna_toUnicode(const UChar* src, int32_t srcLength,
                UChar* dest, int32_t destCapacity,
                int32_t options,
                UParseError* parseError,
                UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    // src is dereferenced below, so NULL has to be rejected here
    if((src == NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    //get the options
    UBool allowUnassigned = (UBool)((options & UIDNA_ALLOW_UNASSIGNED) != 0);

    UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE], b3Stack[MAX_LABEL_BUFFER_SIZE];

    //initialize pointers to stack buffers
    UChar  *b1 = b1Stack, *b2 = b2Stack, *b1Prime=NULL, *b3=b3Stack;
    int32_t b1Len = 0, b2Len = 0, b1PrimeLen = 0, b3Len = 0,
            b1Capacity = MAX_LABEL_BUFFER_SIZE,
            b2Capacity = MAX_LABEL_BUFFER_SIZE,
            b3Capacity = MAX_LABEL_BUFFER_SIZE,
            reqLength=0;

    // no case flags are requested from the Punycode decoder
    UBool* caseFlags = NULL;
    UBool srcIsASCII = TRUE;

    StringPrep* prep = StringPrep::createNameprepInstance(*status);

    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    // step 1: find out if all the codepoints in src are ASCII
    if(srcLength==-1){
        // determine the length while scanning
        srcLength = 0;
        for(;src[srcLength]!=0;){
            if(src[srcLength]> 0x7f){
                srcIsASCII = FALSE;
            }
            srcLength++;
        }
    }else{
        for(int32_t j=0; j<srcLength; j++){
            if(src[j]> 0x7f){
                srcIsASCII = FALSE;
            }
        }
    }

    if(srcIsASCII == FALSE){
        // step 2: process the string
        b1Len = prep->process(src,srcLength,b1,b1Capacity,allowUnassigned, parseError, *status);
        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error

            b1Len = prep->process(src,srcLength,b1, b1Len,allowUnassigned, parseError, *status);
        }
        //bail out on error
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }
    }else{
        // all ASCII: skip nameprep and work on the source directly
        b1 = (UChar*) src;
        b1Len = srcLength;
    }

    //step 3: verify ACE Prefix
    // the test is applied to the output of step 2 (b1), which is also the
    // buffer the prefix is removed from below
    if(startsWithPrefix(b1,b1Len)){

        //step 4: Remove the ACE Prefix
        b1Prime = b1 + ACE_PREFIX_LENGTH;
        b1PrimeLen  = b1Len - ACE_PREFIX_LENGTH;

        //step 5: Decode using punycode
        b2Len = u_strFromPunycode(b1Prime, b1PrimeLen, b2, b2Capacity, caseFlags,status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
            if(b2==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error

            b2Len =  u_strFromPunycode(b1Prime, b1PrimeLen, b2, b2Len, caseFlags, status);
        }
        //bail out on error
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }

        //step 6:Apply toASCII
        b3Len = uidna_toASCII(b2, b2Len, b3, b3Capacity,options,parseError, status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b3 = (UChar*) uprv_malloc(b3Len * U_SIZEOF_UCHAR);
            if(b3==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error

            b3Len =  uidna_toASCII(b2,b2Len,b3,b3Len,options,parseError, status);
        }
        //bail out on error
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }

        //step 7: verify
        if(compareCaseInsensitiveASCII(b1, b1Len, b3, b3Len) !=0){
            *status = U_IDNA_VERIFICATION_ERROR;
            goto CLEANUP;
        }

        //step 8: return output of step 5
        reqLength = b2Len;
        if(b2Len <= destCapacity) {
            uprv_memmove(dest, b2, b2Len * U_SIZEOF_UCHAR);
        }
    }else{
        //copy the source to destination
        if(srcLength <= destCapacity){
            uprv_memmove(dest,src,srcLength * U_SIZEOF_UCHAR);
        }
        reqLength = srcLength;
    }

CLEANUP:
    // b1 may alias src (all-ASCII input), which is not ours to free
    if(b1 != b1Stack && b1!=src){
        uprv_free(b1);
    }
    if(b2 != b2Stack){
        uprv_free(b2);
    }
    if(b3 != b3Stack){
        uprv_free(b3);
    }

    delete prep;

    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
/**
 * Finds the end of the label that starts at src.
 *
 * @param src        start of the label
 * @param srcLength  number of UChars available at src, or -1 if the text
 *                   is NUL-terminated
 * @param prep       profile used to recognize label separators
 * @param limit      set to the start of the next label (one past the
 *                   separator), or to the end of the text if no separator
 *                   was found
 * @param done       set to TRUE if no separator was found; left untouched
 *                   otherwise
 * @param status     ICU error code, passed to isLabelSeparator()
 * @return           number of UChars in the label, not counting the separator
 */
static int32_t
getNextSeparator(UChar *src,int32_t srcLength,StringPrep* prep,
                 UChar **limit,
                 UBool *done,
                 UErrorCode *status){
    int32_t i;

    if(srcLength == -1){
        // NUL-terminated text: stop at the terminator
        for(i=0; src[i] != 0; i++){
            if(prep->isLabelSeparator(src[i],*status)){
                *limit = src + (i+1); // go past the delimiter
                return i;
            }
        }
    }else{
        for(i=0; i<srcLength; i++){
            if(prep->isLabelSeparator(src[i],*status)){
                *limit = src + (i+1); // go past the delimiter
                return i;
            }
        }
    }

    // we have not found the delimiter: this is the last label
    *limit = src + i;
    *done = TRUE;
    return i;
}
/**
 * Applies uidna_toASCII() to every label of a domain name and joins the
 * converted labels with FULL_STOP.
 *
 * @param src           domain name to convert; must not be NULL
 * @param srcLength     number of UChars in src, or -1 if NUL-terminated
 * @param dest          buffer receiving the result; may be NULL only if
 *                      destCapacity is 0
 * @param destCapacity  capacity of dest in UChars
 * @param options       passed through to uidna_toASCII()
 * @param parseError    receives the error position/context; may be NULL
 * @param status        ICU error code in/out parameter
 * @return              length of the result as passed through
 *                      u_terminateUChars()
 */
U_CAPI int32_t U_EXPORT2
uidna_IDNToASCII(  const UChar *src, int32_t srcLength,
                   UChar* dest, int32_t destCapacity,
                   int32_t options,
                   UParseError *parseError,
                   UErrorCode *status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if((src == NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t reqLength = 0;

    StringPrep* prep = StringPrep::createNameprepInstance(*status);

    if(U_FAILURE(*status)){
        delete prep;
        return 0;
    }

    //initialize pointers to stack buffers
    UChar b1Stack[MAX_LABEL_BUFFER_SIZE];
    UChar *b1 = b1Stack;
    int32_t b1Len, labelLen;
    UChar* delimiter = (UChar*)src;
    UChar* labelStart = (UChar*)src;
    // number of UChars left at labelStart, or -1 for NUL-terminated input
    int32_t remainingLen = srcLength;
    int32_t b1Capacity = MAX_LABEL_BUFFER_SIZE;
    UBool done = FALSE;

    for(;;){

        labelLen = getNextSeparator(labelStart, remainingLen, prep, &delimiter,&done, status);

        b1Len = uidna_toASCII(labelStart, labelLen, b1, b1Capacity,
                              options, parseError, status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // b1 holds at most b1Capacity UChars, so copying b1Len UChars
            // out of it would read past the buffer: grow it and redo the label
            if(b1 != b1Stack){
                uprv_free(b1);
            }
            b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                break;
            }
            b1Capacity = b1Len;

            *status = U_ZERO_ERROR; // reset error

            b1Len = uidna_toASCII(labelStart, labelLen, b1, b1Capacity,
                                  options, parseError, status);
        }

        if(U_FAILURE(*status)){
            break;
        }

        int32_t tempLen = (reqLength + b1Len );
        // copy to dest; when it does not fit only the length is accumulated
        // so that the caller learns the required capacity
        if( tempLen <= destCapacity){
            uprv_memmove(dest+reqLength, b1, b1Len * U_SIZEOF_UCHAR);
        }

        reqLength = tempLen;

        if(done == TRUE){
            break;
        }

        // add the label separator
        if(reqLength < destCapacity){
            dest[reqLength] = FULL_STOP;
        }
        reqLength++;

        // advance to the next label and, for counted input, shrink the
        // remaining length accordingly
        labelStart = delimiter;
        if(srcLength >= 0){
            remainingLen = (int32_t)(srcLength - (delimiter - src));
        }
    }

    if(b1 != b1Stack){
        uprv_free(b1);
    }

    delete prep;

    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
/**
 * Applies uidna_toUnicode() to every label of a domain name and joins the
 * converted labels with FULL_STOP.
 *
 * @param src           domain name to convert; must not be NULL
 * @param srcLength     number of UChars in src, or -1 if NUL-terminated
 * @param dest          buffer receiving the result; may be NULL only if
 *                      destCapacity is 0
 * @param destCapacity  capacity of dest in UChars
 * @param options       passed through to uidna_toUnicode()
 * @param parseError    receives the error position/context; may be NULL
 * @param status        ICU error code in/out parameter
 * @return              length of the result as passed through
 *                      u_terminateUChars()
 */
U_CAPI int32_t U_EXPORT2
uidna_IDNToUnicode(  const UChar* src, int32_t srcLength,
                     UChar* dest, int32_t destCapacity,
                     int32_t options,
                     UParseError* parseError,
                     UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if((src == NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t reqLength = 0;

    StringPrep* prep = StringPrep::createNameprepInstance(*status);

    if(U_FAILURE(*status)){
        delete prep;
        return 0;
    }

    //initialize pointers to stack buffers
    UChar b1Stack[MAX_LABEL_BUFFER_SIZE];
    UChar *b1 = b1Stack;
    int32_t b1Len, labelLen;
    UChar* delimiter = (UChar*)src;
    UChar* labelStart = (UChar*)src;
    // number of UChars left at labelStart, or -1 for NUL-terminated input
    int32_t remainingLen = srcLength;
    int32_t b1Capacity = MAX_LABEL_BUFFER_SIZE;
    UBool done = FALSE;

    for(;;){

        labelLen = getNextSeparator(labelStart, remainingLen, prep, &delimiter, &done, status);

        b1Len = uidna_toUnicode(labelStart, labelLen, b1, b1Capacity,
                                options, parseError, status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // b1 holds at most b1Capacity UChars, so copying b1Len UChars
            // out of it would read past the buffer: grow it and redo the label
            if(b1 != b1Stack){
                uprv_free(b1);
            }
            b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                break;
            }
            b1Capacity = b1Len;

            *status = U_ZERO_ERROR; // reset error

            b1Len = uidna_toUnicode(labelStart, labelLen, b1, b1Capacity,
                                    options, parseError, status);
        }

        if(U_FAILURE(*status)){
            break;
        }

        int32_t tempLen = (reqLength + b1Len );
        // copy to dest; when it does not fit only the length is accumulated
        // so that the caller learns the required capacity
        if( tempLen <= destCapacity){
            uprv_memmove(dest+reqLength, b1, b1Len * U_SIZEOF_UCHAR);
        }

        reqLength = tempLen;

        if(done == TRUE){
            break;
        }

        // add the label separator
        if(reqLength < destCapacity){
            dest[reqLength] = FULL_STOP;
        }
        reqLength++;

        // advance to the next label and, for counted input, shrink the
        // remaining length accordingly
        labelStart = delimiter;
        if(srcLength >= 0){
            remainingLen = (int32_t)(srcLength - (delimiter - src));
        }
    }

    if(b1 != b1Stack){
        uprv_free(b1);
    }

    delete prep;

    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
/**
 * Compares two domain names for equivalence by converting both with
 * uidna_IDNToASCII() and comparing the results ignoring ASCII case.
 *
 * @param s1       first domain name
 * @param length1  number of UChars in s1, or -1 if NUL-terminated
 * @param s2       second domain name
 * @param length2  number of UChars in s2, or -1 if NUL-terminated
 * @param options  passed through to uidna_IDNToASCII()
 * @param status   ICU error code in/out parameter
 * @return the result of compareCaseInsensitiveASCII() on the converted
 *         names (0 if they are equivalent); -1 if status indicates a
 *         failure on entry or if a conversion or allocation fails
 */
U_CAPI int32_t U_EXPORT2
uidna_compare(  const UChar *s1, int32_t length1,
                const UChar *s2, int32_t length2,
                int32_t options,
                UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return -1;
    }

    UChar b1Stack[MAX_IDN_BUFFER_SIZE], b2Stack[MAX_IDN_BUFFER_SIZE];
    UChar *b1 = b1Stack, *b2 = b2Stack;
    int32_t b1Len = 0, b2Len = 0, b1Capacity = MAX_IDN_BUFFER_SIZE, b2Capacity = MAX_IDN_BUFFER_SIZE;
    // value returned on every failure path
    int32_t result = -1;

    UParseError parseError;

    b1Len = uidna_IDNToASCII(s1, length1, b1, b1Capacity, options, &parseError, status);
    if(*status == U_BUFFER_OVERFLOW_ERROR){
        // redo processing of string
        b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
        if(b1==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        *status = U_ZERO_ERROR; // reset error

        b1Len = uidna_IDNToASCII(s1,length1,b1,b1Len, options, &parseError, status);
    }
    // the contents of b1 are meaningless if the conversion failed
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    b2Len = uidna_IDNToASCII(s2,length2, b2,b2Capacity, options, &parseError, status);
    if(*status == U_BUFFER_OVERFLOW_ERROR){
        // redo processing of string
        b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
        if(b2==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        *status = U_ZERO_ERROR; // reset error

        b2Len = uidna_IDNToASCII(s2, length2, b2, b2Len, options, &parseError, status);
    }
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    // when toASCII is applied all label separators are replaced with FULL_STOP
    result = compareCaseInsensitiveASCII(b1,b1Len,b2,b2Len);

CLEANUP:
    if(b1 != b1Stack){
        uprv_free(b1);
    }
    if(b2 != b2Stack){
        uprv_free(b2);
    }

    return result;
}

View file

@ -0,0 +1,282 @@
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uidna.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#ifndef __UIDNA_H__
#define __UIDNA_H__
#include "unicode/utypes.h"
#include "unicode/parseerr.h"
/**
*\file
* UIDNA API implements the IDNA protocol as defined in the IDNA draft
* (http://www.ietf.org/internet-drafts/draft-ietf-idn-idna-14.txt).
* The draft defines 2 operations: ToASCII and ToUnicode. Domain labels
* containing non-ASCII code points are required to be processed by
* ToASCII operation before passing it to resolver libraries. Domain names
* that are obtained from resolver libraries are required to be processed by
* ToUnicode operation before displaying the domain name to the user.
* IDNA requires that implementations process input strings with Nameprep
* (http://www.ietf.org/internet-drafts/draft-ietf-idn-nameprep-11.txt),
* which is a profile of Stringprep (http://www.ietf.org/rfc/rfc3454.txt),
* and then with Punycode (http://www.ietf.org/internet-drafts/draft-ietf-idn-punycode-03.txt).
* Implementations of IDNA MUST fully implement Nameprep and Punycode;
* neither Nameprep nor Punycode are optional.
* The input and output of ToASCII and ToUnicode operations are Unicode
* and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
* multiple times to an input string will yield the same result as applying the operation
* once.
* ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string)
* ToASCII(ToASCII(ToASCII...(ToASCII(string))) == ToASCII(string).
*\end_file
*/
/*
* Option values for the "options" bit set parameter of the uidna_* functions
* declared in this header.
*/
/** Default options: do not process unassigned code points and do not use STD3 ASCII rules. */
#define UIDNA_DEFAULT 0x0000
/** Treat unassigned code points in the input as normal Unicode code points (for query operations). */
#define UIDNA_ALLOW_UNASSIGNED 0x0001
/** Use STD3 ASCII rules for host name syntax restrictions. */
#define UIDNA_USE_STD3_RULES 0x0002
/**
* This function implements the ToASCII operation as defined in the IDNA draft.
* This operation is done on <b>single labels</b> before sending them to something that expects
* ASCII names. A label is an individual part of a domain name. Labels are usually
* separated by dots; e.g., "www.example.com" is composed of 3 labels:
* "www", "example", and "com".
*
*
* @param src Input UChar array containing label in Unicode.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output UChar array with ASCII (ACE encoded) label.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules.
* If unassigned code points are found the operation fails with
* the U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations.
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points.
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions.
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
* @return Number of ASCII characters converted.
* @draft ICU 2.6
*/
U_CAPI int32_t U_EXPORT2
uidna_toASCII(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* This function implements the ToUnicode operation as defined in the IDNA draft.
* This operation is done on <b>single labels</b> before sending them to something that expects
* Unicode names. A label is an individual part of a domain name. Labels are usually
* separated by dots; e.g., "www.example.com" is composed of 3 labels:
* "www", "example", and "com".
*
* @param src Input UChar array containing ASCII (ACE encoded) label.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output converted UChar array containing Unicode equivalent of label.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules.
* If unassigned code points are found the operation fails with
* the U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations.
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points. <b>Note:</b> This option is
* required on the toUnicode operation because the draft mandates
* verification of decoded ACE input by applying toASCII and comparing
* its output with the source.
*
*
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions.
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
* @return Number of Unicode characters converted.
* @draft ICU 2.6
*/
U_CAPI int32_t U_EXPORT2
uidna_toUnicode(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* Convenience function that implements the IDNToASCII operation as defined in the IDNA draft.
* This operation is done on complete domain names, e.g., "www.example.com".
* It is important to note that this operation can fail. If it fails, then the input
* domain name cannot be used as an Internationalized Domain Name and the application
* should have methods defined to deal with the failure.
*
* <b>Note:</b> The IDNA draft specifies that a conformant application should divide a domain name
* into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
* and then convert. This function does not offer that level of granularity. The options, once
* set, apply to all labels in the domain name.
*
* @param src Input UChar array containing IDN in Unicode.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output UChar array with ASCII (ACE encoded) IDN.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules.
* If unassigned code points are found the operation fails with
* the U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations.
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points.
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions.
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
* @return Number of ASCII characters converted.
* @draft ICU 2.6
*/
U_CAPI int32_t U_EXPORT2
uidna_IDNToASCII( const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* Convenience function that implements the IDNToUnicode operation as defined in the IDNA draft.
* This operation is done on complete domain names, e.g., "www.example.com".
*
* <b>Note:</b> The IDNA draft specifies that a conformant application should divide a domain name
* into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
* and then convert. This function does not offer that level of granularity. The options, once
* set, apply to all labels in the domain name.
*
* @param src Input UChar array containing IDN in ASCII (ACE encoded) form.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output UChar array containing Unicode equivalent of source IDN.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules.
* If unassigned code points are found the operation fails with
* the U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations.
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points.
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions.
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
* @return Number of Unicode characters converted.
* @draft ICU 2.6
*/
U_CAPI int32_t U_EXPORT2
uidna_IDNToUnicode( const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* Compare two IDN strings for equivalence.
* This function splits the domain names into labels and compares them.
* According to the IDN draft, whenever two labels are compared, they are
* considered equal if and only if their ASCII forms (obtained by
* applying toASCII) match using a case-insensitive ASCII comparison.
* Two domain names are considered a match if and only if all labels
* match regardless of whether label separators match.
*
* @param s1 First source string.
* @param length1 Length of first source string, or -1 if NUL-terminated.
*
* @param s2 Second source string.
* @param length2 Length of second source string, or -1 if NUL-terminated.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules.
* If unassigned code points are found the operation fails with
* the U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations.
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points.
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions.
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
*
* @param status ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return <0 or 0 or >0 as usual for string comparisons
* @draft ICU 2.6
*/
U_CAPI int32_t U_EXPORT2
uidna_compare( const UChar *s1, int32_t length1,
const UChar *s2, int32_t length2,
int32_t options,
UErrorCode* status);
#endif

View file

@ -631,8 +631,21 @@ typedef enum UErrorCode {
U_REGEX_INVALID_BACK_REF, /**< Back-reference to a non-existent capture group. */
U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
/*
* Error codes in the range 0x10400-0x104ff are reserved for IDNA-related errors.
*/
U_IDNA_ERROR_START=0x10400,
U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR,
U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR,
U_IDNA_CHECK_BIDI_ERROR,
U_IDNA_STD3_ASCII_RULES_ERROR,
U_IDNA_ACE_PREFIX_ERROR,
U_IDNA_VERIFICATION_ERROR,
U_IDNA_LABEL_TOO_LONG_ERROR,
U_IDNA_ERROR_LIMIT,
U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
U_ERROR_LIMIT=U_IDNA_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
} UErrorCode;
/* Use the following to determine if an UErrorCode represents */