ICU-2194 tests for IDNA

X-SVN-Rev: 11196
This commit is contained in:
Ram Viswanadha 2003-02-28 21:37:55 +00:00
parent 71eb8f87f1
commit 267d3d1f30
14 changed files with 3865 additions and 2 deletions

View file

@ -40,7 +40,7 @@ itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o unhxtrts.o hxuntrt
ufltlgts.o testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
jamotest.o srchtest.o reptest.o regextst.o \
itrbnf.o itrbnfrt.o tstdtmod.o testdata.o datamap.o ucaconf.o icusvtst.o \
uobjtest.o
uobjtest.o idnaref.o nptrans.o punyref.o testidn.o testidna.o
DEPS = $(OBJECTS:.o=.d)

View file

@ -0,0 +1,976 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: strprep.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#include "idnaref.h"
#include "strprep.h"
#include "punyref.h"
#include "ustr_imp.h"
#include "cmemory.h"
#include "sprpimpl.h"
#include "nptrans.h"
//#include "punyref.h"
#include "punycode.h"
#include "unicode/ustring.h"
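/* The official IDNA ACE prefix is "xn--". It is stored here in upper case ("XN--")
 * because startsWithPrefix() compares it against u_toupper() of each input unit. */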
static const UChar ACE_PREFIX[] ={ 0x0058,0x004E,0x002d,0x002d } ;
#define ACE_PREFIX_LENGTH 4
#define MAX_LABEL_LENGTH 63
#define HYPHEN 0x002D
/* The Max length of the labels should not be more than 64 */
#define MAX_LABEL_BUFFER_SIZE 100
#define MAX_IDN_BUFFER_SIZE 300
#define CAPITAL_A 0x0041
#define CAPITAL_Z 0x005A
#define LOWER_CASE_DELTA 0x0020
#define FULL_STOP 0x002E
/* Lazily created NamePrepTransform shared by the functions in this file. */
static NamePrepTransform* prep = NULL;

/**
 * Returns the shared NamePrepTransform, creating it on first use.
 *
 * @param status In/out error code. When creation fails, the error reported by
 *               NamePrepTransform::createInstance() is preserved;
 *               U_MEMORY_ALLOCATION_ERROR is set only if the factory returned
 *               NULL without reporting an error of its own.
 * @return The shared instance, or NULL if it could not be created.
 */
static NamePrepTransform* getInstance(UErrorCode& status){
    if(prep == NULL){
        UParseError parseError;
        prep = NamePrepTransform::createInstance(parseError, status);
        if(prep == NULL){
            // Do not mask the factory's own diagnosis with an allocation error.
            if(U_SUCCESS(status)){
                status = U_MEMORY_ALLOCATION_ERROR;
            }
            return NULL;
        }
        if(U_FAILURE(status)){
            // Never cache an instance whose construction reported a failure:
            // later calls would hand it out with a clean status.
            delete prep;
            prep = NULL;
            return NULL;
        }
    }
    return prep;
}
/**
 * Tells whether src begins with the ACE prefix, ignoring case.
 *
 * @param src       Text to inspect.
 * @param srcLength Number of UChars available in src.
 * @return TRUE if the first ACE_PREFIX_LENGTH units, upper-cased with
 *         u_toupper(), equal ACE_PREFIX; FALSE otherwise (including when
 *         src is shorter than the prefix).
 */
inline static UBool
startsWithPrefix(const UChar* src , int32_t srcLength){
    if(srcLength < ACE_PREFIX_LENGTH){
        return FALSE;
    }
    int8_t matched = 0;
    // advance while the upper-cased input keeps agreeing with the prefix
    while(matched < ACE_PREFIX_LENGTH &&
          u_toupper(src[matched]) == ACE_PREFIX[matched]){
        matched++;
    }
    return (matched == ACE_PREFIX_LENGTH) ? TRUE : FALSE;
}
/**
 * Maps the ASCII capital letters A-Z to a-z and returns every other
 * code unit unchanged.
 */
inline static UChar
toASCIILower(UChar ch){
    const UBool isCapital = (UBool)(ch >= CAPITAL_A && ch <= CAPITAL_Z);
    return isCapital ? (UChar)(ch + LOWER_CASE_DELTA) : ch;
}
/**
 * Compares two strings, treating the ASCII letters A-Z and a-z as equal.
 *
 * The common prefix is compared code unit by code unit; if it is identical
 * the shorter string orders first. (Previously, strings of different length
 * always produced a positive value regardless of argument order, which made
 * the result unusable as an ordering.)
 *
 * @param s1    First string.
 * @param s1Len Number of UChars in s1.
 * @param s2    Second string.
 * @param s2Len Number of UChars in s2.
 * @return <0, 0 or >0 if s1 orders before, equal to, or after s2.
 *         The result is non-zero whenever the lengths differ.
 */
inline static int32_t
compareCaseInsensitiveASCII(const UChar* s1, int32_t s1Len,
                            const UChar* s2, int32_t s2Len){
    int32_t minLength;
    int32_t lengthResult;   // verdict when the common prefix is identical

    if(s1Len < s2Len){
        minLength = s1Len;
        lengthResult = -1;
    }else if(s1Len > s2Len){
        minLength = s2Len;
        lengthResult = 1;
    }else{
        minLength = s1Len;
        lengthResult = 0;
    }

    for(int32_t i = 0; i < minLength; i++){
        UChar c1 = s1[i];
        UChar c2 = s2[i];
        /* Case-insensitive comparison */
        if(c1 != c2){
            int32_t rc = (int32_t)toASCIILower(c1) - (int32_t)toASCIILower(c2);
            if(rc != 0){
                return rc;
            }
        }
    }
    return lengthResult;
}
/**
 * Translates a status of the reference Punycode implementation into the
 * UErrorCode used by the rest of this file. Any status not listed below
 * maps to U_INTERNAL_PROGRAM_ERROR.
 */
static UErrorCode getError(enum punycode_status status){
    UErrorCode mapped = U_INTERNAL_PROGRAM_ERROR;
    if(status == punycode_success){
        mapped = U_ZERO_ERROR;
    }else if(status == punycode_bad_input){      /* Input is invalid. */
        mapped = U_INVALID_CHAR_FOUND;
    }else if(status == punycode_big_output){     /* Output would exceed the space provided. */
        mapped = U_BUFFER_OVERFLOW_ERROR;
    }else if(status == punycode_overflow){       /* Input requires wider integers to process. */
        mapped = U_INDEX_OUTOFBOUNDS_ERROR;
    }
    return mapped;
}
/**
 * Wrapper around the reference Punycode encoder.
 *
 * Converts src from UTF-16 to UTF-32, encodes it with punycode_encode() and
 * copies the resulting characters into dest as UChars. No terminator is
 * written.
 *
 * @param src          UTF-16 input.
 * @param srcLength    Number of UChars in src.
 * @param dest         Receives the encoded characters.
 * @param destCapacity Number of UChars that fit into dest.
 * @param status       In/out error code. Set to U_BUFFER_OVERFLOW_ERROR when the
 *                     encoded text is longer than destCapacity; in that case the
 *                     return value is the capacity required.
 * @return Number of characters produced by the encoder.
 */
static int32_t convertToPuny(const UChar* src, int32_t srcLength,
                             UChar* dest, int32_t destCapacity,
                             UErrorCode& status){
    uint32_t b1Stack[MAX_LABEL_BUFFER_SIZE];
    int32_t b1Len = 0, b1Capacity = MAX_LABEL_BUFFER_SIZE;
    uint32_t* b1 = b1Stack;

    char b2Stack[MAX_LABEL_BUFFER_SIZE];
    char* b2 = b2Stack;
    int32_t b2Len = 0, b2Capacity = MAX_LABEL_BUFFER_SIZE;
    uint32_t outLength = 0;

    punycode_status error = punycode_success;
    unsigned char* caseFlags = NULL;

    u_strToUTF32((UChar32*)b1,b1Capacity,&b1Len,src,srcLength,&status);
    if(status == U_BUFFER_OVERFLOW_ERROR){
        /* we do not have enough room so grow the buffer and convert again */
        b1 = (uint32_t*) uprv_malloc(b1Len * sizeof(uint32_t));
        if(b1==NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
        status = U_ZERO_ERROR; // reset error
        u_strToUTF32((UChar32*)b1,b1Len,&b1Len,src,srcLength,&status);
    }
    if(U_FAILURE(status)){
        goto CLEANUP;
    }

    // The flags are handed to the encoder, so they must hold defined values;
    // previously the freshly allocated array was passed uninitialised.
    // Nothing is allocated for an empty input, where no flag can be read.
    if(b1Len > 0){
        caseFlags = (unsigned char*) uprv_malloc(b1Len * sizeof(unsigned char));
        if(caseFlags == NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
        uprv_memset(caseFlags, 0, b1Len * sizeof(unsigned char));
    }

    // Encode, doubling the output buffer for as long as the encoder says it is
    // too small. The encoder is not assumed to report the size it needs, so
    // retrying with an unchanged capacity (as before) could never succeed.
    for(;;){
        outLength = (uint32_t)b2Capacity;
        error = punycode_encode(b1Len,b1,caseFlags, &outLength, b2);
        if(error != punycode_big_output){
            break;
        }
        if(b2 != b2Stack){
            uprv_free(b2);
            b2 = b2Stack;
        }
        if(b2Capacity > 0x3FFFFFFF){
            // doubling would overflow int32_t
            status = U_INDEX_OUTOFBOUNDS_ERROR;
            goto CLEANUP;
        }
        b2Capacity *= 2;
        b2 = (char*) uprv_malloc(b2Capacity * sizeof(char));
        if(b2==NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
    }
    status = getError(error);
    if(U_FAILURE(status)){
        goto CLEANUP;
    }
    b2Len = (int32_t)outLength;

    // An exact fit is fine: callers retry with capacity == returned length.
    if(b2Len <= destCapacity){
        u_charsToUChars(b2,dest,b2Len);
    }else{
        status =U_BUFFER_OVERFLOW_ERROR;
    }

CLEANUP:
    if(b1Stack != b1){
        uprv_free(b1);
    }
    if(b2Stack != b2){
        uprv_free(b2);
    }
    uprv_free(caseFlags);
    return b2Len;
}
/**
 * Wrapper around the reference Punycode decoder.
 *
 * Narrows src to chars, decodes it with punycode_decode() into UTF-32 and
 * converts the result to UTF-16 in dest.
 *
 * @param src          Punycode text (without the ACE prefix) as UChars.
 * @param srcLength    Number of UChars in src.
 * @param dest         Receives the decoded UTF-16 text.
 * @param destCapacity Number of UChars that fit into dest.
 * @param status       In/out error code; receives the mapped decoder status or
 *                     the status of u_strFromUTF32().
 * @return The length reported by u_strFromUTF32(), or 0 if decoding failed.
 */
static int32_t convertFromPuny( const UChar* src, int32_t srcLength,
                                UChar* dest, int32_t destCapacity,
                                UErrorCode& status){
    char b1Stack[MAX_LABEL_BUFFER_SIZE];
    char* b1 = b1Stack;
    uint32_t b2Stack[MAX_LABEL_BUFFER_SIZE];
    uint32_t* b2 = b2Stack;
    int32_t b2Len = 0, b2Capacity = MAX_LABEL_BUFFER_SIZE;
    uint32_t outLength = 0;
    int32_t destLen = 0;
    unsigned char* caseFlags = NULL;
    punycode_status error = punycode_success;

    // The stack buffer only holds MAX_LABEL_BUFFER_SIZE chars; longer input
    // used to be written past its end.
    if(srcLength > MAX_LABEL_BUFFER_SIZE){
        b1 = (char*) uprv_malloc(srcLength * sizeof(char));
        if(b1 == NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
    }
    u_UCharsToChars(src, b1,srcLength);

    // Decode, doubling the output buffer for as long as the decoder says it is
    // too small. The decoder is not assumed to report the size it needs.
    for(;;){
        // one flag per output slot offered to the decoder
        caseFlags = (unsigned char*) uprv_malloc(b2Capacity * sizeof(unsigned char));
        if(caseFlags == NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
        outLength = (uint32_t)b2Capacity;
        error = punycode_decode(srcLength,b1,&outLength,b2,caseFlags);
        if(error != punycode_big_output){
            break;
        }
        uprv_free(caseFlags);
        caseFlags = NULL;
        if(b2 != b2Stack){
            uprv_free(b2);
            b2 = b2Stack;
        }
        if(b2Capacity > 0x3FFFFFFF){
            // doubling would overflow int32_t
            status = U_INDEX_OUTOFBOUNDS_ERROR;
            goto CLEANUP;
        }
        b2Capacity *= 2;
        b2 = (uint32_t*) uprv_malloc(b2Capacity * sizeof(uint32_t));
        if(b2 == NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
    }
    status = getError(error);
    if(U_FAILURE(status)){
        goto CLEANUP;
    }
    b2Len = (int32_t)outLength;

    u_strFromUTF32(dest,destCapacity,&destLen,(UChar32*)b2,b2Len,&status);

CLEANUP:
    if(b1Stack != b1){
        uprv_free(b1);
    }
    if(b2Stack != b2){
        uprv_free(b2);
    }
    uprv_free(caseFlags);
    return destLen;
}
/**
 * Reference implementation of the IDNA ToASCII operation for a single label.
 *
 * The label is run through the shared NamePrepTransform. If the result is all
 * ASCII it is copied to dest unchanged; otherwise it is Punycode-encoded and
 * the ACE prefix is prepended.
 *
 * @param src          Input label.
 * @param srcLength    Number of UChars in src, or -1 if NUL-terminated.
 * @param dest         Output buffer.
 * @param destCapacity Number of UChars that fit into dest.
 * @param options      Bit set of IDNAREF_ALLOW_UNASSIGNED and IDNAREF_USE_STD3_RULES.
 * @param parseError   Passed through to NamePrepTransform::process().
 * @param status       In/out error code. Errors set here:
 *                     U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
 *                     U_IDNA_STD3_ASCII_RULES_ERROR if IDNAREF_USE_STD3_RULES is
 *                     set and the prepared label contains a non-LDH ASCII
 *                     character or starts/ends with a hyphen,
 *                     U_IDNA_ACE_PREFIX_ERROR if a non-ASCII label already
 *                     starts with the ACE prefix,
 *                     U_IDNA_LABEL_TOO_LONG_ERROR if the result is longer than
 *                     MAX_LABEL_LENGTH, U_BUFFER_OVERFLOW_ERROR if dest is too small.
 * @return The value returned by u_terminateUChars() for the output length.
 */
int32_t
idnaref_toASCII(const UChar* src, int32_t srcLength,
                UChar* dest, int32_t destCapacity,
                int32_t options,
                UParseError* parseError,
                UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE];
    //initialize pointers to stack buffers
    UChar *b1 = b1Stack, *b2 = b2Stack;
    int32_t b1Len = 0, b2Len = 0,
            b1Capacity = MAX_LABEL_BUFFER_SIZE,
            b2Capacity = MAX_LABEL_BUFFER_SIZE ,
            reqLength=0;

    //get the options
    UBool allowUnassigned   = options & IDNAREF_ALLOW_UNASSIGNED;
    UBool useSTD3ASCIIRules = (options & IDNAREF_USE_STD3_RULES) >>1;

    // assume the source contains all ascii codepoints
    UBool srcIsASCII = TRUE;
    // assume the source contains all LDH codepoints
    UBool srcIsLDH = TRUE;
    int32_t j=0;

    // step 2: prepare the label
    NamePrepTransform* prep = getInstance(*status);
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    b1Len = prep->process(src,srcLength,b1, b1Capacity,allowUnassigned,parseError,*status);
    if(*status == U_BUFFER_OVERFLOW_ERROR){
        /* we do not have enough room so grow the buffer and process again */
        b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
        if(b1==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
        *status = U_ZERO_ERROR; // reset error
        b1Len = prep->process(src,srcLength,b1, b1Len,allowUnassigned, parseError, *status);
    }
    // error bail out
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    // step 3 & 4: classify the prepared label.
    // Both flags accumulate over the whole label; previously srcIsLDH was
    // overwritten on every iteration, so only the last character mattered.
    for( j=0;j<b1Len;j++){
        if(b1[j] > 0x7F){
            srcIsASCII = FALSE;
        }else if(prep->isLDHChar(b1[j]) == FALSE){
            // only ASCII code points are subject to the LDH restriction
            srcIsLDH = FALSE;
        }
    }

    if(useSTD3ASCIIRules == TRUE){
        // verify 3a and 3b; the hyphen checks need at least one code unit
        if( srcIsLDH == FALSE /* source contains some non-LDH characters */
            || (b1Len > 0 && (b1[0] == HYPHEN || b1[b1Len-1] == HYPHEN))){
            *status = U_IDNA_STD3_ASCII_RULES_ERROR;
            goto CLEANUP;
        }
    }

    if(srcIsASCII){
        reqLength = b1Len;
        if(b1Len > destCapacity){
            // too small: nothing is copied, the length is handed to
            // u_terminateUChars() below
            goto CLEANUP;
        }
        uprv_memmove(dest, b1, b1Len * U_SIZEOF_UCHAR);
    }else{
        // step 5 : verify the sequence does not begin with ACE prefix
        if(startsWithPrefix(b1,b1Len)){
            *status = U_IDNA_ACE_PREFIX_ERROR;
            goto CLEANUP;
        }

        //step 6: encode the sequence with punycode
        b2Len = convertToPuny(b1,b1Len, b2,b2Capacity,*status);
        if(*status == U_BUFFER_OVERFLOW_ERROR){
            /* we do not have enough room so grow the buffer and encode again */
            b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
            if(b2 == NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }
            *status = U_ZERO_ERROR; // reset error
            b2Len = convertToPuny(b1, b1Len, b2, b2Len, *status);
        }
        //error bail out
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }

        reqLength = b2Len+ACE_PREFIX_LENGTH;
        if(reqLength > destCapacity){
            *status = U_BUFFER_OVERFLOW_ERROR;
            goto CLEANUP;
        }
        //Step 7: prepend the ACE prefix
        uprv_memcpy(dest,ACE_PREFIX,ACE_PREFIX_LENGTH * U_SIZEOF_UCHAR);
        //Step 6: copy the contents in b2 into dest
        uprv_memcpy(dest+ACE_PREFIX_LENGTH, b2, b2Len * U_SIZEOF_UCHAR);
    }

    if(reqLength > MAX_LABEL_LENGTH){
        *status = U_IDNA_LABEL_TOO_LONG_ERROR;
    }

CLEANUP:
    if(b1 != b1Stack){
        uprv_free(b1);
    }
    if(b2 != b2Stack){
        uprv_free(b2);
    }
    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
/**
 * Reference implementation of the IDNA ToUnicode operation for a single label.
 *
 * Non-ASCII input is first run through the shared NamePrepTransform. If the
 * resulting label starts with the ACE prefix, the remainder is Punycode-decoded,
 * the decoded text is converted back with idnaref_toASCII() and compared
 * (ignoring ASCII case) with the label; on a match the decoded text is the
 * result. A label without the ACE prefix is copied to dest unchanged.
 *
 * @param src          Input label.
 * @param srcLength    Number of UChars in src, or -1 if NUL-terminated.
 * @param dest         Output buffer.
 * @param destCapacity Number of UChars that fit into dest.
 * @param options      Bit set of IDNAREF_ALLOW_UNASSIGNED and IDNAREF_USE_STD3_RULES.
 * @param parseError   Passed through to NamePrepTransform::process() and
 *                     idnaref_toASCII().
 * @param status       In/out error code. U_IDNA_VERIFICATION_ERROR is set when
 *                     the round trip through idnaref_toASCII() does not
 *                     reproduce the label.
 * @return The value returned by u_terminateUChars() for the output length.
 */
int32_t
idnaref_toUnicode(const UChar* src, int32_t srcLength,
                  UChar* dest, int32_t destCapacity,
                  int32_t options,
                  UParseError* parseError,
                  UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE], b3Stack[MAX_LABEL_BUFFER_SIZE];
    //initialize pointers to stack buffers
    UChar *b1 = b1Stack, *b2 = b2Stack, *b1Prime=NULL, *b3=b3Stack;
    int32_t b1Len = 0, b2Len = 0, b1PrimeLen = 0, b3Len = 0,
            b1Capacity = MAX_LABEL_BUFFER_SIZE,
            b2Capacity = MAX_LABEL_BUFFER_SIZE,
            b3Capacity = MAX_LABEL_BUFFER_SIZE,
            reqLength=0;

    NamePrepTransform* prep = getInstance(*status);
    UBool srcIsASCII = TRUE;
    //get the options
    UBool allowUnassigned = options & IDNAREF_ALLOW_UNASSIGNED;

    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    // step 1: find out if all the codepoints in src are ASCII
    if(srcLength==-1){
        srcLength = 0;
        for(;src[srcLength]!=0;){
            if(src[srcLength]> 0x7f){
                srcIsASCII = FALSE;
            }
            srcLength++;
        }
    }else{
        for(int32_t j=0; j<srcLength; j++){
            if(src[j]> 0x7f){
                srcIsASCII = FALSE;
            }
        }
    }

    if(srcIsASCII == FALSE){
        // step 2: process the string
        b1Len = prep->process(src,srcLength,b1,b1Capacity,allowUnassigned, parseError, *status);
        if(*status == U_BUFFER_OVERFLOW_ERROR){
            /* we do not have enough room so grow the buffer and process again */
            b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }
            *status = U_ZERO_ERROR; // reset error
            b1Len = prep->process(src,srcLength,b1, b1Len,allowUnassigned, parseError, *status);
        }
        //bail out on error
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }
    }else{
        // copy everything to b1
        if(srcLength >= b1Capacity){
            /* we do not have enough room so grow the buffer.
             * The size must come from srcLength: b1Len is still 0 here, and
             * sizing the buffer with it made the copy below overrun the heap. */
            b1 = (UChar*) uprv_malloc(srcLength * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }
        }
        uprv_memmove(b1,src, srcLength * U_SIZEOF_UCHAR);
        b1Len = srcLength;
    }

    //step 3: verify ACE Prefix on the prepared label (b1), which is the text
    //        that the following steps actually slice and decode
    if(startsWithPrefix(b1,b1Len)){

        //step 4: Remove the ACE Prefix
        b1Prime = b1 + ACE_PREFIX_LENGTH;
        b1PrimeLen = b1Len - ACE_PREFIX_LENGTH;

        //step 5: Decode using punycode
        b2Len = convertFromPuny(b1Prime,b1PrimeLen, b2, b2Capacity, *status);
        if(*status == U_BUFFER_OVERFLOW_ERROR){
            /* we do not have enough room so grow the buffer and decode again */
            b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
            if(b2==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }
            *status = U_ZERO_ERROR; // reset error
            b2Len = convertFromPuny(b1Prime,b1PrimeLen, b2, b2Len, *status);
        }
        //bail out on error
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }

        //step 6:Apply toASCII
        b3Len = idnaref_toASCII(b2,b2Len,b3,b3Capacity,options,parseError, status);
        if(*status == U_BUFFER_OVERFLOW_ERROR){
            /* we do not have enough room so grow the buffer and convert again */
            b3 = (UChar*) uprv_malloc(b3Len * U_SIZEOF_UCHAR);
            if(b3==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }
            *status = U_ZERO_ERROR; // reset error
            b3Len = idnaref_toASCII(b2,b2Len,b3,b3Len, options, parseError, status);
        }
        //bail out on error
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }

        //step 7: verify
        if(compareCaseInsensitiveASCII(b1, b1Len, b3, b3Len) !=0){
            *status = U_IDNA_VERIFICATION_ERROR;
            goto CLEANUP;
        }

        //step 8: return output of step 5
        reqLength = b2Len;
        if(b2Len <= destCapacity) {
            uprv_memmove(dest, b2, b2Len * U_SIZEOF_UCHAR);
        }
    }else{
        //copy the source to destination
        if(srcLength <= destCapacity){
            uprv_memmove(dest,src,srcLength * U_SIZEOF_UCHAR);
        }
        reqLength = srcLength;
    }

CLEANUP:
    if(b1 != b1Stack){
        uprv_free(b1);
    }
    if(b2 != b2Stack){
        uprv_free(b2);
    }
    // b3 can be heap-allocated in step 6 and used to be leaked
    if(b3 != b3Stack){
        uprv_free(b3);
    }
    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
/**
 * Finds the end of the label that starts at src.
 *
 * @param src       Start of the label.
 * @param srcLength Number of UChars available at src, or -1 if NUL-terminated.
 * @param prep      Transform whose isLabelSeparator() identifies separators.
 * @param limit     Receives the position just past the separator that ended
 *                  the label, or the end of the input if none was found.
 * @param done      Set to TRUE when the end of the input was reached without
 *                  finding a separator; left untouched otherwise.
 * @param status    Passed to isLabelSeparator().
 * @return Length of the label, not counting the separator.
 */
static int32_t
getNextSeparator(UChar *src,int32_t srcLength,NamePrepTransform* prep,
                 UChar **limit,
                 UBool *done,
                 UErrorCode *status){
    int32_t pos = 0;

    if(srcLength == -1){
        // NUL-terminated input: scan until a separator or the terminator
        while(src[pos] != 0){
            if(prep->isLabelSeparator(src[pos],*status)){
                *limit = src + (pos+1); // go past the delimiter
                return pos;
            }
            pos++;
        }
        *limit = src + pos; // point to null
        *done = TRUE;
        return pos;
    }

    // counted input: scan at most srcLength units
    while(pos < srcLength){
        if(prep->isLabelSeparator(src[pos],*status)){
            *limit = src + (pos+1); // go past the delimiter
            return pos;
        }
        pos++;
    }
    // we have not found the delimiter
    if(pos == srcLength){
        *limit = src+srcLength;
        *done = TRUE;
    }
    return pos;
}
/**
 * Applies idnaref_toASCII() to every label of a domain name.
 *
 * Labels are located with getNextSeparator(). The converted labels are written
 * to dest, each label that was followed by a separator in the input being
 * followed by FULL_STOP in the output.
 *
 * @param src          Input domain name.
 * @param srcLength    Number of UChars in src, or -1 if NUL-terminated.
 * @param dest         Output buffer.
 * @param destCapacity Number of UChars that fit into dest.
 * @param options      Passed to idnaref_toASCII() for every label.
 * @param parseError   Passed to idnaref_toASCII() for every label.
 * @param status       In/out error code; conversion stops at the first label
 *                     that fails.
 * @return The value returned by u_terminateUChars() for the output length.
 */
int32_t
idnaref_IDNToASCII(  const UChar* src, int32_t srcLength,
                     UChar* dest, int32_t destCapacity,
                     int32_t options,
                     UParseError* parseError,
                     UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t reqLength = 0;
    NamePrepTransform* prep = getInstance(*status);

    //initialize pointers to stack buffers
    UChar b1Stack[MAX_LABEL_BUFFER_SIZE];
    UChar *b1 = b1Stack;
    int32_t b1Len = 0, labelLen = 0, tempLen = 0;
    UChar* delimiter = (UChar*)src;
    UChar* labelStart = (UChar*)src;
    int32_t remainingLen = srcLength;   // stays -1 for NUL-terminated input
    int32_t b1Capacity = MAX_LABEL_BUFFER_SIZE;
    UBool done = FALSE;

    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    for(;;){
        // stop when the previous label ended at the end of the input
        if(srcLength == -1 ? (*delimiter == 0) : (delimiter == src+srcLength)){
            break;
        }

        labelLen = getNextSeparator(labelStart, remainingLen, prep, &delimiter, &done, status);

        b1Len = idnaref_toASCII(labelStart, labelLen, b1, b1Capacity,
                                options, parseError, status);
        if(*status == U_BUFFER_OVERFLOW_ERROR){
            /* we do not have enough room so grow the buffer and convert again;
             * a buffer grown for an earlier label is released first */
            if(b1 != b1Stack){
                uprv_free(b1);
            }
            b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }
            b1Capacity = b1Len;
            *status = U_ZERO_ERROR; // reset error
            b1Len = idnaref_toASCII(labelStart, labelLen, b1, b1Len,
                                    options, parseError, status);
        }
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }

        tempLen = (reqLength + b1Len );
        // copy to dest; a label that exactly fills dest must be copied too,
        // because a result of exactly destCapacity units is not an overflow
        if(b1Len > 0 && tempLen <= destCapacity){
            uprv_memmove(dest+reqLength, b1, b1Len * U_SIZEOF_UCHAR);
        }
        reqLength = tempLen;

        // add the label separator
        if(done == FALSE){
            if(reqLength < destCapacity){
                dest[reqLength] = FULL_STOP;
            }
            reqLength++;
        }

        labelStart = delimiter;
        if(srcLength != -1){
            remainingLen = srcLength - (int32_t)(delimiter - src);
        }
    }

CLEANUP:
    if(b1 != b1Stack){
        uprv_free(b1);
    }
    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
/**
 * Applies idnaref_toUnicode() to every label of a domain name.
 *
 * Labels are located with getNextSeparator(). The converted labels are written
 * to dest, each label that was followed by a separator in the input being
 * followed by FULL_STOP in the output.
 *
 * @param src          Input domain name.
 * @param srcLength    Number of UChars in src, or -1 if NUL-terminated.
 * @param dest         Output buffer.
 * @param destCapacity Number of UChars that fit into dest.
 * @param options      Passed to idnaref_toUnicode() for every label.
 * @param parseError   Passed to idnaref_toUnicode() for every label.
 * @param status       In/out error code; conversion stops at the first label
 *                     that fails.
 * @return The value returned by u_terminateUChars() for the output length.
 */
int32_t
idnaref_IDNToUnicode(  const UChar* src, int32_t srcLength,
                       UChar* dest, int32_t destCapacity,
                       int32_t options,
                       UParseError* parseError,
                       UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t reqLength = 0;
    UBool done = FALSE;
    NamePrepTransform* prep = getInstance(*status);

    //initialize pointers to stack buffers
    UChar b1Stack[MAX_LABEL_BUFFER_SIZE];
    UChar *b1 = b1Stack;
    int32_t b1Len = 0, labelLen = 0, tempLen = 0;
    UChar* delimiter = (UChar*)src;
    UChar* labelStart = (UChar*)src;
    int32_t remainingLen = srcLength;   // stays -1 for NUL-terminated input
    int32_t b1Capacity = MAX_LABEL_BUFFER_SIZE;

    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    for(;;){
        // stop when the previous label ended at the end of the input
        if(srcLength == -1 ? (*delimiter == 0) : (delimiter == src+srcLength)){
            break;
        }

        labelLen = getNextSeparator(labelStart, remainingLen, prep, &delimiter, &done, status);

        b1Len = idnaref_toUnicode(labelStart, labelLen, b1, b1Capacity,
                                  options, parseError, status);
        if(*status == U_BUFFER_OVERFLOW_ERROR){
            /* we do not have enough room so grow the buffer and convert again;
             * a buffer grown for an earlier label is released first */
            if(b1 != b1Stack){
                uprv_free(b1);
            }
            b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }
            b1Capacity = b1Len;
            *status = U_ZERO_ERROR; // reset error
            b1Len = idnaref_toUnicode( labelStart, labelLen, b1, b1Len,
                                       options, parseError, status);
        }
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }

        tempLen = (reqLength + b1Len );
        // copy to dest; a label that exactly fills dest must be copied too,
        // because a result of exactly destCapacity units is not an overflow
        if(b1Len > 0 && tempLen <= destCapacity){
            uprv_memmove(dest+reqLength, b1, b1Len * U_SIZEOF_UCHAR);
        }
        reqLength = tempLen;

        // add the label separator
        if(done == FALSE){
            if(reqLength < destCapacity){
                dest[reqLength] = FULL_STOP;
            }
            reqLength++;
        }

        labelStart = delimiter;
        if(srcLength != -1){
            remainingLen = srcLength - (int32_t)(delimiter - src);
        }
    }

CLEANUP:
    if(b1 != b1Stack){
        uprv_free(b1);
    }
    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
/**
 * Compares two domain names for IDNA equivalence.
 *
 * Both names are converted with idnaref_IDNToASCII() and the ASCII forms are
 * compared with compareCaseInsensitiveASCII().
 *
 * @param s1      First domain name.
 * @param length1 Number of UChars in s1, or -1 if NUL-terminated.
 * @param s2      Second domain name.
 * @param length2 Number of UChars in s2, or -1 if NUL-terminated.
 * @param options Passed to idnaref_IDNToASCII() for both names.
 * @param status  In/out error code.
 * @return The comparison result, or -1 if status indicates a failure on entry
 *         or either conversion fails (check status to tell this apart from a
 *         genuine "less than" result).
 */
int32_t
idnaref_compare(  const UChar *s1, int32_t length1,
                  const UChar *s2, int32_t length2,
                  int32_t options,
                  UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return -1;
    }

    UChar b1Stack[MAX_IDN_BUFFER_SIZE], b2Stack[MAX_IDN_BUFFER_SIZE];
    UChar *b1 = b1Stack, *b2 = b2Stack;
    int32_t b1Len = 0, b2Len = 0,
            b1Capacity = MAX_IDN_BUFFER_SIZE, b2Capacity = MAX_IDN_BUFFER_SIZE;
    // returned as-is on every error path; it used to be uninitialised there
    int32_t result = -1;
    UParseError parseError;

    b1Len = idnaref_IDNToASCII(s1, length1, b1, b1Capacity, options, &parseError, status);
    if(*status == U_BUFFER_OVERFLOW_ERROR){
        /* we do not have enough room so grow the buffer and convert again */
        b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
        if(b1==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
        *status = U_ZERO_ERROR; // reset error
        b1Len = idnaref_IDNToASCII(s1,length1,b1,b1Len, options, &parseError, status);
    }
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    b2Len = idnaref_IDNToASCII(s2,length2,b2,b2Capacity,options, &parseError, status);
    if(*status == U_BUFFER_OVERFLOW_ERROR){
        /* we do not have enough room so grow the buffer and convert again */
        b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
        if(b2==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
        *status = U_ZERO_ERROR; // reset error
        b2Len = idnaref_IDNToASCII(s2,length2,b2,b2Len,options, &parseError, status);
    }
    // only compare buffers that were both converted successfully
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    // when toASCII is applied all label separators are replaced with FULL_STOP
    result = compareCaseInsensitiveASCII(b1,b1Len,b2,b2Len);

CLEANUP:
    if(b1 != b1Stack){
        uprv_free(b1);
    }
    if(b2 != b2Stack){
        uprv_free(b2);
    }
    return result;
}

View file

@ -0,0 +1,226 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: idnaref.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#ifndef __IDNAREF_H__
#define __IDNAREF_H__
#include "unicode/utypes.h"
#include "unicode/parseerr.h"
#define IDNAREF_DEFAULT 0x0000
#define IDNAREF_ALLOW_UNASSIGNED 0x0001
#define IDNAREF_USE_STD3_RULES 0x0002
/**
* This function implements the ToASCII operation as defined in the IDNA draft.
* This operation is done on <b>single labels</b> before sending it to something that expects
* ASCII names. A label is an individual part of a domain name. Labels are usually
* separated by dots; for e.g." "www.example.com" is composed of 3 labels
* "www","example", and "com".
*
*
* @param src Input Unicode label.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output Unicode array with ACE encoded ASCII label.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - IDNAREF_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If set, unassigned values are treated as normal Unicode code points.
* If not set, the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
* - IDNAREF_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If set and the input does not satisfy STD3 rules, the operation
* will fail with U_IDNA_STD3_ASCII_RULES_ERROR
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
* @return Number of ASCII characters converted.
* @draft ICU 2.6
*/
int32_t
idnaref_toASCII(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* This function implements the ToUnicode operation as defined in the IDNA draft.
* This operation is done on <b>single labels</b> before sending it to something that expects
* ASCII names. A label is an individual part of a domain name. Labels are usually
* separated by dots; for e.g." "www.example.com" is composed of 3 labels
* "www","example", and "com".
*
* @param src Input ASCII (ACE encoded) label.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output Converted Unicode array.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - IDNAREF_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If set, unassigned values are treated as normal Unicode code points.
* If not set, the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
* - IDNAREF_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If set and the input does not satisfy STD3 rules, the operation
* will fail with U_IDNA_STD3_ASCII_RULES_ERROR
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
* @return Number of Unicode characters converted.
* @draft ICU 2.6
*/
int32_t
idnaref_toUnicode(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* Convenience function that implements the IDNToASCII operation as defined in the IDNA draft.
* This operation is done on complete domain names, e.g: "www.example.com".
* It is important to note that this operation can fail. If it fails, then the input
* domain name cannot be used as an Internationalized Domain Name and the application
* should have methods defined to deal with the failure.
*
* <b>Note:</b> IDNA draft specifies that a conformant application should divide a domain name
* into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
* and then convert. This function does not offer that level of granularity. The options once
* set will apply to all labels in the domain name
*
* @param src Input Unicode IDN.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output array with the ASCII (ACE encoded) IDN.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - IDNAREF_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If set, unassigned values are treated as normal Unicode code points.
* If not set, the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
* - IDNAREF_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If set and the input does not satisfy STD3 rules, the operation
* will fail with U_IDNA_STD3_ASCII_RULES_ERROR
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
* @return Number of ASCII characters converted.
* @draft ICU 2.6
*/
int32_t
idnaref_IDNToASCII( const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* Convenience function that implements the IDNToUnicode operation as defined in the IDNA draft.
* This operation is done on complete domain names, e.g: "www.example.com".
*
* <b>Note:</b> IDNA draft specifies that a conformant application should divide a domain name
* into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
* and then convert. This function does not offer that level of granularity. The options once
* set will apply to all labels in the domain name
*
* @param src Input ASCII (ACE encoded) IDN.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output array with the converted Unicode IDN.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - IDNAREF_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If set, unassigned values are treated as normal Unicode code points.
* If not set, the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
* - IDNAREF_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If set and the input does not satisfy STD3 rules, the operation
* will fail with U_IDNA_STD3_ASCII_RULES_ERROR
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
* @return Number of Unicode characters converted.
* @draft ICU 2.6
*/
int32_t
idnaref_IDNToUnicode( const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* Compare two strings for IDNs for equivalence.
* This function splits the domain names into labels and compares them.
* According to IDN draft, whenever two labels are compared, they are
* considered equal if and only if their ASCII forms (obtained by
* applying toASCII) match using a case-insensitive ASCII comparison.
* Two domain names are considered a match if and only if all labels
* match regardless of whether label separators match.
*
* @param s1 First source string.
* @param length1 Length of first source string, or -1 if NUL-terminated.
*
* @param s2 Second source string.
* @param length2 Length of second source string, or -1 if NUL-terminated.
* @param options A bit set of options:
*
* - IDNAREF_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If set, unassigned values are treated as normal Unicode code points.
* If not set, the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
* - IDNAREF_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If set and the input does not satisfy STD3 rules, the operation
* will fail with U_IDNA_STD3_ASCII_RULES_ERROR
*
* @param status ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return <0 or 0 or >0 as usual for string comparisons
* @draft ICU 2.6
*/
int32_t
idnaref_compare( const UChar *s1, int32_t length1,
const UChar *s2, int32_t length2,
int32_t options,
UErrorCode* status);
#endif

View file

@ -1291,6 +1291,64 @@ const char* IntlTest::loadTestData(UErrorCode& err){
}
return _testDataPath;
}
/* Cached result of pathToDataDirectory(); NULL until a path has been computed. */
const char* IntlTest::fgDataDir = NULL;
/* Returns the path to icu/source/data, including a trailing file separator.
 * The path is computed on the first call and cached in fgDataDir.
 * Returns NULL if the path cannot be derived (only possible in the
 * __FILE__-based branch below).
 */
const char * IntlTest::pathToDataDirectory()
{
/* Reuse the path computed by an earlier call. */
if(fgDataDir != NULL) {
return fgDataDir;
}
/* U_TOPSRCDIR is set by the makefiles on UNIXes when building cintltst and intltst
 * to point to the top of the build hierarchy, which may or
 * may not be the same as the source directory, depending on
 * the configure options used. At any rate,
 * set the data path to the built data from this directory.
 * The value is complete with quotes, so it can be used
 * as-is as a string constant.
*/
#if defined (U_TOPSRCDIR)
{
fgDataDir = U_TOPSRCDIR U_FILE_SEP_STRING "data" U_FILE_SEP_STRING;
}
#else
/* On Windows, the file name obtained from __FILE__ includes a full path.
 * The example below names cintltst's source file:
 * "wherever\icu\source\test\cintltst\cintltst.c"
 * NOTE(review): this comment looks copied from cintltst; this file presumably
 * sits at the same depth below "source" - confirm.
 * Change to "wherever\icu\source\data"
 */
{
/* static: fgDataDir keeps pointing into this buffer after the function returns */
static char p[sizeof(__FILE__) + 10];
char *pBackSlash;
int i;
strcpy(p, __FILE__);
/* We want to back over three '\' chars. */
/* Only Windows should end up here, so looking for '\' is safe. */
for (i=1; i<=3; i++) {
pBackSlash = strrchr(p, U_FILE_SEP_CHAR);
if (pBackSlash != NULL) {
*pBackSlash = 0; /* Truncate the string at the '\' */
}
}
/* pBackSlash holds the result of the last search only; if that search found
 * no separator, fgDataDir is left NULL. */
if (pBackSlash != NULL) {
/* The last of the three searches found a separator and truncated there.
 * Now append the "data" directory name (with separators on both sides)
 * and remember the result in fgDataDir.
 */
strcpy(pBackSlash, U_FILE_SEP_STRING "data" U_FILE_SEP_STRING );
fgDataDir = p;
}
}
#endif
return fgDataDir;
}
/*
* This is a variant of cintltst/ccolltst.c:CharsToUChars().
* It converts a character string into a UnicodeString, with

View file

@ -942,5 +942,49 @@ SOURCE=.\unhxtrts.cpp
SOURCE=.\unhxtrts.h
# End Source File
# End Group
# Begin Group "idna"
# PROP Default_Filter "*.c,*.h"
# Begin Source File
SOURCE=.\idnaref.cpp
# End Source File
# Begin Source File
SOURCE=.\idnaref.h
# End Source File
# Begin Source File
SOURCE=.\nptrans.cpp
# End Source File
# Begin Source File
SOURCE=.\nptrans.h
# End Source File
# Begin Source File
SOURCE=.\punyref.c
# End Source File
# Begin Source File
SOURCE=.\punyref.h
# End Source File
# Begin Source File
SOURCE=.\testidn.cpp
# End Source File
# Begin Source File
SOURCE=.\testidna.cpp
# End Source File
# Begin Source File
SOURCE=.\testidna.h
# End Source File
# Begin Source File
SOURCE=.\tidnaref.cpp
# End Source File
# End Group
# End Target
# End Project

View file

@ -158,6 +158,8 @@ protected:
public:
static void setICU_DATA(); // Set up ICU_DATA if necessary.
static const char* pathToDataDirectory();
public:
UBool run_phase2( char* name, char* par ); // internally, supports reporting memory leaks
static const char* loadTestData(UErrorCode& err);
@ -165,6 +167,7 @@ public:
// static members
public:
static IntlTest* gTest;
static const char* fgDataDir;
};

View file

@ -30,7 +30,7 @@
#include "tstnorm.h"
#include "canittst.h"
#include "icusvtst.h"
#include "testidna.h"
#define CASE_SUITE(id, suite) case id: \
name = #suite; \
if(exec) { \
@ -148,6 +148,13 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
}
#endif
break;
case 11: name = "idna";
if(exec){
logln("TestSuite IDNA----"); logln();
TestIDNA test;
callTest(test,par);
}
break;
default: name = ""; break;
}
}

View file

@ -0,0 +1,279 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: nptrans.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#include "nptrans.h"
#include "unicode/resbund.h"
#include "unicode/uniset.h"
#include "sprpimpl.h"
#include "cmemory.h"
#include "ustr_imp.h"
#include "intltest.h"
#ifdef DEBUG
#include <stdio.h>
#endif
const char NamePrepTransform::fgClassID=0;

// Shared instance handed out by createInstance(); NULL until construction succeeds.
NamePrepTransform* NamePrepTransform::transform = NULL;

/**
 * Factory method: returns the shared NamePrepTransform, constructing it on
 * first use.
 *
 * @param parseError Receives rule parse error details from construction.
 * @param status     ICU error code. Set to a failure code when the instance
 *                   cannot be allocated or constructed.
 * @return The shared instance, or NULL when construction failed. A failed
 *         attempt leaves no cached instance, so a later call retries.
 */
NamePrepTransform* NamePrepTransform::createInstance(UParseError& parseError, UErrorCode& status){
    if(transform==NULL){
        // Build into a local first so that the static pointer never refers
        // to an object that has been deleted.
        NamePrepTransform* created = new NamePrepTransform(parseError, status);
        if(created==NULL){
            if(U_SUCCESS(status)){
                status = U_MEMORY_ALLOCATION_ERROR;
            }
            return NULL;
        }
        if(U_FAILURE(status)){
            delete created;
            return NULL;
        }
        transform = created;
    }
    return transform;
}
//constructor
// Opens the "idna_rules" bundle in the test data and builds from it the
// mapping transliterator ("MapNFKC") and the three sets ("UnassignedSet",
// "ProhibitedSet", "LabelSeparatorSet"). Failures are reported through status.
NamePrepTransform::NamePrepTransform(UParseError& parseError, UErrorCode& status)
: mapping(NULL), unassigned(), prohibited(), labelSeparatorSet(), bundle(NULL){

    const char* testDataName = IntlTest::loadTestData(status);
    if(U_FAILURE(status)){
        return;
    }

    bundle = ures_openDirect(testDataName,"idna_rules",&status);

    if(U_SUCCESS(status) && bundle != NULL){
        int32_t textLen = 0;
        const UChar* text = NULL;

        // create the mapping transliterator
        text = ures_getStringByKey(bundle, "MapNFKC",&textLen, &status);
        UnicodeString rule(text, textLen);
        mapping = Transliterator::createFromRules("NamePrepTransform", rule,
                                                  UTRANS_FORWARD, parseError,status);

        //create the unassigned set
        textLen = 0;
        text = ures_getStringByKey(bundle,"UnassignedSet",&textLen, &status);
        UnicodeString unassignedPattern(text, textLen);
        unassigned.applyPattern(unassignedPattern, status);

        //create prohibited set
        textLen = 0;
        text = ures_getStringByKey(bundle,"ProhibitedSet",&textLen, &status);
        UnicodeString prohibitedPattern(text, textLen);
        prohibited.applyPattern(prohibitedPattern,status);
#ifdef DEBUG
        if(U_FAILURE(status)){
            printf("Construction of Unicode set failed\n");
        }

        if(U_SUCCESS(status)){
            if(prohibited.contains((UChar) 0x644)){
                printf("The string contains 0x644 ... damn !!\n");
            }
            // dump the pattern of the set that was actually built
            UnicodeString dumped;
            prohibited.toPattern(dumped,TRUE);

            for(int32_t pos=0;pos<dumped.length();pos++){
                printf("%c", (char)dumped.charAt(pos));
            }
            printf("\n");
        }
#endif

        //create label separator set
        textLen = 0;
        text = ures_getStringByKey(bundle,"LabelSeparatorSet",&textLen, &status);
        UnicodeString separatorPattern(text, textLen);
        labelSeparatorSet.applyPattern(separatorPattern,status);
    }

    // No transliterator although nothing reported a failure: flag it as an
    // allocation error and let go of the bundle. mapping is NULL here, so
    // there is nothing else to release.
    if(U_SUCCESS(status) && mapping == NULL){
        status = U_MEMORY_ALLOCATION_ERROR;
        ures_close(bundle);
        bundle = NULL;
    }
}
/**
 * Reports whether ch is a prohibited code point.
 * Uses the same rule that process() applies: the code point must be in the
 * prohibited set loaded from the rules bundle, and ASCII space is never
 * reported as prohibited.
 *
 * @param ch Code point to test.
 * @return TRUE if ch is prohibited, FALSE otherwise.
 */
UBool NamePrepTransform::isProhibited(UChar32 ch){
    return (UBool)(ch != ASCII_SPACE && prohibited.contains(ch));
}
// Destructor: frees the mapping transliterator, then closes the rules
// bundle. Both members are reset to NULL afterwards.
NamePrepTransform::~NamePrepTransform(){
    delete mapping;
    mapping = NULL;

    ures_close(bundle);
    bundle = NULL;
}
/**
 * Runs the mapping transliterator over src and copies the result to dest.
 *
 * @param src             Source text; must not be NULL.
 * @param srcLength       Length of src, or -1 if NUL-terminated.
 * @param dest            Destination buffer; may be NULL only when
 *                        destCapacity is 0.
 * @param destCapacity    Capacity of dest in UChars.
 * @param allowUnassigned If FALSE, a mapped code point that is in the
 *                        unassigned set fails the call with
 *                        U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR.
 * @param parseError      Not used by this function.
 * @param status          ICU error code in/out parameter.
 * @return Length of the mapped text. The text is copied whenever it fits in
 *         dest, including the case where it fills dest exactly and cannot be
 *         NUL-terminated; status is set by u_terminateUChars. Returns 0 on
 *         argument errors and when an unassigned code point is found.
 */
int32_t NamePrepTransform::map(const UChar* src, int32_t srcLength,
                        UChar* dest, int32_t destCapacity,
                        UBool allowUnassigned,
                        UParseError* parseError,
                        UErrorCode& status ){

    if(U_FAILURE(status)){
        return 0;
    }
    //check arguments
    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    UnicodeString rsource(src,srcLength);
    // map the code points
    // transliteration also performs NFKC
    mapping->transliterate(rsource);

    // Read-only view of the result. No releaseBuffer() is needed because no
    // writable buffer was requested from the string.
    const UChar* buffer = rsource.getBuffer();
    int32_t bufLen = rsource.length();

    // check if unassigned
    if(allowUnassigned == FALSE){
        int32_t bufIndex=0;
        UChar32 ch =0 ;
        while(bufIndex<bufLen){
            U16_NEXT(buffer, bufIndex, bufLen, ch);
            if(unassigned.contains(ch)){
                status = U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR;
                return 0;
            }
        }
    }

    // Copy whenever the text fits. A result that fills dest exactly must be
    // copied too: u_terminateUChars then reports a warning, not an error,
    // and the caller is entitled to read bufLen UChars from dest.
    if(bufLen > 0 && bufLen <= destCapacity){
        uprv_memcpy(dest,buffer,bufLen*U_SIZEOF_UCHAR);
    }
    return u_terminateUChars(dest, destCapacity, bufLen, &status);
}
#define MAX_BUFFER_SIZE 300

/**
 * Prepares src: maps it with map(), rejects prohibited code points and
 * applies the bidirectional checks, then copies the result to dest.
 *
 * @param src             Source text; must not be NULL.
 * @param srcLength       Length of src, or -1 if NUL-terminated.
 * @param dest            Destination buffer; may be NULL only when
 *                        destCapacity is 0.
 * @param destCapacity    Capacity of dest in UChars.
 * @param allowUnassigned Passed through to map().
 * @param parseError      Passed through to map().
 * @param status          ICU error code in/out parameter. Set to
 *                        U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR or
 *                        U_IDNA_CHECK_BIDI_ERROR when a check fails.
 * @return Length of the prepared text (0 for argument errors). The value is
 *         only meaningful when status reports success or buffer overflow.
 */
int32_t NamePrepTransform::process( const UChar* src, int32_t srcLength,
                                    UChar* dest, int32_t destCapacity,
                                    UBool allowUnassigned,
                                    UParseError* parseError,
                                    UErrorCode& status ){
    // check error status
    if(U_FAILURE(status)){
        return 0;
    }

    //check arguments
    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // All locals are declared before the first goto so that no jump to
    // CLEANUP crosses an initialization.
    UChar b1Stack[MAX_BUFFER_SIZE];
    UChar *b1 = b1Stack;
    int32_t b1Len,b1Capacity = MAX_BUFFER_SIZE;

    int32_t b1Index = 0;
    UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT;
    UBool leftToRight=FALSE, rightToLeft=FALSE;

    b1Len = map(src,srcLength, b1, b1Capacity,allowUnassigned,parseError, status);

    // Redo the mapping in a larger buffer when the text did not fit, and
    // also when it filled the stack buffer exactly, so that b1 always holds
    // the complete, NUL-terminated text.
    if(status == U_BUFFER_OVERFLOW_ERROR || (U_SUCCESS(status) && b1Len >= b1Capacity)){
        /* we do not have enough room so grow the buffer*/
        if(!u_growBufferFromStatic(b1Stack,&b1,&b1Capacity,b1Len+1,0)){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        status = U_ZERO_ERROR; // reset error

        b1Len = map(src,srcLength, b1, b1Capacity,allowUnassigned, parseError, status);
    }

    if(U_FAILURE(status)){
        goto CLEANUP;
    }

    while(b1Index<b1Len){
        UChar32 ch = 0;

        U16_NEXT(b1, b1Index, b1Len, ch);

        if(prohibited.contains(ch) && ch!=0x0020){
            status = U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR;
            goto CLEANUP;
        }

        // after the loop, direction holds the direction of the last character
        direction = u_charDirection(ch);
        if(firstCharDir==U_CHAR_DIRECTION_COUNT){
            firstCharDir = direction;
        }
        if(direction == U_LEFT_TO_RIGHT){
            leftToRight = TRUE;
        }
        if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){
            rightToLeft = TRUE;
        }
    }

    // satisfy 2: no mixing of left-to-right and right-to-left characters
    if( leftToRight == TRUE && rightToLeft == TRUE){
        status = U_IDNA_CHECK_BIDI_ERROR;
        goto CLEANUP;
    }

    // satisfy 3: when a right-to-left character is present, both the first
    // and the last character must be right-to-left (R or AL). Comparing the
    // two directions for equality is not enough: R..AL is valid, EN..EN is not.
    if( rightToLeft == TRUE &&
        !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) &&
          (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC))
      ){
        status = U_IDNA_CHECK_BIDI_ERROR;
        // leave through CLEANUP so that a grown buffer is released
        goto CLEANUP;
    }

    if(b1Len > 0 && b1Len <= destCapacity){
        uprv_memmove(dest,b1, b1Len*U_SIZEOF_UCHAR);
    }

CLEANUP:
    if(b1!=b1Stack){
        uprv_free(b1);
    }

    return u_terminateUChars(dest, destCapacity, b1Len, &status);
}
// Returns whether ch is in the label separator set. When status already
// holds a failure code the set is not consulted and FALSE is returned.
UBool NamePrepTransform::isLabelSeparator(UChar32 ch, UErrorCode& status){
    if(U_SUCCESS(status)){
        return labelSeparatorSet.contains(ch);
    }
    return FALSE;
}

View file

@ -0,0 +1,154 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: nptrans.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#ifndef NPTRANS_H
#define NPTRANS_H
#include "unicode/utypes.h"
#include "strprep.h"
#include "unicode/uniset.h"
#include "unicode/ures.h"
#include "unicode/translit.h"
#define ASCII_SPACE 0x0020
/**
 * Reference Nameprep implementation used by the IDNA tests.
 * It holds a mapping transliterator and three code point sets (unassigned,
 * prohibited, label separators). Instances are obtained through
 * createInstance(); the constructor is private.
 */
class NamePrepTransform {

private :
    Transliterator *mapping;
    UnicodeSet unassigned;
    UnicodeSet prohibited;
    UnicodeSet labelSeparatorSet;
    UResourceBundle *bundle;
    static NamePrepTransform* transform;
    NamePrepTransform(UParseError& parseError, UErrorCode& status);

public :

    /**
     * Factory method; see the definition in the implementation file.
     *
     * @param parseError Receives rule parse error details.
     * @param status     ICU error code in/out parameter.
     * @return The transform instance, or NULL on failure.
     */
    static NamePrepTransform* createInstance(UParseError& parseError, UErrorCode& status);

    /*
     * The destructor and isProhibited() are defined out of line in the
     * implementation file, so they must not be declared inline here: an
     * inline function has to be defined in every translation unit that
     * uses it.
     */
    ~NamePrepTransform();

    /**
     * Prohibited-character query for a single code point.
     *
     * @param ch Code point to test.
     * @return TRUE if the code point is reported as prohibited.
     */
    UBool isProhibited(UChar32 ch);

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.6
     */
    inline UClassID getDynamicClassID() const { return getStaticClassID(); }

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.6
     */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }

    /**
     * Map every character in the input stream with the mapping character
     * in the mapping table and populate the output stream.
     * For any individual character the mapping table may specify
     * that a character be mapped to nothing, mapped to one
     * other character or to a string of other characters.
     *
     * @param src             Pointer to UChar buffer containing a single label
     * @param srcLength       Number of characters in the source label
     * @param dest            Pointer to the destination buffer to receive the output
     * @param destCapacity    The capacity of destination array
     * @param allowUnassigned Unassigned values can be converted to ASCII for query operations
     *                        If TRUE unassigned values are treated as normal Unicode code point.
     *                        If FALSE the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
     * @param parseError      Parse error information; see the definition for how it is used.
     * @param status          ICU error code in/out parameter.
     *                        Must fulfill U_SUCCESS before the function call.
     * @return The number of UChars in the destination buffer
     *
     */
    int32_t map(const UChar* src, int32_t srcLength,
                UChar* dest, int32_t destCapacity,
                UBool allowUnassigned,
                UParseError* parseError,
                UErrorCode& status );

    /**
     * Prepare the input stream for use. This operation maps, normalizes (NFKC),
     * checks for prohibited and BiDi characters in the order defined by RFC 3454.
     *
     * @param src             Pointer to UChar buffer containing a single label
     * @param srcLength       Number of characters in the source label
     * @param dest            Pointer to the destination buffer to receive the output
     * @param destCapacity    The capacity of destination array
     * @param allowUnassigned Unassigned values can be converted to ASCII for query operations
     *                        If TRUE unassigned values are treated as normal Unicode code point.
     *                        If FALSE the operation fails with U_UNASSIGNED_CODE_POINT error code.
     * @param parseError      Parse error information; see the definition for how it is used.
     * @param status          ICU error code in/out parameter.
     *                        Must fulfill U_SUCCESS before the function call.
     * @return The number of UChars in the destination buffer
     */
    int32_t process(const UChar* src, int32_t srcLength,
                    UChar* dest, int32_t destCapacity,
                    UBool allowUnassigned,
                    UParseError* parseError,
                    UErrorCode& status );

    /**
     * Ascertain if the given code point is a label separator as specified by IDNA
     *
     * @param ch     Code point to test.
     * @param status ICU error code in/out parameter.
     * @return TRUE if the code point is a label separator
     */
    UBool isLabelSeparator(UChar32 ch, UErrorCode& status);

    /**
     * Tests whether the code point is a letter, digit or hyphen (LDH).
     * Defined inline below the class.
     */
    inline UBool isLDHChar(UChar32 ch);

private:
    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;
};
// Returns TRUE for the code points in
// [\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
// (hyphen, ASCII digits, ASCII letters) and FALSE for everything else.
inline UBool NamePrepTransform::isLDHChar(UChar32 ch){
    const UBool isHyphen = (UBool)(ch == 0x002D);
    const UBool isDigit  = (UBool)(0x0030 <= ch && ch <= 0x0039);
    const UBool isUpper  = (UBool)(0x0041 <= ch && ch <= 0x005A);
    const UBool isLower  = (UBool)(0x0061 <= ch && ch <= 0x007A);

    if(isHyphen || isDigit || isUpper || isLower){
        return TRUE;
    }
    return FALSE;
}
#endif
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/

View file

@ -0,0 +1,264 @@
/*
punycode.c 0.4.0 (2001-Nov-17-Sat)
http://www.cs.berkeley.edu/~amc/idn/
Adam M. Costello
http://www.nicemice.net/amc/
*/
/**********************************************************/
/* Implementation (would normally go in its own .c file): */
#include <string.h>
#include "punyref.h"
/*** Bootstring parameters for Punycode ***/
/* base, tmin, tmax, skew and damp are the constants used by adapt() and */
/* by the threshold computation in the encoder and decoder. */
/* initial_bias and initial_n are the starting values of bias and n. */
/* initial_n (0x80) is the smallest code point that basic() rejects. */
/* delimiter (0x2D) is the ASCII hyphen-minus. */
enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };
/* basic(cp) tests whether cp is a basic code point: */
/* (the cast makes negative char values compare as large, non-basic values) */
#define basic(cp) ((punycode_uint)(cp) < 0x80)
/* delim(cp) tests whether cp is a delimiter: */
#define delim(cp) ((cp) == delimiter)
/* decode_digit(cp) maps a basic code point to the digit value it */
/* represents: '0'..'9' give 26..35, 'A'..'Z' and 'a'..'z' give 0..25. */
/* Any other code point yields base, meaning "not a digit". The range */
/* tests rely on unsigned wrap-around of the subtraction. */
static punycode_uint decode_digit(punycode_uint cp)
{
    if (cp - 48 < 10) {
        return cp - 22;
    }
    if (cp - 65 < 26) {
        return cp - 65;
    }
    if (cp - 97 < 26) {
        return cp - 97;
    }
    return base;
}
/* encode_digit(d,flag) returns the basic code point that stands for */
/* digit value d, which needs to be in the range 0 to base-1: */
/*     0..25 map to ASCII a..z or A..Z */
/*    26..35 map to ASCII 0..9 */
/* The lowercase form is produced unless flag is nonzero, in which */
/* case 32 is subtracted to get the uppercase form. The behavior is */
/* undefined if flag is nonzero and digit d has no uppercase form. */
static char encode_digit(punycode_uint d, int flag)
{
    int code = (char) d + 22;

    if (d < 26) {
        code += 75;
    }
    if (flag != 0) {
        code -= 32;
    }
    return (char) code;
}
/* flagged(bcp) tests whether a basic code point is flagged */
/* (uppercase). The behavior is undefined if bcp is not a */
/* basic code point. */
#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
/* encode_basic(bcp,flag) returns bcp forced to lowercase when flag */
/* is zero and to uppercase when flag is nonzero. Caseless code */
/* points come back unchanged. The behavior is undefined if bcp is */
/* not a basic code point. */
static char encode_basic(punycode_uint bcp, int flag)
{
    int result;

    /* first fold a lowercase letter to uppercase */
    if (bcp - 97 < 26) {
        bcp -= 32;
    }
    result = (char) bcp;
    /* then move an uppercase letter back down when no flag is set */
    if (!flag && (bcp - 65 < 26)) {
        result += 32;
    }
    return (char) result;
}
/*** Platform-specific constants ***/
/* maxint is the largest value a punycode_uint can hold; converting */
/* -1 to the unsigned type produces exactly that value. */
static const punycode_uint maxint = -1;
/*** Bias adaptation function ***/
/* Computes the new bias from the delta that was just coded, the */
/* number of code points handled so far (numpoints) and whether this */
/* is the first adaptation (firsttime). */
static punycode_uint adapt(
    punycode_uint delta, punycode_uint numpoints, int firsttime )
{
    punycode_uint k = 0;

    if (firsttime) {
        delta = delta / damp;
    } else {
        /* delta >> 1 is a faster way of doing delta / 2 */
        delta = delta >> 1;
    }
    delta += delta / numpoints;

    while (delta > ((base - tmin) * tmax) / 2) {
        delta /= base - tmin;
        k += base;
    }
    return k + (base - tmin + 1) * delta / (delta + skew);
}
/*** Main encode function ***/
/* Encodes input[0..input_length-1] into output[]. */
/* *output_length is read as the capacity of output[] and, on success, */
/* is overwritten with the number of characters written. */
/* case_flags may be a null pointer; otherwise case_flags[j] selects the */
/* case used for the character(s) produced for input[j]. */
/* Returns punycode_success, punycode_big_output when output[] is too */
/* small, or punycode_overflow when delta would not fit in punycode_uint. */
enum punycode_status punycode_encode(
punycode_uint input_length,
const punycode_uint input[],
const unsigned char case_flags[],
punycode_uint *output_length,
char output[] )
{
punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t;
/* Initialize the state: */
n = initial_n;
delta = out = 0;
max_out = *output_length;
bias = initial_bias;
/* Handle the basic code points: */
for (j = 0; j < input_length; ++j) {
if (basic(input[j])) {
/* keep one slot in reserve (the delimiter is written right after this loop) */
if (max_out - out < 2) return punycode_big_output;
output[out++] = (char)
(case_flags ? encode_basic(input[j], case_flags[j]) : input[j]);
}
/* else if (input[j] < n) return punycode_bad_input; */
/* (not needed for Punycode with unsigned code points) */
}
h = b = out;
/* h is the number of code points that have been handled, b is the */
/* number of basic code points, and out is the number of characters */
/* that have been output. */
if (b > 0) output[out++] = delimiter;
/* Main encoding loop: */
while (h < input_length) {
/* All non-basic code points < n have been */
/* handled already. Find the next larger one: */
for (m = maxint, j = 0; j < input_length; ++j) {
/* if (basic(input[j])) continue; */
/* (not needed for Punycode) */
if (input[j] >= n && input[j] < m) m = input[j];
}
/* Increase delta enough to advance the decoder's */
/* <n,i> state to <m,0>, but guard against overflow: */
if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
delta += (m - n) * (h + 1);
n = m;
for (j = 0; j < input_length; ++j) {
/* Punycode does not need to check whether input[j] is basic: */
if (input[j] < n /* || basic(input[j]) */ ) {
/* a wrap-around to 0 means delta no longer fits */
if (++delta == 0) return punycode_overflow;
}
if (input[j] == n) {
/* Represent delta as a generalized variable-length integer: */
for (q = delta, k = base; ; k += base) {
if (out >= max_out) return punycode_big_output;
t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
k >= bias + tmax ? tmax : k - bias;
if (q < t) break;
output[out++] = encode_digit(t + (q - t) % (base - t), 0);
q = (q - t) / (base - t);
}
/* the last digit carries the case flag of input[j] */
output[out++] = encode_digit(q, case_flags && case_flags[j]);
bias = adapt(delta, h + 1, h == b);
delta = 0;
++h;
}
}
++delta, ++n;
}
*output_length = out;
return punycode_success;
}
/*** Main decode function ***/
/* Decodes input[0..input_length-1] into code points in output[]. */
/* *output_length is read as the capacity of output[] and, on success, */
/* is overwritten with the number of code points written. */
/* case_flags may be a null pointer; otherwise it receives one flag per */
/* output code point. */
/* Returns punycode_success, punycode_bad_input for malformed input, */
/* punycode_big_output when output[] is too small, or punycode_overflow */
/* when a value would not fit in punycode_uint. */
enum punycode_status punycode_decode(
punycode_uint input_length,
const char input[],
punycode_uint *output_length,
punycode_uint output[],
unsigned char case_flags[] )
{
punycode_uint n, out, i, max_out, bias,
b, j, in, oldi, w, k, digit, t;
/* Initialize the state: */
n = initial_n;
out = i = 0;
max_out = *output_length;
bias = initial_bias;
/* Handle the basic code points: Let b be the number of input code */
/* points before the last delimiter, or 0 if there is none, then */
/* copy the first b code points to the output. */
for (b = j = 0; j < input_length; ++j) if (delim(input[j])) b = j;
if (b > max_out) return punycode_big_output;
for (j = 0; j < b; ++j) {
if (case_flags) case_flags[out] = flagged(input[j]);
if (!basic(input[j])) return punycode_bad_input;
output[out++] = input[j];
}
/* Main decoding loop: Start just after the last delimiter if any */
/* basic code points were copied; start at the beginning otherwise. */
for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) {
/* in is the index of the next character to be consumed, and */
/* out is the number of code points in the output array. */
/* Decode a generalized variable-length integer into delta, */
/* which gets added to i. The overflow checking is easier */
/* if we increase i as we go, then subtract off its starting */
/* value at the end to obtain delta. */
for (oldi = i, w = 1, k = base; ; k += base) {
if (in >= input_length) return punycode_bad_input;
digit = decode_digit(input[in++]);
/* decode_digit() returns base for anything that is not a digit */
if (digit >= base) return punycode_bad_input;
if (digit > (maxint - i) / w) return punycode_overflow;
i += digit * w;
t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
k >= bias + tmax ? tmax : k - bias;
if (digit < t) break;
if (w > maxint / (base - t)) return punycode_overflow;
w *= (base - t);
}
bias = adapt(i - oldi, out + 1, oldi == 0);
/* i was supposed to wrap around from out+1 to 0, */
/* incrementing n each time, so we'll fix that now: */
if (i / (out + 1) > maxint - n) return punycode_overflow;
n += i / (out + 1);
i %= (out + 1);
/* Insert n at position i of the output: */
/* not needed for Punycode: */
/* if (decode_digit(n) <= base) return punycode_invalid_input; */
if (out >= max_out) return punycode_big_output;
if (case_flags) {
/* open a gap at position i; the ranges overlap, hence memmove */
memmove(case_flags + i + 1, case_flags + i, out - i);
/* Case of last character determines uppercase flag: */
case_flags[i] = flagged(input[in - 1]);
}
memmove(output + i + 1, output + i, (out - i) * sizeof *output);
output[i++] = n;
}
*output_length = out;
return punycode_success;
}

View file

@ -0,0 +1,101 @@
/*
punycode.c from draft-ietf-idn-punycode-03
http://www.nicemice.net/idn/
Adam M. Costello
http://www.nicemice.net/amc/
This is ANSI C code (C89) implementing
Punycode (draft-ietf-idn-punycode-03).
*/
#ifndef _PUNYREF_H
#define _PUNYREF_H
/************************************************************/
/* Public interface (would normally go in its own .h file): */
#include <limits.h>
#include "unicode/utypes.h"
/* Status codes returned by punycode_encode() and punycode_decode(). */
enum punycode_status {
punycode_success,
punycode_bad_input, /* Input is invalid. */
punycode_big_output, /* Output would exceed the space provided. */
punycode_overflow /* Input needs wider integers to process. */
};
/* punycode_uint is the unsigned integer type used for code points and */
/* counters. Windows builds always use unsigned long. Elsewhere */
/* unsigned int is chosen when UINT_MAX is at least (1 << 26) - 1, and */
/* unsigned long otherwise. */
/*typedef unsigned long punycode_uint;*/
#if defined(_WIN32) || defined(WIN32)
typedef unsigned long punycode_uint;
#else
# if UINT_MAX >= (1 << 26) - 1
typedef unsigned int punycode_uint;
# else
typedef unsigned long punycode_uint;
# endif
#endif
U_CFUNC enum punycode_status punycode_encode(
punycode_uint input_length,
const punycode_uint input[],
const unsigned char case_flags[],
punycode_uint *output_length,
char output[] );
/* punycode_encode() converts Unicode to Punycode. The input */
/* is represented as an array of Unicode code points (not code */
/* units; surrogate pairs are not allowed), and the output */
/* will be represented as an array of ASCII code points. The */
/* output string is *not* null-terminated; it will contain */
/* zeros if and only if the input contains zeros. (Of course */
/* the caller can leave room for a terminator and add one if */
/* needed.) The input_length is the number of code points in */
/* the input. The output_length is an in/out argument: the */
/* caller passes in the maximum number of code points that it */
/* can receive, and on successful return it will contain the */
/* number of code points actually output. The case_flags array */
/* holds input_length boolean values, where nonzero suggests that */
/* the corresponding Unicode character be forced to uppercase */
/* after being decoded (if possible), and zero suggests that */
/* it be forced to lowercase (if possible). ASCII code points */
/* are encoded literally, except that ASCII letters are forced */
/* to uppercase or lowercase according to the corresponding */
/* uppercase flags. If case_flags is a null pointer then ASCII */
/* letters are left as they are, and other code points are */
/* treated as if their uppercase flags were zero. The return */
/* value can be any of the punycode_status values defined above */
/* except punycode_bad_input; if not punycode_success, then */
/* output_length and output might contain garbage. */
U_CFUNC enum punycode_status punycode_decode(
punycode_uint input_length,
const char input[],
punycode_uint *output_length,
punycode_uint output[],
unsigned char case_flags[] );
/* punycode_decode() converts Punycode to Unicode. The input is */
/* represented as an array of ASCII code points, and the output */
/* will be represented as an array of Unicode code points. The */
/* input_length is the number of code points in the input. The */
/* output_length is an in/out argument: the caller passes in */
/* the maximum number of code points that it can receive, and */
/* on successful return it will contain the actual number of */
/* code points output. The case_flags array needs room for at */
/* least output_length values, or it can be a null pointer if the */
/* case information is not needed. A nonzero flag suggests that */
/* the corresponding Unicode character be forced to uppercase */
/* by the caller (if possible), while zero suggests that it be */
/* forced to lowercase (if possible). ASCII code points are */
/* output already in the proper case, but their flags will be set */
/* appropriately so that applying the flags would be harmless. */
/* The return value can be any of the punycode_status values */
/* defined above; if not punycode_success, then output_length, */
/* output, and case_flags might contain garbage. On success, the */
/* decoder will never need to write an output_length greater than */
/* input_length, because of how the encoding is defined. */
#endif

View file

@ -0,0 +1,513 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: testidn.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003-02-06
* created by: Ram Viswanadha
*
* This program reads the rfc3454_*.txt files,
* parses them, and extracts the data for Nameprep conformance.
* It then preprocesses it and writes a binary file for efficient use
* in various IDNA conversion processes.
*/
#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/putil.h"
#include "cmemory.h"
#include "cstring.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "utrie.h"
#include "umutex.h"
#include "sprpimpl.h"
#include "testidna.h"
#ifdef WIN32
# pragma warning(disable: 4100)
#endif
UBool beVerbose=FALSE, haveCopyright=TRUE;
/* file-scope state and prototypes ------------------------------------------ */
/* isDataLoaded is checked by testAllCodepoints() and compareMapping() */
/* before they read idnTrie. */
static UBool isDataLoaded = FALSE;
static UTrie idnTrie={ 0,0,0,0,0,0,0 };
static UDataMemory *idnData=NULL;
/* dataErrorCode is inspected by testData() right after loadIDNData(). */
static UErrorCode dataErrorCode =U_ZERO_ERROR;
static const uint16_t* mappingData = NULL;
static int32_t indexes[_IDNA_INDEX_TOP]={ 0 };
static void
parseMappings(const char *filename, UBool withNorm, UBool reportError,TestIDNA& test, UErrorCode *pErrorCode);
static void
parseTable(const char *filename, UBool isUnassigned, TestIDNA& test, UErrorCode *pErrorCode);
static UBool loadIDNData(UErrorCode &errorCode);
static UBool cleanup();
static void
compareMapping(uint32_t codepoint, uint32_t* mapping, int32_t mapLength,
UBool withNorm, UErrorCode *status);
static void
compareFlagsForRange(uint32_t start, uint32_t end,
UBool isUnassigned, UErrorCode *status);
static void
testAllCodepoints(TestIDNA& test);
/* Test object used for error reporting by the parser callbacks; */
/* testData() sets it for the duration of a run. */
static TestIDNA* pTestIDNA =NULL;
/* Input files read by testData(): entries 0 and 1 go through */
/* parseTable(), entries 2 and 3 through parseMappings(). */
static const char* fileNames[] = {
"rfc3454_A_1.txt", /* contains unassigned code points */
"rfc3454_C_X.txt", /* contains code points that are prohibited */
"rfc3454_B_1.txt", /* contains case mappings when normalization is turned off */
"rfc3454_B_2.txt", /* contains case mappings when normalization is turned on */
/* "NormalizationCorrections.txt",contains NFKC case mappings which are not included in UTR 21 */
};
/* -------------------------------------------------------------------------- */
/* NOTE(review): options is not referenced in the visible part of this file; */
/* it looks like a leftover from a command-line tool - confirm before removing. */
static UOption options[]={
UOPTION_HELP_H,
UOPTION_HELP_QUESTION_MARK,
UOPTION_VERBOSE,
UOPTION_COPYRIGHT,
UOPTION_DESTDIR,
UOPTION_SOURCEDIR,
{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }
};
/* file definitions */
#define DATA_NAME "uidna"
#define DATA_TYPE "icu"
/* subdirectory of the data directory that holds the rfc3454_*.txt files */
#define MISC_DIR "misc"
/**
 * Checks the loaded IDNA data against the rfc3454_*.txt tables found in the
 * MISC_DIR subdirectory of the ICU data directory, then runs
 * testAllCodepoints().
 *
 * @param test Test object used for logging and error reporting.
 * @return The resulting UErrorCode as an int; a failure code when the data
 *         or one of the input files could not be processed.
 */
extern int
testData(TestIDNA& test) {
    char filename[300];
    const char *srcDir = IntlTest::pathToDataDirectory();
    char *basename = NULL;
    UErrorCode errorCode = U_ZERO_ERROR;
    size_t longestName = 0;
    size_t i;

    /* pathToDataDirectory() returns NULL when it cannot derive a path */
    if(srcDir == NULL){
        test.errln( "Could not determine the path to the data directory\n");
        return U_FILE_ACCESS_ERROR;
    }

    /* Make sure the longest path built below fits into filename:
     * "./" + srcDir + separator + MISC_DIR + separator + file name + NUL
     */
    for(i = 0; i < sizeof(fileNames)/sizeof(fileNames[0]); i++){
        size_t nameLength = uprv_strlen(fileNames[i]);
        if(nameLength > longestName){
            longestName = nameLength;
        }
    }
    if(2 + uprv_strlen(srcDir) + 1 + uprv_strlen(MISC_DIR) + 1 + longestName + 1 > sizeof(filename)){
        test.errln( "The path to the data directory is too long: %s\n", srcDir);
        return U_BUFFER_OVERFLOW_ERROR;
    }

    loadIDNData(errorCode);
    if(U_FAILURE(dataErrorCode)){
        test.errln( "Could not load data. Error: %s\n",u_errorName(dataErrorCode));
        return dataErrorCode;
    }
    if(U_FAILURE(errorCode)){
        test.errln( "Could not load data. Error: %s\n",u_errorName(errorCode));
        return errorCode;
    }

    //initialize
    pTestIDNA = &test;

    /* Prepare the filename beginning with the source dir. A directory name
     * without any separator is made relative to the current directory; every
     * other path, absolute ones included, is used unchanged.
     */
    if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL){
        filename[0] = 0x2E;
        filename[1] = U_FILE_SEP_CHAR;
        uprv_strcpy(filename+2,srcDir);
    }else{
        uprv_strcpy(filename, srcDir);
    }

    /* append the MISC_DIR directory; basename ends up pointing at the
     * position where each file name is written
     */
    basename=filename+uprv_strlen(filename);
    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
        *basename++=U_FILE_SEP_CHAR;
    }
    uprv_strcpy(basename,MISC_DIR);
    basename = basename + uprv_strlen(MISC_DIR);
    *basename++ = U_FILE_SEP_CHAR;

    /* Each step runs only while errorCode is still a success, so on failure
     * filename names the file that could not be processed.
     */

    /* process unassigned */
    uprv_strcpy(basename,fileNames[0]);
    parseTable(filename,TRUE, test,&errorCode);

    /* process prohibited */
    if(U_SUCCESS(errorCode)) {
        uprv_strcpy(basename,fileNames[1]);
        parseTable(filename,FALSE, test, &errorCode);
    }

    /* process mappings */
    if(U_SUCCESS(errorCode)) {
        uprv_strcpy(basename,fileNames[2]);
        parseMappings(filename, FALSE, FALSE,test, &errorCode);
    }
    if(U_SUCCESS(errorCode)) {
        uprv_strcpy(basename,fileNames[3]);
        parseMappings(filename, TRUE, FALSE,test, &errorCode);
    }

    if(U_FAILURE(errorCode)) {
        test.errln( "Could not open file %s for reading \n", filename);
    }else{
        testAllCodepoints(test);
    }

    /* release the data and drop the test pointer on every path */
    cleanup();
    pTestIDNA = NULL;
    return errorCode;
}
/*
 * Line callback for the mapping tables (used by parseMappings()).
 * Field 0 holds the code point, field 1 the code points it maps to.
 * context points at a UBool that says whether the table is the
 * "with normalization" variant. The parsed mapping is checked with
 * compareMapping(); parse failures are reported through pErrorCode and
 * stop the processing of the line.
 */
static void U_CALLCONV
caseMapLineFn(void *context,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode) {
    uint32_t mapping[40];
    char *end, *s;
    uint32_t code;
    int32_t length;
    UBool* mapWithNorm = (UBool*) context;

    /* both field 0 and field 1 are needed below */
    if(fieldCount < 2) {
        *pErrorCode=U_PARSE_ERROR;
        return;
    }

    /* ignore First and Last entries for ranges */
    if( *fields[1][0]=='<' &&
        (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
        (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
    ) {
        return;
    }

    /* get the character code, field 0 */
    code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        *pErrorCode=U_PARSE_ERROR;
        /* do not compare a mapping for a code point that did not parse */
        return;
    }

    s = fields[1][0];
    /* parse the mapping string */
    length=u_parseCodePoints(s, mapping, sizeof(mapping)/sizeof(mapping[0]), pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /* compare the mapping with the loaded data */
    compareMapping(code,mapping, length, *mapWithNorm, pErrorCode);
}
static void
parseMappings(const char *filename,UBool withNorm, UBool reportError, TestIDNA& test, UErrorCode *pErrorCode) {
char *fields[3][2];
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
}
u_parseDelimitedFile(filename, ';', fields, 3, caseMapLineFn, &withNorm, pErrorCode);
//fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);
if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
test.errln( "genidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
}
}
/* parser for the code point tables ----------------------------------------- */
/*
 * Line callback for the unassigned and prohibited tables (used by
 * parseTable()). Field 0 holds a code point or a code point range.
 * context points at a UBool that says whether the table lists unassigned
 * code points. The range is checked with compareFlagsForRange(); a range
 * that cannot be parsed sets U_PARSE_ERROR.
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    int32_t length;
    uint32_t rangeStart=0,rangeEnd =0;
    UBool* isUnassigned = (UBool*) context;

    /* Ignore First and Last entries for ranges. Field 1 only exists when the
     * caller asked for at least two fields; parseTable() asks for one, and
     * reading fields[1] in that case would run past the caller's array.
     */
    if( fieldCount >= 2 &&
        *fields[1][0]=='<' &&
        (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
        (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
    ) {
        return;
    }

    u_parseCodePointRange(fields[0][0], &rangeStart,&rangeEnd, pErrorCode);

    if(U_FAILURE(*pErrorCode)){
        *pErrorCode = U_PARSE_ERROR;
        return;
    }

    compareFlagsForRange(rangeStart,rangeEnd,*isUnassigned, pErrorCode);
}
static void
parseTable(const char *filename,UBool isUnassigned,TestIDNA& test, UErrorCode *pErrorCode) {
char *fields[1][2];
int32_t len=0;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
}
u_parseDelimitedFile(filename, ';', fields, 1, unicodeDataLineFn, &isUnassigned, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
test.errln( "genidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
}
}
/**
 * Walk every code point from 0 through 0x10FFFF, look its value up in idnTrie
 * and tally how many carry each flag. The totals are written with
 * test.logln(). Does nothing unless isDataLoaded is set.
 *
 * @param test test object used for logging and error reporting
 */
static void
testAllCodepoints(TestIDNA& test){
    if(!isDataLoaded){
        return;
    }

    int32_t unassignedCount = 0;
    int32_t prohibitedCount = 0;
    int32_t nfkcMappedCount = 0;
    int32_t mappedCount = 0;
    int32_t emptyCount = 0;

    for(uint32_t cp=0; cp<=0x10FFFF; ++cp){
        uint32_t trieValue = 0;
        UTRIE_GET16(&idnTrie,cp, trieValue);

        if(trieValue == UIDNA_NO_VALUE){
            ++emptyCount;
            if(trieValue > 0){
                test.errln("The return value for 0x%06X is wrong. %i\n",cp,trieValue);
            }
            continue;
        }

        /* the low three bits are the flag, everything above bit 4 the index */
        const uint32_t flagBits = trieValue & 0x07;
        unassignedCount += (flagBits == UIDNA_UNASSIGNED) ? 1 : 0;
        prohibitedCount += (flagBits == UIDNA_PROHIBITED) ? 1 : 0;
        nfkcMappedCount += (flagBits == UIDNA_MAP_NFKC) ? 1 : 0;
        mappedCount     += ((trieValue>>5) == _IDNA_MAP_TO_NOTHING) ? 1 : 0;
    }

    test.logln("Number of Unassinged code points : %i \n",unassignedCount);
    test.logln("Number of Prohibited code points : %i \n",prohibitedCount);
    test.logln("Number of Mapped code points : %i \n",mappedCount);
    test.logln("Number of Mapped with NFKC code points : %i \n",nfkcMappedCount);
    test.logln("Number of code points that have no value in Trie: %i \n",emptyCount);
}
/**
 * Split a trie value into its three packed parts.
 *
 * @param result the packed value
 * @param flag   receives bits 0..2
 * @param length receives bits 3..4
 * @param index  receives all remaining bits (result shifted right by 5)
 */
static inline void getValues(uint32_t result, int8_t& flag,
                             int8_t& length, int32_t& index){
    uint32_t remaining = result;

    /* lowest 3 bits: the flag */
    flag = (int8_t) (remaining & 0x07);
    remaining >>= 3;

    /* following 2 bits: the length */
    length = (int8_t) (remaining & 0x03);
    remaining >>= 2;

    /* whatever is left: the index */
    index = remaining;
}
/**
 * Check the trie entry and the stored mapping for one code point against the
 * expected mapping. Every mismatch is reported through pTestIDNA->errln().
 * Does nothing unless isDataLoaded is set.
 *
 * @param codepoint code point to look up in idnTrie
 * @param mapping   expected mapping, one code point per element
 * @param mapLength number of elements in mapping
 * @param withNorm  if TRUE the flag must be UIDNA_MAP_NFKC; otherwise a
 *                  UIDNA_NO_VALUE or UIDNA_PROHIBITED flag must come with the
 *                  _IDNA_MAP_TO_NOTHING index
 * @param status    not used by this function
 */
static void
compareMapping(uint32_t codepoint, uint32_t* mapping,int32_t mapLength,
               UBool withNorm, UErrorCode *status){
    (void)status;

    if(isDataLoaded){
        uint32_t result = 0;
        UTRIE_GET16(&idnTrie,codepoint, result);

        int8_t flag, length;
        int32_t index;
        getValues(result,flag,length, index);

        if(withNorm){
            if(flag != UIDNA_MAP_NFKC){
                pTestIDNA->errln( "Did not get the assigned flag for codepoint 0x%08X. Expected: %i Got: %i\n",codepoint, UIDNA_MAP_NFKC, flag);
            }
        }else{
            /* compare (==) the flag; the former '=' overwrote it instead */
            if(flag==UIDNA_NO_VALUE || flag == UIDNA_PROHIBITED){
                if(index != _IDNA_MAP_TO_NOTHING ){
                    pTestIDNA->errln( "Did not get the assigned flag for codepoint 0x%08X. Expected: %i Got: %i\n", codepoint, _IDNA_MAP_TO_NOTHING, index);
                }
            }
        }

        /* this length value means the real length is stored in front of the mapping */
        if(length ==_IDNA_LENGTH_IN_MAPPING_TABLE){
            length = (int8_t)mappingData[index];
            index++;
        }

        if(mapLength != length){
            pTestIDNA->errln( "Did not get the expected length. Expected: %i Got: %i\n", mapLength, length);
        }

        /*
         * dataPos walks mappingData separately from i because a supplementary
         * code point occupies two 16-bit units (lead + trail surrogate) there.
         */
        int32_t dataPos = index;
        for(int32_t i =0; i< mapLength; i++){
            if(mapping[i] <= 0xFFFF){
                if(mappingData[dataPos] != (uint16_t)mapping[i]){
                    pTestIDNA->errln("Did not get the expected result. Expected: 0x%04X Got: 0x%04X \n", mapping[i], mappingData[dataPos]);
                }
                dataPos++;
            }else{
                UChar lead = UTF16_LEAD(mapping[i]);
                UChar trail = UTF16_TRAIL(mapping[i]);
                if(mappingData[dataPos] != lead ||
                   mappingData[dataPos+1] != trail){
                    pTestIDNA->errln( "Did not get the expected result. Expected: 0x%04X 0x%04X Got: 0x%04X 0x%04X", lead, trail, mappingData[dataPos], mappingData[dataPos+1]);
                }
                dataPos+=2;
            }
        }
    }
}
/**
 * Check the trie value of every code point in [start, end] and report each
 * mismatch through pTestIDNA->errln(). Does nothing unless isDataLoaded is
 * set.
 *
 * @param start        first code point of the range
 * @param end          last code point of the range (inclusive)
 * @param isUnassigned if TRUE the whole trie value must equal
 *                     UIDNA_UNASSIGNED; otherwise the 3-bit flag must equal
 *                     UIDNA_PROHIBITED
 * @param status       not used by this function
 */
static void
compareFlagsForRange(uint32_t start, uint32_t end,
                     UBool isUnassigned, UErrorCode *status){
    (void)status;

    if(isDataLoaded){
        uint32_t result =0 ;
        while(start < end+1){
            UTRIE_GET16(&idnTrie,start, result);
            if(isUnassigned){
                /* the complete value is compared, not only the flag bits */
                if(result != UIDNA_UNASSIGNED){
                    pTestIDNA->errln( "UIDNA_UASSIGNED flag failed for 0x%06X. Expected: %04X Got: %04X\n",start,UIDNA_UNASSIGNED, result);
                }
            }else{
                /* the flag is the low 3 bits (0x07), as in getValues() */
                if((result & 0x07) != UIDNA_PROHIBITED){
                    pTestIDNA->errln( "UIDNA_PROHIBITED flag failed for 0x%06X. Expected: %04X Got: %04X\n\n",start,UIDNA_PROHIBITED, result);
                }
            }
            start++;
        }
    }
}
/**
 * Release the loaded data, if any, and reset the file-level load state
 * (idnData, isDataLoaded, dataErrorCode).
 *
 * @return always TRUE
 */
UBool
cleanup() {
    if(idnData!=NULL) {
        udata_close(idnData);
        idnData=NULL;
    }

    /* back to the "nothing loaded, no error" state */
    isDataLoaded=FALSE;
    dataErrorCode=U_ZERO_ERROR;

    return TRUE;
}
/**
 * Acceptance callback handed to udata_openChoice(): decides from the data
 * header whether the data item can be used.
 *
 * @param pInfo header information of the candidate data
 * @return TRUE only if size, endianness, charset family, the "IDNA" data
 *         format and format version fields 0, 2 and 3 all match
 */
static UBool U_CALLCONV
isAcceptable(void * /* context */,
             const char * /* type */, const char * /* name */,
             const UDataInfo *pInfo) {
    /* dataFormat="IDNA" 0x49, 0x44, 0x4e, 0x41 */
    static const uint8_t expectedFormat[4]={ 0x49, 0x44, 0x4e, 0x41 };

    if(pInfo->size<20) {
        return FALSE;
    }
    if(pInfo->isBigEndian!=U_IS_BIG_ENDIAN) {
        return FALSE;
    }
    if(pInfo->charsetFamily!=U_CHARSET_FAMILY) {
        return FALSE;
    }
    for(int32_t i=0; i<4; ++i) {
        if(pInfo->dataFormat[i]!=expectedFormat[i]) {
            return FALSE;
        }
    }
    if(pInfo->formatVersion[0]!=2) {
        return FALSE;
    }
    if(pInfo->formatVersion[2]!=UTRIE_SHIFT) {
        return FALSE;
    }
    if(pInfo->formatVersion[3]!=UTRIE_INDEX_SHIFT) {
        return FALSE;
    }
    return TRUE;
}
/*
 * idnTrie folding-offset callback (installed by loadIDNData()):
 * when bit 15 of the value is set, the offset is the low 15 bits of the value;
 * otherwise there is no offset (0).
 */
static int32_t U_CALLCONV
getFoldingOffset(uint32_t data) {
    return (data&0x8000)!=0 ? (int32_t)(data&0x7fff) : 0;
}
/**
 * Load the IDNA data once and publish it in the file-level variables
 * idnData, indexes, idnTrie and mappingData.
 *
 * If isDataLoaded is already set, nothing is done. Otherwise the data is
 * opened with udata_openChoice() (validated by isAcceptable), the trie that
 * follows the int32_t index array is unserialized, and the results are stored
 * while holding the global mutex (umtx_lock(NULL)).
 *
 * @param errorCode in/out ICU error code; a failure on entry makes the
 *                  function return without loading. Failures from opening or
 *                  unserializing are also copied to dataErrorCode.
 * @return the value of isDataLoaded (FALSE on any failure)
 */
static UBool
loadIDNData(UErrorCode &errorCode) {
/* load the IDNA data from file (only if that has not happened yet) */
if(isDataLoaded==FALSE) {
UTrie _idnTrie={ 0,0,0,0,0,0,0 };
UDataMemory *data;
const int32_t *p=NULL;
const uint8_t *pb;
/* NOTE(review): errorCode is a reference, so the address test looks like a leftover from a pointer-based version - confirm */
if(&errorCode==NULL || U_FAILURE(errorCode)) {
return 0;
}
/* open the data outside the mutex block */
/* NOTE(review): the path is a hard-coded absolute Windows directory; this can only work on a machine that has it - confirm the intended location */
data=udata_openChoice("c:\\work\\devicu\\idn\\genidn\\icudt26l", DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
dataErrorCode=errorCode;
if(U_FAILURE(errorCode)) {
return isDataLoaded=FALSE;
}
/* layout used below: int32_t indexes first, the serialized trie starts at index _IDNA_INDEX_TOP */
p=(const int32_t *)udata_getMemory(data);
pb=(const uint8_t *)(p+_IDNA_INDEX_TOP);
utrie_unserialize(&_idnTrie, pb, p[_IDNA_INDEX_TRIE_SIZE], &errorCode);
/* the callback is installed before the error check; on failure the local trie is simply discarded */
_idnTrie.getFoldingOffset=getFoldingOffset;
if(U_FAILURE(errorCode)) {
dataErrorCode=errorCode;
udata_close(data);
return isDataLoaded=FALSE;
}
/* in the mutex block, set the data for this process */
umtx_lock(NULL);
if(idnData==NULL) {
/* first one in: hand ownership of data to idnData and copy indexes and trie */
idnData=data;
data=NULL;
uprv_memcpy(&indexes, p, sizeof(indexes));
uprv_memcpy(&idnTrie, &_idnTrie, sizeof(UTrie));
} else {
/* idnData was already set: use its memory and keep data for closing below */
p=(const int32_t *)udata_getMemory(idnData);
}
umtx_unlock(NULL);
/* initialize some variables */
/* the mapping data starts indexes[_IDNA_INDEX_TRIE_SIZE] bytes after the start of the trie */
mappingData=(uint16_t *)((uint8_t *)(p+_IDNA_INDEX_TOP)+indexes[_IDNA_INDEX_TRIE_SIZE]);
isDataLoaded = TRUE;
/* if a different thread set it first, then close the extra data */
if(data!=NULL) {
udata_close(data); /* NULL if it was set correctly */
}
}
return isDataLoaded;
}
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,96 @@
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: testidna.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#ifndef TESTIDNA_H
#define TESTIDNA_H
#include "sprpimpl.h"
#include "intltest.h"
#include "unicode/parseerr.h"
/*
 * Pointer to a conversion function under test: reads src/srcLength, writes
 * into dest/destCapacity, and takes options, a UParseError and a UErrorCode.
 * The TestIDNA helpers (testToASCII, testToUnicode, testIDNToASCII,
 * testIDNToUnicode, ...) receive the function to exercise through this type.
 * NOTE(review): the int32_t result is presumably the output length - confirm
 * against the functions that are passed in.
 */
typedef int32_t
(*TestFunc) ( const UChar *src, int32_t srcLength,
UChar *dest, int32_t destCapacity,
int32_t options, UParseError *parseError,
UErrorCode *status);
/*
 * Pointer to a comparison function under test: takes two strings with their
 * lengths, options and a UErrorCode. Used by TestIDNA::testCompare and
 * TestIDNA::testRootLabelSeparator.
 * NOTE(review): the meaning of the int32_t result (presumably 0 for equal) is
 * not visible in this header - confirm.
 */
typedef int32_t
(*CompareFunc) (const UChar *s1, int32_t s1Len,
const UChar *s2, int32_t s2Len,
int32_t options,
UErrorCode *status);
// test the API
/**
 * @test
 * @summary Tests for IDNA
 */
class TestIDNA : public IntlTest {
public:
// IntlTest entry point for the indexed test framework
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=NULL);
// individual test cases
void TestDataFile();
void TestToASCII();
void TestToUnicode();
void TestIDNToUnicode();
void TestIDNToASCII();
void TestCompare();
void TestErrorCases();
void TestChaining();
void TestRootLabelSeparator();
void TestCompareReferenceImpl();
void TestRefIDNA();
void TestIDNAMonkeyTest();
private:
// Parameterized helpers: each takes the function(s) to exercise as
// TestFunc/CompareFunc pointers together with a name for them.
// NOTE(review): presumably this lets the same checks run against more than
// one implementation (cf. TestRefIDNA) - confirm in the .cpp file.
void testToASCII(const char* testName, TestFunc func);
void testToUnicode(const char* testName, TestFunc func);
void testIDNToUnicode(const char* testName, TestFunc func);
void testIDNToASCII(const char* testName, TestFunc func);
void testCompare(const char* testName, CompareFunc func);
void testChaining(const char* toASCIIName, TestFunc toASCII,
const char* toUnicodeName, TestFunc toUnicode);
// main testing functions
void testAPI(const UChar *src, const UChar *expected, const char *testName,
UBool useSTD3ASCIIRules, UErrorCode expectedStatus,
UBool doCompare, TestFunc func);
void testCompare(const UChar* s1, int32_t s1Len,
const UChar* s2, int32_t s2Len,
const char* testName, CompareFunc func,
UBool isEqual);
void testErrorCases(const char* toASCIIName, TestFunc toASCII,
const char* IDNToASCIIName, TestFunc IDNToASCII,
const char* IDNToUnicodeName, TestFunc IDNToUnicode);
void testChaining(UChar* src,int32_t numIterations,const char* testName,
UBool useSTD3ASCIIRules, UBool caseInsensitive, TestFunc func);
void testRootLabelSeparator(const char* testName, CompareFunc func,
const char* IDNToASCIIName, TestFunc IDNToASCII,
const char* IDNToUnicodeName, TestFunc IDNToUnicode);
void testCompareReferenceImpl(const UChar* src, int32_t srcLen);
};
// test the TRIE data structure
// Free function (not a TestIDNA member); the TestIDNA object is passed in,
// presumably for error reporting.
// NOTE(review): the meaning of the int return value is not visible in this
// header - confirm against the definition.
int testData(TestIDNA& test);
#endif