mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 15:42:14 +00:00
ICU-2194 tests for IDNA
X-SVN-Rev: 11196
This commit is contained in:
parent
71eb8f87f1
commit
267d3d1f30
14 changed files with 3865 additions and 2 deletions
|
@ -40,7 +40,7 @@ itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o unhxtrts.o hxuntrt
|
|||
ufltlgts.o testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
|
||||
jamotest.o srchtest.o reptest.o regextst.o \
|
||||
itrbnf.o itrbnfrt.o tstdtmod.o testdata.o datamap.o ucaconf.o icusvtst.o \
|
||||
uobjtest.o
|
||||
uobjtest.o idnaref.o nptrans.o punyref.o testidn.o testidna.o
|
||||
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
|
976
icu4c/source/test/intltest/idnaref.cpp
Normal file
976
icu4c/source/test/intltest/idnaref.cpp
Normal file
|
@ -0,0 +1,976 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: idnaref.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003feb1
|
||||
* created by: Ram Viswanadha
|
||||
*/
|
||||
#include "idnaref.h"
|
||||
#include "strprep.h"
|
||||
#include "punyref.h"
|
||||
#include "ustr_imp.h"
|
||||
#include "cmemory.h"
|
||||
#include "sprpimpl.h"
|
||||
#include "nptrans.h"
|
||||
//#include "punyref.h"
|
||||
#include "punycode.h"
|
||||
#include "unicode/ustring.h"
|
||||
|
||||
/* The official IDNA ACE prefix is "xn--"; it is stored here in upper case ("XN--") */
|
||||
static const UChar ACE_PREFIX[] ={ 0x0058,0x004E,0x002d,0x002d } ;
|
||||
#define ACE_PREFIX_LENGTH 4
|
||||
|
||||
#define MAX_LABEL_LENGTH 63
|
||||
#define HYPHEN 0x002D
|
||||
/* The Max length of the labels should not be more than 64 */
|
||||
#define MAX_LABEL_BUFFER_SIZE 100
|
||||
#define MAX_IDN_BUFFER_SIZE 300
|
||||
|
||||
#define CAPITAL_A 0x0041
|
||||
#define CAPITAL_Z 0x005A
|
||||
#define LOWER_CASE_DELTA 0x0020
|
||||
#define FULL_STOP 0x002E
|
||||
|
||||
|
||||
static NamePrepTransform* prep = NULL;
|
||||
|
||||
// Returns the file-wide NamePrepTransform, creating it on first use.
// If creation yields NULL, status is set to U_MEMORY_ALLOCATION_ERROR
// and NULL is returned; a later call will try to create it again.
static NamePrepTransform* getInstance(UErrorCode& status){
    if(prep != NULL){
        // already created by an earlier call
        return prep;
    }

    UParseError parseError;
    prep = NamePrepTransform::createInstance(parseError, status);
    if(prep != NULL){
        return prep;
    }

    status = U_MEMORY_ALLOCATION_ERROR;
    return NULL;
}
|
||||
|
||||
// Tells whether the first ACE_PREFIX_LENGTH code units of src match
// ACE_PREFIX once each unit has been upper-cased with u_toupper().
// Strings shorter than the prefix never match.
inline static UBool
startsWithPrefix(const UChar* src , int32_t srcLength){
    if(srcLength < ACE_PREFIX_LENGTH){
        return FALSE;
    }

    int32_t idx = 0;
    while(idx < ACE_PREFIX_LENGTH){
        if(u_toupper(src[idx]) != ACE_PREFIX[idx]){
            return FALSE;
        }
        ++idx;
    }
    return TRUE;
}
|
||||
|
||||
// Maps the code units CAPITAL_A..CAPITAL_Z to their lower-case
// counterparts by adding LOWER_CASE_DELTA; every other value is
// returned unchanged.
inline static UChar
toASCIILower(UChar ch){
    const UBool isUpperCaseASCII = (UBool)(ch >= CAPITAL_A && ch <= CAPITAL_Z);
    return isUpperCaseASCII ? (UChar)(ch + LOWER_CASE_DELTA) : ch;
}
|
||||
|
||||
// Compares two strings while ignoring ASCII case.
// Returns 0 when both have the same length and every pair of code units
// is equal after toASCIILower(). When the lengths differ the larger
// length is returned (always non-zero); otherwise the difference of the
// first mismatching lower-cased pair is returned.
inline static int32_t
compareCaseInsensitiveASCII(const UChar* s1, int32_t s1Len,
                            const UChar* s2, int32_t s2Len){
    if(s1Len != s2Len){
        return (s1Len > s2Len) ? s1Len : s2Len;
    }

    for(int32_t pos = 0; pos != s1Len; ++pos){
        const UChar left  = s1[pos];
        const UChar right = s2[pos];

        if(left == right){
            continue;
        }

        /* differing code units may still be equal ignoring ASCII case */
        const int32_t diff = (int32_t)toASCIILower(left) - (int32_t)toASCIILower(right);
        if(diff != 0){
            return diff;
        }
    }

    /* reached the end of both strings without a mismatch */
    return 0;
}
|
||||
|
||||
// Translates a status code of the reference Punycode implementation
// into the UErrorCode used by the rest of this file. Any status that is
// not listed maps to U_INTERNAL_PROGRAM_ERROR.
static UErrorCode getError(enum punycode_status status){
    if(status == punycode_success){
        return U_ZERO_ERROR;
    }
    if(status == punycode_bad_input){      /* Input is invalid. */
        return U_INVALID_CHAR_FOUND;
    }
    if(status == punycode_big_output){     /* Output would exceed the space provided. */
        return U_BUFFER_OVERFLOW_ERROR;
    }
    if(status == punycode_overflow){       /* Input requires wider integers to process. */
        return U_INDEX_OUTOFBOUNDS_ERROR;
    }
    return U_INTERNAL_PROGRAM_ERROR;
}
|
||||
|
||||
// wrapper around the reference Punycode implementation
|
||||
static int32_t convertToPuny(const UChar* src, int32_t srcLength,
|
||||
UChar* dest, int32_t destCapacity,
|
||||
UErrorCode& status){
|
||||
uint32_t b1Stack[MAX_LABEL_BUFFER_SIZE];
|
||||
int32_t b1Len = 0, b1Capacity = MAX_LABEL_BUFFER_SIZE;
|
||||
uint32_t* b1 = b1Stack;
|
||||
char b2Stack[MAX_LABEL_BUFFER_SIZE];
|
||||
char* b2 = b2Stack;
|
||||
int32_t b2Len =MAX_LABEL_BUFFER_SIZE ;
|
||||
punycode_status error;
|
||||
unsigned char* caseFlags = NULL;
|
||||
|
||||
u_strToUTF32((UChar32*)b1,b1Capacity,&b1Len,src,srcLength,&status);
|
||||
if(status == U_BUFFER_OVERFLOW_ERROR){
|
||||
// redo processing of string
|
||||
/* we do not have enough room so grow the buffer*/
|
||||
b1 = (uint32_t*) uprv_malloc(b1Len * sizeof(uint32_t));
|
||||
if(b1==NULL){
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
status = U_ZERO_ERROR; // reset error
|
||||
|
||||
u_strToUTF32((UChar32*)b1,b1Len,&b1Len,src,srcLength,&status);
|
||||
}
|
||||
if(U_FAILURE(status)){
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
caseFlags = (unsigned char*) uprv_malloc(b1Len *sizeof(unsigned char));
|
||||
|
||||
error = punycode_encode(b1Len,b1,caseFlags, (uint32_t*)&b2Len, b2);
|
||||
status = getError(error);
|
||||
|
||||
if(status == U_BUFFER_OVERFLOW_ERROR){
|
||||
/* we do not have enough room so grow the buffer*/
|
||||
b2 = (char*) uprv_malloc( b2Len * sizeof(char));
|
||||
if(b2==NULL){
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
status = U_ZERO_ERROR; // reset error
|
||||
|
||||
punycode_status error = punycode_encode(b1Len,b1,caseFlags, (uint32_t*)&b2Len, b2);
|
||||
status = getError(error);
|
||||
}
|
||||
if(U_FAILURE(status)){
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
if(b2Len < destCapacity){
|
||||
u_charsToUChars(b2,dest,b2Len);
|
||||
}else{
|
||||
status =U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
if(b1Stack != b1){
|
||||
uprv_free(b1);
|
||||
}
|
||||
if(b2Stack != b2){
|
||||
uprv_free(b2);
|
||||
}
|
||||
uprv_free(caseFlags);
|
||||
|
||||
return b2Len;
|
||||
}
|
||||
// Wrapper around the reference Punycode decoder.
//
// Narrows src to chars, decodes it with punycode_decode() into UTF-32 and
// converts the result to UTF-16 in dest.
//
// @param src           Punycode text (without the ACE prefix) as UChars.
// @param srcLength     number of UChars in src.
// @param dest          receives the decoded UTF-16 text.
// @param destCapacity  capacity of dest in UChars.
// @param status        receives the mapped Punycode error or the status of
//                      u_strFromUTF32().
// @return the length reported by u_strFromUTF32(), or 0 if decoding failed.
static int32_t convertFromPuny(  const UChar* src, int32_t srcLength,
                                 UChar* dest, int32_t destCapacity,
                                 UErrorCode& status){
    char b1Stack[MAX_LABEL_BUFFER_SIZE];
    char* b1 = b1Stack;
    uint32_t b2Stack[MAX_LABEL_BUFFER_SIZE];
    uint32_t* b2 = b2Stack;
    int32_t b2Len = MAX_LABEL_BUFFER_SIZE;
    int32_t destLen = 0;
    unsigned char* caseFlags = NULL;
    punycode_status error;

    // The narrowed copy needs srcLength bytes; use the heap when the
    // fixed-size stack buffer is too small instead of writing past its end.
    if(srcLength > MAX_LABEL_BUFFER_SIZE){
        b1 = (char*) uprv_malloc(srcLength * sizeof(char));
        if(b1 == NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
    }
    u_UCharsToChars(src, b1,srcLength);

    // one flag byte per input character
    if(srcLength > 0){
        caseFlags = (unsigned char*) uprv_malloc(srcLength * sizeof(unsigned char));
        if(caseFlags == NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
    }

    error = punycode_decode(srcLength,b1,(uint32_t*)&b2Len,b2,caseFlags);
    status = getError(error);
    if(status == U_BUFFER_OVERFLOW_ERROR){
        // Every decoded code point consumes at least one input character,
        // so srcLength code points are always enough for the retry.
        if(b2Len < srcLength){
            b2Len = srcLength;
        }
        b2 = (uint32_t*) uprv_malloc(b2Len * sizeof(uint32_t));
        if(b2 == NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
        error = punycode_decode(srcLength,b1,(uint32_t*)&b2Len,b2,caseFlags);
        status = getError(error);
    }

    if(U_FAILURE(status)){
        goto CLEANUP;
    }

    u_strFromUTF32(dest,destCapacity,&destLen,(UChar32*)b2,b2Len,&status);

CLEANUP:
    if(b1Stack != b1){
        uprv_free(b1);
    }
    if(b2Stack != b2){
        uprv_free(b2);
    }
    uprv_free(caseFlags);

    return destLen;
}
|
||||
|
||||
|
||||
|
||||
/**
 * Reference implementation of the ToASCII operation for a single label.
 *
 * The label is run through the NamePrepTransform; if the result is pure
 * ASCII it is copied to dest, otherwise it is Punycode-encoded and the ACE
 * prefix is prepended.
 *
 * @param src           input label.
 * @param srcLength     number of UChars in src, or -1.
 * @param dest          output buffer.
 * @param destCapacity  capacity of dest in UChars.
 * @param options       bit set of IDNAREF_ALLOW_UNASSIGNED / IDNAREF_USE_STD3_RULES.
 * @param parseError    passed through to NamePrepTransform::process().
 * @param status        in/out error code. Set to U_IDNA_STD3_ASCII_RULES_ERROR,
 *                      U_IDNA_ACE_PREFIX_ERROR, U_IDNA_LABEL_TOO_LONG_ERROR,
 *                      U_BUFFER_OVERFLOW_ERROR or U_MEMORY_ALLOCATION_ERROR by
 *                      this function; other failures come from the helpers.
 * @return the value of u_terminateUChars() for the required length.
 */
int32_t
idnaref_toASCII(const UChar* src, int32_t srcLength,
                UChar* dest, int32_t destCapacity,
                int32_t options,
                UParseError* parseError,
                UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE];
    //initialize pointers to stack buffers
    UChar  *b1 = b1Stack, *b2 = b2Stack;
    int32_t b1Len, b2Len,
            b1Capacity = MAX_LABEL_BUFFER_SIZE,
            b2Capacity = MAX_LABEL_BUFFER_SIZE ,
            reqLength=0;

    //get the options
    UBool allowUnassigned   = options & IDNAREF_ALLOW_UNASSIGNED;
    UBool useSTD3ASCIIRules = (options & IDNAREF_USE_STD3_RULES) >>1;

    // assume the source contains all ascii codepoints
    UBool srcIsASCII  = TRUE;
    // assume the source contains all LDH codepoints
    UBool srcIsLDH = TRUE;
    int32_t j=0;

    // step 2
    NamePrepTransform* prep = getInstance(*status);

    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    b1Len = prep->process(src,srcLength,b1, b1Capacity,allowUnassigned,parseError,*status);

    if(*status == U_BUFFER_OVERFLOW_ERROR){
        // redo processing of string
        /* we do not have enough room so grow the buffer*/
        b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
        if(b1==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        *status = U_ZERO_ERROR; // reset error

        b1Len = prep->process(src,srcLength,b1, b1Len,allowUnassigned, parseError, *status);
    }
    // error bail out
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    // step 3 & 4
    // The flags must accumulate over the whole label: one non-ASCII unit
    // clears srcIsASCII, one non-LDH ASCII unit clears srcIsLDH. Only ASCII
    // code units are subject to the LDH test.
    for( j=0;j<b1Len;j++){
        if(b1[j] > 0x7F){
            srcIsASCII = FALSE;
        }else if(!prep->isLDHChar(b1[j])){
            srcIsLDH = FALSE;
        }
    }

    if(useSTD3ASCIIRules == TRUE){
        // verify 3a and 3b
        // the hyphen tests are skipped for an empty label so that b1[0] and
        // b1[b1Len-1] are never read out of bounds
        if( srcIsLDH == FALSE /* source contains some non-LDH characters */
            || (b1Len > 0 && (b1[0] == HYPHEN || b1[b1Len-1] == HYPHEN))){
            *status = U_IDNA_STD3_ASCII_RULES_ERROR;
            goto CLEANUP;
        }
    }
    if(srcIsASCII){
        if(b1Len <= destCapacity){
            uprv_memmove(dest, b1, b1Len * U_SIZEOF_UCHAR);
            reqLength = b1Len;
        }else{
            // dest is too small; u_terminateUChars() reports the overflow
            reqLength = b1Len;
            goto CLEANUP;
        }
    }else{
        // step 5 : verify the sequence does not begin with ACE prefix
        if(!startsWithPrefix(b1,b1Len)){

            //step 6: encode the sequence with punycode
            b2Len = convertToPuny(b1,b1Len, b2,b2Capacity,*status);
            if(*status == U_BUFFER_OVERFLOW_ERROR){
                // redo processing of string
                /* we do not have enough room so grow the buffer*/
                b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
                if(b2 == NULL){
                    *status = U_MEMORY_ALLOCATION_ERROR;
                    goto CLEANUP;
                }

                *status = U_ZERO_ERROR; // reset error

                b2Len = convertToPuny(b1, b1Len, b2, b2Len, *status);
            }
            //error bail out
            if(U_FAILURE(*status)){
                goto CLEANUP;
            }
            reqLength = b2Len+ACE_PREFIX_LENGTH;

            if(reqLength > destCapacity){
                *status = U_BUFFER_OVERFLOW_ERROR;
                goto CLEANUP;
            }
            //Step 7: prepend the ACE prefix
            uprv_memcpy(dest,ACE_PREFIX,ACE_PREFIX_LENGTH * U_SIZEOF_UCHAR);
            //Step 6: copy the contents in b2 into dest
            uprv_memcpy(dest+ACE_PREFIX_LENGTH, b2, b2Len * U_SIZEOF_UCHAR);

        }else{
            *status = U_IDNA_ACE_PREFIX_ERROR;
            goto CLEANUP;
        }
    }

    if(reqLength > MAX_LABEL_LENGTH){
        *status = U_IDNA_LABEL_TOO_LONG_ERROR;
    }

CLEANUP:
    if(b1 != b1Stack){
        uprv_free(b1);
    }
    if(b2 != b2Stack){
        uprv_free(b2);
    }

    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
|
||||
|
||||
|
||||
/**
 * Reference implementation of the ToUnicode operation for a single label.
 *
 * If the label starts with the ACE prefix, the remainder is Punycode-decoded,
 * the decoded text is converted back with idnaref_toASCII() and compared
 * (ignoring ASCII case) with the prepared input; on a match the decoded text
 * is written to dest. A label without the prefix is copied to dest unchanged.
 *
 * @param src           input label.
 * @param srcLength     number of UChars in src, or -1 if NUL-terminated.
 * @param dest          output buffer.
 * @param destCapacity  capacity of dest in UChars.
 * @param options       bit set of IDNAREF_* options; forwarded to idnaref_toASCII().
 * @param parseError    passed through to the helpers.
 * @param status        in/out error code. U_IDNA_VERIFICATION_ERROR is set when
 *                      the round trip through idnaref_toASCII() does not match.
 * @return the value of u_terminateUChars() for the required length.
 */
int32_t
idnaref_toUnicode(const UChar* src, int32_t srcLength,
                  UChar* dest, int32_t destCapacity,
                  int32_t options,
                  UParseError* parseError,
                  UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE], b3Stack[MAX_LABEL_BUFFER_SIZE];

    //initialize pointers to stack buffers
    UChar  *b1 = b1Stack, *b2 = b2Stack, *b1Prime=NULL, *b3=b3Stack;
    int32_t b1Len, b2Len, b1PrimeLen, b3Len,
            b1Capacity = MAX_LABEL_BUFFER_SIZE,
            b2Capacity = MAX_LABEL_BUFFER_SIZE,
            b3Capacity = MAX_LABEL_BUFFER_SIZE,
            reqLength=0;

    NamePrepTransform* prep = getInstance(*status);
    b1Len = 0;

    UBool srcIsASCII = TRUE;

    //get the options
    UBool allowUnassigned   = options & IDNAREF_ALLOW_UNASSIGNED;

    if(U_FAILURE(*status)){
        goto CLEANUP;
    }
    // step 1: find out if all the codepoints in src are ASCII
    if(srcLength==-1){
        // NUL-terminated input: measure it while scanning
        srcLength = 0;
        for(;src[srcLength]!=0;){
            if(src[srcLength]> 0x7f){
                srcIsASCII = FALSE;
            }
            srcLength++;
        }
    }else{
        for(int32_t j=0; j<srcLength; j++){
            if(src[j]> 0x7f){
                srcIsASCII = FALSE;
            }
        }
    }

    if(srcIsASCII == FALSE){
        // step 2: process the string
        b1Len = prep->process(src,srcLength,b1,b1Capacity,allowUnassigned, parseError, *status);
        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error

            b1Len = prep->process(src,srcLength,b1, b1Len,allowUnassigned, parseError, *status);
        }
        //bail out on error
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }
    }else{

        // copy everything to b1
        if(srcLength < b1Capacity){
            uprv_memmove(b1,src, srcLength * U_SIZEOF_UCHAR);
        }else{
            /* we do not have enough room so grow the buffer*/
            // size the buffer from srcLength: b1Len is still 0 at this point
            b1 = (UChar*) uprv_malloc(srcLength * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }
            uprv_memmove(b1,src, srcLength * U_SIZEOF_UCHAR);
        }
        b1Len = srcLength;
    }
    //step 3: verify ACE Prefix
    // NOTE(review): the prefix is tested on src while it is stripped from b1;
    // confirm whether the prepared text (b1) should be tested instead.
    if(startsWithPrefix(src,srcLength)){

        //step 4: Remove the ACE Prefix
        b1Prime = b1 + ACE_PREFIX_LENGTH;
        b1PrimeLen  = b1Len - ACE_PREFIX_LENGTH;

        //step 5: Decode using punycode
        b2Len = convertFromPuny(b1Prime,b1PrimeLen, b2, b2Capacity, *status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
            if(b2==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error

            b2Len =  convertFromPuny(b1Prime,b1PrimeLen, b2, b2Len, *status);
        }

        //step 6:Apply toASCII
        // (returns immediately if step 5 left a failure in *status)
        b3Len = idnaref_toASCII(b2,b2Len,b3,b3Capacity,options,parseError, status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b3 = (UChar*) uprv_malloc(b3Len * U_SIZEOF_UCHAR);
            if(b3==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error

            b3Len =  idnaref_toASCII(b2,b2Len,b3,b3Len, options, parseError, status);
        }
        //bail out on error
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }

        //step 7: verify
        if(compareCaseInsensitiveASCII(b1, b1Len, b3, b3Len) !=0){
            *status = U_IDNA_VERIFICATION_ERROR;
            goto CLEANUP;
        }

        //step 8: return output of step 5
        reqLength = b2Len;
        if(b2Len <= destCapacity) {
            uprv_memmove(dest, b2, b2Len * U_SIZEOF_UCHAR);
        }
    }else{
        //copy the source to destination
        if(srcLength <= destCapacity){
            uprv_memmove(dest,src,srcLength * U_SIZEOF_UCHAR);
        }
        reqLength = srcLength;
    }

CLEANUP:

    if(b1 != b1Stack){
        uprv_free(b1);
    }
    if(b2 != b2Stack){
        uprv_free(b2);
    }
    // b3 may have been grown on the heap in step 6
    if(b3 != b3Stack){
        uprv_free(b3);
    }

    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
|
||||
|
||||
|
||||
// Finds the end of the label that starts at src.
//
// Returns the number of UChars before the first code unit for which
// prep->isLabelSeparator() is true and sets *limit to the position just
// past that separator. When the input ends first (NUL for
// srcLength == -1, srcLength units otherwise) *limit is set to the end
// of the input and *done to TRUE. *done is never reset to FALSE here.
static int32_t
getNextSeparator(UChar *src,int32_t srcLength,NamePrepTransform* prep,
                 UChar **limit,
                 UBool *done,
                 UErrorCode *status){
    int32_t pos = 0;

    if(srcLength == -1){
        // NUL-terminated input
        while(src[pos] != 0){
            if(prep->isLabelSeparator(src[pos],*status)){
                *limit = src + (pos+1); // go past the delimiter
                return pos;
            }
            pos++;
        }
        *limit = src + pos; // point to null
        *done = TRUE;
        return pos;
    }

    // input with an explicit length
    while(pos < srcLength){
        if(prep->isLabelSeparator(src[pos],*status)){
            *limit = src + (pos+1); // go past the delimiter
            return pos;
        }
        pos++;
    }
    // we have not found the delimiter
    if(pos == srcLength){
        *limit = src+srcLength;
        *done = TRUE;
    }
    return pos;
}
|
||||
|
||||
/**
 * Applies idnaref_toASCII() to every label of a domain name.
 *
 * The input is split with getNextSeparator(); each converted label is
 * appended to dest, and a FULL_STOP is written after every label that was
 * followed by a separator.
 *
 * @param src           input domain name.
 * @param srcLength     number of UChars in src, or -1 if NUL-terminated.
 * @param dest          output buffer.
 * @param destCapacity  capacity of dest in UChars.
 * @param options       bit set of IDNAREF_* options, forwarded per label.
 * @param parseError    forwarded to idnaref_toASCII().
 * @param status        in/out error code; any failure of a label conversion
 *                      stops processing.
 * @return the value of u_terminateUChars() for the required length.
 */
int32_t
idnaref_IDNToASCII(  const UChar* src, int32_t srcLength,
                   UChar* dest, int32_t destCapacity,
                   int32_t options,
                   UParseError* parseError,
                   UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t reqLength = 0;

    NamePrepTransform* prep = getInstance(*status);

    //initialize pointers to stack buffers
    UChar b1Stack[MAX_LABEL_BUFFER_SIZE];
    UChar *b1 = b1Stack;
    int32_t b1Len, labelLen;
    UChar* delimiter = (UChar*)src;
    UChar* labelStart = (UChar*)src;
    int32_t remainingLen = srcLength;   // stays -1 for NUL-terminated input
    int32_t b1Capacity = MAX_LABEL_BUFFER_SIZE;

    UBool done = FALSE;

    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    for(;;){
        // stop once the previous label reached the end of the input
        if(srcLength == -1){
            if(*delimiter == 0){
                break;
            }
        }else if(delimiter == src+srcLength){
            break;
        }

        labelLen = getNextSeparator(labelStart, remainingLen, prep, &delimiter, &done, status);

        b1Len = idnaref_toASCII(labelStart, labelLen, b1, b1Capacity,
                                options, parseError, status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            // release a buffer grown for an earlier label before replacing it
            if(b1 != b1Stack){
                uprv_free(b1);
            }
            b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }
            b1Capacity = b1Len; // later labels can use the larger buffer

            *status = U_ZERO_ERROR; // reset error

            b1Len = idnaref_toASCII(labelStart, labelLen, b1, b1Capacity,
                                    options, parseError, status);
        }

        if(U_FAILURE(*status)){
            goto CLEANUP;
        }
        int32_t tempLen = (reqLength + b1Len );
        // copy to dest
        if( tempLen< destCapacity){
            uprv_memmove(dest+reqLength, b1, b1Len * U_SIZEOF_UCHAR);
        }

        reqLength = tempLen;

        // add the label separator
        if(done == FALSE){
            if(reqLength < destCapacity){
                dest[reqLength] = FULL_STOP;
            }
            reqLength++;
        }

        labelStart = delimiter;
        if(srcLength != -1){
            remainingLen = srcLength - (int32_t)(delimiter - src);
        }
    }

CLEANUP:

    if(b1 != b1Stack){
        uprv_free(b1);
    }

    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
|
||||
|
||||
/**
 * Applies idnaref_toUnicode() to every label of a domain name.
 *
 * The input is split with getNextSeparator(); each converted label is
 * appended to dest, and a FULL_STOP is written after every label that was
 * followed by a separator.
 *
 * @param src           input domain name.
 * @param srcLength     number of UChars in src, or -1 if NUL-terminated.
 * @param dest          output buffer.
 * @param destCapacity  capacity of dest in UChars.
 * @param options       bit set of IDNAREF_* options, forwarded per label.
 * @param parseError    forwarded to idnaref_toUnicode().
 * @param status        in/out error code; any failure of a label conversion
 *                      stops processing.
 * @return the value of u_terminateUChars() for the required length.
 */
int32_t
idnaref_IDNToUnicode(  const UChar* src, int32_t srcLength,
                     UChar* dest, int32_t destCapacity,
                     int32_t options,
                     UParseError* parseError,
                     UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t reqLength = 0;

    UBool done = FALSE;

    NamePrepTransform* prep = getInstance(*status);

    //initialize pointers to stack buffers
    UChar b1Stack[MAX_LABEL_BUFFER_SIZE];
    UChar *b1 = b1Stack;
    int32_t b1Len, labelLen;
    UChar* delimiter = (UChar*)src;
    UChar* labelStart = (UChar*)src;
    int32_t remainingLen = srcLength;   // stays -1 for NUL-terminated input
    int32_t b1Capacity = MAX_LABEL_BUFFER_SIZE;

    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    for(;;){
        // stop once the previous label reached the end of the input
        if(srcLength == -1){
            if(*delimiter == 0){
                break;
            }
        }else if(delimiter == src+srcLength){
            break;
        }

        labelLen = getNextSeparator(labelStart, remainingLen, prep, &delimiter, &done, status);

        b1Len = idnaref_toUnicode(labelStart, labelLen, b1, b1Capacity,
                                  options, parseError, status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            // release a buffer grown for an earlier label before replacing it
            if(b1 != b1Stack){
                uprv_free(b1);
            }
            b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }
            b1Capacity = b1Len; // later labels can use the larger buffer

            *status = U_ZERO_ERROR; // reset error

            b1Len = idnaref_toUnicode(labelStart, labelLen, b1, b1Capacity,
                                      options, parseError, status);
        }

        if(U_FAILURE(*status)){
            goto CLEANUP;
        }
        int32_t tempLen = (reqLength + b1Len );
        // copy to dest
        if( tempLen< destCapacity){
            uprv_memmove(dest+reqLength, b1, b1Len * U_SIZEOF_UCHAR);
        }

        reqLength = tempLen;

        // add the label separator
        if(done == FALSE){
            if(reqLength < destCapacity){
                dest[reqLength] = FULL_STOP;
            }
            reqLength++;
        }

        labelStart = delimiter;
        if(srcLength != -1){
            remainingLen = srcLength - (int32_t)(delimiter - src);
        }
    }

CLEANUP:

    if(b1 != b1Stack){
        uprv_free(b1);
    }

    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
|
||||
|
||||
/**
 * Compares two domain names after converting both with idnaref_IDNToASCII().
 *
 * @param s1       first domain name.
 * @param length1  number of UChars in s1, or -1 if NUL-terminated.
 * @param s2       second domain name.
 * @param length2  number of UChars in s2, or -1 if NUL-terminated.
 * @param options  bit set of IDNAREF_* options, forwarded to the conversion.
 * @param status   in/out error code.
 * @return the result of compareCaseInsensitiveASCII() on the converted names
 *         (0 when they are equal), or -1 when status is NULL, already a
 *         failure, or a conversion or allocation fails.
 */
int32_t
idnaref_compare(  const UChar *s1, int32_t length1,
                const UChar *s2, int32_t length2,
                int32_t options,
                UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return -1;
    }

    UChar b1Stack[MAX_IDN_BUFFER_SIZE], b2Stack[MAX_IDN_BUFFER_SIZE];
    UChar *b1 = b1Stack, *b2 = b2Stack;
    int32_t b1Len, b2Len, b1Capacity = MAX_IDN_BUFFER_SIZE, b2Capacity = MAX_IDN_BUFFER_SIZE;
    // defined value for every error path; matches the early return above
    int32_t result = -1;

    UParseError parseError;

    b1Len = idnaref_IDNToASCII(s1, length1, b1, b1Capacity, options, &parseError, status);
    if(*status == U_BUFFER_OVERFLOW_ERROR){
        // redo processing of string
        /* we do not have enough room so grow the buffer*/
        b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
        if(b1==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        *status = U_ZERO_ERROR; // reset error

        b1Len = idnaref_IDNToASCII(s1,length1,b1,b1Len, options, &parseError, status);
    }
    // do not compare buffers whose conversion failed
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    b2Len = idnaref_IDNToASCII(s2,length2,b2,b2Capacity,options, &parseError, status);
    if(*status == U_BUFFER_OVERFLOW_ERROR){
        // redo processing of string
        /* we do not have enough room so grow the buffer*/
        b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
        if(b2==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        *status = U_ZERO_ERROR; // reset error

        b2Len = idnaref_IDNToASCII(s2,length2,b2,b2Len,options, &parseError, status);
    }
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    // when toASCII is applied all label separators are replaced with FULL_STOP
    result = compareCaseInsensitiveASCII(b1,b1Len,b2,b2Len);

CLEANUP:
    if(b1 != b1Stack){
        uprv_free(b1);
    }

    if(b2 != b2Stack){
        uprv_free(b2);
    }

    return result;
}
|
||||
|
226
icu4c/source/test/intltest/idnaref.h
Normal file
226
icu4c/source/test/intltest/idnaref.h
Normal file
|
@ -0,0 +1,226 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: idnaref.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003feb1
|
||||
* created by: Ram Viswanadha
|
||||
*/
|
||||
|
||||
#ifndef __IDNAREF_H__
|
||||
#define __IDNAREF_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/parseerr.h"
|
||||
|
||||
#define IDNAREF_DEFAULT 0x0000
|
||||
#define IDNAREF_ALLOW_UNASSIGNED 0x0001
|
||||
#define IDNAREF_USE_STD3_RULES 0x0002
|
||||
|
||||
/**
|
||||
* This function implements the ToASCII operation as defined in the IDNA draft.
|
||||
* This operation is done on <b>single labels</b> before sending it to something that expects
|
||||
* ASCII names. A label is an individual part of a domain name. Labels are usually
|
||||
* separated by dots; e.g. "www.example.com" is composed of 3 labels
|
||||
* "www","example", and "com".
|
||||
*
|
||||
*
|
||||
* @param src Input Unicode label.
|
||||
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
|
||||
* @param dest Output Unicode array with ACE encoded ASCII label.
|
||||
* @param destCapacity Size of dest.
|
||||
* @param options A bit set of options:
|
||||
*
|
||||
* - IDNAREF_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
|
||||
* If TRUE unassigned values are treated as normal Unicode code points.
|
||||
* If FALSE the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
|
||||
* - IDNAREF_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
|
||||
* If TRUE and the input does not satisfy STD3 rules, the operation
|
||||
* will fail with U_IDNA_STD3_ASCII_RULES_ERROR
|
||||
*
|
||||
* @param parseError Pointer to UParseError struct to receive information on position
|
||||
* of error if an error is encountered. Can be NULL.
|
||||
* @param status ICU in/out error code parameter.
|
||||
* U_INVALID_CHAR_FOUND if src contains
|
||||
* unmatched single surrogates.
|
||||
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
|
||||
* too many code points.
|
||||
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
|
||||
* @return Number of ASCII characters converted.
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
int32_t
|
||||
idnaref_toASCII(const UChar* src, int32_t srcLength,
|
||||
UChar* dest, int32_t destCapacity,
|
||||
int32_t options,
|
||||
UParseError* parseError,
|
||||
UErrorCode* status);
|
||||
|
||||
|
||||
/**
|
||||
* This function implements the ToUnicode operation as defined in the IDNA draft.
|
||||
* This operation is done on <b>single labels</b> before sending it to something that expects
|
||||
* ASCII names. A label is an individual part of a domain name. Labels are usually
|
||||
* separated by dots; e.g. "www.example.com" is composed of 3 labels
|
||||
* "www","example", and "com".
|
||||
*
|
||||
* @param src Input ASCII (ACE encoded) label.
|
||||
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
|
||||
* @param dest Output Converted Unicode array.
|
||||
* @param destCapacity Size of dest.
|
||||
* @param options A bit set of options:
|
||||
*
|
||||
* - IDNAREF_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
|
||||
* If TRUE unassigned values are treated as normal Unicode code points.
|
||||
* If FALSE the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
|
||||
* - IDNAREF_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
|
||||
* If TRUE and the input does not satisfy STD3 rules, the operation
|
||||
* will fail with U_IDNA_STD3_ASCII_RULES_ERROR
|
||||
*
|
||||
* @param parseError Pointer to UParseError struct to receive information on position
|
||||
* of error if an error is encountered. Can be NULL.
|
||||
* @param status ICU in/out error code parameter.
|
||||
* U_INVALID_CHAR_FOUND if src contains
|
||||
* unmatched single surrogates.
|
||||
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
|
||||
* too many code points.
|
||||
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
|
||||
* @return Number of Unicode characters converted.
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
int32_t
|
||||
idnaref_toUnicode(const UChar* src, int32_t srcLength,
|
||||
UChar* dest, int32_t destCapacity,
|
||||
int32_t options,
|
||||
UParseError* parseError,
|
||||
UErrorCode* status);
|
||||
|
||||
|
||||
/**
|
||||
* Convenience function that implements the IDNToASCII operation as defined in the IDNA draft.
|
||||
* This operation is done on complete domain names, e.g: "www.example.com".
|
||||
* It is important to note that this operation can fail. If it fails, then the input
|
||||
* domain name cannot be used as an Internationalized Domain Name and the application
|
||||
* should have methods defined to deal with the failure.
|
||||
*
|
||||
* <b>Note:</b> IDNA draft specifies that a conformant application should divide a domain name
|
||||
* into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
|
||||
* and then convert. This function does not offer that level of granularity. The options once
|
||||
* set will apply to all labels in the domain name
|
||||
*
|
||||
* @param src Input Unicode IDN.
|
||||
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
|
||||
* @param dest Output UChar array with the ASCII (ACE encoded) IDN.
|
||||
* @param destCapacity Size of dest.
|
||||
* @param options A bit set of options:
|
||||
*
|
||||
* - IDNAREF_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
|
||||
* If TRUE unassigned values are treated as normal Unicode code points.
|
||||
* If FALSE the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
|
||||
* - IDNAREF_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
|
||||
* If TRUE and the input does not satisfy STD3 rules, the operation
|
||||
* will fail with U_IDNA_STD3_ASCII_RULES_ERROR
|
||||
*
|
||||
* @param parseError Pointer to UParseError struct to receive information on position
|
||||
* of error if an error is encountered. Can be NULL.
|
||||
* @param status ICU in/out error code parameter.
|
||||
* U_INVALID_CHAR_FOUND if src contains
|
||||
* unmatched single surrogates.
|
||||
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
|
||||
* too many code points.
|
||||
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
|
||||
* @return Number of ASCII characters converted.
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
int32_t
|
||||
idnaref_IDNToASCII( const UChar* src, int32_t srcLength,
|
||||
UChar* dest, int32_t destCapacity,
|
||||
int32_t options,
|
||||
UParseError* parseError,
|
||||
UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Convenience function that implements the IDNToUnicode operation as defined in the IDNA draft.
|
||||
* This operation is done on complete domain names, e.g: "www.example.com".
|
||||
*
|
||||
* <b>Note:</b> IDNA draft specifies that a conformant application should divide a domain name
|
||||
* into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
|
||||
* and then convert. This function does not offer that level of granularity. The options once
|
||||
* set will apply to all labels in the domain name
|
||||
*
|
||||
* @param src Input ASCII (ACE encoded) IDN.
|
||||
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
|
||||
* @param dest Output Unicode array.
|
||||
* @param destCapacity Size of dest.
|
||||
* @param options A bit set of options:
|
||||
*
|
||||
* - IDNAREF_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
|
||||
* If TRUE unassigned values are treated as normal Unicode code points.
|
||||
* If FALSE the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
|
||||
* - IDNAREF_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
|
||||
* If TRUE and the input does not satisfy STD3 rules, the operation
|
||||
* will fail with U_IDNA_STD3_ASCII_RULES_ERROR
|
||||
*
|
||||
* @param parseError Pointer to UParseError struct to receive information on position
|
||||
* of error if an error is encountered. Can be NULL.
|
||||
* @param status ICU in/out error code parameter.
|
||||
* U_INVALID_CHAR_FOUND if src contains
|
||||
* unmatched single surrogates.
|
||||
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
|
||||
* too many code points.
|
||||
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
|
||||
* @return Number of Unicode characters converted.
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
int32_t
|
||||
idnaref_IDNToUnicode( const UChar* src, int32_t srcLength,
|
||||
UChar* dest, int32_t destCapacity,
|
||||
int32_t options,
|
||||
UParseError* parseError,
|
||||
UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Compare two IDN strings for equivalence.
|
||||
* This function splits the domain names into labels and compares them.
|
||||
* According to IDN draft, whenever two labels are compared, they are
|
||||
* considered equal if and only if their ASCII forms (obtained by
|
||||
* applying toASCII) match using a case-insensitive ASCII comparison.
|
||||
* Two domain names are considered a match if and only if all labels
|
||||
* match regardless of whether label separators match.
|
||||
*
|
||||
* @param s1 First source string.
|
||||
* @param length1 Length of first source string, or -1 if NUL-terminated.
|
||||
*
|
||||
* @param s2 Second source string.
|
||||
* @param length2 Length of second source string, or -1 if NUL-terminated.
|
||||
* @param options A bit set of options:
|
||||
*
|
||||
* - IDNAREF_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
|
||||
* If TRUE unassigned values are treated as normal Unicode code points.
|
||||
* If FALSE the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
|
||||
* - IDNAREF_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
|
||||
* If TRUE and the input does not satisfy STD3 rules, the operation
|
||||
* will fail with U_IDNA_STD3_ASCII_RULES_ERROR
|
||||
*
|
||||
* @param status ICU error code in/out parameter.
|
||||
* Must fulfill U_SUCCESS before the function call.
|
||||
* @return <0 or 0 or >0 as usual for string comparisons
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
int32_t
|
||||
idnaref_compare( const UChar *s1, int32_t length1,
|
||||
const UChar *s2, int32_t length2,
|
||||
int32_t options,
|
||||
UErrorCode* status);
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
|
@ -1291,6 +1291,64 @@ const char* IntlTest::loadTestData(UErrorCode& err){
|
|||
}
|
||||
return _testDataPath;
|
||||
}
|
||||
|
||||
const char* IntlTest::fgDataDir = NULL;
|
||||
|
||||
/* returns the path to icu/source/data */
|
||||
const char * IntlTest::pathToDataDirectory()
|
||||
{
|
||||
|
||||
if(fgDataDir != NULL) {
|
||||
return fgDataDir;
|
||||
}
|
||||
|
||||
/* U_TOPSRCDIR is set by the makefiles on UNIXes when building cintltst and intltst
|
||||
// to point to the top of the build hierarchy, which may or
|
||||
// may not be the same as the source directory, depending on
|
||||
// the configure options used. At any rate,
|
||||
// set the data path to the built data from this directory.
|
||||
// The value is complete with quotes, so it can be used
|
||||
// as-is as a string constant.
|
||||
*/
|
||||
#if defined (U_TOPSRCDIR)
|
||||
{
|
||||
fgDataDir = U_TOPSRCDIR U_FILE_SEP_STRING "data" U_FILE_SEP_STRING;
|
||||
}
|
||||
#else
|
||||
|
||||
/* On Windows, the file name obtained from __FILE__ includes a full path.
|
||||
* This file is "wherever\icu\source\test\cintltst\cintltst.c"
|
||||
* Change to "wherever\icu\source\data"
|
||||
*/
|
||||
{
|
||||
static char p[sizeof(__FILE__) + 10];
|
||||
char *pBackSlash;
|
||||
int i;
|
||||
|
||||
strcpy(p, __FILE__);
|
||||
/* We want to back over three '\' chars. */
|
||||
/* Only Windows should end up here, so looking for '\' is safe. */
|
||||
for (i=1; i<=3; i++) {
|
||||
pBackSlash = strrchr(p, U_FILE_SEP_CHAR);
|
||||
if (pBackSlash != NULL) {
|
||||
*pBackSlash = 0; /* Truncate the string at the '\' */
|
||||
}
|
||||
}
|
||||
|
||||
if (pBackSlash != NULL) {
|
||||
/* We found and truncated three names from the path.
|
||||
* Now append "source\data" and set the environment
|
||||
*/
|
||||
strcpy(pBackSlash, U_FILE_SEP_STRING "data" U_FILE_SEP_STRING );
|
||||
fgDataDir = p;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return fgDataDir;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* This is a variant of cintltst/ccolltst.c:CharsToUChars().
|
||||
* It converts a character string into a UnicodeString, with
|
||||
|
|
|
@ -942,5 +942,49 @@ SOURCE=.\unhxtrts.cpp
|
|||
SOURCE=.\unhxtrts.h
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "idna"
|
||||
|
||||
# PROP Default_Filter "*.c,*.h"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\idnaref.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\idnaref.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\nptrans.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\nptrans.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\punyref.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\punyref.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\testidn.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\testidna.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\testidna.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\tidnaref.cpp
|
||||
# End Source File
|
||||
# End Group
|
||||
# End Target
|
||||
# End Project
|
||||
|
|
|
@ -158,6 +158,8 @@ protected:
|
|||
public:
|
||||
static void setICU_DATA(); // Set up ICU_DATA if necessary.
|
||||
|
||||
static const char* pathToDataDirectory();
|
||||
|
||||
public:
|
||||
UBool run_phase2( char* name, char* par ); // internally, supports reporting memory leaks
|
||||
static const char* loadTestData(UErrorCode& err);
|
||||
|
@ -165,6 +167,7 @@ public:
|
|||
// static members
|
||||
public:
|
||||
static IntlTest* gTest;
|
||||
static const char* fgDataDir;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
#include "tstnorm.h"
|
||||
#include "canittst.h"
|
||||
#include "icusvtst.h"
|
||||
|
||||
#include "testidna.h"
|
||||
#define CASE_SUITE(id, suite) case id: \
|
||||
name = #suite; \
|
||||
if(exec) { \
|
||||
|
@ -148,6 +148,13 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
|
|||
}
|
||||
#endif
|
||||
break;
|
||||
case 11: name = "idna";
|
||||
if(exec){
|
||||
logln("TestSuite IDNA----"); logln();
|
||||
TestIDNA test;
|
||||
callTest(test,par);
|
||||
}
|
||||
break;
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
|
279
icu4c/source/test/intltest/nptrans.cpp
Normal file
279
icu4c/source/test/intltest/nptrans.cpp
Normal file
|
@ -0,0 +1,279 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: nptrans.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003feb1
|
||||
* created by: Ram Viswanadha
|
||||
*/
|
||||
|
||||
#include "nptrans.h"
|
||||
#include "unicode/resbund.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "sprpimpl.h"
|
||||
#include "cmemory.h"
|
||||
#include "ustr_imp.h"
|
||||
#include "intltest.h"
|
||||
|
||||
#ifdef DEBUG
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
const char NamePrepTransform::fgClassID=0;
|
||||
|
||||
NamePrepTransform* NamePrepTransform::transform = NULL;
|
||||
|
||||
//Factory method
|
||||
NamePrepTransform* NamePrepTransform::createInstance(UParseError& parseError, UErrorCode& status){
|
||||
if(transform==NULL){
|
||||
transform = new NamePrepTransform(parseError, status);
|
||||
if(U_FAILURE(status)){
|
||||
delete transform;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
return transform;
|
||||
}
|
||||
|
||||
//constructor
|
||||
NamePrepTransform::NamePrepTransform(UParseError& parseError, UErrorCode& status)
|
||||
: unassigned(), prohibited(), labelSeparatorSet(){
|
||||
|
||||
mapping = NULL;
|
||||
bundle = NULL;
|
||||
|
||||
|
||||
const char* testDataName = IntlTest::loadTestData(status);
|
||||
|
||||
if(U_FAILURE(status)){
|
||||
return;
|
||||
}
|
||||
|
||||
bundle = ures_openDirect(testDataName,"idna_rules",&status);
|
||||
|
||||
if(bundle != NULL && U_SUCCESS(status)){
|
||||
// create the mapping transliterator
|
||||
int32_t ruleLen = 0;
|
||||
const UChar* ruleUChar = ures_getStringByKey(bundle, "MapNFKC",&ruleLen, &status);
|
||||
UnicodeString rule(ruleUChar, ruleLen);
|
||||
|
||||
mapping = Transliterator::createFromRules("NamePrepTransform", rule,
|
||||
UTRANS_FORWARD, parseError,status);
|
||||
|
||||
//create the unassigned set
|
||||
int32_t patternLen =0;
|
||||
const UChar* pattern = ures_getStringByKey(bundle,"UnassignedSet",&patternLen, &status);
|
||||
unassigned.applyPattern(UnicodeString(pattern, patternLen), status);
|
||||
|
||||
//create prohibited set
|
||||
patternLen=0;
|
||||
pattern = ures_getStringByKey(bundle,"ProhibitedSet",&patternLen, &status);
|
||||
UnicodeString test(pattern,patternLen);
|
||||
prohibited.applyPattern(test,status);
|
||||
#ifdef DEBUG
|
||||
if(U_FAILURE(status)){
|
||||
printf("Construction of Unicode set failed\n");
|
||||
}
|
||||
|
||||
if(U_SUCCESS(status)){
|
||||
if(prohibited.contains((UChar) 0x644)){
|
||||
printf("The string contains 0x644 ... damn !!\n");
|
||||
}
|
||||
UnicodeString temp;
|
||||
prohibited.toPattern(temp,TRUE);
|
||||
|
||||
for(int32_t i=0;i<temp.length();i++){
|
||||
printf("%c", (char)temp.charAt(i));
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
//create label separator set
|
||||
patternLen=0;
|
||||
pattern = ures_getStringByKey(bundle,"LabelSeparatorSet",&patternLen, &status);
|
||||
labelSeparatorSet.applyPattern(UnicodeString(pattern,patternLen),status);
|
||||
}
|
||||
|
||||
if(U_SUCCESS(status) &&
|
||||
(mapping == NULL)
|
||||
){
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
delete mapping;
|
||||
ures_close(bundle);
|
||||
mapping = NULL;
|
||||
bundle = NULL;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Returns FALSE for ASCII space (U+0020) and TRUE for every other code
// point. The prohibited set is not consulted here.
UBool NamePrepTransform::isProhibited(UChar32 ch){
    if(ch == ASCII_SPACE){
        return FALSE;
    }
    return TRUE;
}
|
||||
|
||||
// Destructor: frees the mapping transliterator and closes the resource
// bundle, then clears both pointers.
NamePrepTransform::~NamePrepTransform(){
    delete mapping;
    ures_close(bundle);

    mapping = NULL;
    bundle = NULL;
}
|
||||
|
||||
|
||||
/**
 * Runs the mapping transliterator over src, optionally rejects unassigned
 * code points, and copies the result to dest.
 *
 * @param src             Input UChars; must not be NULL.
 * @param srcLength       Number of UChars in src, or -1 if NUL-terminated.
 * @param dest            Output buffer; may be NULL only if destCapacity is 0.
 * @param destCapacity    Capacity of dest in UChars.
 * @param allowUnassigned If FALSE, a code point in the unassigned set fails
 *                        with U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR.
 * @param parseError      Not used by this function.
 * @param status          ICU in/out error code. Set by u_terminateUChars
 *                        according to how the result fits into dest.
 * @return The length of the mapped string, or 0 on an argument error or when
 *         an unassigned code point is rejected.
 */
int32_t NamePrepTransform::map(const UChar* src, int32_t srcLength,
                        UChar* dest, int32_t destCapacity,
                        UBool allowUnassigned,
                        UParseError* parseError,
                        UErrorCode& status ){

    if(U_FAILURE(status)){
        return 0;
    }
    //check arguments
    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    UnicodeString rsource(src,srcLength);
    // map the code points
    // transliteration also performs NFKC
    mapping->transliterate(rsource);

    const UChar* buffer = rsource.getBuffer();
    int32_t bufLen = rsource.length();
    // check if unassigned
    if(allowUnassigned == FALSE){
        int32_t bufIndex=0;
        UChar32 ch =0 ;
        for(;bufIndex<bufLen;){
            U16_NEXT(buffer, bufIndex, bufLen, ch);
            if(unassigned.contains(ch)){
                status = U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR;
                rsource.releaseBuffer();
                return 0;
            }
        }
    }
    // Copy whenever the characters fit, including the case where they fill
    // dest exactly and leave no room for a terminator. The previous strict
    // '<' skipped the copy in that case, leaving dest unfilled while the
    // status stayed non-failing. The bufLen>0 test avoids touching a NULL
    // dest when preflighting an empty result.
    if(bufLen > 0 && bufLen <= destCapacity){
        uprv_memcpy(dest,buffer,bufLen*U_SIZEOF_UCHAR);
    }

    return u_terminateUChars(dest, destCapacity, bufLen, &status);
}
|
||||
|
||||
|
||||
#define MAX_BUFFER_SIZE 300
|
||||
|
||||
/**
 * Prepares a label: maps it with map(), rejects prohibited code points and
 * applies the bidirectional checks, then copies the result to dest.
 *
 * @param src             Input UChars; must not be NULL.
 * @param srcLength       Number of UChars in src, or -1 if NUL-terminated.
 * @param dest            Output buffer; may be NULL only if destCapacity is 0.
 * @param destCapacity    Capacity of dest in UChars.
 * @param allowUnassigned Passed through to map().
 * @param parseError      Passed through to map().
 * @param status          ICU in/out error code. Set to
 *                        U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR or
 *                        U_IDNA_CHECK_BIDI_ERROR when a check fails.
 * @return The length of the prepared string. Callers must check status;
 *         the value is not meaningful after a failure.
 */
int32_t NamePrepTransform::process( const UChar* src, int32_t srcLength,
                                    UChar* dest, int32_t destCapacity,
                                    UBool allowUnassigned,
                                    UParseError* parseError,
                                    UErrorCode& status ){
    // check error status
    if(U_FAILURE(status)){
        return 0;
    }

    //check arguments
    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    UChar b1Stack[MAX_BUFFER_SIZE];
    UChar *b1 = b1Stack;
    int32_t b1Len,b1Capacity = MAX_BUFFER_SIZE;

    int32_t b1Index = 0;
    UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT;
    UBool leftToRight=FALSE, rightToLeft=FALSE;

    b1Len = map(src,srcLength, b1, b1Capacity,allowUnassigned,parseError, status);

    if(status == U_BUFFER_OVERFLOW_ERROR){
        // redo processing of string
        /* we do not have enough room so grow the buffer*/
        // Ask for one UChar more than the reported length so the retry has
        // room for the terminator and does not depend on map() filling a
        // buffer that is exactly full.
        int32_t needed = b1Len + 1;
        if(!u_growBufferFromStatic(b1Stack,&b1,&b1Capacity,needed,0)){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        status = U_ZERO_ERROR; // reset error

        b1Len = map(src,srcLength, b1, needed,allowUnassigned, parseError, status);
    }

    if(U_FAILURE(status)){
        goto CLEANUP;
    }

    for(; b1Index<b1Len; ){

        UChar32 ch = 0;

        U16_NEXT(b1, b1Index, b1Len, ch);

        if(prohibited.contains(ch) && ch!=0x0020){
            status = U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR;
            goto CLEANUP;
        }

        direction = u_charDirection(ch);
        if(firstCharDir==U_CHAR_DIRECTION_COUNT){
            firstCharDir = direction;
        }
        if(direction == U_LEFT_TO_RIGHT){
            leftToRight = TRUE;
        }
        if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){
            rightToLeft = TRUE;
        }
    }

    // satisfy 2
    if( leftToRight == TRUE && rightToLeft == TRUE){
        status = U_IDNA_CHECK_BIDI_ERROR;
        goto CLEANUP;
    }

    //satisfy 3
    // NOTE(review): this compares the directions of the first and last code
    // points for equality, so a string that starts with R and ends with AL
    // (or vice versa) is rejected; confirm that is intended.
    if(rightToLeft == TRUE && firstCharDir != direction ){
        status = U_IDNA_CHECK_BIDI_ERROR;
        // Leave through CLEANUP like every other failure; returning here
        // leaked b1 when it had been moved to the heap.
        goto CLEANUP;
    }

    if(b1Len > 0 && b1Len <= destCapacity){
        uprv_memmove(dest,b1, b1Len*U_SIZEOF_UCHAR);
    }

CLEANUP:
    if(b1!=b1Stack){
        uprv_free(b1);
    }

    return u_terminateUChars(dest, destCapacity, b1Len, &status);
}
|
||||
|
||||
// Reports whether ch is in the label separator set. Returns FALSE without
// consulting the set when status already indicates a failure.
UBool NamePrepTransform::isLabelSeparator(UChar32 ch, UErrorCode& status){
    return U_FAILURE(status) ? FALSE : labelSeparatorSet.contains(ch);
}
|
||||
|
||||
|
154
icu4c/source/test/intltest/nptrans.h
Normal file
154
icu4c/source/test/intltest/nptrans.h
Normal file
|
@ -0,0 +1,154 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: nptrans.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003feb1
|
||||
* created by: Ram Viswanadha
|
||||
*/
|
||||
|
||||
#ifndef NPTRANS_H
|
||||
#define NPTRANS_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "strprep.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/translit.h"
|
||||
|
||||
|
||||
|
||||
#define ASCII_SPACE 0x0020
|
||||
|
||||
class NamePrepTransform {
|
||||
|
||||
private :
|
||||
Transliterator *mapping;
|
||||
UnicodeSet unassigned;
|
||||
UnicodeSet prohibited;
|
||||
UnicodeSet labelSeparatorSet;
|
||||
UResourceBundle *bundle;
|
||||
static NamePrepTransform* transform;
|
||||
NamePrepTransform(UParseError& parseError, UErrorCode& status);
|
||||
|
||||
|
||||
public :
|
||||
|
||||
static NamePrepTransform* createInstance(UParseError& parseError, UErrorCode& status);
|
||||
|
||||
inline ~NamePrepTransform();
|
||||
|
||||
|
||||
inline UBool isProhibited(UChar32 ch);
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
*
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
inline UClassID getDynamicClassID() const { return getStaticClassID(); }
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for this class.
|
||||
*
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
||||
|
||||
/**
|
||||
* Map every character in input stream with mapping character
|
||||
* in the mapping table and populate the output stream.
|
||||
* For any individual character the mapping table may specify
|
||||
* that that a character be mapped to nothing, mapped to one
|
||||
* other character or to a string of other characters.
|
||||
*
|
||||
* @param src Pointer to UChar buffer containing a single label
|
||||
* @param srcLength Number of characters in the source label
|
||||
* @param dest Pointer to the destination buffer to receive the output
|
||||
* @param destCapacity The capacity of destination array
|
||||
* @param allowUnassigned Unassigned values can be converted to ASCII for query operations
|
||||
* If TRUE unassigned values are treated as normal Unicode code point.
|
||||
* If FALSE the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
|
||||
* @param status ICU error code in/out parameter.
|
||||
* Must fulfill U_SUCCESS before the function call.
|
||||
* @return The number of UChars in the destination buffer
|
||||
*
|
||||
*/
|
||||
int32_t map(const UChar* src, int32_t srcLength,
|
||||
UChar* dest, int32_t destCapacity,
|
||||
UBool allowUnassigned,
|
||||
UParseError* parseError,
|
||||
UErrorCode& status );
|
||||
|
||||
/**
|
||||
* Prepare the input stream with for use. This operation maps, normalizes(NFKC),
|
||||
* checks for prohited and BiDi characters in the order defined by RFC 3454
|
||||
*
|
||||
* @param src Pointer to UChar buffer containing a single label
|
||||
* @param srcLength Number of characters in the source label
|
||||
* @param dest Pointer to the destination buffer to receive the output
|
||||
* @param destCapacity The capacity of destination array
|
||||
* @param allowUnassigned Unassigned values can be converted to ASCII for query operations
|
||||
* If TRUE unassigned values are treated as normal Unicode code point.
|
||||
* If FALSE the operation fails with U_UNASSIGNED_CODE_POINT error code.
|
||||
* @param status ICU error code in/out parameter.
|
||||
* Must fulfill U_SUCCESS before the function call.
|
||||
* @return The number of UChars in the destination buffer
|
||||
*/
|
||||
int32_t process(const UChar* src, int32_t srcLength,
|
||||
UChar* dest, int32_t destCapacity,
|
||||
UBool allowUnassigned,
|
||||
UParseError* parseError,
|
||||
UErrorCode& status );
|
||||
|
||||
/**
|
||||
* Ascertain if the given code point is a label separator as specified by IDNA
|
||||
*
|
||||
* @return TRUE is the code point is a label separator
|
||||
*
|
||||
*
|
||||
*/
|
||||
UBool isLabelSeparator(UChar32 ch, UErrorCode& status);
|
||||
|
||||
|
||||
inline UBool isLDHChar(UChar32 ch);
|
||||
private:
|
||||
/**
|
||||
* The address of this static class variable serves as this class's ID
|
||||
* for ICU "poor man's RTTI".
|
||||
*/
|
||||
static const char fgClassID;
|
||||
};
|
||||
|
||||
// Returns TRUE when ch is an ASCII letter, digit or hyphen:
// [\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
inline UBool NamePrepTransform::isLDHChar(UChar32 ch){
    // high runner case: everything above 'z' is rejected at once
    if(ch>0x007A){
        return FALSE;
    }
    const UBool hyphen    = (UBool)(ch==0x002D);
    const UBool digit     = (UBool)(ch>=0x0030 && ch<=0x0039);
    const UBool upperCase = (UBool)(ch>=0x0041 && ch<=0x005A);
    const UBool lowerCase = (UBool)(ch>=0x0061 && ch<=0x007A);

    return (hyphen || digit || upperCase || lowerCase) ? TRUE : FALSE;
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Hey, Emacs, please set the following:
|
||||
*
|
||||
* Local Variables:
|
||||
* indent-tabs-mode: nil
|
||||
* End:
|
||||
*
|
||||
*/
|
264
icu4c/source/test/intltest/punyref.c
Normal file
264
icu4c/source/test/intltest/punyref.c
Normal file
|
@ -0,0 +1,264 @@
|
|||
/*
|
||||
punycode.c 0.4.0 (2001-Nov-17-Sat)
|
||||
http://www.cs.berkeley.edu/~amc/idn/
|
||||
Adam M. Costello
|
||||
http://www.nicemice.net/amc/
|
||||
*/
|
||||
|
||||
/**********************************************************/
|
||||
/* Implementation (would normally go in its own .c file): */
|
||||
|
||||
#include <string.h>
|
||||
#include "punyref.h"
|
||||
|
||||
/*** Bootstring parameters for Punycode ***/
|
||||
|
||||
enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
|
||||
initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };
|
||||
|
||||
/* basic(cp) tests whether cp is a basic code point: */
|
||||
#define basic(cp) ((punycode_uint)(cp) < 0x80)
|
||||
|
||||
/* delim(cp) tests whether cp is a delimiter: */
|
||||
#define delim(cp) ((cp) == delimiter)
|
||||
|
||||
/* decode_digit(cp) returns the numeric value of a basic code */
|
||||
/* point (for use in representing integers) in the range 0 to */
|
||||
/* base-1, or base if cp is does not represent a value. */
|
||||
|
||||
static punycode_uint decode_digit(punycode_uint cp)
|
||||
{
|
||||
return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
|
||||
cp - 97 < 26 ? cp - 97 : base;
|
||||
}
|
||||
|
||||
/* encode_digit(d,flag) returns the basic code point whose value */
|
||||
/* (when used for representing integers) is d, which needs to be in */
|
||||
/* the range 0 to base-1. The lowercase form is used unless flag is */
|
||||
/* nonzero, in which case the uppercase form is used. The behavior */
|
||||
/* is undefined if flag is nonzero and digit d has no uppercase form. */
|
||||
|
||||
static char encode_digit(punycode_uint d, int flag)
|
||||
{
|
||||
return (char) d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
|
||||
/* 0..25 map to ASCII a..z or A..Z */
|
||||
/* 26..35 map to ASCII 0..9 */
|
||||
}
|
||||
|
||||
/* flagged(bcp) tests whether a basic code point is flagged */
|
||||
/* (uppercase). The behavior is undefined if bcp is not a */
|
||||
/* basic code point. */
|
||||
|
||||
#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
|
||||
|
||||
/* encode_basic(bcp,flag) forces a basic code point to lowercase */
|
||||
/* if flag is zero, uppercase if flag is nonzero, and returns */
|
||||
/* the resulting code point. The code point is unchanged if it */
|
||||
/* is caseless. The behavior is undefined if bcp is not a basic */
|
||||
/* code point. */
|
||||
|
||||
static char encode_basic(punycode_uint bcp, int flag)
|
||||
{
|
||||
bcp -= (bcp - 97 < 26) << 5;
|
||||
return (char) bcp + ((!flag && (bcp - 65 < 26)) << 5);
|
||||
}
|
||||
|
||||
/*** Platform-specific constants ***/
|
||||
|
||||
/* maxint is the maximum value of a punycode_uint variable: */
|
||||
static const punycode_uint maxint = -1;
|
||||
/* Because maxint is unsigned, -1 becomes the maximum value. */
|
||||
|
||||
/*** Bias adaptation function ***/
|
||||
|
||||
/* adapt(delta,numpoints,firsttime) computes the next bias from the */
/* delta just encoded or decoded.  delta is divided by damp on the  */
/* first call and halved on later calls, and numpoints must be      */
/* nonzero because it is used as a divisor.                         */

static punycode_uint adapt(
  punycode_uint delta, punycode_uint numpoints, int firsttime )
{
  punycode_uint k = 0;

  if (firsttime) {
    delta /= damp;
  }
  else {
    delta >>= 1;   /* a faster way of doing delta / 2 */
  }

  delta += delta / numpoints;

  while (delta > ((base - tmin) * tmax) / 2) {
    delta /= base - tmin;
    k += base;
  }

  return k + (base - tmin + 1) * delta / (delta + skew);
}
|
||||
|
||||
/*** Main encode function ***/
|
||||
|
||||
enum punycode_status punycode_encode(
|
||||
punycode_uint input_length,
|
||||
const punycode_uint input[],
|
||||
const unsigned char case_flags[],
|
||||
punycode_uint *output_length,
|
||||
char output[] )
|
||||
{
|
||||
punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t;
|
||||
|
||||
/* Initialize the state: */
|
||||
|
||||
n = initial_n;
|
||||
delta = out = 0;
|
||||
max_out = *output_length;
|
||||
bias = initial_bias;
|
||||
|
||||
/* Handle the basic code points: */
|
||||
|
||||
for (j = 0; j < input_length; ++j) {
|
||||
if (basic(input[j])) {
|
||||
if (max_out - out < 2) return punycode_big_output;
|
||||
output[out++] = (char)
|
||||
(case_flags ? encode_basic(input[j], case_flags[j]) : input[j]);
|
||||
}
|
||||
/* else if (input[j] < n) return punycode_bad_input; */
|
||||
/* (not needed for Punycode with unsigned code points) */
|
||||
}
|
||||
|
||||
h = b = out;
|
||||
|
||||
/* h is the number of code points that have been handled, b is the */
|
||||
/* number of basic code points, and out is the number of characters */
|
||||
/* that have been output. */
|
||||
|
||||
if (b > 0) output[out++] = delimiter;
|
||||
|
||||
/* Main encoding loop: */
|
||||
|
||||
while (h < input_length) {
|
||||
/* All non-basic code points < n have been */
|
||||
/* handled already. Find the next larger one: */
|
||||
|
||||
for (m = maxint, j = 0; j < input_length; ++j) {
|
||||
/* if (basic(input[j])) continue; */
|
||||
/* (not needed for Punycode) */
|
||||
if (input[j] >= n && input[j] < m) m = input[j];
|
||||
}
|
||||
|
||||
/* Increase delta enough to advance the decoder's */
|
||||
/* <n,i> state to <m,0>, but guard against overflow: */
|
||||
|
||||
if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
|
||||
delta += (m - n) * (h + 1);
|
||||
n = m;
|
||||
|
||||
for (j = 0; j < input_length; ++j) {
|
||||
/* Punycode does not need to check whether input[j] is basic: */
|
||||
if (input[j] < n /* || basic(input[j]) */ ) {
|
||||
if (++delta == 0) return punycode_overflow;
|
||||
}
|
||||
|
||||
if (input[j] == n) {
|
||||
/* Represent delta as a generalized variable-length integer: */
|
||||
|
||||
for (q = delta, k = base; ; k += base) {
|
||||
if (out >= max_out) return punycode_big_output;
|
||||
t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
|
||||
k >= bias + tmax ? tmax : k - bias;
|
||||
if (q < t) break;
|
||||
output[out++] = encode_digit(t + (q - t) % (base - t), 0);
|
||||
q = (q - t) / (base - t);
|
||||
}
|
||||
|
||||
output[out++] = encode_digit(q, case_flags && case_flags[j]);
|
||||
bias = adapt(delta, h + 1, h == b);
|
||||
delta = 0;
|
||||
++h;
|
||||
}
|
||||
}
|
||||
|
||||
++delta, ++n;
|
||||
}
|
||||
|
||||
*output_length = out;
|
||||
return punycode_success;
|
||||
}
|
||||
|
||||
/*** Main decode function ***/
|
||||
|
||||
enum punycode_status punycode_decode(
|
||||
punycode_uint input_length,
|
||||
const char input[],
|
||||
punycode_uint *output_length,
|
||||
punycode_uint output[],
|
||||
unsigned char case_flags[] )
|
||||
{
|
||||
punycode_uint n, out, i, max_out, bias,
|
||||
b, j, in, oldi, w, k, digit, t;
|
||||
|
||||
/* Initialize the state: */
|
||||
|
||||
n = initial_n;
|
||||
out = i = 0;
|
||||
max_out = *output_length;
|
||||
bias = initial_bias;
|
||||
|
||||
/* Handle the basic code points: Let b be the number of input code */
|
||||
/* points before the last delimiter, or 0 if there is none, then */
|
||||
/* copy the first b code points to the output. */
|
||||
|
||||
for (b = j = 0; j < input_length; ++j) if (delim(input[j])) b = j;
|
||||
if (b > max_out) return punycode_big_output;
|
||||
|
||||
for (j = 0; j < b; ++j) {
|
||||
if (case_flags) case_flags[out] = flagged(input[j]);
|
||||
if (!basic(input[j])) return punycode_bad_input;
|
||||
output[out++] = input[j];
|
||||
}
|
||||
|
||||
/* Main decoding loop: Start just after the last delimiter if any */
|
||||
/* basic code points were copied; start at the beginning otherwise. */
|
||||
|
||||
for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) {
|
||||
|
||||
/* in is the index of the next character to be consumed, and */
|
||||
/* out is the number of code points in the output array. */
|
||||
|
||||
/* Decode a generalized variable-length integer into delta, */
|
||||
/* which gets added to i. The overflow checking is easier */
|
||||
/* if we increase i as we go, then subtract off its starting */
|
||||
/* value at the end to obtain delta. */
|
||||
|
||||
for (oldi = i, w = 1, k = base; ; k += base) {
|
||||
if (in >= input_length) return punycode_bad_input;
|
||||
digit = decode_digit(input[in++]);
|
||||
if (digit >= base) return punycode_bad_input;
|
||||
if (digit > (maxint - i) / w) return punycode_overflow;
|
||||
i += digit * w;
|
||||
t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
|
||||
k >= bias + tmax ? tmax : k - bias;
|
||||
if (digit < t) break;
|
||||
if (w > maxint / (base - t)) return punycode_overflow;
|
||||
w *= (base - t);
|
||||
}
|
||||
|
||||
bias = adapt(i - oldi, out + 1, oldi == 0);
|
||||
|
||||
/* i was supposed to wrap around from out+1 to 0, */
|
||||
/* incrementing n each time, so we'll fix that now: */
|
||||
|
||||
if (i / (out + 1) > maxint - n) return punycode_overflow;
|
||||
n += i / (out + 1);
|
||||
i %= (out + 1);
|
||||
|
||||
/* Insert n at position i of the output: */
|
||||
|
||||
/* not needed for Punycode: */
|
||||
/* if (decode_digit(n) <= base) return punycode_invalid_input; */
|
||||
if (out >= max_out) return punycode_big_output;
|
||||
|
||||
if (case_flags) {
|
||||
memmove(case_flags + i + 1, case_flags + i, out - i);
|
||||
/* Case of last character determines uppercase flag: */
|
||||
case_flags[i] = flagged(input[in - 1]);
|
||||
}
|
||||
|
||||
memmove(output + i + 1, output + i, (out - i) * sizeof *output);
|
||||
output[i++] = n;
|
||||
}
|
||||
|
||||
*output_length = out;
|
||||
return punycode_success;
|
||||
}
|
101
icu4c/source/test/intltest/punyref.h
Normal file
101
icu4c/source/test/intltest/punyref.h
Normal file
|
@ -0,0 +1,101 @@
|
|||
/*
|
||||
punycode.c from draft-ietf-idn-punycode-03
|
||||
http://www.nicemice.net/idn/
|
||||
Adam M. Costello
|
||||
http://www.nicemice.net/amc/
|
||||
|
||||
This is ANSI C code (C89) implementing
|
||||
Punycode (draft-ietf-idn-punycode-03).
|
||||
|
||||
*/
|
||||
#ifndef _PUNYREF_H
|
||||
#define _PUNYREF_H
|
||||
|
||||
/************************************************************/
|
||||
/* Public interface (would normally go in its own .h file): */
|
||||
|
||||
#include <limits.h>
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
/* Result codes returned by punycode_encode() and punycode_decode(). */
enum punycode_status {
    /* Conversion completed. */
    punycode_success = 0,
    /* Input is invalid. */
    punycode_bad_input,
    /* Output would exceed the space provided. */
    punycode_big_output,
    /* Input needs wider integers to process. */
    punycode_overflow
};
|
||||
|
||||
|
||||
/* punycode_uint is unsigned long on Windows builds and wherever     */
/* unsigned int cannot hold (1 << 26) - 1; otherwise unsigned int.   */
#if defined(_WIN32) || defined(WIN32) || UINT_MAX < (1 << 26) - 1
typedef unsigned long punycode_uint;
#else
typedef unsigned int punycode_uint;
#endif
|
||||
|
||||
|
||||
U_CFUNC enum punycode_status punycode_encode(
|
||||
punycode_uint input_length,
|
||||
const punycode_uint input[],
|
||||
const unsigned char case_flags[],
|
||||
punycode_uint *output_length,
|
||||
char output[] );
|
||||
|
||||
/* punycode_encode() converts Unicode to Punycode. The input */
|
||||
/* is represented as an array of Unicode code points (not code */
|
||||
/* units; surrogate pairs are not allowed), and the output */
|
||||
/* will be represented as an array of ASCII code points. The */
|
||||
/* output string is *not* null-terminated; it will contain */
|
||||
/* zeros if and only if the input contains zeros. (Of course */
|
||||
/* the caller can leave room for a terminator and add one if */
|
||||
/* needed.) The input_length is the number of code points in */
|
||||
/* the input. The output_length is an in/out argument: the */
|
||||
/* caller passes in the maximum number of code points that it */
|
||||
/* can receive, and on successful return it will contain the */
|
||||
/* number of code points actually output. The case_flags array */
|
||||
/* holds input_length boolean values, where nonzero suggests that */
|
||||
/* the corresponding Unicode character be forced to uppercase */
|
||||
/* after being decoded (if possible), and zero suggests that */
|
||||
/* it be forced to lowercase (if possible). ASCII code points */
|
||||
/* are encoded literally, except that ASCII letters are forced */
|
||||
/* to uppercase or lowercase according to the corresponding */
|
||||
/* uppercase flags. If case_flags is a null pointer then ASCII */
|
||||
/* letters are left as they are, and other code points are */
|
||||
/* treated as if their uppercase flags were zero. The return */
|
||||
/* value can be any of the punycode_status values defined above */
|
||||
/* except punycode_bad_input; if not punycode_success, then */
|
||||
/* output_size and output might contain garbage. */
|
||||
|
||||
U_CFUNC enum punycode_status punycode_decode(
|
||||
punycode_uint input_length,
|
||||
const char input[],
|
||||
punycode_uint *output_length,
|
||||
punycode_uint output[],
|
||||
unsigned char case_flags[] );
|
||||
|
||||
/* punycode_decode() converts Punycode to Unicode. The input is */
|
||||
/* represented as an array of ASCII code points, and the output */
|
||||
/* will be represented as an array of Unicode code points. The */
|
||||
/* input_length is the number of code points in the input. The */
|
||||
/* output_length is an in/out argument: the caller passes in */
|
||||
/* the maximum number of code points that it can receive, and */
|
||||
/* on successful return it will contain the actual number of */
|
||||
/* code points output. The case_flags array needs room for at */
|
||||
/* least output_length values, or it can be a null pointer if the */
|
||||
/* case information is not needed. A nonzero flag suggests that */
|
||||
/* the corresponding Unicode character be forced to uppercase */
|
||||
/* by the caller (if possible), while zero suggests that it be */
|
||||
/* forced to lowercase (if possible). ASCII code points are */
|
||||
/* output already in the proper case, but their flags will be set */
|
||||
/* appropriately so that applying the flags would be harmless. */
|
||||
/* The return value can be any of the punycode_status values */
|
||||
/* defined above; if not punycode_success, then output_length, */
|
||||
/* output, and case_flags might contain garbage. On success, the */
|
||||
/* decoder will never need to write an output_length greater than */
|
||||
/* input_length, because of how the encoding is defined. */
|
||||
#endif
|
||||
|
513
icu4c/source/test/intltest/testidn.cpp
Normal file
513
icu4c/source/test/intltest/testidn.cpp
Normal file
|
@ -0,0 +1,513 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001-2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: testidn.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003-02-06
|
||||
* created by: Ram Viswanadha
|
||||
*
|
||||
* This test reads the rfc3454_*.txt files, parses them, and compares
* the unassigned/prohibited code point ranges and the mappings they
* list against the contents of the binary IDNA data file (uidna.icu).
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unewdata.h"
|
||||
#include "uoptions.h"
|
||||
#include "uparse.h"
|
||||
#include "utrie.h"
|
||||
#include "umutex.h"
|
||||
#include "sprpimpl.h"
|
||||
#include "testidna.h"
|
||||
|
||||
#ifdef WIN32
|
||||
# pragma warning(disable: 4100)
|
||||
#endif
|
||||
|
||||
UBool beVerbose=FALSE, haveCopyright=TRUE;
|
||||
|
||||
/* prototypes --------------------------------------------------------------- */
|
||||
|
||||
|
||||
static UBool isDataLoaded = FALSE;
|
||||
static UTrie idnTrie={ 0,0,0,0,0,0,0 };
|
||||
static UDataMemory *idnData=NULL;
|
||||
static UErrorCode dataErrorCode =U_ZERO_ERROR;
|
||||
|
||||
|
||||
static const uint16_t* mappingData = NULL;
|
||||
static int32_t indexes[_IDNA_INDEX_TOP]={ 0 };
|
||||
|
||||
|
||||
static void
|
||||
parseMappings(const char *filename, UBool withNorm, UBool reportError,TestIDNA& test, UErrorCode *pErrorCode);
|
||||
|
||||
static void
|
||||
parseTable(const char *filename, UBool isUnassigned, TestIDNA& test, UErrorCode *pErrorCode);
|
||||
|
||||
static UBool loadIDNData(UErrorCode &errorCode);
|
||||
|
||||
static UBool cleanup();
|
||||
|
||||
static void
|
||||
compareMapping(uint32_t codepoint, uint32_t* mapping, int32_t mapLength,
|
||||
UBool withNorm, UErrorCode *status);
|
||||
|
||||
static void
|
||||
compareFlagsForRange(uint32_t start, uint32_t end,
|
||||
UBool isUnassigned, UErrorCode *status);
|
||||
|
||||
static void
|
||||
testAllCodepoints(TestIDNA& test);
|
||||
|
||||
static TestIDNA* pTestIDNA =NULL;
|
||||
|
||||
/* RFC 3454 table files, in the order testData() processes them. */
static const char* fileNames[] = {
    /* unassigned code points */
    "rfc3454_A_1.txt",
    /* prohibited code points */
    "rfc3454_C_X.txt",
    /* case mappings used when normalization is turned off */
    "rfc3454_B_1.txt",
    /* case mappings used when normalization is turned on */
    "rfc3454_B_2.txt",
    /* "NormalizationCorrections.txt", contains NFKC case mappings which are not included in UTR 21 */
};
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
static UOption options[]={
|
||||
UOPTION_HELP_H,
|
||||
UOPTION_HELP_QUESTION_MARK,
|
||||
UOPTION_VERBOSE,
|
||||
UOPTION_COPYRIGHT,
|
||||
UOPTION_DESTDIR,
|
||||
UOPTION_SOURCEDIR,
|
||||
{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }
|
||||
};
|
||||
|
||||
/* file definitions */
|
||||
#define DATA_NAME "uidna"
|
||||
#define DATA_TYPE "icu"
|
||||
|
||||
#define MISC_DIR "misc"
|
||||
|
||||
extern int
|
||||
testData(TestIDNA& test) {
|
||||
char filename[300];
|
||||
//TODO get the srcDir dynamically
|
||||
const char *srcDir=IntlTest::pathToDataDirectory(), *destDir=NULL, *suffix=NULL;
|
||||
char *basename=NULL;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
char *saveBasename =NULL;
|
||||
|
||||
loadIDNData(errorCode);
|
||||
if(U_FAILURE(dataErrorCode)){
|
||||
test.errln( "Could not load data. Error: %s\n",u_errorName(dataErrorCode));
|
||||
return dataErrorCode;
|
||||
}
|
||||
|
||||
//initialize
|
||||
pTestIDNA = &test;
|
||||
/* prepare the filename beginning with the source dir */
|
||||
if(srcDir[0] == U_FILE_SEP_CHAR){
|
||||
filename[0]= 0x2E;
|
||||
uprv_strcat(filename+1,srcDir);
|
||||
}else if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL){
|
||||
filename[0] = 0x2E;
|
||||
filename[1] = U_FILE_SEP_CHAR;
|
||||
uprv_strcpy(filename+2,srcDir);
|
||||
}else{
|
||||
uprv_strcpy(filename, srcDir);
|
||||
}
|
||||
|
||||
/* process unassigned */
|
||||
basename=filename+uprv_strlen(filename);
|
||||
if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
|
||||
*basename++=U_FILE_SEP_CHAR;
|
||||
}
|
||||
|
||||
uprv_strcpy(basename,MISC_DIR);
|
||||
basename= basename + uprv_strlen(MISC_DIR);
|
||||
*basename++ = U_FILE_SEP_CHAR;
|
||||
|
||||
uprv_strcpy(basename,fileNames[0]);
|
||||
parseTable(filename,TRUE, test,&errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
test.errln( "Could not open file %s for reading \n", filename);
|
||||
return errorCode;
|
||||
}
|
||||
/* process prohibited */
|
||||
uprv_strcpy(basename,fileNames[1]);
|
||||
parseTable(filename,FALSE, test, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
test.errln( "Could not open file %s for reading \n", filename);
|
||||
return errorCode;
|
||||
}
|
||||
|
||||
/* process mappings */
|
||||
uprv_strcpy(basename,fileNames[2]);
|
||||
parseMappings(filename, FALSE, FALSE,test, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
test.errln( "Could not open file %s for reading \n", filename);
|
||||
return errorCode;
|
||||
}
|
||||
uprv_strcpy(basename,fileNames[3]);
|
||||
parseMappings(filename, TRUE, FALSE,test, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
test.errln( "Could not open file %s for reading \n", filename);
|
||||
return errorCode;
|
||||
}
|
||||
|
||||
testAllCodepoints(test);
|
||||
|
||||
cleanup();
|
||||
pTestIDNA = NULL;
|
||||
return errorCode;
|
||||
}
|
||||
|
||||
static void U_CALLCONV
|
||||
caseMapLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
uint32_t mapping[40];
|
||||
char *end, *s;
|
||||
uint32_t code;
|
||||
int32_t length;
|
||||
UBool* mapWithNorm = (UBool*) context;
|
||||
|
||||
/* ignore First and Last entries for ranges */
|
||||
if( *fields[1][0]=='<' &&
|
||||
(length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
|
||||
(0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the character code, field 0 */
|
||||
code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
|
||||
if(end<=fields[0][0] || end!=fields[0][1]) {
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
|
||||
}
|
||||
|
||||
s = fields[1][0];
|
||||
/* parse the mapping string */
|
||||
length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode);
|
||||
|
||||
/* store the mapping */
|
||||
|
||||
compareMapping(code,mapping, length, *mapWithNorm, pErrorCode);
|
||||
}
|
||||
|
||||
static void
|
||||
parseMappings(const char *filename,UBool withNorm, UBool reportError, TestIDNA& test, UErrorCode *pErrorCode) {
|
||||
char *fields[3][2];
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 3, caseMapLineFn, &withNorm, pErrorCode);
|
||||
|
||||
//fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);
|
||||
|
||||
if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
|
||||
test.errln( "genidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
|
||||
}
|
||||
}
|
||||
|
||||
/* parser for UnicodeData.txt ----------------------------------------------- */
|
||||
|
||||
static void U_CALLCONV
|
||||
unicodeDataLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
int32_t length;
|
||||
uint32_t rangeStart=0,rangeEnd =0;
|
||||
UBool* isUnassigned = (UBool*) context;
|
||||
|
||||
/* ignore First and Last entries for ranges */
|
||||
if( *fields[1][0]=='<' &&
|
||||
(length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
|
||||
(0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
u_parseCodePointRange(fields[0][0], &rangeStart,&rangeEnd, pErrorCode);
|
||||
|
||||
if(U_FAILURE(*pErrorCode)){
|
||||
*pErrorCode = U_PARSE_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
compareFlagsForRange(rangeStart,rangeEnd,*isUnassigned, pErrorCode);
|
||||
|
||||
}
|
||||
|
||||
static void
|
||||
parseTable(const char *filename,UBool isUnassigned,TestIDNA& test, UErrorCode *pErrorCode) {
|
||||
char *fields[1][2];
|
||||
int32_t len=0;
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 1, unicodeDataLineFn, &isUnassigned, pErrorCode);
|
||||
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
test.errln( "genidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
testAllCodepoints(TestIDNA& test){
|
||||
if(isDataLoaded){
|
||||
uint32_t i = 0;
|
||||
int32_t unassigned = 0;
|
||||
int32_t prohibited = 0;
|
||||
int32_t mappedWithNorm = 0;
|
||||
int32_t mapped = 0;
|
||||
int32_t noValueInTrie = 0;
|
||||
|
||||
|
||||
for(i=0;i<=0x10FFFF;i++){
|
||||
uint32_t result = 0;
|
||||
UTRIE_GET16(&idnTrie,i, result);
|
||||
|
||||
if(result != UIDNA_NO_VALUE ){
|
||||
if((result & 0x07) == UIDNA_UNASSIGNED){
|
||||
unassigned++;
|
||||
}
|
||||
if((result & 0x07) == UIDNA_PROHIBITED){
|
||||
prohibited++;
|
||||
}
|
||||
if((result>>5) == _IDNA_MAP_TO_NOTHING){
|
||||
mapped++;
|
||||
}
|
||||
if((result & 0x07) == UIDNA_MAP_NFKC){
|
||||
mappedWithNorm++;
|
||||
}
|
||||
}else{
|
||||
noValueInTrie++;
|
||||
if(result > 0){
|
||||
test.errln("The return value for 0x%06X is wrong. %i\n",i,result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test.logln("Number of Unassinged code points : %i \n",unassigned);
|
||||
test.logln("Number of Prohibited code points : %i \n",prohibited);
|
||||
test.logln("Number of Mapped code points : %i \n",mapped);
|
||||
test.logln("Number of Mapped with NFKC code points : %i \n",mappedWithNorm);
|
||||
test.logln("Number of code points that have no value in Trie: %i \n",noValueInTrie);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Splits a trie value into its three parts:
 *   flag   - bits 0..2
 *   length - bits 3..4
 *   index  - all bits from bit 5 upwards
 */
static inline void getValues(uint32_t result, int8_t& flag,
                             int8_t& length, int32_t& index){
    const uint32_t flagMask   = 0x07;
    const uint32_t lengthMask = 0x03;
    const int lengthShift = 3;
    const int indexShift  = 5;

    flag   = (int8_t) (result & flagMask);
    length = (int8_t) ((result >> lengthShift) & lengthMask);
    index  = (result >> indexShift);
}
|
||||
|
||||
static void
|
||||
compareMapping(uint32_t codepoint, uint32_t* mapping,int32_t mapLength,
|
||||
UBool withNorm, UErrorCode *status){
|
||||
if(isDataLoaded){
|
||||
uint32_t result = 0;
|
||||
UTRIE_GET16(&idnTrie,codepoint, result);
|
||||
|
||||
int8_t flag, length;
|
||||
int32_t index;
|
||||
getValues(result,flag,length, index);
|
||||
|
||||
|
||||
if(withNorm){
|
||||
if(flag != UIDNA_MAP_NFKC){
|
||||
pTestIDNA->errln( "Did not get the assigned flag for codepoint 0x%08X. Expected: %i Got: %i\n",codepoint, UIDNA_MAP_NFKC, flag);
|
||||
}
|
||||
}else{
|
||||
if(flag=UIDNA_NO_VALUE || flag == UIDNA_PROHIBITED){
|
||||
if(index != _IDNA_MAP_TO_NOTHING ){
|
||||
pTestIDNA->errln( "Did not get the assigned flag for codepoint 0x%08X. Expected: %i Got: %i\n", codepoint, _IDNA_MAP_TO_NOTHING, index);
|
||||
}
|
||||
}
|
||||
}
|
||||
if(length ==_IDNA_LENGTH_IN_MAPPING_TABLE){
|
||||
length = (int8_t)mappingData[index];
|
||||
index++;
|
||||
}
|
||||
|
||||
if(mapLength != length){
|
||||
pTestIDNA->errln( "Did not get the expected length. Expected: %i Got: %i\n", mapLength, length);
|
||||
}
|
||||
|
||||
|
||||
for(int8_t i =0; i< mapLength; i++){
|
||||
if(mapping[i] <= 0xFFFF){
|
||||
if(mappingData[index+i] != (uint16_t)mapping[i]){
|
||||
pTestIDNA->errln("Did not get the expected result. Expected: 0x%04X Got: 0x%04X \n", mapping[i], mappingData[index+i]);
|
||||
}
|
||||
}else{
|
||||
UChar lead = UTF16_LEAD(mapping[i]);
|
||||
UChar trail = UTF16_TRAIL(mapping[i]);
|
||||
if(mappingData[index+i] != lead ||
|
||||
mappingData[index+i+1] != trail){
|
||||
pTestIDNA->errln( "Did not get the expected result. Expected: 0x%04X 0x%04X Got: 0x%04X 0x%04X", lead, trail, mappingData[index+i], mappingData[index+i+1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void
|
||||
compareFlagsForRange(uint32_t start, uint32_t end,
|
||||
UBool isUnassigned, UErrorCode *status){
|
||||
if(isDataLoaded){
|
||||
uint32_t result =0 ;
|
||||
while(start < end+1){
|
||||
UTRIE_GET16(&idnTrie,start, result);
|
||||
if(isUnassigned){
|
||||
if(result != UIDNA_UNASSIGNED){
|
||||
pTestIDNA->errln( "UIDNA_UASSIGNED flag failed for 0x%06X. Expected: %04X Got: %04X\n",start,UIDNA_UNASSIGNED, result);
|
||||
}
|
||||
}else{
|
||||
if((result & 0x03) != UIDNA_PROHIBITED){
|
||||
pTestIDNA->errln( "UIDNA_PROHIBITED flag failed for 0x%06X. Expected: %04X Got: %04X\n\n",start,UIDNA_PROHIBITED, result);
|
||||
}
|
||||
}
|
||||
start++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Closes the loaded data, if any, and resets the load state so that
 * loadIDNData() opens it again on its next call.  Always returns TRUE.
 */
UBool
cleanup() {
    UDataMemory *openData = idnData;

    idnData=NULL;
    if(openData!=NULL) {
        udata_close(openData);
    }

    isDataLoaded=FALSE;
    dataErrorCode=U_ZERO_ERROR;

    return TRUE;
}
|
||||
|
||||
/*
 * Callback for udata_openChoice(): accepts a data item only when its
 * header matches this platform's endianness and charset family, its data
 * format is "IDNA", its format version starts with 2, and the trie shift
 * values in the format version equal UTRIE_SHIFT and UTRIE_INDEX_SHIFT.
 */
static UBool U_CALLCONV
isAcceptable(void * /* context */,
             const char * /* type */, const char * /* name */,
             const UDataInfo *pInfo) {
    const UBool platformMatches =
        pInfo->size>=20 &&
        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily==U_CHARSET_FAMILY;
    if(!platformMatches) {
        return FALSE;
    }

    /* dataFormat="IDNA" 0x49, 0x44, 0x4e, 0x41 */
    const UBool isIDNAFormat =
        pInfo->dataFormat[0]==0x49 &&
        pInfo->dataFormat[1]==0x44 &&
        pInfo->dataFormat[2]==0x4e &&
        pInfo->dataFormat[3]==0x41;
    if(!isIDNAFormat) {
        return FALSE;
    }

    if(pInfo->formatVersion[0]!=2) {
        return FALSE;
    }
    if(pInfo->formatVersion[2]!=UTRIE_SHIFT) {
        return FALSE;
    }
    if(pInfo->formatVersion[3]!=UTRIE_INDEX_SHIFT) {
        return FALSE;
    }
    return TRUE;
}
|
||||
|
||||
/* idnTrie: the folding offset is the lead FCD value itself */
|
||||
static int32_t U_CALLCONV
|
||||
getFoldingOffset(uint32_t data) {
|
||||
if(data&0x8000) {
|
||||
return (int32_t)(data&0x7fff);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Opens the IDNA data item (DATA_NAME "." DATA_TYPE) from the default
 * ICU data, unserializes its trie into idnTrie and sets up indexes and
 * mappingData.  Does nothing when the data is already loaded.
 *
 * errorCode  in/out status; if it already holds a failure nothing is
 *            loaded.  Every failure is also stored in dataErrorCode,
 *            which is what testData() checks.
 *
 * Returns TRUE when the data is loaded, FALSE otherwise.
 */
static UBool
loadIDNData(UErrorCode &errorCode) {
    /* load the IDNA data from file */
    if(isDataLoaded==FALSE) {
        UTrie _idnTrie={ 0,0,0,0,0,0,0 };
        UDataMemory *data;
        const int32_t *p=NULL;
        const uint8_t *pb;

        if(U_FAILURE(errorCode)) {
            dataErrorCode=errorCode;
            return FALSE;
        }

        /* open the data outside the mutex block;          */
        /* a NULL path selects the default ICU data        */
        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
        dataErrorCode=errorCode;
        if(U_FAILURE(errorCode)) {
            return isDataLoaded=FALSE;
        }

        /* the data starts with _IDNA_INDEX_TOP index values, followed by the trie */
        p=(const int32_t *)udata_getMemory(data);
        pb=(const uint8_t *)(p+_IDNA_INDEX_TOP);
        utrie_unserialize(&_idnTrie, pb, p[_IDNA_INDEX_TRIE_SIZE], &errorCode);
        _idnTrie.getFoldingOffset=getFoldingOffset;

        if(U_FAILURE(errorCode)) {
            dataErrorCode=errorCode;
            udata_close(data);
            return isDataLoaded=FALSE;
        }

        /* in the mutex block, set the data for this process */
        umtx_lock(NULL);
        if(idnData==NULL) {
            idnData=data;
            data=NULL;
            uprv_memcpy(&indexes, p, sizeof(indexes));
            uprv_memcpy(&idnTrie, &_idnTrie, sizeof(UTrie));
        } else {
            p=(const int32_t *)udata_getMemory(idnData);
        }
        umtx_unlock(NULL);

        /* the mapping table follows the trie */
        mappingData=(const uint16_t *)((const uint8_t *)(p+_IDNA_INDEX_TOP)+indexes[_IDNA_INDEX_TRIE_SIZE]);

        isDataLoaded = TRUE;

        /* if a different thread set it first, then close the extra data */
        if(data!=NULL) {
            udata_close(data); /* NULL if it was set correctly */
        }
    }

    return isDataLoaded;
}
|
||||
|
||||
/*
|
||||
* Hey, Emacs, please set the following:
|
||||
*
|
||||
* Local Variables:
|
||||
* indent-tabs-mode: nil
|
||||
* End:
|
||||
*
|
||||
*/
|
1142
icu4c/source/test/intltest/testidna.cpp
Normal file
1142
icu4c/source/test/intltest/testidna.cpp
Normal file
File diff suppressed because it is too large
Load diff
96
icu4c/source/test/intltest/testidna.h
Normal file
96
icu4c/source/test/intltest/testidna.h
Normal file
|
@ -0,0 +1,96 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: testidna.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003feb1
|
||||
* created by: Ram Viswanadha
|
||||
*/
|
||||
|
||||
#ifndef TESTIDNA_H
|
||||
#define TESTIDNA_H
|
||||
|
||||
#include "sprpimpl.h"
|
||||
#include "intltest.h"
|
||||
#include "unicode/parseerr.h"
|
||||
|
||||
typedef int32_t
|
||||
(*TestFunc) ( const UChar *src, int32_t srcLength,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
int32_t options, UParseError *parseError,
|
||||
UErrorCode *status);
|
||||
typedef int32_t
|
||||
(*CompareFunc) (const UChar *s1, int32_t s1Len,
|
||||
const UChar *s2, int32_t s2Len,
|
||||
int32_t options,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
|
||||
|
||||
// test the API
|
||||
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @summary General test of HexadecimalToUnicodeTransliterator
|
||||
*/
|
||||
class TestIDNA : public IntlTest {
|
||||
public:
|
||||
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=NULL);
|
||||
void TestDataFile();
|
||||
void TestToASCII();
|
||||
void TestToUnicode();
|
||||
void TestIDNToUnicode();
|
||||
void TestIDNToASCII();
|
||||
void TestCompare();
|
||||
void TestErrorCases();
|
||||
void TestChaining();
|
||||
void TestRootLabelSeparator();
|
||||
void TestCompareReferenceImpl();
|
||||
void TestRefIDNA();
|
||||
void TestIDNAMonkeyTest();
|
||||
private:
|
||||
void testToASCII(const char* testName, TestFunc func);
|
||||
void testToUnicode(const char* testName, TestFunc func);
|
||||
void testIDNToUnicode(const char* testName, TestFunc func);
|
||||
void testIDNToASCII(const char* testName, TestFunc func);
|
||||
void testCompare(const char* testName, CompareFunc func);
|
||||
void testChaining(const char* toASCIIName, TestFunc toASCII,
|
||||
const char* toUnicodeName, TestFunc toUnicode);
|
||||
|
||||
// main testing functions
|
||||
void testAPI(const UChar *src, const UChar *expected, const char *testName,
|
||||
UBool useSTD3ASCIIRules, UErrorCode expectedStatus,
|
||||
UBool doCompare, TestFunc func);
|
||||
|
||||
void testCompare(const UChar* s1, int32_t s1Len,
|
||||
const UChar* s2, int32_t s2Len,
|
||||
const char* testName, CompareFunc func,
|
||||
UBool isEqual);
|
||||
|
||||
void testErrorCases(const char* toASCIIName, TestFunc toASCII,
|
||||
const char* IDNToASCIIName, TestFunc IDNToASCII,
|
||||
const char* IDNToUnicodeName, TestFunc IDNToUnicode);
|
||||
|
||||
void testChaining(UChar* src,int32_t numIterations,const char* testName,
|
||||
UBool useSTD3ASCIIRules, UBool caseInsensitive, TestFunc func);
|
||||
|
||||
void testRootLabelSeparator(const char* testName, CompareFunc func,
|
||||
const char* IDNToASCIIName, TestFunc IDNToASCII,
|
||||
const char* IDNToUnicodeName, TestFunc IDNToUnicode);
|
||||
|
||||
void testCompareReferenceImpl(const UChar* src, int32_t srcLen);
|
||||
};
|
||||
|
||||
// test the TRIE data structure
|
||||
// Defined in testidn.cpp. Reports problems through test.errln() and
// returns a UErrorCode value converted to int.
int testData(TestIDNA& test);
|
||||
|
||||
#endif
|
Loading…
Add table
Reference in a new issue