ICU-96 correct handling of level separator for quad level, sortkeytostring private function, some tweaks for CE generation, rule parser factored out

X-SVN-Rev: 4189
This commit is contained in:
Vladimir Weinstein 2001-03-20 00:56:37 +00:00
parent a6265c42b8
commit cda9dc782f
6 changed files with 868 additions and 248 deletions

View file

@ -1431,7 +1431,7 @@ ucol_calcSortKey(const UCollator *coll,
uint8_t *frenchEndPtr = NULL;
uint32_t caseShift = 0;
sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + (compareQuad?0:1) + (compareIdent?1:0));
sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + (qShifted?1:0)/*(compareQuad?0:1)*/ + (compareIdent?1:0));
collIterate s;
init_collIterate(coll, (UChar *)source, len, &s, FALSE);
@ -1780,7 +1780,7 @@ ucol_calcSortKey(const UCollator *coll,
if(sortKeySize <= resultLength) {
uprv_memcpy(primaries, terStart, tersize);
primaries += tersize;
if(compareQuad == 0) {
if(/*compareQuad == 0*/qShifted == TRUE) {
if(count4 > 0) {
while (count4 >= UCOL_BOT_COUNT4) {
*quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
@ -2198,6 +2198,65 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
return sortKeySize;
}
/**
 * Debugging helper: writes a textual, hexadecimal rendering of a sort key.
 *
 * Output shape: "[" + bytes of each level as "XX " + the level separator as
 * two hex digits, with " . " between levels, and a closing "]".
 *
 * @param coll    collator whose strength, caseLevel and alternateHandling
 *                settings decide how many levels are expected in the key
 * @param sortkey zero-terminated sort key bytes
 * @param buffer  destination; NOT bounds-checked. Every key byte can expand
 *                to three characters, plus three characters per level
 *                separator and the two brackets.
 * @param len     currently neither read nor written
 * @return        buffer
 */
U_CAPI char U_EXPORT2 *ucol_sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len) {
  uint32_t strength = UCOL_PRIMARY;
  UBool doneCase = FALSE;
  UBool sawTerminator = FALSE;
  char *current = buffer;
  const uint8_t *currentSk = sortkey;

  (void)len;

  sprintf(current, "[");
  current++;

  while(strength <= UCOL_QUATERNARY && strength <= coll->strength) {
    if(strength > UCOL_PRIMARY) {
      sprintf(current, " . ");
      current += 3;
    }

    while(*currentSk != 0x01 && *currentSk != 0x00) { /* print a level */
      sprintf(current, "%02X ", *currentSk++);
      current += 3;
    }

    /* With the case level on, the secondary slot is visited twice: */
    /* once for the secondaries and once more for the case bytes.   */
    if(coll->caseLevel == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) {
      doneCase = TRUE;
    } else if(coll->caseLevel == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) {
      strength++;
    }

    /* Remember whether the byte about to be printed is the terminator, */
    /* so that nothing is read beyond the end of the key afterwards.    */
    sawTerminator = (UBool)(*currentSk == 0x00);
    sprintf(current, "%02X", *(currentSk++)); /* This should print '01' */
    current += 2;

    if(sawTerminator) {
      break;
    }
    if(strength == UCOL_QUATERNARY && coll->alternateHandling == UCOL_NON_IGNORABLE) {
      break;
    }
  }

  if(coll->strength == UCOL_IDENTICAL && !sawTerminator) {
    sprintf(current, " . ");
    current += 3;
    while(*currentSk != 0) {
      if(*currentSk == 0x01) {
        sprintf(current, "%02X", *(currentSk++));
        current += 2;
        if(*currentSk == 0) {
          break; /* separator was the last byte before the terminator */
        }
      }
      if(*(currentSk+1) == 0) {
        /* odd trailing byte: print it alone rather than pairing it */
        /* with the terminator and stepping past the end of the key */
        sprintf(current, "%02X ", *currentSk++);
        current += 3;
        break;
      }
      sprintf(current, "%02X%02X ", *currentSk, *(currentSk+1));
      current += 5;
      currentSk += 2;
    }
    sprintf(current, "%02X", *(currentSk++)); /* This should print '00' */
    current += 2;
  }

  sprintf(current, "]");
  return buffer;
}
/* This is a trick string compare function that goes in and uses sortkeys to compare */
/* It is used when compare gets in trouble and needs to bail out */
UCollationResult ucol_compareUsingSortKeys(const UCollator *coll,

View file

@ -309,6 +309,10 @@ U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_
}
}
if(low == 0) {
low = 0x01000000;
}
if(strength == UCOL_SECONDARY) { /* similar as simple */
if(low >= UCOL_COMMON_BOT2<<24 && low < UCOL_COMMON_TOP2<<24) {
low = UCOL_COMMON_TOP2<<24;

View file

@ -549,6 +549,7 @@ uint32_t ucol_getIncrementalUCA(UChar ch, incrementalContext *collationSource, U
int32_t ucol_getIncrementalSpecialCE(const UCollator *coll, uint32_t CE, incrementalContext *ctx, UErrorCode *status);
void ucol_updateInternalState(UCollator *coll);
uint32_t ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status);
/* Debugging aid: formats the zero-terminated `sortkey` as hexadecimal text */
/* into `buffer` and returns `buffer`. The levels that are printed depend   */
/* on the strength, caseLevel and alternateHandling of `coll`.              */
/* NOTE(review): the implementation does no bounds checking on `buffer` and */
/* does not appear to use `len` - confirm before relying on it.             */
U_CAPI char U_EXPORT2 *ucol_sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len);
#endif

View file

@ -312,6 +312,254 @@ UBool ucol_uprv_tok_readAndSetOption(UCATableHeader *image, const UChar* start,
#define UCOL_TOK_UNSET 0xFFFFFFFF
#define UCOL_TOK_RESET 0xDEADBEEF
/**
 * Scans one rule token starting at src->current.
 *
 * A token is a strength operator ('&', '<', '<<', '<<<', ';', ',', '=')
 * followed by its characters and, after '/', an optional extension.  The
 * scan stops, without consuming it, at the next strength operator.
 *
 * Side effects visible here:
 *  - src->current is advanced over the consumed text;
 *  - quoted text is copied to src->extraCurrent (which is advanced);
 *  - '[...]' options are passed to ucol_uprv_tok_readAndSetOption;
 *  - '@' seen before any strength sets src->image->frenchCollation.
 *
 * @param src          parser state
 * @param strength     out: UCOL_PRIMARY..UCOL_IDENTICAL or UCOL_TOK_RESET
 * @param chOffset     out: offset of the token characters from src->source
 * @param chLen        out: number of token characters
 * @param exOffset     out: offset of the extension from src->source
 * @param exLen        out: number of extension characters
 * @param varT         out: variable-top flag as set by option parsing
 * @param top_         out: TRUE when the token resets to "top"
 * @param startOfRules when TRUE, a leading '=', ',', ';' or '<' is reported
 *                     as a reset to top and is left unconsumed
 * @param status       set to U_INVALID_FORMAT_ERROR on malformed input
 * @return src->current after the token, or NULL when no strength was found
 *         or an error occurred; the out parameters are written only on a
 *         non-NULL return.
 *
 * NOTE(review): nothing grows the extra buffer when src->extraCurrent
 * reaches src->extraEnd (see the "reallocate" stub below).
 */
const UChar *ucol_tok_parseNextToken(UColTokenParser *src,
                        uint32_t *strength,
                        uint32_t *chOffset, uint32_t *chLen,
                        uint32_t *exOffset, uint32_t *exLen,
                        UBool *varT, UBool *top_,
                        UBool startOfRules,
                        UErrorCode *status) {
  /* parsing part */
  UBool variableTop = FALSE;
  UBool top = FALSE;
  UBool inChars = TRUE;
  UBool inQuote = FALSE;
  UBool wasInQuote = FALSE;
  UChar *optionEnd = NULL;
  uint32_t newCharsLen = 0, newExtensionLen = 0;
  uint32_t charsOffset = 0, extensionOffset = 0;
  uint32_t newStrength = UCOL_TOK_UNSET;

  while (src->current < src->end) {
    UChar ch = *(src->current);

    if (inQuote) {
      if (ch == 0x0027/*'\''*/) {
        inQuote = FALSE;
      } else {
        if ((newCharsLen == 0) || inChars) {
          if(newCharsLen == 0) {
            charsOffset = src->extraCurrent - src->source;
          }
          newCharsLen++;
        } else {
          if(newExtensionLen == 0) {
            extensionOffset = src->extraCurrent - src->source;
          }
          newExtensionLen++;
        }
      }
    } else {
      /* Sets the strength for this entry */
      switch (ch) {
        case 0x003D/*'='*/ :
          if (newStrength != UCOL_TOK_UNSET) {
            goto EndOfLoop;
          }

          /* if we start with strength, we'll reset to top */
          if(startOfRules == TRUE) {
            top = TRUE;
            newStrength = UCOL_TOK_RESET;
            goto EndOfLoop;
          }
          newStrength = UCOL_IDENTICAL;
          break;

        case 0x002C/*','*/:
          if (newStrength != UCOL_TOK_UNSET) {
            goto EndOfLoop;
          }

          /* if we start with strength, we'll reset to top */
          if(startOfRules == TRUE) {
            top = TRUE;
            newStrength = UCOL_TOK_RESET;
            goto EndOfLoop;
          }
          newStrength = UCOL_TERTIARY;
          break;

        case 0x003B/*';'*/:
          if (newStrength != UCOL_TOK_UNSET) {
            goto EndOfLoop;
          }

          /* if we start with strength, we'll reset to top */
          if(startOfRules == TRUE) {
            top = TRUE;
            newStrength = UCOL_TOK_RESET;
            goto EndOfLoop;
          }
          newStrength = UCOL_SECONDARY;
          break;

        case 0x003C/*'<'*/:
          if (newStrength != UCOL_TOK_UNSET) {
            goto EndOfLoop;
          }

          /* if we start with strength, we'll reset to top */
          if(startOfRules == TRUE) {
            top = TRUE;
            newStrength = UCOL_TOK_RESET;
            goto EndOfLoop;
          }
          /* before this, do a scan to verify whether this is */
          /* another strength; never look at or past src->end */
          if(src->current + 1 < src->end && *(src->current+1) == 0x003C) {
            src->current++;
            if(src->current + 1 < src->end && *(src->current+1) == 0x003C) {
              src->current++; /* three in a row! */
              newStrength = UCOL_TERTIARY;
            } else { /* two in a row */
              newStrength = UCOL_SECONDARY;
            }
          } else { /* just one */
            newStrength = UCOL_PRIMARY;
          }
          break;

        case 0x0026/*'&'*/:
          if (newStrength != UCOL_TOK_UNSET) {
            goto EndOfLoop;
          }

          newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
          break;

        case 0x005b/*'['*/:
          /* options - read an option, analyze it */
          if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
            ucol_uprv_tok_readAndSetOption(src->image, src->current, optionEnd, &variableTop, &top, status);
            src->current = optionEnd;
            if(top == TRUE) {
              /* [top] is only legal directly after a reset */
              if(newStrength == UCOL_TOK_RESET) {
                src->current++;
                goto EndOfLoop;
              } else {
                *status = U_INVALID_FORMAT_ERROR;
              }
            }
            if(U_FAILURE(*status)) {
              return NULL;
            }
          }
          break;

        /* Ignore the white spaces */
        case 0x0009/*'\t'*/:
        case 0x000C/*'\f'*/:
        case 0x000D/*'\r'*/:
        case 0x000A/*'\n'*/:
        case 0x0020/*' '*/:
          break; /* skip whitespace TODO use Unicode */

        case 0x002F/*'/'*/:
          /* This entry has an extension. */
          inChars = FALSE;
          break;

        /* found a quote, we're gonna start copying */
        case 0x0027/*'\''*/:
          inQuote = TRUE;
          wasInQuote = TRUE;

          if (newCharsLen == 0) {
            charsOffset = src->extraCurrent - src->source;
            newCharsLen++;
          } else if (inChars) { /* we're reading some chars */
            /* move the already-read characters into the extra buffer */
            charsOffset = src->extraCurrent - src->source;
            if(newCharsLen != 0) {
              uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
              src->extraCurrent += newCharsLen;
            }
            newCharsLen++;
          } else {
            if(newExtensionLen != 0) {
              uprv_memcpy(src->extraCurrent, src->current - newExtensionLen, newExtensionLen*sizeof(UChar));
              src->extraCurrent += newExtensionLen;
            }
            newExtensionLen++;
          }

          ch = *(++(src->current)); /*pattern[++index]; */
          break;

        /* '@' is french only if the strength is not currently set */
        /* if it is, it's just a regular character in collation rules */
        case 0x0040/*'@'*/:
          if (newStrength == UCOL_TOK_UNSET) {
            src->image->frenchCollation = UCOL_ON;
            break;
          }
          /* fall through: '@' after a strength is an ordinary character */

        default:
          if (newStrength == UCOL_TOK_UNSET) {
            *status = U_INVALID_FORMAT_ERROR;
            return NULL;
          }

          if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
            *status = U_INVALID_FORMAT_ERROR;
            return NULL;
          }

          if (inChars) {
            if(newCharsLen == 0) {
              charsOffset = src->current - src->source;
            }
            newCharsLen++;
          } else {
            if(newExtensionLen == 0) {
              extensionOffset = src->current - src->source;
            }
            newExtensionLen++;
          }
          break;
      }
    }

    /* once a quote has been seen, everything is mirrored to the extra buffer */
    if(wasInQuote) {
      if(ch != 0x27 || newCharsLen == 1) {
        *src->extraCurrent++ = ch;
      }
      if(src->extraCurrent == src->extraEnd) {
        /* reallocate */
      }
    }

    src->current++;
  }

 EndOfLoop:
  if (newStrength == UCOL_TOK_UNSET) {
    return NULL;
  }

  if (newCharsLen == 0 && top == FALSE) {
    *status = U_INVALID_FORMAT_ERROR;
    return NULL;
  }

  *strength = newStrength;
  *chOffset = charsOffset;
  *chLen = newCharsLen;
  *exOffset = extensionOffset;
  *exLen = newExtensionLen;
  *varT = variableTop;
  *top_ = top;

  return src->current;
}
/*
Processing Description
1 Build a ListList. Each list has a header, which contains two lists (positive
@ -323,14 +571,15 @@ Processing Description
uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *status) {
UColToken *lastToken = NULL;
uint32_t newCharsLen = 0, newExtensionsLen = 0;
uint32_t charsOffset = 0, extensionOffset = 0;
const UChar *parseEnd = NULL;
uint32_t expandNext = 0;
UBool variableTop = FALSE;
UBool top = FALSE;
UColTokListHeader *ListList = NULL;
uint32_t newCharsLen = 0, newExtensionsLen = 0;
uint32_t charsOffset = 0, extensionOffset = 0;
uint32_t newStrength = UCOL_TOK_UNSET;
ucol_tok_initTokenList(src, status);
@ -340,235 +589,16 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
src->image->variableTopValue = 0;
while(src->current < src->end) {
{ /* parsing part */
parseEnd = ucol_tok_parseNextToken(src,
&newStrength,
&charsOffset, &newCharsLen,
&extensionOffset, &newExtensionsLen,
&variableTop, &top,
(UBool)(lastToken == NULL),
status);
UBool inChars = TRUE;
UBool inQuote = FALSE;
UBool wasInQuote = FALSE;
UChar *optionEnd = NULL;
newStrength = UCOL_TOK_UNSET;
newCharsLen = 0; newExtensionsLen = 0;
charsOffset = 0; extensionOffset = 0;
while (src->current < src->end) {
UChar ch = *(src->current);
if (inQuote) {
if (ch == 0x0027/*'\''*/) {
inQuote = FALSE;
} else {
if ((newCharsLen == 0) || inChars) {
if(newCharsLen == 0) {
charsOffset = src->extraCurrent - src->source;
}
newCharsLen++;
} else {
if(newExtensionsLen == 0) {
extensionOffset = src->extraCurrent - src->source;
}
newExtensionsLen++;
}
}
} else {
/* Sets the strength for this entry */
switch (ch) {
case 0x003D/*'='*/ :
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(lastToken == NULL) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_IDENTICAL;
break;
case 0x002C/*','*/:
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(lastToken == NULL) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_TERTIARY;
break;
case 0x003B/*';'*/:
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(lastToken == NULL) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_SECONDARY;
break;
case 0x003C/*'<'*/:
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(lastToken == NULL) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
/* before this, do a scan to verify whether this is */
/* another strength */
if(*(src->current+1) == 0x003C) {
src->current++;
if(*(src->current+1) == 0x003C) {
src->current++; /* three in a row! */
newStrength = UCOL_TERTIARY;
} else { /* two in a row */
newStrength = UCOL_SECONDARY;
}
} else { /* just one */
newStrength = UCOL_PRIMARY;
}
break;
case 0x0026/*'&'*/:
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
break;
case 0x005b/*'['*/:
/* options - read an option, analyze it */
if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
ucol_uprv_tok_readAndSetOption(src->image, src->current, optionEnd, &variableTop, &top, status);
src->current = optionEnd;
if(top == TRUE) {
if(newStrength == UCOL_TOK_RESET) {
src->current++;
goto EndOfLoop;
} else {
*status = U_INVALID_FORMAT_ERROR;
}
}
if(U_FAILURE(*status)) {
return 0;
}
}
break;
/* Ignore the white spaces */
case 0x0009/*'\t'*/:
case 0x000C/*'\f'*/:
case 0x000D/*'\r'*/:
case 0x000A/*'\n'*/:
case 0x0020/*' '*/:
break; /* skip whitespace TODO use Unicode */
case 0x002F/*'/'*/:
/* This entry has an extension. */
inChars = FALSE;
break;
/* found a quote, we're gonna start copying */
case 0x0027/*'\''*/:
inQuote = TRUE;
wasInQuote = TRUE;
if (newCharsLen == 0) {
charsOffset = src->extraCurrent - src->source;
newCharsLen++;
} else if (inChars) { /* we're reading some chars */
charsOffset = src->extraCurrent - src->source;
if(newCharsLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
src->extraCurrent += newCharsLen;
}
newCharsLen++;
} else {
if(newExtensionsLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - newExtensionsLen, newExtensionsLen*sizeof(UChar));
src->extraCurrent += newExtensionsLen;
}
newExtensionsLen++;
}
ch = *(++(src->current)); /*pattern[++index]; */
break;
/* '@' is french only if the strength is not currently set */
/* if it is, it's just a regular character in collation rules */
case 0x0040/*'@'*/:
if (newStrength == UCOL_TOK_UNSET) {
src->image->frenchCollation = UCOL_ON;
break;
}
default:
if (newStrength == UCOL_TOK_UNSET) {
*status = U_INVALID_FORMAT_ERROR;
return 0;
}
if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
*status = U_INVALID_FORMAT_ERROR;
return 0;
}
if (inChars) {
if(newCharsLen == 0) {
charsOffset = src->current - src->source;
}
newCharsLen++;
} else {
if(newExtensionsLen == 0) {
extensionOffset = src->current - src->source;
}
newExtensionsLen++;
}
break;
}
}
if(wasInQuote) {
if(ch != 0x27) {
*src->extraCurrent++ = ch;
}
if(src->extraCurrent == src->extraEnd) {
/* reallocate */
}
}
src->current++;
}
EndOfLoop:
wasInQuote = FALSE;
if (newStrength == UCOL_TOK_UNSET) {
return 0;
}
if (newCharsLen == 0 && top == FALSE) {
*status = U_INVALID_FORMAT_ERROR;
return 0;
}
}
{
if(U_SUCCESS(*status) && parseEnd != NULL) {
UColToken *sourceToken = NULL;
UColToken key;
@ -789,7 +819,9 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
}
/* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
lastToken = sourceToken;
}
} else {
return 0;
}
}
return src->resultLen;

View file

@ -117,6 +117,13 @@ int32_t uhash_hashTokens(const void *k);
UBool uhash_compareTokens(const void *key1, const void *key2);
void ucol_tok_initTokenList(UColTokenParser *src, UErrorCode *status);
uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *status);
/* Scans the next rule token from src->current. On success it returns the   */
/* position after the token and fills the strength, the offset/length of    */
/* the token characters (chOffset/chLen), the offset/length of the          */
/* extension (exOffset/exLen) and the variable-top / top flags. It returns  */
/* NULL when no token is found or when *status is set to an error.          */
/* startOfRules tells the parser that no token has been produced yet.       */
U_CAPI const UChar U_EXPORT2 *ucol_tok_parseNextToken(UColTokenParser *src,
uint32_t *strength,
uint32_t *chOffset, uint32_t *chLen,
uint32_t *exOffset, uint32_t *exLen,
UBool *varT, UBool *top_,
UBool startOfRules,
UErrorCode *status);
#endif

View file

@ -25,6 +25,7 @@
#include "callcoll.h"
#include "unicode/ustring.h"
#include "string.h"
#include "ucol_imp.h"
static UCollator *myCollation;
const static UChar rules[MAX_TOKEN_LEN] =
@ -435,24 +436,55 @@ static void FunkyATest( )
ucol_close(myCollation);
}
UColAttributeValue caseFirst[] = {
UCOL_OFF,
UCOL_LOWER_FIRST,
UCOL_UPPER_FIRST
};
UColAttributeValue alternateHandling[] = {
UCOL_NON_IGNORABLE,
UCOL_NON_IGNORABLE,
UCOL_SHIFTED
};
UColAttributeValue caseLevel[] = {
UCOL_OFF,
UCOL_OFF,
UCOL_ON
};
UColAttributeValue strengths[] = {
UCOL_PRIMARY,
UCOL_PRIMARY,
UCOL_SECONDARY,
UCOL_TERTIARY,
UCOL_QUATERNARY,
UCOL_IDENTICAL
};
/* Printable names for the attribute values exercised by PrintMarkDavis. */
/* The entries point at string literals, hence the const element type.   */
const char * caseFirstC[] = {
    "UCOL_OFF",
    "UCOL_LOWER_FIRST",
    "UCOL_UPPER_FIRST"
};

const char * alternateHandlingC[] = {
    "UCOL_NON_IGNORABLE",
    "UCOL_SHIFTED"
};

const char * caseLevelC[] = {
    "UCOL_OFF",
    "UCOL_ON"
};

const char * strengthsC[] = {
    "UCOL_PRIMARY",
    "UCOL_SECONDARY",
    "UCOL_TERTIARY",
    "UCOL_QUATERNARY",
    "UCOL_IDENTICAL"
};
static void PrintMarkDavis( )
@ -461,8 +493,10 @@ static void PrintMarkDavis( )
UChar m[256];
uint8_t sortkey[256];
UCollator *coll = ucol_open(NULL, &status);
uint32_t i,j,k,l, sortkeysize;
uint32_t h,i,j,k, sortkeysize;
uint32_t sizem = 0;
char buffer[512];
uint32_t len = 512;
u_uastrcpy(m, "Mark Davis");
sizem = u_strlen(m);
@ -475,20 +509,29 @@ static void PrintMarkDavis( )
}
fprintf(stderr, "\n");
for(i = 0; i<sizeof(alternateHandling)/sizeof(alternateHandling[0]); i++) {
ucol_setAttribute(coll, UCOL_ALTERNATE_HANDLING, alternateHandling[i], &status);
for(j = 0; j<sizeof(caseLevel)/sizeof(caseLevel[0]); j++) {
ucol_setAttribute(coll, UCOL_CASE_LEVEL, caseLevel[j], &status);
for(k = 0; k<sizeof(strengths)/sizeof(strengths[0]); k++) {
ucol_setAttribute(coll, UCOL_STRENGTH, strengths[k], &status);
sortkeysize = ucol_getSortKey(coll, m, sizem, sortkey, 256);
fprintf(stderr, "aH: %i, case: %i, st: %i\nSortkey: ", alternateHandling[i], caseLevel[j], strengths[k]);
for(l = 0; l<sortkeysize; l++) {
fprintf(stderr, "%02X", sortkey[l]);
/* Print the sort key of the sample string for every combination of */
/* caseFirst x alternateHandling x caseLevel x strength.            */
for(h = 0; h<sizeof(caseFirst)/sizeof(caseFirst[0]); h++) {
  /* index with h: i belongs to the next loop and has no value yet */
  ucol_setAttribute(coll, UCOL_CASE_FIRST, caseFirst[h], &status);
  fprintf(stderr, "caseFirst: %s\n", caseFirstC[h]);

  for(i = 0; i<sizeof(alternateHandling)/sizeof(alternateHandling[0]); i++) {
    ucol_setAttribute(coll, UCOL_ALTERNATE_HANDLING, alternateHandling[i], &status);
    fprintf(stderr, " AltHandling: %s\n", alternateHandlingC[i]);

    for(j = 0; j<sizeof(caseLevel)/sizeof(caseLevel[0]); j++) {
      ucol_setAttribute(coll, UCOL_CASE_LEVEL, caseLevel[j], &status);
      fprintf(stderr, " caseLevel: %s\n", caseLevelC[j]);

      for(k = 0; k<sizeof(strengths)/sizeof(strengths[0]); k++) {
        ucol_setAttribute(coll, UCOL_STRENGTH, strengths[k], &status);
        sortkeysize = ucol_getSortKey(coll, m, sizem, sortkey, 256);
        fprintf(stderr, " strength: %s\n Sortkey: ", strengthsC[k]);
        fprintf(stderr, "%s\n", ucol_sortKeyToString(coll, sortkey, buffer, &len));
      }
      fprintf(stderr, "\n");
    }
  }
}
}
@ -502,3 +545,477 @@ void addMiscCollTest(TestNode** root)
/*addTest(root, &PrintMarkDavis, "tscoll/cmsccoll/PrintMarkDavis");*/
}
#if 0
/* Ram's rule test */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "unicode\ucol.h"
#include "unicode\ustdio.h"
#include "unicode\ustring.h"
#include "ucol_tok.h"
#define AMP '&'
#define GREAT '<'
#define EQUAL '='
#define COMA ','
#define SEMIC ';'
#define BRACKET '['
#define ACCENT '@'
#define AMP_STR "&"
#define GREAT_STR "<"
#define EQUAL_STR "="
#define COMA_STR ","
#define SEMIC_STR ";"
#define DG_STR "<<"
#define TG_STR "<<<"
static FILE* file;
/* Re-encodes UTF-16 text as UTF-8, one code point at a time.            */
/* Stops when the source is exhausted or the destination index reaches   */
/* destCapacity. Returns the number of bytes written to dest.            */
int32_t transformUTF16ToUTF8(uint8_t *dest, int32_t destCapacity,
                             const UChar *src, int32_t srcLength) {
    int32_t readPos = 0;
    int32_t writePos = 0;
    UChar32 codePoint;

    while(readPos < srcLength && writePos < destCapacity) {
        /* decode the next code point from the UTF-16 input */
        UTF_NEXT_CHAR(src, readPos, srcLength, codePoint);
        /* and append it to the UTF-8 output */
        UTF8_APPEND_CHAR_SAFE(dest, writePos, destCapacity, codePoint);
    }

    return writePos;
}
/* Overwrites the first len UChars of the buffer *src with zeros. */
/* The pointer *src itself is left untouched.                     */
void resetBuf(UChar** src,int len){
    UChar *cursor = *src;
    int remaining;

    for(remaining = len; remaining > 0; remaining--){
        *cursor++ = '\0';
    }
}
/* Returns a pointer to the first '&', '=', ',', ';' or '<' found in the */
/* first srcLen UChars of source, or NULL when there is none.            */
UChar* findDelimiter(UChar* source,int srcLen){
    int pos;

    for(pos = 0; pos < srcLen; pos++){
        UChar c = source[pos];
        if(c == AMP || c == EQUAL || c == COMA || c == SEMIC || c == GREAT){
            return source + pos;
        }
    }
    return NULL;
}
/* Converts the zero-terminated UChar string unichars to US-ASCII in buf, */
/* with unconvertible characters written as Java-style escapes.           */
/*                                                                        */
/* buf  - destination of len bytes; always zero-terminated on return      */
/*        when len > 0 (output is truncated if it does not fit)           */
/* Returns buf. If the converter cannot be opened, buf holds "".          */
char *aescstrdup(const UChar* unichars, char* buf,int len){
    UErrorCode errorCode = U_ZERO_ERROR;
    UConverterFromUCallback previousAction;
    void *previousContext;
    UConverter *conv;
    char *target = buf;
    char *targetLimit;

    if(buf == NULL || len <= 0){
        return buf;
    }
    buf[0] = '\0';

    conv = ucnv_open("US-ASCII",&errorCode);
    if(conv == NULL || U_FAILURE(errorCode)){
        return buf;
    }

    /* keep the last byte free so the terminator never lands past the buffer */
    targetLimit = buf + len - 1;

    ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA, &previousAction, &previousContext, &errorCode);
    ucnv_fromUnicode(conv,&target,targetLimit, &unichars, unichars+u_strlen(unichars),NULL,TRUE,&errorCode);
    *target = '\0';

    /* the converter was opened here, so it is released here */
    ucnv_close(conv);
    return buf;
}
/* Checks that p sorts before q, and that it still does when p is       */
/* prefixed with U+00E0 and q with U+0061. Failures are reported to the */
/* file-level stream `file`.                                            */
/* p and q must be shorter than 255 UChars; this is not checked.        */
void testPrimary(UCollator* col, const UChar* p,const UChar* q){
    UChar source[256] = { '\0'};
    UChar target[256] = { '\0'};
    char utfSource[256] = {'\0'};
    char utfTarget[256] = {'\0'};
    UCollationResult result = ucol_strcoll(col,p,u_strlen(p),q,u_strlen(q));

    if(result!=UCOL_LESS){
        aescstrdup(p,utfSource,256);
        aescstrdup(q,utfTarget,256);
        fprintf(file,"Primary failed source: %s target: %s \n", utfSource,utfTarget);
    }

    /* the primary difference must outweigh the differing prefixes */
    source[0] = 0x00E0;
    u_strcat(source,p);
    target[0] = 0x0061;
    u_strcat(target,q);
    result = ucol_strcoll(col,source,u_strlen(source),target,u_strlen(target));

    if(result!=UCOL_LESS){
        aescstrdup(source,utfSource,256);
        aescstrdup(target,utfTarget,256);
        fprintf(file,"Primary swamps 2nd failed source: %s target: %s \n", utfSource,utfTarget);
    }
}
/* Checks a secondary relation p < q three ways:                         */
/*  1. p sorts before q;                                                 */
/*  2. still so with p prefixed by U+0041 and q by U+0061;               */
/*  3. p followed by 'b' sorts after q followed by 'a'.                  */
/* Failures are reported to the file-level stream `file`.                */
/* p and q must be shorter than 255 UChars; this is not checked.         */
void testSecondary(UCollator* col, const UChar* p,const UChar* q){
    UChar source[256] = { '\0'};
    UChar target[256] = { '\0'};
    UChar temp[2] = {'\0'};
    char utfSource[256] = {'\0'};
    char utfTarget[256] = {'\0'};
    UCollationResult result= ucol_strcoll(col,p,u_strlen(p),q,u_strlen(q));

    if(result!=UCOL_LESS){
        aescstrdup(p,utfSource,256);
        aescstrdup(q,utfTarget,256);
        fprintf(file,"secondary failed source: %s target: %s \n", utfSource,utfTarget);
    }

    source[0] = 0x0041;
    u_strcat(source,p);
    target[0]= 0x0061;
    u_strcat(target,q);
    result = ucol_strcoll(col,source,u_strlen(source),target,u_strlen(target));

    if(result!=UCOL_LESS){
        aescstrdup(source,utfSource,256);
        aescstrdup(target,utfTarget,256);
        fprintf(file,"secondary swamps 3rd failed source: %s target: %s \n",utfSource,utfTarget);
    }

    /* Suffixes are built as real UChar strings; a narrow "b" literal */
    /* cast to UChar* is not a terminated UChar string.               */
    source[0] = '\0';
    u_strcat(source,p);
    temp[0] = 0x0062; /* 'b' */
    u_strcat(source,temp);
    target[0] = '\0';
    u_strcat(target,q);
    temp[0] = 0x0061; /* 'a' */
    u_strcat(target,temp);
    result = ucol_strcoll(col,source,u_strlen(source),target,u_strlen(target));

    if(result!=UCOL_GREATER){
        aescstrdup(source,utfSource,256);
        aescstrdup(target,utfTarget,256);
        fprintf(file,"secondary is swamped by 1 failed source: %s target: %s \n",utfSource,utfTarget);
    }
}
/* Checks a tertiary relation p < q three ways:                          */
/*  1. p sorts before q;                                                 */
/*  2. still so with p prefixed by U+0020 and q by U+002D;               */
/*  3. p followed by U+00E0 sorts after q followed by 'a'.               */
/* Failures are reported to the file-level stream `file`.                */
/* p and q must be shorter than 255 UChars; this is not checked.         */
void testTertiary(UCollator* col, const UChar* p,const UChar* q){
    UChar source[256] = { '\0'};
    UChar target[256] = { '\0'};
    UChar temp[2] = {'\0'};
    char utfSource[256] = {'\0'};
    char utfTarget[256] = {'\0'};
    UCollationResult result= ucol_strcoll(col,p,u_strlen(p),q,u_strlen(q));

    if(result!=UCOL_LESS){
        aescstrdup(p,utfSource,256);
        aescstrdup(q,utfTarget,256);
        fprintf(file,"Tertiary failed source: %s target: %s \n",utfSource,utfTarget);
    }

    source[0] = 0x0020;
    u_strcat(source,p);
    target[0]= 0x002D;
    u_strcat(target,q);
    result = ucol_strcoll(col,source,u_strlen(source),target,u_strlen(target));

    if(result!=UCOL_LESS){
        aescstrdup(source,utfSource,256);
        aescstrdup(target,utfTarget,256);
        fprintf(file,"Tertiary swamps 4th failed source: %s target: %s \n", utfSource,utfTarget);
    }

    source[0] = '\0';
    u_strcat(source,p);
    temp[0] = 0x00E0;
    u_strcat(source,temp);
    target[0] = '\0';
    u_strcat(target,q);
    /* append 'a' as a real UChar string; a narrow "a" literal cast to */
    /* UChar* is not a terminated UChar string                         */
    temp[0] = 0x0061;
    u_strcat(target,temp);
    result = ucol_strcoll(col,source,u_strlen(source),target,u_strlen(target));

    if(result!=UCOL_GREATER){
        aescstrdup(source,utfSource,256);
        aescstrdup(target,utfTarget,256);
        fprintf(file,"Tertiary is swamped by 3rd failed source: %s target: %s \n",utfSource,utfTarget);
    }
}
/* Checks that p and q compare as equal; a mismatch is reported to the */
/* file-level stream `file`.                                           */
void testEquality(UCollator* col, const UChar* p,const UChar* q){
    char utfSource[256] = {'\0'};
    char utfTarget[256] = {'\0'};
    UCollationResult result = ucol_strcoll(col,p,u_strlen(p),q,u_strlen(q));

    if(result!=UCOL_EQUAL){
        aescstrdup(p,utfSource,256);
        aescstrdup(q,utfTarget,256);
        /* name the check that actually failed (was "Primary failed") */
        fprintf(file,"Equality failed source: %s target: %s \n", utfSource,utfTarget);
    }
}
/* Dispatches to the check that matches the relation between p and q:  */
/* strength 0 = equality, 1 = primary, 2 = secondary, 3 = tertiary.    */
/* Any other strength is ignored. delimiter is accepted but not used.  */
void testCollator(UCollator* col, const UChar* p,const UChar* q, UChar* delimiter,int strength){
    (void)delimiter;

    switch(strength){
    case 0:
        testEquality(col,p,q);
        break;
    case 1:
        testPrimary(col,p,q);
        break;
    case 2:
        testSecondary(col,p,q);
        break;
    case 3:
        testTertiary(col,p,q);
        break;
    default:
        break;
    }
}
/*ar bg ca cs da el en_BE en_US_POSIX es et fi fr hi hr hu is iw ja ko lt lv mk mt nb nn nn_NO pl ro ru sh sk sl sq sr sv th tr uk vi zh zh_TW*/
/* Scans at most srcLen UChars from *source for a rule delimiter.            */
/*                                                                           */
/* On a hit, *strength is set (0 '=', 1 '&' or '<', 2 ';' or '<<',           */
/* 3 ',' or '<<<') and *delimiter is pointed at the matching narrow string   */
/* constant (cast to UChar*; compare it as a char string).                   */
/*                                                                           */
/* Returns NULL and advances *source past the delimiter when the delimiter   */
/* is the first thing scanned; otherwise returns a pointer to the delimiter. */
/* Returns NULL when nothing is found.                                       */
/*                                                                           */
/* NOTE(review): *(local-1) is read to detect a preceding quote, so the      */
/* caller must not pass the very first element of its buffer - confirm.      */
UChar* consumeDelimiter(UChar** source, int srcLen,int* strength, UChar** delimiter){
    UChar* local = *source;
    UBool foundDelimiter = FALSE;
    int i=0;

    while(i<srcLen){
        switch(*local){
        case AMP:
            *strength=1;
            *delimiter = (UChar*)AMP_STR ;
            if(*(local+1) == BRACKET ||*(local+2) == BRACKET ){
                /* "&[" / "& [": let the BRACKET case deal with the option */
                local++;
                i++;
                continue;
            }
            if(*(local-1)!= 0x0027)
                foundDelimiter = TRUE;
            break;
        case BRACKET:
            {
                if(*(local-1)!= 0x0027){
                    UChar* limit;
                    limit = findDelimiter(local,srcLen-i);
                    if(limit == NULL){
                        /* no delimiter after the option: nothing to report */
                        return NULL;
                    }
                    i += (int)(limit-local);
                    *source=local=limit;
                    continue;
                }
            }
            break;
        case EQUAL :
            *strength=0;
            if(*(local-1)!= 0x0027){
                *delimiter = (UChar*)EQUAL_STR;
                foundDelimiter = TRUE;
            }
            break;
        case COMA :
            *strength = 3;
            *delimiter =(UChar*)COMA_STR ;
            foundDelimiter = TRUE;
            break;
        case SEMIC :
            *delimiter = (UChar*)SEMIC_STR;
            *strength = 2;
            foundDelimiter = TRUE;
            break;
        case GREAT :
            if(*(local+1)== GREAT){
                local++;
                i++;
                /* local is now on the second '<'; a third one follows at +1 */
                if(*(local+1)==GREAT){
                    *delimiter = (UChar*)TG_STR;
                    *strength = 3;
                    local++;
                    i++;
                }
                else{
                    *delimiter = (UChar*)DG_STR;
                    *strength = 2;
                }
            }
            else{
                *delimiter = (UChar*)GREAT_STR ;
                *strength =1;
            }
            if(*(local-1)!= 0x0027)
                foundDelimiter =TRUE;
            break;
        default:
            break;
        }
        if(foundDelimiter){
            if(local ==*source){
                *source = ++local;
                return NULL;
            }
            else{
                return local;
            }
        }
        local++;
        i++;
    }
    return NULL;
}
/* Copies the first n UChars of src to dst, dropping spaces (U+0020),   */
/* NULs and apostrophes (U+0027). Returns dst.                          */
/* dst is NOT terminated here; the callers visible in this file pass    */
/* zero-filled buffers.                                                 */
UChar* istrncpy(UChar* dst,const UChar* src,int32_t n){
    UChar *anchor = dst; /* save a pointer to start of dst */

    for(; n > 0; n--, src++){
        if(*src!=0x0020 && *src!=0 && *src!=0x0027){
            *dst++ = *src;
        }
    }
    return anchor;
}
/* Walks the rule string, splitting it at delimiters, and runs            */
/* testCollator on each pair of neighbouring tokens. Failures are written */
/* to "<loc>TestCases.txt" through the file-level stream `file`.          */
/*                                                                        */
/* col    - collator under test                                           */
/* loc    - locale name used as the file-name prefix; may be NULL         */
/* rules  - rule text, length UChars long                                 */
/*                                                                        */
/* Tokens longer than 19 UChars are truncated to fit the local buffers.   */
/* At most 300 iterations are performed.                                  */
void parseAndPrintRules(UCollator* col,const char* loc, const UChar* rules, int length){
    static const char suffix[] = "TestCases.txt";
    UChar *local = (UChar*)rules;
    UChar current[20]={'\0'};
    UChar previous[20]= {'\0'};
    UChar *first =current, *second = previous;
    UChar* delimiter = (UChar*)" ";
    int i = 0;
    int strength = -1; /* no delimiter seen yet; testCollator ignores it */
    char fileName[256] = {'\0'};
    UBool gotBoth = FALSE;

    if(loc){
        /* sizeof(suffix) includes the terminator */
        if(strlen(loc) + sizeof(suffix) > sizeof(fileName)){
            return;
        }
        strcpy(fileName,loc);
    }
    strcat(fileName,suffix);
    file = fopen(fileName,"wb");

    if(file){
        while((local-rules < length) && i<300){
            UChar* limit =consumeDelimiter(&local,length-i,&strength,&delimiter);
            if(limit==NULL ){
                /* delimiter always points at a narrow string constant */
                /* (see consumeDelimiter), so compare it as one        */
                if(strcmp((const char*)delimiter, AMP_STR)==0){
                    resetBuf(&first,20);
                }
                limit =findDelimiter(local,length-(local-rules));
                if(limit==NULL){
                    limit= (UChar*)rules+length;
                }
            }
            if(limit){
                /* leave room for the terminator in the 20-element buffers */
                int span = (int)(limit-local);
                if(span > 19){
                    span = 19;
                }
                if(*first=='\0'){
                    istrncpy(first,local,span);
                    local=limit;
                }
                else{
                    if((local-rules) < length){
                        istrncpy(second,local,span);
                    }
                    local=limit;
                    gotBoth=TRUE;
                }
            }
            if(gotBoth){
                testCollator(col,first,second,delimiter,strength);
                /* the second token becomes the first of the next pair */
                resetBuf(&first,20);
                u_strcpy(first,second);
                resetBuf(&second,20);
                gotBoth=FALSE;
            }
            i++;
        }
        /* the stream was opened here, so it is closed here */
        fclose(file);
        file = NULL;
    }
}
/* Variant of parseAndPrintRules: opens "<loc>TestCases.txt" and feeds   */
/* pairs of neighbouring rule tokens to testCollator.                    */
/* NOTE(review): this body looks unfinished. No declaration of `limit`   */
/* is visible, nothing advances `local`, and the closing braces          */
/* outnumber the opening ones by one - presumably a loop header and the  */
/* token-scanning call were dropped. Confirm the intended logic before   */
/* enabling this code.                                                   */
void parseAndPrintRules2(UCollator* col,const char* loc, const UChar* rules, int length){
UChar *local = (UChar*)rules;
UChar current[20]={'\0'};
UChar previous[20]= {'\0'};
UChar *first =current, *second = previous;
UChar* delimiter = (UChar*)" ";
int i = 0, strength;
/* NOTE(review): 20 bytes must hold loc plus "TestCases.txt"; no length check */
char fileName[20] = {'\0'};
UBool gotBoth = FALSE;
if(loc){
strcpy(fileName,loc);
}
strcat(fileName,"TestCases.txt");
file = fopen(fileName,"wb");
if(file){
/* `limit` is used below without a visible declaration or assignment */
if(limit){
if(*first=='\0'){
istrncpy(first,local,(int)(limit-local));
local=limit;
}
else{
if((local-rules) < length){
istrncpy(second,local,(int)(limit-local));
}
local=limit;
gotBoth=TRUE;
}
}
if(gotBoth){
unsigned char tempFirst[20] = {'\0'};
unsigned char tempSecond[20] = {'\0'};
aescstrdup(first,tempFirst,20);
aescstrdup(second,tempSecond,20);
//fprintf(file,"first:%s second: %s delimiter: %s strength:%i \n ",tempFirst,tempSecond,delimiter,strength);
testCollator(col,first,second,delimiter,strength);
//fprintf(file,"first:%s second: %s delimiter: %s strength:%i \n ",tempFirst,tempSecond,delimiter,strength);
/* the second token becomes the first token of the next pair */
resetBuf(&first,20);
u_strcpy(first,second);
resetBuf(&second,20);
gotBoth=FALSE;
}
i++;
}
}
}
/* Opens the collator for locale loc, fetches its rules (only when loc */
/* is non-NULL), switches the collator to quaternary strength and runs */
/* parseAndPrintRules2 over the rules. The collator is closed before   */
/* returning.                                                          */
void processRules(const char* loc){
    UErrorCode status = U_ZERO_ERROR;
    UCollator* col = ucol_open(loc,&status);
    int length=0;
    const UChar* rules = NULL;

    if(col == NULL || U_FAILURE(status)){
        fprintf(stderr, "could not open collator for %s\n", loc ? loc : "(default)");
        return;
    }

    if(loc){
        rules = ucol_getRules(col,&length);
    }
    ucol_setAttribute(col,UCOL_STRENGTH,UCOL_QUATERNARY,&status);
    parseAndPrintRules2(col,loc,rules,length);

    ucol_close(col);
}
/* Entry point of the stand-alone rule tester: every command-line */
/* argument is handed to processRules in order. Exits with status */
/* 1 after printing a usage line when no argument is given.       */
extern int
main(int argc, const char *argv[]) {
    int argIndex;

    if(argc<2) {
        fprintf(stderr,
                "usage: %s { rpmap/rxmap-filename }+\n",
                argv[0]);
        exit(1);
    }

    for(argIndex = 1; argIndex < argc; argIndex++) {
        processRules(argv[argIndex]);
    }

    return 0;
}
#endif