mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-21 04:29:31 +00:00
ICU-96 ground for parsing options + some tests
X-SVN-Rev: 3906
This commit is contained in:
parent
9d65e715fd
commit
2b117c5c1a
3 changed files with 133 additions and 20 deletions
|
@ -792,6 +792,15 @@ U_CFUNC void ucol_initBuffers(UColTokListHeader *lh, UHashtable *tailored, UErro
|
|||
}
|
||||
}
|
||||
|
||||
U_CFUNC ucol_getFirstCE(UCollator *coll, UChar u, UErrorCode *status) {
|
||||
collIterate colIt;
|
||||
uint32_t order;
|
||||
init_collIterate(&u, 1, &colIt, FALSE);
|
||||
order = ucol_getNextCE(coll, &colIt, status);
|
||||
/*UCOL_GETNEXTCE(order, coll, colIt, status);*/
|
||||
return order;
|
||||
}
|
||||
|
||||
U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UHashtable *tailored, UErrorCode *status) {
|
||||
UCAElements el;
|
||||
UColToken *tok = lh->first[UCOL_TOK_POLARITY_POSITIVE];
|
||||
|
@ -862,6 +871,21 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
|
|||
el.isThai = FALSE;
|
||||
}
|
||||
|
||||
/* we also need a case bit here, and we'll fish it out from the UCA for the first codepoint */
|
||||
uint32_t caseCE = ucol_getFirstCE(UCA, el.cPoints[0], status);
|
||||
if((caseCE & 0x40) != 0) {
|
||||
el.caseBit = TRUE;
|
||||
/* el.CEs[0] |= 0x40;*/
|
||||
for(i = 0; i<el.noOfCEs; i++) {
|
||||
el.CEs[i] |= 0x40;
|
||||
}
|
||||
} else {
|
||||
el.caseBit = FALSE;
|
||||
/* el.CEs[0] &= 0xFFFFFFBF;*/
|
||||
for(i = 0; i<el.noOfCEs; i++) {
|
||||
el.CEs[i] &= 0xFFFFFFBF;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* and then, add it */
|
||||
|
|
|
@ -102,6 +102,58 @@ void ucol_tok_initTokenList(UColTokenParser *src, UErrorCode *status) {
|
|||
uhash_setValueDeleter(uchars2tokens, deleteToken);
|
||||
}
|
||||
|
||||
/*
|
||||
*   U_STRING_DECL(ustringVar1, "Quick-Fox 2", 11);
|
||||
*   U_STRING_DECL(ustringVar2, "jumps 5%", 8);
|
||||
*   static UBool didInit=FALSE;
|
||||
*  
|
||||
*   int32_t function() {
|
||||
*   if(!didInit) {
|
||||
*   U_STRING_INIT(ustringVar1, "Quick-Fox 2", 11);
|
||||
*   U_STRING_INIT(ustringVar2, "jumps 5%", 8);
|
||||
*   didInit=TRUE;
|
||||
*   }
|
||||
*   return u_strcmp(ustringVar1, ustringVar2);
|
||||
*   }
|
||||
*/
|
||||
#define UTOK_MAX_OPTION_LEN 20
|
||||
|
||||
static didInit = FALSE;
|
||||
|
||||
U_STRING_DECL(option_01, "rearrange", 9); U_STRING_DECL(option_02, "alternate", 9);
|
||||
U_STRING_DECL(option_03, "backwards", 9); U_STRING_DECL(option_04, "variable top", 12);
|
||||
U_STRING_DECL(option_05, "top", 3); U_STRING_DECL(option_06, "normalization", 13);
|
||||
U_STRING_DECL(option_07, "caseLevel", 9); U_STRING_DECL(option_08, "caseFirst", 9);
|
||||
U_STRING_DECL(option_09, "scriptOrder", 11); U_STRING_DECL(option_10, "charsetname", 11);
|
||||
U_STRING_DECL(option_11, "charset", 7); U_STRING_DECL(option_12, "undefined", 9);
|
||||
|
||||
const UChar *options[] = {
|
||||
option_01,
|
||||
option_02,
|
||||
option_03,
|
||||
option_04,
|
||||
option_05,
|
||||
option_06,
|
||||
option_07,
|
||||
option_08,
|
||||
option_09,
|
||||
option_10,
|
||||
option_11,
|
||||
option_12
|
||||
};
|
||||
|
||||
|
||||
/**
 * Groundwork for reading a rule option and applying it.
 *
 * At present this only performs the one-time initialization of the
 * option name strings; none of the arguments are examined and FALSE is
 * always returned.
 *
 * @param image       not used yet
 * @param start       not used yet
 * @param end         not used yet
 * @param variableTop not used yet
 * @param status      not used yet
 * @return always FALSE
 */
UBool ucol_uprv_tok_readAndSetOption(UCATableHeader *image, const UChar* start, const UChar *end, UBool *variableTop, UErrorCode *status) {
  if(!didInit) {
    U_STRING_INIT(option_01, "rearrange", 9);
    U_STRING_INIT(option_02, "alternate", 9);
    U_STRING_INIT(option_03, "backwards", 9);
    U_STRING_INIT(option_04, "variable top", 12);
    U_STRING_INIT(option_05, "top", 3);
    U_STRING_INIT(option_06, "normalization", 13);
    U_STRING_INIT(option_07, "caseLevel", 9);
    U_STRING_INIT(option_08, "caseFirst", 9);
    U_STRING_INIT(option_09, "scriptOrder", 11);
    U_STRING_INIT(option_10, "charsetname", 11);
    U_STRING_INIT(option_11, "charset", 7);
    U_STRING_INIT(option_12, "undefined", 9);
    /* remember that the strings are ready so later calls skip the block above */
    didInit = TRUE;
  }
  return FALSE;
}
|
||||
|
||||
#define UCOL_TOK_UNSET 0xFFFFFFFF
|
||||
#define UCOL_TOK_RESET 0xDEADBEEF
|
||||
|
@ -120,7 +172,7 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
|||
uint32_t newCharsLen = 0, newExtensionsLen = 0;
|
||||
uint32_t charsOffset = 0, extensionOffset = 0;
|
||||
uint32_t expandNext = 0;
|
||||
UBool caseBit = FALSE;
|
||||
UBool variableTop = FALSE;
|
||||
|
||||
UColTokListHeader *ListList = NULL;
|
||||
|
||||
|
@ -130,11 +182,14 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
|||
|
||||
ListList = src->lh;
|
||||
|
||||
src->image->variableTopValue = 0;
|
||||
|
||||
while(src->current < src->end) {
|
||||
{ /* parsing part */
|
||||
|
||||
UBool inChars = TRUE;
|
||||
UBool inQuote = FALSE;
|
||||
UChar *optionEnd = NULL;
|
||||
|
||||
newStrength = UCOL_TOK_UNSET;
|
||||
newCharsLen = 0; newExtensionsLen = 0;
|
||||
|
@ -152,9 +207,6 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
|||
charsOffset = src->current - src->source;
|
||||
}
|
||||
newCharsLen++;
|
||||
if(u_tolower(ch)!=ch) {
|
||||
caseBit = TRUE;
|
||||
}
|
||||
} else {
|
||||
if(newExtensionsLen == 0) {
|
||||
extensionOffset = src->current - src->source;
|
||||
|
@ -194,7 +246,19 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
|||
goto EndOfLoop;
|
||||
}
|
||||
|
||||
newStrength = UCOL_PRIMARY;
|
||||
/* before this, do a scan to verify whether this is */
|
||||
/* another strength */
|
||||
if(*(src->current+1) == 0x003C) {
|
||||
src->current++;
|
||||
if(*(src->current+1) == 0x003C) {
|
||||
src->current++; /* three in a row! */
|
||||
newStrength = UCOL_TERTIARY;
|
||||
} else { /* two in a row */
|
||||
newStrength = UCOL_SECONDARY;
|
||||
}
|
||||
} else { /* just one */
|
||||
newStrength = UCOL_PRIMARY;
|
||||
}
|
||||
break;
|
||||
|
||||
case 0x0026/*'&'*/:
|
||||
|
@ -205,6 +269,14 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
|||
newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
|
||||
break;
|
||||
|
||||
case 0x005b/*'['*/:
|
||||
/* options - read an option, analyze it */
|
||||
if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
|
||||
ucol_uprv_tok_readAndSetOption(src->image, src->current, optionEnd, &variableTop, status);
|
||||
src->current = optionEnd+1;
|
||||
}
|
||||
break;
|
||||
|
||||
/* Ignore the white spaces */
|
||||
case 0x0009/*'\t'*/:
|
||||
case 0x000C/*'\f'*/:
|
||||
|
@ -225,17 +297,11 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
|||
if (newCharsLen == 0) {
|
||||
charsOffset = src->current - src->source;
|
||||
newCharsLen++;
|
||||
if(u_tolower(ch)!=ch) {
|
||||
caseBit = TRUE;
|
||||
}
|
||||
} else if (inChars) {
|
||||
if(newCharsLen == 0) {
|
||||
charsOffset = src->current - src->source;
|
||||
}
|
||||
newCharsLen++;
|
||||
if(u_tolower(ch)!=ch) {
|
||||
caseBit = TRUE;
|
||||
}
|
||||
} else {
|
||||
newExtensionsLen++;
|
||||
}
|
||||
|
@ -268,17 +334,11 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
|||
charsOffset = src->current - src->source;
|
||||
}
|
||||
newCharsLen++;
|
||||
if(u_tolower(ch)!=ch) {
|
||||
caseBit = TRUE;
|
||||
}
|
||||
} else {
|
||||
if(newExtensionsLen == 0) {
|
||||
extensionOffset = src->current - src->source;
|
||||
}
|
||||
newExtensionsLen++;
|
||||
if(u_tolower(ch)!=ch) {
|
||||
caseBit = TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
|
@ -303,6 +363,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
|||
UColToken *sourceToken = NULL;
|
||||
UColToken key;
|
||||
|
||||
/* if we had a variable top, we're gonna put it in */
|
||||
if(variableTop == TRUE && src->image->variableTopValue == 0) {
|
||||
variableTop = FALSE;
|
||||
src->image->variableTopValue = *(src->source + charsOffset);
|
||||
}
|
||||
|
||||
key.source = newCharsLen << 24 | charsOffset;
|
||||
key.expansion = newExtensionsLen << 24 | extensionOffset;
|
||||
|
||||
|
@ -320,7 +386,6 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
|||
sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
|
||||
sourceToken->source = newCharsLen << 24 | charsOffset;
|
||||
sourceToken->expansion = newExtensionsLen << 24 | extensionOffset;
|
||||
sourceToken->caseBit = caseBit;
|
||||
|
||||
sourceToken->debugSource = *(src->source + charsOffset);
|
||||
if(newExtensionsLen > 0) {
|
||||
|
@ -523,4 +588,4 @@ void ucol_tok_closeTokenList(UColTokenParser *src) {
|
|||
uhash_close(uchars2tokens);
|
||||
uprv_free(src->lh);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -26,6 +26,10 @@
|
|||
#include "string.h"
|
||||
|
||||
static UCollator *myCollation;
|
||||
const static UChar rules[MAX_TOKEN_LEN] =
|
||||
/*" & 0 < 1,\u2460<a,A"*/
|
||||
{ 0x0026, 0x0030, 0x003C, 0x0031, 0x002C, 0x2460, 0x003C, 0x0061, 0x002C, 0x0041, 0x0000 };
|
||||
|
||||
const static UChar testCase[][MAX_TOKEN_LEN] =
|
||||
{
|
||||
/*0*/ {0x0031 /*'1'*/, 0x0061/*'a'*/, 0x0000},
|
||||
|
@ -74,7 +78,27 @@ static void TestCase( )
|
|||
}
|
||||
}
|
||||
}
|
||||
ucol_close(myCollation);
|
||||
ucol_close(myCollation);
|
||||
|
||||
myCollation = ucol_openRules(rules, u_strlen(rules), UNORM_NONE, UCOL_TERTIARY, &status);
|
||||
if(U_FAILURE(status)){
|
||||
log_err("ERROR: in creation of rule based collator: %s\n", myErrorName(status));
|
||||
return;
|
||||
}
|
||||
log_verbose("Testing different case settings with custom rules\n");
|
||||
ucol_setStrength(myCollation, UCOL_TERTIARY);
|
||||
|
||||
for(k = 0; k<4; k++) {
|
||||
ucol_setAttribute(myCollation, UCOL_CASE_FIRST, caseTestAttributes[k][0], &status);
|
||||
ucol_setAttribute(myCollation, UCOL_CASE_LEVEL, caseTestAttributes[k][1], &status);
|
||||
for (i = 0; i < 3 ; i++) {
|
||||
for(j = i+1; j<4; j++) {
|
||||
doTest(myCollation, testCase[i], testCase[j], caseTestResults[k][3*i+j-1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
ucol_close(myCollation);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue