diff --git a/icu4c/source/common/ubidi.c b/icu4c/source/common/ubidi.c index 077d317957b..8716b9aadb0 100644 --- a/icu4c/source/common/ubidi.c +++ b/icu4c/source/common/ubidi.c @@ -60,7 +60,9 @@ * do not matter. * * Note that this implementation never modifies the dirProps - * after the initial setup. + * after the initial setup, except for FSI which is changed to either + * LRI or RLI in getDirProps(), and paired brackets which may be changed + * to L or R according to N0. * * * In this implementation, the resolution of weak types (Wn), @@ -240,12 +242,18 @@ ubidi_close(UBiDi *pBiDi) { if(pBiDi->levelsMemory!=NULL) { uprv_free(pBiDi->levelsMemory); } - if(pBiDi->runsMemory!=NULL) { - uprv_free(pBiDi->runsMemory); + if(pBiDi->openingsMemory!=NULL) { + uprv_free(pBiDi->openingsMemory); } if(pBiDi->parasMemory!=NULL) { uprv_free(pBiDi->parasMemory); } + if(pBiDi->runsMemory!=NULL) { + uprv_free(pBiDi->runsMemory); + } + if(pBiDi->isolatesMemory!=NULL) { + uprv_free(pBiDi->isolatesMemory); + } if(pBiDi->insertPoints.points!=NULL) { uprv_free(pBiDi->insertPoints.points); } @@ -356,9 +364,13 @@ int32_t length){ /* perform (P2)..(P3) ------------------------------------------------------- */ +/** + * Returns the directionality of the first strong character + * after the last B in prologue, if any. + * Requires prologue!=null. + */ static DirProp firstL_R_AL(UBiDi *pBiDi) { - /* return first strong char after the last B in prologue if any */ const UChar *text=pBiDi->prologue; int32_t length=pBiDi->proLength; int32_t i; @@ -382,22 +394,43 @@ firstL_R_AL(UBiDi *pBiDi) { } /* - * Get the directional properties for the text, - * calculate the flags bit-set, and - * determine the paragraph level if necessary. 
+ * Check that there are enough entries in the array pointed to by pBiDi->paras */ -static void +static UBool +checkParaCount(UBiDi *pBiDi) { + int32_t count=pBiDi->paraCount; + if(pBiDi->paras==pBiDi->simpleParas) { + if(count<=SIMPLE_PARAS_SIZE) + return TRUE; + if(!getInitialParasMemory(pBiDi, SIMPLE_PARAS_SIZE * 2)) + return FALSE; + pBiDi->paras=pBiDi->parasMemory; + uprv_memcpy(pBiDi->parasMemory, pBiDi->simpleParas, SIMPLE_PARAS_SIZE * sizeof(Para)); + return TRUE; + } + if(!getInitialParasMemory(pBiDi, count * 2)) + return FALSE; + pBiDi->paras=pBiDi->parasMemory; + return TRUE; +} + +/* + * Get the directional properties for the text, calculate the flags bit-set, and + * determine the paragraph level if necessary (in pBiDi->paras[i].level). + * FSI initiators are also resolved and their dirProp replaced with LRI or RLI. + */ +static UBool getDirProps(UBiDi *pBiDi) { const UChar *text=pBiDi->text; DirProp *dirProps=pBiDi->dirPropsMemory; /* pBiDi->dirProps is const */ - int32_t i=0, i1, length=pBiDi->originalLength; + int32_t i=0, originalLength=pBiDi->originalLength; Flags flags=0; /* collect all directionalities in the text */ UChar32 uchar; - DirProp dirProp=0, paraDirDefault=0;/* initialize to avoid compiler warnings */ + DirProp dirProp=0, defaultParaLevel=0; /* initialize to avoid compiler warnings */ UBool isDefaultLevel=IS_DEFAULT_LEVEL(pBiDi->paraLevel); /* for inverse BiDi, the default para level is set to RTL if there is a - strong R or AL character at either end of the text */ + strong R or AL character at either end of the text */ UBool isDefaultLevelInverse=isDefaultLevel && (UBool) (pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_LIKE_DIRECT || pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL); @@ -407,138 +440,465 @@ getDirProps(UBiDi *pBiDi) { UBIDI_OPTION_REMOVE_CONTROLS); typedef enum { - NOT_CONTEXTUAL, /* 0: not contextual paraLevel */ - LOOKING_FOR_STRONG, /* 1: looking for first strong char */ - FOUND_STRONG_CHAR /* 2: 
found first strong char */ + NOT_SEEKING_STRONG, /* 0: not contextual paraLevel, not after FSI */ + SEEKING_STRONG_FOR_PARA, /* 1: looking for first strong char in para */ + SEEKING_STRONG_FOR_FSI, /* 2: looking for first strong after FSI */ + LOOKING_FOR_PDI /* 3: found strong after FSI, looking for PDI */ } State; State state; - int32_t paraStart=0; /* index of first char in paragraph */ - DirProp paraDir; /* == CONTEXT_RTL within paragraphs - starting with strong R char */ - DirProp lastStrongDir=0; /* for default level & inverse BiDi */ - int32_t lastStrongLTR=0; /* for STREAMING option */ + DirProp lastStrong=ON; /* for default level & inverse BiDi */ + /* The following stacks are used to manage isolate sequences. Those + sequences may be nested, but obviously never more deeply than the + maximum explicit embedding level. + lastStack is the index of the last used entry in the stack. A value of -1 + means that there is no open isolate sequence. + lastStack is reset to -1 on paragraph boundaries. */ + /* The following stack contains the position of the initiator of + each open isolate sequence */ + int32_t isolateStartStack[UBIDI_MAX_EXPLICIT_LEVEL+1]; + /* The following stack contains the last known state before + encountering the initiator of an isolate sequence */ + int8_t previousStateStack[UBIDI_MAX_EXPLICIT_LEVEL+1]; + int32_t stackLast=-1; - if(pBiDi->reorderingOptions & UBIDI_OPTION_STREAMING) { + if(pBiDi->reorderingOptions & UBIDI_OPTION_STREAMING) pBiDi->length=0; - lastStrongLTR=0; - } + defaultParaLevel=pBiDi->paraLevel&1; if(isDefaultLevel) { - DirProp lastStrong; - paraDirDefault=pBiDi->paraLevel&1 ? CONTEXT_RTL : 0; - if(pBiDi->proLength>0 && - (lastStrong=firstL_R_AL(pBiDi))!=ON) { - paraDir=(lastStrong==L) ? 
0 : CONTEXT_RTL; - state=FOUND_STRONG_CHAR; + pBiDi->paras[0].level=defaultParaLevel; + lastStrong=defaultParaLevel; + if(pBiDi->proLength>0 && /* there is a prologue */ + (dirProp=firstL_R_AL(pBiDi))!=ON) { /* with a strong character */ + if(dirProp==L) + pBiDi->paras[0].level=0; /* set the default para level */ + else + pBiDi->paras[0].level=1; /* set the default para level */ + state=NOT_SEEKING_STRONG; } else { - paraDir=paraDirDefault; - state=LOOKING_FOR_STRONG; + state=SEEKING_STRONG_FOR_PARA; } - lastStrongDir=paraDir; } else { - state=NOT_CONTEXTUAL; - paraDir=0; + pBiDi->paras[0].level=pBiDi->paraLevel; + state=NOT_SEEKING_STRONG; } /* count paragraphs and determine the paragraph level (P2..P3) */ /* * see comment in ubidi.h: - * the DEFAULT_XXX values are designed so that + * the UBIDI_DEFAULT_XXX values are designed so that * their bit 0 alone yields the intended default */ - for( /* i=0 above */ ; i0xffff) { /* set the lead surrogate's property to BN */ flags|=DIRPROP_FLAG(BN); - dirProps[i-2]=(DirProp)(BN|paraDir); + dirProps[i-2]=BN; } - if(state==LOOKING_FOR_STRONG) { - if(dirProp==L) { - state=FOUND_STRONG_CHAR; - if(paraDir) { - paraDir=0; - for(i1=paraStart; i1reorderingOptions & UBIDI_OPTION_STREAMING) { - pBiDi->length=i; /* i is index to next character */ - } - if(isDefaultLevelInverse && (lastStrongDir==CONTEXT_RTL) &&(paraDir!=lastStrongDir)) { - for( ; paraStartparaCount++; - } - if(isDefaultLevel) { - state=LOOKING_FOR_STRONG; - paraStart=i; /* i is index to next character */ - paraDir=paraDirDefault; - lastStrongDir=paraDirDefault; - } - } - } - if(removeBiDiControls && IS_BIDI_CONTROL_CHAR(uchar)) { + if(removeBiDiControls && IS_BIDI_CONTROL_CHAR(uchar)) controlCount++; + if(dirProp==L) { + if(state==SEEKING_STRONG_FOR_PARA) { + pBiDi->paras[pBiDi->paraCount-1].level=0; + state=NOT_SEEKING_STRONG; + } + else if(state==SEEKING_STRONG_FOR_FSI) { + if(stackLast<=UBIDI_MAX_EXPLICIT_LEVEL) { + dirProps[isolateStartStack[stackLast]]=LRI; + 
flags|=DIRPROP_FLAG(LRI); + } + state=LOOKING_FOR_PDI; + } + lastStrong=L; + continue; + } + if(dirProp==R || dirProp==AL) { + if(state==SEEKING_STRONG_FOR_PARA) { + pBiDi->paras[pBiDi->paraCount-1].level=1; + state=NOT_SEEKING_STRONG; + } + else if(state==SEEKING_STRONG_FOR_FSI) { + if(stackLast<=UBIDI_MAX_EXPLICIT_LEVEL) { + dirProps[isolateStartStack[stackLast]]=RLI; + flags|=DIRPROP_FLAG(RLI); + } + state=LOOKING_FOR_PDI; + } + lastStrong=R; + if(dirProp==AL) + lastArabicPos=i-1; + continue; + } + if(dirProp>=FSI && dirProp<=RLI) { /* FSI, LRI or RLI */ + stackLast++; + if(stackLast<=UBIDI_MAX_EXPLICIT_LEVEL) { + isolateStartStack[stackLast]=i-1; + previousStateStack[stackLast]=state; + } + if(dirProp==FSI) + state=SEEKING_STRONG_FOR_FSI; + else + state=LOOKING_FOR_PDI; + continue; + } + if(dirProp==PDI) { + if(state==SEEKING_STRONG_FOR_FSI) { + if(stackLast<=UBIDI_MAX_EXPLICIT_LEVEL) { + dirProps[isolateStartStack[stackLast]]=LRI; + flags|=DIRPROP_FLAG(LRI); + } + } + if(stackLast>=0) { + if(stackLast<=UBIDI_MAX_EXPLICIT_LEVEL) + state=previousStateStack[stackLast]; + stackLast--; + } + continue; + } + if(dirProp==B) { + if(iparas[pBiDi->paraCount-1].limit=i; + if(isDefaultLevelInverse && lastStrong==R) + pBiDi->paras[pBiDi->paraCount-1].level=1; + if(pBiDi->reorderingOptions & UBIDI_OPTION_STREAMING) { + /* When streaming, we only process whole paragraphs + thus some updates are only done on paragraph boundaries */ + pBiDi->length=i; /* i is index to next character */ + pBiDi->controlCount=controlCount; + } + if(iparaCount++; + if(checkParaCount(pBiDi)==FALSE) /* not enough memory for a new para entry */ + return FALSE; + if(isDefaultLevel) { + pBiDi->paras[pBiDi->paraCount-1].level=defaultParaLevel; + state=SEEKING_STRONG_FOR_PARA; + lastStrong=defaultParaLevel; + } else { + pBiDi->paras[pBiDi->paraCount-1].level=pBiDi->paraLevel; + state=NOT_SEEKING_STRONG; + } + stackLast=-1; + } + continue; } } - if(isDefaultLevelInverse && (lastStrongDir==CONTEXT_RTL) 
&&(paraDir!=lastStrongDir)) { - for(i1=paraStart; i1UBIDI_MAX_EXPLICIT_LEVEL) { + stackLast=UBIDI_MAX_EXPLICIT_LEVEL; + if(dirProps[previousStateStack[UBIDI_MAX_EXPLICIT_LEVEL]]!=FSI) + state=LOOKING_FOR_PDI; + } + /* Resolve direction of still unresolved open FSI sequences */ + while(stackLast>=0) { + if(state==SEEKING_STRONG_FOR_FSI) { + dirProps[isolateStartStack[stackLast]]=LRI; + flags|=DIRPROP_FLAG(LRI); } + state=previousStateStack[stackLast]; + stackLast--; + } + /* When streaming, ignore text after the last paragraph separator */ + if(pBiDi->reorderingOptions & UBIDI_OPTION_STREAMING) { + if(pBiDi->lengthparaCount--; + } else { + pBiDi->paras[pBiDi->paraCount-1].limit=originalLength; + pBiDi->controlCount=controlCount; + } + /* For inverse bidi, default para direction is RTL if there is + a strong R or AL at either end of the paragraph */ + if(isDefaultLevelInverse && lastStrong==R) { + pBiDi->paras[pBiDi->paraCount-1].level=1; } if(isDefaultLevel) { - pBiDi->paraLevel=GET_PARALEVEL(pBiDi, 0); + pBiDi->paraLevel=pBiDi->paras[0].level; } - if(pBiDi->reorderingOptions & UBIDI_OPTION_STREAMING) { - if((lastStrongLTR>pBiDi->length) && - (GET_PARALEVEL(pBiDi, lastStrongLTR)==0)) { - pBiDi->length = lastStrongLTR; - } - if(pBiDi->lengthoriginalLength) { - pBiDi->paraCount--; - } - } - /* The following line does nothing new for contextual paraLevel, but is - needed for absolute paraLevel. 
*/ - flags|=DIRPROP_FLAG_LR(pBiDi->paraLevel); + /* The following is needed to resolve the text direction for default level + paragraphs containing no strong character */ + for(i=0; iparaCount; i++) + flags|=DIRPROP_FLAG_LR(pBiDi->paras[i].level); if(pBiDi->orderParagraphsLTR && (flags&DIRPROP_FLAG(B))) { flags|=DIRPROP_FLAG(L); } - - pBiDi->controlCount = controlCount; pBiDi->flags=flags; pBiDi->lastArabicPos=lastArabicPos; + return TRUE; +} + +/* determine the paragraph level at position index */ +U_CFUNC UBiDiLevel +ubidi_getParaLevelAtIndex(const UBiDi *pBiDi, int32_t index) { + int32_t i; + for(i=0; iparaCount; i++) + if(indexparas[i].limit) + break; + if(i>=pBiDi->paraCount) + i=pBiDi->paraCount-1; + return (UBiDiLevel)(pBiDi->paras[i].level); +} + +/* Functions for handling paired brackets ----------------------------------- */ + +/* In the isoRuns array, the first entry is used for text outside of any + isolate sequence. Higher entries are used for each more deeply nested + isolate sequence. isoRunLast is the index of the last used entry. The + openings array is used to note the data of opening brackets not yet + matched by a closing bracket, or matched but still susceptible to change + level. + Each isoRun entry contains the index of the first and + one-after-last openings entries for pending opening brackets it + contains. The next openings entry to use is the one-after-last of the + most deeply nested isoRun entry. + isoRun entries also contain their current embedding level and the last + encountered strong character, since these will be needed to resolve + the level of paired brackets. 
*/ + +static void +bracketInit(UBiDi *pBiDi, BracketData *bd) { + bd->pBiDi=pBiDi; + bd->isoRunLast=0; + bd->isoRuns[0].start=0; + bd->isoRuns[0].limit=0; + bd->isoRuns[0].level=GET_PARALEVEL(pBiDi, 0); + bd->isoRuns[0].lastStrong=GET_PARALEVEL(pBiDi, 0)&1; + bd->isoRuns[0].lastStrongPos=0; + if(pBiDi->openingsMemory) { + bd->openings=pBiDi->openingsMemory; + bd->openingsSize=pBiDi->openingsSize; + } else { + bd->openings=bd->simpleOpenings; + bd->openingsSize=SIMPLE_OPENINGS_SIZE; + } +} + +/* paragraph boundary */ +static void +bracketProcessB(BracketData *bd, UBiDiLevel level) { + bd->isoRunLast=0; + bd->isoRuns[0].limit=0; + bd->isoRuns[0].level=level; + bd->isoRuns[0].lastStrong=level&1; + bd->isoRuns[0].lastStrongPos=0; +} + +/* LRE, LRO, RLE, RLO, PDF */ +static void +bracketProcessBoundary(BracketData *bd, UBiDiLevel level) { + IsoRun *pLastIsoRun=&bd->isoRuns[bd->isoRunLast]; + pLastIsoRun->limit=pLastIsoRun->start; + pLastIsoRun->level=level; + pLastIsoRun->lastStrong=level&1; + pLastIsoRun->lastStrongPos=0; +} + +/* LRI or RLI */ +static void +bracketProcessLRI_RLI(BracketData *bd, UBiDiLevel level) { + IsoRun *pLastIsoRun=&bd->isoRuns[bd->isoRunLast]; + int16_t lastLimit; + lastLimit=pLastIsoRun->limit; + bd->isoRunLast++; + pLastIsoRun++; + pLastIsoRun->start=pLastIsoRun->limit=lastLimit; + pLastIsoRun->level=level; + pLastIsoRun->lastStrong=level&1; + pLastIsoRun->lastStrongPos=0; +} + +/* PDI */ +static void +bracketProcessPDI(BracketData *bd) { + bd->isoRunLast--; +} + +/* newly found opening bracket: create an openings entry */ +static UBool /* return TRUE if success */ +bracketAddOpening(BracketData *bd, UChar match, int32_t position) { + IsoRun *pLastIsoRun=&bd->isoRuns[bd->isoRunLast]; + Opening *pOpening; + if(pLastIsoRun->limit>=bd->openingsSize) { /* no available new entry */ + UBiDi *pBiDi=bd->pBiDi; + if(!getInitialOpeningsMemory(pBiDi, pLastIsoRun->limit * 2)) + return FALSE; + if(bd->openings==bd->simpleOpenings) + 
uprv_memcpy(pBiDi->openingsMemory, bd->simpleOpenings, + SIMPLE_OPENINGS_SIZE * sizeof(Opening)); + bd->openings=pBiDi->openingsMemory; /* may have changed */ + bd->openingsSize=pBiDi->openingsSize; + } + pOpening=&bd->openings[pLastIsoRun->limit]; + pOpening->position=position; + pOpening->match=match; + pOpening->lastStrong=pLastIsoRun->lastStrong; + pOpening->lastStrongPos=pLastIsoRun->lastStrongPos; + pOpening->flags=0; + pLastIsoRun->limit++; + return TRUE; +} + +/* change N0c1 to N0c2 when a preceding bracket is assigned the embedding level */ +static void +fixN0c(BracketData *bd, int32_t openingIndex, int32_t newPropPosition, DirProp newProp) { + /* This function calls itself recursively */ + IsoRun *pLastIsoRun=&bd->isoRuns[bd->isoRunLast]; + Opening *qOpening; + DirProp *dirProps=bd->pBiDi->dirProps; + int32_t k, openingPosition, closingPosition; + for(k=openingIndex+1, qOpening=&bd->openings[k]; klimit; k++, qOpening++) { + if(qOpening->match>=0) /* not an N0c match */ + continue; + if(newPropPosition<=qOpening->lastStrongPos) + break; + if(newPropPosition>=qOpening->position) + continue; + if(newProp==qOpening->lastStrong || (newProp==R && qOpening->lastStrong==AL)) + break; + openingPosition=qOpening->position; + dirProps[openingPosition]=dirProps[newPropPosition]; + closingPosition=-(qOpening->match); + dirProps[closingPosition]= newProp; /* can never be AL */ + qOpening->match=0; /* prevent further changes */ + fixN0c(bd, k, openingPosition, newProp); + fixN0c(bd, k, closingPosition, newProp); + } +} + +/* handle strong characters and candidates for closing brackets */ +static UBool /* return TRUE if success */ +bracketProcessChar(BracketData *bd, int32_t position, DirProp dirProp) { + IsoRun *pLastIsoRun; + Opening *pOpening, *qOpening; + DirProp *dirProps, newProp; + UBiDiDirection direction; + uint8_t flag; + int32_t i, k; + UBool stable; + UChar c, match; + if(DIRPROP_FLAG(dirProp)&MASK_STRONG) { /* L, R or AL */ + 
pLastIsoRun=&bd->isoRuns[bd->isoRunLast]; + pLastIsoRun->lastStrong=dirProp; + pLastIsoRun->lastStrongPos=position; + if(dirProp==AL) + dirProp=R; + flag=DIRPROP_FLAG(dirProp); + /* strong characters found after an unmatched opening bracket + must be noted for possibly applying N0b */ + for(i=pLastIsoRun->start; ilimit; i++) + bd->openings[i].flags|=flag; + return TRUE; + } + if(dirProp!=ON) + return TRUE; + /* First see if it is a matching closing bracket. Hopefully, this is more + efficient than checking if it is a closing bracket at all */ + c=bd->pBiDi->text[position]; + pLastIsoRun=&bd->isoRuns[bd->isoRunLast]; + for(i=pLastIsoRun->limit-1; i>=pLastIsoRun->start; i--) { + if(bd->openings[i].match!=c) + continue; + /* We have a match */ + dirProps=bd->pBiDi->dirProps; + pOpening=&bd->openings[i]; + direction=pLastIsoRun->level&1; + stable=TRUE; /* assume stable until proved otherwise */ + + /* The stable flag is set when brackets are paired and their + level is resolved and cannot be changed by what will be + found later in the source string. + An unstable match can occur only when applying N0c, where + the resolved level depends on the preceding context, and + this context may be affected by text occurring later. + Example: RTL paragraph containing: abc[(latin) HEBREW] + When the closing parenthesis is encountered, it appears + that N0c1 must be applied since 'abc' sets an opposite + direction context and both parentheses receive level 2. + However, when the closing square bracket is processed, + N0b applies because of 'HEBREW' being included within the + brackets, thus the square brackets are treated like R and + receive level 1. However, this changes the preceding + context of the opening parenthesis, and it now appears + that N0c2 must be applied to the parentheses rather than + N0c1. 
*/ + + if((direction==0 && pOpening->flags&FOUND_L) || + (direction==1 && pOpening->flags&FOUND_R)) { /* N0b */ + newProp=direction; + } + else if(pOpening->flags&(FOUND_L|FOUND_R)) { /* N0c */ + if((direction==1 && pOpening->lastStrong==L) || + (direction==0 && pOpening->lastStrong!=L)) { + newProp=direction^1; /* N0c1 */ + /* it is stable if there is no preceding text or in + conditions too complicated and not worth checking */ + stable=(i==pLastIsoRun->start); + } + else + newProp=direction; /* N0c2 */ + } + else { + newProp=BN; /* N0d */ + } + if(newProp==L) { + dirProps[pOpening->position]=L; + dirProps[position]=L; + pLastIsoRun->lastStrong=L; + } + else if(newProp==R) { + dirProps[pOpening->position]=pOpening->lastStrong==AL ? AL : R; + dirProps[position]=pLastIsoRun->lastStrong==AL ? AL : R; + pLastIsoRun->lastStrong=dirProps[position]; + } + /* Update nested N0c pairs that may be affected */ + if(newProp==direction) + fixN0c(bd, i, pOpening->position, newProp); + if(stable) { + pLastIsoRun->limit=i; /* forget any brackets nested within this pair */ + /* remove lower located synonyms if any */ + while(pLastIsoRun->limit>pLastIsoRun->start && + bd->openings[pLastIsoRun->limit-1].position==pOpening->position) + pLastIsoRun->limit--; + } + else { + pOpening->match=-position; + /* neutralize lower located synonyms if any */ + k=i-1; + while(k>=pLastIsoRun->start && + bd->openings[k].position==pOpening->position) + bd->openings[k--].match=0; + /* neutralize any unmatched opening between the current pair; + this will also neutralize higher located synonyms if any */ + for(k=i+1; klimit; k++) { + qOpening=&bd->openings[k]; + if(qOpening->position>=position) + break; + if(qOpening->match>0) + qOpening->match=0; + } + } + return TRUE; + } + /* We get here only if the ON character was not a matching closing bracket */ + /* Now see if it is an opening bracket */ + match=u_getBidiPairedBracket(c); /* get the matching char */ + if(match==c) /* if no matching char */ + 
return TRUE; + if(u_getIntPropertyValue(c, UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=U_BPT_OPEN) + return TRUE; /* not an opening bracket */ + /* special case: process synonyms + create an opening entry for each synonym */ + if(match==0x232A) { /* RIGHT-POINTING ANGLE BRACKET */ + if(!bracketAddOpening(bd, 0x3009, position)) + return FALSE; + } + else if(match==0x3009) { /* RIGHT ANGLE BRACKET */ + if(!bracketAddOpening(bd, 0x232A, position)) + return FALSE; + } + return bracketAddOpening(bd, match, position); } /* perform (X1)..(X9) ------------------------------------------------------- */ @@ -564,14 +924,14 @@ directionFromFlags(UBiDi *pBiDi) { * * The BiDi algorithm is designed to result in the same behavior whether embedding * levels are externally specified (from "styled text", supposedly the preferred - * method) or set by explicit embedding codes (LRx, RLx, PDF) in the plain text. - * That is why (X9) instructs to remove all explicit codes (and BN). - * However, in a real implementation, this removal of these codes and their index + * method) or set by explicit embedding codes (LRx, RLx, PDF, FSI, PDI) in the plain text. + * That is why (X9) instructs to remove all not-isolate explicit codes (and BN). + * However, in a real implementation, the removal of these codes and their index * positions in the plain text is undesirable since it would result in * reallocated, reindexed text. * Instead, this implementation leaves the codes in there and just ignores them * in the subsequent processing. - * In order to get the same reordering behavior, positions with a BN or an + * In order to get the same reordering behavior, positions with a BN or a not-isolate * explicit embedding code just get the same level assigned as the last "real" * character. 
* @@ -594,24 +954,20 @@ directionFromFlags(UBiDi *pBiDi) { * * Handling the stack of explicit levels (Xn): * - * With the BiDi stack of explicit levels, - * as pushed with each LRE, RLE, LRO, and RLO and popped with each PDF, - * the explicit level must never exceed UBIDI_MAX_EXPLICIT_LEVEL==61. + * With the BiDi stack of explicit levels, as pushed with each + * LRE, RLE, LRO, RLO, LRI, RLI and FSI and popped with each PDF and PDI, + * the explicit level must never exceed UBIDI_MAX_EXPLICIT_LEVEL. * * In order to have a correct push-pop semantics even in the case of overflows, - * there are two overflow counters: - * - countOver60 is incremented with each LRx at level 60 - * - from level 60, one RLx increases the level to 61 - * - countOver61 is incremented with each LRx and RLx at level 61 - * - * Popping levels with PDF must work in the opposite order so that level 61 - * is correct at the correct point. Underflows (too many PDFs) must be checked. + * overflow counters and a valid isolate counter are used as described in UAX#9 + * section 3.3.2 "Explicit Levels and Directions". * * This implementation assumes that UBIDI_MAX_EXPLICIT_LEVEL is odd.
*/ static UBiDiDirection -resolveExplicitLevels(UBiDi *pBiDi) { - const DirProp *dirProps=pBiDi->dirProps; +resolveExplicitLevels(UBiDi *pBiDi, UErrorCode *pErrorCode) { + if(U_FAILURE(*pErrorCode)) { return UBIDI_LTR; } + DirProp *dirProps=pBiDi->dirProps; UBiDiLevel *levels=pBiDi->levels; const UChar *text=pBiDi->text; @@ -619,110 +975,189 @@ resolveExplicitLevels(UBiDi *pBiDi) { Flags flags=pBiDi->flags; /* collect all directionalities in the text */ DirProp dirProp; UBiDiLevel level=GET_PARALEVEL(pBiDi, 0); - UBiDiDirection direction; - int32_t paraIndex=0; + pBiDi->isolateCount=0; /* determine if the text is mixed-directional or single-directional */ direction=directionFromFlags(pBiDi); - /* we may not need to resolve any explicit levels, but for multiple - paragraphs we want to loop on all chars to set the para boundaries */ - if((direction!=UBIDI_MIXED) && (pBiDi->paraCount==1)) { + /* we may not need to resolve any explicit levels */ + if((direction!=UBIDI_MIXED)) { /* not mixed directionality: levels don't matter - trailingWSStart will be 0 */ - } else if((pBiDi->paraCount==1) && - (!(flags&MASK_EXPLICIT) || - (pBiDi->reorderingMode > UBIDI_REORDER_LAST_LOGICAL_TO_VISUAL))) { - /* mixed, but all characters are at the same embedding level */ - /* or we are in "inverse BiDi" */ - /* and we don't have contextual multiple paragraphs with some B char */ + return direction; + } + if(pBiDi->reorderingMode > UBIDI_REORDER_LAST_LOGICAL_TO_VISUAL) { + /* inverse BiDi: mixed, but all characters are at the same embedding level */ /* set all levels to the paragraph level */ - for(i=0; iparaCount; paraIndex++) { + if(paraIndex==0) + start=0; + else + start=pBiDi->paras[paraIndex-1].limit; + limit=pBiDi->paras[paraIndex].limit; + level=pBiDi->paras[paraIndex].level; + for(i=start; iparaCount; paraIndex++) { + if(paraIndex==0) + start=0; + else + start=pBiDi->paras[paraIndex-1].limit; + limit=pBiDi->paras[paraIndex].limit; + level=pBiDi->paras[paraIndex].level; + 
for(i=start; i=UBIDI_MAX_EXPLICIT_LEVEL */ - uint32_t countOver60=0, countOver61=0; /* count overflows of explicit levels */ + uint16_t stack[UBIDI_MAX_EXPLICIT_LEVEL+2]; /* we never push anything >=UBIDI_MAX_EXPLICIT_LEVEL + but we need one more entry as base */ + uint32_t stackLast=0; + int32_t overflowIsolateCount=0; + int32_t overflowEmbeddingCount=0; + int32_t validIsolateCount=0; + BracketData bracketData; + bracketInit(pBiDi, &bracketData); + stack[0]=level; /* initialize base entry to para level, no override, no isolate */ /* recalculate the flags */ flags=0; for(i=0; i0) { - --countOver61; - } else if(countOver60>0 && (embeddingLevel&~UBIDI_LEVEL_OVERRIDE)!=UBIDI_MAX_EXPLICIT_LEVEL) { - /* handle LRx overflows from level 60 */ - --countOver60; - } else if(stackTop>0) { - /* this is the pop operation; it also pops level 61 while countOver60>0 */ - --stackTop; - embeddingLevel=stack[stackTop]; - /* } else { (underflow) */ - } flags|=DIRPROP_FLAG(BN); + /* handle all the overflow cases first */ + if(overflowIsolateCount) { + dirProps[i]|=IGNORE_CC; + break; + } + if(overflowEmbeddingCount) { + dirProps[i]|=IGNORE_CC; + overflowEmbeddingCount--; + break; + } + if(stackLast>0 && stack[stackLast]pBiDi->isolateCount) + pBiDi->isolateCount=validIsolateCount; + embeddingLevel=newLevel; + stackLast++; + stack[stackLast]=embeddingLevel+ISOLATE; + bracketProcessLRI_RLI(&bracketData, embeddingLevel); + } else { + dirProps[i]|=IGNORE_CC; + overflowIsolateCount++; + } + break; + case PDI: + /* (X6a) */ + if(overflowIsolateCount) { + dirProps[i]|=IGNORE_CC; + overflowIsolateCount--; + } + else if(validIsolateCount) { + overflowEmbeddingCount=0; + while(stack[stackLast]paras[paraIndex++]=i+1; - } + bracketProcessB(&bracketData, embeddingLevel); } flags|=DIRPROP_FLAG(B); break; @@ -733,17 +1168,13 @@ resolveExplicitLevels(UBiDi *pBiDi) { break; default: /* all other types get the "real" level */ - if(level!=embeddingLevel) { - level=embeddingLevel; - 
if(level&UBIDI_LEVEL_OVERRIDE) { - flags|=DIRPROP_FLAG_O(level)|DIRPROP_FLAG_MULTI_RUNS; - } else { - flags|=DIRPROP_FLAG_E(level)|DIRPROP_FLAG_MULTI_RUNS; - } - } - if(!(level&UBIDI_LEVEL_OVERRIDE)) { + level=embeddingLevel; + if(level&UBIDI_LEVEL_OVERRIDE) + flags|=DIRPROP_FLAG_LR(level); + else flags|=DIRPROP_FLAG(dirProp); - } + if(!bracketProcessChar(&bracketData, i, dirProp)) + return -1; break; } @@ -752,6 +1183,15 @@ resolveExplicitLevels(UBiDi *pBiDi) { * explicit codes because we will later look at same-level runs (X10). */ levels[i]=level; + if(i>0 && levels[i-1]!=level) { + flags|=DIRPROP_FLAG_MULTI_RUNS; + if(level&UBIDI_LEVEL_OVERRIDE) + flags|=DIRPROP_FLAG_O(level); + else + flags|=DIRPROP_FLAG_E(level); + } + if(DIRPROP_FLAG(dirProp)&MASK_ISO) + level=embeddingLevel; } if(flags&MASK_EMBEDDING) { flags|=DIRPROP_FLAG_LR(pBiDi->paraLevel); @@ -766,7 +1206,6 @@ resolveExplicitLevels(UBiDi *pBiDi) { pBiDi->flags=flags; direction=directionFromFlags(pBiDi); } - return direction; } @@ -782,19 +1221,28 @@ resolveExplicitLevels(UBiDi *pBiDi) { */ static UBiDiDirection checkExplicitLevels(UBiDi *pBiDi, UErrorCode *pErrorCode) { - const DirProp *dirProps=pBiDi->dirProps; + DirProp *dirProps=pBiDi->dirProps; DirProp dirProp; UBiDiLevel *levels=pBiDi->levels; - const UChar *text=pBiDi->text; + int32_t isolateCount=0; int32_t i, length=pBiDi->length; Flags flags=0; /* collect all directionalities in the text */ UBiDiLevel level; - uint32_t paraIndex=0; + pBiDi->isolateCount=0; for(i=0; ipBiDi->isolateCount) + pBiDi->isolateCount=isolateCount; + } + else if(dirProp==PDI) + isolateCount--; + else if(dirProp==B) + isolateCount=0; if(level&UBIDI_LEVEL_OVERRIDE) { /* keep the override flag in levels[i] but adjust the flags */ level&=~UBIDI_LEVEL_OVERRIDE; /* make the range check below simpler */ @@ -810,11 +1258,6 @@ checkExplicitLevels(UBiDi *pBiDi, UErrorCode *pErrorCode) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return UBIDI_LTR; } - if((dirProp==B) && 
((i+1)paras[paraIndex++]=i+1; - } - } } if(flags&MASK_EMBEDDING) { flags|=DIRPROP_FLAG_LR(pBiDi->paraLevel); @@ -850,8 +1293,8 @@ checkExplicitLevels(UBiDi *pBiDi, UErrorCode *pErrorCode) { static const uint8_t groupProp[] = /* dirProp regrouped */ { -/* L R EN ES ET AN CS B S WS ON LRE LRO AL RLE RLO PDF NSM BN */ - 0, 1, 2, 7, 8, 3, 9, 6, 5, 4, 4, 10, 10, 12, 10, 10, 10, 11, 10 +/* L R EN ES ET AN CS B S WS ON LRE LRO AL RLE RLO PDF NSM BN FSI LRI RLI PDI */ + 0, 1, 2, 7, 8, 3, 9, 6, 5, 4, 4, 10, 10, 12, 10, 10, 10, 11, 10, 4, 4, 4, 4 }; enum { DirProp_L=0, DirProp_R=1, DirProp_EN=2, DirProp_AN=3, DirProp_ON=4, DirProp_S=5, DirProp_B=6 }; /* reduced dirProp */ @@ -860,7 +1303,7 @@ enum { DirProp_L=0, DirProp_R=1, DirProp_EN=2, DirProp_AN=3, DirProp_ON=4, DirPr PROPERTIES STATE TABLE In table impTabProps, - - the ON column regroups ON and WS + - the ON column regroups ON and WS, FSI, RLI, LRI and PDI - the BN column regroups BN, LRE, RLE, LRO, RLO, PDF - the Res column is the reduced property assigned to a run @@ -1185,6 +1628,7 @@ typedef struct { int32_t startL2EN; /* start of level 2 sequence */ int32_t lastStrongRTL; /* index of last found R or AL */ int32_t state; /* current state */ + int32_t runStart; /* start position of the run */ UBiDiLevel runLevel; /* run level before implicit solving */ } LevState; @@ -1327,7 +1771,7 @@ processPropertySeq(UBiDi *pBiDi, LevState *pLevState, uint8_t _prop, case 5: /* EN/AN after R/AL + possible cont */ /* check for real AN */ - if ((_prop == DirProp_AN) && (NO_CONTEXT_RTL(pBiDi->dirProps[start0]) == AN) && + if ((_prop == DirProp_AN) && (pBiDi->dirProps[start0] == AN) && (pBiDi->reorderingMode!=UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL)) { /* real AN */ @@ -1432,15 +1876,32 @@ processPropertySeq(UBiDi *pBiDi, LevState *pLevState, uint8_t _prop, } if((addLevel) || (start < start0)) { level=pLevState->runLevel + addLevel; - for(k=start; k=pLevState->runStart) { + for(k=start; kdirProps, dirProp; + int32_t isolateCount=0; 
+ for(k=start; kprologue; int32_t length=pBiDi->proLength; int32_t i; @@ -1463,9 +1924,12 @@ lastL_R_AL(UBiDi *pBiDi) { return DirProp_ON; } +/** + * Returns the directionality of the first strong character, or digit, in the epilogue, if any. + * Requires epilogue!=null. + */ static DirProp firstL_R_AL_EN_AN(UBiDi *pBiDi) { - /* return first strong char or digit in epilogue */ const UChar *text=pBiDi->epilogue; int32_t length=pBiDi->epiLength; int32_t i; @@ -1496,17 +1960,15 @@ resolveImplicitLevels(UBiDi *pBiDi, int32_t start, int32_t limit, DirProp sor, DirProp eor) { const DirProp *dirProps=pBiDi->dirProps; - + DirProp dirProp; LevState levState; int32_t i, start1, start2; - uint8_t oldStateImp, stateImp, actionImp; + uint16_t oldStateImp, stateImp, actionImp; uint8_t gprop, resProp, cell; UBool inverseRTL; DirProp nextStrongProp=R; int32_t nextStrongPos=-1; - levState.startON = -1; /* silence gcc flow analysis */ - /* check for RTL inverse BiDi mode */ /* FOOD FOR THOUGHT: in case of RTL inverse BiDi, it would make sense to * loop on the text characters from end to start. 
@@ -1518,10 +1980,12 @@ resolveImplicitLevels(UBiDi *pBiDi, ((startlastArabicPos) && (GET_PARALEVEL(pBiDi, start) & 1) && (pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_LIKE_DIRECT || pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL)); - /* initialize for levels state table */ + + /* initialize for property and levels state tables */ + levState.startON=-1; levState.startL2EN=-1; /* used for INVERSE_LIKE_DIRECT_WITH_MARKS */ levState.lastStrongRTL=-1; /* used for INVERSE_LIKE_DIRECT_WITH_MARKS */ - levState.state=0; + levState.runStart=start; levState.runLevel=pBiDi->levels[start]; levState.pImpTab=(const ImpTab*)((pBiDi->pImpTabPair)->pImpTab)[levState.runLevel&1]; levState.pImpAct=(const ImpAct*)((pBiDi->pImpTabPair)->pImpAct)[levState.runLevel&1]; @@ -1531,22 +1995,36 @@ resolveImplicitLevels(UBiDi *pBiDi, sor=lastStrong; } } - processPropertySeq(pBiDi, &levState, sor, start, start); - /* initialize for property state table */ - if(NO_CONTEXT_RTL(dirProps[start])==NSM) { - stateImp = 1 + sor; + /* The isolates[] entries contain enough information to + resume the bidi algorithm in the same state as it was + when it was interrupted by an isolate sequence. 
*/ + if(dirProps[start]==PDI) { + start1=pBiDi->isolates[pBiDi->isolateCount].start1; + stateImp=pBiDi->isolates[pBiDi->isolateCount].stateImp; + levState.state=pBiDi->isolates[pBiDi->isolateCount].state; + pBiDi->isolateCount--; } else { - stateImp=0; + start1=start; + if(dirProps[start]==NSM) + stateImp = 1 + sor; + else + stateImp=0; + levState.state=0; + processPropertySeq(pBiDi, &levState, sor, start, start); } - start1=start; start2=start; for(i=start; i<=limit; i++) { if(i>=limit) { + if(limit>start) { + dirProp=pBiDi->dirProps[limit-1]; + if(dirProp==LRI || dirProp==RLI) + break; /* no forced closing for sequence ending with LRI/RLI */ + } gprop=eor; } else { DirProp prop, prop1; - prop=NO_CONTEXT_RTL(dirProps[i]); + prop=PURE_DIRPROP(dirProps[i]); if(inverseRTL) { if(prop==AL) { /* AL before EN does not make it AN */ @@ -1558,7 +2036,7 @@ resolveImplicitLevels(UBiDi *pBiDi, nextStrongProp=R; /* set default */ nextStrongPos=limit; for(j=i+1; jlength && pBiDi->epiLength>0) { DirProp firstStrong=firstL_R_AL_EN_AN(pBiDi); @@ -1614,7 +2093,16 @@ resolveImplicitLevels(UBiDi *pBiDi, eor=firstStrong; } } - processPropertySeq(pBiDi, &levState, eor, limit, limit); + + dirProp=dirProps[limit-1]; + if((dirProp==LRI || dirProp==RLI) && limitlength) { + pBiDi->isolateCount++; + pBiDi->isolates[pBiDi->isolateCount].stateImp=stateImp; + pBiDi->isolates[pBiDi->isolateCount].state=levState.state; + pBiDi->isolates[pBiDi->isolateCount].start1=start1; + } + else + processPropertySeq(pBiDi, &levState, eor, limit, limit); } /* perform (L1) and (X9) ---------------------------------------------------- */ @@ -1638,7 +2126,7 @@ adjustWSLevels(UBiDi *pBiDi) { i=pBiDi->trailingWSStart; while(i>0) { /* reset a sequence of WS/BN before eop and B/S to the paragraph paraLevel */ - while(i>0 && (flag=DIRPROP_FLAG_NC(dirProps[--i]))&MASK_WS) { + while(i>0 && (flag=DIRPROP_FLAG(PURE_DIRPROP(dirProps[--i])))&MASK_WS) { if(orderParagraphsLTR&&(flag&DIRPROP_FLAG(B))) { levels[i]=0; } else { 
@@ -1649,7 +2137,7 @@ adjustWSLevels(UBiDi *pBiDi) { /* reset BN to the next character's paraLevel until B/S, which restarts above loop */ /* here, i+1 is guaranteed to be 0) { - flag=DIRPROP_FLAG_NC(dirProps[--i]); + flag=DIRPROP_FLAG(PURE_DIRPROP(dirProps[--i])); if(flag&MASK_BN_EXPLICIT) { levels[i]=levels[i+1]; } else if(orderParagraphsLTR&&(flag&DIRPROP_FLAG(B))) { @@ -1700,6 +2188,7 @@ setParaSuccess(UBiDi *pBiDi) { #define BIDI_MIN(x, y) ((x)<(y) ? (x) : (y)) #define BIDI_ABS(x) ((x)>=0 ? (x) : (-(x))) + static void setParaRunsOnly(UBiDi *pBiDi, const UChar *text, int32_t length, UBiDiLevel paraLevel, UErrorCode *pErrorCode) { @@ -1917,7 +2406,7 @@ ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, pBiDi->text=text; pBiDi->length=pBiDi->originalLength=pBiDi->resultLength=length; pBiDi->paraLevel=paraLevel; - pBiDi->direction=UBIDI_LTR; + pBiDi->direction=paraLevel&1; pBiDi->paraCount=1; pBiDi->dirProps=NULL; @@ -1929,11 +2418,7 @@ ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, /* * Save the original paraLevel if contextual; otherwise, set to 0. 
*/ - if(IS_DEFAULT_LEVEL(paraLevel)) { - pBiDi->defaultParaLevel=paraLevel; - } else { - pBiDi->defaultParaLevel=0; - } + pBiDi->defaultParaLevel=IS_DEFAULT_LEVEL(paraLevel); if(length==0) { /* @@ -1945,14 +2430,7 @@ ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, pBiDi->paraLevel&=1; pBiDi->defaultParaLevel=0; } - if(paraLevel&1) { - pBiDi->flags=DIRPROP_FLAG(R); - pBiDi->direction=UBIDI_RTL; - } else { - pBiDi->flags=DIRPROP_FLAG(L); - pBiDi->direction=UBIDI_LTR; - } - + pBiDi->flags=DIRPROP_FLAG_LR(paraLevel); pBiDi->runCount=0; pBiDi->paraCount=0; setParaSuccess(pBiDi); /* mark successful setPara */ @@ -1961,6 +2439,12 @@ ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, pBiDi->runCount=-1; + /* allocate paras memory */ + if(pBiDi->parasMemory) + pBiDi->paras=pBiDi->parasMemory; + else + pBiDi->paras=pBiDi->simpleParas; + /* * Get the directional properties, * the flags bit-set, and @@ -1968,7 +2452,10 @@ ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, */ if(getDirPropsMemory(pBiDi, length)) { pBiDi->dirProps=pBiDi->dirPropsMemory; - getDirProps(pBiDi); + if(!getDirProps(pBiDi)) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return; + } } else { *pErrorCode=U_MEMORY_ALLOCATION_ERROR; return; @@ -1976,27 +2463,16 @@ ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, /* the processed length may have changed if UBIDI_OPTION_STREAMING */ length= pBiDi->length; pBiDi->trailingWSStart=length; /* the levels[] will reflect the WS run */ - /* allocate paras memory */ - if(pBiDi->paraCount>1) { - if(getInitialParasMemory(pBiDi, pBiDi->paraCount)) { - pBiDi->paras=pBiDi->parasMemory; - pBiDi->paras[pBiDi->paraCount-1]=length; - } else { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - return; - } - } else { - /* initialize paras for single paragraph */ - pBiDi->paras=pBiDi->simpleParas; - pBiDi->simpleParas[0]=length; - } /* are explicit levels specified? 
*/ if(embeddingLevels==NULL) { /* no: determine explicit levels according to the (Xn) rules */\ if(getLevelsMemory(pBiDi, length)) { pBiDi->levels=pBiDi->levelsMemory; - direction=resolveExplicitLevels(pBiDi); + direction=resolveExplicitLevels(pBiDi, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return; + } } else { *pErrorCode=U_MEMORY_ALLOCATION_ERROR; return; @@ -2010,6 +2486,22 @@ ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, } } + /* allocate isolate memory */ + if(pBiDi->isolateCount<=SIMPLE_ISOLATES_SIZE) + pBiDi->isolates=pBiDi->simpleIsolates; + else + if(pBiDi->isolateCount<=pBiDi->isolatesSize) + pBiDi->isolates=pBiDi->isolatesMemory; + else { + if(getInitialIsolatesMemory(pBiDi, pBiDi->isolateCount)) { + pBiDi->isolates=pBiDi->isolatesMemory; + } else { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return; + } + } + pBiDi->isolateCount=-1; /* current isolates stack entry == none */ + /* * The steps after (X9) in the UBiDi algorithm are performed only if * the paragraph text has mixed directionality! 
@@ -2104,7 +2596,7 @@ ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, /* the values for this run's start are the same as for the previous run's end */ start=limit; level=nextLevel; - if((start>0) && (NO_CONTEXT_RTL(pBiDi->dirProps[start-1])==B)) { + if((start>0) && (pBiDi->dirProps[start-1]==B)) { /* except if this is a new paragraph, then set sor = para level */ sor=GET_LR_FROM_LEVEL(GET_PARALEVEL(pBiDi, start)); } else { @@ -2158,18 +2650,19 @@ ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, ((pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_LIKE_DIRECT) || (pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL))) { int32_t i, j, start, last; + UBiDiLevel level; DirProp dirProp; for(i=0; iparaCount; i++) { - last=pBiDi->paras[i]-1; - if((pBiDi->dirProps[last] & CONTEXT_RTL)==0) { + last=(pBiDi->paras[i].limit)-1; + level=pBiDi->paras[i].level; + if(level==0) continue; /* LTR paragraph */ - } - start= i==0 ? 0 : pBiDi->paras[i - 1]; + start= i==0 ? 0 : pBiDi->paras[i-1].limit; for(j=last; j>=start; j--) { - dirProp=NO_CONTEXT_RTL(pBiDi->dirProps[j]); + dirProp=pBiDi->dirProps[j]; if(dirProp==L) { if(jdirProps[last])==B) { + while(pBiDi->dirProps[last]==B) { last--; } } @@ -2285,7 +2778,7 @@ ubidi_getParagraphByIndex(const UBiDi *pBiDi, int32_t paraIndex, pBiDi=pBiDi->pParaBiDi; /* get Para object if Line object */ if(paraIndex) { - paraStart=pBiDi->paras[paraIndex-1]; + paraStart=pBiDi->paras[paraIndex-1].limit; } else { paraStart=0; } @@ -2293,7 +2786,7 @@ ubidi_getParagraphByIndex(const UBiDi *pBiDi, int32_t paraIndex, *pParaStart=paraStart; } if(pParaLimit!=NULL) { - *pParaLimit=pBiDi->paras[paraIndex]; + *pParaLimit=pBiDi->paras[paraIndex].limit; } if(pParaLevel!=NULL) { *pParaLevel=GET_PARALEVEL(pBiDi, paraStart); @@ -2304,7 +2797,7 @@ U_CAPI int32_t U_EXPORT2 ubidi_getParagraph(const UBiDi *pBiDi, int32_t charIndex, int32_t *pParaStart, int32_t *pParaLimit, UBiDiLevel *pParaLevel, UErrorCode *pErrorCode) { - uint32_t 
paraIndex; + int32_t paraIndex; /* check the argument values */ /* pErrorCode will be checked by the call to ubidi_getParagraphByIndex */ @@ -2313,7 +2806,7 @@ ubidi_getParagraph(const UBiDi *pBiDi, int32_t charIndex, pBiDi=pBiDi->pParaBiDi; /* get Para object if Line object */ RETURN_IF_BAD_RANGE(charIndex, 0, pBiDi->length, *pErrorCode, -1); - for(paraIndex=0; charIndex>=pBiDi->paras[paraIndex]; paraIndex++); + for(paraIndex=0; charIndex>=pBiDi->paras[paraIndex].limit; paraIndex++); ubidi_getParagraphByIndex(pBiDi, paraIndex, pParaStart, pParaLimit, pParaLevel, pErrorCode); return paraIndex; } @@ -2366,8 +2859,7 @@ ubidi_getCustomizedClass(UBiDi *pBiDi, UChar32 c) { dir = ubidi_getClass(pBiDi->bdp, c); } - if(dir > 18) { - // TODO: Implement Unicode 6.3 BiDi isolates in the ICU BiDi code. + if(dir >= U_CHAR_DIRECTION_COUNT) { dir = ON; } return dir; diff --git a/icu4c/source/common/ubidiimp.h b/icu4c/source/common/ubidiimp.h index c6ac0104d08..848c2431d17 100644 --- a/icu4c/source/common/ubidiimp.h +++ b/icu4c/source/common/ubidiimp.h @@ -72,30 +72,27 @@ enum { #define DIRPROP_FLAG_MULTI_RUNS (1UL<<31) /* are there any characters that are LTR or RTL? 
*/ -#define MASK_LTR (DIRPROP_FLAG(L)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(AN)|DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)) -#define MASK_RTL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)) +#define MASK_LTR (DIRPROP_FLAG(L)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(AN)|DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(LRI)) +#define MASK_RTL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(RLI)) #define MASK_R_AL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)) +#define MASK_STRONG (DIRPROP_FLAG(L)|DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)) /* explicit embedding codes */ -#define MASK_LRX (DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)) -#define MASK_RLX (DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)) -#define MASK_OVERRIDE (DIRPROP_FLAG(LRO)|DIRPROP_FLAG(RLO)) +#define MASK_EXPLICIT (DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(PDF)) + +/* explicit isolate codes */ +#define MASK_ISO (DIRPROP_FLAG(LRI)|DIRPROP_FLAG(RLI)|DIRPROP_FLAG(FSI)|DIRPROP_FLAG(PDI)) -#define MASK_EXPLICIT (MASK_LRX|MASK_RLX|DIRPROP_FLAG(PDF)) #define MASK_BN_EXPLICIT (DIRPROP_FLAG(BN)|MASK_EXPLICIT) /* paragraph and segment separators */ #define MASK_B_S (DIRPROP_FLAG(B)|DIRPROP_FLAG(S)) /* all types that are counted as White Space or Neutral in some steps */ -#define MASK_WS (MASK_B_S|DIRPROP_FLAG(WS)|MASK_BN_EXPLICIT) -#define MASK_N (DIRPROP_FLAG(ON)|MASK_WS) - -/* all types that are included in a sequence of European Terminators for (W5) */ -#define MASK_ET_NSM_BN (DIRPROP_FLAG(ET)|DIRPROP_FLAG(NSM)|MASK_BN_EXPLICIT) +#define MASK_WS (MASK_B_S|DIRPROP_FLAG(WS)|MASK_BN_EXPLICIT|MASK_ISO) /* types that are neutrals or could becomes neutrals in (Wn) */ -#define MASK_POSSIBLE_N (DIRPROP_FLAG(CS)|DIRPROP_FLAG(ES)|DIRPROP_FLAG(ET)|MASK_N) +#define MASK_POSSIBLE_N (DIRPROP_FLAG(ON)|DIRPROP_FLAG(CS)|DIRPROP_FLAG(ES)|DIRPROP_FLAG(ET)|MASK_WS) /* * These types may be changed to "e", @@ -110,22 +107,33 @@ enum { #define IS_DEFAULT_LEVEL(level) ((level)>=0xfe) /* - * The 
following bit is ORed to the property of characters in paragraphs - * with contextual RTL direction when paraLevel is contextual. + * The following bit is ORed to the property of directional control + * characters which are ignored: unmatched PDF or PDI; LRx, RLx or FSI + * which would exceed the maximum explicit bidi level. */ -#define CONTEXT_RTL 0x80 -#define NO_CONTEXT_RTL(dir) ((dir)&~CONTEXT_RTL) +#define IGNORE_CC 0x40 + +#define PURE_DIRPROP(prop) ((prop)&~IGNORE_CC) + /* - * The following is a variant of DIRPROP_FLAG which ignores the CONTEXT_RTL bit. + * The following bit is used for the directional isolate status. + * Stack entries corresponding to isolate sequences are greater than ISOLATE. */ -#define DIRPROP_FLAG_NC(dir) (1UL<<(NO_CONTEXT_RTL(dir))) +#define ISOLATE 0x0100 + +U_CFUNC UBiDiLevel +ubidi_getParaLevelAtIndex(const UBiDi *pBiDi, int32_t index); #define GET_PARALEVEL(ubidi, index) \ - (UBiDiLevel)((ubidi)->defaultParaLevel ? (ubidi)->dirProps[index]>>7 \ - : (ubidi)->paraLevel) + ((UBiDiLevel)(!(ubidi)->defaultParaLevel || (index)<(ubidi)->paras[0].limit ? 
\ + (ubidi)->paraLevel : ubidi_getParaLevelAtIndex((ubidi), (index)))) -/* Paragraph type for multiple paragraph support ---------------------------- */ -typedef int32_t Para; +/* number of paras entries allocated initially without malloc */ +#define SIMPLE_PARAS_SIZE 10 +/* number of isolate entries allocated initially without malloc */ +#define SIMPLE_ISOLATES_SIZE 5 +/* number of isolate run entries for paired brackets allocated initially without malloc */ +#define SIMPLE_OPENINGS_SIZE 20 #define CR 0x000D #define LF 0x000A @@ -138,6 +146,50 @@ enum { RLM_AFTER=8 }; +typedef struct Para { + int32_t limit; + int32_t level; +} Para; + +enum { /* flags for Opening.flags */ + FOUND_L=DIRPROP_FLAG(L), + FOUND_R=DIRPROP_FLAG(R) +}; + +typedef struct Opening { + int32_t position; /* position of opening bracket */ + int32_t match; /* matching char or -position of closing bracket */ + int32_t lastStrongPos; /* position of last strong char found before opening */ + DirProp lastStrong; /* bidi class of last strong char before opening */ + uint16_t flags; /* bits for L or R/AL found within the pair */ +} Opening; + +typedef struct IsoRun { + int32_t lastStrongPos; /* position of last strong char found in this run */ + uint16_t start; /* index of first opening entry for this run */ + uint16_t limit; /* index after last opening entry for this run */ + UBiDiLevel level; /* level of this run */ + DirProp lastStrong; /* bidi class of last strong char found in this run */ +} IsoRun; + +typedef struct BracketData { + UBiDi *pBiDi; + /* array of opening entries which should be enough in most cases; no malloc() */ + Opening simpleOpenings[SIMPLE_OPENINGS_SIZE]; + Opening *openings; /* pointer to current array of entries */ + int32_t openingsSize; /* number of allocated entries */ + int32_t isoRunLast; /* index of last used entry */ + /* array of nested isolated sequence entries; can never excess UBIDI_MAX_EXPLICIT_LEVEL + + 1 for index 0, + 1 for before the first isolated sequence 
*/ + IsoRun isoRuns[UBIDI_MAX_EXPLICIT_LEVEL+2]; +} BracketData; + +typedef struct Isolate { + int32_t start1; + int16_t stateImp; + int16_t state; +} Isolate; + typedef struct Run { int32_t logicalStart, /* first character of the run; b31 indicates even/odd level */ visualLimit, /* last visual position of the run +1 */ @@ -170,10 +222,14 @@ enum { RLE_CHAR, PDF_CHAR, LRO_CHAR, - RLO_CHAR + RLO_CHAR, + LRI_CHAR=0x2066, + RLI_CHAR, + FSI_CHAR, + PDI_CHAR }; -#define IS_BIDI_CONTROL_CHAR(c) (((uint32_t)(c)&0xfffffffc)==ZWNJ_CHAR || (uint32_t)((c)-LRE_CHAR)<5) +#define IS_BIDI_CONTROL_CHAR(c) (((uint32_t)(c)&0xfffffffc)==ZWNJ_CHAR || (uint32_t)((c)-LRE_CHAR)<5 || (uint32_t)((c)-LRI_CHAR)<4) /* InsertPoints structure for noting where to put BiDi marks ---------------- */ @@ -222,19 +278,21 @@ struct UBiDi { int32_t resultLength; /* memory sizes in bytes */ - int32_t dirPropsSize, levelsSize, parasSize, runsSize; + int32_t dirPropsSize, levelsSize, openingsSize, parasSize, runsSize, isolatesSize; /* allocated memory */ DirProp *dirPropsMemory; UBiDiLevel *levelsMemory; + Opening *openingsMemory; Para *parasMemory; Run *runsMemory; + Isolate *isolatesMemory; /* indicators for whether memory may be allocated after ubidi_open() */ UBool mayAllocateText, mayAllocateRuns; /* arrays with one value per text-character */ - const DirProp *dirProps; + DirProp *dirProps; UBiDiLevel *levels; /* are we performing an approximation of the "inverse BiDi" algorithm? 
*/ @@ -285,11 +343,11 @@ struct UBiDi { /* fields for paragraph handling */ int32_t paraCount; /* set in getDirProps() */ - Para *paras; /* limits of paragraphs, filled in - ResolveExplicitLevels() or CheckExplicitLevels() */ + /* filled in getDirProps() */ + Para *paras; - /* for single paragraph text, we only need a tiny array of paras (no malloc()) */ - Para simpleParas[1]; + /* for relatively short text, we only need a tiny array of paras (no malloc()) */ + Para simpleParas[SIMPLE_PARAS_SIZE]; /* fields for line reordering */ int32_t runCount; /* ==-1: runs not set up yet */ @@ -298,6 +356,17 @@ struct UBiDi { /* for non-mixed text, we only need a tiny array of runs (no malloc()) */ Run simpleRuns[1]; + /* maximum or current nesting depth of isolate sequences */ + /* Within resolveExplicitLevels() and checkExplicitLevels(), this is the maximal + nesting encountered. + Within resolveImplicitLevels(), this is the index of the current isolates + stack entry. */ + int32_t isolateCount; + Isolate *isolates; + + /* for simple text, have a small stack (no malloc()) */ + Isolate simpleIsolates[SIMPLE_ISOLATES_SIZE]; + /* for inverse Bidi with insertion of directional marks */ InsertPoints insertPoints; @@ -315,8 +384,10 @@ struct UBiDi { typedef union { DirProp *dirPropsMemory; UBiDiLevel *levelsMemory; + Opening *openingsMemory; Para *parasMemory; Run *runsMemory; + Isolate *isolatesMemory; } BidiMemoryForAllocation; /* Macros for initial checks at function entry */ @@ -382,6 +453,10 @@ ubidi_getMemory(BidiMemoryForAllocation *pMemory, int32_t *pSize, UBool mayAlloc ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \ TRUE, (length)) +#define getInitialOpeningsMemory(pBiDi, length) \ + ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->openingsMemory, &(pBiDi)->openingsSize, \ + TRUE, (length)*sizeof(Opening)) + #define getInitialParasMemory(pBiDi, length) \ ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->parasMemory, 
&(pBiDi)->parasSize, \ TRUE, (length)*sizeof(Para)) @@ -390,6 +465,10 @@ ubidi_getMemory(BidiMemoryForAllocation *pMemory, int32_t *pSize, UBool mayAlloc ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \ TRUE, (length)*sizeof(Run)) +#define getInitialIsolatesMemory(pBiDi, length) \ + ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->isolatesMemory, &(pBiDi)->isolatesSize, \ + TRUE, (length)*sizeof(Isolate)) + #endif #endif diff --git a/icu4c/source/common/ubidiln.c b/icu4c/source/common/ubidiln.c index 518a54d80ab..743dfb9f3b1 100644 --- a/icu4c/source/common/ubidiln.c +++ b/icu4c/source/common/ubidiln.c @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 1999-2011, International Business Machines +* Copyright (C) 1999-2013, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** @@ -32,9 +32,9 @@ * These functions deal with the aspects of potentially mixed-directional * text in a single paragraph or in a line of a single paragraph * which has already been processed according to - * the Unicode 3.0 BiDi algorithm as defined in - * http://www.unicode.org/unicode/reports/tr9/ , version 13, - * also described in The Unicode Standard, Version 4.0.1 . + * the Unicode 6.3 BiDi algorithm as defined in + * http://www.unicode.org/unicode/reports/tr9/ , version 28, + * also described in The Unicode Standard, Version 6.3.0 . * * This means that there is a UBiDi object with a levels * and a dirProps array. @@ -105,12 +105,12 @@ setTrailingWSStart(UBiDi *pBiDi) { level of B chars from 0 to paraLevel in ubidi_getLevels when orderParagraphsLTR==TRUE. 
*/ - if(NO_CONTEXT_RTL(dirProps[start-1])==B) { + if(dirProps[start-1]==B) { pBiDi->trailingWSStart=start; /* currently == pBiDi->length */ return; } /* go backwards across all WS, BN, explicit codes */ - while(start>0 && DIRPROP_FLAG_NC(dirProps[start-1])&MASK_WS) { + while(start>0 && DIRPROP_FLAG(PURE_DIRPROP(dirProps[start-1]))&MASK_WS) { --start; } diff --git a/icu4c/source/common/unicode/ubidi.h b/icu4c/source/common/unicode/ubidi.h index baf2345df25..27042ed7f4b 100644 --- a/icu4c/source/common/unicode/ubidi.h +++ b/icu4c/source/common/unicode/ubidi.h @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 1999-2012, International Business Machines +* Copyright (C) 1999-2013, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** @@ -387,7 +387,7 @@ typedef uint8_t UBiDiLevel; * (The maximum resolved level can be up to UBIDI_MAX_EXPLICIT_LEVEL+1). * @stable ICU 2.0 */ -#define UBIDI_MAX_EXPLICIT_LEVEL 61 +#define UBIDI_MAX_EXPLICIT_LEVEL 125 /** Bit flag for level input. * Overrides directional properties. diff --git a/icu4c/source/test/cintltst/cbididat.c b/icu4c/source/test/cintltst/cbididat.c index e178ef82ff7..1ef2dc88cb7 100644 --- a/icu4c/source/test/cintltst/cbididat.c +++ b/icu4c/source/test/cintltst/cbididat.c @@ -1,5 +1,5 @@ /******************************************************************** - * COPYRIGHT: + * COPYRIGHT: * Copyright (c) 1997-2013, International Business Machines Corporation and * others. All Rights Reserved. 
********************************************************************/ @@ -157,62 +157,73 @@ testVisualMap8[]={ static const uint8_t testText9[]={ - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - AN, RLO, NSM, LRE, PDF, RLE, ES, EN, ON + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, /* 15 entries */ + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, /* 15 entries */ + AN, RLO, NSM, LRE, PDF, RLE, ES, EN, ON /* 9 entries */ }; static const UBiDiLevel testLevels9[]={ - 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 61 + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, /* 15 entries */ + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, /* 15 entries */ + 126, 125, 125, 125, 125, 125, 125, 125, 125 /* 9 entries */ }; static const uint8_t testVisualMap9[]={ - 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 7, 6, 5, 4, 3, 2, 1, 0 + 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, /* 15 entries */ + 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, /* 15 entries */ + 38, 7, 6, 5, 4, 3, 2, 1, 0 /* 9 entries */ }; static const uint8_t testText10[]={ - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, BN, CS, RLO, S, PDF, EN, LRO, AN, ES + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, /* 15 entries */ + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, /* 15 entries */ + LRE, BN, CS, RLO, S, PDF, EN, LRO, AN, ES /* 10 entries */ }; static const UBiDiLevel testLevels10[]={ - 60, 60, 60, 60, 60, 60, 60, 60, 60, 
60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 0, 0, 62, 62, 62, 62, 60 + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, /* 15 entries */ + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, /* 15 entries */ + 124, 124, 124, 64, 64, 124, 124, 126, 126, 124 /* 10 entries */ }; static const uint8_t testVisualMap10[]={ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* 15 entries */ + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, /* 15 entries */ + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39 /* 10 entries */ }; static const uint8_t testText11[]={ - S, WS, NSM, RLE, WS, L, L, L, WS, LRO, WS, R, R, R, WS, RLO, WS, L, L, - L, WS, LRE, WS, R, R, R, WS, PDF, WS, L, L, L, WS, PDF, WS, - AL, AL, AL, WS, PDF, WS, L, L, L, WS, PDF, WS, L, L, L, WS, PDF, - ON, PDF, BN, BN, ON, PDF + S, WS, NSM, RLE, WS, L, L, L, WS, LRO, WS, R, R, R, WS, RLO, WS, L, L, L, /* 20 entries */ + WS, LRE, WS, R, R, R, WS, PDF, WS, L, L, L, WS, PDF, WS, AL, AL, AL, WS, PDF, /* 20 entries */ + WS, L, L, L, WS, PDF, WS, L, L, L, WS, PDF, ON, PDF, BN, BN, ON, PDF /* 18 entries */ }; static const UBiDiLevel testLevels11[]={ - 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, /* 20 entries */ + 3, 4, 4, 5, 5, 5, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, /* 20 entries */ + 2, 2, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* 18 entries */ }; static const uint8_t testVisualMap11[]={ - 0, 1, 2, 44, 43, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 31, 30, 29, 28, 27, 26, 20, 21, 24, 23, 22, 25, 19, 18, 17, 16, 15, 14, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 3, 45, 46, 47, 
48, 49, 50, 51, 52, 53, 54, 55, 56, 57 + 0, 1, 2, 44, 43, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 31, 30, 29, 28, 27, /* 20 entries */ + 26, 20, 21, 24, 23, 22, 25, 19, 18, 17, 16, 15, 14, 32, 33, 34, 35, 36, 37, 38, /* 20 entries */ + 39, 40, 41, 42, 3, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57 /* 18 entries */ }; static const uint8_t testText12[]={ - NSM, WS, L, L, L, L, L, L, L, WS, L, L, L, L, WS, - R, R, R, R, R, WS, L, L, L, L, L, L, L, WS, WS, AL, - AL, AL, AL, WS, EN, EN, ES, EN, EN, CS, S, EN, EN, CS, WS, - EN, EN, WS, AL, AL, AL, AL, AL, B, L, L, L, L, L, L, + NSM, WS, L, L, L, L, L, L, L, WS, L, L, L, L, WS, + R, R, R, R, R, WS, L, L, L, L, L, L, L, WS, WS, AL, + AL, AL, AL, WS, EN, EN, ES, EN, EN, CS, S, EN, EN, CS, WS, + EN, EN, WS, AL, AL, AL, AL, AL, B, L, L, L, L, L, L, L, L, WS, AN, AN, CS, AN, AN, WS }; @@ -387,11 +398,11 @@ tests[]={ {testText8, ARRAY_LENGTH(testText8), UBIDI_DEFAULT_LTR, -1, -1, UBIDI_RTL, 1, testLevels8, testVisualMap8}, - {testText9, ARRAY_LENGTH(testText9), UBIDI_DEFAULT_LTR, -1, -1, - UBIDI_MIXED, 0, + {testText9, ARRAY_LENGTH(testText9), 64, -1, -1, + UBIDI_MIXED, 64, testLevels9, testVisualMap9}, - {testText10, ARRAY_LENGTH(testText10), UBIDI_DEFAULT_LTR, -1, -1, - UBIDI_MIXED, 0, + {testText10, ARRAY_LENGTH(testText10), 64, -1, -1, + UBIDI_MIXED, 64, testLevels10, testVisualMap10}, {testText11, ARRAY_LENGTH(testText11), UBIDI_DEFAULT_LTR, -1, -1, UBIDI_MIXED, 0, @@ -429,7 +440,7 @@ tests[]={ {testText17, ARRAY_LENGTH(testText17), UBIDI_LTR, 0, 8, UBIDI_MIXED, 0, testLevels22, testVisualMap21}, - {testTextXX, ARRAY_LENGTH(testTextXX), UBIDI_RTL, -1, -1, + {testTextXX, ARRAY_LENGTH(testTextXX), UBIDI_RTL, -1, -1, UBIDI_MIXED, 1, testLevelsXX, testVisualMapXX} }; diff --git a/icu4c/source/test/cintltst/cbiditst.c b/icu4c/source/test/cintltst/cbiditst.c index a43dcd745f3..30bca7af375 100644 --- a/icu4c/source/test/cintltst/cbiditst.c +++ b/icu4c/source/test/cintltst/cbiditst.c @@ -1,9 +1,9 @@ 
/******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2012, International Business Machines Corporation and + * Copyright (c) 1997-2013, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ -/* file name: cbiditst.cpp +/* file name: cbiditst.c * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 @@ -650,8 +650,8 @@ testReorder(void) { static const char* const visualOrder[]={ "del(CK)add(&.C.K)", "del(TVDQ) add(LDVB)", - "del(QP)add(&.U(T(.S.R", - "del(VL)add(&.V.L (.V.L", + "del(QP)add(S.R.)&.U(T", /* updated for Unicode 6.3 matching brackets */ + "del(VL)add(V.L.) &.V.L", /* updated for Unicode 6.3 matching brackets */ "day 0 RVRHDPD R dayabbr", "day 1 ADHDPHPD H dayabbr", "day 2 ADNELBPD L dayabbr", @@ -664,8 +664,8 @@ testReorder(void) { static const char* const visualOrder1[]={ ")K.C.&(dda)KC(led", ")BVDL(dda )QDVT(led", - "R.S.(T(U.&(dda)PQ(led", - "L.V.( L.V.&(dda)LV(led", + "T(U.&).R.S(dda)PQ(led", /* updated for Unicode 6.3 matching brackets */ + "L.V.& ).L.V(dda)LV(led", /* updated for Unicode 6.3 matching brackets */ "rbbayad R DPDHRVR 0 yad", "rbbayad H DPHPDHDA 1 yad", "rbbayad L DPBLENDA 2 yad", @@ -898,86 +898,86 @@ static void testReorderArabicMathSymbols(void) { static const UChar logicalOrder[][MAXLEN]={ /* Arabic mathematical Symbols 0x1EE00 - 0x1EE1B */ - {0xD83B, 0xDE00, 0xD83B, 0xDE01, 0xD83B, 0xDE02, 0xD83B, 0xDE03, 0x20, - 0xD83B, 0xDE24, 0xD83B, 0xDE05, 0xD83B, 0xDE06, 0x20, - 0xD83B, 0xDE07, 0xD83B, 0xDE08, 0xD83B, 0xDE09, 0x20, - 0xD83B, 0xDE0A, 0xD83B, 0xDE0B, 0xD83B, 0xDE0C, 0xD83B, 0xDE0D, 0x20, - 0xD83B, 0xDE0E, 0xD83B, 0xDE0F, 0xD83B, 0xDE10, 0xD83B, 0xDE11, 0x20, - 0xD83B, 0xDE12, 0xD83B, 0xDE13, 0xD83B, 0xDE14, 0xD83B, 0xDE15, 0x20, - 0xD83B, 0xDE16, 0xD83B, 0xDE17, 0xD83B, 0xDE18, 0x20, + {0xD83B, 0xDE00, 0xD83B, 0xDE01, 0xD83B, 0xDE02, 0xD83B, 0xDE03, 0x20, + 0xD83B, 0xDE24, 
0xD83B, 0xDE05, 0xD83B, 0xDE06, 0x20, + 0xD83B, 0xDE07, 0xD83B, 0xDE08, 0xD83B, 0xDE09, 0x20, + 0xD83B, 0xDE0A, 0xD83B, 0xDE0B, 0xD83B, 0xDE0C, 0xD83B, 0xDE0D, 0x20, + 0xD83B, 0xDE0E, 0xD83B, 0xDE0F, 0xD83B, 0xDE10, 0xD83B, 0xDE11, 0x20, + 0xD83B, 0xDE12, 0xD83B, 0xDE13, 0xD83B, 0xDE14, 0xD83B, 0xDE15, 0x20, + 0xD83B, 0xDE16, 0xD83B, 0xDE17, 0xD83B, 0xDE18, 0x20, 0xD83B, 0xDE19, 0xD83B, 0xDE1A, 0xD83B, 0xDE1B}, /* Arabic mathematical Symbols - Looped Symbols, 0x1EE80 - 0x1EE9B */ - {0xD83B, 0xDE80, 0xD83B, 0xDE81, 0xD83B, 0xDE82, 0xD83B, 0xDE83, 0x20, - 0xD83B, 0xDE84, 0xD83B, 0xDE85, 0xD83B, 0xDE86, 0x20, - 0xD83B, 0xDE87, 0xD83B, 0xDE88, 0xD83B, 0xDE89, 0x20, - 0xD83B, 0xDE8B, 0xD83B, 0xDE8C, 0xD83B, 0xDE8D, 0x20, - 0xD83B, 0xDE8E, 0xD83B, 0xDE8F, 0xD83B, 0xDE90, 0xD83B, 0xDE91, 0x20, - 0xD83B, 0xDE92, 0xD83B, 0xDE93, 0xD83B, 0xDE94, 0xD83B, 0xDE95, 0x20, - 0xD83B, 0xDE96, 0xD83B, 0xDE97, 0xD83B, 0xDE98, 0x20, + {0xD83B, 0xDE80, 0xD83B, 0xDE81, 0xD83B, 0xDE82, 0xD83B, 0xDE83, 0x20, + 0xD83B, 0xDE84, 0xD83B, 0xDE85, 0xD83B, 0xDE86, 0x20, + 0xD83B, 0xDE87, 0xD83B, 0xDE88, 0xD83B, 0xDE89, 0x20, + 0xD83B, 0xDE8B, 0xD83B, 0xDE8C, 0xD83B, 0xDE8D, 0x20, + 0xD83B, 0xDE8E, 0xD83B, 0xDE8F, 0xD83B, 0xDE90, 0xD83B, 0xDE91, 0x20, + 0xD83B, 0xDE92, 0xD83B, 0xDE93, 0xD83B, 0xDE94, 0xD83B, 0xDE95, 0x20, + 0xD83B, 0xDE96, 0xD83B, 0xDE97, 0xD83B, 0xDE98, 0x20, 0xD83B, 0xDE99, 0xD83B, 0xDE9A, 0xD83B, 0xDE9B}, /* Arabic mathematical Symbols - Double-struck Symbols, 0x1EEA1 - 0x1EEBB */ - {0xD83B, 0xDEA1, 0xD83B, 0xDEA2, 0xD83B, 0xDEA3, 0x20, - 0xD83B, 0xDEA5, 0xD83B, 0xDEA6, 0x20, - 0xD83B, 0xDEA7, 0xD83B, 0xDEA8, 0xD83B, 0xDEA9, 0x20, - 0xD83B, 0xDEAB, 0xD83B, 0xDEAC, 0xD83B, 0xDEAD, 0x20, - 0xD83B, 0xDEAE, 0xD83B, 0xDEAF, 0xD83B, 0xDEB0, 0xD83B, 0xDEB1, 0x20, - 0xD83B, 0xDEB2, 0xD83B, 0xDEB3, 0xD83B, 0xDEB4, 0xD83B, 0xDEB5, 0x20, - 0xD83B, 0xDEB6, 0xD83B, 0xDEB7, 0xD83B, 0xDEB8, 0x20, + {0xD83B, 0xDEA1, 0xD83B, 0xDEA2, 0xD83B, 0xDEA3, 0x20, + 0xD83B, 0xDEA5, 0xD83B, 0xDEA6, 0x20, 
+ 0xD83B, 0xDEA7, 0xD83B, 0xDEA8, 0xD83B, 0xDEA9, 0x20, + 0xD83B, 0xDEAB, 0xD83B, 0xDEAC, 0xD83B, 0xDEAD, 0x20, + 0xD83B, 0xDEAE, 0xD83B, 0xDEAF, 0xD83B, 0xDEB0, 0xD83B, 0xDEB1, 0x20, + 0xD83B, 0xDEB2, 0xD83B, 0xDEB3, 0xD83B, 0xDEB4, 0xD83B, 0xDEB5, 0x20, + 0xD83B, 0xDEB6, 0xD83B, 0xDEB7, 0xD83B, 0xDEB8, 0x20, 0xD83B, 0xDEB9, 0xD83B, 0xDEBA, 0xD83B, 0xDEBB}, /* Arabic mathematical Symbols - Initial Symbols, 0x1EE21 - 0x1EE3B */ - {0xD83B, 0xDE21, 0xD83B, 0xDE22, 0x20, - 0xD83B, 0xDE27, 0xD83B, 0xDE29, 0x20, - 0xD83B, 0xDE2A, 0xD83B, 0xDE2B, 0xD83B, 0xDE2C, 0xD83B, 0xDE2D, 0x20, - 0xD83B, 0xDE2E, 0xD83B, 0xDE2F, 0xD83B, 0xDE30, 0xD83B, 0xDE31, 0x20, - 0xD83B, 0xDE32, 0xD83B, 0xDE34, 0xD83B, 0xDE35, 0x20, - 0xD83B, 0xDE36, 0xD83B, 0xDE37, 0x20, + {0xD83B, 0xDE21, 0xD83B, 0xDE22, 0x20, + 0xD83B, 0xDE27, 0xD83B, 0xDE29, 0x20, + 0xD83B, 0xDE2A, 0xD83B, 0xDE2B, 0xD83B, 0xDE2C, 0xD83B, 0xDE2D, 0x20, + 0xD83B, 0xDE2E, 0xD83B, 0xDE2F, 0xD83B, 0xDE30, 0xD83B, 0xDE31, 0x20, + 0xD83B, 0xDE32, 0xD83B, 0xDE34, 0xD83B, 0xDE35, 0x20, + 0xD83B, 0xDE36, 0xD83B, 0xDE37, 0x20, 0xD83B, 0xDE39, 0xD83B, 0xDE3B}, /* Arabic mathematical Symbols - Tailed Symbols */ - {0xD83B, 0xDE42, 0xD83B, 0xDE47, 0xD83B, 0xDE49, 0xD83B, 0xDE4B, 0x20, - 0xD83B, 0xDE4D, 0xD83B, 0xDE4E, 0xD83B, 0xDE4F, 0x20, - 0xD83B, 0xDE51, 0xD83B, 0xDE52, 0xD83B, 0xDE54, 0xD83B, 0xDE57, 0x20, + {0xD83B, 0xDE42, 0xD83B, 0xDE47, 0xD83B, 0xDE49, 0xD83B, 0xDE4B, 0x20, + 0xD83B, 0xDE4D, 0xD83B, 0xDE4E, 0xD83B, 0xDE4F, 0x20, + 0xD83B, 0xDE51, 0xD83B, 0xDE52, 0xD83B, 0xDE54, 0xD83B, 0xDE57, 0x20, 0xD83B, 0xDE59, 0xD83B, 0xDE5B, 0xD83B, 0xDE5D, 0xD83B, 0xDE5F} }; static const UChar visualOrder[][MAXLEN]={ /* Arabic mathematical Symbols 0x1EE00 - 0x1EE1B */ - {0xD83B, 0xDE1B, 0xD83B, 0xDE1A, 0xD83B, 0xDE19, 0x20, - 0xD83B, 0xDE18, 0xD83B, 0xDE17, 0xD83B, 0xDE16, 0x20, + {0xD83B, 0xDE1B, 0xD83B, 0xDE1A, 0xD83B, 0xDE19, 0x20, + 0xD83B, 0xDE18, 0xD83B, 0xDE17, 0xD83B, 0xDE16, 0x20, 0xD83B, 0xDE15, 0xD83B, 0xDE14, 0xD83B, 0xDE13, 
0xD83B, 0xDE12, 0x20, 0xD83B, 0xDE11, 0xD83B, 0xDE10, 0xD83B, 0xDE0F, 0xD83B, 0xDE0E, 0x20, 0xD83B, 0xDE0D, 0xD83B, 0xDE0C, 0xD83B, 0xDE0B, 0xD83B, 0xDE0A, 0x20, - 0xD83B, 0xDE09, 0xD83B, 0xDE08, 0xD83B, 0xDE07, 0x20, - 0xD83B, 0xDE06, 0xD83B, 0xDE05, 0xD83B, 0xDE24, 0x20, + 0xD83B, 0xDE09, 0xD83B, 0xDE08, 0xD83B, 0xDE07, 0x20, + 0xD83B, 0xDE06, 0xD83B, 0xDE05, 0xD83B, 0xDE24, 0x20, 0xD83B, 0xDE03, 0xD83B, 0xDE02, 0xD83B, 0xDE01, 0xD83B, 0xDE00}, /* Arabic mathematical Symbols - Looped Symbols, 0x1EE80 - 0x1EE9B */ - {0xD83B, 0xDE9B, 0xD83B, 0xDE9A, 0xD83B, 0xDE99, 0x20, - 0xD83B, 0xDE98, 0xD83B, 0xDE97, 0xD83B, 0xDE96, 0x20, + {0xD83B, 0xDE9B, 0xD83B, 0xDE9A, 0xD83B, 0xDE99, 0x20, + 0xD83B, 0xDE98, 0xD83B, 0xDE97, 0xD83B, 0xDE96, 0x20, 0xD83B, 0xDE95, 0xD83B, 0xDE94, 0xD83B, 0xDE93, 0xD83B, 0xDE92, 0x20, 0xD83B, 0xDE91, 0xD83B, 0xDE90, 0xD83B, 0xDE8F, 0xD83B, 0xDE8E, 0x20, - 0xD83B, 0xDE8D, 0xD83B, 0xDE8C, 0xD83B, 0xDE8B, 0x20, - 0xD83B, 0xDE89, 0xD83B, 0xDE88, 0xD83B, 0xDE87, 0x20, - 0xD83B, 0xDE86, 0xD83B, 0xDE85, 0xD83B, 0xDE84, 0x20, + 0xD83B, 0xDE8D, 0xD83B, 0xDE8C, 0xD83B, 0xDE8B, 0x20, + 0xD83B, 0xDE89, 0xD83B, 0xDE88, 0xD83B, 0xDE87, 0x20, + 0xD83B, 0xDE86, 0xD83B, 0xDE85, 0xD83B, 0xDE84, 0x20, 0xD83B, 0xDE83, 0xD83B, 0xDE82, 0xD83B, 0xDE81, 0xD83B, 0xDE80}, /* Arabic mathematical Symbols - Double-struck Symbols, 0x1EEA1 - 0x1EEBB */ - {0xD83B, 0xDEBB, 0xD83B, 0xDEBA, 0xD83B, 0xDEB9, 0x20, - 0xD83B, 0xDEB8, 0xD83B, 0xDEB7, 0xD83B, 0xDEB6, 0x20, + {0xD83B, 0xDEBB, 0xD83B, 0xDEBA, 0xD83B, 0xDEB9, 0x20, + 0xD83B, 0xDEB8, 0xD83B, 0xDEB7, 0xD83B, 0xDEB6, 0x20, 0xD83B, 0xDEB5, 0xD83B, 0xDEB4, 0xD83B, 0xDEB3, 0xD83B, 0xDEB2, 0x20, 0xD83B, 0xDEB1, 0xD83B, 0xDEB0, 0xD83B, 0xDEAF, 0xD83B, 0xDEAE, 0x20, - 0xD83B, 0xDEAD, 0xD83B, 0xDEAC, 0xD83B, 0xDEAB, 0x20, - 0xD83B, 0xDEA9, 0xD83B, 0xDEA8, 0xD83B, 0xDEA7, 0x20, - 0xD83B, 0xDEA6, 0xD83B, 0xDEA5, 0x20, + 0xD83B, 0xDEAD, 0xD83B, 0xDEAC, 0xD83B, 0xDEAB, 0x20, + 0xD83B, 0xDEA9, 0xD83B, 0xDEA8, 0xD83B, 0xDEA7, 0x20, + 
0xD83B, 0xDEA6, 0xD83B, 0xDEA5, 0x20, 0xD83B, 0xDEA3, 0xD83B, 0xDEA2, 0xD83B, 0xDEA1}, /* Arabic mathematical Symbols - Initial Symbols, 0x1EE21 - 0x1EE3B */ - {0xD83B, 0xDE3B, 0xD83B, 0xDE39, 0x20, - 0xD83B, 0xDE37, 0xD83B, 0xDE36, 0x20, - 0xD83B, 0xDE35, 0xD83B, 0xDE34, 0xD83B, 0xDE32, 0x20, + {0xD83B, 0xDE3B, 0xD83B, 0xDE39, 0x20, + 0xD83B, 0xDE37, 0xD83B, 0xDE36, 0x20, + 0xD83B, 0xDE35, 0xD83B, 0xDE34, 0xD83B, 0xDE32, 0x20, 0xD83B, 0xDE31, 0xD83B, 0xDE30, 0xD83B, 0xDE2F, 0xD83B, 0xDE2E, 0x20, 0xD83B, 0xDE2D, 0xD83B, 0xDE2C, 0xD83B, 0xDE2B, 0xD83B, 0xDE2A, 0x20, - 0xD83B, 0xDE29, 0xD83B, 0xDE27, 0x20, + 0xD83B, 0xDE29, 0xD83B, 0xDE27, 0x20, 0xD83B, 0xDE22, 0xD83B, 0xDE21}, /* Arabic mathematical Symbols - Tailed Symbols */ {0xD83B, 0xDE5F, 0xD83B, 0xDE5D, 0xD83B, 0xDE5B, 0xD83B, 0xDE59, 0x20, 0xD83B, 0xDE57, 0xD83B, 0xDE54, 0xD83B, 0xDE52, 0xD83B, 0xDE51, 0x20, - 0xD83B, 0xDE4F, 0xD83B, 0xDE4E, 0xD83B, 0xDE4D, 0x20, + 0xD83B, 0xDE4F, 0xD83B, 0xDE4E, 0xD83B, 0xDE4D, 0x20, 0xD83B, 0xDE4B, 0xD83B, 0xDE49, 0xD83B, 0xDE47, 0xD83B, 0xDE42} }; char formatChars[MAXLEN]; @@ -1666,8 +1666,8 @@ static void doMisc(void) { srcLen = u_unescape("A\\u202a\\u05d0\\u202aC\\u202c\\u05d1\\u202cE", src, MAXLEN); ubidi_setPara(bidi, src, srcLen, UBIDI_MAX_EXPLICIT_LEVEL - 1, NULL, &errorCode); level = ubidi_getLevelAt(bidi, 2); - if (level != 61) { - log_err("\nWrong level at index 2\n, should be 61, got %d\n", level); + if (level != UBIDI_MAX_EXPLICIT_LEVEL) { + log_err("\nWrong level at index 2\n, should be %d, got %d\n", UBIDI_MAX_EXPLICIT_LEVEL, level); } RETURN_IF_BAD_ERRCODE("#24#"); @@ -2910,7 +2910,7 @@ doTailTest(void) { UChar dst[3] = { 0x0000, 0x0000,0 }; int32_t length; UErrorCode status; - + log_verbose("SRC: U+%04X U+%04X\n", src[0],src[1]); log_verbose("Trying old tail\n"); @@ -2918,7 +2918,7 @@ doTailTest(void) { length = u_shapeArabic(src, -1, dst, LENGTHOF(dst), U_SHAPE_LETTERS_SHAPE|U_SHAPE_SEEN_TWOCELL_NEAR, &status); if(U_FAILURE(status)) { - log_err("Fail: 
status %s\n", u_errorName(status)); + log_err("Fail: status %s\n", u_errorName(status)); } else if(length!=2) { log_err("Fail: len %d expected 3\n", length); } else if(u_strncmp(dst,dst_old,LENGTHOF(dst))) { @@ -2935,7 +2935,7 @@ doTailTest(void) { length = u_shapeArabic(src, -1, dst, LENGTHOF(dst), U_SHAPE_LETTERS_SHAPE|U_SHAPE_SEEN_TWOCELL_NEAR|U_SHAPE_TAIL_NEW_UNICODE, &status); if(U_FAILURE(status)) { - log_err("Fail: status %s\n", u_errorName(status)); + log_err("Fail: status %s\n", u_errorName(status)); } else if(length!=2) { log_err("Fail: len %d expected 3\n", length); } else if(u_strncmp(dst,dst_new,LENGTHOF(dst))) { @@ -3028,21 +3028,21 @@ doArabicShapingTestForBug8703(void) { letters_source1[]={ 0x0634,0x0651,0x0645,0x0652,0x0633 }, letters_source2[]={ - 0x0634,0x0651,0x0645,0x0652,0x0633 + 0x0634,0x0651,0x0645,0x0652,0x0633 }, letters_source3[]={ 0x0634,0x0651,0x0645,0x0652,0x0633 }, letters_source4[]={ - 0x0634,0x0651,0x0645,0x0652,0x0633 + 0x0634,0x0651,0x0645,0x0652,0x0633 }, letters_source5[]={ 0x0633,0x0652,0x0645,0x0651,0x0634 }, letters_source6[]={ - 0x0633,0x0652,0x0645,0x0651,0x0634 + 0x0633,0x0652,0x0645,0x0651,0x0634 }, letters_source7[]={ 0x0633,0x0652,0x0645,0x0651,0x0634 }, letters_source8[]={ 0x0633,0x0652,0x0645,0x0651,0x0634 }, letters_dest1[]={ - 0x0020,0xFEB7,0xFE7D,0xFEE4,0xFEB2 + 0x0020,0xFEB7,0xFE7D,0xFEE4,0xFEB2 }, letters_dest2[]={ 0xFEB7,0xFE7D,0xFEE4,0xFEB2,0x0020 }, letters_dest3[]={ @@ -3050,7 +3050,7 @@ doArabicShapingTestForBug8703(void) { }, letters_dest4[]={ 0xFEB7,0xFE7D,0xFEE4,0x0640,0xFEB2 }, letters_dest5[]={ - 0x0020,0xFEB2,0xFEE4,0xFE7D,0xFEB7 + 0x0020,0xFEB2,0xFEE4,0xFE7D,0xFEB7 }, letters_dest6[]={ 0xFEB2,0xFEE4,0xFE7D,0xFEB7,0x0020 }, letters_dest7[]={ @@ -3156,7 +3156,7 @@ static void doArabicShapingTestForBug9024(void) { static const UChar letters_source1[]={ /* Arabic mathematical Symbols 0x1EE00 - 0x1EE1B */ - 0xD83B, 0xDE00, 0xD83B, 0xDE01, 0xD83B, 0xDE02, 0xD83B, 0xDE03, 0x20, + 0xD83B, 0xDE00, 0xD83B, 
0xDE01, 0xD83B, 0xDE02, 0xD83B, 0xDE03, 0x20, 0xD83B, 0xDE24, 0xD83B, 0xDE05, 0xD83B, 0xDE06, 0x20, 0xD83B, 0xDE07, 0xD83B, 0xDE08, 0xD83B, 0xDE09, 0x20, 0xD83B, 0xDE0A, 0xD83B, 0xDE0B, 0xD83B, 0xDE0C, 0xD83B, 0xDE0D, 0x20, @@ -3165,7 +3165,7 @@ doArabicShapingTestForBug9024(void) { 0xD83B, 0xDE16, 0xD83B, 0xDE17, 0xD83B, 0xDE18, 0x20, 0xD83B, 0xDE19, 0xD83B, 0xDE1A, 0xD83B, 0xDE1B }, letters_source2[]={/* Arabic mathematical Symbols - Looped Symbols, 0x1EE80 - 0x1EE9B */ - 0xD83B, 0xDE80, 0xD83B, 0xDE81, 0xD83B, 0xDE82, 0xD83B, 0xDE83, 0x20, + 0xD83B, 0xDE80, 0xD83B, 0xDE81, 0xD83B, 0xDE82, 0xD83B, 0xDE83, 0x20, 0xD83B, 0xDE84, 0xD83B, 0xDE85, 0xD83B, 0xDE86, 0x20, 0xD83B, 0xDE87, 0xD83B, 0xDE88, 0xD83B, 0xDE89, 0x20, 0xD83B, 0xDE8B, 0xD83B, 0xDE8C, 0xD83B, 0xDE8D, 0x20, @@ -3174,7 +3174,7 @@ doArabicShapingTestForBug9024(void) { 0xD83B, 0xDE96, 0xD83B, 0xDE97, 0xD83B, 0xDE98, 0x20, 0xD83B, 0xDE99, 0xD83B, 0xDE9A, 0xD83B, 0xDE9B }, letters_source3[]={/* Arabic mathematical Symbols - Double-struck Symbols, 0x1EEA1 - 0x1EEBB */ - 0xD83B, 0xDEA1, 0xD83B, 0xDEA2, 0xD83B, 0xDEA3, 0x20, + 0xD83B, 0xDEA1, 0xD83B, 0xDEA2, 0xD83B, 0xDEA3, 0x20, 0xD83B, 0xDEA5, 0xD83B, 0xDEA6, 0x20, 0xD83B, 0xDEA7, 0xD83B, 0xDEA8, 0xD83B, 0xDEA9, 0x20, 0xD83B, 0xDEAB, 0xD83B, 0xDEAC, 0xD83B, 0xDEAD, 0x20, @@ -3183,7 +3183,7 @@ doArabicShapingTestForBug9024(void) { 0xD83B, 0xDEB6, 0xD83B, 0xDEB7, 0xD83B, 0xDEB8, 0x20, 0xD83B, 0xDEB9, 0xD83B, 0xDEBA, 0xD83B, 0xDEBB }, letters_source4[]={/* Arabic mathematical Symbols - Initial Symbols, 0x1EE21 - 0x1EE3B */ - 0xD83B, 0xDE21, 0xD83B, 0xDE22, 0x20, + 0xD83B, 0xDE21, 0xD83B, 0xDE22, 0x20, 0xD83B, 0xDE27, 0xD83B, 0xDE29, 0x20, 0xD83B, 0xDE2A, 0xD83B, 0xDE2B, 0xD83B, 0xDE2C, 0xD83B, 0xDE2D, 0x20, 0xD83B, 0xDE2E, 0xD83B, 0xDE2F, 0xD83B, 0xDE30, 0xD83B, 0xDE31, 0x20, @@ -3191,14 +3191,14 @@ doArabicShapingTestForBug9024(void) { 0xD83B, 0xDE36, 0xD83B, 0xDE37, 0x20, 0xD83B, 0xDE39, 0xD83B, 0xDE3B }, letters_source5[]={/* Arabic mathematical Symbols - 
Tailed Symbols */ - 0xD83B, 0xDE42, 0xD83B, 0xDE47, 0xD83B, 0xDE49, 0xD83B, 0xDE4B, 0x20, + 0xD83B, 0xDE42, 0xD83B, 0xDE47, 0xD83B, 0xDE49, 0xD83B, 0xDE4B, 0x20, 0xD83B, 0xDE4D, 0xD83B, 0xDE4E, 0xD83B, 0xDE4F, 0x20, 0xD83B, 0xDE51, 0xD83B, 0xDE52, 0xD83B, 0xDE54, 0xD83B, 0xDE57, 0x20, 0xD83B, 0xDE59, 0xD83B, 0xDE5B, 0xD83B, 0xDE5D, 0xD83B, 0xDE5F }, letters_source6[]={/* Arabic mathematical Symbols - Stretched Symbols with 06 range */ 0xD83B, 0xDE21, 0x0633, 0xD83B, 0xDE62, 0x0647 }, letters_dest1[]={ - 0xD83B, 0xDE00, 0xD83B, 0xDE01, 0xD83B, 0xDE02, 0xD83B, 0xDE03, 0x20, + 0xD83B, 0xDE00, 0xD83B, 0xDE01, 0xD83B, 0xDE02, 0xD83B, 0xDE03, 0x20, 0xD83B, 0xDE24, 0xD83B, 0xDE05, 0xD83B, 0xDE06, 0x20, 0xD83B, 0xDE07, 0xD83B, 0xDE08, 0xD83B, 0xDE09, 0x20, 0xD83B, 0xDE0A, 0xD83B, 0xDE0B, 0xD83B, 0xDE0C, 0xD83B, 0xDE0D, 0x20, @@ -3207,7 +3207,7 @@ doArabicShapingTestForBug9024(void) { 0xD83B, 0xDE16, 0xD83B, 0xDE17, 0xD83B, 0xDE18, 0x20, 0xD83B, 0xDE19, 0xD83B, 0xDE1A, 0xD83B, 0xDE1B }, letters_dest2[]={ - 0xD83B, 0xDE80, 0xD83B, 0xDE81, 0xD83B, 0xDE82, 0xD83B, 0xDE83, 0x20, + 0xD83B, 0xDE80, 0xD83B, 0xDE81, 0xD83B, 0xDE82, 0xD83B, 0xDE83, 0x20, 0xD83B, 0xDE84, 0xD83B, 0xDE85, 0xD83B, 0xDE86, 0x20, 0xD83B, 0xDE87, 0xD83B, 0xDE88, 0xD83B, 0xDE89, 0x20, 0xD83B, 0xDE8B, 0xD83B, 0xDE8C, 0xD83B, 0xDE8D, 0x20, @@ -3216,7 +3216,7 @@ doArabicShapingTestForBug9024(void) { 0xD83B, 0xDE96, 0xD83B, 0xDE97, 0xD83B, 0xDE98, 0x20, 0xD83B, 0xDE99, 0xD83B, 0xDE9A, 0xD83B, 0xDE9B }, letters_dest3[]={ - 0xD83B, 0xDEA1, 0xD83B, 0xDEA2, 0xD83B, 0xDEA3, 0x20, + 0xD83B, 0xDEA1, 0xD83B, 0xDEA2, 0xD83B, 0xDEA3, 0x20, 0xD83B, 0xDEA5, 0xD83B, 0xDEA6, 0x20, 0xD83B, 0xDEA7, 0xD83B, 0xDEA8, 0xD83B, 0xDEA9, 0x20, 0xD83B, 0xDEAB, 0xD83B, 0xDEAC, 0xD83B, 0xDEAD, 0x20, @@ -3225,7 +3225,7 @@ doArabicShapingTestForBug9024(void) { 0xD83B, 0xDEB6, 0xD83B, 0xDEB7, 0xD83B, 0xDEB8, 0x20, 0xD83B, 0xDEB9, 0xD83B, 0xDEBA, 0xD83B, 0xDEBB }, letters_dest4[]={ - 0xD83B, 0xDE21, 0xD83B, 0xDE22, 0x20, + 0xD83B, 0xDE21, 
0xD83B, 0xDE22, 0x20, 0xD83B, 0xDE27, 0xD83B, 0xDE29, 0x20, 0xD83B, 0xDE2A, 0xD83B, 0xDE2B, 0xD83B, 0xDE2C, 0xD83B, 0xDE2D, 0x20, 0xD83B, 0xDE2E, 0xD83B, 0xDE2F, 0xD83B, 0xDE30, 0xD83B, 0xDE31, 0x20, @@ -3233,7 +3233,7 @@ doArabicShapingTestForBug9024(void) { 0xD83B, 0xDE36, 0xD83B, 0xDE37, 0x20, 0xD83B, 0xDE39, 0xD83B, 0xDE3B }, letters_dest5[]={ - 0xD83B, 0xDE42, 0xD83B, 0xDE47, 0xD83B, 0xDE49, 0xD83B, 0xDE4B, 0x20, + 0xD83B, 0xDE42, 0xD83B, 0xDE47, 0xD83B, 0xDE49, 0xD83B, 0xDE4B, 0x20, 0xD83B, 0xDE4D, 0xD83B, 0xDE4E, 0xD83B, 0xDE4F, 0x20, 0xD83B, 0xDE51, 0xD83B, 0xDE52, 0xD83B, 0xDE54, 0xD83B, 0xDE57, 0x20, 0xD83B, 0xDE59, 0xD83B, 0xDE5B, 0xD83B, 0xDE5D, 0xD83B, 0xDE5F @@ -4169,20 +4169,20 @@ testStreaming(void) { "\\u000D" "02468\\u000D" "ghi", - 6, { 6, 6 }, {{ 6, 4, 6, 1, 6, 3}, { 4, 6, 6, 1, 6, 3 }}, - {"6, 4, 6, 1, 6, 3", "4, 6, 6, 1, 6, 3"} + 6, { 6, 6 }, {{ 4, 6, 6, 1, 6, 3}, { 4, 6, 6, 1, 6, 3 }}, + {"4, 6, 6, 1, 6, 3", "4, 6, 6, 1, 6, 3"} }, { "abcd\\u000Afgh\\u000D12345\\u000A456", - 6, { 4, 4 }, {{ 6, 3, 6, 3 }, { 5, 4, 6, 3 }}, - {"6, 3, 6, 3", "5, 4, 6, 3"} + 6, { 4, 4 }, {{ 5, 4, 6, 3 }, { 5, 4, 6, 3 }}, + {"5, 4, 6, 3", "5, 4, 6, 3"} }, { "abcd\\u000Afgh\\u000D12345\\u000A45\\u000D", - 6, { 4, 4 }, {{ 6, 3, 6, 3 }, { 5, 4, 6, 3 }}, - {"6, 3, 6, 3", "5, 4, 6, 3"} + 6, { 4, 4 }, {{ 5, 4, 6, 3 }, { 5, 4, 6, 3 }}, + {"5, 4, 6, 3", "5, 4, 6, 3"} }, { "abcde\\u000Afghi", - 10, { 1, 2 }, {{ 10 }, { 6, 4 }}, - {"10", "6, 4"} + 10, { 2, 2 }, {{ 6, 4 }, { 6, 4 }}, + {"6, 4", "6, 4"} } }; UChar src[MAXLEN]; @@ -4194,7 +4194,7 @@ testStreaming(void) { UBiDiLevel level; int nTests = LENGTHOF(testData), nLevels = LENGTHOF(paraLevels); UBool mismatch, testOK = TRUE; - char processedLenStr[MAXPORTIONS * 5]; + char processedLenStr[MAXPORTIONS * 5]; log_verbose("\nEntering TestStreaming\n\n"); @@ -4208,7 +4208,7 @@ testStreaming(void) { chunk = testData[i].chunk; nPortions = testData[i].nPortions[levelIndex]; level = paraLevels[levelIndex]; - *processedLenStr = 
NULL_CHAR; + processedLenStr[0] = NULL_CHAR; log_verbose("Testing level %d, case %d\n", level, i); mismatch = FALSE; @@ -4230,7 +4230,7 @@ testStreaming(void) { } ubidi_setReorderingOptions(pBiDi, UBIDI_OPTION_STREAMING); - mismatch = (UBool)(j >= nPortions || + mismatch |= (UBool)(j >= nPortions || processedLen != testData[i].portionLens[levelIndex][j]); sprintf(processedLenStr + j * 4, "%4d", processedLen); diff --git a/icu4c/source/test/cintltst/cbiditst.h b/icu4c/source/test/cintltst/cbiditst.h index fb5f5c07fce..fbeb85c2fca 100644 --- a/icu4c/source/test/cintltst/cbiditst.h +++ b/icu4c/source/test/cintltst/cbiditst.h @@ -1,6 +1,6 @@ /******************************************************************** - * COPYRIGHT: - * Copyright (c) 1997-2011, International Business Machines Corporation and + * COPYRIGHT: + * Copyright (c) 1997-2013, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ /* file name: cbiditst.h @@ -51,6 +51,10 @@ extern "C" { #define PDF U_POP_DIRECTIONAL_FORMAT #define NSM U_DIR_NON_SPACING_MARK #define BN U_BOUNDARY_NEUTRAL +#define FSI U_FIRST_STRONG_ISOLATE +#define LRI U_LEFT_TO_RIGHT_ISOLATE +#define RLI U_RIGHT_TO_LEFT_ISOLATE +#define PDI U_POP_DIRECTIONAL_ISOLATE extern const char * const dirPropNames[U_CHAR_DIRECTION_COUNT]; diff --git a/icu4c/source/test/intltest/bidiconf.cpp b/icu4c/source/test/intltest/bidiconf.cpp index 2f4527efa24..c42c02d73df 100644 --- a/icu4c/source/test/intltest/bidiconf.cpp +++ b/icu4c/source/test/intltest/bidiconf.cpp @@ -13,7 +13,7 @@ * created on: 2009oct16 * created by: Markus W. Scherer * -* BiDi conformance test, using the Unicode BidiTest.txt file. +* BiDi conformance test, using the Unicode BidiTest.txt and BidiCharacterTest.txt files. 
*/ #include @@ -37,18 +37,18 @@ public: void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL); void TestBidiTest(); + void TestBidiCharacterTest(); private: char *getUnidataPath(char path[]); - UBool parseLevels(const char *start); + UBool parseLevels(const char *&start); UBool parseOrdering(const char *start); - UBool parseInputStringFromBiDiClasses(const char *&start); + UBool parseInputStringFromBiDiClasses(const char *&start, UBool parseChars); - UBool checkLevels(const UBiDiLevel actualLevels[], int32_t actualCount, - const char *paraLevelName); - UBool checkOrdering(UBiDi *ubidi, const char *paraLevelName); + UBool checkLevels(const UBiDiLevel actualLevels[], int32_t actualCount); + UBool checkOrdering(UBiDi *ubidi); - void printErrorLine(const char *paraLevelName); + void printErrorLine(); char line[10000]; UBiDiLevel levels[1000]; @@ -59,6 +59,8 @@ private: int32_t orderingCount; int32_t errorCount; UnicodeString inputString; + const char *paraLevelName; + char levelNameString[12]; }; extern IntlTest *createBiDiConformanceTest() { @@ -69,12 +71,10 @@ void BiDiConformanceTest::runIndexedTest(int32_t index, UBool exec, const char * if(exec) { logln("TestSuite BiDiConformanceTest: "); } - switch (index) { - TESTCASE(0, TestBidiTest); - default: - name=""; - break; // needed to end the loop - } + TESTCASE_AUTO_BEGIN; + TESTCASE_AUTO(TestBidiTest); + TESTCASE_AUTO(TestBidiCharacterTest); + TESTCASE_AUTO_END; } // TODO: Move to a common place (IntlTest?) to avoid duplication with UnicodeTest (ucdtest.cpp). 
@@ -115,18 +115,20 @@ char *BiDiConformanceTest::getUnidataPath(char path[]) { U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); -UBool BiDiConformanceTest::parseLevels(const char *start) { +UBool BiDiConformanceTest::parseLevels(const char *&start) { directionBits=0; levelsCount=0; - while(*start!=0 && *(start=u_skipWhitespace(start))!=0) { + while(*start!=0 && *(start=u_skipWhitespace(start))!=0 && *start!=';') { if(*start=='x') { levels[levelsCount++]=UBIDI_DEFAULT_LTR; ++start; } else { char *end; uint32_t value=(uint32_t)strtoul(start, &end, 10); - if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=0) || value>(UBIDI_MAX_EXPLICIT_LEVEL+1)) { - errln("@Levels: parse error at %s", start); + if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=0 && *end!=';') + || value>(UBIDI_MAX_EXPLICIT_LEVEL+1)) { + errln("\nError on line %d: Levels parse error at %s", (int)lineNumber, start); + printErrorLine(); return FALSE; } levels[levelsCount++]=(UBiDiLevel)value; @@ -139,11 +141,12 @@ UBool BiDiConformanceTest::parseLevels(const char *start) { UBool BiDiConformanceTest::parseOrdering(const char *start) { orderingCount=0; - while(*start!=0 && *(start=u_skipWhitespace(start))!=0) { + while(*start!=0 && *(start=u_skipWhitespace(start))!=0 && *start!=';') { char *end; uint32_t value=(uint32_t)strtoul(start, &end, 10); - if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=0) || value>=1000) { - errln("@Reorder: parse error at %s", start); + if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=0 && *end!=';') || value>=1000) { + errln("\nError on line %d: Reorder parse error at %s", (int)lineNumber, start); + printErrorLine(); return FALSE; } ordering[orderingCount++]=(int32_t)value; @@ -152,7 +155,7 @@ UBool BiDiConformanceTest::parseOrdering(const char *start) { return TRUE; } -static const UChar charFromBiDiClass[U_CHAR_DIRECTION_COUNT]={ +static const UChar pseudoCharFromBiDiClass[U_CHAR_DIRECTION_COUNT]={ 0x6c, // 'l' for L 0x52, // 'R' for R 0x33, 
// '3' for EN @@ -179,12 +182,38 @@ static const UChar charFromBiDiClass[U_CHAR_DIRECTION_COUNT]={ 0x2e // '.' for PDI }; +static const UChar realCharFromBiDiClass[U_CHAR_DIRECTION_COUNT]={ + 0x006c, // 'l' for L + 0x05d0, // Hebrew Letter Alef for R + 0x0033, // '3' for EN + 0x002d, // '-' for ES + 0x0025, // '%' for ET + 0x0669, // Arabic-Indic '9' for AN + 0x002c, // ',' for CS + 0x000d, // CR for B + 0x0009, // Tab for S + 0x0020, // ' ' for WS + 0x003d, // '=' for ON + 0x202a, // LRE + 0x202d, // LRO + 0x0630, // Arabic Letter Thal for AL + 0x202b, // RLE + 0x202e, // RLO + 0x202c, // PDF + 0x05b9, // Hebrew Point Holam for NSM + 0x00ad, // Soft Hyphen for BN + 0x2068, // FSI + 0x2066, // LRI + 0x2067, // RLI + 0x2069 // PDI +}; + U_CDECL_BEGIN static UCharDirection U_CALLCONV biDiConfUBiDiClassCallback(const void * /*context*/, UChar32 c) { for(int i=0; i='0' && c<='9') + return c - '0'; + if(c>='A' && c<='F') + return c - ('A'-10); + if(c>='a' && c<='f') + return c - ('a'-10); + return -1; +} + static const int8_t biDiClassNameLengths[U_CHAR_DIRECTION_COUNT+1]={ 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 0 }; -UBool BiDiConformanceTest::parseInputStringFromBiDiClasses(const char *&start) { +UBool BiDiConformanceTest::parseInputStringFromBiDiClasses(const char *&start, UBool parseChars) { inputString.remove(); /* * Lengthy but fast BiDi class parser. @@ -208,6 +247,24 @@ UBool BiDiConformanceTest::parseInputStringFromBiDiClasses(const char *&start) { * but that makes this test take significantly more time. 
*/ while(*start!=0 && *(start=u_skipWhitespace(start))!=0 && *start!=';') { + int32_t d1, d2, hexnum; + // First look for an hexa value of at least 2 digits + if(parseChars && (d1=hexdigit(start[0]))>=0 && (d2=hexdigit(start[1]))>=0) { + const char *saveStart=start; + hexnum=(d1<<4) + d2; + start+=2; + while((d1=hexdigit(start[0]))>=0) { + hexnum=(hexnum<<4) + d1; + start++; + } + if(hexnum<=0 || hexnum>0xffff || + (!U_IS_INV_WHITESPACE(start[0]) && start[0]!=';' && start[0]!=0)) { + errln("\nError on line %d: Invalid hexa number at %s", (int)lineNumber, saveStart); + return FALSE; + } + inputString.append(hexnum); + continue; + } UCharDirection biDiClass=U_CHAR_DIRECTION_COUNT; // Compare each character once until we have a match on // a complete, short BiDi class name. @@ -278,22 +335,37 @@ UBool BiDiConformanceTest::parseInputStringFromBiDiClasses(const char *&start) { // and not just the start of a longer word. int8_t biDiClassNameLength=biDiClassNameLengths[biDiClass]; char c=start[biDiClassNameLength]; - if(biDiClass==U_CHAR_DIRECTION_COUNT || (!U_IS_INV_WHITESPACE(c) && c!=';' && c!=0)) { - errln("BiDi class string not recognized at %s", start); - return FALSE; + if(biDiClass1) { + errln("\nError on line %d: Resolved paragraph level incorrect at %s", (int)lineNumber, start); + printErrorLine(); + continue; + } + start=u_skipWhitespace(end); + if(*start!=';') { + errorCount++; + errln("\nError on line %d: Missing ; separator on line: %s", (int)lineNumber, line); + return; + } + start++; + if(!parseLevels(start)) { + continue; + } + start=u_skipWhitespace(start); + if(*start==';') { + if(!parseOrdering(start+1)) { + continue; + } + } + else + orderingCount=-1; + + ubidi_setPara(ubidi.getAlias(), inputString.getBuffer(), inputString.length(), + paraLevel, NULL, errorCode); + const UBiDiLevel *actualLevels=ubidi_getLevels(ubidi.getAlias(), errorCode); + if(errorCode.logIfFailureAndReset("ubidi_setPara() or ubidi_getLevels()")) { + errln("Input line %d: %s", 
(int)lineNumber, line); + continue; + } + UBiDiLevel actualLevel; + if((actualLevel=ubidi_getParaLevel(ubidi.getAlias()))!=resolvedParaLevel) { + printErrorLine(); + errln("\nError on line %d: Wrong resolved paragraph level; expected %d actual %d", + (int)lineNumber, resolvedParaLevel, actualLevel); + continue; + } + if(!checkLevels(actualLevels, ubidi_getProcessedLength(ubidi.getAlias()))) { + continue; + } + if(orderingCount>=0 && !checkOrdering(ubidi.getAlias())) { + continue; + } + } +} + static UChar printLevel(UBiDiLevel level) { if(level