diff --git a/icu4c/source/common/unorm.cpp b/icu4c/source/common/unorm.cpp index 1cb1d8e653f..cc11327d094 100644 --- a/icu4c/source/common/unorm.cpp +++ b/icu4c/source/common/unorm.cpp @@ -463,139 +463,19 @@ _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) { /* reorder UTF-16 in-place -------------------------------------------------- */ /* - * merge two UTF-16 string parts together - * to canonically order (order by combining classes) their concatenation + * simpler, single-character version of _mergeOrdered() - + * bubble-insert one single code point into the preceding string + * which is already canonically ordered + * (c, c2) may or may not yet have been inserted at [current..p[ * - * the two strings may already be adjacent, so that the merging is done in-place - * if the two strings are not adjacent, then the buffer holding the first one - * must be large enough - * the second string may or may not be ordered in itself + * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) * * before: [start..current[ is already ordered, and - * [next..limit[ may be ordered in itself, but - * is not in relation to [start..current[ - * after: [start..current+(limit-next)[ is ordered - * - * the algorithm is a simple bubble-sort that takes the characters from *next++ - * and inserts them in correct combining class order into the preceding part - * of the string + * [current..p[ may or may not hold (c, c2) but + * must be exactly the same length as (c, c2) + * after: [start..p[ is ordered * * returns the trailing combining class - * ### TODO see how often this is used - if very rare then just iterate over [next..limit[ and call optimized fn - */ -static uint8_t -_mergeOrdered(UChar *start, UChar *current, - const UChar *next, const UChar *limit, UBool isOrdered=TRUE) { - const UChar *pBack, *pPreBack; - UChar *q, *r; - UChar c, c2; - uint8_t cc, prevCC, trailCC=0; - UBool adjacent; - - adjacent= current==next; - - if(start!=current || !isOrdered) { - while(next=prevCC */ - pPreBack=pBack=current; - prevCC=_getPrevCC(start, pPreBack); - if(cc>=prevCC) { - /* does not bubble back */ - trailCC=cc; - if(adjacent) { - current=(UChar *)next; - } else { - *current++=c; - if(c2!=0) { - *current++=c2; - } - } - if(isOrdered) { - break; - } - } else { - /* this will be the last code point, so keep its cc */ - trailCC=prevCC; - pBack=pPreBack; - while(start=prevCC) { - break; - } - pBack=pPreBack; - } - - /* - * this is where we are right now with all these pointers: - * [start..pPreBack[ 0..? code points that we can ignore - * [pPreBack..pBack[ 0..1 code points with prevCC<=cc - * [pBack..current[ 0..n code points with >cc, move up to insert (c, c2) - * [current..next[ 1 code point (c, c2) with cc - * [next..limit[ 0..? code points yet to be bubbled in - * - * note that current and next may be unrelated (if not adjacent)! - */ - - /* move the code units in between up (q moves left of r) */ - q=current; - r=current= c2==0 ? current+1 : current+2; - do { - *--r=*--q; - } while(pBack!=q); - - /* insert (c, c2) */ - *q=c; - if(c2!=0) { - *(q+1)=c2; - } - - if(isOrdered) { - /* we know that the new part is ordered in itself, so we can move start up */ - start=r; /* set it to after where (c, c2) were inserted */ - } - } - } - } - } - - if(next==limit) { - /* we know the cc of the last code point */ - return trailCC; - } else { - if(!adjacent) { - /* copy the second string part */ - do { - *current++=*next++; - } while(next!=limit); - limit=current; - } - return _getPrevCC(start, limit); - } -} - -/* - * simpler, more efficient version of _mergeOrdered() - - * inserts only one code point into the preceding string - * assume that (c, c2) has not yet been inserted at [current..p[ - * ### TODO doc that p=current+1 or +2 according to c2=?=0 */ static uint8_t _insertOrdered(const UChar *start, UChar *current, UChar *p, @@ -646,6 +526,82 @@ _insertOrdered(const UChar *start, UChar *current, UChar *p, return trailCC; } +/* + * merge two UTF-16 string parts together + * to canonically order (order by combining classes) their concatenation + * + * the two strings may already be adjacent, so that the merging is done in-place + * if the two strings are not adjacent, then the buffer holding the first one + * must be large enough + * the second string may or may not be ordered in itself + * + * before: [start..current[ is already ordered, and + * [next..limit[ may be ordered in itself, but + * is not in relation to [start..current[ + * after: [start..current+(limit-next)[ is ordered + * + * the algorithm is a simple bubble-sort that takes the characters from *next++ + * and inserts them in correct combining class order into the preceding part + * of the string + * + * since this function is called much less often than the single-code point + * _insertOrdered(), it just uses that for easier maintenance + * (see file version from before 2001aug31 for a more optimized version) + * + * returns the trailing combining class + */ +static uint8_t +_mergeOrdered(UChar *start, UChar *current, + const UChar *next, const UChar *limit, UBool isOrdered=TRUE) { + UChar *r; + UChar c, c2; + uint8_t cc, trailCC=0; + UBool adjacent; + + adjacent= current==next; + + if(start!=current || !isOrdered) { + while(next>_NORM_CC_SHIFT); - p=src; - } else { - /* c decomposes, get everything from the variable-length extra data */ - p=_decompose(norm32, decompQCMask, length, cc, trailCC); - if(cc==0) { - /* get the first character's norm32 to check if it is a starter with qc "no" or "maybe" */ - norm32=_getNorm32(p, qcMask); - } - } - - if(cc==0 && !(norm32&qcMask)) { - return NULL; - } else { - src+= c2==0 ? 1 : 2; - return p; - } -} - -/* - * decompose the previous code point (needs start=0 to the last starter in the decomposition - * that has NF*C "yes" - * starterIndex==-1 if there is no starter - */ -static const UChar * -_decomposeBackFindStarter(const UChar *start, const UChar *&src, - uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe, - int32_t &starterIndex, - int32_t &length) { - const UChar *p; - uint32_t norm32; + uint32_t norm32, ccOrQCMask; + int32_t length; UChar c, c2; uint8_t cc, trailCC; - norm32=_getPrevNorm32(start, src, minNoMaybe, _NORM_CC_MASK|qcMask|decompQCMask, c, c2); - length= c2==0 ? 1 : 2; - starterIndex=0; /* many characters are themselves starters */ + ccOrQCMask=_NORM_CC_MASK|qcMask; - if( (norm32&(_NORM_CC_MASK|qcMask|decompQCMask))==0 || - isNorm32HangulOrJamo(norm32) - ) { - /* found a true starter */ - /* - * Hangul decomposes but is all starters, Jamo L are starters. - * We never get Jamo V/T here because - * we go back through quick check "yes" text - * and Jamo V/T have NFC_MAYBE. - */ - return src; - } - - /* get the decomposition and the lead and trail cc's */ - if((norm32&decompQCMask)==0) { - /* c does not decompose */ - if((norm32&(_NORM_CC_MASK|qcMask))!=0) { - starterIndex=-1; + for(;;) { + if(src==limit) { + break; /* end of string */ + } + c=*src; + if(c>_NORM_CC_SHIFT); - p=prevSrc; + /* + * find the last true starter in [prevStarter..src[ + * it is either the decomposition of the current character (at prevSrc), + * or prevStarter + */ + if(_isTrueStarter(norm32, _NORM_CC_MASK|qcMask, decompQCMask)) { + prevStarter=prevSrc; } else { - /* c decomposes, get everything from the variable-length extra data */ - p=_decompose(norm32, decompQCMask, length, cc, trailCC); - if(cc==0) { - /* get the first character's norm32 to check if it is a starter with qc "no" or "maybe" */ - norm32=_getNorm32(p, qcMask); - } - } - - /* copy the decomposition into the buffer, assume that it fits */ - startIndex=limitIndex=bufferCapacity/2; - do { - buffer[limitIndex++]=*p++; - } while(--length>0); - - /* find the last starter in [prevStarter..src[ including this new decomposition */ - if((cc==0 && !(norm32&qcMask)) || prevStarter==prevSrc) { - prevCC=trailCC; - starter=prevSrc; - firstStarterIndex=startIndex; - } else { - /* - * ### TODO - * - verify that prevStarter is indeed at the _last_ starter before prevSrc - * - if that is so, then perform a normal decomposition on [prevStarter..src[ - * instead of this special, incremental one - */ - - /* decompose backwards and look for a starter */ - firstStarterIndex=0; - starter=prevSrc; - for(;;) { - p=_decomposeBackFindStarter(prevStarter, starter, - qcMask, decompQCMask, minNoMaybe, - starterIndex, length); - - /* make sure there is enough space in the buffer */ - if(startIndex0); - - /* stop if we found a starter */ - if(starterIndex>=0) { - firstStarterIndex=startIndex+starterIndex; - break; - } - - /* stop if we are at the beginning of the text */ - if(prevStarter>=starter) { - firstStarterIndex=startIndex; - break; - } - } - - /* reorder the backwards decomposition, set prevCC */ - reorderSplit=buffer+firstStarterIndex; - prevCC=_mergeOrdered(reorderSplit, reorderSplit, reorderSplit, buffer+limitIndex, FALSE); - /* adjust destIndex: back out what had been copied with qc "yes" */ - destIndex-=(int32_t)(prevSrc-starter); + destIndex-=(int32_t)(prevSrc-prevStarter); } - /* find the next starter in [src..limit[ */ - for(;;) { - p=_decomposeBeforeNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe, cc, trailCC, length); - if(p==NULL) { - break; /* reached a starter */ - } + /* find the next true starter in [src..limit[ */ + src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe); - /* make sure there is enough space in the buffer */ - if((limitIndex+length)>bufferCapacity) { - if(startIndex>=length) { - /* it fits if we move the buffer contents up */ - uprv_memmove(buffer, buffer+startIndex, (limitIndex-startIndex)*U_SIZEOF_UCHAR); - firstStarterIndex-=startIndex; - limitIndex-=startIndex; - startIndex=0; - } else if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, limitIndex)) { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - } + /* decompose [prevStarter..src[ */ + length=_decompose(buffer, bufferCapacity, + prevStarter, src-prevStarter, + (decompQCMask&_NORM_QC_NFKD)!=0, FALSE, + u_growBufferFromStatic, stackBuffer, + trailCC, + pErrorCode); - if(cc!=0 && cc0); - prevCC=trailCC; - } - } - - /* recompose between the two starters */ - recomposeLimit=buffer+limitIndex; - if((limitIndex-firstStarterIndex)>=2) { - prevCC=_recompose(buffer+firstStarterIndex, recomposeLimit); - } - - /* set output parameters and return with a pointer to the recomposition */ + /* set the next starter */ prevStarter=src; - p=buffer+startIndex; - length=recomposeLimit-p; - return p; + + /* recompose the decomposition */ + recomposeLimit=buffer+length; + if(length>=2) { + prevCC=_recompose(buffer, recomposeLimit); + } + + /* return with a pointer to the recomposition and its length */ + length=recomposeLimit-buffer; + return buffer; } U_CFUNC int32_t @@ -2267,7 +2046,7 @@ unorm_compose(UChar *dest, int32_t destCapacity, c2=(UChar)(c2-JAMO_L_BASE); if(c2=dest && src<(dest+destCapacity)) || (srcLength>0 && dest>=src && dest<(src+srcLength)) ) {