mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-1007 complete implementation of Normalizer iteration
X-SVN-Rev: 5666
This commit is contained in:
parent
db2761b72b
commit
d365d8a956
4 changed files with 440 additions and 412 deletions
|
@ -38,7 +38,9 @@ Normalizer::Normalizer(const UnicodeString& str,
|
|||
fMode(mode), fOptions(0),
|
||||
text(new StringCharacterIterator(str)), nextIndex(-1),
|
||||
buffer(), bufferPos(0)
|
||||
{}
|
||||
{
|
||||
checkData();
|
||||
}
|
||||
|
||||
Normalizer::Normalizer(const UnicodeString& str,
|
||||
EMode mode,
|
||||
|
@ -46,20 +48,26 @@ Normalizer::Normalizer(const UnicodeString& str,
|
|||
fMode(mode), fOptions(options),
|
||||
text(new StringCharacterIterator(str)), nextIndex(-1),
|
||||
buffer(), bufferPos(0)
|
||||
{}
|
||||
{
|
||||
checkData();
|
||||
}
|
||||
|
||||
Normalizer::Normalizer(const UChar *str, int32_t length, EMode mode) :
|
||||
fMode(mode), fOptions(0),
|
||||
text(new UCharCharacterIterator(str, length)), nextIndex(-1),
|
||||
buffer(), bufferPos(0)
|
||||
{}
|
||||
{
|
||||
checkData();
|
||||
}
|
||||
|
||||
Normalizer::Normalizer(const CharacterIterator& iter,
|
||||
EMode mode) :
|
||||
fMode(mode), fOptions(0),
|
||||
text(iter.clone()), nextIndex(-1),
|
||||
buffer(), bufferPos(0)
|
||||
{}
|
||||
{
|
||||
checkData();
|
||||
}
|
||||
|
||||
Normalizer::Normalizer(const CharacterIterator& iter,
|
||||
EMode mode,
|
||||
|
@ -67,13 +75,28 @@ Normalizer::Normalizer(const CharacterIterator& iter,
|
|||
fMode(mode), fOptions(options),
|
||||
text(iter.clone()), nextIndex(-1),
|
||||
buffer(), bufferPos(0)
|
||||
{}
|
||||
{
|
||||
checkData();
|
||||
}
|
||||
|
||||
Normalizer::Normalizer(const Normalizer ©) :
|
||||
fMode(copy.fMode), fOptions(copy.fOptions),
|
||||
text(copy.text->clone()), nextIndex(copy.nextIndex),
|
||||
buffer(copy.buffer), bufferPos(copy.bufferPos)
|
||||
{}
|
||||
{
|
||||
checkData();
|
||||
}
|
||||
|
||||
static const UChar _NUL=0;
|
||||
|
||||
void
|
||||
Normalizer::checkData() {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
if(!unorm_haveData(&errorCode)) {
|
||||
delete text;
|
||||
text=new UCharCharacterIterator(&_NUL, 0);
|
||||
}
|
||||
}
|
||||
|
||||
Normalizer::~Normalizer()
|
||||
{
|
||||
|
@ -247,7 +270,7 @@ UChar32 Normalizer::next() {
|
|||
if(nextIndex>=0) {
|
||||
text->setIndex(nextIndex);
|
||||
}
|
||||
if(nextNormalize()) {
|
||||
if(text->hasNext() && nextNormalize()) {
|
||||
c=buffer.char32At(bufferPos);
|
||||
bufferPos+=UTF_CHAR_LENGTH(c);
|
||||
return c;
|
||||
|
@ -265,7 +288,7 @@ UChar32 Normalizer::next() {
|
|||
UChar32 Normalizer::previous() {
|
||||
UChar32 c;
|
||||
|
||||
if(bufferPos>0 || previousNormalize()) {
|
||||
if(bufferPos>0 || text->hasPrevious() && previousNormalize()) {
|
||||
c=buffer.char32At(bufferPos-1);
|
||||
bufferPos-=UTF_CHAR_LENGTH(c);
|
||||
return c;
|
||||
|
@ -482,70 +505,26 @@ void Normalizer::clearBuffer() {
|
|||
UBool
|
||||
Normalizer::nextNormalize() {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
int32_t length;
|
||||
|
||||
clearBuffer();
|
||||
switch(fMode) {
|
||||
case NO_OP:
|
||||
buffer.setTo(text->next32PostInc());
|
||||
length=buffer.length();
|
||||
break;
|
||||
case COMPOSE:
|
||||
case COMPOSE_COMPAT:
|
||||
length=unorm_nextCompose(buffer.fArray, buffer.fCapacity, *text,
|
||||
fMode==COMPOSE_COMPAT, (fOptions&IGNORE_HANGUL)!=0,
|
||||
UnicodeString::growBuffer, &buffer,
|
||||
&errorCode);
|
||||
break;
|
||||
case DECOMP:
|
||||
case DECOMP_COMPAT:
|
||||
length=unorm_nextDecompose(buffer.fArray, buffer.fCapacity, *text,
|
||||
fMode==COMPOSE_COMPAT, (fOptions&IGNORE_HANGUL)!=0,
|
||||
UnicodeString::growBuffer, &buffer,
|
||||
&errorCode);
|
||||
break;
|
||||
case FCD:
|
||||
length=unorm_nextFCD(buffer.fArray, buffer.fCapacity, *text,
|
||||
UnicodeString::growBuffer, &buffer,
|
||||
&errorCode);
|
||||
break;
|
||||
}
|
||||
|
||||
return U_SUCCESS(errorCode) && length>0;
|
||||
buffer.fLength=unorm_nextNormalize(buffer.fArray, buffer.fCapacity, *text,
|
||||
getUNormalizationMode(fMode, errorCode),
|
||||
(fOptions&IGNORE_HANGUL)!=0,
|
||||
UnicodeString::growBuffer, &buffer,
|
||||
&errorCode);
|
||||
return U_SUCCESS(errorCode) && buffer.length()>0;
|
||||
}
|
||||
|
||||
UBool
|
||||
Normalizer::previousNormalize() {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
int32_t length;
|
||||
|
||||
clearBuffer();
|
||||
switch(fMode) {
|
||||
case NO_OP:
|
||||
buffer.setTo(text->previous32());
|
||||
length=buffer.length();
|
||||
break;
|
||||
case COMPOSE:
|
||||
case COMPOSE_COMPAT:
|
||||
length=unorm_prevCompose(buffer.fArray, buffer.fCapacity, *text,
|
||||
fMode==COMPOSE_COMPAT, (fOptions&IGNORE_HANGUL)!=0,
|
||||
UnicodeString::growBuffer, &buffer,
|
||||
&errorCode);
|
||||
break;
|
||||
case DECOMP:
|
||||
case DECOMP_COMPAT:
|
||||
length=unorm_prevDecompose(buffer.fArray, buffer.fCapacity, *text,
|
||||
fMode==COMPOSE_COMPAT, (fOptions&IGNORE_HANGUL)!=0,
|
||||
UnicodeString::growBuffer, &buffer,
|
||||
&errorCode);
|
||||
break;
|
||||
case FCD:
|
||||
length=unorm_prevFCD(buffer.fArray, buffer.fCapacity, *text,
|
||||
UnicodeString::growBuffer, &buffer,
|
||||
&errorCode);
|
||||
break;
|
||||
}
|
||||
|
||||
bufferPos=length;
|
||||
return U_SUCCESS(errorCode) && length>0;
|
||||
buffer.fLength=unorm_previousNormalize(buffer.fArray, buffer.fCapacity, *text,
|
||||
getUNormalizationMode(fMode, errorCode),
|
||||
(fOptions&IGNORE_HANGUL)!=0,
|
||||
UnicodeString::growBuffer, &buffer,
|
||||
&errorCode);
|
||||
bufferPos=buffer.length();
|
||||
return U_SUCCESS(errorCode) && buffer.length()>0;
|
||||
}
|
||||
|
|
|
@ -743,7 +743,7 @@ private:
|
|||
UBool nextNormalize();
|
||||
UBool previousNormalize();
|
||||
|
||||
void init(CharacterIterator* iter, EMode mode, int32_t option);
|
||||
void checkData();
|
||||
void clearBuffer(void);
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
|
|
|
@ -253,6 +253,20 @@ _getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) {
|
|||
];
|
||||
}
|
||||
|
||||
/*
|
||||
* get a norm32 from text with complete code points
|
||||
* (like from decompositions)
|
||||
*/
|
||||
inline uint32_t
|
||||
_getNorm32(const UChar *p, uint32_t mask) {
|
||||
uint32_t norm32=_getNorm32(*p);
|
||||
if((norm32&mask) && isNorm32LeadSurrogate(norm32)) {
|
||||
/* *p is a lead surrogate, get the real norm32 */
|
||||
norm32=_getNorm32FromSurrogatePair(norm32, *(p+1));
|
||||
}
|
||||
return norm32;
|
||||
}
|
||||
|
||||
inline uint16_t
|
||||
_getFCD16(UChar c) {
|
||||
return
|
||||
|
@ -364,37 +378,86 @@ _getNextCC(const UChar *&p, const UChar *limit, UChar &c, UChar &c2) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* read backwards and get norm32
|
||||
* return 0 if the character is <minC
|
||||
* if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
|
||||
*/
|
||||
inline uint32_t
|
||||
_getPrevNorm32(const UChar *start, const UChar *&src,
|
||||
uint32_t minC, uint32_t mask,
|
||||
UChar &c, UChar &c2) {
|
||||
uint32_t norm32;
|
||||
|
||||
c=*--src;
|
||||
c2=0;
|
||||
|
||||
/* check for a surrogate before getting norm32 to see if we need to predecrement further */
|
||||
if(c<minC) {
|
||||
return 0;
|
||||
} else if(!UTF_IS_SURROGATE(c)) {
|
||||
return _getNorm32(c);
|
||||
} else if(UTF_IS_SURROGATE_FIRST(c)) {
|
||||
/* unpaired first surrogate */
|
||||
return 0;
|
||||
} else if(src!=start && UTF_IS_FIRST_SURROGATE(c2=*(src-1))) {
|
||||
--src;
|
||||
norm32=_getNorm32(c2);
|
||||
|
||||
if((norm32&mask)==0) {
|
||||
/* all surrogate pairs with this lead surrogate have only irrelevant data */
|
||||
return 0;
|
||||
} else {
|
||||
/* norm32 must be a surrogate special */
|
||||
return _getNorm32FromSurrogatePair(norm32, c);
|
||||
}
|
||||
} else {
|
||||
/* unpaired second surrogate */
|
||||
c2=0;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* get the combining class of (c, c2)=*--p
|
||||
* before: start<p after: start<=p
|
||||
*/
|
||||
inline uint8_t
|
||||
_getPrevCC(const UChar *start, const UChar *&p) {
|
||||
uint32_t norm32;
|
||||
UChar c, c2;
|
||||
|
||||
c=*--p;
|
||||
return (uint8_t)(_getPrevNorm32(start, p, _NORM_MIN_WITH_LEAD_CC, _NORM_CC_MASK, c, c2)>>_NORM_CC_SHIFT);
|
||||
}
|
||||
|
||||
/* check for a surrogate before getting norm32 to see if we need to predecrement further */
|
||||
if(!UTF_IS_SURROGATE(c)) {
|
||||
return (uint8_t)(_getNorm32(c)>>_NORM_CC_SHIFT);
|
||||
} else if(UTF_IS_SURROGATE_FIRST(c)) {
|
||||
/* unpaired first surrogate */
|
||||
return 0;
|
||||
} else if(p!=start && UTF_IS_FIRST_SURROGATE(c2=*(p-1))) {
|
||||
--p;
|
||||
norm32=_getNorm32(c2);
|
||||
if((norm32&_NORM_CC_MASK)==0) {
|
||||
/* all surrogate pairs with this lead surrogate have cc==0 */
|
||||
return 0;
|
||||
} else {
|
||||
/* norm32 must be a surrogate special */
|
||||
return (uint8_t)(_getNorm32FromSurrogatePair(norm32, c)>>_NORM_CC_SHIFT);
|
||||
}
|
||||
} else {
|
||||
/* unpaired second surrogate */
|
||||
return 0;
|
||||
/*
|
||||
* is this (or does its decomposition begin with) a "true starter"?
|
||||
* (cc==0 and NF*C_YES)
|
||||
*/
|
||||
inline UBool
|
||||
_isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
|
||||
if((norm32&ccOrQCMask)==0) {
|
||||
return TRUE; /* this is a true starter (could be Hangul or Jamo L) */
|
||||
}
|
||||
|
||||
/* inspect its decomposition - not a Hangul or a surrogate here */
|
||||
if((norm32&decompQCMask)!=0) {
|
||||
const UChar *p;
|
||||
int32_t length;
|
||||
uint8_t cc, trailCC;
|
||||
|
||||
/* decomposes, get everything from the variable-length extra data */
|
||||
p=_decompose(norm32, decompQCMask, length, cc, trailCC);
|
||||
if(cc==0) {
|
||||
uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK;
|
||||
|
||||
/* does it begin with NFC_YES? */
|
||||
if((_getNorm32(p, qcMask)&qcMask)==0) {
|
||||
/* yes, the decomposition begins with a true starter */
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* reorder UTF-16 in-place -------------------------------------------------- */
|
||||
|
@ -1707,20 +1770,6 @@ _recompose(UChar *p, UChar *&limit) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* get a norm32 from text with complete code points
|
||||
* (like from decompositions)
|
||||
*/
|
||||
inline uint32_t
|
||||
_getNorm32(const UChar *p, uint32_t mask) {
|
||||
uint32_t norm32=_getNorm32(*p);
|
||||
if((norm32&mask) && isNorm32LeadSurrogate(norm32)) {
|
||||
/* *p is a lead surrogate, get the real norm32 */
|
||||
norm32=_getNorm32FromSurrogatePair(norm32, *(p+1));
|
||||
}
|
||||
return norm32;
|
||||
}
|
||||
|
||||
/*
|
||||
* read and decompose the following character
|
||||
* return NULL if it is (or its decomposition starts with) a starter (cc==0)
|
||||
|
@ -1821,48 +1870,20 @@ _decomposeBackFindStarter(const UChar *start, const UChar *&src,
|
|||
UChar c, c2;
|
||||
uint8_t cc, trailCC;
|
||||
|
||||
c=*--src;
|
||||
length=1;
|
||||
norm32=_getPrevNorm32(start, src, minNoMaybe, _NORM_CC_MASK|qcMask|decompQCMask, c, c2);
|
||||
length= c2==0 ? 1 : 2;
|
||||
starterIndex=0; /* many characters are themselves starters */
|
||||
|
||||
/* check for a surrogate before getting norm32 to see if we need to predecrement further */
|
||||
if(c<minNoMaybe) {
|
||||
return src;
|
||||
} else if(!UTF_IS_SURROGATE(c)) {
|
||||
norm32=_getNorm32(c);
|
||||
if(isNorm32HangulOrJamo(norm32)) {
|
||||
/* Hangul decomposes but is all starters, Jamo L are starters */
|
||||
#if 0
|
||||
/*
|
||||
* this is commented out because
|
||||
* we should never get Jamo V/T here because
|
||||
* we go back through quick check "yes" text
|
||||
* and Jamo V/T have NFC_MAYBE
|
||||
*/
|
||||
if(!isHangulJamoNorm32HangulOrJamoL(norm32)) {
|
||||
/* Jamo V/T are not starters */
|
||||
starterIndex=-1;
|
||||
}
|
||||
#endif
|
||||
return src;
|
||||
}
|
||||
} else if(UTF_IS_SURROGATE_FIRST(c)) {
|
||||
/* unpaired first surrogate */
|
||||
return src;
|
||||
} else if(src!=start && UTF_IS_FIRST_SURROGATE(c2=*(src-1))) {
|
||||
--src;
|
||||
length=2;
|
||||
norm32=_getNorm32(c2);
|
||||
|
||||
if((norm32&(_NORM_CC_MASK|decompQCMask))==0) {
|
||||
/* all surrogate pairs with this lead surrogate have cc==0 and no decomposition */
|
||||
return src;
|
||||
} else {
|
||||
/* norm32 must be a surrogate special */
|
||||
norm32=_getNorm32FromSurrogatePair(norm32, c);
|
||||
}
|
||||
} else {
|
||||
/* unpaired second surrogate */
|
||||
if( (norm32&(_NORM_CC_MASK|qcMask|decompQCMask))==0 ||
|
||||
isNorm32HangulOrJamo(norm32)
|
||||
) {
|
||||
/* found a true starter */
|
||||
/*
|
||||
* Hangul decomposes but is all starters, Jamo L are starters.
|
||||
* We never get Jamo V/T here because
|
||||
* we go back through quick check "yes" text
|
||||
* and Jamo V/T have NFC_MAYBE.
|
||||
*/
|
||||
return src;
|
||||
}
|
||||
|
||||
|
@ -2118,6 +2139,14 @@ unorm_compose(UChar *dest, int32_t destCapacity,
|
|||
/*
|
||||
* prevStarter points to the last character before the current one
|
||||
* that is a "true" starter with cc==0 and quick check "yes".
|
||||
*
|
||||
* prevStarter will be used instead of looking for a true starter
|
||||
* while incrementally decomposing [prevStarter..prevSrc[
|
||||
* in _composePart(). Having a good prevStarter allows to just decompose
|
||||
* the entire [prevStarter..prevSrc[.
|
||||
*
|
||||
* This relies on the assumption that the decomposition of a true starter
|
||||
* also begins with a true starter. gennorm/store.c checks for this.
|
||||
*/
|
||||
prevStarter=src;
|
||||
|
||||
|
@ -2507,12 +2536,15 @@ unorm_normalize(const UChar *src, int32_t srcLength,
|
|||
* filled again.
|
||||
*/
|
||||
|
||||
/* backward iteration ------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Read backwards and check if the combining class is 0.
|
||||
* If c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!).
|
||||
* read backwards and get norm32
|
||||
* return 0 if the character is <minC
|
||||
* if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
|
||||
*/
|
||||
inline UBool
|
||||
_isPrevCCZero(CharacterIterator &src, UChar &c, UChar &c2) {
|
||||
inline uint32_t
|
||||
_getPrevNorm32(CharacterIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
|
||||
uint32_t norm32;
|
||||
|
||||
/* need src.hasPrevious() */
|
||||
|
@ -2520,132 +2552,75 @@ _isPrevCCZero(CharacterIterator &src, UChar &c, UChar &c2) {
|
|||
c2=0;
|
||||
|
||||
/* check for a surrogate before getting norm32 to see if we need to predecrement further */
|
||||
if(!UTF_IS_SURROGATE(c)) {
|
||||
return (_getNorm32(c)&_NORM_CC_MASK)==0;
|
||||
if(c<minC) {
|
||||
return 0;
|
||||
} else if(!UTF_IS_SURROGATE(c)) {
|
||||
return _getNorm32(c);
|
||||
} else if(UTF_IS_SURROGATE_FIRST(c) || !src.hasPrevious()) {
|
||||
/* unpaired surrogate */
|
||||
return TRUE;
|
||||
return 0;
|
||||
} else if(UTF_IS_FIRST_SURROGATE(c2=src.previous())) {
|
||||
norm32=_getNorm32(c2);
|
||||
if((norm32&_NORM_CC_MASK)==0) {
|
||||
/* all surrogate pairs with this lead surrogate have cc==0 */
|
||||
return TRUE;
|
||||
if((norm32&mask)==0) {
|
||||
/* all surrogate pairs with this lead surrogate have irrelevant data */
|
||||
return 0;
|
||||
} else {
|
||||
/* norm32 must be a surrogate special */
|
||||
return (_getNorm32FromSurrogatePair(norm32, c)&_NORM_CC_MASK)==0;
|
||||
return _getNorm32FromSurrogatePair(norm32, c);
|
||||
}
|
||||
} else {
|
||||
/* unpaired second surrogate, undo the c2=src.previous() movement */
|
||||
src.move(1, CharacterIterator::kCurrent);
|
||||
return TRUE;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Helper functions:
|
||||
* Read from the CharacterIterator until a normalization boundary is reached.
|
||||
* read backwards and check if the character is a previous-iteration boundary
|
||||
* if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
|
||||
*/
|
||||
typedef UBool
|
||||
IsPrevBoundaryFn(CharacterIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
|
||||
|
||||
static int32_t
|
||||
_findNextDecomposeBoundary(CharacterIterator &src,
|
||||
UChar *&buffer, int32_t bufferCapacity,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar *stackBuffer;
|
||||
uint32_t norm32;
|
||||
int32_t bufferIndex;
|
||||
UChar c, c2;
|
||||
/*
|
||||
* read backwards and check if the combining class is 0
|
||||
* if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
|
||||
*/
|
||||
static UBool
|
||||
_isPrevCCZero(CharacterIterator &src, uint32_t minC, uint32_t ccMask, UChar &c, UChar &c2) {
|
||||
return (_getPrevNorm32(src, minC, ccMask, c, c2)&ccMask)==0;
|
||||
}
|
||||
|
||||
if(!src.hasNext()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* initialize */
|
||||
stackBuffer=buffer;
|
||||
|
||||
/* get one character, ignore its cc, and keep c filled with a one-UChar look-ahead */
|
||||
buffer[0]=c=src.current();
|
||||
bufferIndex=1;
|
||||
c2=src.next();
|
||||
if(UTF_IS_FIRST_SURROGATE(c) && UTF_IS_SECOND_SURROGATE(c2)) {
|
||||
buffer[bufferIndex++]=c2;
|
||||
c=src.next();
|
||||
} else {
|
||||
c=c2;
|
||||
}
|
||||
|
||||
/* get all following characters with non-zero combining class */
|
||||
/* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
|
||||
while(src.hasNext()) {
|
||||
/* get the cc of the next character, src.getIndex() is at c */
|
||||
norm32=_getNorm32(c);
|
||||
if((norm32&_NORM_CC_MASK)==0) {
|
||||
break;
|
||||
}
|
||||
|
||||
if(!isNorm32LeadSurrogate(norm32)) {
|
||||
/* BMP character with cc!=0, write c into the buffer */
|
||||
if( bufferIndex<bufferCapacity ||
|
||||
/* attempt to grow the buffer */
|
||||
u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
|
||||
2*bufferCapacity,
|
||||
bufferIndex)
|
||||
) {
|
||||
buffer[bufferIndex++]=c;
|
||||
} else {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
/* c is a lead surrogate, get the real norm32 */
|
||||
if(UTF_IS_SECOND_SURROGATE(c2=src.next())) {
|
||||
norm32=_getNorm32FromSurrogatePair(norm32, c2);
|
||||
} else {
|
||||
norm32=0;
|
||||
}
|
||||
if((norm32&_NORM_CC_MASK)==0) {
|
||||
/* cc(supplementary)==0, back out the c2=src.next() movement to stop at the first surrogate */
|
||||
src.move(-1, CharacterIterator::kCurrent);
|
||||
break;
|
||||
}
|
||||
|
||||
/* supplementary character with cc!=0, write (c, c2) into the buffer */
|
||||
if( (bufferIndex+2)<=bufferCapacity ||
|
||||
/* attempt to grow the buffer */
|
||||
u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
|
||||
2*bufferCapacity,
|
||||
bufferIndex)
|
||||
) {
|
||||
buffer[bufferIndex++]=c;
|
||||
buffer[bufferIndex++]=c2;
|
||||
} else {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* get the next character (pre-increment) */
|
||||
c=src.next();
|
||||
}
|
||||
|
||||
/* return the length of the buffer contents */
|
||||
return bufferIndex;
|
||||
/*
|
||||
* read backwards and check if the character is (or its decomposition begins with)
|
||||
* a "true starter" (cc==0 and NF*C_YES)
|
||||
* if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
|
||||
*/
|
||||
static UBool
|
||||
_isPrevTrueStarter(CharacterIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
|
||||
uint32_t norm32, decompQCMask;
|
||||
|
||||
decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
|
||||
norm32=_getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
|
||||
return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
|
||||
}
|
||||
|
||||
static int32_t
|
||||
_findPreviousDecomposeBoundary(CharacterIterator &src,
|
||||
_findPreviousIterationBoundary(CharacterIterator &src,
|
||||
IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask,
|
||||
UChar *&buffer, int32_t bufferCapacity,
|
||||
int32_t &startIndex,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar *stackBuffer;
|
||||
UChar c, c2;
|
||||
UBool isZero;
|
||||
UBool isBoundary;
|
||||
|
||||
/* initialize */
|
||||
stackBuffer=buffer;
|
||||
startIndex=bufferCapacity; /* fill the buffer from the end backwards */
|
||||
|
||||
while(src.hasPrevious()) {
|
||||
isZero=_isPrevCCZero(src, c, c2);
|
||||
isBoundary=isPrevBoundary(src, minC, mask, c, c2);
|
||||
|
||||
/* always write this character to the front of the buffer */
|
||||
/* make sure there is enough space in the buffer */
|
||||
|
@ -2654,6 +2629,7 @@ _findPreviousDecomposeBoundary(CharacterIterator &src,
|
|||
|
||||
if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, bufferLength)) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
src.setToStart();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -2667,8 +2643,8 @@ _findPreviousDecomposeBoundary(CharacterIterator &src,
|
|||
buffer[--startIndex]=c2;
|
||||
}
|
||||
|
||||
/* stop if this just-copied character has cc==0 */
|
||||
if(isZero) {
|
||||
/* stop if this just-copied character is a boundary */
|
||||
if(isBoundary) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -2677,32 +2653,250 @@ _findPreviousDecomposeBoundary(CharacterIterator &src,
|
|||
return bufferCapacity-startIndex;
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
unorm_previousNormalize(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UNormalizationMode mode, UBool ignoreHangul,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar stackBuffer[40];
|
||||
UChar *buffer;
|
||||
IsPrevBoundaryFn *isPreviousBoundary;
|
||||
uint32_t mask;
|
||||
int32_t startIndex, bufferLength, destLength;
|
||||
UChar minC;
|
||||
|
||||
switch(mode) {
|
||||
case UNORM_NFD:
|
||||
case UNORM_NFKD:
|
||||
case UNORM_FCD:
|
||||
isPreviousBoundary=_isPrevCCZero;
|
||||
minC=_NORM_MIN_WITH_LEAD_CC;
|
||||
mask=_NORM_CC_MASK;
|
||||
break;
|
||||
case UNORM_NFC:
|
||||
isPreviousBoundary=_isPrevTrueStarter;
|
||||
minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
|
||||
mask=_NORM_CC_MASK|_NORM_QC_NFC;
|
||||
break;
|
||||
case UNORM_NFKC:
|
||||
isPreviousBoundary=_isPrevTrueStarter;
|
||||
minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
|
||||
mask=_NORM_CC_MASK|_NORM_QC_NFKC;
|
||||
break;
|
||||
case UNORM_NONE:
|
||||
if(src.hasPrevious()) {
|
||||
UChar32 c=src.previous32();
|
||||
|
||||
destLength=0;
|
||||
UTF_APPEND_CHAR_UNSAFE(dest, destLength, c);
|
||||
return destLength;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
default:
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
buffer=stackBuffer;
|
||||
bufferLength=_findPreviousIterationBoundary(src,
|
||||
isPreviousBoundary, minC, mask,
|
||||
buffer, sizeof(stackBuffer)/U_SIZEOF_UCHAR,
|
||||
startIndex,
|
||||
pErrorCode);
|
||||
if(bufferLength>0) {
|
||||
destLength=unorm_internalNormalize(dest, destCapacity,
|
||||
buffer+startIndex, bufferLength,
|
||||
mode, ignoreHangul,
|
||||
growBuffer, context, pErrorCode);
|
||||
} else {
|
||||
destLength=0;
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
if(buffer!=stackBuffer) {
|
||||
uprv_free(buffer);
|
||||
}
|
||||
|
||||
return destLength;
|
||||
}
|
||||
|
||||
/* forward iteration -------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Iteration functions:
|
||||
* Collect a minimum amount of text from the character iterator,
|
||||
* normalize into the destination buffer,
|
||||
* and return the length of the output.
|
||||
* read forward and get norm32
|
||||
* return 0 if the character is <minC
|
||||
* if c2!=0 then (c2, c) is a surrogate pair
|
||||
* always reads complete characters
|
||||
*/
|
||||
inline uint32_t
|
||||
_getNextNorm32(CharacterIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
|
||||
uint32_t norm32;
|
||||
|
||||
/* need src.hasNext() */
|
||||
c=src.nextPostInc();
|
||||
c2=0;
|
||||
|
||||
if(c<minC) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
norm32=_getNorm32(c);
|
||||
if(UTF_IS_FIRST_SURROGATE(c) && src.hasNext() && UTF_IS_SECOND_SURROGATE(c2=src.current())) {
|
||||
src.move(1, CharacterIterator::kCurrent); /* skip the c2 surrogate */
|
||||
if((norm32&mask)==0) {
|
||||
/* irrelevant data */
|
||||
return 0;
|
||||
} else {
|
||||
/* norm32 must be a surrogate special */
|
||||
return _getNorm32FromSurrogatePair(norm32, c2);
|
||||
}
|
||||
}
|
||||
return norm32;
|
||||
}
|
||||
|
||||
/*
|
||||
* read forward and check if the character is a next-iteration boundary
|
||||
* if c2!=0 then (c, c2) is a surrogate pair
|
||||
*/
|
||||
typedef UBool
|
||||
IsNextBoundaryFn(CharacterIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
|
||||
|
||||
/*
|
||||
* read forward and check if the combining class is 0
|
||||
* if c2!=0 then (c, c2) is a surrogate pair
|
||||
*/
|
||||
static UBool
|
||||
_isNextCCZero(CharacterIterator &src, uint32_t minC, uint32_t ccMask, UChar &c, UChar &c2) {
|
||||
return (_getNextNorm32(src, minC, ccMask, c, c2)&ccMask)==0;
|
||||
}
|
||||
|
||||
/*
|
||||
* read forward and check if the character is (or its decomposition begins with)
|
||||
* a "true starter" (cc==0 and NF*C_YES)
|
||||
* if c2!=0 then (c, c2) is a surrogate pair
|
||||
*/
|
||||
static UBool
|
||||
_isNextTrueStarter(CharacterIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
|
||||
uint32_t norm32, decompQCMask;
|
||||
|
||||
decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
|
||||
norm32=_getNextNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
|
||||
return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
|
||||
}
|
||||
|
||||
static int32_t
|
||||
_findNextIterationBoundary(CharacterIterator &src,
|
||||
IsNextBoundaryFn *isNextBoundary, uint32_t minC, uint32_t mask,
|
||||
UChar *&buffer, int32_t bufferCapacity,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar *stackBuffer;
|
||||
int32_t bufferIndex;
|
||||
UChar c, c2;
|
||||
|
||||
if(!src.hasNext()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* initialize */
|
||||
stackBuffer=buffer;
|
||||
|
||||
/* get one character and ignore its properties */
|
||||
buffer[0]=c=src.current();
|
||||
bufferIndex=1;
|
||||
c2=src.next();
|
||||
if(UTF_IS_FIRST_SURROGATE(c) && UTF_IS_SECOND_SURROGATE(c2)) {
|
||||
buffer[bufferIndex++]=c2;
|
||||
src.move(1, CharacterIterator::kCurrent); /* skip the c2 surrogate */
|
||||
}
|
||||
|
||||
/* get all following characters until we see a boundary */
|
||||
/* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
|
||||
while(src.hasNext()) {
|
||||
if(isNextBoundary(src, minC, mask, c, c2)) {
|
||||
/* back out the latest movement to stop at the boundary */
|
||||
src.move(c2==0 ? -1 : -2, CharacterIterator::kCurrent);
|
||||
break;
|
||||
} else {
|
||||
if(bufferIndex+(c2==0 ? 1 : 2)<=bufferCapacity ||
|
||||
/* attempt to grow the buffer */
|
||||
u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
|
||||
2*bufferCapacity,
|
||||
bufferIndex)
|
||||
) {
|
||||
buffer[bufferIndex++]=c;
|
||||
if(c2!=0) {
|
||||
buffer[bufferIndex++]=c2;
|
||||
}
|
||||
} else {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
src.setToEnd();
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* return the length of the buffer contents */
|
||||
return bufferIndex;
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
unorm_nextDecompose(UChar *dest, int32_t destCapacity,
|
||||
unorm_nextNormalize(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UNormalizationMode mode, UBool ignoreHangul,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar stackBuffer[40];
|
||||
UChar *buffer;
|
||||
IsNextBoundaryFn *isNextBoundary;
|
||||
uint32_t mask;
|
||||
int32_t bufferLength, destLength;
|
||||
UChar minC;
|
||||
|
||||
switch(mode) {
|
||||
case UNORM_NFD:
|
||||
case UNORM_NFKD:
|
||||
case UNORM_FCD:
|
||||
isNextBoundary=_isNextCCZero;
|
||||
minC=_NORM_MIN_WITH_LEAD_CC;
|
||||
mask=_NORM_CC_MASK;
|
||||
break;
|
||||
case UNORM_NFC:
|
||||
isNextBoundary=_isNextTrueStarter;
|
||||
minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
|
||||
mask=_NORM_CC_MASK|_NORM_QC_NFC;
|
||||
break;
|
||||
case UNORM_NFKC:
|
||||
isNextBoundary=_isNextTrueStarter;
|
||||
minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
|
||||
mask=_NORM_CC_MASK|_NORM_QC_NFKC;
|
||||
break;
|
||||
case UNORM_NONE:
|
||||
if(src.hasNext()) {
|
||||
UChar32 c=src.next32PostInc();
|
||||
|
||||
destLength=0;
|
||||
UTF_APPEND_CHAR_UNSAFE(dest, destLength, c);
|
||||
return destLength;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
default:
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
buffer=stackBuffer;
|
||||
bufferLength=_findNextDecomposeBoundary(src,
|
||||
bufferLength=_findNextIterationBoundary(src,
|
||||
isNextBoundary, minC, mask,
|
||||
buffer, sizeof(stackBuffer)/U_SIZEOF_UCHAR,
|
||||
pErrorCode);
|
||||
if(bufferLength>0) {
|
||||
destLength=unorm_decompose(dest, destCapacity,
|
||||
buffer, bufferLength,
|
||||
compat, ignoreHangul,
|
||||
growBuffer, context, pErrorCode);
|
||||
destLength=unorm_internalNormalize(dest, destCapacity,
|
||||
buffer, bufferLength,
|
||||
mode, ignoreHangul,
|
||||
growBuffer, context, pErrorCode);
|
||||
} else {
|
||||
destLength=0;
|
||||
}
|
||||
|
@ -2715,113 +2909,8 @@ unorm_nextDecompose(UChar *dest, int32_t destCapacity,
|
|||
return destLength;
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
unorm_prevDecompose(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar stackBuffer[40];
|
||||
UChar *buffer;
|
||||
int32_t startIndex, bufferLength, destLength;
|
||||
|
||||
buffer=stackBuffer;
|
||||
bufferLength=_findPreviousDecomposeBoundary(src,
|
||||
buffer, sizeof(stackBuffer)/U_SIZEOF_UCHAR,
|
||||
startIndex,
|
||||
pErrorCode);
|
||||
if(bufferLength>0) {
|
||||
destLength=unorm_decompose(dest, destCapacity,
|
||||
buffer+startIndex, bufferLength,
|
||||
compat, ignoreHangul,
|
||||
growBuffer, context, pErrorCode);
|
||||
} else {
|
||||
destLength=0;
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
if(buffer!=stackBuffer) {
|
||||
uprv_free(buffer);
|
||||
}
|
||||
|
||||
return destLength;
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
unorm_nextFCD(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar stackBuffer[40];
|
||||
UChar *buffer;
|
||||
int32_t bufferLength, destLength;
|
||||
|
||||
buffer=stackBuffer;
|
||||
bufferLength=_findNextDecomposeBoundary(src,
|
||||
buffer, sizeof(stackBuffer)/U_SIZEOF_UCHAR,
|
||||
pErrorCode);
|
||||
if(bufferLength>0) {
|
||||
destLength=unorm_makeFCD(dest, destCapacity,
|
||||
buffer, bufferLength,
|
||||
growBuffer, context, pErrorCode);
|
||||
} else {
|
||||
destLength=0;
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
if(buffer!=stackBuffer) {
|
||||
uprv_free(buffer);
|
||||
}
|
||||
|
||||
return destLength;
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
unorm_prevFCD(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar stackBuffer[40];
|
||||
UChar *buffer;
|
||||
int32_t startIndex, bufferLength, destLength;
|
||||
|
||||
buffer=stackBuffer;
|
||||
bufferLength=_findPreviousDecomposeBoundary(src,
|
||||
buffer, sizeof(stackBuffer)/U_SIZEOF_UCHAR,
|
||||
startIndex,
|
||||
pErrorCode);
|
||||
if(bufferLength>0) {
|
||||
destLength=unorm_makeFCD(dest, destCapacity,
|
||||
buffer+startIndex, bufferLength,
|
||||
growBuffer, context, pErrorCode);
|
||||
} else {
|
||||
destLength=0;
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
if(buffer!=stackBuffer) {
|
||||
uprv_free(buffer);
|
||||
}
|
||||
|
||||
return destLength;
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
unorm_nextCompose(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode) {
|
||||
*pErrorCode=U_UNSUPPORTED_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
unorm_prevCompose(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode) {
|
||||
*pErrorCode=U_UNSUPPORTED_ERROR;
|
||||
return 0;
|
||||
}
|
||||
/*
|
||||
* ### TODO: check if NF*D and FCD iteration finds optimal boundaries
|
||||
* and if not, how hard it would be to improve it.
|
||||
* For example, see _findSafeFCD().
|
||||
*/
|
||||
|
|
|
@ -65,6 +65,8 @@ enum {
|
|||
_NORM_QC_MAYBE=0x10,
|
||||
_NORM_QC_ANY_MAYBE=0x30,
|
||||
|
||||
_NORM_QC_MASK=0x3f,
|
||||
|
||||
_NORM_COMBINES_FWD=0x40,
|
||||
_NORM_COMBINES_BACK=0x80,
|
||||
_NORM_COMBINES_ANY=0xc0,
|
||||
|
@ -253,9 +255,9 @@ unorm_getFCD16FromSurrogatePair(const uint16_t *fcdTrieIndex, uint16_t fcd16, UC
|
|||
* @internal
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
unorm_nextDecompose(UChar *dest, int32_t destCapacity,
|
||||
unorm_nextNormalize(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UNormalizationMode mode, UBool ignoreHangul,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
|
@ -264,53 +266,11 @@ unorm_nextDecompose(UChar *dest, int32_t destCapacity,
|
|||
* @internal
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
unorm_prevDecompose(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Internal API for iterative normalizing - see Normalizer.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
unorm_nextFCD(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Internal API for iterative normalizing - see Normalizer.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
unorm_prevFCD(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Internal API for iterative normalizing - see Normalizer.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
unorm_nextCompose(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Internal API for iterative normalizing - see Normalizer.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
unorm_prevCompose(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode);
|
||||
unorm_previousNormalize(UChar *dest, int32_t destCapacity,
|
||||
CharacterIterator &src,
|
||||
UNormalizationMode mode, UBool ignoreHangul,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#endif
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue