/* Moves *fromLimRef backwards, where necessary, so that the byte range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   sequence.

   from        Start of the byte range; never dereferenced past the limit.
   fromLimRef  In/out: points at the exclusive end of the range.  On return
               it holds an end that is <= the incoming one and >= from.

   The range is scanned backwards from its end:
   - an ASCII byte (0b0xxxxxxx) ends the scan; the limit is placed right
     after it;
   - a lead byte (0b110xxxxx, 0b1110xxxx, 0b11110xxx) whose sequence fits
     into the bytes already stepped over ends the scan; the limit is placed
     right after that sequence;
   - a lead byte whose sequence does not fit is cut off and the scan goes on
     in front of it with the step counter restarted at zero;
   - any other byte (continuation byte 0b10xxxxxx, or 0b11111xxx) is
     stepped over and counted.

   Bytes are classified by their high bits only; no further validation of
   the sequence is performed. */
void
align_limit_to_full_utf8_characters(const char *from, const char **fromLimRef)
{
  const char *fromLim = *fromLimRef;
  /* Number of non-lead bytes stepped over since the last restart. */
  size_t walked = 0;

  while (fromLim > from) {
    const unsigned char prev = (unsigned char)fromLim[-1];
    size_t seqLen; /* length announced by a lead byte, 0 for non-lead bytes */

    if ((prev & 0x80u) == 0x00u) { /* 1-byte character, 0b0xxxxxxx */
      break;
    }

    if ((prev & 0xf8u) == 0xf0u) {        /* lead byte 0b11110xxx */
      seqLen = 4;
    } else if ((prev & 0xf0u) == 0xe0u) { /* lead byte 0b1110xxxx */
      seqLen = 3;
    } else if ((prev & 0xe0u) == 0xc0u) { /* lead byte 0b110xxxxx */
      seqLen = 2;
    } else {                              /* not a lead byte */
      seqLen = 0;
    }

    fromLim--; /* fromLim now points at the byte just examined */

    if (seqLen == 0) {
      walked++;
      continue;
    }

    if (walked + 1 >= seqLen) {
      /* The whole character is inside the range: keep it, drop the rest. */
      fromLim += seqLen;
      break;
    }

    /* Incomplete character: it stays cut off.  The counter is restarted at
       exactly zero so that the dropped lead byte is not mistaken for a
       trailing byte of whatever sequence precedes it. */
    walked = 0;
  }

  *fromLimRef = fromLim;
}