ICU-13515 UTF-8 macro: reduce length of string if it ends with an incomplete sequence

X-SVN-Rev: 40883
This commit is contained in:
Markus Scherer 2018-02-09 21:01:56 +00:00
parent 64aa4beb28
commit 19b494f953
4 changed files with 160 additions and 80 deletions

View file

@ -696,36 +696,20 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
// Use a single counter for source and target, counting the minimum of
// the source length and the target capacity.
// Let the standard converter handle edge cases.
const uint8_t *limit=sourceLimit;
if(count>targetCapacity) {
limit-=(count-targetCapacity);
count=targetCapacity;
}
// The conversion loop checks count>0 only once per 1/2/3-byte character.
// If the buffer ends with a truncated 2- or 3-byte sequence,
// The conversion loop checks count>0 only once per character.
// If the buffer ends with a truncated sequence,
// then we reduce the count to stop before that,
// and collect the remaining bytes after the conversion loop.
{
// Do not go back into the bytes that will be read for finishing a partial
// sequence from the previous buffer.
int32_t length=count-toULimit;
if(length>0) {
uint8_t b1=*(limit-1);
if(U8_IS_SINGLE(b1)) {
// common ASCII character
} else if(U8_IS_TRAIL(b1) && length>=2) {
uint8_t b2=*(limit-2);
if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
// truncated 3-byte sequence
count-=2;
}
} else if(0xc2<=b1 && b1<0xf0) {
// truncated 2- or 3-byte sequence
--count;
}
}
}
// Do not go back into the bytes that will be read for finishing a partial
// sequence from the previous buffer.
int32_t length=count-toULimit;
U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
count=toULimit+length;
}
if(c!=0) {
@ -815,7 +799,7 @@ moreBytes:
}
/* copy the legal byte sequence to the target */
if(count>=toULength) {
{
int8_t i;
for(i=0; i<oldToULength; ++i) {
@ -826,14 +810,6 @@ moreBytes:
*target++=*source++;
}
count-=toULength;
} else {
// A supplementary character that does not fit into the target.
// Let the standard converter handle this.
source-=(toULength-oldToULength);
pToUArgs->source=(char *)source;
pFromUArgs->target=(char *)target;
*pErrorCode=U_USING_DEFAULT_WARNING;
return;
}
}
}
@ -857,8 +833,7 @@ moreBytes:
utf8->toULength=toULength;
utf8->mode=toULimit;
break;
} else if(!U8_IS_TRAIL(b=*source)) {
/* lead byte in trail byte position */
} else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
utf8->toULength=toULength;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;

View file

@ -592,12 +592,15 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
* If the offset points to a UTF-8 trail byte,
* then the offset is moved backward to the corresponding lead byte.
* Otherwise, it is not modified.
*
* "Safe" macro, checks for illegal sequences and for string boundaries.
* Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<=i
* @see U8_SET_CP_START_UNSAFE
* @see U8_TRUNCATE_IF_INCOMPLETE
* @stable ICU 2.4
*/
#define U8_SET_CP_START(s, start, i) { \
@ -606,6 +609,51 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
} \
}
/**
 * If the string ends with a UTF-8 byte sequence that is valid so far
 * but incomplete, then reduce the length of the string to end before
 * the lead byte of that incomplete sequence.
 * For example, if the string ends with E1 80, the length is reduced by 2.
 *
 * Useful for processing text split across multiple buffers
 * (save the incomplete sequence for later)
 * and for optimizing iteration
 * (check for string length only once per character).
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_SET_CP_START(), this macro never reads s[length].
 * At most the last three bytes before s[length] are inspected,
 * and none before s[start].
 *
 * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
 *
 * Usage notes:
 * - The macro expands to exactly one statement and must be followed by a
 *   semicolon, so it is safe as the body of an unbraced if/else or loop.
 * - length must be a modifiable lvalue; it is decremented in place by 0..3.
 * - s, start and length are each evaluated more than once;
 *   do not pass expressions with side effects.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param length int32_t string length, must be start<=length
 * @see U8_SET_CP_START
 * @draft ICU 61
 */
#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) do { \
    if((length)>(start)) { \
        uint8_t __b1=(uint8_t)(s)[(length)-1]; \
        if(U8_IS_SINGLE(__b1)) { \
            /* common ASCII character */ \
        } else if(U8_IS_LEAD(__b1)) { \
            /* a lead byte in last position is always an incomplete sequence */ \
            --(length); \
        } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
            uint8_t __b2=(uint8_t)(s)[(length)-2]; \
            if(0xe0<=__b2 && __b2<=0xf4) { \
                /* 3- or 4-byte lead followed by one trail byte: */ \
                /* truncate only if that pair can start a valid sequence */ \
                if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
                        U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
                    (length)-=2; \
                } \
            } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
                /* two trail bytes: only a 4-byte lead leaves this incomplete */ \
                uint8_t __b3=(uint8_t)(s)[(length)-3]; \
                if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
                    (length)-=3; \
                } \
            } \
        } \
    } \
} while(0)
/* definitions with backward iteration -------------------------------------- */
/**

View file

@ -238,33 +238,45 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
int32_t i=*pi;
if(U8_IS_TRAIL(c) && i>start) {
uint8_t b1=s[--i];
if(0xc2<=b1 && b1<0xe0) {
*pi=i;
return ((b1-0xc0)<<6)|(c&0x3f);
if(U8_IS_LEAD(b1)) {
if(b1<0xe0) {
*pi=i;
return ((b1-0xc0)<<6)|(c&0x3f);
} else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
// Truncated 3- or 4-byte sequence.
*pi=i;
return errorValue(1, strict);
}
} else if(U8_IS_TRAIL(b1) && i>start) {
// Extract the value bits from the last trail byte.
c&=0x3f;
uint8_t b2=s[--i];
if(0xe0<=b2 && b2<0xf0) {
b2&=0xf;
if(strict!=-2) {
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
*pi=i;
c=(b2<<12)|((b1&0x3f)<<6)|c;
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
return c;
} else {
// strict: forbid non-characters like U+fffe
return errorValue(2, strict);
if(0xe0<=b2 && b2<=0xf4) {
if(b2<0xf0) {
b2&=0xf;
if(strict!=-2) {
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
*pi=i;
c=(b2<<12)|((b1&0x3f)<<6)|c;
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
return c;
} else {
// strict: forbid non-characters like U+fffe
return errorValue(2, strict);
}
}
} else {
// strict=-2 -> lenient: allow surrogates
b1-=0x80;
if((b2>0 || b1>=0x20)) {
*pi=i;
return (b2<<12)|(b1<<6)|c;
}
}
} else {
// strict=-2 -> lenient: allow surrogates
b1-=0x80;
if((b2>0 || b1>=0x20)) {
*pi=i;
return (b2<<12)|(b1<<6)|c;
}
} else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
// Truncated 4-byte sequence.
*pi=i;
return errorValue(2, strict);
}
} else if(U8_IS_TRAIL(b2) && i>start) {
uint8_t b3=s[--i];
@ -281,16 +293,7 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
}
}
}
} else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
// Truncated 4-byte sequence.
*pi=i;
return errorValue(2, strict);
}
} else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
(0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
// Truncated 3- or 4-byte sequence.
*pi=i;
return errorValue(1, strict);
}
}
return errorValue(0, strict);
@ -303,29 +306,23 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
uint8_t c=s[i];
if(U8_IS_TRAIL(c) && i>start) {
uint8_t b1=s[--i];
if(0xc2<=b1 && b1<0xe0) {
return i;
if(U8_IS_LEAD(b1)) {
if(b1<0xe0 ||
(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
return i;
}
} else if(U8_IS_TRAIL(b1) && i>start) {
uint8_t b2=s[--i];
if(0xe0<=b2 && b2<0xf0) {
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
if(0xe0<=b2 && b2<=0xf4) {
if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
return i;
}
} else if(U8_IS_TRAIL(b2) && i>start) {
uint8_t b3=s[--i];
if(0xf0<=b3 && b3<=0xf4) {
if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
return i;
}
if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
return i;
}
} else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
// Truncated 4-byte sequence.
return i;
}
} else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
(0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
// Truncated 3- or 4-byte sequence.
return i;
}
}
return orig_i;

View file

@ -94,6 +94,7 @@ static void TestFwdBack(void);
static void TestFwdBackUnsafe(void);
static void TestSetChar(void);
static void TestSetCharUnsafe(void);
static void TestTruncateIfIncomplete(void);
static void TestAppendChar(void);
static void TestAppend(void);
static void TestSurrogates(void);
@ -114,6 +115,7 @@ addUTF8Test(TestNode** root)
addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe");
addTest(root, &TestSetChar, "utf8tst/TestSetChar");
addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe");
addTest(root, &TestTruncateIfIncomplete, "utf8tst/TestTruncateIfIncomplete");
addTest(root, &TestAppendChar, "utf8tst/TestAppendChar");
addTest(root, &TestAppend, "utf8tst/TestAppend");
addTest(root, &TestSurrogates, "utf8tst/TestSurrogates");
@ -927,6 +929,64 @@ static void TestSetCharUnsafe() {
}
}
static void TestTruncateIfIncomplete() {
    // Table-driven check of U8_TRUNCATE_IF_INCOMPLETE(): every entry holds a
    // NUL-terminated byte string and the length the macro must leave behind
    // when applied to the whole string with start=0.
    //
    // How this differs from U8_SET_CP_START():
    // U8_TRUNCATE_IF_INCOMPLETE() never inspects s[length].
    // So when the final byte is a lead byte, the macro truncates regardless
    // of whether the byte at the input index could continue a valid sequence
    // (that byte need not even be a trail byte).
    // When the final byte is a trail byte, both macros give the same result.
    static const struct {
        const char *s;
        int32_t expected;
    } cases[] = {
        { "", 0 },
        { "a", 1 },
        { "\x80", 1 },
        { "\xC1", 1 },
        { "\xC2", 0 },
        { "\xE0", 0 },
        { "\xF4", 0 },
        { "\xF5", 1 },
        { "\x80\x80", 2 },
        { "\xC2\xA0", 2 },
        { "\xE0\x9F", 2 },
        { "\xE0\xA0", 0 },
        { "\xED\x9F", 0 },
        { "\xED\xA0", 2 },
        { "\xF0\x8F", 2 },
        { "\xF0\x90", 0 },
        { "\xF4\x8F", 0 },
        { "\xF4\x90", 2 },
        { "\xF5\x80", 2 },
        { "\x80\x80\x80", 3 },
        { "\xC2\xA0\x80", 3 },
        { "\xE0\xA0\x80", 3 },
        { "\xF0\x8F\x80", 3 },
        { "\xF0\x90\x80", 0 },
        { "\xF4\x8F\x80", 0 },
        { "\xF4\x90\x80", 3 },
        { "\xF5\x80\x80", 3 },
        { "\x80\x80\x80\x80", 4 },
        { "\xC2\xA0\x80\x80", 4 },
        { "\xE0\xA0\x80\x80", 4 },
        { "\xF0\x90\x80\x80", 4 },
        { "\xF5\x80\x80\x80", 4 }
    };
    int32_t i;
    for (i = 0; i < UPRV_LENGTHOF(cases); ++i) {
        const char *bytes = cases[i].s;
        int32_t fullLength = (int32_t)strlen(bytes);
        // The macro updates its length argument in place, so work on a copy
        // and keep the original length for the failure message.
        int32_t truncated = fullLength;
        U8_TRUNCATE_IF_INCOMPLETE(bytes, 0, truncated);
        if (truncated == cases[i].expected) {
            continue;
        }
        log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
                (int)i, (int)fullLength, (int)cases[i].expected, (int)truncated);
    }
}
static void TestAppendChar(){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};