mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-13515 UTF-8 macro: reduce length of string if it ends with an incomplete sequence
X-SVN-Rev: 40883
This commit is contained in:
parent
64aa4beb28
commit
19b494f953
4 changed files with 160 additions and 80 deletions
|
@ -696,36 +696,20 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
// Use a single counter for source and target, counting the minimum of
|
||||
// the source length and the target capacity.
|
||||
// Let the standard converter handle edge cases.
|
||||
const uint8_t *limit=sourceLimit;
|
||||
if(count>targetCapacity) {
|
||||
limit-=(count-targetCapacity);
|
||||
count=targetCapacity;
|
||||
}
|
||||
|
||||
// The conversion loop checks count>0 only once per 1/2/3-byte character.
|
||||
// If the buffer ends with a truncated 2- or 3-byte sequence,
|
||||
// The conversion loop checks count>0 only once per character.
|
||||
// If the buffer ends with a truncated sequence,
|
||||
// then we reduce the count to stop before that,
|
||||
// and collect the remaining bytes after the conversion loop.
|
||||
{
|
||||
// Do not go back into the bytes that will be read for finishing a partial
|
||||
// sequence from the previous buffer.
|
||||
int32_t length=count-toULimit;
|
||||
if(length>0) {
|
||||
uint8_t b1=*(limit-1);
|
||||
if(U8_IS_SINGLE(b1)) {
|
||||
// common ASCII character
|
||||
} else if(U8_IS_TRAIL(b1) && length>=2) {
|
||||
uint8_t b2=*(limit-2);
|
||||
if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
// truncated 3-byte sequence
|
||||
count-=2;
|
||||
}
|
||||
} else if(0xc2<=b1 && b1<0xf0) {
|
||||
// truncated 2- or 3-byte sequence
|
||||
--count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Do not go back into the bytes that will be read for finishing a partial
|
||||
// sequence from the previous buffer.
|
||||
int32_t length=count-toULimit;
|
||||
U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
|
||||
count=toULimit+length;
|
||||
}
|
||||
|
||||
if(c!=0) {
|
||||
|
@ -815,7 +799,7 @@ moreBytes:
|
|||
}
|
||||
|
||||
/* copy the legal byte sequence to the target */
|
||||
if(count>=toULength) {
|
||||
{
|
||||
int8_t i;
|
||||
|
||||
for(i=0; i<oldToULength; ++i) {
|
||||
|
@ -826,14 +810,6 @@ moreBytes:
|
|||
*target++=*source++;
|
||||
}
|
||||
count-=toULength;
|
||||
} else {
|
||||
// A supplementary character that does not fit into the target.
|
||||
// Let the standard converter handle this.
|
||||
source-=(toULength-oldToULength);
|
||||
pToUArgs->source=(char *)source;
|
||||
pFromUArgs->target=(char *)target;
|
||||
*pErrorCode=U_USING_DEFAULT_WARNING;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -857,8 +833,7 @@ moreBytes:
|
|||
utf8->toULength=toULength;
|
||||
utf8->mode=toULimit;
|
||||
break;
|
||||
} else if(!U8_IS_TRAIL(b=*source)) {
|
||||
/* lead byte in trail byte position */
|
||||
} else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
|
||||
utf8->toULength=toULength;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
|
|
|
@ -592,12 +592,15 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
* If the offset points to a UTF-8 trail byte,
|
||||
* then the offset is moved backward to the corresponding lead byte.
|
||||
* Otherwise, it is not modified.
|
||||
*
|
||||
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||
* Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param start int32_t starting string offset (usually 0)
|
||||
* @param i int32_t string offset, must be start<=i
|
||||
* @see U8_SET_CP_START_UNSAFE
|
||||
* @see U8_TRUNCATE_IF_INCOMPLETE
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
#define U8_SET_CP_START(s, start, i) { \
|
||||
|
@ -606,6 +609,51 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
} \
|
||||
}
|
||||
|
||||
/**
 * If the string ends with a UTF-8 byte sequence that is valid so far
 * but incomplete, then reduce the length of the string to end before
 * the lead byte of that incomplete sequence.
 * For example, if the string ends with E1 80, the length is reduced by 2.
 *
 * Useful for processing text split across multiple buffers
 * (save the incomplete sequence for later)
 * and for optimizing iteration
 * (check for string length only once per character).
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_SET_CP_START(), this macro never reads s[length].
 *
 * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
 *
 * The macro expands to exactly one statement (a do/while(0) block), so it
 * can be used as the sole body of an if/else or a loop; terminate the
 * invocation with a semicolon.
 * The arguments are evaluated more than once, so they must not have side
 * effects, and length must be a modifiable lvalue.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param length int32_t string length, must be start<=length
 * @see U8_SET_CP_START
 * @draft ICU 61
 */
#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) do { \
    if((length)>(start)) { \
        /* Look at the last byte; nothing to do if the string is empty. */ \
        uint8_t __b1=(s)[(length)-1]; \
        if(U8_IS_SINGLE(__b1)) { \
            /* common ASCII character */ \
        } else if(U8_IS_LEAD(__b1)) { \
            /* A lead byte at the very end always starts an incomplete sequence. */ \
            --(length); \
        } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
            uint8_t __b2=(s)[(length)-2]; \
            if(0xe0<=__b2 && __b2<=0xf4) { \
                /* 3- or 4-byte lead followed by only one (valid) trail byte. */ \
                if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
                        U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
                    (length)-=2; \
                } \
            } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
                /* Two trail bytes: only a 4-byte lead can still be incomplete. */ \
                uint8_t __b3=(s)[(length)-3]; \
                if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
                    (length)-=3; \
                } \
            } \
        } \
    } \
} while(0)
|
||||
|
||||
/* definitions with backward iteration -------------------------------------- */
|
||||
|
||||
/**
|
||||
|
|
|
@ -238,33 +238,45 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
|
|||
int32_t i=*pi;
|
||||
if(U8_IS_TRAIL(c) && i>start) {
|
||||
uint8_t b1=s[--i];
|
||||
if(0xc2<=b1 && b1<0xe0) {
|
||||
*pi=i;
|
||||
return ((b1-0xc0)<<6)|(c&0x3f);
|
||||
if(U8_IS_LEAD(b1)) {
|
||||
if(b1<0xe0) {
|
||||
*pi=i;
|
||||
return ((b1-0xc0)<<6)|(c&0x3f);
|
||||
} else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
|
||||
// Truncated 3- or 4-byte sequence.
|
||||
*pi=i;
|
||||
return errorValue(1, strict);
|
||||
}
|
||||
} else if(U8_IS_TRAIL(b1) && i>start) {
|
||||
// Extract the value bits from the last trail byte.
|
||||
c&=0x3f;
|
||||
uint8_t b2=s[--i];
|
||||
if(0xe0<=b2 && b2<0xf0) {
|
||||
b2&=0xf;
|
||||
if(strict!=-2) {
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
*pi=i;
|
||||
c=(b2<<12)|((b1&0x3f)<<6)|c;
|
||||
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
|
||||
return c;
|
||||
} else {
|
||||
// strict: forbid non-characters like U+fffe
|
||||
return errorValue(2, strict);
|
||||
if(0xe0<=b2 && b2<=0xf4) {
|
||||
if(b2<0xf0) {
|
||||
b2&=0xf;
|
||||
if(strict!=-2) {
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
*pi=i;
|
||||
c=(b2<<12)|((b1&0x3f)<<6)|c;
|
||||
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
|
||||
return c;
|
||||
} else {
|
||||
// strict: forbid non-characters like U+fffe
|
||||
return errorValue(2, strict);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// strict=-2 -> lenient: allow surrogates
|
||||
b1-=0x80;
|
||||
if((b2>0 || b1>=0x20)) {
|
||||
*pi=i;
|
||||
return (b2<<12)|(b1<<6)|c;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// strict=-2 -> lenient: allow surrogates
|
||||
b1-=0x80;
|
||||
if((b2>0 || b1>=0x20)) {
|
||||
*pi=i;
|
||||
return (b2<<12)|(b1<<6)|c;
|
||||
}
|
||||
} else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||||
// Truncated 4-byte sequence.
|
||||
*pi=i;
|
||||
return errorValue(2, strict);
|
||||
}
|
||||
} else if(U8_IS_TRAIL(b2) && i>start) {
|
||||
uint8_t b3=s[--i];
|
||||
|
@ -281,16 +293,7 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
|
|||
}
|
||||
}
|
||||
}
|
||||
} else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||||
// Truncated 4-byte sequence.
|
||||
*pi=i;
|
||||
return errorValue(2, strict);
|
||||
}
|
||||
} else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
|
||||
(0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
|
||||
// Truncated 3- or 4-byte sequence.
|
||||
*pi=i;
|
||||
return errorValue(1, strict);
|
||||
}
|
||||
}
|
||||
return errorValue(0, strict);
|
||||
|
@ -303,29 +306,23 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
|
|||
uint8_t c=s[i];
|
||||
if(U8_IS_TRAIL(c) && i>start) {
|
||||
uint8_t b1=s[--i];
|
||||
if(0xc2<=b1 && b1<0xe0) {
|
||||
return i;
|
||||
if(U8_IS_LEAD(b1)) {
|
||||
if(b1<0xe0 ||
|
||||
(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
|
||||
return i;
|
||||
}
|
||||
} else if(U8_IS_TRAIL(b1) && i>start) {
|
||||
uint8_t b2=s[--i];
|
||||
if(0xe0<=b2 && b2<0xf0) {
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
if(0xe0<=b2 && b2<=0xf4) {
|
||||
if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||||
return i;
|
||||
}
|
||||
} else if(U8_IS_TRAIL(b2) && i>start) {
|
||||
uint8_t b3=s[--i];
|
||||
if(0xf0<=b3 && b3<=0xf4) {
|
||||
if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
|
||||
return i;
|
||||
}
|
||||
if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
|
||||
return i;
|
||||
}
|
||||
} else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||||
// Truncated 4-byte sequence.
|
||||
return i;
|
||||
}
|
||||
} else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
|
||||
(0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
|
||||
// Truncated 3- or 4-byte sequence.
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return orig_i;
|
||||
|
|
|
@ -94,6 +94,7 @@ static void TestFwdBack(void);
|
|||
static void TestFwdBackUnsafe(void);
|
||||
static void TestSetChar(void);
|
||||
static void TestSetCharUnsafe(void);
|
||||
static void TestTruncateIfIncomplete(void);
|
||||
static void TestAppendChar(void);
|
||||
static void TestAppend(void);
|
||||
static void TestSurrogates(void);
|
||||
|
@ -114,6 +115,7 @@ addUTF8Test(TestNode** root)
|
|||
addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe");
|
||||
addTest(root, &TestSetChar, "utf8tst/TestSetChar");
|
||||
addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe");
|
||||
addTest(root, &TestTruncateIfIncomplete, "utf8tst/TestTruncateIfIncomplete");
|
||||
addTest(root, &TestAppendChar, "utf8tst/TestAppendChar");
|
||||
addTest(root, &TestAppend, "utf8tst/TestAppend");
|
||||
addTest(root, &TestSurrogates, "utf8tst/TestSurrogates");
|
||||
|
@ -927,6 +929,64 @@ static void TestSetCharUnsafe() {
|
|||
}
|
||||
}
|
||||
|
||||
static void TestTruncateIfIncomplete() {
    // Table-driven check of U8_TRUNCATE_IF_INCOMPLETE().
    //
    // How it differs from U8_SET_CP_START():
    // U8_TRUNCATE_IF_INCOMPLETE() does not look at s[length].
    // Consequently, when the last byte is a lead byte, this macro truncates
    // even if the byte at the input index cannot continue a valid sequence
    // (including when that byte is not a trail byte).
    // When the last byte is a trail byte, the two macros behave the same.
    //
    // Each entry pairs a NUL-terminated byte string with the length that is
    // expected after the macro has been applied to the full string.
    static const struct {
        const char *bytes;
        int32_t wantLength;
    } testData[] = {
        { "", 0 },
        { "a", 1 },
        { "\x80", 1 },
        { "\xC1", 1 },
        { "\xC2", 0 },
        { "\xE0", 0 },
        { "\xF4", 0 },
        { "\xF5", 1 },
        { "\x80\x80", 2 },
        { "\xC2\xA0", 2 },
        { "\xE0\x9F", 2 },
        { "\xE0\xA0", 0 },
        { "\xED\x9F", 0 },
        { "\xED\xA0", 2 },
        { "\xF0\x8F", 2 },
        { "\xF0\x90", 0 },
        { "\xF4\x8F", 0 },
        { "\xF4\x90", 2 },
        { "\xF5\x80", 2 },
        { "\x80\x80\x80", 3 },
        { "\xC2\xA0\x80", 3 },
        { "\xE0\xA0\x80", 3 },
        { "\xF0\x8F\x80", 3 },
        { "\xF0\x90\x80", 0 },
        { "\xF4\x8F\x80", 0 },
        { "\xF4\x90\x80", 3 },
        { "\xF5\x80\x80", 3 },
        { "\x80\x80\x80\x80", 4 },
        { "\xC2\xA0\x80\x80", 4 },
        { "\xE0\xA0\x80\x80", 4 },
        { "\xF0\x90\x80\x80", 4 },
        { "\xF5\x80\x80\x80", 4 }
    };
    int32_t idx = 0;
    while (idx < UPRV_LENGTHOF(testData)) {
        const char *text = testData[idx].bytes;
        int32_t fullLength = (int32_t)strlen(text);
        int32_t truncated = fullLength;
        U8_TRUNCATE_IF_INCOMPLETE(text, 0, truncated);
        if (truncated != testData[idx].wantLength) {
            log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
                    (int)idx, (int)fullLength, (int)testData[idx].wantLength, (int)truncated);
        }
        ++idx;
    }
}
|
||||
|
||||
static void TestAppendChar(){
|
||||
#if !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
|
||||
|
|
Loading…
Add table
Reference in a new issue