Fix UTF-8 validity checks to not do unaligned reads.
This commit is contained in:
parent
de74779454
commit
b221008884
2 changed files with 60 additions and 35 deletions
|
@ -371,36 +371,44 @@ int UTF8GenericScan(const UTF8ScanObj* st,
|
|||
// Do state-table scan
|
||||
int e = 0;
|
||||
uint8 c;
|
||||
|
||||
// Do fast for groups of 8 identity bytes.
|
||||
// This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
|
||||
// including slowing slightly on cr/lf/ht
|
||||
//----------------------------
|
||||
const uint8* Tbl2 = &st->fast_state[0];
|
||||
uint32 losub = st->losub;
|
||||
uint32 hiadd = st->hiadd;
|
||||
while (src < srclimit8) {
|
||||
uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
|
||||
uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
|
||||
src += 8;
|
||||
// This is a fast range check for all bytes in [losub..0x80-hiadd)
|
||||
uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
|
||||
(s4567 - losub) | (s4567 + hiadd);
|
||||
if ((temp & 0x80808080) != 0) {
|
||||
// We typically end up here on cr/lf/ht; src was incremented
|
||||
int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
|
||||
(Tbl2[src[-6]] | Tbl2[src[-5]]);
|
||||
if (e0123 != 0) {
|
||||
src -= 8;
|
||||
break;
|
||||
} // Exit on Non-interchange
|
||||
e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
|
||||
(Tbl2[src[-2]] | Tbl2[src[-1]]);
|
||||
if (e0123 != 0) {
|
||||
src -= 4;
|
||||
break;
|
||||
} // Exit on Non-interchange
|
||||
// Else OK, go around again
|
||||
const uint32 losub = st->losub;
|
||||
const uint32 hiadd = st->hiadd;
|
||||
// Check initial few bytes one at a time until 8-byte aligned
|
||||
//----------------------------
|
||||
while ((((uintptr_t)src & 0x07) != 0) &&
|
||||
(src < srclimit) &&
|
||||
Tbl2[src[0]] == 0) {
|
||||
src++;
|
||||
}
|
||||
if (((uintptr_t)src & 0x07) == 0) {
|
||||
// Do fast for groups of 8 identity bytes.
|
||||
// This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
|
||||
// including slowing slightly on cr/lf/ht
|
||||
//----------------------------
|
||||
while (src < srclimit8) {
|
||||
uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
|
||||
uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
|
||||
src += 8;
|
||||
// This is a fast range check for all bytes in [losub..0x80-hiadd)
|
||||
uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
|
||||
(s4567 - losub) | (s4567 + hiadd);
|
||||
if ((temp & 0x80808080) != 0) {
|
||||
// We typically end up here on cr/lf/ht; src was incremented
|
||||
int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
|
||||
(Tbl2[src[-6]] | Tbl2[src[-5]]);
|
||||
if (e0123 != 0) {
|
||||
src -= 8;
|
||||
break;
|
||||
} // Exit on Non-interchange
|
||||
e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
|
||||
(Tbl2[src[-2]] | Tbl2[src[-1]]);
|
||||
if (e0123 != 0) {
|
||||
src -= 4;
|
||||
break;
|
||||
} // Exit on Non-interchange
|
||||
// Else OK, go around again
|
||||
}
|
||||
}
|
||||
}
|
||||
//----------------------------
|
||||
|
@ -470,10 +478,17 @@ int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
|
|||
int rest_consumed;
|
||||
int exit_reason;
|
||||
do {
|
||||
while ((src < srclimit8) &&
|
||||
(((reinterpret_cast<const uint32*>(src)[0] |
|
||||
reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
|
||||
src += 8;
|
||||
// Check initial few bytes one at a time until 8-byte aligned
|
||||
while ((((uintptr_t)src & 0x07) != 0) &&
|
||||
(src < srclimit) && (src[0] < 0x80)) {
|
||||
src++;
|
||||
}
|
||||
if (((uintptr_t)src & 0x07) == 0) {
|
||||
while ((src < srclimit8) &&
|
||||
(((reinterpret_cast<const uint32*>(src)[0] |
|
||||
reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
|
||||
src += 8;
|
||||
}
|
||||
}
|
||||
while ((src < srclimit) && (src[0] < 0x80)) {
|
||||
src++;
|
||||
|
|
|
@ -13,15 +13,25 @@ TEST(StructurallyValidTest, ValidUTF8String) {
|
|||
// On GCC, this string can be written as:
|
||||
// "abcd 1234 - \u2014\u2013\u2212"
|
||||
// MSVC seems to interpret \u differently.
|
||||
string valid_str("abcd 1234 - \342\200\224\342\200\223\342\210\222");
|
||||
string valid_str("abcd 1234 - \342\200\224\342\200\223\342\210\222 - xyz789");
|
||||
EXPECT_TRUE(IsStructurallyValidUTF8(valid_str.data(),
|
||||
valid_str.size()));
|
||||
// Additional check for pointer alignment
|
||||
for (int i = 1; i < 8; ++i) {
|
||||
EXPECT_TRUE(IsStructurallyValidUTF8(valid_str.data() + i,
|
||||
valid_str.size() - i));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(StructurallyValidTest, InvalidUTF8String) {
|
||||
string invalid_str("\xA0\xB0");
|
||||
const string invalid_str("abcd\xA0\xB0\xA0\xB0\xA0\xB0 - xyz789");
|
||||
EXPECT_FALSE(IsStructurallyValidUTF8(invalid_str.data(),
|
||||
invalid_str.size()));
|
||||
// Additional check for pointer alignment
|
||||
for (int i = 1; i < 8; ++i) {
|
||||
EXPECT_FALSE(IsStructurallyValidUTF8(invalid_str.data() + i,
|
||||
invalid_str.size() - i));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
|
Loading…
Add table
Reference in a new issue