Merge feature/simd-strlen into master as a single commit

This commit is contained in:
Kraionix 2025-03-01 14:47:26 +07:00 committed by ocornut
parent 707ba8c472
commit c67c2026fd
2 changed files with 149 additions and 1 deletions

146
imgui.cpp
View file

@ -1960,6 +1960,152 @@ ImVec2 ImTriangleClosestPoint(const ImVec2& a, const ImVec2& b, const ImVec2& c,
// [SECTION] MISC HELPERS/UTILITIES (String, Format, Hash functions)
//-----------------------------------------------------------------------------
#if defined IMGUI_ENABLE_AVX2_IMSTRLEN
size_t ImStrlen(const char* str)
{
const size_t SIMD_LENGTH = 32;
const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1;
const unsigned char* begin = (unsigned char*)str;
const unsigned char* ptr = begin;
// first page
{
const size_t PAGE_LENGTH = 4096;
const size_t PAGE_LENGTH_MASK = PAGE_LENGTH - 1;
const unsigned char* page_end = (const unsigned char*)_andn_u64(PAGE_LENGTH_MASK, (uintptr_t)ptr + PAGE_LENGTH_MASK);
const unsigned char* align_page_end = (const unsigned char*)(page_end - SIMD_LENGTH);
// if ptr is far the end of page
if (ptr <= align_page_end)
{
__m256i target = _mm256_setzero_si256();
// if ptr not aligned, align ptr to SIMD_LENGTH
if ((uintptr_t)ptr & SIMD_LENGTH_MASK)
{
__m256i chunk = _mm256_lddqu_si256((const __m256i*)ptr);
int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target));
if (mask)
return (uintptr_t)(ptr - begin + _tzcnt_u32(mask));
ptr = (const unsigned char*)_andn_u64(SIMD_LENGTH_MASK, (uintptr_t)ptr + SIMD_LENGTH_MASK);
}
// main loop of first page
for (; ptr <= align_page_end; ptr += SIMD_LENGTH)
{
__m256i chunk = _mm256_load_si256((const __m256i*)ptr);
int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target));
if (mask)
return (uintptr_t)(ptr - begin + _tzcnt_u32(mask));
_mm_prefetch((const char*)ptr + 1024, _MM_HINT_T0);
}
}
// if ptr is near the end of page
for (; ptr < page_end; ptr++)
{
if (!(*ptr))
return (uintptr_t)(ptr - begin);
}
}
__m256i target = _mm256_setzero_si256();
// main loop
for (; ; ptr += SIMD_LENGTH)
{
__m256i chunk = _mm256_load_si256((const __m256i*)ptr);
int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target));
if (mask)
return (uintptr_t)(ptr - begin + _tzcnt_u32(mask));
_mm_prefetch((const char*)ptr + 1024, _MM_HINT_T0);
}
}
#elif defined IMGUI_ENABLE_SSE_IMSTRLEN
size_t ImStrlen(const char* str)
{
const size_t SIMD_LENGTH = 16;
const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1;
const unsigned char* begin = (unsigned char*)str;
const unsigned char* ptr = begin;
const unsigned char ch = '\0';
// first page
{
const size_t PAGE_LENGTH = 4096;
const size_t PAGE_LENGTH_MASK = PAGE_LENGTH - 1;
const unsigned char* page_end = (const unsigned char*)(((uintptr_t)ptr + PAGE_LENGTH_MASK) & ~PAGE_LENGTH_MASK);
const unsigned char* align_page_end = (const unsigned char*)(page_end - SIMD_LENGTH);
// if ptr is far the end of page
if (ptr <= align_page_end)
{
__m128i target = _mm_set1_epi8(ch);
// if ptr not aligned, align ptr to SIMD_LENGTH
if ((uintptr_t)ptr & SIMD_LENGTH_MASK)
{
__m128i chunk = _mm_lddqu_si128((const __m128i*)ptr);
int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target));
if (mask)
return (uintptr_t)(ptr + _tzcnt_u32(mask) - begin);
ptr = (const unsigned char*)(((uintptr_t)ptr + SIMD_LENGTH_MASK) & ~SIMD_LENGTH_MASK);
}
// main loop of first page
for (; ptr <= align_page_end; ptr += SIMD_LENGTH)
{
__m128i chunk = _mm_load_si128((const __m128i*)ptr);
int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target));
if (mask)
return (uintptr_t)(ptr + _tzcnt_u32(mask) - begin);
_mm_prefetch((const char*)ptr + 1024, _MM_HINT_T0);
}
}
// if ptr is near the end of page
for (; ptr < page_end; ptr++)
{
if (*ptr == ch)
return (uintptr_t)(ptr - begin);
}
}
__m128i target = _mm_set1_epi8(ch);
// main loop
for (; ; ptr += SIMD_LENGTH)
{
__m128i chunk = _mm_load_si128((const __m128i*)ptr);
int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target));
if (mask)
return (uintptr_t)(ptr + _tzcnt_u32(mask) - begin);
_mm_prefetch((const char*)ptr + 1024, _MM_HINT_T0);
}
}
#else
size_t ImStrlen(const char* str)
{
return strlen(str);
}
#endif
#if defined IMGUI_ENABLE_AVX2_IMMEMCHR
const void* ImMemchr(const void* buf, int val, size_t count)
{

View file

@ -90,8 +90,10 @@ Index of this file:
// Only AVX2 supports integer and byte instructions for 256-bit registers. Implementation this on AVX1 is not possible.
#if defined(IMGUI_ENABLE_AVX2)
#define IMGUI_ENABLE_AVX2_IMSTRLEN
#define IMGUI_ENABLE_AVX2_IMMEMCHR
#elif defined(IMGUI_ENABLE_AVX) || defined(IMGUI_ENABLE_SSE)
#define IMGUI_ENABLE_SSE_IMSTRLEN
#define IMGUI_ENABLE_SSE_IMMEMCHR
#endif
@ -395,7 +397,7 @@ static inline bool ImIsPowerOfTwo(ImU64 v) { return v != 0 && (v &
static inline int ImUpperPowerOfTwo(int v) { v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; return v; }
// Helpers: String
#define ImStrlen strlen
IMGUI_API size_t ImStrlen(const char* str); // Compute the length of a null-terminated string.
IMGUI_API const void* ImMemchr(const void* buf, int val, size_t count); // Find first occurrence of 'val' in buffer given length.
IMGUI_API int ImStricmp(const char* str1, const char* str2); // Case insensitive compare.
IMGUI_API int ImStrnicmp(const char* str1, const char* str2, size_t count); // Case insensitive compare to a certain count.