Remove the AVX point-update helpers (updatePointsAVX, updatePointsAVXscale)
from hb-ot-var-hvac-table.hh together with their call site in
GlyphVariationDelta.  The call site ended in an unconditional "return;", so
the "if (scalar == 1...)" loop below it was unreachable; with the call site
gone that loop is reached again.  The float literals in the two scalar tests
change from 0.f / 1.f to 0 / 1.

Review notes on this patch text:
  * It arrived with all records run together on a few physical lines; one
    record per line is restored here.  Indentation inside the records is
    reconstructed, not recovered: check it against the pre-image or apply
    with whitespace tolerance.
  * The operand of the removed #include was missing.  <immintrin.h> is
    assumed because the removed code uses __m256 and _mm256_* names; confirm
    against the pre-image.
  * Hunk 1 is laid out with "namespace OT {" and two blank lines as leading
    context.  That is one reading that satisfies the 118/6 line counts (the
    other is "namespace OT {" as the @@ section heading followed by three
    blank context lines); confirm against the pre-image.

diff --git a/src/hb-ot-var-hvac-table.hh b/src/hb-ot-var-hvac-table.hh
index f7be4c617..9fc5ad47f 100644
--- a/src/hb-ot-var-hvac-table.hh
+++ b/src/hb-ot-var-hvac-table.hh
@@ -37,118 +37,6 @@
 namespace OT {
 
 
-#include <immintrin.h>
-
-static inline void updatePointsAVXscale(const float* x, const float* y, float scalar, unsigned deltasCount, contour_point_t* points)
-{
-  __m256 scalarVec = _mm256_set1_ps(scalar);
-
-  unsigned i = 0;
-
-  // Process 8 elements at a time with AVX
-  for (; i <= deltasCount - 8; i += 8)
-  {
-    // Load 8 elements from x and y arrays
-    __m256 xVec = _mm256_loadu_ps(&x[i]);
-    __m256 yVec = _mm256_loadu_ps(&y[i]);
-
-    // Gather point x and y values
-    __m256 pointXVec = _mm256_set_ps(points[i+7].x, points[i+6].x, points[i+5].x, points[i+4].x,
-                                     points[i+3].x, points[i+2].x, points[i+1].x, points[i].x);
-    __m256 pointYVec = _mm256_set_ps(points[i+7].y, points[i+6].y, points[i+5].y, points[i+4].y,
-                                     points[i+3].y, points[i+2].y, points[i+1].y, points[i].y);
-
-    // Multiply x and y vectors by the scalar
-    xVec = _mm256_mul_ps(xVec, scalarVec);
-    yVec = _mm256_mul_ps(yVec, scalarVec);
-
-    // Add the scaled x and y to the point coordinates
-    pointXVec = _mm256_add_ps(pointXVec, xVec);
-    pointYVec = _mm256_add_ps(pointYVec, yVec);
-
-    // Store the updated coordinates back to the points array
-    points[i].x = _mm256_cvtss_f32(pointXVec);
-    points[i].y = _mm256_cvtss_f32(pointYVec);
-
-    points[i+1].x = _mm256_cvtss_f32(_mm256_permute_ps(pointXVec, _MM_SHUFFLE(0, 0, 0, 1)));
-    points[i+1].y = _mm256_cvtss_f32(_mm256_permute_ps(pointYVec, _MM_SHUFFLE(0, 0, 0, 1)));
-
-    points[i+2].x = _mm256_cvtss_f32(_mm256_permute_ps(pointXVec, _MM_SHUFFLE(0, 0, 0, 2)));
-    points[i+2].y = _mm256_cvtss_f32(_mm256_permute_ps(pointYVec, _MM_SHUFFLE(0, 0, 0, 2)));
-
-    points[i+3].x = _mm256_cvtss_f32(_mm256_permute_ps(pointXVec, _MM_SHUFFLE(0, 0, 0, 3)));
-    points[i+3].y = _mm256_cvtss_f32(_mm256_permute_ps(pointYVec, _MM_SHUFFLE(0, 0, 0, 3)));
-
-    points[i+4].x = _mm256_cvtss_f32(_mm256_permute_ps(pointXVec, _MM_SHUFFLE(0, 0, 0, 4)));
-    points[i+4].y = _mm256_cvtss_f32(_mm256_permute_ps(pointYVec, _MM_SHUFFLE(0, 0, 0, 4)));
-
-    points[i+5].x = _mm256_cvtss_f32(_mm256_permute_ps(pointXVec, _MM_SHUFFLE(0, 0, 0, 5)));
-    points[i+5].y = _mm256_cvtss_f32(_mm256_permute_ps(pointYVec, _MM_SHUFFLE(0, 0, 0, 5)));
-
-    points[i+6].x = _mm256_cvtss_f32(_mm256_permute_ps(pointXVec, _MM_SHUFFLE(0, 0, 0, 6)));
-    points[i+6].y = _mm256_cvtss_f32(_mm256_permute_ps(pointYVec, _MM_SHUFFLE(0, 0, 0, 6)));
-
-    points[i+7].x = _mm256_cvtss_f32(_mm256_permute_ps(pointXVec, _MM_SHUFFLE(0, 0, 0, 7)));
-    points[i+7].y = _mm256_cvtss_f32(_mm256_permute_ps(pointYVec, _MM_SHUFFLE(0, 0, 0, 7)));
-  }
-
-  // Process remaining elements
-  for (; i < deltasCount; i++)
-  {
-    points[i].x += x[i] * scalar;
-    points[i].y += y[i] * scalar;
-  }
-}
-
-static inline void updatePointsAVX(const float* x, const float* y, unsigned deltasCount, contour_point_t* points)
-{
-  unsigned i = 0;
-
-  // Process 8 elements at a time with AVX
-  for (; i <= deltasCount - 8; i += 8)
-  {
-    // Load 8 elements from x and y arrays
-    __m256 xVec = _mm256_loadu_ps(&x[i]);
-    __m256 yVec = _mm256_loadu_ps(&y[i]);
-
-    // Gather point x and y values into AVX vectors
-    __m256 pointXVec = _mm256_set_ps(points[i+7].x, points[i+6].x, points[i+5].x, points[i+4].x,
-                                     points[i+3].x, points[i+2].x, points[i+1].x, points[i].x);
-    __m256 pointYVec = _mm256_set_ps(points[i+7].y, points[i+6].y, points[i+5].y, points[i+4].y,
-                                     points[i+3].y, points[i+2].y, points[i+1].y, points[i].y);
-
-    // Add x and y values to the point coordinates
-    pointXVec = _mm256_add_ps(pointXVec, xVec);
-    pointYVec = _mm256_add_ps(pointYVec, yVec);
-
-    // Scatter the updated coordinates back to the points array
-    points[i].x = ((float*)&pointXVec)[0];
-    points[i].y = ((float*)&pointYVec)[0];
-    points[i+1].x = ((float*)&pointXVec)[1];
-    points[i+1].y = ((float*)&pointYVec)[1];
-    points[i+2].x = ((float*)&pointXVec)[2];
-    points[i+2].y = ((float*)&pointYVec)[2];
-    points[i+3].x = ((float*)&pointXVec)[3];
-    points[i+3].y = ((float*)&pointYVec)[3];
-    points[i+4].x = ((float*)&pointXVec)[4];
-    points[i+4].y = ((float*)&pointYVec)[4];
-    points[i+5].x = ((float*)&pointXVec)[5];
-    points[i+5].y = ((float*)&pointYVec)[5];
-    points[i+6].x = ((float*)&pointXVec)[6];
-    points[i+6].y = ((float*)&pointYVec)[6];
-    points[i+7].x = ((float*)&pointXVec)[7];
-    points[i+7].y = ((float*)&pointYVec)[7];
-  }
-
-  // Process remaining elements
-  for (; i < deltasCount; i++)
-  {
-    points[i].x += x[i];
-    points[i].y += y[i];
-  }
-}
-
-
 struct GlyphVariationDelta
 {
   unsigned get_size (unsigned deltasCount) const
@@ -162,20 +50,12 @@ struct GlyphVariationDelta
                        const SparseVarRegionList &varRegionList) const
   {
     float scalar = varRegionList.evaluate (regionIndex, coords, coords.length);
-    if (scalar == 0.f)
+    if (scalar == 0)
       return;
 
     const float *x = deltasZ;
     const float *y = deltasZ + deltasCount;
-
-
-    if (scalar == 1.f)
-      updatePointsAVX(x, y, deltasCount, points.arrayZ);
-    else
-      updatePointsAVXscale(x, y, scalar, deltasCount, points.arrayZ);
-    return;
-
-    if (scalar == 1.f)
+    if (scalar == 1)
       for (unsigned i = 0; i < deltasCount; i++)
       {
         auto &point = points[i];