From 13837e107976e390515ace747362131e303c10f8 Mon Sep 17 00:00:00 2001 From: Dave Reid Date: Wed, 24 Apr 2013 08:51:17 +1000 Subject: [PATCH] Remove the SSE3 implementation in operator*(fquatSIMD, fquatSIMD). The SSE2 version is now running faster than the SSE3 version. --- glm/gtx/simd_quat.inl | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl index 25848a57..9502a6ab 100644 --- a/glm/gtx/simd_quat.inl +++ b/glm/gtx/simd_quat.inl @@ -115,11 +115,6 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const & // 8 mul // 8 add - // SSE3 STATS: - // 3 shuffle - // 8 mul - // 8 add - // SSE4 STATS: // 3 shuffle // 4 mul @@ -136,33 +131,23 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const & __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff); __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff); # else - mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f)); - mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f)); - mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f)); - mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f)); + mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f)); + __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0)); + add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1)); -# if((GLM_ARCH & GLM_ARCH_SSE3)) - __m128 add0 = _mm_hadd_ps(mul0, mul0); - add0 = _mm_hadd_ps(add0, add0); - __m128 add1 = _mm_hadd_ps(mul1, mul1); - add1 = _mm_hadd_ps(add1, add1); - __m128 add2 = _mm_hadd_ps(mul2, mul2); - add2 = _mm_hadd_ps(add2, add2); - __m128 add3 = _mm_hadd_ps(mul3, mul3); - add3 = _mm_hadd_ps(add3, add3); -# else - __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0)); - add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1)); - __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1)); - add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1)); - __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2)); - add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1)); - __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3)); - add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1)); -# endif + mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f)); + __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1)); + add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1)); + + mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f)); + __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2)); + add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1)); + + mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f)); + __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3)); + add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1)); #endif - // I had tried something clever here using shuffles to produce the final result, but it turns out that using // _mm_store_* is consistently quicker in my tests. I've kept the shuffling code below just in case.