Review notes (lines ahead of the first "diff --git" header are not part of any hunk):
- The #include directives in test/gtx/gtx-simd-mat4.cpp carry no header names in
  this copy of the patch. They are reproduced as found; restore the names from
  the repository before applying.
- NOTE(review): the timing printf calls pass a std::clock_t difference to "%d";
  confirm that this matches clock_t on the target platforms.
- NOTE(review): main allocates 16M glm::mat4 plus four result vectors; confirm
  the test machines have the memory for it.

diff --git a/glm/core/intrinsic_matrix.inl b/glm/core/intrinsic_matrix.inl
index daa3d703..e1ded4cf 100644
--- a/glm/core/intrinsic_matrix.inl
+++ b/glm/core/intrinsic_matrix.inl
@@ -408,11 +408,83 @@ inline __m128 sse_slow_det_ps(__m128 const in[4])
 	return Det0;
 }
 
+// Determinant of the 4x4 matrix held in m[0]..m[3], following the scalar
+// cofactor expansion kept in the comments below. Every single-register
+// permutation is done with _mm_shuffle_epi32 on a bit-cast of the vector.
+// Returns sse_dot_ps(m[0], DetCof): m[0] dotted with its signed cofactors.
+inline __m128 sse_detd_ps
+(
+	__m128 const m[4]
+)
+{
+	// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(
+
+	//T SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
+	//T SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
+	//T SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
+	//T SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
+	//T SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
+	//T SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
+
+	// First 2 columns
+	__m128 Swp2A = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(0, 1, 1, 2)));
+	__m128 Swp3A = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(3, 2, 3, 3)));
+	__m128 MulA = _mm_mul_ps(Swp2A, Swp3A);
+
+	// Second 2 columns
+	__m128 Swp2B = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(3, 2, 3, 3)));
+	__m128 Swp3B = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(0, 1, 1, 2)));
+	__m128 MulB = _mm_mul_ps(Swp2B, Swp3B);
+
+	// Columns subtraction
+	__m128 SubE = _mm_sub_ps(MulA, MulB);
+
+	// Last 2 rows
+	__m128 Swp2C = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(0, 0, 1, 2)));
+	__m128 Swp3C = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(1, 2, 0, 0)));
+	__m128 MulC = _mm_mul_ps(Swp2C, Swp3C);
+	__m128 SubF = _mm_sub_ps(_mm_movehl_ps(MulC, MulC), MulC);
+
+	//detail::tvec4 DetCof(
+	//	+ (m[1][1] * SubFactor00 - m[1][2] * SubFactor01 + m[1][3] * SubFactor02),
+	//	- (m[1][0] * SubFactor00 - m[1][2] * SubFactor03 + m[1][3] * SubFactor04),
+	//	+ (m[1][0] * SubFactor01 - m[1][1] * SubFactor03 + m[1][3] * SubFactor05),
+	//	- (m[1][0] * SubFactor02 - m[1][1] * SubFactor04 + m[1][2] * SubFactor05));
+
+	__m128 SubFacA = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubE), _MM_SHUFFLE(2, 1, 0, 0)));
+	__m128 SwpFacA = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(0, 0, 0, 1)));
+	__m128 MulFacA = _mm_mul_ps(SwpFacA, SubFacA);
+
+	__m128 SubTmpB = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(0, 0, 3, 1));
+	__m128 SubFacB = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubTmpB), _MM_SHUFFLE(3, 1, 1, 0)));//SubF[0], SubE[3], SubE[3], SubE[1];
+	__m128 SwpFacB = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(1, 1, 2, 2)));
+	__m128 MulFacB = _mm_mul_ps(SwpFacB, SubFacB);
+
+	__m128 SubRes = _mm_sub_ps(MulFacA, MulFacB);
+
+	__m128 SubTmpC = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(1, 0, 2, 2));
+	__m128 SubFacC = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubTmpC), _MM_SHUFFLE(3, 3, 2, 0)));
+	__m128 SwpFacC = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(2, 3, 3, 3)));
+	__m128 MulFacC = _mm_mul_ps(SwpFacC, SubFacC);
+
+	__m128 AddRes = _mm_add_ps(SubRes, MulFacC);
+	__m128 DetCof = _mm_mul_ps(AddRes, _mm_setr_ps( 1.0f,-1.0f, 1.0f,-1.0f));
+
+	//return m[0][0] * DetCof[0]
+	//	 + m[0][1] * DetCof[1]
+	//	 + m[0][2] * DetCof[2]
+	//	 + m[0][3] * DetCof[3];
+
+	return sse_dot_ps(m[0], DetCof);
+}
+
 inline __m128 sse_det_ps
 (
 	__m128 const m[4]
 )
 {
+	// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(add)
+
 	//T SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
 	//T SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
 	//T SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
diff --git a/glm/gtx/simd_mat4.inl b/glm/gtx/simd_mat4.inl
index 6cbf771b..a2169b5e 100644
--- a/glm/gtx/simd_mat4.inl
+++ b/glm/gtx/simd_mat4.inl
@@ -10,6 +10,24 @@
 namespace glm{
 namespace detail
 {
+	// Size in bytes of one value_type element.
+	inline fmat4x4SIMD::size_type fmat4x4SIMD::value_size()
+	{
+		return sizeof(value_type);
+	}
+
+	// Column count reported for fmat4x4SIMD: always 4.
+	inline fmat4x4SIMD::size_type fmat4x4SIMD::col_size()
+	{
+		return 4;
+	}
+
+	// Row count reported for fmat4x4SIMD: always 4.
+	inline fmat4x4SIMD::size_type fmat4x4SIMD::row_size()
+	{
+		return 4;
+	}
+
 	inline fmat4x4SIMD::fmat4x4SIMD()
 	{}
 
diff --git a/test/gtx/gtx-simd-mat4.cpp b/test/gtx/gtx-simd-mat4.cpp
index d3ae03d0..2506fdd5 100644
--- a/test/gtx/gtx-simd-mat4.cpp
+++ b/test/gtx/gtx-simd-mat4.cpp
@@ -10,98 +10,115 @@
 #define GLM_INSTRUCTION_SET GLM_PLATFORM_SSE3
 #include
 #include
+#include
 #include
 #include
 #include
+#include
 
-void test_detA()
+// Times glm::determinant over Data and returns one determinant per matrix.
+std::vector<float> test_detA(std::vector<glm::mat4> const & Data)
 {
-	glm::mat4 Identity(
-		glm::vec4(4.0f, 0.7f, 0.1f, 0.01f),
-		glm::vec4(0.5f, 3.0f, 0.6f, 0.02f),
-		glm::vec4(0.2f, 0.4f, 2.0f, 0.03f),
-		glm::vec4(4.0f, 3.0f, 2.0f, 1.00f));
-
-	std::vector<float> Test(10000000);
+	std::vector<float> Test(Data.size());
 
 	std::clock_t TimeStart = clock();
 
 	for(std::size_t i = 0; i < Test.size(); ++i)
-		Test[i] = glm::determinant(Identity);
+		Test[i] = glm::determinant(Data[i]);
 
 	std::clock_t TimeEnd = clock();
 	printf("Det A: %d\n", TimeEnd - TimeStart);
+
+	return Test;
 }
 
-void test_detB()
+// Times glm::detail::sse_slow_det_ps over Data and returns lane x of each result.
+// The glm::simd_mat4 conversion of every input happens inside the timed loop.
+std::vector<float> test_detB(std::vector<glm::mat4> const & Data)
 {
-	glm::simd_mat4 IdentityB(
-		glm::simd_vec4(4.0f, 0.7f, 0.1f, 0.01f),
-		glm::simd_vec4(0.5f, 3.0f, 0.6f, 0.02f),
-		glm::simd_vec4(0.2f, 0.4f, 2.0f, 0.03f),
-		glm::simd_vec4(4.0f, 3.0f, 2.0f, 1.00f));
-
-	std::vector<__m128> Test(10000000);
+	std::vector<float> Test(Data.size());
 
 	std::clock_t TimeStart = clock();
 
 	for(std::size_t i = 0; i < Test.size(); ++i)
-		Test[i] = glm::detail::sse_slow_det_ps(&IdentityB.Data[0].Data);
+	{
+		glm::simd_mat4 m(Data[i]);
+		Test[i] = glm::simd_vec4(glm::detail::sse_slow_det_ps((__m128 const * const)&m)).x;
+	}
 
 	std::clock_t TimeEnd = clock();
 	printf("Det B: %d\n", TimeEnd - TimeStart);
+
+	return Test;
 }
 
-void test_detC()
+// Same measurement as test_detB, using glm::detail::sse_det_ps.
+std::vector<float> test_detC(std::vector<glm::mat4> const & Data)
 {
-	glm::simd_mat4 IdentityB(
-		glm::simd_vec4(4.0f, 0.7f, 0.1f, 0.01f),
-		glm::simd_vec4(0.5f, 3.0f, 0.6f, 0.02f),
-		glm::simd_vec4(0.2f, 0.4f, 2.0f, 0.03f),
-		glm::simd_vec4(4.0f, 3.0f, 2.0f, 1.00f));
-
-	std::vector<__m128> Test(10000000);
+	std::vector<float> Test(Data.size());
 
 	std::clock_t TimeStart = clock();
 
 	for(std::size_t i = 0; i < Test.size(); ++i)
-		Test[i] = glm::detail::sse_det_ps(&IdentityB.Data[0].Data);
+	{
+		glm::simd_mat4 m(Data[i]);
+		Test[i] = glm::simd_vec4(glm::detail::sse_det_ps((__m128 const * const)&m)).x;
+	}
 
 	std::clock_t TimeEnd = clock();
 	printf("Det C: %d\n", TimeEnd - TimeStart);
+
+	return Test;
+}
+
+// Same measurement as test_detB, using glm::detail::sse_detd_ps.
+std::vector<float> test_detD(std::vector<glm::mat4> const & Data)
+{
+	std::vector<float> Test(Data.size());
+
+	std::clock_t TimeStart = clock();
+
+	for(std::size_t i = 0; i < Test.size(); ++i)
+	{
+		glm::simd_mat4 m(Data[i]);
+		Test[i] = glm::simd_vec4(glm::detail::sse_detd_ps((__m128 const * const)&m)).x;
+	}
+
+	std::clock_t TimeEnd = clock();
+	printf("Det D: %d\n", TimeEnd - TimeStart);
+
+	return Test;
 }
 
 int main(int argc, void* argv[])
 {
-	test_detA();
-	test_detB();
-	test_detC();
+	// Input matrices built from glm::compRand4(-2.0f, 2.0f), shared by all four runs.
+	std::vector<glm::mat4> Data(1024 * 1024 * 16);
+	for(std::size_t i = 0; i < Data.size(); ++i)
+		Data[i] = glm::mat4(
+			glm::vec4(glm::compRand4(-2.0f, 2.0f)),
+			glm::vec4(glm::compRand4(-2.0f, 2.0f)),
+			glm::vec4(glm::compRand4(-2.0f, 2.0f)),
+			glm::vec4(glm::compRand4(-2.0f, 2.0f)));
+
+	std::vector<float> TestDetA = test_detA(Data);
+	std::vector<float> TestDetB = test_detB(Data);
+	std::vector<float> TestDetC = test_detC(Data);
+	std::vector<float> TestDetD = test_detD(Data);
+
+	// Each SSE result must match the scalar reference. Exact equality is too
+	// strict across differently ordered float operations, hence the tolerance.
+	float const Epsilon = 0.01f;
+	for(std::size_t i = 0; i < TestDetA.size(); ++i)
+		if(	glm::abs(TestDetA[i] - TestDetB[i]) > Epsilon ||
+			glm::abs(TestDetA[i] - TestDetC[i]) > Epsilon ||
+			glm::abs(TestDetA[i] - TestDetD[i]) > Epsilon)
+			return 1;
 
 	// shuffle test
 	glm::simd_vec4 A(1.0f, 2.0f, 3.0f, 4.0f);
 	glm::simd_vec4 B(5.0f, 6.0f, 7.0f, 8.0f);
 	__m128 C = _mm_shuffle_ps(A.Data, B.Data, _MM_SHUFFLE(1, 0, 1, 0));
 
-	glm::mat4 IdentityA(
-		glm::vec4(4.0f, 0.7f, 0.1f, 0.01f),
-		glm::vec4(0.5f, 3.0f, 0.6f, 0.02f),
-		glm::vec4(0.2f, 0.4f, 2.0f, 0.03f),
-		glm::vec4(4.0f, 3.0f, 2.0f, 1.00f));
-	float DetA = glm::determinant(IdentityA);
-
-	glm::simd_mat4 IdentityB(
-		glm::simd_vec4(4.0f, 0.7f, 0.1f, 0.01f),
-		glm::simd_vec4(0.5f, 3.0f, 0.6f, 0.02f),
-		glm::simd_vec4(0.2f, 0.4f, 2.0f, 0.03f),
-		glm::simd_vec4(4.0f, 3.0f, 2.0f, 1.00f));
-	__m128 DetB = glm::detail::sse_slow_det_ps(&IdentityB.Data[0].Data);
-	__m128 DetC = glm::detail::sse_det_ps(&IdentityB.Data[0].Data);
-
-	std::vector<float> TestA(100000);
-
-
-	std::vector<__m128> TestB(100000);
-	std::vector<__m128> TestC(100000);
-
 	return 0;
 }