diff --git a/glm/detail/func_matrix_simd.inl b/glm/detail/func_matrix_simd.inl index 898082c8..f1e61855 100644 --- a/glm/detail/func_matrix_simd.inl +++ b/glm/detail/func_matrix_simd.inl @@ -15,7 +15,7 @@ namespace detail GLM_FUNC_QUALIFIER static tmat4x4 call(tmat4x4 const& m) { tmat4x4 Result(uninitialize); - glm_f32m4_inv(*reinterpret_cast<__m128 const(*)[4]>(&m[0].data), *reinterpret_cast<__m128(*)[4]>(&Result[0].data)); + glm_mat4_inverse(*reinterpret_cast<__m128 const(*)[4]>(&m[0].data), *reinterpret_cast<__m128(*)[4]>(&Result[0].data)); return Result; } }; diff --git a/glm/detail/type_vec4.hpp b/glm/detail/type_vec4.hpp index ab7c0c69..17d16832 100644 --- a/glm/detail/type_vec4.hpp +++ b/glm/detail/type_vec4.hpp @@ -33,19 +33,19 @@ namespace detail template <> struct simd_data { - typedef __m128 type; + typedef glm_vec4 type; }; template <> struct simd_data { - typedef __m128i type; + typedef glm_ivec4 type; }; template <> struct simd_data { - typedef __m128i type; + typedef glm_uvec4 type; }; # endif @@ -53,7 +53,7 @@ namespace detail template <> struct simd_data { - typedef __m256d type; + typedef glm_dvec4 type; }; # endif @@ -61,13 +61,13 @@ namespace detail template <> struct simd_data { - typedef __m256i type; + typedef glm_i64vec4 type; }; template <> struct simd_data { - typedef __m256i type; + typedef glm_u64vec4 type; }; # endif }//namespace detail diff --git a/glm/simd/common.h b/glm/simd/common.h index fbcbeef0..374eae50 100644 --- a/glm/simd/common.h +++ b/glm/simd/common.h @@ -7,10 +7,52 @@ #if GLM_ARCH & GLM_ARCH_SSE2_BIT -typedef __m128 glm_vec4; -typedef __m128i glm_ivec4; +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_add(glm_vec4 a, glm_vec4 b) +{ + return _mm_add_ps(a, b); +} -GLM_FUNC_QUALIFIER __m128 glm_vec1_fma(__m128 a, __m128 b, __m128 c) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec1_add(glm_vec4 a, glm_vec4 b) +{ + return _mm_add_ss(a, b); +} + +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_sub(glm_vec4 a, glm_vec4 b) +{ + return _mm_sub_ps(a, b); +} + +GLM_FUNC_QUALIFIER glm_vec4 glm_vec1_sub(glm_vec4 a, glm_vec4 b) +{ + return _mm_sub_ss(a, b); +} + +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_mul(glm_vec4 a, glm_vec4 b) +{ + return _mm_mul_ps(a, b); +} + +GLM_FUNC_QUALIFIER glm_vec4 glm_vec1_mul(glm_vec4 a, glm_vec4 b) +{ + return _mm_mul_ss(a, b); +} + +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_div(glm_vec4 a, glm_vec4 b) +{ + return _mm_div_ps(a, b); +} + +GLM_FUNC_QUALIFIER glm_vec4 glm_vec1_div(glm_vec4 a, glm_vec4 b) +{ + return _mm_div_ss(a, b); +} + +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_div_lowp(glm_vec4 a, glm_vec4 b) +{ + return glm_vec4_mul(a, _mm_rcp_ps(b)); +} + +GLM_FUNC_QUALIFIER glm_vec4 glm_vec1_fma(glm_vec4 a, glm_vec4 b, glm_vec4 c) { # if GLM_ARCH & GLM_ARCH_AVX2_BIT return _mm_fmadd_ss(a, b, c); @@ -19,172 +61,170 @@ GLM_FUNC_QUALIFIER __m128 glm_vec1_fma(__m128 a, __m128 b, __m128 c) # endif } -GLM_FUNC_QUALIFIER __m128 glm_vec4_fma(__m128 a, __m128 b, __m128 c) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_fma(glm_vec4 a, glm_vec4 b, glm_vec4 c) { # if GLM_ARCH & GLM_ARCH_AVX2_BIT return _mm_fmadd_ps(a, b, c); # else - return _mm_add_ps(_mm_mul_ps(a, b), c); + return glm_vec4_add(glm_vec4_mul(a, b), c); # endif } -GLM_FUNC_QUALIFIER __m128 glm_vec4_abs(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_abs(glm_vec4 x) { return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF))); } -GLM_FUNC_QUALIFIER __m128i glm_ivec4_abs(__m128i x) +GLM_FUNC_QUALIFIER glm_ivec4 glm_ivec4_abs(glm_ivec4 x) { # if GLM_ARCH & GLM_ARCH_SSSE3_BIT return _mm_sign_epi32(x, x); # else - __m128i 
const sgn0 = _mm_srai_epi32(x, 31); - __m128i const inv0 = _mm_xor_si128(x, sgn0); - __m128i const sub0 = _mm_sub_epi32(inv0, sgn0); + glm_ivec4 const sgn0 = _mm_srai_epi32(x, 31); + glm_ivec4 const inv0 = _mm_xor_si128(x, sgn0); + glm_ivec4 const sub0 = _mm_sub_epi32(inv0, sgn0); return sub0; # endif } -GLM_FUNC_QUALIFIER __m128 glm_vec4_sign(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_sign(glm_vec4 x) { - __m128 const zro0 = _mm_setzero_ps(); - __m128 const cmp0 = _mm_cmplt_ps(x, zro0); - __m128 const cmp1 = _mm_cmpgt_ps(x, zro0); - __m128 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(-1.0f)); - __m128 const and1 = _mm_and_ps(cmp1, _mm_set1_ps(1.0f)); - __m128 const or0 = _mm_or_ps(and0, and1);; + glm_vec4 const zro0 = _mm_setzero_ps(); + glm_vec4 const cmp0 = _mm_cmplt_ps(x, zro0); + glm_vec4 const cmp1 = _mm_cmpgt_ps(x, zro0); + glm_vec4 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(-1.0f)); + glm_vec4 const and1 = _mm_and_ps(cmp1, _mm_set1_ps(1.0f)); + glm_vec4 const or0 = _mm_or_ps(and0, and1);; return or0; } -GLM_FUNC_QUALIFIER __m128 glm_vec4_round(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_round(glm_vec4 x) { # if GLM_ARCH & GLM_ARCH_SSE41_BIT return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT); # else - __m128 const sgn0 = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - __m128 const and0 = _mm_and_ps(sgn0, x); - __m128 const or0 = _mm_or_ps(and0, _mm_set_ps1(8388608.0f)); - __m128 const add0 = _mm_add_ps(x, or0); - __m128 const sub0 = _mm_sub_ps(add0, or0); + glm_vec4 const sgn0 = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + glm_vec4 const and0 = _mm_and_ps(sgn0, x); + glm_vec4 const or0 = _mm_or_ps(and0, _mm_set_ps1(8388608.0f)); + glm_vec4 const add0 = glm_vec4_add(x, or0); + glm_vec4 const sub0 = glm_vec4_sub(add0, or0); return sub0; # endif } -GLM_FUNC_QUALIFIER __m128 glm_vec4_floor(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_floor(glm_vec4 x) { # if GLM_ARCH & GLM_ARCH_SSE41_BIT return _mm_floor_ps(x); # else - __m128 const rnd0 = glm_vec4_round(x); - __m128 const cmp0 = _mm_cmplt_ps(x, rnd0); - __m128 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(1.0f)); - __m128 const sub0 = _mm_sub_ps(rnd0, and0); + glm_vec4 const rnd0 = glm_vec4_round(x); + glm_vec4 const cmp0 = _mm_cmplt_ps(x, rnd0); + glm_vec4 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(1.0f)); + glm_vec4 const sub0 = glm_vec4_sub(rnd0, and0); return sub0; # endif } /* trunc TODO -GLM_FUNC_QUALIFIER __m128 glm_vec4_trunc(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_trunc(glm_vec4 x) { - return __m128(); + return glm_vec4(); } */ //roundEven -GLM_FUNC_QUALIFIER __m128 glm_vec4_roundEven(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_roundEven(glm_vec4 x) { - __m128 const sgn0 = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - __m128 const and0 = _mm_and_ps(sgn0, x); - __m128 const or0 = _mm_or_ps(and0, _mm_set_ps1(8388608.0f)); - __m128 const add0 = _mm_add_ps(x, or0); - __m128 const sub0 = _mm_sub_ps(add0, or0); + glm_vec4 const sgn0 = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + glm_vec4 const and0 = _mm_and_ps(sgn0, x); + glm_vec4 const or0 = _mm_or_ps(and0, _mm_set_ps1(8388608.0f)); + glm_vec4 const add0 = glm_vec4_add(x, or0); + glm_vec4 const sub0 = glm_vec4_sub(add0, or0); return sub0; } -GLM_FUNC_QUALIFIER __m128 glm_vec4_ceil(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_ceil(glm_vec4 x) { # if GLM_ARCH & GLM_ARCH_SSE41_BIT return _mm_ceil_ps(x); # else - __m128 const rnd0 = glm_vec4_round(x); - __m128 const cmp0 = _mm_cmpgt_ps(x, rnd0); - __m128 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(1.0f)); - 
__m128 const add0 = _mm_add_ps(rnd0, and0); + glm_vec4 const rnd0 = glm_vec4_round(x); + glm_vec4 const cmp0 = _mm_cmpgt_ps(x, rnd0); + glm_vec4 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(1.0f)); + glm_vec4 const add0 = glm_vec4_add(rnd0, and0); return add0; # endif } -GLM_FUNC_QUALIFIER __m128 glm_vec4_fract(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_fract(glm_vec4 x) { - __m128 const flr0 = glm_vec4_floor(x); - __m128 const sub0 = _mm_sub_ps(x, flr0); + glm_vec4 const flr0 = glm_vec4_floor(x); + glm_vec4 const sub0 = glm_vec4_sub(x, flr0); return sub0; } -GLM_FUNC_QUALIFIER __m128 glm_vec4_mod(__m128 x, __m128 y) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_mod(glm_vec4 x, glm_vec4 y) { - __m128 const div0 = _mm_div_ps(x, y); - __m128 const flr0 = glm_vec4_floor(div0); - __m128 const mul0 = _mm_mul_ps(y, flr0); - __m128 const sub0 = _mm_sub_ps(x, mul0); + glm_vec4 const div0 = glm_vec4_div(x, y); + glm_vec4 const flr0 = glm_vec4_floor(div0); + glm_vec4 const mul0 = glm_vec4_mul(y, flr0); + glm_vec4 const sub0 = glm_vec4_sub(x, mul0); return sub0; } -GLM_FUNC_QUALIFIER __m128 glm_vec4_clamp(__m128 v, __m128 minVal, __m128 maxVal) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_clamp(glm_vec4 v, glm_vec4 minVal, glm_vec4 maxVal) { - __m128 const min0 = _mm_min_ps(v, maxVal); - __m128 const max0 = _mm_max_ps(min0, minVal); + glm_vec4 const min0 = _mm_min_ps(v, maxVal); + glm_vec4 const max0 = _mm_max_ps(min0, minVal); return max0; } -GLM_FUNC_QUALIFIER __m128 glm_vec4_mix(__m128 v1, __m128 v2, __m128 a) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_mix(glm_vec4 v1, glm_vec4 v2, glm_vec4 a) { - __m128 const sub0 = _mm_sub_ps(_mm_set1_ps(1.0f), a); - __m128 const mul0 = _mm_mul_ps(v1, sub0); - __m128 const mad0 = glm_vec4_fma(v2, a, mul0); + glm_vec4 const sub0 = glm_vec4_sub(_mm_set1_ps(1.0f), a); + glm_vec4 const mul0 = glm_vec4_mul(v1, sub0); + glm_vec4 const mad0 = glm_vec4_fma(v2, a, mul0); return mad0; } -//step -GLM_FUNC_QUALIFIER __m128 glm_vec4_step(__m128 edge, __m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_step(glm_vec4 edge, glm_vec4 x) { - __m128 const cmp = _mm_cmple_ps(x, edge); + glm_vec4 const cmp = _mm_cmple_ps(x, edge); return _mm_movemask_ps(cmp) == 0 ? 
_mm_set1_ps(1.0f) : _mm_setzero_ps(); } -// smoothstep -GLM_FUNC_QUALIFIER __m128 glm_vec4_smoothstep(__m128 edge0, __m128 edge1, __m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_smoothstep(glm_vec4 edge0, glm_vec4 edge1, glm_vec4 x) { - __m128 const sub0 = _mm_sub_ps(x, edge0); - __m128 const sub1 = _mm_sub_ps(edge1, edge0); - __m128 const div0 = _mm_sub_ps(sub0, sub1); - __m128 const clp0 = glm_vec4_clamp(div0, _mm_setzero_ps(), _mm_set1_ps(1.0f)); - __m128 const mul0 = _mm_mul_ps(_mm_set1_ps(2.0f), clp0); - __m128 const sub2 = _mm_sub_ps(_mm_set1_ps(3.0f), mul0); - __m128 const mul1 = _mm_mul_ps(clp0, clp0); - __m128 const mul2 = _mm_mul_ps(mul1, sub2); + glm_vec4 const sub0 = glm_vec4_sub(x, edge0); + glm_vec4 const sub1 = glm_vec4_sub(edge1, edge0); + glm_vec4 const div0 = glm_vec4_sub(sub0, sub1); + glm_vec4 const clp0 = glm_vec4_clamp(div0, _mm_setzero_ps(), _mm_set1_ps(1.0f)); + glm_vec4 const mul0 = glm_vec4_mul(_mm_set1_ps(2.0f), clp0); + glm_vec4 const sub2 = glm_vec4_sub(_mm_set1_ps(3.0f), mul0); + glm_vec4 const mul1 = glm_vec4_mul(clp0, clp0); + glm_vec4 const mul2 = glm_vec4_mul(mul1, sub2); return mul2; } // Agner Fog method -GLM_FUNC_QUALIFIER __m128 glm_vec4_nan(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_nan(glm_vec4 x) { - __m128i const t1 = _mm_castps_si128(x); // reinterpret as 32-bit integer - __m128i const t2 = _mm_sll_epi32(t1, _mm_cvtsi32_si128(1)); // shift out sign bit - __m128i const t3 = _mm_set1_epi32(0xFF000000); // exponent mask - __m128i const t4 = _mm_and_si128(t2, t3); // exponent - __m128i const t5 = _mm_andnot_si128(t3, t2); // fraction - __m128i const Equal = _mm_cmpeq_epi32(t3, t4); - __m128i const Nequal = _mm_cmpeq_epi32(t5, _mm_setzero_si128()); - __m128i const And = _mm_and_si128(Equal, Nequal); - return _mm_castsi128_ps(And); // exponent = all 1s and fraction != 0 + glm_ivec4 const t1 = _mm_castps_si128(x); // reinterpret as 32-bit integer + glm_ivec4 const t2 = _mm_sll_epi32(t1, _mm_cvtsi32_si128(1)); // shift out sign bit + glm_ivec4 const t3 = _mm_set1_epi32(0xFF000000); // exponent mask + glm_ivec4 const t4 = _mm_and_si128(t2, t3); // exponent + glm_ivec4 const t5 = _mm_andnot_si128(t3, t2); // fraction + glm_ivec4 const Equal = _mm_cmpeq_epi32(t3, t4); + glm_ivec4 const Nequal = _mm_cmpeq_epi32(t5, _mm_setzero_si128()); + glm_ivec4 const And = _mm_and_si128(Equal, Nequal); + return _mm_castsi128_ps(And); // exponent = all 1s and fraction != 0 } // Agner Fog method -GLM_FUNC_QUALIFIER __m128 glm_vec4_inf(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_inf(glm_vec4 x) { - __m128i const t1 = _mm_castps_si128(x); // reinterpret as 32-bit integer - __m128i const t2 = _mm_sll_epi32(t1, _mm_cvtsi32_si128(1)); // shift out sign bit + glm_ivec4 const t1 = _mm_castps_si128(x); // reinterpret as 32-bit integer + glm_ivec4 const t2 = _mm_sll_epi32(t1, _mm_cvtsi32_si128(1)); // shift out sign bit return _mm_castsi128_ps(_mm_cmpeq_epi32(t2, _mm_set1_epi32(0xFF000000))); // exponent is all 1s, fraction is 0 } diff --git a/glm/simd/exponential.h b/glm/simd/exponential.h index 17216a91..4eb0fb74 100644 --- a/glm/simd/exponential.h +++ b/glm/simd/exponential.h @@ -3,14 +3,16 @@ #pragma once +#include "platform.h" + #if GLM_ARCH & GLM_ARCH_SSE2_BIT -GLM_FUNC_QUALIFIER __m128 glm_vec1_sqrt_lowp(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec1_sqrt_lowp(glm_vec4 x) { return _mm_mul_ss(_mm_rsqrt_ss(x), x); } -GLM_FUNC_QUALIFIER __m128 glm_vec4_sqrt_lowp(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_sqrt_lowp(glm_vec4 x) { return _mm_mul_ps(_mm_rsqrt_ps(x), 
x); } diff --git a/glm/simd/geometric.h b/glm/simd/geometric.h index 41469999..ca533872 100644 --- a/glm/simd/geometric.h +++ b/glm/simd/geometric.h @@ -7,116 +7,116 @@ #if GLM_ARCH & GLM_ARCH_SSE2_BIT -GLM_FUNC_DECL __m128 glm_vec4_dot(__m128 v1, __m128 v2); -GLM_FUNC_DECL __m128 glm_vec1_dot(__m128 v1, __m128 v2); +GLM_FUNC_DECL glm_vec4 glm_vec4_dot(glm_vec4 v1, glm_vec4 v2); +GLM_FUNC_DECL glm_vec4 glm_vec1_dot(glm_vec4 v1, glm_vec4 v2); -GLM_FUNC_QUALIFIER __m128 glm_vec4_length(__m128 x) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_length(glm_vec4 x) { - __m128 const dot0 = glm_vec4_dot(x, x); - __m128 const sqt0 = _mm_sqrt_ps(dot0); + glm_vec4 const dot0 = glm_vec4_dot(x, x); + glm_vec4 const sqt0 = _mm_sqrt_ps(dot0); return sqt0; } -GLM_FUNC_QUALIFIER __m128 glm_vec4_distance(__m128 p0, __m128 p1) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_distance(glm_vec4 p0, glm_vec4 p1) { - __m128 const sub0 = _mm_sub_ps(p0, p1); - __m128 const len0 = glm_vec4_length(sub0); + glm_vec4 const sub0 = _mm_sub_ps(p0, p1); + glm_vec4 const len0 = glm_vec4_length(sub0); return len0; } -GLM_FUNC_QUALIFIER __m128 glm_vec4_dot(__m128 v1, __m128 v2) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_dot(glm_vec4 v1, glm_vec4 v2) { # if GLM_ARCH & GLM_ARCH_AVX_BIT return _mm_dp_ps(v1, v2, 0xff); # elif GLM_ARCH & GLM_ARCH_SSE3_BIT - __m128 const mul0 = _mm_mul_ps(v1, v2); - __m128 const hadd0 = _mm_hadd_ps(mul0, mul0); - __m128 const hadd1 = _mm_hadd_ps(hadd0, hadd0); + glm_vec4 const mul0 = _mm_mul_ps(v1, v2); + glm_vec4 const hadd0 = _mm_hadd_ps(mul0, mul0); + glm_vec4 const hadd1 = _mm_hadd_ps(hadd0, hadd0); return hadd1; # else - __m128 const mul0 = _mm_mul_ps(v1, v2); - __m128 const swp0 = _mm_shuffle_ps(mul0, mul0, _MM_SHUFFLE(2, 3, 0, 1)); - __m128 const add0 = _mm_add_ps(mul0, swp0); - __m128 const swp1 = _mm_shuffle_ps(add0, add0, _MM_SHUFFLE(0, 1, 2, 3)); - __m128 const add1 = _mm_add_ps(add0, swp1); + glm_vec4 const mul0 = _mm_mul_ps(v1, v2); + glm_vec4 const swp0 = _mm_shuffle_ps(mul0, mul0, _MM_SHUFFLE(2, 3, 0, 1)); + glm_vec4 const add0 = _mm_add_ps(mul0, swp0); + glm_vec4 const swp1 = _mm_shuffle_ps(add0, add0, _MM_SHUFFLE(0, 1, 2, 3)); + glm_vec4 const add1 = _mm_add_ps(add0, swp1); return add1; # endif } -GLM_FUNC_QUALIFIER __m128 glm_vec1_dot(__m128 v1, __m128 v2) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec1_dot(glm_vec4 v1, glm_vec4 v2) { # if GLM_ARCH & GLM_ARCH_AVX_BIT return _mm_dp_ps(v1, v2, 0xff); # elif GLM_ARCH & GLM_ARCH_SSE3_BIT - __m128 const mul0 = _mm_mul_ps(v1, v2); - __m128 const had0 = _mm_hadd_ps(mul0, mul0); - __m128 const had1 = _mm_hadd_ps(had0, had0); - return Hadd1; + glm_vec4 const mul0 = _mm_mul_ps(v1, v2); + glm_vec4 const had0 = _mm_hadd_ps(mul0, mul0); + glm_vec4 const had1 = _mm_hadd_ps(had0, had0); + return had1; # else - __m128 const mul0 = _mm_mul_ps(v1, v2); - __m128 const mov0 = _mm_movehl_ps(mul0, mul0); - __m128 const add0 = _mm_add_ps(mov0, mul0); - __m128 const swp1 = _mm_shuffle_ps(add0, add0, 1); - __m128 const add1 = _mm_add_ss(add0, swp1); + glm_vec4 const mul0 = _mm_mul_ps(v1, v2); + glm_vec4 const mov0 = _mm_movehl_ps(mul0, mul0); + glm_vec4 const add0 = _mm_add_ps(mov0, mul0); + glm_vec4 const swp1 = _mm_shuffle_ps(add0, add0, 1); + glm_vec4 const add1 = _mm_add_ss(add0, swp1); return add1; # endif } -GLM_FUNC_QUALIFIER __m128 glm_vec4_cross(__m128 v1, __m128 v2) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_cross(glm_vec4 v1, glm_vec4 v2) { - __m128 const swp0 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1)); - __m128 const swp1 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 1, 0, 2)); - 
__m128 const swp2 = _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)); - __m128 const swp3 = _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 1, 0, 2)); - __m128 const mul0 = _mm_mul_ps(swp0, swp3); - __m128 const mul1 = _mm_mul_ps(swp1, swp2); - __m128 const sub0 = _mm_sub_ps(mul0, mul1); + glm_vec4 const swp0 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1)); + glm_vec4 const swp1 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 1, 0, 2)); + glm_vec4 const swp2 = _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)); + glm_vec4 const swp3 = _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 1, 0, 2)); + glm_vec4 const mul0 = _mm_mul_ps(swp0, swp3); + glm_vec4 const mul1 = _mm_mul_ps(swp1, swp2); + glm_vec4 const sub0 = _mm_sub_ps(mul0, mul1); return sub0; } -GLM_FUNC_QUALIFIER __m128 glm_vec4_normalize(__m128 v) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_normalize(glm_vec4 v) { - __m128 const dot0 = glm_vec4_dot(v, v); - __m128 const isr0 = _mm_rsqrt_ps(dot0); - __m128 const mul0 = _mm_mul_ps(v, isr0); + glm_vec4 const dot0 = glm_vec4_dot(v, v); + glm_vec4 const isr0 = _mm_rsqrt_ps(dot0); + glm_vec4 const mul0 = _mm_mul_ps(v, isr0); return mul0; } -GLM_FUNC_QUALIFIER __m128 glm_vec4_faceforward(__m128 N, __m128 I, __m128 Nref) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_faceforward(glm_vec4 N, glm_vec4 I, glm_vec4 Nref) { - __m128 const dot0 = glm_vec4_dot(Nref, I); - __m128 const sgn0 = glm_vec4_sign(dot0); - __m128 const mul0 = _mm_mul_ps(sgn0, _mm_set1_ps(-1.0f)); - __m128 const mul1 = _mm_mul_ps(N, mul0); + glm_vec4 const dot0 = glm_vec4_dot(Nref, I); + glm_vec4 const sgn0 = glm_vec4_sign(dot0); + glm_vec4 const mul0 = _mm_mul_ps(sgn0, _mm_set1_ps(-1.0f)); + glm_vec4 const mul1 = _mm_mul_ps(N, mul0); return mul1; } -GLM_FUNC_QUALIFIER __m128 glm_vec4_reflect(__m128 I, __m128 N) +GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_reflect(glm_vec4 I, glm_vec4 N) { - __m128 const dot0 = glm_vec4_dot(N, I); - __m128 const mul0 = _mm_mul_ps(N, dot0); - __m128 const mul1 = _mm_mul_ps(mul0, _mm_set1_ps(2.0f)); - __m128 const sub0 = _mm_sub_ps(I, mul1); + glm_vec4 const dot0 = glm_vec4_dot(N, I); + glm_vec4 const mul0 = _mm_mul_ps(N, dot0); + glm_vec4 const mul1 = _mm_mul_ps(mul0, _mm_set1_ps(2.0f)); + glm_vec4 const sub0 = _mm_sub_ps(I, mul1); return sub0; } -GLM_FUNC_QUALIFIER __m128 glm_vec4_refract(__m128 I, __m128 N, __m128 eta) +GLM_FUNC_QUALIFIER __m128 glm_vec4_refract(glm_vec4 I, glm_vec4 N, glm_vec4 eta) { - __m128 const dot0 = glm_vec4_dot(N, I); - __m128 const mul0 = _mm_mul_ps(eta, eta); - __m128 const mul1 = _mm_mul_ps(dot0, dot0); - __m128 const sub0 = _mm_sub_ps(_mm_set1_ps(1.0f), mul0); - __m128 const sub1 = _mm_sub_ps(_mm_set1_ps(1.0f), mul1); - __m128 const mul2 = _mm_mul_ps(sub0, sub1); + glm_vec4 const dot0 = glm_vec4_dot(N, I); + glm_vec4 const mul0 = _mm_mul_ps(eta, eta); + glm_vec4 const mul1 = _mm_mul_ps(dot0, dot0); + glm_vec4 const sub0 = _mm_sub_ps(_mm_set1_ps(1.0f), mul0); + glm_vec4 const sub1 = _mm_sub_ps(_mm_set1_ps(1.0f), mul1); + glm_vec4 const mul2 = _mm_mul_ps(sub0, sub1); if(_mm_movemask_ps(_mm_cmplt_ss(mul2, _mm_set1_ps(0.0f))) == 0) return _mm_set1_ps(0.0f); - __m128 const sqt0 = _mm_sqrt_ps(mul2); - __m128 const mad0 = glm_vec4_fma(eta, dot0, sqt0); - __m128 const mul4 = _mm_mul_ps(mad0, N); - __m128 const mul5 = _mm_mul_ps(eta, I); - __m128 const sub2 = _mm_sub_ps(mul5, mul4); + glm_vec4 const sqt0 = _mm_sqrt_ps(mul2); + glm_vec4 const mad0 = glm_vec4_fma(eta, dot0, sqt0); + glm_vec4 const mul4 = _mm_mul_ps(mad0, N); + glm_vec4 const mul5 = _mm_mul_ps(eta, I); + glm_vec4 const sub2 = _mm_sub_ps(mul5, mul4); return 
sub2; } diff --git a/glm/simd/integer.h b/glm/simd/integer.h index 8e31c4d5..50fd8248 100644 --- a/glm/simd/integer.h +++ b/glm/simd/integer.h @@ -5,16 +5,16 @@ #if GLM_ARCH & GLM_ARCH_SSE2_BIT -GLM_FUNC_QUALIFIER __m128i glm_i128_interleave(__m128i x) +GLM_FUNC_QUALIFIER glm_uvec4 glm_i128_interleave(glm_uvec4 x) { - __m128i const Mask4 = _mm_set1_epi32(0x0000FFFF); - __m128i const Mask3 = _mm_set1_epi32(0x00FF00FF); - __m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F); - __m128i const Mask1 = _mm_set1_epi32(0x33333333); - __m128i const Mask0 = _mm_set1_epi32(0x55555555); + glm_uvec4 const Mask4 = _mm_set1_epi32(0x0000FFFF); + glm_uvec4 const Mask3 = _mm_set1_epi32(0x00FF00FF); + glm_uvec4 const Mask2 = _mm_set1_epi32(0x0F0F0F0F); + glm_uvec4 const Mask1 = _mm_set1_epi32(0x33333333); + glm_uvec4 const Mask0 = _mm_set1_epi32(0x55555555); - __m128i Reg1; - __m128i Reg2; + glm_uvec4 Reg1; + glm_uvec4 Reg2; // REG1 = x; // REG2 = y; @@ -59,16 +59,16 @@ GLM_FUNC_QUALIFIER __m128i glm_i128_interleave(__m128i x) return Reg1; } -GLM_FUNC_QUALIFIER __m128i glm_i128_interleave2(__m128i x, __m128i y) +GLM_FUNC_QUALIFIER glm_uvec4 glm_i128_interleave2(glm_uvec4 x, glm_uvec4 y) { - __m128i const Mask4 = _mm_set1_epi32(0x0000FFFF); - __m128i const Mask3 = _mm_set1_epi32(0x00FF00FF); - __m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F); - __m128i const Mask1 = _mm_set1_epi32(0x33333333); - __m128i const Mask0 = _mm_set1_epi32(0x55555555); + glm_uvec4 const Mask4 = _mm_set1_epi32(0x0000FFFF); + glm_uvec4 const Mask3 = _mm_set1_epi32(0x00FF00FF); + glm_uvec4 const Mask2 = _mm_set1_epi32(0x0F0F0F0F); + glm_uvec4 const Mask1 = _mm_set1_epi32(0x33333333); + glm_uvec4 const Mask0 = _mm_set1_epi32(0x55555555); - __m128i Reg1; - __m128i Reg2; + glm_uvec4 Reg1; + glm_uvec4 Reg2; // REG1 = x; // REG2 = y; diff --git a/glm/simd/matrix.h b/glm/simd/matrix.h index a9fb5a83..bcfe69e3 100644 --- a/glm/simd/matrix.h +++ b/glm/simd/matrix.h @@ -7,11 +7,7 @@ #if GLM_ARCH & GLM_ARCH_SSE2_BIT -static const __m128 GLM_VAR_USED _m128_rad_ps = _mm_set_ps1(3.141592653589793238462643383279f / 180.f); -static const __m128 GLM_VAR_USED _m128_deg_ps = _mm_set_ps1(180.f / 3.141592653589793238462643383279f); - -template -GLM_FUNC_QUALIFIER matType glm_f32m4_cml(__m128 const in1[4], __m128 const in2[4], __m128 out[4]) +GLM_FUNC_QUALIFIER void glm_mat4_matrixCompMult(glm_vec4 const in1[4], glm_vec4 const in2[4], glm_vec4 out[4]) { out[0] = _mm_mul_ps(in1[0], in2[0]); out[1] = _mm_mul_ps(in1[1], in2[1]); @@ -19,7 +15,7 @@ GLM_FUNC_QUALIFIER matType glm_f32m4_cml(__m128 const in1[4], __m128 const in2[4 out[3] = _mm_mul_ps(in1[3], in2[3]); } -GLM_FUNC_QUALIFIER void glm_f32m4_add(__m128 const in1[4], __m128 const in2[4], __m128 out[4]) +GLM_FUNC_QUALIFIER void glm_mat4_add(glm_vec4 const in1[4], glm_vec4 const in2[4], glm_vec4 out[4]) { out[0] = _mm_add_ps(in1[0], in2[0]); out[1] = _mm_add_ps(in1[1], in2[1]); @@ -27,7 +23,7 @@ GLM_FUNC_QUALIFIER void glm_f32m4_add(__m128 const in1[4], __m128 const in2[4], out[3] = _mm_add_ps(in1[3], in2[3]); } -GLM_FUNC_QUALIFIER void glm_f32m4_sub(__m128 const in1[4], __m128 const in2[4], __m128 out[4]) +GLM_FUNC_QUALIFIER void glm_mat4_sub(glm_vec4 const in1[4], glm_vec4 const in2[4], glm_vec4 out[4]) { out[0] = _mm_sub_ps(in1[0], in2[0]); out[1] = _mm_sub_ps(in1[1], in2[1]); @@ -35,7 +31,7 @@ GLM_FUNC_QUALIFIER void glm_f32m4_sub(__m128 const in1[4], __m128 const in2[4], out[3] = _mm_sub_ps(in1[3], in2[3]); } -GLM_FUNC_QUALIFIER __m128 glm_f32m4_mul(__m128 const m[4], __m128 v) +GLM_FUNC_QUALIFIER 
glm_vec4 glm_mat4_mul_vec4(glm_vec4 const m[4], glm_vec4 v) { __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)); __m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)); @@ -54,7 +50,7 @@ GLM_FUNC_QUALIFIER __m128 glm_f32m4_mul(__m128 const m[4], __m128 v) return a2; } -GLM_FUNC_QUALIFIER __m128 glm_f32m4_mul(__m128 v, __m128 const m[4]) +GLM_FUNC_QUALIFIER __m128 glm_vec4_mul_mat4(glm_vec4 v, glm_vec4 const m[4]) { __m128 i0 = m[0]; __m128 i1 = m[1]; @@ -81,7 +77,7 @@ GLM_FUNC_QUALIFIER __m128 glm_f32m4_mul(__m128 v, __m128 const m[4]) return f2; } -GLM_FUNC_QUALIFIER void glm_f32m4_mul(__m128 const in1[4], __m128 const in2[4], __m128 out[4]) +GLM_FUNC_QUALIFIER void glm_mat4_mul(glm_vec4 const in1[4], glm_vec4 const in2[4], glm_vec4 out[4]) { { __m128 e0 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(0, 0, 0, 0)); @@ -157,7 +153,7 @@ GLM_FUNC_QUALIFIER void glm_f32m4_mul(__m128 const in1[4], __m128 const in2[4], } } -GLM_FUNC_QUALIFIER void glm_f32m4_transpose(__m128 const in[4], __m128 out[4]) +GLM_FUNC_QUALIFIER void glm_mat4_transpose(glm_vec4 const in[4], glm_vec4 out[4]) { __m128 tmp0 = _mm_shuffle_ps(in[0], in[1], 0x44); __m128 tmp2 = _mm_shuffle_ps(in[0], in[1], 0xEE); @@ -170,7 +166,7 @@ GLM_FUNC_QUALIFIER void glm_f32m4_transpose(__m128 const in[4], __m128 out[4]) out[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD); } -GLM_FUNC_QUALIFIER __m128 glm_f32m4_det_highp(__m128 const in[4]) +GLM_FUNC_QUALIFIER glm_vec4 glm_mat4_determinant_highp(glm_vec4 const in[4]) { __m128 Fac0; { @@ -384,7 +380,7 @@ GLM_FUNC_QUALIFIER __m128 glm_f32m4_det_highp(__m128 const in[4]) return Det0; } -GLM_FUNC_QUALIFIER __m128 glm_f32m4_detd(__m128 const m[4]) +GLM_FUNC_QUALIFIER glm_vec4 glm_mat4_determinant_lowp(glm_vec4 const m[4]) { // _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128( @@ -447,7 +443,7 @@ GLM_FUNC_QUALIFIER __m128 glm_f32m4_detd(__m128 const m[4]) return glm_vec4_dot(m[0], DetCof); } -GLM_FUNC_QUALIFIER __m128 glm_f32m4_det(__m128 const m[4]) +GLM_FUNC_QUALIFIER glm_vec4 glm_mat4_determinant(glm_vec4 const m[4]) { // _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(add) @@ -510,7 +506,7 @@ GLM_FUNC_QUALIFIER __m128 glm_f32m4_det(__m128 const m[4]) return glm_vec4_dot(m[0], DetCof); } -GLM_FUNC_QUALIFIER void glm_f32m4_inv(__m128 const in[4], __m128 out[4]) +GLM_FUNC_QUALIFIER void glm_mat4_inverse(glm_vec4 const in[4], glm_vec4 out[4]) { __m128 Fac0; { @@ -731,7 +727,7 @@ GLM_FUNC_QUALIFIER void glm_f32m4_inv(__m128 const in[4], __m128 out[4]) out[3] = _mm_mul_ps(Inv3, Rcp0); } -GLM_FUNC_QUALIFIER void glm_f32m4_inv_lowp(__m128 const in[4], __m128 out[4]) +GLM_FUNC_QUALIFIER void glm_mat4_inverse_lowp(glm_vec4 const in[4], glm_vec4 out[4]) { __m128 Fac0; { diff --git a/glm/simd/platform.h b/glm/simd/platform.h index 04c9245e..6aa6bb29 100644 --- a/glm/simd/platform.h +++ b/glm/simd/platform.h @@ -398,3 +398,18 @@ #elif GLM_ARCH & GLM_ARCH_SSE2_BIT # include #endif//GLM_ARCH + +#if GLM_ARCH & GLM_ARCH_SSE2_BIT + typedef __m128 glm_vec4; + typedef __m128i glm_ivec4; + typedef __m128i glm_uvec4; +#endif + +#if GLM_ARCH & GLM_ARCH_AVX_BIT + typedef __m256d glm_dvec4; +#endif + +#if GLM_ARCH & GLM_ARCH_AVX2_BIT + typedef __m256i glm_i64vec4; + typedef __m256i glm_u64vec4; +#endif
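
Note on the rename (not part of the patch itself): the changes above rename the raw SSE helpers (`glm_f32m4_*` → `glm_mat4_*`), replace `__m128`/`__m128i` parameters with the new `glm_vec4`/`glm_ivec4`/`glm_uvec4` aliases now typedef'd in `glm/simd/platform.h`, and add thin `glm_vec4_add`/`sub`/`mul`/`div` wrappers in `glm/simd/common.h`; the behaviour of the existing functions is unchanged. Below is a minimal sketch of the renamed entry points in use, assuming an SSE2-enabled build (e.g. `-msse2`) and that `<glm/glm.hpp>` and `<glm/simd/matrix.h>` are on the include path; the `main()` harness and the identity-matrix setup are illustrative only, not part of the patch.

```cpp
// Minimal sketch (not part of the patch): exercising the renamed SSE2 helpers.
// Assumptions: SSE2-enabled build (e.g. -msse2) and GLM headers on the include path.
#include <glm/glm.hpp>
#include <glm/simd/matrix.h>
#include <cstdio>

int main()
{
#if GLM_ARCH & GLM_ARCH_SSE2_BIT
	// Four glm_vec4 (__m128) columns forming a 4x4 identity matrix.
	glm_vec4 const m[4] = {
		_mm_setr_ps(1.f, 0.f, 0.f, 0.f),
		_mm_setr_ps(0.f, 1.f, 0.f, 0.f),
		_mm_setr_ps(0.f, 0.f, 1.f, 0.f),
		_mm_setr_ps(0.f, 0.f, 0.f, 1.f)};

	glm_vec4 inv[4];
	glm_mat4_inverse(m, inv);                      // was glm_f32m4_inv

	glm_vec4 const v = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
	glm_vec4 const r = glm_mat4_mul_vec4(inv, v);  // was glm_f32m4_mul(m, v)

	// glm_vec4_dot keeps its name; it returns the dot product splatted
	// across all four lanes, so lane 0 holds the scalar result.
	float const d = _mm_cvtss_f32(glm_vec4_dot(r, r));

	float out[4];
	_mm_storeu_ps(out, r);
	std::printf("r = (%g, %g, %g, %g), dot(r, r) = %g\n",
		out[0], out[1], out[2], out[3], d);
#endif
	return 0;
}
```

Since `glm_vec4` is just a typedef of `__m128` (and `glm_dvec4`, `glm_i64vec4`, `glm_u64vec4` of the corresponding AVX/AVX2 types), callers mixing raw intrinsics with these helpers keep compiling; the aliases mainly make the `glm/simd/*` headers self-describing and let `glm/detail/type_vec4.hpp` pick the SIMD storage type by name.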