From 377d5a40469d5ed81b903c76319a9a382c039633 Mon Sep 17 00:00:00 2001
From: Christophe Riccio <mail@g-truc.net>
Date: Sun, 2 Nov 2014 23:48:02 +0100
Subject: [PATCH] Optimized bitCount function

---
 glm/detail/func_integer.hpp               |  13 ++-
 glm/detail/func_integer.inl               |  43 ++++++---
 glm/detail/type_int.hpp                   | 104 ++++++++++++++++++++++
 readme.txt                                |   2 +-
 test/core/core_func_integer.cpp           |  80 ++++++++++++-----
 test/core/core_func_integer_bit_count.cpp |  15 ++--
 6 files changed, 217 insertions(+), 40 deletions(-)
diff --git a/glm/detail/func_integer.hpp b/glm/detail/func_integer.hpp
index a3f45c3f..55552149 100644
--- a/glm/detail/func_integer.hpp
+++ b/glm/detail/func_integer.hpp
@@ -157,7 +157,18 @@ namespace glm
 	/// @see <a href="http://www.opengl.org/registry/doc/GLSLangSpec.4.20.8.pdf">GLSL 4.20.8 specification, section 8.8 Integer Functions</a>
 	template <typename T, precision P, template <typename, precision> class vecType>
 	GLM_FUNC_DECL vecType<T, P> bitfieldReverse(vecType<T, P> const & v);
-		
+
+	/// Returns the number of bits set to 1 in the binary representation of v.
+	///
+	/// @tparam genType Signed or unsigned integer scalar or vector types.
+	///
+	/// @see <a href="http://www.opengl.org/sdk/docs/manglsl/xhtml/bitCount.xml">GLSL bitCount man page</a>
+	/// @see <a href="http://www.opengl.org/registry/doc/GLSLangSpec.4.20.8.pdf">GLSL 4.20.8 specification, section 8.8 Integer Functions</a>
+	///
+	/// @todo Clarify the declaration to specify that scalars are supported.
+	template <typename genType>
+	GLM_FUNC_DECL int bitCount(genType v);
+
 	/// Returns the number of bits set to 1 in the binary representation of value.
 	///
 	/// @tparam T Signed or unsigned integer scalar or vector types.
diff --git a/glm/detail/func_integer.inl b/glm/detail/func_integer.inl
index 13908e8b..e76b2384 100644
--- a/glm/detail/func_integer.inl
+++ b/glm/detail/func_integer.inl
@@ -66,6 +66,26 @@ namespace detail
 			return (v & Mask) << Shift | (v & (~Mask)) >> Shift;
 		}
 	};
+
+	template <bool EXEC = false> // Primary template, used when EXEC is false: the step does nothing.
+	struct compute_bitfieldBitCountStep
+	{
+		template <typename T, glm::precision P, template <class, glm::precision> class vecType>
+		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & v, T, T)
+		{
+			return v; // The two unnamed T arguments (mask and shift) are ignored; v is returned unchanged.
+		}
+	};
+
+	template <>
+	struct compute_bitfieldBitCountStep<true> // Specialization for EXEC == true: performs one summing step.
+	{
+		template <typename T, glm::precision P, template <class, glm::precision> class vecType>
+		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & v, T Mask, T Shift)
+		{
+			return (v & Mask) + ((v >> Shift) & Mask); // Adds the bits of v kept by Mask to the bits of (v >> Shift) kept by Mask.
+		}
+	};
 }//namespace detail
 
 	// uaddCarry
@@ -207,21 +227,24 @@ namespace detail
 	}
 
 	// bitCount
-	template <typename genIUType>
-	GLM_FUNC_QUALIFIER int bitCount(genIUType x)
+	template <typename genType>
+	GLM_FUNC_QUALIFIER int bitCount(genType x)
 	{
-		return bitCount(tvec1<genIUType>(x)).x;
+		return bitCount(glm::tvec1<genType, glm::defaultp>(x)).x; // Wraps x in a one-component vector, calls bitCount on it and returns the .x component.
 	}
 
-	template <typename T, precision P, template <typename, precision> class vecType>
+	template <typename T, glm::precision P, template <typename, glm::precision> class vecType>
 	GLM_FUNC_QUALIFIER vecType<int, P> bitCount(vecType<T, P> const & v)
 	{
-		GLM_STATIC_ASSERT(std::numeric_limits<T>::is_integer, "'bitCount' only accept integer values");
-
-		vecType<int, P> Count(0);
-		for(T i = 0, n = static_cast<T>(sizeof(T) * 8); i < n; ++i)
-			Count += vecType<int, P>((v >> i) & static_cast<T>(1));
-		return Count;
+		typedef typename glm::detail::make_unsigned<T>::type U; // 'typename' is mandatory: make_unsigned<T>::type is a dependent type.
+		vecType<U, P> x(*reinterpret_cast<vecType<U, P> const *>(&v)); // View the components through the unsigned counterpart of T.
+		x = detail::compute_bitfieldBitCountStep<sizeof(T) * 8 >=  2>::template call<U, P, vecType>(x, U(0x5555555555555555ull), static_cast<U>( 1)); // 'template' is mandatory: call is a dependent member template.
+		x = detail::compute_bitfieldBitCountStep<sizeof(T) * 8 >=  4>::template call<U, P, vecType>(x, U(0x3333333333333333ull), static_cast<U>( 2)); // 2-bit sums -> 4-bit sums
+		x = detail::compute_bitfieldBitCountStep<sizeof(T) * 8 >=  8>::template call<U, P, vecType>(x, U(0x0F0F0F0F0F0F0F0Full), static_cast<U>( 4)); // 4-bit sums -> 8-bit sums
+		x = detail::compute_bitfieldBitCountStep<sizeof(T) * 8 >= 16>::template call<U, P, vecType>(x, U(0x00FF00FF00FF00FFull), static_cast<U>( 8)); // step compiled to a no-op when T is narrower than 16 bits
+		x = detail::compute_bitfieldBitCountStep<sizeof(T) * 8 >= 32>::template call<U, P, vecType>(x, U(0x0000FFFF0000FFFFull), static_cast<U>(16)); // step compiled to a no-op when T is narrower than 32 bits
+		x = detail::compute_bitfieldBitCountStep<sizeof(T) * 8 >= 64>::template call<U, P, vecType>(x, U(0x00000000FFFFFFFFull), static_cast<U>(32)); // step compiled to a no-op when T is narrower than 64 bits
+		return vecType<int, P>(x); // Convert the per-component counts to the int vector the interface returns.
 	}
 
 	// findLSB
diff --git a/glm/detail/type_int.hpp b/glm/detail/type_int.hpp
index fcf99a3f..d217c747 100644
--- a/glm/detail/type_int.hpp
+++ b/glm/detail/type_int.hpp
@@ -83,6 +83,110 @@ namespace detail
 	typedef unsigned int					lowp_uint_t;
 	typedef unsigned int					mediump_uint_t;
 	typedef unsigned int					highp_uint_t;
+
+	template <typename genType>
+	struct make_signed // Primary template: has no 'type' member; only the specializations below define one.
+	{};
+
+	template <>
+	struct make_signed<int8> // int8 -> int8
+	{
+		typedef int8 type;
+	};
+
+	template <>
+	struct make_signed<uint8> // uint8 -> int8
+	{
+		typedef int8 type;
+	};
+
+	template <>
+	struct make_signed<int16> // int16 -> int16
+	{
+		typedef int16 type;
+	};
+
+	template <>
+	struct make_signed<uint16> // uint16 -> int16
+	{
+		typedef int16 type;
+	};
+
+	template <>
+	struct make_signed<int32> // int32 -> int32
+	{
+		typedef int32 type;
+	};
+
+	template <>
+	struct make_signed<uint32> // uint32 -> int32
+	{
+		typedef int32 type;
+	};
+
+	template <>
+	struct make_signed<int64> // int64 -> int64
+	{
+		typedef int64 type;
+	};
+
+	template <>
+	struct make_signed<uint64> // uint64 -> int64
+	{
+		typedef int64 type;
+	};
+
+	template <typename genType>
+	struct make_unsigned // Primary template: has no 'type' member; only the specializations below define one.
+	{};
+
+	template <>
+	struct make_unsigned<int8> // int8 -> uint8
+	{
+		typedef uint8 type;
+	};
+
+	template <>
+	struct make_unsigned<uint8> // uint8 -> uint8
+	{
+		typedef uint8 type;
+	};
+
+	template <>
+	struct make_unsigned<int16> // int16 -> uint16
+	{
+		typedef uint16 type;
+	};
+
+	template <>
+	struct make_unsigned<uint16> // uint16 -> uint16
+	{
+		typedef uint16 type;
+	};
+
+	template <>
+	struct make_unsigned<int32> // int32 -> uint32
+	{
+		typedef uint32 type;
+	};
+
+	template <>
+	struct make_unsigned<uint32> // uint32 -> uint32
+	{
+		typedef uint32 type;
+	};
+
+	template <>
+	struct make_unsigned<int64> // int64 -> uint64
+	{
+		typedef uint64 type;
+	};
+
+	template <>
+	struct make_unsigned<uint64> // uint64 -> uint64
+	{
+		typedef uint64 type;
+	};
 }//namespace detail
 
 	typedef detail::int8					int8;
diff --git a/readme.txt b/readme.txt
index d9a90e99..a2eee497 100644
--- a/readme.txt
+++ b/readme.txt
@@ -80,7 +80,7 @@ GLM 0.9.6.0: 2014-XX-XX
 - Added GTC_bitfield extension, promoted GTX_bit
 - Added GTC_integer extension, promoted GTX_bit
 - Fixed bad matrix-vector multiple performance with Cuda #257, #258
-- Optimized bitfieldReverse function
+- Optimized bitfieldReverse and bitCount functions
 
 ================================================================================
 GLM 0.9.5.4: 2014-06-21
diff --git a/test/core/core_func_integer.cpp b/test/core/core_func_integer.cpp
index 8a765c5e..5493c4bf 100644
--- a/test/core/core_func_integer.cpp
+++ b/test/core/core_func_integer.cpp
@@ -858,7 +858,7 @@ namespace findMSB
 		int Error(0);
 
 		Error += perf_950();
-		Error += perf_ops();
+		//Error += perf_ops();
 
 		return Error;
 	}
@@ -1190,17 +1190,44 @@ namespace bitCount
 		return Count;
 	}
 
-	template <typename T>
-	inline int bitCount_bits(T v)
+	template <bool EXEC = false> // Primary template, used when EXEC is false: the step does nothing.
+	struct compute_bitfieldBitCountStep
 	{
-		GLM_STATIC_ASSERT(std::numeric_limits<T>::is_integer, "'bitCount' only accept integer values");
-
-		int Count(0);
-		for(T i = 0, n = static_cast<T>(sizeof(T) * 8); i < n; ++i)
+		template <typename T, glm::precision P, template <class, glm::precision> class vecType>
+		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & v, T, T)
 		{
-			Count += static_cast<int>((v >> i) & static_cast<T>(1));
+			return v; // The two unnamed T arguments (mask and shift) are ignored; v is returned unchanged.
 		}
-		return Count;
+	};
+
+	template <>
+	struct compute_bitfieldBitCountStep<true> // Specialization for EXEC == true: performs one summing step.
+	{
+		template <typename T, glm::precision P, template <class, glm::precision> class vecType>
+		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & v, T Mask, T Shift)
+		{
+			return (v & Mask) + ((v >> Shift) & Mask); // Adds the bits of v kept by Mask to the bits of (v >> Shift) kept by Mask.
+		}
+	};
+
+	template <typename T, glm::precision P, template <typename, glm::precision> class vecType>
+	GLM_FUNC_QUALIFIER vecType<int, P> bitCount_bitfield(vecType<T, P> const & v)
+	{
+		typedef typename glm::detail::make_unsigned<T>::type U; // 'typename' is mandatory: make_unsigned<T>::type is a dependent type.
+		vecType<U, P> x(*reinterpret_cast<vecType<U, P> const *>(&v)); // View the components through the unsigned counterpart of T.
+		x = compute_bitfieldBitCountStep<sizeof(T) * 8 >=  2>::template call<U, P, vecType>(x, U(0x5555555555555555ull), static_cast<U>( 1)); // 'template' is mandatory: call is a dependent member template.
+		x = compute_bitfieldBitCountStep<sizeof(T) * 8 >=  4>::template call<U, P, vecType>(x, U(0x3333333333333333ull), static_cast<U>( 2)); // 2-bit sums -> 4-bit sums
+		x = compute_bitfieldBitCountStep<sizeof(T) * 8 >=  8>::template call<U, P, vecType>(x, U(0x0F0F0F0F0F0F0F0Full), static_cast<U>( 4)); // 4-bit sums -> 8-bit sums
+		x = compute_bitfieldBitCountStep<sizeof(T) * 8 >= 16>::template call<U, P, vecType>(x, U(0x00FF00FF00FF00FFull), static_cast<U>( 8)); // step compiled to a no-op when T is narrower than 16 bits
+		x = compute_bitfieldBitCountStep<sizeof(T) * 8 >= 32>::template call<U, P, vecType>(x, U(0x0000FFFF0000FFFFull), static_cast<U>(16)); // step compiled to a no-op when T is narrower than 32 bits
+		x = compute_bitfieldBitCountStep<sizeof(T) * 8 >= 64>::template call<U, P, vecType>(x, U(0x00000000FFFFFFFFull), static_cast<U>(32)); // step compiled to a no-op when T is narrower than 64 bits
+		return vecType<int, P>(x); // Convert the per-component counts to an int vector.
+	}
+
+	template <typename genType>
+	GLM_FUNC_QUALIFIER int bitCount_bitfield(genType x)
+	{
+		return bitCount_bitfield(glm::tvec1<genType, glm::defaultp>(x)).x; // Wraps x in a one-component vector, calls bitCount_bitfield on it and returns the .x component.
 	}
 
 	int perf()
@@ -1249,15 +1276,18 @@ namespace bitCount
 
 		std::clock_t TimestampsE = std::clock();
 
-		std::clock_t TimeIf = TimestampsB - TimestampsA;
-		std::clock_t TimeVec = TimestampsC - TimestampsB;
-		std::clock_t TimeDefault = TimestampsD - TimestampsC;
-		std::clock_t TimeVec4 = TimestampsE - TimestampsD;
+		{
+			for(std::size_t i = 0, n = v.size(); i < n; ++i)
+				v[i] = bitCount_bitfield(static_cast<int>(i));
+		}
 
-		std::printf("bitCount - TimeIf %d\n", static_cast<unsigned int>(TimeIf));
-		std::printf("bitCount - TimeVec %d\n", static_cast<unsigned int>(TimeVec));
-		std::printf("bitCount - TimeDefault %d\n", static_cast<unsigned int>(TimeDefault));
-		std::printf("bitCount - TimeVec4 %d\n", static_cast<unsigned int>(TimeVec4));
+		std::clock_t TimestampsF = std::clock();
+
+		std::printf("bitCount - TimeIf %d\n", static_cast<unsigned int>(TimestampsB - TimestampsA));
+		std::printf("bitCount - TimeVec %d\n", static_cast<unsigned int>(TimestampsC - TimestampsB));
+		std::printf("bitCount - TimeDefault %d\n", static_cast<unsigned int>(TimestampsD - TimestampsC));
+		std::printf("bitCount - TimeVec4 %d\n", static_cast<unsigned int>(TimestampsE - TimestampsD));
+		std::printf("bitCount - bitfield %d\n", static_cast<unsigned int>(TimestampsF - TimestampsE));
 
 		return Error;
 	}
@@ -1268,8 +1298,16 @@ namespace bitCount
 
 		for(std::size_t i = 0, n = sizeof(DataI32) / sizeof(type<int>); i < n; ++i)
 		{
-			int Result = glm::bitCount(DataI32[i].Value);
-			Error += DataI32[i].Return == Result ? 0 : 1;
+			int ResultA = glm::bitCount(DataI32[i].Value);
+			int ResultB = bitCount_if(DataI32[i].Value);
+			int ResultC = bitCount_vec(DataI32[i].Value);
+			int ResultE = bitCount_bitfield(DataI32[i].Value);
+
+			Error += DataI32[i].Return == ResultA ? 0 : 1;
+			Error += DataI32[i].Return == ResultB ? 0 : 1;
+			Error += DataI32[i].Return == ResultC ? 0 : 1;
+			Error += DataI32[i].Return == ResultE ? 0 : 1;
+
 			assert(!Error);
 		}
 
@@ -1281,6 +1319,8 @@ int main()
 {
 	int Error = 0;
 
+	Error += ::bitCount::test();
+	Error += ::bitCount::perf();
 	Error += ::bitfieldReverse::test();
 	Error += ::bitfieldReverse::perf();
 	Error += ::findMSB::test();
@@ -1292,8 +1332,6 @@ int main()
 	Error += ::usubBorrow::test();
 	Error += ::bitfieldInsert::test();
 	Error += ::bitfieldExtract::test();
-	Error += ::bitCount::test();
-	Error += ::bitCount::perf();
 
 	return Error;
 }
diff --git a/test/core/core_func_integer_bit_count.cpp b/test/core/core_func_integer_bit_count.cpp
index 370af34e..cc21b275 100644
--- a/test/core/core_func_integer_bit_count.cpp
+++ b/test/core/core_func_integer_bit_count.cpp
@@ -10,13 +10,14 @@ unsigned rotatel(unsigned x, int n) {
    return (x << n) | (x >> (32 - n));
 }
 
-int pop0(unsigned x) {
-   x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
-   x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
-   x = (x & 0x0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F);
-   x = (x & 0x00FF00FF) + ((x >> 8) & 0x00FF00FF);
-   x = (x & 0x0000FFFF) + ((x >>16) & 0x0000FFFF);
-   return x;
+int pop0(unsigned x) // Counts the bits set in x by summing progressively wider bit fields (masks cover 32 bits).
+{
+	x = (x & 0x55555555) + ((x >> 1) & 0x55555555); // adjacent single bits -> 2-bit sums
+	x = (x & 0x33333333) + ((x >> 2) & 0x33333333); // 2-bit sums -> 4-bit sums
+	x = (x & 0x0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F); // 4-bit sums -> 8-bit sums
+	x = (x & 0x00FF00FF) + ((x >> 8) & 0x00FF00FF); // 8-bit sums -> 16-bit sums
+	x = (x & 0x0000FFFF) + ((x >>16) & 0x0000FFFF); // 16-bit sums -> single total
+	return x; // implicitly converted from unsigned to the int return type
 }
 
 int pop1(unsigned x) {