From 49fc1e3e84d036c43ffbac0128588d47c5a11f36 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 27 Mar 2009 14:41:46 +0000
Subject: [PATCH] add vectorization of sqrt for float

---
 Eigen/Core                                    |  2 +-
 Eigen/src/Array/Functors.h                    |  8 ++++++-
 Eigen/src/Core/GenericPacketMath.h            |  4 ++++
 ...nscendentalFunctions.h => MathFunctions.h} | 23 +++++++++++--------
 Eigen/src/Core/arch/SSE/PacketMath.h          |  3 ++-
 test/packetmath.cpp                           |  1 +
 6 files changed, 29 insertions(+), 12 deletions(-)
 rename Eigen/src/Core/arch/SSE/{TranscendentalFunctions.h => MathFunctions.h} (95%)
diff --git a/Eigen/Core b/Eigen/Core
index ad85b6595..904c6089b 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -102,7 +102,7 @@ namespace Eigen {
 
 #if defined EIGEN_VECTORIZE_SSE
   #include "src/Core/arch/SSE/PacketMath.h"
-  #include "src/Core/arch/SSE/TranscendentalFunctions.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_ALTIVEC
   #include "src/Core/arch/AltiVec/PacketMath.h"
 #endif
diff --git a/Eigen/src/Array/Functors.h b/Eigen/src/Array/Functors.h
index 9759ebf2a..f5607ba90 100644
--- a/Eigen/src/Array/Functors.h
+++ b/Eigen/src/Array/Functors.h
@@ -58,10 +58,16 @@ struct ei_functor_traits<ei_scalar_add_op<Scalar> >
   */
 template<typename Scalar> struct ei_scalar_sqrt_op EIGEN_EMPTY_STRUCT {
   inline const Scalar operator() (const Scalar& a) const { return ei_sqrt(a); }
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return ei_psqrt(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_sqrt_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
+{ enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = ei_packet_traits<Scalar>::HasSqrt
+  };
+};
 
 /** \internal
   *
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 8583faa4a..0b5144632 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -46,6 +46,7 @@ struct ei_default_packet_traits
     HasMax    = 1,
     
     HasDiv    = 0,
+    HasSqrt   = 0,
     HasExp    = 0,
     HasLog    = 0,
     HasPow    = 0,
@@ -192,6 +193,9 @@ template<typename Packet> inline Packet ei_pexp(Packet a) { return ei_exp(a); }
 /** \internal \returns the log of \a a (coeff-wise) */
 template<typename Packet> inline Packet ei_plog(Packet a) { return ei_log(a); }
 
+/** \internal \returns the square-root of \a a (coeff-wise) */
+template<typename Packet> inline Packet ei_psqrt(Packet a) { return ei_log(a); }
+
 /***************************************************************************
 * The following functions might not have to be overwritten for vectorized types
 ***************************************************************************/
diff --git a/Eigen/src/Core/arch/SSE/TranscendentalFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
similarity index 95%
rename from Eigen/src/Core/arch/SSE/TranscendentalFunctions.h
rename to Eigen/src/Core/arch/SSE/MathFunctions.h
index 0c8accc17..7df9dc659 100644
--- a/Eigen/src/Core/arch/SSE/TranscendentalFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -23,9 +23,9 @@
 // License and a copy of the GNU General Public License along with
 // Eigen. If not, see <http://www.gnu.org/licenses/>.
 
-/* The functions of this file come from Julien Pommier's sse math library.
- * which is itself inspired by Intel Approximate Math library, and based on the
- * corresponding algorithms of the cephes math library.
+/* The sin, cos, exp, and log functions of this file come from Julien Pommier's sse
+ * math library, which is itself inspired by Intel Approximate Math library,
+ * and based on the corresponding algorithms of the cephes math library.
  */
 
 /* Copyright (C) 2007  Julien Pommier
@@ -49,18 +49,16 @@
   (this is the zlib license)
 */
 
-#ifndef EIGEN_TRANSCENDENTAL_FUNCTIONS_SSE_H
-#define EIGEN_TRANSCENDENTAL_FUNCTIONS_SSE_H
+#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
+#define EIGEN_MATH_FUNCTIONS_SSE_H
 
 _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0);
 _EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
 /* the smallest non denormalized float number */
 _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
-// _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(mant_mask, 0x7f800000);
 _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
 
 _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
-// _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_sign_mask, ~0x80000000);
 
 _EIGEN_DECLARE_CONST_Packet4i(1, 1);
 _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
@@ -214,7 +212,6 @@ _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005);
 _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003);
 _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002);
 _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516); // 4 / M_PI
-_EIGEN_DECLARE_CONST_Packet4f(2pi, 2.*M_PI);
 
 template<> EIGEN_DONT_INLINE Packet4f ei_psin(Packet4f x)
 {
@@ -358,4 +355,12 @@ template<> Packet4f ei_pcos(Packet4f x)
   return _mm_xor_ps(y, sign_bit);
 }
 
-#endif // EIGEN_TRANSCENDENTAL_FUNCTIONS_SSE_H
+template<> Packet4f ei_psqrt(Packet4f _x)
+{
+  Packet4f half = ei_pmul(_x, ei_pset1(.5f));
+  Packet4f x = _mm_rsqrt_ps(_x);
+  x = ei_pmul(x, ei_psub(ei_pset1(1.5f), ei_pmul(half, ei_pmul(x,x))));
+  return ei_pmul(_x,x);
+}
+
+#endif // EIGEN_MATH_FUNCTIONS_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index d49479c16..cb8052a4b 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -61,7 +61,8 @@ template<> struct ei_packet_traits<float>  : ei_default_packet_traits
     HasSin  = 1,
     HasCos  = 1,
     HasLog  = 1,
-    HasExp  = 1
+    HasExp  = 1,
+    HasSqrt = 1
   };
 };
 template<> struct ei_packet_traits<double> : ei_default_packet_traits
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 858bbb9c3..250398241 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -227,6 +227,7 @@ template<typename Scalar> void packetmath_real()
     data2[i] = ei_random<Scalar>(0,1e6);
   }
   CHECK_CWISE1_IF(ei_packet_traits<Scalar>::HasLog, ei_log, ei_plog);
+  CHECK_CWISE1_IF(ei_packet_traits<Scalar>::HasSqrt, ei_sqrt, ei_psqrt);
 }
 
 void test_packetmath()