From b2c6dc48d9189eb96f878aa6028aec245eadde85 Mon Sep 17 00:00:00 2001
From: RJ Ryan
Date: Tue, 20 Sep 2016 07:18:20 -0700
Subject: [PATCH] Add CUDA-specific std::complex specializations for
 scalar_sum_op, scalar_difference_op, scalar_product_op, and
 scalar_quotient_op.

---
 Eigen/Core                                         |  1 +
 Eigen/src/Core/arch/CUDA/Complex.h                 | 80 +++++++++++++++
 unsupported/test/CMakeLists.txt                    |  1 +
 .../cxx11_tensor_complex_cwise_ops_cuda.cu         | 97 +++++++++++++++++++
 4 files changed, 179 insertions(+)
 create mode 100644 Eigen/src/Core/arch/CUDA/Complex.h
 create mode 100644 unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu

diff --git a/Eigen/Core b/Eigen/Core
index 3ffd6220b..bf2479585 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -359,6 +359,7 @@ using std::ptrdiff_t;
 #include "src/Core/arch/ZVector/Complex.h"
 #endif
 
+#include "src/Core/arch/CUDA/Complex.h"
 // Half float support
 #include "src/Core/arch/CUDA/Half.h"
 #include "src/Core/arch/CUDA/PacketMathHalf.h"
diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
new file mode 100644
index 000000000..aa511a4b2
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -0,0 +1,80 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_CUDA_H
+#define EIGEN_COMPLEX_CUDA_H
+
+// clang-format off
+
+namespace Eigen {
+
+namespace internal {
+
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, clang does not treat them as device
+// functions and thus Eigen functors making use of these operators fail to
+// compile. Here, we manually specialize these functors for complex types when
+// building for CUDA to avoid non-constexpr methods.
+
+template<typename T> struct scalar_sum_op<std::complex<T> > {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) + numext::real(b),
+                           numext::imag(a) + numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_difference_op<std::complex<T> > {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) - numext::real(b),
+                           numext::imag(a) - numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T> >::HasMul
+  };
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    return std::complex<T>(a_real * b_real - a_imag * b_imag,
+                           a_real * b_imag + a_imag * b_real);
+  }
+};
+
+template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T> >::HasDiv
+  };
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
+    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
+                           (a_imag * b_real - a_real * b_imag) * norm);
+  }
+};
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_CUDA_H
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 714046809..9eac6ec73 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -226,6 +226,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
   ei_add_test(cxx11_tensor_complex_cuda)
+  ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
   ei_add_test(cxx11_tensor_reduction_cuda)
   ei_add_test(cxx11_tensor_argmax_cuda)
   ei_add_test(cxx11_tensor_cast_float16_cuda)
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
new file mode 100644
index 000000000..54c17ca28
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
@@ -0,0 +1,97 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename T>
+void test_cuda_complex_cwise_ops() {
+  const int kNumItems = 2;
+  std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>);
+
+  std::complex<T>* d_in1;
+  std::complex<T>* d_in2;
+  std::complex<T>* d_out;
+  cudaMalloc((void**)(&d_in1), complex_bytes);
+  cudaMalloc((void**)(&d_in2), complex_bytes);
+  cudaMalloc((void**)(&d_out), complex_bytes);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out(
+      d_out, kNumItems);
+
+  const std::complex<T> a(3.14f, 2.7f);
+  const std::complex<T> b(-10.6f, 1.4f);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(a);
+  gpu_in2.device(gpu_device) = gpu_in2.constant(b);
+
+  enum CwiseOp {
+    Add,
+    Sub,
+    Mul,
+    Div
+  };
+
+  Tensor<std::complex<T>, 1, 0, int> actual(2);
+  for (CwiseOp op : {Add, Sub, Mul, Div}) {
+    std::complex<T> expected;
+    switch (op) {
+      case Add:
+        gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+        expected = a + b;
+        break;
+      case Sub:
+        gpu_out.device(gpu_device) = gpu_in1 - gpu_in2;
+        expected = a - b;
+        break;
+      case Mul:
+        gpu_out.device(gpu_device) = gpu_in1 * gpu_in2;
+        expected = a * b;
+        break;
+      case Div:
+        gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
+        expected = a / b;
+        break;
+    }
+    assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
+                           gpu_device.stream()) == cudaSuccess);
+    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    for (int i = 0; i < kNumItems; ++i) {
+      VERIFY_IS_APPROX(actual(i), expected);
+    }
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out);
+}
+
+
+void test_cxx11_tensor_complex_cwise_ops()
+{
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
+}
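
For context, the restriction described in the Complex.h comment can be reproduced outside of Eigen. The sketch below is a hypothetical, stand-alone illustration (not part of this patch) of why the specializations rebuild each result from its real and imaginary parts: under clang's CUDA compilation model (and assuming C++14), constexpr functions such as the std::complex constructor and the real()/imag() accessors are implicitly callable from device code, while the non-constexpr std::complex arithmetic operators are host-only and fail to compile when used in a kernel.

// Hypothetical illustration, assuming clang CUDA semantics and C++14.
#include <complex>

template <typename T>
__device__ std::complex<T> device_complex_add(const std::complex<T>& a,
                                              const std::complex<T>& b) {
  // "return a + b;" would call the non-constexpr, host-only operator+ and
  // fail to compile in device code under clang. Rebuilding the result from
  // the real and imaginary parts only uses the constexpr accessors and the
  // constexpr constructor, so clang accepts it.
  return std::complex<T>(a.real() + b.real(), a.imag() + b.imag());
}

__global__ void add_kernel(const std::complex<float>* x,
                           const std::complex<float>* y,
                           std::complex<float>* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = device_complex_add(x[i], y[i]);
}

The functors added in Eigen/src/Core/arch/CUDA/Complex.h apply the same idea to scalar_sum_op, scalar_difference_op, scalar_product_op, and scalar_quotient_op, using numext::real/numext::imag, so that the element-wise complex tensor expressions exercised by the new test compile for the GPU.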