heffte/heffte__stock__vec__types_8h_source.html

 /*

     -- heFFTe --

        Univ. of Tennessee, Knoxville

        @date

 */


 #ifndef HEFFTE_STOCK_VEC_TYPES_H

 #define HEFFTE_STOCK_VEC_TYPES_H


 #include "heffte_config.h"


 #ifdef __AVX__

 #include <immintrin.h>

 #endif

 #include <complex>


 namespace heffte {

 namespace stock {

 //! \brief Alias of std::is_same that holds true when T, ignoring const/volatile, is float.
 template<typename T> using is_float = std::is_same<float, typename std::remove_cv<T>::type>;

 //! \brief Alias of std::is_same that holds true when T, ignoring const/volatile, is double.
 template<typename T> using is_double = std::is_same<double, typename std::remove_cv<T>::type>;

 //! \brief Alias of std::is_same that holds true when T, ignoring const/volatile, is std::complex<float>.
 template<typename T> using is_fcomplex = std::is_same<std::complex<float>, typename std::remove_cv<T>::type>;

 //! \brief Alias of std::is_same that holds true when T, ignoring const/volatile, is std::complex<double>.
 template<typename T> using is_dcomplex = std::is_same<std::complex<double>, typename std::remove_cv<T>::type>;

 //! \brief Struct determining whether a type is a real number, i.e., float or double.
 template<typename T>
 struct is_real {
     //! \brief True when either is_float or is_double holds for T.
     static constexpr bool value = (is_float<T>::value || is_double<T>::value);
 };

 //! \brief Struct determining whether a type is a complex number, i.e., std::complex of float or double.
 template<typename T>
 struct is_complex {
     //! \brief True when either is_fcomplex or is_dcomplex holds for T.
     static constexpr bool value = (is_fcomplex<T>::value || is_dcomplex<T>::value);
 };


 //! \brief Struct to retrieve the vector type associated with the number of elements stored "per unit".
 //!
 //! The primary template is empty; each supported (precision, length) pair is a specialization
 //! that defines the member alias \b type.
 template<typename T, int N> struct pack {};

 //! \brief Single-precision pack of length 1, stored as one std::complex<float>.
 template<> struct pack<float, 1> { using type = std::complex<float>; };

 //! \brief Double-precision pack of length 1, stored as one std::complex<double>.
 template<> struct pack<double, 1> { using type = std::complex<double>; };

 // Some simple operations that will be useful for vectorized types.
 // The generic templates below treat pack<F,L>::type as a std::complex<F>.

 //! \brief Returns a pack initialized from the scalar zero.
 template<typename F, int L>
 inline typename pack<F,L>::type mm_zero(){ return 0.0; }

 //! \brief Builds a pack from src[0] (real part) and src[1] (imaginary part).
 template<typename F, int L>
 inline typename pack<F,L>::type mm_load(F const *src) { return typename pack<F,L>::type {src[0], src[1]}; }

 //! \brief Writes the real part of src to dest[0] and the imaginary part to dest[1].
 template<typename F, int L>
 inline void mm_store(F *dest, typename pack<F,L>::type const &src) {
     dest[0] = src.real();
     dest[1] = src.imag();
 }

 //! \brief Builds a pack with real part x and imaginary part y.
 template<typename F, int L>
 inline typename pack<F,L>::type mm_pair_set(F x, F y) { return typename pack<F,L>::type(x, y); }

 //! \brief Builds a pack with both the real and the imaginary part equal to src.
 template<typename F, int L>
 inline typename pack<F,L>::type mm_set1(F src) { return typename pack<F,L>::type(src, src); }

 //! \brief Returns a copy of the complex number pointed to by src.
 template<typename F, int L>
 inline typename pack<F,L>::type mm_complex_load(std::complex<F> const *src) { return *src; }

 //! \brief Returns a copy of the complex number pointed to by src; the second (stride) argument is ignored here.
 template<typename F, int L>
 inline typename pack<F,L>::type mm_complex_load(std::complex<F> const *src, int) { return *src; }

 // Real basic arithmetic for the "none" case

 //! \brief Returns a + b (single precision).
 inline typename pack<float, 1>::type mm_add(typename pack<float, 1>::type const &a, typename pack<float, 1>::type const &b){ return a + b; }
 //! \brief Returns a + b (double precision).
 inline typename pack<double, 1>::type mm_add(typename pack<double, 1>::type const &a, typename pack<double, 1>::type const &b){ return a + b; }

 //! \brief Returns a - b (single precision).
 inline typename pack<float, 1>::type mm_sub(typename pack<float, 1>::type const &a, typename pack<float, 1>::type const &b){ return a - b; }
 //! \brief Returns a - b (double precision).
 inline typename pack<double, 1>::type mm_sub(typename pack<double, 1>::type const &a, typename pack<double, 1>::type const &b){ return a - b; }

 //! \brief Divides both parts of a by the real part of b; the imaginary part of b is not used (single precision).
 inline typename pack<float, 1>::type mm_div(typename pack<float, 1>::type const &a, typename pack<float, 1>::type const &b){ return a / b.real(); }
 //! \brief Divides both parts of a by the real part of b; the imaginary part of b is not used (double precision).
 inline typename pack<double, 1>::type mm_div(typename pack<double, 1>::type const &a, typename pack<double, 1>::type const &b){ return a / b.real(); }

 //! \brief Returns -a (single precision).
 inline typename pack<float, 1>::type mm_neg(typename pack<float, 1>::type const &a){ return -a; }
 //! \brief Returns -a (double precision).
 inline typename pack<double, 1>::type mm_neg(typename pack<double, 1>::type const &a){ return -a; }

 //! \brief Scales both parts of a by the real part of b; the imaginary part of b is not used (single precision).
 inline typename pack<float, 1>::type mm_mul(typename pack<float, 1>::type const &a, typename pack<float, 1>::type const &b){ return a * b.real(); }
 //! \brief Scales both parts of a by the real part of b; the imaginary part of b is not used (double precision).
 inline typename pack<double, 1>::type mm_mul(typename pack<double, 1>::type const &a, typename pack<double, 1>::type const &b){ return a * b.real(); }

 //! \brief Returns the complex product a * b (single precision).
 inline typename pack<float, 1>::type mm_complex_mul(typename pack<float, 1>::type const &a, typename pack<float, 1>::type const &b){ return a * b; }
 //! \brief Returns the complex product a * b (double precision).
 inline typename pack<double, 1>::type mm_complex_mul(typename pack<double, 1>::type const &a, typename pack<double, 1>::type const &b){ return a * b; }

 //! \brief Returns a * b + c using complex arithmetic (single precision).
 inline typename pack<float, 1>::type mm_complex_fmadd(typename pack<float, 1>::type const &a, typename pack<float, 1>::type const &b, typename pack<float, 1>::type const &c){ return a * b + c; }
 //! \brief Returns a * b + c using complex arithmetic (double precision).
 inline typename pack<double, 1>::type mm_complex_fmadd(typename pack<double, 1>::type const &a, typename pack<double, 1>::type const &b, typename pack<double, 1>::type const &c){ return a * b + c; }

 //! \brief Returns a * b - c using complex arithmetic (single precision).
 inline typename pack<float, 1>::type mm_complex_fmsub(typename pack<float, 1>::type const &a, typename pack<float, 1>::type const &b, typename pack<float, 1>::type const &c){ return a * b - c; }
 //! \brief Returns a * b - c using complex arithmetic (double precision).
 inline typename pack<double, 1>::type mm_complex_fmsub(typename pack<double, 1>::type const &a, typename pack<double, 1>::type const &b, typename pack<double, 1>::type const &c){ return a * b - c; }

 // Multiplication by i maps (re, im) to (-im, re) and multiplication by -i maps (re, im) to (im, -re).
 // The result is formed directly by swapping and negating instead of calling the general complex product,
 // which avoids multiplying by the zero part of the constant (0 * inf would yield NaN).

 //! \brief Returns a multiplied by the imaginary unit, i.e., (-imag(a), real(a)) (single precision).
 inline typename pack<float, 1>::type mm_complex_mul_i(typename pack<float, 1>::type const &a){ return std::complex<float>{-a.imag(), a.real()}; }
 //! \brief Returns a multiplied by the imaginary unit, i.e., (-imag(a), real(a)) (double precision).
 inline typename pack<double, 1>::type mm_complex_mul_i(typename pack<double, 1>::type const &a){ return std::complex<double>{-a.imag(), a.real()}; }

 //! \brief Returns a multiplied by minus the imaginary unit, i.e., (imag(a), -real(a)) (single precision).
 inline typename pack<float, 1>::type mm_complex_mul_neg_i(typename pack<float, 1>::type const &a){ return std::complex<float>{a.imag(), -a.real()}; }
 //! \brief Returns a multiplied by minus the imaginary unit, i.e., (imag(a), -real(a)) (double precision).
 inline typename pack<double, 1>::type mm_complex_mul_neg_i(typename pack<double, 1>::type const &a){ return std::complex<double>{a.imag(), -a.real()}; }

 //! \brief Returns the squared modulus of a, placed in both the real and the imaginary part (single precision).
 inline typename pack<float, 1>::type mm_complex_sq_mod(typename pack<float,1>::type const &a){
     float const sq = std::norm(a); // evaluated once, written to both parts
     return std::complex<float>{sq, sq};
 }
 //! \brief Returns the squared modulus of a, placed in both the real and the imaginary part (double precision).
 inline typename pack<double, 1>::type mm_complex_sq_mod(typename pack<double,1>::type const &a){
     double const sq = std::norm(a); // evaluated once, written to both parts
     return std::complex<double>{sq, sq};
 }

 //! \brief Returns the modulus of a, placed in both the real and the imaginary part (single precision).
 inline typename pack<float, 1>::type mm_complex_mod(typename pack<float,1>::type const &a){
     float const modulus = std::abs(a); // evaluated once, written to both parts
     return std::complex<float>{modulus, modulus};
 }
 //! \brief Returns the modulus of a, placed in both the real and the imaginary part (double precision).
 inline typename pack<double, 1>::type mm_complex_mod(typename pack<double,1>::type const &a){
     double const modulus = std::abs(a); // evaluated once, written to both parts
     return std::complex<double>{modulus, modulus};
 }

 //! \brief Returns the complex conjugate of a (single precision).
 inline typename pack<float, 1>::type mm_complex_conj(typename pack<float,1>::type const &a){ return std::conj(a); }
 //! \brief Returns the complex conjugate of a (double precision).
 inline typename pack<double, 1>::type mm_complex_conj(typename pack<double,1>::type const &a){ return std::conj(a); }

 //! \brief Returns the complex quotient a / b (single precision).
 inline typename pack<float, 1>::type mm_complex_div(typename pack<float, 1>::type const &a, typename pack<float, 1>::type const &b){ return a / b; }
 //! \brief Returns the complex quotient a / b (double precision).
 inline typename pack<double, 1>::type mm_complex_div(typename pack<double, 1>::type const &a, typename pack<double, 1>::type const &b){ return a / b; }


 /* Below is functionality for vector packs */


 #ifdef Heffte_ENABLE_AVX


 //! \brief Pack of 2 doubles, stored in a __m128d.
 template<>
 struct pack<double, 2> {
     using type = __m128d;
 };

 //! \brief Pack of 4 floats, stored in a __m128.
 template<>
 struct pack<float, 4> {
     using type = __m128;
 };

 //! \brief Pack of 4 doubles, stored in a __m256d.
 template<>
 struct pack<double, 4> {
     using type = __m256d;
 };

 //! \brief Pack of 8 floats, stored in a __m256.
 template<>
 struct pack<float, 8> {
     using type = __m256;
 };


 /* Specializations of the helper functions for pack<float, 4> */

 //! \brief Returns a pack<float, 4> with every entry equal to zero.
 template<>
 inline typename pack<float, 4>::type mm_zero<float, 4>() {
     return _mm_setzero_ps();
 }

 //! \brief Loads four consecutive floats starting at src, using the unaligned-load intrinsic.
 template<>
 inline typename pack<float, 4>::type mm_load<float, 4>(float const *src) {
     return _mm_loadu_ps(src);
 }

 //! \brief Stores the four floats of src starting at dest, using the unaligned-store intrinsic.
 template<>
 inline void mm_store<float, 4>(float *dest, pack<float, 4>::type const &src) {
     _mm_storeu_ps(dest, src);
 }

 //! \brief Returns the pack (x, y, x, y).
 template<>
 inline typename pack<float, 4>::type mm_pair_set<float, 4>(float x, float y) {
     return _mm_setr_ps(x, y, x, y);
 }

 //! \brief Returns a pack with all four entries equal to x.
 template<>
 inline typename pack<float, 4>::type mm_set1<float, 4>(float x) {
     return _mm_set1_ps(x);
 }

 //! \brief Packs src[0] and src[stride] as (real, imag, real, imag).
 template<>
 inline typename pack<float, 4>::type mm_complex_load<float, 4>(std::complex<float> const *src, int stride) {
     std::complex<float> const &first  = src[0];
     std::complex<float> const &second = src[stride];
     return _mm_setr_ps(first.real(), first.imag(), second.real(), second.imag());
 }

 //! \brief Packs src[0] and src[1]; forwards to the strided overload with stride 1.
 template<>
 inline typename pack<float, 4>::type mm_complex_load<float, 4>(std::complex<float> const *src) {
     return mm_complex_load<float, 4>(src, 1);
 }


 /* Specializations of the helper functions for pack<float, 8> */

 //! \brief Returns a pack<float, 8> with every entry equal to zero.
 template<>
 inline typename pack<float, 8>::type mm_zero<float, 8>() {
     return _mm256_setzero_ps();
 }

 //! \brief Loads eight consecutive floats starting at src, using the unaligned-load intrinsic.
 template<>
 inline typename pack<float, 8>::type mm_load<float, 8>(float const *src) {
     return _mm256_loadu_ps(src);
 }

 //! \brief Stores the eight floats of src starting at dest, using the unaligned-store intrinsic.
 template<>
 inline void mm_store<float, 8>(float *dest, pack<float, 8>::type const &src) {
     _mm256_storeu_ps(dest, src);
 }

 //! \brief Returns the pack (x, y, x, y, x, y, x, y).
 template<>
 inline typename pack<float, 8>::type mm_pair_set<float, 8>(float x, float y) {
     return _mm256_setr_ps(x, y, x, y, x, y, x, y);
 }

 //! \brief Returns a pack with all eight entries equal to x.
 template<>
 inline typename pack<float, 8>::type mm_set1<float, 8>(float x) {
     return _mm256_set1_ps(x);
 }

 //! \brief Packs src[0], src[stride], src[2*stride] and src[3*stride] as consecutive (real, imag) pairs.
 template<>
 inline typename pack<float, 8>::type mm_complex_load<float, 8>(std::complex<float> const *src, int stride) {
     std::complex<float> const &c0 = src[0 * stride];
     std::complex<float> const &c1 = src[1 * stride];
     std::complex<float> const &c2 = src[2 * stride];
     std::complex<float> const &c3 = src[3 * stride];
     return _mm256_setr_ps(c0.real(), c0.imag(), c1.real(), c1.imag(),
                           c2.real(), c2.imag(), c3.real(), c3.imag());
 }

 //! \brief Packs src[0] through src[3]; forwards to the strided overload with stride 1.
 template<>
 inline typename pack<float, 8>::type mm_complex_load<float, 8>(std::complex<float> const *src) {
     return mm_complex_load<float, 8>(src, 1);
 }


 /* Specializations of the helper functions for pack<double, 2> */

 //! \brief Returns a pack<double, 2> with both entries equal to zero.
 template<>
 inline typename pack<double, 2>::type mm_zero<double, 2>() {
     return _mm_setzero_pd();
 }

 //! \brief Loads two consecutive doubles starting at src, using the unaligned-load intrinsic.
 template<>
 inline typename pack<double, 2>::type mm_load<double, 2>(double const *src) {
     return _mm_loadu_pd(src);
 }

 //! \brief Stores the two doubles of src starting at dest, using the unaligned-store intrinsic.
 template<>
 inline void mm_store<double, 2>(double *dest, pack<double, 2>::type const &src) {
     _mm_storeu_pd(dest, src);
 }

 //! \brief Returns the pack (x, y).
 template<>
 inline typename pack<double, 2>::type mm_pair_set<double, 2>(double x, double y) {
     return _mm_setr_pd(x, y);
 }

 //! \brief Returns a pack with both entries equal to x.
 template<>
 inline typename pack<double, 2>::type mm_set1<double, 2>(double x) {
     return _mm_set1_pd(x);
 }

 //! \brief Packs src[0] as (real, imag); the pack holds a single complex number so the stride is unused.
 template<>
 inline typename pack<double, 2>::type mm_complex_load<double, 2>(std::complex<double> const *src, int) {
     std::complex<double> const &only = src[0];
     return _mm_setr_pd(only.real(), only.imag());
 }

 //! \brief Packs src[0]; forwards to the two-argument overload.
 template<>
 inline typename pack<double, 2>::type mm_complex_load<double, 2>(std::complex<double> const *src) {
     return mm_complex_load<double, 2>(src, 1);
 }


 /* Specializations of the helper functions for pack<double, 4> */

 //! \brief Returns a pack<double, 4> with every entry equal to zero.
 template<>
 inline typename pack<double, 4>::type mm_zero<double, 4>() {
     return _mm256_setzero_pd();
 }

 //! \brief Loads four consecutive doubles starting at src, using the unaligned-load intrinsic.
 template<>
 inline typename pack<double, 4>::type mm_load<double, 4>(double const *src) {
     return _mm256_loadu_pd(src);
 }

 //! \brief Stores the four doubles of src starting at dest, using the unaligned-store intrinsic.
 template<>
 inline void mm_store<double, 4>(double *dest, pack<double, 4>::type const &src) {
     _mm256_storeu_pd(dest, src);
 }

 //! \brief Returns the pack (x, y, x, y).
 template<>
 inline typename pack<double, 4>::type mm_pair_set<double, 4>(double x, double y) {
     return _mm256_setr_pd(x, y, x, y);
 }

 //! \brief Returns a pack with all four entries equal to x.
 template<>
 inline typename pack<double, 4>::type mm_set1<double, 4>(double x) {
     return _mm256_set1_pd(x);
 }

 //! \brief Packs src[0] and src[stride] as (real, imag, real, imag).
 template<>
 inline typename pack<double, 4>::type mm_complex_load<double, 4>(std::complex<double> const *src, int stride) {
     std::complex<double> const &first  = src[0];
     std::complex<double> const &second = src[stride];
     return _mm256_setr_pd(first.real(), first.imag(), second.real(), second.imag());
 }

 //! \brief Packs src[0] and src[1]; forwards to the strided overload with stride 1.
 template<>
 inline typename pack<double, 4>::type mm_complex_load<double, 4>(std::complex<double> const *src) {
     return mm_complex_load<double, 4>(src, 1);
 }


 /* Elementary operations for vector packs */


 /* Addition */


 //! \brief Entry-by-entry sum x + y of two pack<float, 4>.
 inline pack<float, 4>::type mm_add(pack<float, 4>::type const &x,pack<float, 4>::type const &y) { return _mm_add_ps(x, y); }

 //! \brief Entry-by-entry sum x + y of two pack<float, 8>.
 inline pack<float, 8>::type mm_add(pack<float, 8>::type const &x, pack<float, 8>::type const &y) { return _mm256_add_ps(x, y); }

 //! \brief Entry-by-entry sum x + y of two pack<double, 2>.
 inline pack<double, 2>::type mm_add(pack<double, 2>::type const &x, pack<double, 2>::type const &y) { return _mm_add_pd(x, y); }

 //! \brief Entry-by-entry sum x + y of two pack<double, 4>.
 inline pack<double, 4>::type mm_add(pack<double, 4>::type const &x, pack<double, 4>::type const &y) { return _mm256_add_pd(x, y); }


 /* Subtraction */


 //! \brief Entry-by-entry difference x - y of two pack<float, 4>.
 inline pack<float, 4>::type mm_sub(pack<float, 4>::type const &x,pack<float, 4>::type const &y) { return _mm_sub_ps(x, y); }

 //! \brief Entry-by-entry difference x - y of two pack<float, 8>.
 inline pack<float, 8>::type mm_sub(pack<float, 8>::type const &x, pack<float, 8>::type const &y) { return _mm256_sub_ps(x, y); }

 //! \brief Entry-by-entry difference x - y of two pack<double, 2>.
 inline pack<double, 2>::type mm_sub(pack<double, 2>::type const &x, pack<double, 2>::type const &y) { return _mm_sub_pd(x, y); }

 //! \brief Entry-by-entry difference x - y of two pack<double, 4>.
 inline pack<double, 4>::type mm_sub(pack<double, 4>::type const &x, pack<double, 4>::type const &y) { return _mm256_sub_pd(x, y); }


 /* Multiplication */


 //! \brief Entry-by-entry product x * y of two pack<float, 4> (not a complex product).
 inline pack<float, 4>::type mm_mul(pack<float, 4>::type const &x, pack<float, 4>::type const &y) { return _mm_mul_ps(x, y); }

 //! \brief Entry-by-entry product x * y of two pack<float, 8> (not a complex product).
 inline pack<float, 8>::type mm_mul(pack<float, 8>::type const &x, pack<float, 8>::type const &y) { return _mm256_mul_ps(x, y); }

 //! \brief Entry-by-entry product x * y of two pack<double, 2> (not a complex product).
 inline pack<double, 2>::type mm_mul(pack<double, 2>::type const &x, pack<double, 2>::type const &y) { return _mm_mul_pd(x, y); }

 //! \brief Entry-by-entry product x * y of two pack<double, 4> (not a complex product).
 inline pack<double, 4>::type mm_mul(pack<double, 4>::type const &x, pack<double, 4>::type const &y) { return _mm256_mul_pd(x, y); }


 /* Division */


 //! \brief Entry-by-entry quotient x / y of two pack<float, 4> (not a complex quotient).
 inline pack<float, 4>::type mm_div(pack<float, 4>::type const &x,pack<float, 4>::type const &y) { return _mm_div_ps(x, y); }

 //! \brief Entry-by-entry quotient x / y of two pack<float, 8> (not a complex quotient).
 inline pack<float, 8>::type mm_div(pack<float, 8>::type const &x, pack<float, 8>::type const &y) { return _mm256_div_ps(x, y); }

 //! \brief Entry-by-entry quotient x / y of two pack<double, 2> (not a complex quotient).
 inline pack<double, 2>::type mm_div(pack<double, 2>::type const &x, pack<double, 2>::type const &y) { return _mm_div_pd(x, y); }

 //! \brief Entry-by-entry quotient x / y of two pack<double, 4> (not a complex quotient).
 inline pack<double, 4>::type mm_div(pack<double, 4>::type const &x, pack<double, 4>::type const &y) { return _mm256_div_pd(x, y); }


 /* Negation */

 // Each overload flips the sign of every entry by XOR-ing with a pack filled with negative zero,
 // whose only set bit is the sign bit.

 //! \brief Returns -x for a pack<float, 4>.
 inline pack<float, 4>::type mm_neg(pack<float, 4>::type const &x) {
     pack<float, 4>::type const sign_bits = mm_set1<float, 4>(-0.f);
     return _mm_xor_ps(x, sign_bits);
 }

 //! \brief Returns -x for a pack<float, 8>.
 inline pack<float, 8>::type mm_neg(pack<float, 8>::type const &x) {
     pack<float, 8>::type const sign_bits = mm_set1<float, 8>(-0.f);
     return _mm256_xor_ps(x, sign_bits);
 }

 //! \brief Returns -x for a pack<double, 2>.
 inline pack<double, 2>::type mm_neg(pack<double, 2>::type const &x) {
     pack<double, 2>::type const sign_bits = mm_set1<double, 2>(-0.);
     return _mm_xor_pd(x, sign_bits);
 }

 //! \brief Returns -x for a pack<double, 4>.
 inline pack<double, 4>::type mm_neg(pack<double, 4>::type const &x) {
     pack<double, 4>::type const sign_bits = mm_set1<double, 4>(-0.);
     return _mm256_xor_pd(x, sign_bits);
 }


 /* Complex operations using vector packs */


 // Complex Multiplication


 // In every overload below, each complex entry of x is (a, b) and the matching entry of y is (c, d).
 // The product is (a*c - b*d, b*c + a*d). The fmaddsub intrinsic subtracts its third operand in the
 // even (real) slots and adds it in the odd (imaginary) slots.

 //! \brief Complex product of the entries of two pack<float, 4>.
 inline pack<float,4>::type mm_complex_mul(pack<float, 4>::type const &x, pack<float, 4>::type const &y) {
     pack<float,4>::type const y_real = _mm_permute_ps(y, 0b10100000); // (c, c)
     pack<float,4>::type const x_swap = _mm_permute_ps(x, 0b10110001); // (b, a)
     pack<float,4>::type const y_imag = _mm_permute_ps(y, 0b11110101); // (d, d)
     pack<float,4>::type const cross  = _mm_mul_ps(x_swap, y_imag);    // (b*d, a*d)
     return _mm_fmaddsub_ps(x, y_real, cross);
 }

 //! \brief Complex product of the entries of two pack<float, 8>.
 inline pack<float, 8>::type mm_complex_mul(pack<float, 8>::type const &x, pack<float, 8>::type const &y) {
     pack<float,8>::type const y_real = _mm256_permute_ps(y, 0b10100000); // (c, c)
     pack<float,8>::type const x_swap = _mm256_permute_ps(x, 0b10110001); // (b, a)
     pack<float,8>::type const y_imag = _mm256_permute_ps(y, 0b11110101); // (d, d)
     pack<float,8>::type const cross  = _mm256_mul_ps(x_swap, y_imag);    // (b*d, a*d)
     return _mm256_fmaddsub_ps(x, y_real, cross);
 }

 //! \brief Complex product of the entries of two pack<double, 2>.
 inline pack<double, 2>::type mm_complex_mul(pack<double, 2>::type const &x, pack<double, 2>::type const &y) {
     pack<double,2>::type const y_real = _mm_permute_pd(y, 0);    // (c, c)
     pack<double,2>::type const x_swap = _mm_permute_pd(x, 0b01); // (b, a)
     pack<double,2>::type const y_imag = _mm_permute_pd(y, 0b11); // (d, d)
     pack<double,2>::type const cross  = _mm_mul_pd(x_swap, y_imag); // (b*d, a*d)
     return _mm_fmaddsub_pd(x, y_real, cross);
 }

 //! \brief Complex product of the entries of two pack<double, 4>.
 inline pack<double, 4>::type mm_complex_mul(pack<double, 4>::type const &x, pack<double, 4>::type const &y) {
     pack<double,4>::type const y_real = _mm256_permute_pd(y, 0b0000); // (c, c)
     pack<double,4>::type const x_swap = _mm256_permute_pd(x, 0b0101); // (b, a)
     pack<double,4>::type const y_imag = _mm256_permute_pd(y, 0b1111); // (d, d)
     pack<double,4>::type const cross  = _mm256_mul_pd(x_swap, y_imag); // (b*d, a*d)
     return _mm256_fmaddsub_pd(x, y_real, cross);
 }


 // Fused multiply-add


 // In every overload below, each complex entry of x is (a, b), of y is (c, d) and of z is (e, f).
 // The first fmaddsub forms (b*d - e, a*d + f); the second one subtracts/adds that from x * (c, c),
 // giving (a*c - b*d + e, b*c + a*d + f), i.e., x * y + z.

 //! \brief Complex x * y + z on the entries of pack<float, 4>.
 inline pack<float,4>::type mm_complex_fmadd(pack<float, 4>::type const &x, pack<float, 4>::type const &y, pack<float, 4>::type const &z) {
     pack<float,4>::type const y_real = _mm_permute_ps(y, 0b10100000); // (c, c)
     pack<float,4>::type const x_swap = _mm_permute_ps(x, 0b10110001); // (b, a)
     pack<float,4>::type const y_imag = _mm_permute_ps(y, 0b11110101); // (d, d)
     pack<float,4>::type const cross  = _mm_fmaddsub_ps(x_swap, y_imag, z);
     return _mm_fmaddsub_ps(x, y_real, cross);
 }

 //! \brief Complex x * y + z on the entries of pack<float, 8>.
 inline pack<float, 8>::type mm_complex_fmadd(pack<float, 8>::type const &x, pack<float, 8>::type const &y, pack<float, 8>::type const &z) {
     pack<float,8>::type const y_real = _mm256_permute_ps(y, 0b10100000); // (c, c)
     pack<float,8>::type const x_swap = _mm256_permute_ps(x, 0b10110001); // (b, a)
     pack<float,8>::type const y_imag = _mm256_permute_ps(y, 0b11110101); // (d, d)
     pack<float,8>::type const cross  = _mm256_fmaddsub_ps(x_swap, y_imag, z);
     return _mm256_fmaddsub_ps(x, y_real, cross);
 }

 //! \brief Complex x * y + z on the entries of pack<double, 2>.
 inline pack<double, 2>::type mm_complex_fmadd(pack<double, 2>::type const &x, pack<double, 2>::type const &y, pack<double, 2>::type const &z) {
     pack<double,2>::type const y_real = _mm_permute_pd(y, 0);    // (c, c)
     pack<double,2>::type const x_swap = _mm_permute_pd(x, 0b01); // (b, a)
     pack<double,2>::type const y_imag = _mm_permute_pd(y, 0b11); // (d, d)
     pack<double,2>::type const cross  = _mm_fmaddsub_pd(x_swap, y_imag, z);
     return _mm_fmaddsub_pd(x, y_real, cross);
 }

 //! \brief Complex x * y + z on the entries of pack<double, 4>.
 inline pack<double, 4>::type mm_complex_fmadd(pack<double, 4>::type const &x, pack<double, 4>::type const &y, pack<double, 4>::type const &z) {
     pack<double,4>::type const y_real = _mm256_permute_pd(y, 0b0000); // (c, c)
     pack<double,4>::type const x_swap = _mm256_permute_pd(x, 0b0101); // (b, a)
     pack<double,4>::type const y_imag = _mm256_permute_pd(y, 0b1111); // (d, d)
     pack<double,4>::type const cross  = _mm256_fmaddsub_pd(x_swap, y_imag, z);
     return _mm256_fmaddsub_pd(x, y_real, cross);
 }


 // In every overload below, each complex entry of x is (a, b), of y is (c, d) and of z is (e, f).
 // The fmsubadd forms (b*d + e, a*d - f); the fmaddsub then subtracts/adds that from x * (c, c),
 // giving (a*c - b*d - e, b*c + a*d - f), i.e., x * y - z.

 //! \brief Complex x * y - z on the entries of pack<float, 4>.
 inline pack<float,4>::type mm_complex_fmsub(pack<float, 4>::type const &x, pack<float, 4>::type const &y, pack<float, 4>::type const &z) {
     pack<float,4>::type const y_real = _mm_permute_ps(y, 0b10100000); // (c, c)
     pack<float,4>::type const x_swap = _mm_permute_ps(x, 0b10110001); // (b, a)
     pack<float,4>::type const y_imag = _mm_permute_ps(y, 0b11110101); // (d, d)
     pack<float,4>::type const cross  = _mm_fmsubadd_ps(x_swap, y_imag, z);
     return _mm_fmaddsub_ps(x, y_real, cross);
 }

 //! \brief Complex x * y - z on the entries of pack<float, 8>.
 inline pack<float, 8>::type mm_complex_fmsub(pack<float, 8>::type const &x, pack<float, 8>::type const &y, pack<float, 8>::type const &z) {
     pack<float,8>::type const y_real = _mm256_permute_ps(y, 0b10100000); // (c, c)
     pack<float,8>::type const x_swap = _mm256_permute_ps(x, 0b10110001); // (b, a)
     pack<float,8>::type const y_imag = _mm256_permute_ps(y, 0b11110101); // (d, d)
     pack<float,8>::type const cross  = _mm256_fmsubadd_ps(x_swap, y_imag, z);
     return _mm256_fmaddsub_ps(x, y_real, cross);
 }

 //! \brief Complex x * y - z on the entries of pack<double, 2>.
 inline pack<double, 2>::type mm_complex_fmsub(pack<double, 2>::type const &x, pack<double, 2>::type const &y, pack<double, 2>::type const &z) {
     pack<double,2>::type const y_real = _mm_permute_pd(y, 0);    // (c, c)
     pack<double,2>::type const x_swap = _mm_permute_pd(x, 0b01); // (b, a)
     pack<double,2>::type const y_imag = _mm_permute_pd(y, 0b11); // (d, d)
     pack<double,2>::type const cross  = _mm_fmsubadd_pd(x_swap, y_imag, z);
     return _mm_fmaddsub_pd(x, y_real, cross);
 }

 //! \brief Complex x * y - z on the entries of pack<double, 4>.
 inline pack<double, 4>::type mm_complex_fmsub(pack<double, 4>::type const &x, pack<double, 4>::type const &y, pack<double, 4>::type const &z) {
     pack<double,4>::type const y_real = _mm256_permute_pd(y, 0b0000); // (c, c)
     pack<double,4>::type const x_swap = _mm256_permute_pd(x, 0b0101); // (b, a)
     pack<double,4>::type const y_imag = _mm256_permute_pd(y, 0b1111); // (d, d)
     pack<double,4>::type const cross  = _mm256_fmsubadd_pd(x_swap, y_imag, z);
     return _mm256_fmaddsub_pd(x, y_real, cross);
 }


 // Squared modulus of the complex numbers in a pack


 // Each overload returns, for every complex entry (a, b) of x, the value a*a + b*b written to both
 // the real and the imaginary slot of that entry.

 //! \brief Squared moduli of the complex numbers in a pack<float, 4>.
 inline pack<float, 4>::type mm_complex_sq_mod(pack<float, 4>::type const &x) {
     // one dot product per complex number; each writes only its own two slots and zeros the rest
     pack<float, 4>::type const second_pair = _mm_dp_ps(x, x, 0b11001100);
     pack<float, 4>::type const first_pair  = _mm_dp_ps(x, x, 0b00110011);
     return _mm_or_ps(second_pair, first_pair);
 }

 //! \brief Squared moduli of the complex numbers in a pack<float, 8>.
 inline pack<float, 8>::type mm_complex_sq_mod(pack<float, 8>::type const &x) {
     // same scheme as the pack<float, 4> overload, using the 256-bit dot product intrinsic
     pack<float, 8>::type const second_pair = _mm256_dp_ps(x, x, 0b11001100);
     pack<float, 8>::type const first_pair  = _mm256_dp_ps(x, x, 0b00110011);
     return _mm256_or_ps(second_pair, first_pair);
 }

 //! \brief Squared modulus of the complex number in a pack<double, 2>.
 inline pack<double, 2>::type mm_complex_sq_mod(pack<double, 2>::type const &x) {
     pack<double, 2>::type const dot = _mm_dp_pd(x, x, 0b11111111);
     return dot;
 }

 //! \brief Squared moduli of the complex numbers in a pack<double, 4>.
 inline pack<double, 4>::type mm_complex_sq_mod(pack<double, 4>::type const &x) {
     pack<double, 4>::type const squares = _mm256_mul_pd(x, x);
     // horizontal add of adjacent entries, so each pair holds its own sum twice
     return _mm256_hadd_pd(squares, squares);
 }


 // Moduli (with square root) of complex numbers


 // Each overload takes the entry-by-entry square root of mm_complex_sq_mod(x).

 //! \brief Moduli of the complex numbers in a pack<float, 4>.
 inline pack<float, 4>::type mm_complex_mod(pack<float, 4>::type const &x) {
     pack<float, 4>::type const squared = mm_complex_sq_mod(x);
     return _mm_sqrt_ps(squared);
 }

 //! \brief Moduli of the complex numbers in a pack<float, 8>.
 inline pack<float, 8>::type mm_complex_mod(pack<float, 8>::type const &x) {
     pack<float, 8>::type const squared = mm_complex_sq_mod(x);
     return _mm256_sqrt_ps(squared);
 }

 //! \brief Modulus of the complex number in a pack<double, 2>.
 inline pack<double, 2>::type mm_complex_mod(pack<double, 2>::type const &x) {
     pack<double, 2>::type const squared = mm_complex_sq_mod(x);
     return _mm_sqrt_pd(squared);
 }

 //! \brief Moduli of the complex numbers in a pack<double, 4>.
 inline pack<double, 4>::type mm_complex_mod(pack<double, 4>::type const &x) {
     pack<double, 4>::type const squared = mm_complex_sq_mod(x);
     return _mm256_sqrt_pd(squared);
 }


 // Each overload keeps the even (real) entries of x and takes the odd (imaginary) entries from -x.

 //! \brief Complex conjugates of the numbers in a pack<float, 4>.
 inline pack<float, 4>::type mm_complex_conj(pack<float, 4>::type const &x) {
     pack<float, 4>::type const negated = mm_neg(x);
     return _mm_blend_ps(x, negated, 0b1010);
 }

 //! \brief Complex conjugates of the numbers in a pack<float, 8>.
 inline pack<float, 8>::type mm_complex_conj(pack<float, 8>::type const &x) {
     pack<float, 8>::type const negated = mm_neg(x);
     return _mm256_blend_ps(x, negated, 0b10101010);
 }

 //! \brief Complex conjugate of the number in a pack<double, 2>.
 inline pack<double, 2>::type mm_complex_conj(pack<double, 2>::type const &x) {
     pack<double, 2>::type const negated = mm_neg(x);
     return _mm_blend_pd(x, negated, 0b10);
 }

 //! \brief Complex conjugates of the numbers in a pack<double, 4>.
 inline pack<double, 4>::type mm_complex_conj(pack<double, 4>::type const &x) {
     pack<double, 4>::type const negated = mm_neg(x);
     return _mm256_blend_pd(x, negated, 0b1010);
 }


 // Special operation when multiplying by i and -i

 // Multiplying (a, b) by i gives (-b, a): conjugate first to get (a, -b), then swap each pair.

 //! \brief Multiplies every complex number in a pack<float, 4> by i.
 inline pack<float, 4>::type mm_complex_mul_i(pack<float, 4>::type const &x) {
     pack<float, 4>::type const conjugated = mm_complex_conj(x);
     return _mm_permute_ps(conjugated, 0b10110001);
 }

 //! \brief Multiplies every complex number in a pack<float, 8> by i.
 inline pack<float, 8>::type mm_complex_mul_i(pack<float, 8>::type const &x) {
     pack<float, 8>::type const conjugated = mm_complex_conj(x);
     return _mm256_permute_ps(conjugated, 0b10110001);
 }

 //! \brief Multiplies the complex number in a pack<double, 2> by i.
 inline pack<double, 2>::type mm_complex_mul_i(pack<double, 2>::type const &x) {
     pack<double, 2>::type const conjugated = mm_complex_conj(x);
     return _mm_permute_pd(conjugated, 0b01);
 }

 //! \brief Multiplies every complex number in a pack<double, 4> by i.
 inline pack<double, 4>::type mm_complex_mul_i(pack<double, 4>::type const &x) {
     pack<double, 4>::type const conjugated = mm_complex_conj(x);
     return _mm256_permute_pd(conjugated, 0b0101);
 }


 // Multiplying (a, b) by -i gives (b, -a): swap each pair first to get (b, a), then conjugate.

 //! \brief Multiplies every complex number in a pack<float, 4> by -i.
 inline pack<float, 4>::type mm_complex_mul_neg_i(pack<float, 4>::type const &x) {
     pack<float, 4>::type const swapped = _mm_permute_ps(x, 0b10110001);
     return mm_complex_conj(swapped);
 }

 //! \brief Multiplies every complex number in a pack<float, 8> by -i.
 inline pack<float, 8>::type mm_complex_mul_neg_i(pack<float, 8>::type const &x) {
     pack<float, 8>::type const swapped = _mm256_permute_ps(x, 0b10110001);
     return mm_complex_conj(swapped);
 }

 //! \brief Multiplies the complex number in a pack<double, 2> by -i.
 inline pack<double, 2>::type mm_complex_mul_neg_i(pack<double, 2>::type const &x) {
     pack<double, 2>::type const swapped = _mm_permute_pd(x, 0b01);
     return mm_complex_conj(swapped);
 }

 //! \brief Multiplies every complex number in a pack<double, 4> by -i.
 inline pack<double, 4>::type mm_complex_mul_neg_i(pack<double, 4>::type const &x) {
     pack<double, 4>::type const swapped = _mm256_permute_pd(x, 0b0101);
     return mm_complex_conj(swapped);
 }


 // Complex division


 // Each overload computes x / y as (x * conj(y)) / |y|^2, where the last division is entry-by-entry.

 //! \brief Complex quotient of the entries of two pack<float, 4>.
 inline pack<float, 4>::type mm_complex_div(pack<float, 4>::type const &x, pack<float, 4>::type const &y) {
     pack<float, 4>::type const numerator   = mm_complex_mul(x, mm_complex_conj(y));
     pack<float, 4>::type const denominator = mm_complex_sq_mod(y);
     return _mm_div_ps(numerator, denominator);
 }

 //! \brief Complex quotient of the entries of two pack<float, 8>.
 inline pack<float, 8>::type mm_complex_div(pack<float, 8>::type const &x, pack<float, 8>::type const &y) {
     pack<float, 8>::type const numerator   = mm_complex_mul(x, mm_complex_conj(y));
     pack<float, 8>::type const denominator = mm_complex_sq_mod(y);
     return _mm256_div_ps(numerator, denominator);
 }

 //! \brief Complex quotient of the entries of two pack<double, 2>.
 inline pack<double, 2>::type mm_complex_div(pack<double, 2>::type const &x, pack<double, 2>::type const &y) {
     pack<double, 2>::type const numerator   = mm_complex_mul(x, mm_complex_conj(y));
     pack<double, 2>::type const denominator = mm_complex_sq_mod(y);
     return _mm_div_pd(numerator, denominator);
 }

 //! \brief Complex quotient of the entries of two pack<double, 4>.
 inline pack<double, 4>::type mm_complex_div(pack<double, 4>::type const &x, pack<double, 4>::type const &y) {
     pack<double, 4>::type const numerator   = mm_complex_mul(x, mm_complex_conj(y));
     pack<double, 4>::type const denominator = mm_complex_sq_mod(y);
     return _mm256_div_pd(numerator, denominator);
 }


 // Now all the implementations for types in AVX512 headers

 #ifdef Heffte_ENABLE_AVX512


 //! \brief Pack of 8 doubles, stored in a __m512d.
 template<>
 struct pack<double, 8> {
     using type = __m512d;
 };

 //! \brief Pack of 16 floats, stored in a __m512.
 template<>
 struct pack<float, 16> {
     using type = __m512;
 };


 /* Specializations of the helper functions for pack<float, 16> */

 //! \brief Returns a pack<float, 16> with every entry equal to zero.
 template<>
 inline typename pack<float, 16>::type mm_zero<float, 16>() {
     return _mm512_setzero_ps();
 }

 //! \brief Loads sixteen consecutive floats starting at src, using the unaligned-load intrinsic.
 template<>
 inline typename pack<float, 16>::type mm_load<float, 16>(float const *src) {
     return _mm512_loadu_ps(src);
 }

 //! \brief Stores the sixteen floats of src starting at dest, using the unaligned-store intrinsic.
 template<>
 inline void mm_store<float, 16>(float *dest, pack<float, 16>::type const &src) {
     _mm512_storeu_ps(dest, src);
 }

 //! \brief Returns the pack holding the pair (x, y) repeated eight times.
 template<>
 inline typename pack<float, 16>::type mm_pair_set<float, 16>(float x, float y) {
     return _mm512_setr_ps(x, y, x, y, x, y, x, y,
                           x, y, x, y, x, y, x, y);
 }

 //! \brief Returns a pack with all sixteen entries equal to x.
 template<>
 inline typename pack<float, 16>::type mm_set1<float, 16>(float x) {
     return _mm512_set1_ps(x);
 }

 //! \brief Packs the eight complex numbers src[k*stride], k = 0..7, as consecutive (real, imag) pairs.
 template<>
 inline typename pack<float, 16>::type mm_complex_load<float, 16>(std::complex<float> const *src, int stride) {
     return _mm512_setr_ps(src[0*stride].real(), src[0*stride].imag(),
                           src[1*stride].real(), src[1*stride].imag(),
                           src[2*stride].real(), src[2*stride].imag(),
                           src[3*stride].real(), src[3*stride].imag(),
                           src[4*stride].real(), src[4*stride].imag(),
                           src[5*stride].real(), src[5*stride].imag(),
                           src[6*stride].real(), src[6*stride].imag(),
                           src[7*stride].real(), src[7*stride].imag());
 }

 //! \brief Packs src[0] through src[7]; forwards to the strided overload with stride 1.
 template<>
 inline typename pack<float, 16>::type mm_complex_load<float, 16>(std::complex<float> const *src) {
     return mm_complex_load<float, 16>(src, 1);
 }


 /* Specializations of the helper functions for pack<double, 8> */

 //! \brief Returns a pack<double, 8> with every entry equal to zero.
 template<>
 inline typename pack<double, 8>::type mm_zero<double, 8>() {
     return _mm512_setzero_pd();
 }

 //! \brief Loads eight consecutive doubles starting at src, using the unaligned-load intrinsic.
 template<>
 inline typename pack<double, 8>::type mm_load<double, 8>(double const *src) {
     return _mm512_loadu_pd(src);
 }

 //! \brief Stores the eight doubles of src starting at dest, using the unaligned-store intrinsic.
 template<>
 inline void mm_store<double, 8>(double *dest, pack<double, 8>::type const &src) {
     _mm512_storeu_pd(dest, src);
 }

 //! \brief Returns the pack (x, y, x, y, x, y, x, y).
 template<>
 inline typename pack<double, 8>::type mm_pair_set<double, 8>(double x, double y) {
     return _mm512_setr_pd(x, y, x, y, x, y, x, y);
 }

 //! \brief Returns a pack with all eight entries equal to x.
 template<>
 inline typename pack<double, 8>::type mm_set1<double, 8>(double x) {
     return _mm512_set1_pd(x);
 }

 //! \brief Packs src[0], src[stride], src[2*stride] and src[3*stride] as consecutive (real, imag) pairs.
 template<>
 inline typename pack<double, 8>::type mm_complex_load<double, 8>(std::complex<double> const *src, int stride) {
     std::complex<double> const &c0 = src[0 * stride];
     std::complex<double> const &c1 = src[1 * stride];
     std::complex<double> const &c2 = src[2 * stride];
     std::complex<double> const &c3 = src[3 * stride];
     return _mm512_setr_pd(c0.real(), c0.imag(), c1.real(), c1.imag(),
                           c2.real(), c2.imag(), c3.real(), c3.imag());
 }

 //! \brief Packs src[0] through src[3]; forwards to the strided overload with stride 1.
 template<>
 inline typename pack<double, 8>::type mm_complex_load<double, 8>(std::complex<double> const *src) {
     return mm_complex_load<double, 8>(src, 1);
 }


 /* Elementary binary operations for vector packs */


 /* Addition */


 //! \brief Entry-by-entry sum x + y of two pack<float, 16>.
 inline pack<float, 16>::type mm_add(pack<float, 16>::type const &x, pack<float, 16>::type const &y) { return _mm512_add_ps(x, y); }

 //! \brief Entry-by-entry sum x + y of two pack<double, 8>.
 inline pack<double, 8>::type mm_add(pack<double, 8>::type const &x, pack<double, 8>::type const &y) { return _mm512_add_pd(x, y); }


 /* Subtraction */


 //! \brief Entry-by-entry difference x - y of two pack<float, 16>.
 inline pack<float, 16>::type mm_sub(pack<float, 16>::type const &x,pack<float, 16>::type const &y) { return _mm512_sub_ps(x, y); }

 //! \brief Entry-by-entry difference x - y of two pack<double, 8>.
 inline pack<double, 8>::type mm_sub(pack<double, 8>::type const &x, pack<double, 8>::type const &y) { return _mm512_sub_pd(x, y); }


 /* Multiplication */


 //! \brief Entry-by-entry product x * y of two pack<float, 16> (not a complex product).
 inline pack<float, 16>::type mm_mul(pack<float, 16>::type const &x, pack<float, 16>::type const &y) { return _mm512_mul_ps(x, y); }

 //! \brief Entry-by-entry product x * y of two pack<double, 8> (not a complex product).
 inline pack<double, 8>::type mm_mul(pack<double, 8>::type const &x, pack<double, 8>::type const &y) { return _mm512_mul_pd(x, y); }


 /* Division */


 //! \brief Entry-by-entry quotient x / y of two pack<float, 16> (not a complex quotient).
 inline pack<float, 16>::type mm_div(pack<float, 16>::type const &x,pack<float, 16>::type const &y) { return _mm512_div_ps(x, y); }

 //! \brief Entry-by-entry quotient x / y of two pack<double, 8> (not a complex quotient).
 inline pack<double, 8>::type mm_div(pack<double, 8>::type const &x, pack<double, 8>::type const &y) { return _mm512_div_pd(x, y); }


 /* Negation */

 // Each overload flips the sign of every entry by XOR-ing with a pack filled with negative zero,
 // whose only set bit is the sign bit.

 //! \brief Returns -x for a pack<float, 16>.
 inline pack<float, 16>::type mm_neg(pack<float, 16>::type const &x) {
     pack<float, 16>::type const sign_bits = mm_set1<float, 16>(-0.f);
     return _mm512_xor_ps(x, sign_bits);
 }

 //! \brief Returns -x for a pack<double, 8>.
 inline pack<double, 8>::type mm_neg(pack<double, 8>::type const &x) {
     // double literal; it has the same value as the float -0.f after promotion to double
     pack<double, 8>::type const sign_bits = mm_set1<double, 8>(-0.);
     return _mm512_xor_pd(x, sign_bits);
 }


 /* Complex operations using AVX512 vector packs */


 // Complex Multiplication


 // In both overloads, each complex entry of x is (a, b) and the matching entry of y is (c, d).
 // The product is (a*c - b*d, b*c + a*d). The fmaddsub intrinsic subtracts its third operand in the
 // even (real) slots and adds it in the odd (imaginary) slots.

 //! \brief Complex product of the entries of two pack<float, 16>.
 inline pack<float,16>::type mm_complex_mul(pack<float, 16>::type const &x, pack<float, 16>::type const &y) {
     pack<float, 16>::type const y_real = _mm512_permute_ps(y, 0b10100000); // (c, c)
     pack<float, 16>::type const x_swap = _mm512_permute_ps(x, 0b10110001); // (b, a)
     pack<float, 16>::type const y_imag = _mm512_permute_ps(y, 0b11110101); // (d, d)
     pack<float, 16>::type const cross  = _mm512_mul_ps(x_swap, y_imag);    // (b*d, a*d)
     return _mm512_fmaddsub_ps(x, y_real, cross);
 }

 //! \brief Complex product of the entries of two pack<double, 8>.
 inline pack<double, 8>::type mm_complex_mul(pack<double, 8>::type const &x, pack<double, 8>::type const &y) {
     pack<double, 8>::type const y_real = _mm512_permute_pd(y, 0b00000000); // (c, c)
     pack<double, 8>::type const x_swap = _mm512_permute_pd(x, 0b01010101); // (b, a)
     pack<double, 8>::type const y_imag = _mm512_permute_pd(y, 0b11111111); // (d, d)
     pack<double, 8>::type const cross  = _mm512_mul_pd(x_swap, y_imag);    // (b*d, a*d)
     return _mm512_fmaddsub_pd(x, y_real, cross);
 }


 // Complex fused-multiply add


 // In both overloads, each complex entry of x is (a, b), of y is (c, d) and of alpha is (e, f).
 // The first fmaddsub forms (b*d - e, a*d + f); the second one subtracts/adds that from x * (c, c),
 // giving (a*c - b*d + e, b*c + a*d + f), i.e., x * y + alpha.

 //! \brief Complex x * y + alpha on the entries of pack<float, 16>.
 inline pack<float,16>::type mm_complex_fmadd(pack<float, 16>::type const &x, pack<float, 16>::type const &y, pack<float, 16>::type const &alpha) {
     pack<float, 16>::type const y_real = _mm512_permute_ps(y, 0b10100000); // (c, c)
     pack<float, 16>::type const x_swap = _mm512_permute_ps(x, 0b10110001); // (b, a)
     pack<float, 16>::type const y_imag = _mm512_permute_ps(y, 0b11110101); // (d, d)
     pack<float, 16>::type const cross  = _mm512_fmaddsub_ps(x_swap, y_imag, alpha);
     return _mm512_fmaddsub_ps(x, y_real, cross);
 }

 //! \brief Complex x * y + alpha on the entries of pack<double, 8>.
 inline pack<double, 8>::type mm_complex_fmadd(pack<double, 8>::type const &x, pack<double, 8>::type const &y, pack<double, 8>::type const &alpha) {
     pack<double, 8>::type const y_real = _mm512_permute_pd(y, 0b00000000); // (c, c)
     pack<double, 8>::type const x_swap = _mm512_permute_pd(x, 0b01010101); // (b, a)
     pack<double, 8>::type const y_imag = _mm512_permute_pd(y, 0b11111111); // (d, d)
     pack<double, 8>::type const cross  = _mm512_fmaddsub_pd(x_swap, y_imag, alpha);
     return _mm512_fmaddsub_pd(x, y_real, cross);
 }


 // In both overloads, each complex entry of x is (a, b), of y is (c, d) and of alpha is (e, f).
 // The fmsubadd forms (b*d + e, a*d - f); the fmaddsub then subtracts/adds that from x * (c, c),
 // giving (a*c - b*d - e, b*c + a*d - f), i.e., x * y - alpha.

 //! \brief Complex x * y - alpha on the entries of pack<float, 16>.
 inline pack<float,16>::type mm_complex_fmsub(pack<float, 16>::type const &x, pack<float, 16>::type const &y, pack<float, 16>::type const &alpha) {
     pack<float, 16>::type const y_real = _mm512_permute_ps(y, 0b10100000); // (c, c)
     pack<float, 16>::type const x_swap = _mm512_permute_ps(x, 0b10110001); // (b, a)
     pack<float, 16>::type const y_imag = _mm512_permute_ps(y, 0b11110101); // (d, d)
     pack<float, 16>::type const cross  = _mm512_fmsubadd_ps(x_swap, y_imag, alpha);
     return _mm512_fmaddsub_ps(x, y_real, cross);
 }

 //! \brief Complex x * y - alpha on the entries of pack<double, 8>.
 inline pack<double, 8>::type mm_complex_fmsub(pack<double, 8>::type const &x, pack<double, 8>::type const &y, pack<double, 8>::type const &alpha) {
     pack<double, 8>::type const y_real = _mm512_permute_pd(y, 0b00000000); // (c, c)
     pack<double, 8>::type const x_swap = _mm512_permute_pd(x, 0b01010101); // (b, a)
     pack<double, 8>::type const y_imag = _mm512_permute_pd(y, 0b11111111); // (d, d)
     pack<double, 8>::type const cross  = _mm512_fmsubadd_pd(x_swap, y_imag, alpha);
     return _mm512_fmaddsub_pd(x, y_real, cross);
 }


 // Squared modulus of the complex numbers in a pack


 // Both overloads square every entry, swap the two entries of each (real, imag) pair, and add the
 // swapped copy, so each pair ends up holding a*a + b*b in both slots.

 //! \brief Squared moduli of the complex numbers in a pack<float, 16>.
 inline pack<float, 16>::type mm_complex_sq_mod(pack<float, 16>::type const &x) {
     pack<float, 16>::type const squares = mm_mul(x, x);
     pack<float, 16>::type const swapped = _mm512_permute_ps(squares, 0b10110001);
     return mm_add(squares, swapped);
 }

 //! \brief Squared moduli of the complex numbers in a pack<double, 8>.
 inline pack<double, 8>::type mm_complex_sq_mod(pack<double, 8>::type const &x) {
     pack<double, 8>::type const squares = mm_mul(x, x);
     pack<double, 8>::type const swapped = _mm512_permute_pd(squares, 0b01010101);
     return mm_add(squares, swapped);
 }


 // Moduli (with square root) of complex numbers


 // Both overloads take the entry-by-entry square root of mm_complex_sq_mod(x).

 //! \brief Moduli of the complex numbers in a pack<float, 16>.
 inline pack<float, 16>::type mm_complex_mod(pack<float, 16>::type const &x) {
     pack<float, 16>::type const squared = mm_complex_sq_mod(x);
     return _mm512_sqrt_ps(squared);
 }

 //! \brief Moduli of the complex numbers in a pack<double, 8>.
 inline pack<double, 8>::type mm_complex_mod(pack<double, 8>::type const &x) {
     pack<double, 8>::type const squared = mm_complex_sq_mod(x);
     return _mm512_sqrt_pd(squared);
 }


 // Conjugate complex numbers


 // Both overloads keep the even (real) entries of x and take the odd (imaginary) entries from -x;
 // a set bit in the mask selects the entry of the negated pack.

 //! \brief Complex conjugates of the numbers in a pack<float, 16>.
 inline pack<float, 16>::type mm_complex_conj(pack<float, 16>::type const &x) {
     pack<float, 16>::type const negated = mm_neg(x);
     return _mm512_mask_blend_ps(0b1010101010101010, x, negated);
 }

 //! \brief Complex conjugates of the numbers in a pack<double, 8>.
 inline pack<double, 8>::type mm_complex_conj(pack<double, 8>::type const &x) {
     pack<double, 8>::type const negated = mm_neg(x);
     return _mm512_mask_blend_pd(0b10101010, x, negated);
 }


 // Special operation when multiplying by i and -i

 // Multiplying (a, b) by i gives (-b, a): conjugate first to get (a, -b), then swap each pair.

 //! \brief Multiplies every complex number in a pack<float, 16> by i.
 inline pack<float, 16>::type mm_complex_mul_i(pack<float, 16>::type const &x) {
     pack<float, 16>::type const conjugated = mm_complex_conj(x);
     return _mm512_permute_ps(conjugated, 0b10110001);
 }

 //! \brief Multiplies every complex number in a pack<double, 8> by i.
 inline pack<double, 8>::type mm_complex_mul_i(pack<double, 8>::type const &x) {
     pack<double, 8>::type const conjugated = mm_complex_conj(x);
     return _mm512_permute_pd(conjugated, 0b01010101);
 }


 // Multiplying (a, b) by -i gives (b, -a): swap each pair first to get (b, a), then conjugate.

 //! \brief Multiplies every complex number in a pack<float, 16> by -i.
 inline pack<float, 16>::type mm_complex_mul_neg_i(pack<float, 16>::type const &x) {
     pack<float, 16>::type const swapped = _mm512_permute_ps(x, 0b10110001);
     return mm_complex_conj(swapped);
 }

 //! \brief Multiplies every complex number in a pack<double, 8> by -i.
 inline pack<double, 8>::type mm_complex_mul_neg_i(pack<double, 8>::type const &x) {
     pack<double, 8>::type const swapped = _mm512_permute_pd(x, 0b01010101);
     return mm_complex_conj(swapped);
 }


 // Complex division


 // Both overloads compute x / y as (x * conj(y)) / |y|^2, where the last division is entry-by-entry.

 //! \brief Complex quotient of the entries of two pack<float, 16>.
 inline pack<float, 16>::type mm_complex_div(pack<float, 16>::type const &x, pack<float, 16>::type const &y) {
     pack<float, 16>::type const numerator   = mm_complex_mul(x, mm_complex_conj(y));
     pack<float, 16>::type const denominator = mm_complex_sq_mod(y);
     return _mm512_div_ps(numerator, denominator);
 }

 //! \brief Complex quotient of the entries of two pack<double, 8>.
 inline pack<double, 8>::type mm_complex_div(pack<double, 8>::type const &x, pack<double, 8>::type const &y) {
     pack<double, 8>::type const numerator   = mm_complex_mul(x, mm_complex_conj(y));
     pack<double, 8>::type const denominator = mm_complex_sq_mod(y);
     return _mm512_div_pd(numerator, denominator);
 }


 #endif // Heffte_ENABLE_AVX512

 #endif // Heffte_ENABLE_AVX


 }

 }


 #endif // HEFFTE_STOCK_VEC_TYPES_H

heffte
Namespace containing all HeFFTe methods and classes.
Definition: heffte_backend_cuda.h:38

heffte::stock::is_complex
Struct determining whether a type is a complex number.
Definition: heffte_stock_vec_types.h:38

heffte::stock::is_real
Struct determining whether a type is a real number.
Definition: heffte_stock_vec_types.h:33

heffte::stock::pack
Struct to retrieve the vector type associated with the number of elements stored "per unit".
Definition: heffte_stock_vec_types.h:43