7 #ifndef HEFFTE_STOCK_VEC_TYPES_H
8 #define HEFFTE_STOCK_VEC_TYPES_H
10 #include "heffte_config.h"
13 #include <immintrin.h>
//! \brief Alias that is std::true_type when T, ignoring cv-qualifiers, is float.
template<typename T>
using is_float = std::is_same<float, typename std::remove_cv<T>::type>;

//! \brief Alias that is std::true_type when T, ignoring cv-qualifiers, is double.
template<typename T>
using is_double = std::is_same<double, typename std::remove_cv<T>::type>;

//! \brief Alias that is std::true_type when T, ignoring cv-qualifiers, is std::complex<float>.
template<typename T>
using is_fcomplex = std::is_same<std::complex<float>, typename std::remove_cv<T>::type>;

//! \brief Alias that is std::true_type when T, ignoring cv-qualifiers, is std::complex<double>.
template<typename T>
using is_dcomplex = std::is_same<std::complex<double>, typename std::remove_cv<T>::type>;
// Trait member: true when T is float or double (cv-qualifiers ignored by is_float / is_double).
// NOTE(review): the enclosing struct header is not visible in this listing — confirm its name.
34 static constexpr
bool value = is_float<T>::value || is_double<T>::value;
// Trait member: true when T is std::complex<float> or std::complex<double> (cv-qualifiers ignored).
// NOTE(review): the enclosing struct header is not visible in this listing — confirm its name.
39 static constexpr
bool value = is_fcomplex<T>::value || is_dcomplex<T>::value;
/*!
 * \brief Maps a scalar type T and a number N to the associated vector type.
 *
 * The primary template is deliberately empty: only the explicit
 * specializations provide the nested `type`.
 */
template<typename T, int N>
struct pack {};

//! \brief Single precision, one unit: a plain std::complex<float>.
template<>
struct pack<float, 1> {
    using type = std::complex<float>;
};

//! \brief Double precision, one unit: a plain std::complex<double>.
template<>
struct pack<double, 1> {
    using type = std::complex<double>;
};
55 template<
typename F,
int L>
61 template<
typename F,
int L>
62 inline typename pack<F,L>::type mm_load(F
const *src) {
return typename pack<F,L>::type {src[0], src[1]}; }
67 template<
typename F,
int L>
68 inline void mm_store(F *dest,
typename pack<F,L>::type
const &src) {
69 dest[0] = src.real(); dest[1] = src.imag();
75 template<
typename F,
int L>
76 inline typename pack<F,L>::type mm_pair_set(F x, F y) {
return typename pack<F,L>::type(x, y); }
81 template<
typename F,
int L>
82 inline typename pack<F,L>::type mm_set1(F src) {
return typename pack<F,L>::type(src,src); }
87 template<
typename F,
int L>
88 inline typename pack<F,L>::type mm_complex_load(std::complex<F>
const *src) {
return *src; }
93 template<
typename F,
int L>
94 inline typename pack<F,L>::type mm_complex_load(std::complex<F>
const *src,
int) {
return *src; }
99 inline typename pack<float, 1>::type mm_add(
typename pack<float, 1>::type
const &a,
typename pack<float, 1>::type
const &b){
return a + b; }
101 inline typename pack<double, 1>::type mm_add(
typename pack<double, 1>::type
const &a,
typename pack<double, 1>::type
const &b){
return a + b; }
103 inline typename pack<float, 1>::type mm_sub(
typename pack<float, 1>::type
const &a,
typename pack<float, 1>::type
const &b){
return a - b; }
105 inline typename pack<double, 1>::type mm_sub(
typename pack<double, 1>::type
const &a,
typename pack<double, 1>::type
const &b){
return a - b; }
107 inline typename pack<float, 1>::type mm_div(
typename pack<float, 1>::type
const &a,
typename pack<float, 1>::type
const &b){
return a / b.real(); }
109 inline typename pack<double, 1>::type mm_div(
typename pack<double, 1>::type
const &a,
typename pack<double, 1>::type
const &b){
return a / b.real(); }
111 inline typename pack<float, 1>::type mm_neg(
typename pack<float, 1>::type
const &a){
return -a; }
113 inline typename pack<double, 1>::type mm_neg(
typename pack<double, 1>::type
const &a){
return -a; }
115 inline typename pack<float, 1>::type mm_mul(
typename pack<float, 1>::type
const &a,
typename pack<float, 1>::type
const &b){
return a * b.real(); }
117 inline typename pack<double, 1>::type mm_mul(
typename pack<double, 1>::type
const &a,
typename pack<double, 1>::type
const &b){
return a * b.real(); }
119 inline typename pack<float, 1>::type mm_complex_mul(
typename pack<float, 1>::type
const &a,
typename pack<float, 1>::type
const &b){
return a * b; }
121 inline typename pack<double, 1>::type mm_complex_mul(
typename pack<double, 1>::type
const &a,
typename pack<double, 1>::type
const &b){
return a * b; }
123 inline typename pack<float, 1>::type mm_complex_fmadd(
typename pack<float, 1>::type
const &a,
typename pack<float, 1>::type
const &b,
typename pack<float, 1>::type
const &c){
return a * b + c; }
125 inline typename pack<double, 1>::type mm_complex_fmadd(
typename pack<double, 1>::type
const &a,
typename pack<double, 1>::type
const &b,
typename pack<double, 1>::type
const &c){
return a * b + c; }
127 inline typename pack<float, 1>::type mm_complex_fmsub(
typename pack<float, 1>::type
const &a,
typename pack<float, 1>::type
const &b,
typename pack<float, 1>::type
const &c){
return a * b - c; }
129 inline typename pack<double, 1>::type mm_complex_fmsub(
typename pack<double, 1>::type
const &a,
typename pack<double, 1>::type
const &b,
typename pack<double, 1>::type
const &c){
return a * b - c; }
131 inline typename pack<float, 1>::type mm_complex_mul_i(
typename pack<float, 1>::type
const &a){
return a * std::complex<float>{0.f,1.f}; }
133 inline typename pack<double, 1>::type mm_complex_mul_i(
typename pack<double, 1>::type
const &a){
return a * std::complex<double>{0.,1.}; }
135 inline typename pack<float, 1>::type mm_complex_mul_neg_i(
typename pack<float, 1>::type
const &a){
return a * std::complex<float>{0.f,-1.f}; }
137 inline typename pack<double, 1>::type mm_complex_mul_neg_i(
typename pack<double, 1>::type
const &a){
return a * std::complex<double>{0.,-1.}; }
139 inline typename pack<float, 1>::type mm_complex_sq_mod(
typename pack<float,1>::type
const &a){
return std::complex<float>{norm(a), norm(a)}; }
141 inline typename pack<double, 1>::type mm_complex_sq_mod(
typename pack<double,1>::type
const &a){
return std::complex<double>{norm(a), norm(a)}; }
143 inline typename pack<float, 1>::type mm_complex_mod(
typename pack<float,1>::type
const &a){
return std::complex<float>{std::abs(a), std::abs(a)}; }
145 inline typename pack<double, 1>::type mm_complex_mod(
typename pack<double,1>::type
const &a){
return std::complex<double>{std::abs(a), std::abs(a)}; }
147 inline typename pack<float, 1>::type mm_complex_conj(
typename pack<float,1>::type
const &a){
return conj(a); }
149 inline typename pack<double, 1>::type mm_complex_conj(
typename pack<double,1>::type
const &a){
return conj(a); }
151 inline typename pack<float, 1>::type mm_complex_div(
typename pack<float, 1>::type
const &a,
typename pack<float, 1>::type
const &b){
return a / b; }
153 inline typename pack<double, 1>::type mm_complex_div(
typename pack<double, 1>::type
const &a,
typename pack<double, 1>::type
const &b){
return a / b; }
159 #ifdef Heffte_ENABLE_AVX
162 template<>
struct pack<double, 2> {
using type = __m128d; };
164 template<>
struct pack<float, 4> {
using type = __m128; };
166 template<>
struct pack<double, 4> {
using type = __m256d; };
168 template<>
struct pack<float, 8> {
using type = __m256; };
176 inline typename pack<float, 4>::type mm_zero<float, 4>(){
return _mm_setzero_ps(); }
180 inline typename pack<float, 4>::type mm_load<float, 4>(
float const *src) {
return _mm_loadu_ps(src); }
184 inline void mm_store<float, 4>(
float *dest, pack<float, 4>::type
const &src) { _mm_storeu_ps(dest, src); }
188 inline typename pack<float, 4>::type mm_pair_set<float, 4>(
float x,
float y) {
return _mm_setr_ps(x, y, x, y); }
192 inline typename pack<float, 4>::type mm_set1<float,4> (
float x) {
return _mm_set1_ps(x); }
196 inline typename pack<float, 4>::type mm_complex_load<float,4>(std::complex<float>
const *src,
int stride) {
197 return _mm_setr_ps(src[0].real(), src[0].imag(), src[stride].real(), src[stride].imag());
201 inline typename pack<float, 4>::type mm_complex_load<float,4>(std::complex<float>
const *src) {
202 return mm_complex_load<float,4>(src, 1);
211 inline typename pack<float, 8>::type mm_zero<float, 8>(){
return _mm256_setzero_ps(); }
215 inline typename pack<float, 8>::type mm_load<float, 8>(
float const *src) {
return _mm256_loadu_ps(src); }
219 inline void mm_store<float, 8>(
float *dest, pack<float, 8>::type
const &src) { _mm256_storeu_ps(dest, src); }
223 inline typename pack<float, 8>::type mm_pair_set<float, 8>(
float x,
float y) {
return _mm256_setr_ps(x, y, x, y, x, y, x, y); }
227 inline typename pack<float, 8>::type mm_set1<float,8> (
float x) {
return _mm256_set1_ps(x); }
231 inline typename pack<float, 8>::type mm_complex_load<float, 8>(std::complex<float>
const *src,
int stride) {
232 return _mm256_setr_ps(src[0*stride].real(), src[0*stride].imag(),
233 src[1*stride].real(), src[1*stride].imag(),
234 src[2*stride].real(), src[2*stride].imag(),
235 src[3*stride].real(), src[3*stride].imag());
239 inline typename pack<float, 8>::type mm_complex_load<float,8>(std::complex<float>
const *src) {
240 return mm_complex_load<float,8>(src, 1);
249 inline typename pack<double, 2>::type mm_zero<double, 2>(){
return _mm_setzero_pd(); }
253 inline typename pack<double, 2>::type mm_load<double, 2>(
double const *src) {
return _mm_loadu_pd(src); }
257 inline void mm_store<double, 2>(
double *dest, pack<double, 2>::type
const &src) { _mm_storeu_pd(dest, src); }
261 inline typename pack<double, 2>::type mm_pair_set<double, 2>(
double x,
double y) {
return _mm_setr_pd(x, y); }
265 inline typename pack<double, 2>::type mm_set1<double, 2>(
double x) {
return _mm_set1_pd(x); }
269 inline typename pack<double,2>::type mm_complex_load<double,2>(std::complex<double>
const *src,
int) {
270 return _mm_setr_pd(src[0].real(), src[0].imag());
273 inline typename pack<double,2>::type mm_complex_load<double,2>(std::complex<double>
const *src) {
274 return mm_complex_load<double,2>(src, 1);
283 inline typename pack<double, 4>::type mm_zero<double, 4>(){
return _mm256_setzero_pd(); }
287 inline typename pack<double, 4>::type mm_load<double, 4>(
double const *src) {
return _mm256_loadu_pd(src); }
291 inline void mm_store<double, 4>(
double *dest, pack<double, 4>::type
const &src) { _mm256_storeu_pd(dest, src); }
295 inline typename pack<double, 4>::type mm_pair_set<double, 4>(
double x,
double y) {
return _mm256_setr_pd(x, y, x, y); }
299 inline typename pack<double, 4>::type mm_set1<double, 4>(
double x) {
return _mm256_set1_pd(x); }
303 inline typename pack<double,4>::type mm_complex_load<double,4>(std::complex<double>
const *src,
int stride) {
304 return _mm256_setr_pd(src[0].real(), src[0].imag(), src[stride].real(), src[stride].imag());
308 inline typename pack<double,4>::type mm_complex_load<double,4>(std::complex<double>
const *src) {
309 return mm_complex_load<double,4>(src, 1);
319 inline pack<float, 4>::type mm_add(pack<float, 4>::type
const &x,pack<float, 4>::type
const &y) {
320 return _mm_add_ps(x, y);
324 inline pack<float, 8>::type mm_add(pack<float, 8>::type
const &x, pack<float, 8>::type
const &y) {
325 return _mm256_add_ps(x, y);
329 inline pack<double, 2>::type mm_add(pack<double, 2>::type
const &x, pack<double, 2>::type
const &y) {
330 return _mm_add_pd(x, y);
334 inline pack<double, 4>::type mm_add(pack<double, 4>::type
const &x, pack<double, 4>::type
const &y) {
335 return _mm256_add_pd(x, y);
341 inline pack<float, 4>::type mm_sub(pack<float, 4>::type
const &x,pack<float, 4>::type
const &y) {
342 return _mm_sub_ps(x, y);
346 inline pack<float, 8>::type mm_sub(pack<float, 8>::type
const &x, pack<float, 8>::type
const &y) {
347 return _mm256_sub_ps(x, y);
351 inline pack<double, 2>::type mm_sub(pack<double, 2>::type
const &x, pack<double, 2>::type
const &y) {
352 return _mm_sub_pd(x, y);
356 inline pack<double, 4>::type mm_sub(pack<double, 4>::type
const &x, pack<double, 4>::type
const &y) {
357 return _mm256_sub_pd(x, y);
363 inline pack<float, 4>::type mm_mul(pack<float, 4>::type
const &x, pack<float, 4>::type
const &y) {
364 return _mm_mul_ps(x, y);
368 inline pack<float, 8>::type mm_mul(pack<float, 8>::type
const &x, pack<float, 8>::type
const &y) {
369 return _mm256_mul_ps(x, y);
373 inline pack<double, 2>::type mm_mul(pack<double, 2>::type
const &x, pack<double, 2>::type
const &y) {
374 return _mm_mul_pd(x, y);
378 inline pack<double, 4>::type mm_mul(pack<double, 4>::type
const &x, pack<double, 4>::type
const &y) {
379 return _mm256_mul_pd(x, y);
385 inline pack<float, 4>::type mm_div(pack<float, 4>::type
const &x,pack<float, 4>::type
const &y) {
386 return _mm_div_ps(x, y);
390 inline pack<float, 8>::type mm_div(pack<float, 8>::type
const &x, pack<float, 8>::type
const &y) {
391 return _mm256_div_ps(x, y);
395 inline pack<double, 2>::type mm_div(pack<double, 2>::type
const &x, pack<double, 2>::type
const &y) {
396 return _mm_div_pd(x, y);
400 inline pack<double, 4>::type mm_div(pack<double, 4>::type
const &x, pack<double, 4>::type
const &y) {
401 return _mm256_div_pd(x, y);
407 inline pack<float, 4>::type mm_neg(pack<float, 4>::type
const &x) {
408 return _mm_xor_ps(x, (mm_set1<float, 4>(-0.f)));
412 inline pack<float, 8>::type mm_neg(pack<float, 8>::type
const &x) {
413 return _mm256_xor_ps(x, (mm_set1<float, 8>(-0.f)));
417 inline pack<double, 2>::type mm_neg(pack<double, 2>::type
const &x) {
418 return _mm_xor_pd(x, (mm_set1<double, 2>(-0.)));
422 inline pack<double, 4>::type mm_neg(pack<double, 4>::type
const &x) {
423 return _mm256_xor_pd(x, (mm_set1<double, 4>(-0.)));
433 inline pack<float,4>::type mm_complex_mul(pack<float, 4>::type
const &x, pack<float, 4>::type
const &y) {
434 typename pack<float,4>::type cc = _mm_permute_ps(y, 0b10100000);
435 typename pack<float,4>::type ba = _mm_permute_ps(x, 0b10110001);
436 typename pack<float,4>::type dd = _mm_permute_ps(y, 0b11110101);
437 typename pack<float,4>::type dba = _mm_mul_ps(ba, dd);
438 typename pack<float,4>::type mult = _mm_fmaddsub_ps(x, cc, dba);
443 inline pack<float, 8>::type mm_complex_mul(pack<float, 8>::type
const &x, pack<float, 8>::type
const &y) {
444 typename pack<float,8>::type cc = _mm256_permute_ps(y, 0b10100000);
445 typename pack<float,8>::type ba = _mm256_permute_ps(x, 0b10110001);
446 typename pack<float,8>::type dd = _mm256_permute_ps(y, 0b11110101);
447 typename pack<float,8>::type dba = _mm256_mul_ps(ba, dd);
448 typename pack<float,8>::type mult = _mm256_fmaddsub_ps(x, cc, dba);
453 inline pack<double, 2>::type mm_complex_mul(pack<double, 2>::type
const &x, pack<double, 2>::type
const &y) {
454 typename pack<double,2>::type cc = _mm_permute_pd(y, 0);
455 typename pack<double,2>::type ba = _mm_permute_pd(x, 0b01);
456 typename pack<double,2>::type dd = _mm_permute_pd(y, 0b11);
457 typename pack<double,2>::type dba = _mm_mul_pd(ba, dd);
458 typename pack<double,2>::type mult = _mm_fmaddsub_pd(x, cc, dba);
463 inline pack<double, 4>::type mm_complex_mul(pack<double, 4>::type
const &x, pack<double, 4>::type
const &y) {
464 typename pack<double,4>::type cc = _mm256_permute_pd(y, 0b0000);
465 typename pack<double,4>::type ba = _mm256_permute_pd(x, 0b0101);
466 typename pack<double,4>::type dd = _mm256_permute_pd(y, 0b1111);
467 typename pack<double,4>::type dba = _mm256_mul_pd(ba, dd);
468 typename pack<double,4>::type mult = _mm256_fmaddsub_pd(x, cc, dba);
475 inline pack<float,4>::type mm_complex_fmadd(pack<float, 4>::type
const &x, pack<float, 4>::type
const &y, pack<float, 4>::type
const &z) {
476 typename pack<float,4>::type cc = _mm_permute_ps(y, 0b10100000);
477 typename pack<float,4>::type ba = _mm_permute_ps(x, 0b10110001);
478 typename pack<float,4>::type dd = _mm_permute_ps(y, 0b11110101);
479 typename pack<float,4>::type dba = _mm_fmaddsub_ps(ba, dd, z);
480 typename pack<float,4>::type mult = _mm_fmaddsub_ps(x, cc, dba);
485 inline pack<float, 8>::type mm_complex_fmadd(pack<float, 8>::type
const &x, pack<float, 8>::type
const &y, pack<float, 8>::type
const &z) {
486 typename pack<float,8>::type cc = _mm256_permute_ps(y, 0b10100000);
487 typename pack<float,8>::type ba = _mm256_permute_ps(x, 0b10110001);
488 typename pack<float,8>::type dd = _mm256_permute_ps(y, 0b11110101);
489 typename pack<float,8>::type dba = _mm256_fmaddsub_ps(ba, dd, z);
490 typename pack<float,8>::type mult = _mm256_fmaddsub_ps(x, cc, dba);
495 inline pack<double, 2>::type mm_complex_fmadd(pack<double, 2>::type
const &x, pack<double, 2>::type
const &y, pack<double, 2>::type
const &z) {
496 typename pack<double,2>::type cc = _mm_permute_pd(y, 0);
497 typename pack<double,2>::type ba = _mm_permute_pd(x, 0b01);
498 typename pack<double,2>::type dd = _mm_permute_pd(y, 0b11);
499 typename pack<double,2>::type dba = _mm_fmaddsub_pd(ba, dd, z);
500 typename pack<double,2>::type mult = _mm_fmaddsub_pd(x, cc, dba);
505 inline pack<double, 4>::type mm_complex_fmadd(pack<double, 4>::type
const &x, pack<double, 4>::type
const &y, pack<double, 4>::type
const &z) {
506 typename pack<double,4>::type cc = _mm256_permute_pd(y, 0b0000);
507 typename pack<double,4>::type ba = _mm256_permute_pd(x, 0b0101);
508 typename pack<double,4>::type dd = _mm256_permute_pd(y, 0b1111);
509 typename pack<double,4>::type dba = _mm256_fmaddsub_pd(ba, dd, z);
510 typename pack<double,4>::type mult = _mm256_fmaddsub_pd(x, cc, dba);
515 inline pack<float,4>::type mm_complex_fmsub(pack<float, 4>::type
const &x, pack<float, 4>::type
const &y, pack<float, 4>::type
const &z) {
516 typename pack<float,4>::type cc = _mm_permute_ps(y, 0b10100000);
517 typename pack<float,4>::type ba = _mm_permute_ps(x, 0b10110001);
518 typename pack<float,4>::type dd = _mm_permute_ps(y, 0b11110101);
519 typename pack<float,4>::type dba = _mm_fmsubadd_ps(ba, dd, z);
520 typename pack<float,4>::type mult = _mm_fmaddsub_ps(x, cc, dba);
525 inline pack<float, 8>::type mm_complex_fmsub(pack<float, 8>::type
const &x, pack<float, 8>::type
const &y, pack<float, 8>::type
const &z) {
526 typename pack<float,8>::type cc = _mm256_permute_ps(y, 0b10100000);
527 typename pack<float,8>::type ba = _mm256_permute_ps(x, 0b10110001);
528 typename pack<float,8>::type dd = _mm256_permute_ps(y, 0b11110101);
529 typename pack<float,8>::type dba = _mm256_fmsubadd_ps(ba, dd, z);
530 typename pack<float,8>::type mult = _mm256_fmaddsub_ps(x, cc, dba);
535 inline pack<double, 2>::type mm_complex_fmsub(pack<double, 2>::type
const &x, pack<double, 2>::type
const &y, pack<double, 2>::type
const &z) {
536 typename pack<double,2>::type cc = _mm_permute_pd(y, 0);
537 typename pack<double,2>::type ba = _mm_permute_pd(x, 0b01);
538 typename pack<double,2>::type dd = _mm_permute_pd(y, 0b11);
539 typename pack<double,2>::type dba = _mm_fmsubadd_pd(ba, dd, z);
540 typename pack<double,2>::type mult = _mm_fmaddsub_pd(x, cc, dba);
545 inline pack<double, 4>::type mm_complex_fmsub(pack<double, 4>::type
const &x, pack<double, 4>::type
const &y, pack<double, 4>::type
const &z) {
546 typename pack<double,4>::type cc = _mm256_permute_pd(y, 0b0000);
547 typename pack<double,4>::type ba = _mm256_permute_pd(x, 0b0101);
548 typename pack<double,4>::type dd = _mm256_permute_pd(y, 0b1111);
549 typename pack<double,4>::type dba = _mm256_fmsubadd_pd(ba, dd, z);
550 typename pack<double,4>::type mult = _mm256_fmaddsub_pd(x, cc, dba);
557 inline pack<float, 4>::type mm_complex_sq_mod(pack<float, 4>::type
const &x) {
558 return _mm_or_ps(_mm_dp_ps(x, x, 0b11001100), _mm_dp_ps(x, x, 0b00110011));
562 inline pack<float, 8>::type mm_complex_sq_mod(pack<float, 8>::type
const &x) {
563 return _mm256_or_ps(_mm256_dp_ps(x, x, 0b11001100), _mm256_dp_ps(x, x, 0b00110011));
567 inline pack<double, 2>::type mm_complex_sq_mod(pack<double, 2>::type
const &x) {
568 return _mm_dp_pd(x, x, 0b11111111);
572 inline pack<double, 4>::type mm_complex_sq_mod(pack<double, 4>::type
const &x) {
573 typename pack<double,4>::type a = _mm256_mul_pd(x, x);
574 return _mm256_hadd_pd(a, a);
580 inline pack<float, 4>::type mm_complex_mod(pack<float, 4>::type
const &x) {
581 return _mm_sqrt_ps(mm_complex_sq_mod(x));
585 inline pack<float, 8>::type mm_complex_mod(pack<float, 8>::type
const &x) {
586 return _mm256_sqrt_ps(mm_complex_sq_mod(x));
590 inline pack<double, 2>::type mm_complex_mod(pack<double, 2>::type
const &x) {
591 return _mm_sqrt_pd(mm_complex_sq_mod(x));
595 inline pack<double, 4>::type mm_complex_mod(pack<double, 4>::type
const &x) {
596 return _mm256_sqrt_pd(mm_complex_sq_mod(x));
600 inline pack<float, 4>::type mm_complex_conj(pack<float, 4>::type
const &x) {
601 return _mm_blend_ps(x, (mm_neg(x)), 0b1010);
605 inline pack<float, 8>::type mm_complex_conj(pack<float, 8>::type
const &x) {
606 return _mm256_blend_ps(x, (mm_neg(x)), 0b10101010);
610 inline pack<double, 2>::type mm_complex_conj(pack<double, 2>::type
const &x) {
611 return _mm_blend_pd(x, (mm_neg(x)), 0b10);
615 inline pack<double, 4>::type mm_complex_conj(pack<double, 4>::type
const &x) {
616 return _mm256_blend_pd(x, (mm_neg(x)), 0b1010);
621 inline pack<float, 4>::type mm_complex_mul_i(pack<float, 4>::type
const &x) {
622 return _mm_permute_ps( (mm_complex_conj(x)), 0b10110001);
626 inline pack<float, 8>::type mm_complex_mul_i(pack<float, 8>::type
const &x) {
627 return _mm256_permute_ps( (mm_complex_conj(x)), 0b10110001);
631 inline pack<double, 2>::type mm_complex_mul_i(pack<double, 2>::type
const &x) {
632 return _mm_permute_pd( (mm_complex_conj(x)), 0b00000001);
636 inline pack<double, 4>::type mm_complex_mul_i(pack<double, 4>::type
const &x) {
637 return _mm256_permute_pd( (mm_complex_conj(x)), 0b00000101);
641 inline pack<float, 4>::type mm_complex_mul_neg_i(pack<float, 4>::type
const &x) {
642 return mm_complex_conj(_mm_permute_ps(x, 0b10110001));
646 inline pack<float, 8>::type mm_complex_mul_neg_i(pack<float, 8>::type
const &x) {
647 return mm_complex_conj(_mm256_permute_ps(x, 0b10110001));
651 inline pack<double, 2>::type mm_complex_mul_neg_i(pack<double, 2>::type
const &x) {
652 return mm_complex_conj(_mm_permute_pd(x, 0b0000001));
656 inline pack<double, 4>::type mm_complex_mul_neg_i(pack<double, 4>::type
const &x) {
657 return mm_complex_conj(_mm256_permute_pd(x, 0b00000101));
663 inline pack<float, 4>::type mm_complex_div(pack<float, 4>::type
const &x, pack<float, 4>::type
const &y) {
664 return _mm_div_ps(mm_complex_mul(x, mm_complex_conj(y)), mm_complex_sq_mod(y));
668 inline pack<float, 8>::type mm_complex_div(pack<float, 8>::type
const &x, pack<float, 8>::type
const &y) {
669 return _mm256_div_ps(mm_complex_mul(x, mm_complex_conj(y)), mm_complex_sq_mod(y));
673 inline pack<double, 2>::type mm_complex_div(pack<double, 2>::type
const &x, pack<double, 2>::type
const &y) {
674 return _mm_div_pd(mm_complex_mul(x, mm_complex_conj(y)), mm_complex_sq_mod(y));
678 inline pack<double, 4>::type mm_complex_div(pack<double, 4>::type
const &x, pack<double, 4>::type
const &y) {
679 return _mm256_div_pd(mm_complex_mul(x, mm_complex_conj(y)), mm_complex_sq_mod(y));
683 #ifdef Heffte_ENABLE_AVX512
686 template<>
struct pack<double, 8> {
using type = __m512d; };
688 template<>
struct pack<float, 16> {
using type = __m512; };
696 inline typename pack<float, 16>::type mm_zero<float, 16>(){
return _mm512_setzero_ps(); }
700 inline typename pack<float, 16>::type mm_load<float, 16>(
float const *src) {
return _mm512_loadu_ps(src); }
704 inline void mm_store<float, 16>(
float *dest, pack<float, 16>::type
const &src) { _mm512_storeu_ps(dest, src); }
708 inline typename pack<float, 16>::type mm_pair_set<float, 16>(
float x,
float y) {
return _mm512_setr_ps(x, y, x, y, x, y, x, y, x, y, x, y, x, y, x, y); }
712 inline typename pack<float, 16>::type mm_set1<float, 16> (
float x) {
return _mm512_set1_ps(x); }
716 inline typename pack<float, 16>::type mm_complex_load<float, 16>(std::complex<float>
const *src,
int stride) {
717 return _mm512_setr_ps(src[0*stride].real(), src[0*stride].imag(), src[1*stride].real(), src[1*stride].imag(),
718 src[2*stride].real(), src[2*stride].imag(), src[3*stride].real(), src[3*stride].imag(),
719 src[4*stride].real(), src[4*stride].imag(), src[5*stride].real(), src[5*stride].imag(),
720 src[6*stride].real(), src[6*stride].imag(), src[7*stride].real(), src[7*stride].imag());
725 inline typename pack<float, 16>::type mm_complex_load<float, 16>(std::complex<float>
const *src) {
726 return mm_complex_load<float, 16>(src, 1);
735 inline typename pack<double, 8>::type mm_zero<double, 8>(){
return _mm512_setzero_pd(); }
739 inline typename pack<double, 8>::type mm_load<double, 8>(
double const *src) {
return _mm512_loadu_pd(src); }
743 inline void mm_store<double, 8>(
double *dest, pack<double, 8>::type
const &src) { _mm512_storeu_pd(dest, src); }
747 inline typename pack<double, 8>::type mm_pair_set<double, 8>(
double x,
double y) {
return _mm512_setr_pd(x, y, x, y, x, y, x, y); }
751 inline typename pack<double, 8>::type mm_set1<double, 8>(
double x) {
return _mm512_set1_pd(x); }
755 inline typename pack<double, 8>::type mm_complex_load<double, 8>(std::complex<double>
const *src,
int stride) {
756 return _mm512_setr_pd(src[0*stride].real(), src[0*stride].imag(), src[1*stride].real(), src[1*stride].imag(),
757 src[2*stride].real(), src[2*stride].imag(), src[3*stride].real(), src[3*stride].imag());
761 inline typename pack<double, 8>::type mm_complex_load<double, 8>(std::complex<double>
const *src) {
762 return mm_complex_load<double, 8>(src, 1);
772 inline pack<float, 16>::type mm_add(pack<float, 16>::type
const &x, pack<float, 16>::type
const &y) {
773 return _mm512_add_ps(x, y);
777 inline pack<double, 8>::type mm_add(pack<double, 8>::type
const &x, pack<double, 8>::type
const &y) {
778 return _mm512_add_pd(x, y);
784 inline pack<float, 16>::type mm_sub(pack<float, 16>::type
const &x,pack<float, 16>::type
const &y) {
785 return _mm512_sub_ps(x, y);
789 inline pack<double, 8>::type mm_sub(pack<double, 8>::type
const &x, pack<double, 8>::type
const &y) {
790 return _mm512_sub_pd(x, y);
796 inline pack<float, 16>::type mm_mul(pack<float, 16>::type
const &x, pack<float, 16>::type
const &y) {
797 return _mm512_mul_ps(x, y);
801 inline pack<double, 8>::type mm_mul(pack<double, 8>::type
const &x, pack<double, 8>::type
const &y) {
802 return _mm512_mul_pd(x, y);
808 inline pack<float, 16>::type mm_div(pack<float, 16>::type
const &x,pack<float, 16>::type
const &y) {
809 return _mm512_div_ps(x, y);
813 inline pack<double, 8>::type mm_div(pack<double, 8>::type
const &x, pack<double, 8>::type
const &y) {
814 return _mm512_div_pd(x, y);
819 inline pack<float, 16>::type mm_neg(pack<float, 16>::type
const &x) {
820 return _mm512_xor_ps(x, (mm_set1<float, 16>(-0.f)));
824 inline pack<double, 8>::type mm_neg(pack<double, 8>::type
const &x) {
825 return _mm512_xor_pd(x, (mm_set1<double, 8>(-0.f)));
835 inline pack<float,16>::type mm_complex_mul(pack<float, 16>::type
const &x, pack<float, 16>::type
const &y) {
836 typename pack<float, 16>::type cc = _mm512_permute_ps(y, 0b10100000);
837 typename pack<float, 16>::type ba = _mm512_permute_ps(x, 0b10110001);
838 typename pack<float, 16>::type dd = _mm512_permute_ps(y, 0b11110101);
839 typename pack<float, 16>::type dba = _mm512_mul_ps(ba, dd);
840 typename pack<float, 16>::type mult = _mm512_fmaddsub_ps(x, cc, dba);
845 inline pack<double, 8>::type mm_complex_mul(pack<double, 8>::type
const &x, pack<double, 8>::type
const &y) {
846 typename pack<double, 8>::type cc = _mm512_permute_pd(y, 0b00000000);
847 typename pack<double, 8>::type ba = _mm512_permute_pd(x, 0b01010101);
848 typename pack<double, 8>::type dd = _mm512_permute_pd(y, 0b11111111);
849 typename pack<double, 8>::type dba = _mm512_mul_pd(ba, dd);
850 typename pack<double, 8>::type mult = _mm512_fmaddsub_pd(x, cc, dba);
857 inline pack<float,16>::type mm_complex_fmadd(pack<float, 16>::type
const &x, pack<float, 16>::type
const &y, pack<float, 16>::type
const &alpha) {
858 typename pack<float, 16>::type cc = _mm512_permute_ps(y, 0b10100000);
859 typename pack<float, 16>::type ba = _mm512_permute_ps(x, 0b10110001);
860 typename pack<float, 16>::type dd = _mm512_permute_ps(y, 0b11110101);
861 typename pack<float, 16>::type dba = _mm512_fmaddsub_ps(ba, dd, alpha);
862 typename pack<float, 16>::type mult = _mm512_fmaddsub_ps(x, cc, dba);
867 inline pack<double, 8>::type mm_complex_fmadd(pack<double, 8>::type
const &x, pack<double, 8>::type
const &y, pack<double, 8>::type
const &alpha) {
868 typename pack<double, 8>::type cc = _mm512_permute_pd(y, 0b00000000);
869 typename pack<double, 8>::type ba = _mm512_permute_pd(x, 0b01010101);
870 typename pack<double, 8>::type dd = _mm512_permute_pd(y, 0b11111111);
871 typename pack<double, 8>::type dba = _mm512_fmaddsub_pd(ba, dd, alpha);
872 typename pack<double, 8>::type mult = _mm512_fmaddsub_pd(x, cc, dba);
877 inline pack<float,16>::type mm_complex_fmsub(pack<float, 16>::type
const &x, pack<float, 16>::type
const &y, pack<float, 16>::type
const &alpha) {
878 typename pack<float, 16>::type cc = _mm512_permute_ps(y, 0b10100000);
879 typename pack<float, 16>::type ba = _mm512_permute_ps(x, 0b10110001);
880 typename pack<float, 16>::type dd = _mm512_permute_ps(y, 0b11110101);
881 typename pack<float, 16>::type dba = _mm512_fmsubadd_ps(ba, dd, alpha);
882 typename pack<float, 16>::type mult = _mm512_fmaddsub_ps(x, cc, dba);
887 inline pack<double, 8>::type mm_complex_fmsub(pack<double, 8>::type
const &x, pack<double, 8>::type
const &y, pack<double, 8>::type
const &alpha) {
888 typename pack<double, 8>::type cc = _mm512_permute_pd(y, 0b00000000);
889 typename pack<double, 8>::type ba = _mm512_permute_pd(x, 0b01010101);
890 typename pack<double, 8>::type dd = _mm512_permute_pd(y, 0b11111111);
891 typename pack<double, 8>::type dba = _mm512_fmsubadd_pd(ba, dd, alpha);
892 typename pack<double, 8>::type mult = _mm512_fmaddsub_pd(x, cc, dba);
899 inline pack<float, 16>::type mm_complex_sq_mod(pack<float, 16>::type
const &x) {
900 typename pack<float, 16>::type sq = mm_mul(x, x);
901 typename pack<float, 16>::type sq_perm = _mm512_permute_ps(sq, 0b10110001);
902 typename pack<float, 16>::type mod = mm_add(sq, sq_perm);
907 inline pack<double, 8>::type mm_complex_sq_mod(pack<double, 8>::type
const &x) {
908 typename pack<double, 8>::type sq = mm_mul(x, x);
909 typename pack<double, 8>::type sq_perm = _mm512_permute_pd(sq, 0b01010101);
910 typename pack<double, 8>::type mod = mm_add(sq, sq_perm);
917 inline pack<float, 16>::type mm_complex_mod(pack<float, 16>::type
const &x) {
918 return _mm512_sqrt_ps(mm_complex_sq_mod(x));
922 inline pack<double, 8>::type mm_complex_mod(pack<double, 8>::type
const &x) {
923 return _mm512_sqrt_pd(mm_complex_sq_mod(x));
929 inline pack<float, 16>::type mm_complex_conj(pack<float, 16>::type
const &x) {
930 return _mm512_mask_blend_ps(0b1010101010101010, x, mm_neg(x));
934 inline pack<double, 8>::type mm_complex_conj(pack<double, 8>::type
const &x) {
935 return _mm512_mask_blend_pd(0b10101010, x, mm_neg(x));
940 inline pack<float, 16>::type mm_complex_mul_i(pack<float, 16>::type
const &x) {
941 return _mm512_permute_ps( (mm_complex_conj(x)), 0b10110001);
945 inline pack<double, 8>::type mm_complex_mul_i(pack<double, 8>::type
const &x) {
946 return _mm512_permute_pd( (mm_complex_conj(x)), 0b01010101);
950 inline pack<float, 16>::type mm_complex_mul_neg_i(pack<float, 16>::type
const &x) {
951 return mm_complex_conj(_mm512_permute_ps(x, 0b10110001));
955 inline pack<double, 8>::type mm_complex_mul_neg_i(pack<double, 8>::type
const &x) {
956 return mm_complex_conj(_mm512_permute_pd(x, 0b01010101));
962 inline pack<float, 16>::type mm_complex_div(pack<float, 16>::type
const &x, pack<float, 16>::type
const &y) {
963 return _mm512_div_ps(mm_complex_mul(x, mm_complex_conj(y)), mm_complex_sq_mod(y));
967 inline pack<double, 8>::type mm_complex_div(pack<double, 8>::type
const &x, pack<double, 8>::type
const &y) {
968 return _mm512_div_pd(mm_complex_mul(x, mm_complex_conj(y)), mm_complex_sq_mod(y));
Namespace containing all HeFFTe methods and classes.
Definition: heffte_backend_cuda.h:38
Struct determining whether a type is a complex number.
Definition: heffte_stock_vec_types.h:38
Struct determining whether a type is a real number.
Definition: heffte_stock_vec_types.h:33
Struct to retrieve the vector type associated with the number of elements stored "per unit".
Definition: heffte_stock_vec_types.h:43