Highly Efficient FFT for Exascale: HeFFTe v2.3
heffte_backend_oneapi.h
1 /*
2  -- heFFTe --
3  Univ. of Tennessee, Knoxville
4  @date
5 */
6 
7 #ifndef HEFFTE_BACKEND_ONEAPI_H
8 #define HEFFTE_BACKEND_ONEAPI_H
9 
10 #include "heffte_r2r_executor.h"
11 
12 #ifdef Heffte_ENABLE_ONEAPI
13 
14 #include "heffte_backend_vector.h"
15 
16 #include <CL/sycl.hpp>
17 #include "oneapi/mkl.hpp"
18 #include "oneapi/mkl/dfti.hpp"
19 
20 #ifdef Heffte_ENABLE_MAGMA
21 // will enable once MAGMA has a DPC++/SYCL backend
22 //#include "heffte_magma_helpers.h"
23 #endif
24 
40 namespace heffte{
41 
48 namespace oapi {
50  inline sycl::queue make_sycl_queue(){
51  try{
52  return sycl::queue(sycl::gpu_selector());
53  }catch(sycl::exception const&){
54  return sycl::queue(sycl::cpu_selector());
55  }
56  }
57 
67  extern sycl::queue internal_sycl_queue;
68 
/*!
 * \brief Convert real numbers to complex, when both arrays are located on the GPU device.
 */
template<typename precision_type, typename index>
void convert(sycl::queue &stream, index num_entries, precision_type const source[], std::complex<precision_type> destination[]);
/*!
 * \brief Convert complex numbers to real (keeps the real parts), both arrays on the GPU device.
 */
template<typename precision_type, typename index>
void convert(sycl::queue &stream, index num_entries, std::complex<precision_type> const source[], precision_type destination[]);

/*!
 * \brief Scales num_entries real values (double or float) by the scaling factor.
 */
template<typename scalar_type, typename index>
void scale_data(sycl::queue &stream, index num_entries, scalar_type *data, double scale_factor);

/*!
 * \brief Performs a direct-pack operation for data sitting on the GPU device.
 */
template<typename scalar_type, typename index>
void direct_pack(sycl::queue &stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[]);
/*!
 * \brief Performs a direct-unpack operation for data sitting on the GPU device.
 */
template<typename scalar_type, typename index>
void direct_unpack(sycl::queue &stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[]);
/*!
 * \brief Performs a transpose-unpack operation for data sitting on the GPU device.
 *
 * map0/map1/map2 give the index permutation applied while unpacking.
 */
template<typename scalar_type, typename index>
void transpose_unpack(sycl::queue &stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,
                      index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2,
                      scalar_type const source[], scalar_type destination[]);
119 
    //! \brief Pre-process in the forward transform.
    template<typename precision>
    static void pre_forward(sycl::queue&, int length, precision const input[], precision fft_signal[]);
    //! \brief Post-process in the forward transform.
    template<typename precision>
    static void post_forward(sycl::queue&, int length, std::complex<precision> const fft_result[], precision result[]);
    //! \brief Pre-process in the inverse (backward) transform.
    template<typename precision>
    static void pre_backward(sycl::queue&, int length, precision const input[], std::complex<precision> fft_signal[]);
    //! \brief Post-process in the inverse (backward) transform.
    template<typename precision>
    static void post_backward(sycl::queue&, int length, precision const fft_result[], precision result[]);
};
// NOTE(review): the opening declarations of the two pre/post-processor structs are
// not visible in this extract; presumably these are the Cosine and Sine transform
// helpers used by the r2r executor -- confirm against the full header.
    //! \brief Pre-process in the forward transform.
    template<typename precision>
    static void pre_forward(sycl::queue&, int length, precision const input[], precision fft_signal[]);
    //! \brief Post-process in the forward transform.
    template<typename precision>
    static void post_forward(sycl::queue&, int length, std::complex<precision> const fft_result[], precision result[]);
    //! \brief Pre-process in the inverse (backward) transform.
    template<typename precision>
    static void pre_backward(sycl::queue&, int length, precision const input[], std::complex<precision> fft_signal[]);
    //! \brief Post-process in the inverse (backward) transform.
    template<typename precision>
    static void post_backward(sycl::queue&, int length, precision const fft_result[], precision result[]);
};
156 
157 }
158 
159 namespace backend{
    //! \brief Indicate that the onemkl backend has been enabled in this build.
    template<> struct is_enabled<onemkl> : std::true_type{};
    //! \brief Indicate that the onemkl Cosine Transform backend has been enabled.
    template<> struct is_enabled<onemkl_cos> : std::true_type{};
    //! \brief Indicate that the onemkl Sine Transform backend has been enabled.
    template<> struct is_enabled<onemkl_sin> : std::true_type{};
175 
    /*!
     * \brief Holds the sycl::queue needed by the GPU backend.
     *
     * NOTE(review): a default constructor (presumably binding internal_sycl_queue)
     * exists in the full header but is elided in this extract -- confirm.
     */
    template<>
    struct device_instance<tag::gpu>{
        //! \brief Constructor, binds to an existing sycl::queue.
        device_instance(sycl::queue &new_stream) : _stream(new_stream){}
        //! \brief Constructor, binds to a queue held in an existing reference wrapper.
        device_instance(std::reference_wrapper<sycl::queue> &new_stream) : _stream(new_stream){}
        //! \brief Returns a reference to the stored sycl::queue.
        sycl::queue& stream(){ return _stream; }
        //! \brief Returns a reference to the stored sycl::queue (const overload).
        sycl::queue& stream() const{ return _stream; }
        //! \brief Blocks until all work submitted to the queue has completed.
        void synchronize_device() const{ _stream.get().wait(); }
        //! \brief The sycl::queue, either user provided or created by heFFTe.
        std::reference_wrapper<sycl::queue> _stream;
        //! \brief The type of the internal stream handle.
        using stream_type = std::reference_wrapper<sycl::queue>;
    };
    //! \brief In oneAPI mode, the default GPU backend is onemkl.
    template<> struct default_backend<tag::gpu>{
        //! \brief The onemkl backend handles the GPU transforms.
        using type = onemkl;
    };
207 
212  template<> struct data_manipulator<tag::gpu> {
214  using stream_type = sycl::queue&;
218  template<typename scalar_type>
219  static scalar_type* allocate(sycl::queue &stream, size_t num_entries){
220  scalar_type* result = sycl::malloc_device<scalar_type>(num_entries, stream);
221  stream.wait();
222  return result;
223  }
225  template<typename scalar_type>
226  static void free(sycl::queue &stream, scalar_type *pntr){
227  if (pntr == nullptr) return;
228  sycl::free(pntr, stream);
229  }
231  template<typename scalar_type>
232  static void copy_n(sycl::queue &stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
233  stream.memcpy(destination, source, num_entries * sizeof(scalar_type)).wait();
234  }
236  template<typename scalar_type>
237  static void copy_n(sycl::queue &stream, std::complex<scalar_type> const source[], size_t num_entries, scalar_type destination[]){
238  oapi::convert(stream, static_cast<long long>(num_entries), source, destination);
239  }
241  template<typename scalar_type>
242  static void copy_n(sycl::queue &stream, scalar_type const source[], size_t num_entries, std::complex<scalar_type> destination[]){
243  oapi::convert(stream, static_cast<long long>(num_entries), source, destination);
244  }
246  template<typename scalar_type>
247  static void copy_device_to_host(sycl::queue &stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
248  stream.memcpy(destination, source, num_entries * sizeof(scalar_type)).wait();
249  }
251  template<typename scalar_type>
252  static void copy_device_to_device(sycl::queue &stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
253  stream.memcpy(destination, source, num_entries * sizeof(scalar_type)).wait();
254  }
256  template<typename scalar_type>
257  static void copy_host_to_device(sycl::queue &stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
258  stream.memcpy(destination, source, num_entries * sizeof(scalar_type)).wait();
259  }
260  };
261 
266  template<>
271  template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
272  };
277  template<>
282  template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
283  };
288  template<>
293  template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
294  };
295 }
296 
305 public:
    /*!
     * \brief Constructor, specifies the box and the active dimension for a batch of 1-D transforms.
     */
    template<typename index>
    onemkl_executor(sycl::queue &inq, box3d<index> const box, int dimension) :
        q(inq),
        size(box.size[dimension]), size2(0),
        howmanyffts(fft1d_get_howmany(box, dimension)),
        stride(fft1d_get_stride(box, dimension)),
        // fastest index: consecutive transforms are laid out back-to-back
        dist((dimension == box.order[0]) ? size : 1),
        // middle index: the batch is processed plane-by-plane in "blocks"
        blocks((dimension == box.order[1]) ? box.osize(2) : 1),
        block_stride(box.osize(0) * box.osize(1)),
        total_size(box.count()),
        embed({0, static_cast<MKL_LONG>(stride), 0}), // MKL stride format: {offset, stride, ...}
        init_cplan(false), init_zplan(false),
        cplan(size), zplan(size)
    {}
    /*!
     * \brief Constructor that merges two directions (dir1 and dir2) into batched 2-D transforms.
     */
    template<typename index>
    onemkl_executor(sycl::queue &inq, box3d<index> const box, int dir1, int dir2) :
        q(inq),
        size(box.size[std::min(dir1, dir2)]), size2(box.size[std::max(dir1, dir2)]),
        blocks(1), block_stride(0), total_size(box.count()),
        init_cplan(false), init_zplan(false),
        cplan({size, size2}), zplan({size, size2})
    {
        // work in the "order" space of the box, i.e., the memory layout of the indexes
        int odir1 = box.find_order(dir1);
        int odir2 = box.find_order(dir2);

        if (std::min(odir1, odir2) == 0 and std::max(odir1, odir2) == 1){
            // two fastest indexes: contiguous 2-D slabs, batched along the slow index
            stride = 1;
            dist = size * size2;
            embed = {0, static_cast<MKL_LONG>(stride), static_cast<MKL_LONG>(size)};
            howmanyffts = box.size[2];
        }else if (std::min(odir1, odir2) == 1 and std::max(odir1, odir2) == 2){
            // two slowest indexes: strided transforms, batched along the fast index
            stride = box.size[0];
            dist = 1;
            embed = {0, static_cast<MKL_LONG>(stride), static_cast<MKL_LONG>(size) * static_cast<MKL_LONG>(stride)};
            howmanyffts = box.size[0];
        }else{ // case of directions (0, 2)
            stride = 1;
            dist = size;
            embed = {0, static_cast<MKL_LONG>(stride), static_cast<MKL_LONG>(box.size[1]) * static_cast<MKL_LONG>(box.size[0])};
            howmanyffts = box.size[1];
        }
    }
    /*!
     * \brief Constructor that merges all three directions into a single 3-D transform.
     */
    template<typename index>
    onemkl_executor(sycl::queue &inq, box3d<index> const box) :
        q(inq),
        size(box.size[0]), size2(box.size[1]), howmanyffts(box.size[2]),
        stride(0), dist(0), // dist == 0 is the marker for the 3-D case in make_plan()
        blocks(1), block_stride(0), total_size(box.count()),
        init_cplan(false), init_zplan(false),
        cplan({howmanyffts, size2, size}), zplan({howmanyffts, size2, size})
    {}
366 
    //! \brief Forward fft, float-complex case; in-place, one compute call per block.
    void forward(std::complex<float> data[], std::complex<float>*) const override{
        if (not init_cplan) make_plan(cplan); // plans are created lazily on first use
        for(int i=0; i<blocks; i++)
            oneapi::mkl::dft::compute_forward(cplan, data + i * block_stride);
        q.wait();
    }
    //! \brief Backward fft, float-complex case.
    void backward(std::complex<float> data[], std::complex<float>*) const override{
        if (not init_cplan) make_plan(cplan);
        for(int i=0; i<blocks; i++)
            oneapi::mkl::dft::compute_backward(cplan, data + i * block_stride);
        q.wait();
    }
    //! \brief Forward fft, double-complex case.
    void forward(std::complex<double> data[], std::complex<double>*) const override{
        if (not init_zplan) make_plan(zplan);
        for(int i=0; i<blocks; i++)
            oneapi::mkl::dft::compute_forward(zplan, data + i * block_stride);
        q.wait();
    }
    //! \brief Backward fft, double-complex case.
    void backward(std::complex<double> data[], std::complex<double>*) const override{
        if (not init_zplan) make_plan(zplan);
        for(int i=0; i<blocks; i++)
            oneapi::mkl::dft::compute_backward(zplan, data + i * block_stride);
        q.wait();
    }
395 
397  void forward(float const indata[], std::complex<float> outdata[], std::complex<float> *workspace) const override{
398  for(int i=0; i<total_size; i++) outdata[i] = std::complex<float>(indata[i]);
399  forward(outdata, workspace);
400  }
402  void backward(std::complex<float> indata[], float outdata[], std::complex<float> *workspace) const override{
403  backward(indata, workspace);
404  for(int i=0; i<total_size; i++) outdata[i] = std::real(indata[i]);
405  }
407  void forward(double const indata[], std::complex<double> outdata[], std::complex<double> *workspace) const override{
408  for(int i=0; i<total_size; i++) outdata[i] = std::complex<double>(indata[i]);
409  forward(outdata, workspace);
410  }
412  void backward(std::complex<double> indata[], double outdata[], std::complex<double> *workspace) const override{
413  backward(indata, workspace);
414  for(int i=0; i<total_size; i++) outdata[i] = std::real(indata[i]);
415  }
416 
    //! \brief Returns the size of the box (number of entries per transform call).
    int box_size() const override{ return total_size; }
    //! \brief The transforms are done in-place, no external workspace is needed.
    size_t workspace_size() const override{ return 0; }
421 
422 private:
424  template<typename onemkl_plan_type>
425  void make_plan(onemkl_plan_type &plan) const{
426  if (dist == 0){
427  plan.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, 1);
428  plan.set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
429  }else if (size2 == 0){
430  plan.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, (MKL_LONG) howmanyffts);
431  plan.set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
432  plan.set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES, embed.data());
433  plan.set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES, embed.data());
434  plan.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, (MKL_LONG) dist);
435  plan.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, (MKL_LONG) dist);
436  }else{
437  plan.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, (MKL_LONG) howmanyffts);
438  plan.set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
439  plan.set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES, embed.data());
440  plan.set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES, embed.data());
441  plan.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, (MKL_LONG) dist);
442  plan.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, (MKL_LONG) dist);
443  }
444 
445  plan.commit(q);
446  q.wait();
447 
448  if (std::is_same<oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::COMPLEX>, onemkl_plan_type>::value)
449  init_cplan = true;
450  else
451  init_zplan = true;
452  }
453 
    //! \brief The sycl::queue used for all compute calls.
    sycl::queue &q;
    //! \brief Transform geometry: sizes, batching and layout parameters (set by the constructors).
    int size, size2, howmanyffts, stride, dist, blocks, block_stride, total_size;
    //! \brief Stride descriptor passed to INPUT_STRIDES/OUTPUT_STRIDES, MKL format {offset, stride, 2nd-dim stride}.
    std::array<MKL_LONG, 3> embed;

    //! \brief Lazy-initialization flags for the two descriptors.
    mutable bool init_cplan, init_zplan;
    //! \brief Single precision complex descriptor (plan).
    mutable oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::COMPLEX> cplan;
    //! \brief Double precision complex descriptor (plan).
    mutable oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::COMPLEX> zplan;
};
462 
472 public:
    /*!
     * \brief Constructor, defines the box and the dimension of reduction.
     *
     * Prepares batched 1-D real-to-complex transforms along \b dimension; the complex
     * output holds size/2 + 1 coefficients per transform.
     */
    template<typename index>
    onemkl_executor_r2c(sycl::queue &inq, box3d<index> const box, int dimension) :
        q(inq),
        size(box.size[dimension]),
        howmanyffts(fft1d_get_howmany(box, dimension)),
        stride(fft1d_get_stride(box, dimension)),
        blocks((dimension == box.order[1]) ? box.osize(2) : 1),
        // distances between consecutive transforms in the real and complex arrays
        rdist((dimension == box.order[0]) ? size : 1),
        cdist((dimension == box.order[0]) ? size/2 + 1 : 1),
        rblock_stride(box.osize(0) * box.osize(1)),
        cblock_stride(box.osize(0) * (box.osize(1)/2 + 1)),
        rsize(box.count()),
        csize(box.r2c(dimension).count()),
        init_splan(false), init_dplan(false),
        splan(size), dplan(size)
    {}
498 
    //! \brief Forward transform, single precision; out-of-place r2c, one compute call per block.
    void forward(float const indata[], std::complex<float> outdata[], std::complex<float>*) const override{
        if (not init_splan) make_plan(splan); // plans are created lazily on first use
        for(int i=0; i<blocks; i++)
            // oneMKL takes a non-const real input and a real-typed view of the complex output
            oneapi::mkl::dft::compute_forward(splan, const_cast<float*>(indata + i * rblock_stride), reinterpret_cast<float*>(outdata + i * cblock_stride));
        q.wait();
    }
    //! \brief Backward transform, single precision; complex input viewed as interleaved reals.
    void backward(std::complex<float> indata[], float outdata[], std::complex<float>*) const override{
        if (not init_splan) make_plan(splan);
        for(int i=0; i<blocks; i++)
            oneapi::mkl::dft::compute_backward(splan, reinterpret_cast<float*>(const_cast<std::complex<float>*>(indata + i * cblock_stride)), outdata + i * rblock_stride);
        q.wait();
    }
    //! \brief Forward transform, double precision.
    void forward(double const indata[], std::complex<double> outdata[], std::complex<double>*) const override{
        if (not init_dplan) make_plan(dplan);
        for(int i=0; i<blocks; i++)
            oneapi::mkl::dft::compute_forward(dplan, const_cast<double*>(indata + i * rblock_stride), reinterpret_cast<double*>(outdata + i * cblock_stride));
        q.wait();
    }
    //! \brief Backward transform, double precision.
    void backward(std::complex<double> indata[], double outdata[], std::complex<double>*) const override{
        if (not init_dplan) make_plan(dplan);
        for(int i=0; i<blocks; i++)
            oneapi::mkl::dft::compute_backward(dplan, reinterpret_cast<double*>(const_cast<std::complex<double>*>(indata + i * cblock_stride)), outdata + i * rblock_stride);
        q.wait();
    }
527 
    //! \brief Returns the size of the box with real data.
    int box_size() const override{ return rsize; }
    //! \brief Returns the size of the box with complex coefficients.
    int complex_size() const override{ return csize; }
    //! \brief The transform is out-of-place, no extra workspace is needed.
    size_t workspace_size() const override{ return 0; }
534 
535 private:
    /*!
     * \brief Helper template to initialize the plan (single or double precision real descriptor).
     */
    template<typename onemkl_plan_type>
    void make_plan(onemkl_plan_type &plan) const{
        plan.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, (MKL_LONG) howmanyffts);
        // r2c runs out-of-place with conjugate-even (CCE) complex storage
        plan.set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_NOT_INPLACE);
        plan.set_value(oneapi::mkl::dft::config_param::CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
        MKL_LONG slstride[] = {0, static_cast<MKL_LONG>(stride)}; // MKL format: {offset, stride}
        plan.set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES, slstride);
        plan.set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES, slstride);
        // real and complex arrays have different distances between consecutive transforms
        plan.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, (MKL_LONG) rdist);
        plan.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, (MKL_LONG) cdist);
        plan.commit(q);

        // flag the corresponding descriptor as initialized
        if (std::is_same<oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL>, onemkl_plan_type>::value)
            init_splan = true;
        else
            init_dplan = true;
        q.wait();
    }
555 
    //! \brief The sycl::queue used for all compute calls.
    sycl::queue &q;

    //! \brief Geometry of the batched 1-D r2c transforms.
    int size, howmanyffts, stride, blocks;
    //! \brief Distances, block strides and totals for the real (r*) and complex (c*) arrays.
    int rdist, cdist, rblock_stride, cblock_stride, rsize, csize;
    //! \brief Lazy-initialization flags for the two descriptors.
    mutable bool init_splan, init_dplan;
    //! \brief Single precision real descriptor (plan).
    mutable oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL> splan;
    //! \brief Double precision real descriptor (plan).
    mutable oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::REAL> dplan;
};
564 
/*!
 * \brief Indicates the executor classes for the onemkl backend.
 * NOTE(review): the executor typedefs appear elided from this extract -- confirm against the full header.
 */
template<> struct one_dim_backend<backend::onemkl>{
};
/*!
 * \brief Indicates the executor classes for the onemkl Cosine Transform backend.
 * NOTE(review): interior elided in this extract -- confirm against the full header.
 */
template<> struct one_dim_backend<backend::onemkl_cos>{
};
/*!
 * \brief Indicates the executor classes for the onemkl Sine Transform backend.
 * NOTE(review): interior elided in this extract -- confirm against the full header.
 */
template<> struct one_dim_backend<backend::onemkl_sin>{
};
599 
604 template<> struct direct_packer<tag::gpu>{
606  template<typename scalar_type, typename index>
607  void pack(sycl::queue &stream, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
608  oapi::direct_pack(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride, data, buffer);
609  }
611  template<typename scalar_type, typename index>
612  void unpack(sycl::queue &stream, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
613  oapi::direct_unpack(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride, buffer, data);
614  }
615 };
616 
621 template<> struct transpose_packer<tag::gpu>{
623  template<typename scalar_type, typename index>
624  void pack(sycl::queue &stream, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
625  direct_packer<tag::gpu>().pack(stream, plan, data, buffer); // packing is done the same way as the direct_packer
626  }
628  template<typename scalar_type, typename index>
629  void unpack(sycl::queue &stream, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
630  oapi::transpose_unpack<scalar_type>(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride,
631  plan.buff_line_stride, plan.buff_plane_stride, plan.map[0], plan.map[1], plan.map[2], buffer, data);
632  }
633 };
634 
639 namespace data_scaling {
643  template<typename scalar_type, typename index>
644  static void apply(sycl::queue &stream, index num_entries, scalar_type *data, double scale_factor){
645  oapi::scale_data(stream, static_cast<long long>(num_entries), data, scale_factor);
646  }
650  template<typename precision_type, typename index>
651  static void apply(sycl::queue &stream, index num_entries, std::complex<precision_type> *data, double scale_factor){
652  apply<precision_type>(stream, 2*num_entries, reinterpret_cast<precision_type*>(data), scale_factor);
653  }
654 };
655 
/*!
 * \brief Sets the default plan options for the oneMKL backend.
 */
template<> struct default_plan_options<backend::onemkl>{
    //! \brief The complex transforms do not require reordered reshape operations.
    static const bool use_reorder = false;
};
/*!
 * \brief Sets the default plan options for the oneMKL Cosine Transform backend.
 */
template<> struct default_plan_options<backend::onemkl_cos>{
    //! \brief The cosine variant defaults to reordered reshape operations.
    static const bool use_reorder = true;
};
/*!
 * \brief Sets the default plan options for the oneMKL Sine Transform backend.
 */
template<> struct default_plan_options<backend::onemkl_sin>{
    //! \brief The sine variant defaults to reordered reshape operations.
    static const bool use_reorder = true;
};
680 
681 }
682 
683 #endif
684 
685 #endif
Base class for all backend executors.
Definition: heffte_common.h:486
virtual int complex_size() const
Return the size of the complex-box (r2c executors).
Definition: heffte_common.h:519
virtual void backward(float[], float *) const
Backward r2r, single precision.
Definition: heffte_common.h:495
virtual void forward(float[], float *) const
Forward r2r, single precision.
Definition: heffte_common.h:491
Wrapper to oneMKL API for real-to-complex transform with shortening of the data.
Definition: heffte_backend_oneapi.h:471
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *) const override
Forward transform, single precision.
Definition: heffte_backend_oneapi.h:500
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *) const override
Forward transform, double precision.
Definition: heffte_backend_oneapi.h:514
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *) const override
Backward transform, single precision.
Definition: heffte_backend_oneapi.h:507
onemkl_executor_r2c(sycl::queue &inq, box3d< index > const box, int dimension)
Constructor defines the box and the dimension of reduction.
Definition: heffte_backend_oneapi.h:483
size_t workspace_size() const override
Return the size of the needed workspace.
Definition: heffte_backend_oneapi.h:533
int box_size() const override
Returns the size of the box with real data.
Definition: heffte_backend_oneapi.h:529
int complex_size() const override
Returns the size of the box with complex coefficients.
Definition: heffte_backend_oneapi.h:531
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *) const override
Backward transform, double precision.
Definition: heffte_backend_oneapi.h:521
Wrapper around the oneMKL API.
Definition: heffte_backend_oneapi.h:304
virtual void backward(float[], float *) const
Bring forth method that have not been overloaded.
Definition: heffte_common.h:495
int box_size() const override
Returns the size of the box.
Definition: heffte_backend_oneapi.h:418
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Performs backward float-complex transform and truncates the complex part of the result.
Definition: heffte_backend_oneapi.h:402
void backward(std::complex< double > data[], std::complex< double > *) const override
Backward fft, double-complex case.
Definition: heffte_backend_oneapi.h:389
onemkl_executor(sycl::queue &inq, box3d< index > const box, int dimension)
Constructor, specifies the box and dimension.
Definition: heffte_backend_oneapi.h:314
void forward(std::complex< float > data[], std::complex< float > *) const override
Forward fft, float-complex case.
Definition: heffte_backend_oneapi.h:368
virtual void forward(float[], float *) const
Bring forth method that have not been overloaded.
Definition: heffte_common.h:491
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Performs backward double-complex transform and truncates the complex part of the result.
Definition: heffte_backend_oneapi.h:412
size_t workspace_size() const override
Return the size of the needed workspace.
Definition: heffte_backend_oneapi.h:420
onemkl_executor(sycl::queue &inq, box3d< index > const box)
Merges two FFTs into one.
Definition: heffte_backend_oneapi.h:358
onemkl_executor(sycl::queue &inq, box3d< index > const box, int dir1, int dir2)
Merges two FFTs into one.
Definition: heffte_backend_oneapi.h:329
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Converts the real data to complex and performs the double-complex forward transform.
Definition: heffte_backend_oneapi.h:407
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Converts the real data to complex and performs the float-complex forward transform.
Definition: heffte_backend_oneapi.h:397
void backward(std::complex< float > data[], std::complex< float > *) const override
Backward fft, float-complex case.
Definition: heffte_backend_oneapi.h:375
void forward(std::complex< double > data[], std::complex< double > *) const override
Forward fft, double-complex case.
Definition: heffte_backend_oneapi.h:382
int fft1d_get_howmany(box3d< index > const box, int const dimension)
Return the number of 1-D ffts contained in the box in the given dimension.
Definition: heffte_geometry.h:159
int fft1d_get_stride(box3d< index > const box, int const dimension)
Return the stride of the 1-D ffts contained in the box in the given dimension.
Definition: heffte_geometry.h:169
void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Simply multiply the num_entries in the data by the scale_factor.
Definition: heffte_backend_cuda.h:796
void convert(sycl::queue &stream, index num_entries, precision_type const source[], std::complex< precision_type > destination[])
Convert real numbers to complex when both are located on the GPU device.
void direct_unpack(sycl::queue &stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-unpack operation for data sitting on the GPU device.
void scale_data(sycl::queue &stream, index num_entries, scalar_type *data, double scale_factor)
Scales real data (double or float) by the scaling factor.
sycl::queue internal_sycl_queue
Default queue to use in case the user does not provide one.
void direct_pack(sycl::queue &stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-pack operation for data sitting on the GPU device.
void transpose_unpack(sycl::queue &stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2, scalar_type const source[], scalar_type destination[])
Performs a transpose-unpack operation for data sitting on the GPU device.
sycl::queue make_sycl_queue()
Creates a new SYCL queue, try to use the GPU but if an issue is encountered then default to the CPU.
Definition: heffte_backend_oneapi.h:50
Namespace containing all HeFFTe methods and classes.
Definition: heffte_backend_cuda.h:38
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the oneAPI vector container.
Definition: heffte_backend_oneapi.h:271
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the oneAPI vector container.
Definition: heffte_backend_oneapi.h:282
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the oneAPI vector container.
Definition: heffte_backend_oneapi.h:293
Defines the container for the temporary buffers.
Definition: heffte_common.h:212
Type-tag for the cuFFT backend.
Definition: heffte_common.h:147
cudaStream_t stream_type
The stream type for the device.
Definition: heffte_backend_cuda.h:202
static void free(sycl::queue &stream, scalar_type *pntr)
Free memory.
Definition: heffte_backend_oneapi.h:226
static void copy_n(sycl::queue &stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Equivalent to std::copy_n() but using CUDA arrays.
Definition: heffte_backend_oneapi.h:232
static void copy_n(sycl::queue &stream, scalar_type const source[], size_t num_entries, std::complex< scalar_type > destination[])
Copy-convert real-to-complex.
Definition: heffte_backend_oneapi.h:242
static scalar_type * allocate(sycl::queue &stream, size_t num_entries)
Allocate memory.
Definition: heffte_backend_oneapi.h:219
static void copy_device_to_host(sycl::queue &stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the data from the device to the host.
Definition: heffte_backend_oneapi.h:247
static void copy_n(sycl::queue &stream, std::complex< scalar_type > const source[], size_t num_entries, scalar_type destination[])
Copy-convert complex-to-real.
Definition: heffte_backend_oneapi.h:237
static void copy_device_to_device(sycl::queue &stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the data from the device to the device.
Definition: heffte_backend_oneapi.h:252
static void copy_host_to_device(sycl::queue &stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the data from the host to the device.
Definition: heffte_backend_oneapi.h:257
Common data-transfer operations, must be specializes for each location (cpu/gpu).
Definition: heffte_common.h:59
Defines inverse mapping from the location tag to a default backend tag.
Definition: heffte_common.h:380
The CUDA backend uses a CUDA stream.
Definition: heffte_backend_cuda.h:172
sycl::queue & stream() const
Returns a reference to the internal sycl::queue.
Definition: heffte_backend_oneapi.h:191
device_instance(sycl::queue &new_stream)
Constructor assigning the queue.
Definition: heffte_backend_oneapi.h:185
void synchronize_device() const
Syncs the execution with the queue.
Definition: heffte_backend_oneapi.h:193
std::reference_wrapper< sycl::queue > _stream
The sycl::queue, either user provided or created by heFFTe.
Definition: heffte_backend_oneapi.h:195
device_instance(std::reference_wrapper< sycl::queue > &new_stream)
Constructor assigning from an existing wrapper.
Definition: heffte_backend_oneapi.h:187
sycl::queue & stream()
Returns a reference to the internal sycl::queue.
Definition: heffte_backend_oneapi.h:189
cudaStream_t stream_type
The type for the internal stream.
Definition: heffte_backend_cuda.h:184
device_instance()
Empty constructor.
Definition: heffte_backend_oneapi.h:183
Holds the auxiliary variables needed by each backend.
Definition: heffte_common.h:358
Allows to define whether a specific backend interface has been enabled.
Definition: heffte_common.h:201
Type-tag for the Cosine Transform using the oneMKL backend.
Definition: heffte_common.h:185
Type-tag for the Sine Transform using the oneMKL backend.
Definition: heffte_common.h:190
Type-tag for the oneMKL backend.
Definition: heffte_common.h:180
A generic container that describes a 3d box of indexes.
Definition: heffte_geometry.h:67
Defines a set of default plan options for a given backend.
Definition: heffte_common.h:642
Simple packer that copies sub-boxes without transposing the order of the indexes.
Definition: heffte_backend_cuda.h:759
void unpack(sycl::queue &stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned unpack operation.
Definition: heffte_backend_oneapi.h:612
void pack(cudaStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_cuda.h:762
void pack(sycl::queue &stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_oneapi.h:607
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition: heffte_pack3d.h:83
Implementation of the Cosine Transform pre- and post-processing methods using the oneAPI/SYCL backend.
Definition: heffte_backend_oneapi.h:124
static void pre_backward(sycl::queue &, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void post_forward(sycl::queue &, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void pre_forward(sycl::queue &, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static void post_backward(sycl::queue &, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
Implementation of the Sine Transform pre- and post-processing methods using the oneAPI/SYCL backend.
Definition: heffte_backend_oneapi.h:142
static void pre_backward(sycl::queue &, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void post_backward(sycl::queue &, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static void pre_forward(sycl::queue &, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static void post_forward(sycl::queue &, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
Indicates the structure that will be used by the fft backend.
Definition: heffte_common.h:546
Holds the plan for a pack/unpack operation.
Definition: heffte_pack3d.h:32
index buff_plane_stride
Stride of the planes in the received buffer (transpose packing only).
Definition: heffte_pack3d.h:42
index line_stride
Stride of the lines.
Definition: heffte_pack3d.h:36
index plane_stride
Stride of the planes.
Definition: heffte_pack3d.h:38
std::array< index, 3 > size
Number of elements in the three directions.
Definition: heffte_pack3d.h:34
std::array< int, 3 > map
Maps the i,j,k indexes from input to the output (transpose packing only).
Definition: heffte_pack3d.h:44
index buff_line_stride
Stride of the lines in the received buffer (transpose packing only).
Definition: heffte_pack3d.h:40
Template algorithm for the Sine and Cosine transforms.
Definition: heffte_r2r_executor.h:135
Indicates the use of gpu backend and that all input/output data and arrays will be bound to the gpu d...
Definition: heffte_common.h:45
void pack(sycl::queue &stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_oneapi.h:624
void unpack(sycl::queue &stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned transpose-unpack operation.
Definition: heffte_backend_oneapi.h:629