heffte/heffte__backend__rocm_8h_source.html

 /*

     -- heFFTe --

        Univ. of Tennessee, Knoxville

        @date

 */


 #ifndef HEFFTE_BACKEND_ROCM_H

 #define HEFFTE_BACKEND_ROCM_H


 #include "heffte_r2r_executor.h"


 #ifdef Heffte_ENABLE_ROCM


 #ifndef __HIP_PLATFORM_HCC__

 #define __HIP_PLATFORM_HCC__

 #endif

 #include <hip/hip_runtime.h>

 #include <rocfft.h>

 #include "heffte_backend_vector.h"


 #ifdef Heffte_ENABLE_MAGMA

 #include "heffte_magma_helpers.h"

 #endif


 namespace heffte{


 namespace rocm {

     /*
      * Turns a failed HIP runtime status into a C++ exception.
      *
      * status        - value returned by a HIP runtime call
      * function_name - label placed at the start of the exception message
      *
      * Does nothing when status equals hipSuccess, otherwise throws std::runtime_error
      * with the label followed by the text reported by hipGetErrorString().
      */
     inline void check_error(hipError_t status, const char *function_name){
         if (status == hipSuccess) return;

         std::string message(function_name);
         message += " failed with message: ";
         message += hipGetErrorString(status);
         throw std::runtime_error(message);
     }

     /*
      * Turns a failed rocFFT status into a C++ exception.
      *
      * status        - value returned by a rocFFT call
      * function_name - label placed at the start of the exception message
      *
      * Does nothing when status equals rocfft_status_success, otherwise throws
      * std::runtime_error with the label followed by the numeric status code.
      */
     inline void check_error(rocfft_status status, const char *function_name){
         if (status == rocfft_status_success) return;

         std::string message(function_name);
         message += " failed with error code: ";
         message += std::to_string(status);
         throw std::runtime_error(message);
     }


     // Declaration only, the definition is not part of this header.
     // Real -> complex overload: reads num_entries values from source and writes to destination
     // using the given stream; presumably the imaginary parts are set to zero (TODO confirm in the kernel).
     template<typename precision_type, typename index>

     void convert(hipStream_t stream, index num_entries, precision_type const source[], std::complex<precision_type> destination[]);

     // Declaration only, the definition is not part of this header.
     // Complex -> real overload: presumably keeps only the real parts (TODO confirm in the kernel).
     template<typename precision_type, typename index>

     void convert(hipStream_t stream, index num_entries, std::complex<precision_type> const source[], precision_type destination[]);


     // Declaration only, the definition is not part of this header.
     // Presumably multiplies num_entries values in data by scale_factor on the given stream (TODO confirm);
     // in this file it is called from data_scaling::apply().
     template<typename scalar_type, typename index>

     void scale_data(hipStream_t stream, index num_entries, scalar_type *data, double scale_factor);


     // Declaration only, the definition is not part of this header.
     // Called by direct_packer<tag::gpu>::pack() with the sizes and strides of a pack_plan_3d.
     // NOTE(review): the parameter name plane_stide looks like a typo for plane_stride.
     template<typename scalar_type, typename index>

     void direct_pack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,

                     scalar_type const source[], scalar_type destination[]);

     // Declaration only, the definition is not part of this header.
     // Called by direct_packer<tag::gpu>::unpack(); presumably the inverse of direct_pack() (TODO confirm).
     template<typename scalar_type, typename index>

     void direct_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,

                     scalar_type const source[], scalar_type destination[]);

     // Declaration only, the definition is not part of this header.
     // Called by transpose_packer<tag::gpu>::unpack() with the buffer strides and the three map entries of the plan;
     // presumably map0/map1/map2 describe the index reordering applied while unpacking (TODO confirm).
     template<typename scalar_type, typename index>

     void transpose_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,

                         index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2,

                         scalar_type const source[], scalar_type destination[]);


     // Collection of static pre/post-processing steps, used in this file as the processor argument of
     // real2real_executor for the rocfft_cos backend. Only declarations are visible here.
     // Presumably the methods map a cosine transform onto a regular FFT (TODO confirm in the implementation).
     struct cos_pre_pos_processor{

         // Real input -> real signal, called before the forward FFT (semantics defined elsewhere).
         template<typename precision>

         static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[]);

         // Complex FFT output -> real result, called after the forward FFT (semantics defined elsewhere).
         template<typename precision>

         static void post_forward(hipStream_t, int length, std::complex<precision> const fft_result[], precision result[]);

         // Real input -> complex signal, called before the backward FFT (semantics defined elsewhere).
         template<typename precision>

         static void pre_backward(hipStream_t, int length, precision const input[], std::complex<precision> fft_signal[]);

         // Real FFT output -> real result, called after the backward FFT (semantics defined elsewhere).
         template<typename precision>

         static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[]);

     };

     // Collection of static pre/post-processing steps, used in this file as the processor argument of
     // real2real_executor for the rocfft_sin backend. Only declarations are visible here.
     // Presumably the methods map a sine transform onto a regular FFT (TODO confirm in the implementation).
     struct sin_pre_pos_processor{

         // Real input -> real signal, called before the forward FFT (semantics defined elsewhere).
         template<typename precision>

         static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[]);

         // Complex FFT output -> real result, called after the forward FFT (semantics defined elsewhere).
         template<typename precision>

         static void post_forward(hipStream_t, int length, std::complex<precision> const fft_result[], precision result[]);

         // Real input -> complex signal, called before the backward FFT (semantics defined elsewhere).
         template<typename precision>

         static void pre_backward(hipStream_t, int length, precision const input[], std::complex<precision> fft_signal[]);

         // Real FFT output -> real result, called after the backward FFT (semantics defined elsewhere).
         template<typename precision>

         static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[]);

     };

 }


 namespace backend{

     // Compile-time flags: this section is only compiled under Heffte_ENABLE_ROCM,
     // so the three rocFFT based backend tags are reported as enabled (derive from std::true_type).
     template<> struct is_enabled<rocfft> : std::true_type{};

     template<> struct is_enabled<rocfft_cos> : std::true_type{};

     template<> struct is_enabled<rocfft_sin> : std::true_type{};


     template<>

     struct device_instance<tag::gpu>{

         device_instance(hipStream_t new_stream = nullptr) : _stream(new_stream){}

         hipStream_t stream(){ return _stream; }

         hipStream_t stream() const{ return _stream; }

         void synchronize_device() const{ rocm::check_error(hipStreamSynchronize(_stream), "device sync"); }

         mutable hipStream_t _stream;

         using stream_type = hipStream_t;

     };


     // Selects rocfft as the default backend tag for the tag::gpu location.
     template<> struct default_backend<tag::gpu>{

         using type = rocfft;

     };


     template<> struct data_manipulator<tag::gpu> {

         using stream_type = hipStream_t;

         using backend_device = backend::device_instance<tag::gpu>;

         template<typename scalar_type>

         static scalar_type* allocate(hipStream_t stream, size_t num_entries){

             void *new_data;

             if (stream != nullptr) rocm::check_error( hipStreamSynchronize(stream), "hipStreamSynchronize()");

             rocm::check_error(hipMalloc(&new_data, num_entries * sizeof(scalar_type)), "hipMalloc()");

             return reinterpret_cast<scalar_type*>(new_data);

         }

         template<typename scalar_type>

         static void free(hipStream_t stream, scalar_type *pntr){

             if (pntr == nullptr) return;

             if (stream != nullptr) rocm::check_error( hipStreamSynchronize(stream), "hipStreamSynchronize()");

             rocm::check_error(hipFree(pntr), "hipFree()");

         }

         template<typename scalar_type>

         static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){

             if (stream == nullptr)

                 rocm::check_error(hipMemcpy(destination, source, num_entries * sizeof(scalar_type), hipMemcpyDeviceToDevice), "data_manipulator::copy_n()");

             else

                 rocm::check_error(hipMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), hipMemcpyDeviceToDevice, stream), "data_manipulator::copy_n()");

         }

         template<typename scalar_type>

         static void copy_n(hipStream_t stream, std::complex<scalar_type> const source[], size_t num_entries, scalar_type destination[]){

             rocm::convert(stream, static_cast<long long>(num_entries), source, destination);

         }

         template<typename scalar_type>

         static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, std::complex<scalar_type> destination[]){

             rocm::convert(stream, static_cast<long long>(num_entries), source, destination);

         }

         template<typename scalar_type>

         static void copy_device_to_host(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){

             rocm::check_error(hipMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), hipMemcpyDeviceToHost, stream),

                             "device_to_host (rocm)");

         }

         template<typename scalar_type>

         static void copy_device_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){

                 rocm::check_error(hipMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), hipMemcpyDeviceToDevice, stream),

                                 "device_to_device (rocm)");

         }

         template<typename scalar_type>

         static void copy_host_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){

             rocm::check_error(hipMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), hipMemcpyHostToDevice, stream),

                             "host_to_device (rocm)");

         }

     };


     // Buffer traits of the rocfft backend: data is located on the GPU
     // and stored in a device_vector managed by the GPU data manipulator.
     template<>

     struct buffer_traits<rocfft>{

         using location = tag::gpu;

         template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;

     };

     // Buffer traits of the rocfft_cos backend, identical to the rocfft ones.
     template<>

     struct buffer_traits<rocfft_cos>{

         using location = tag::gpu;

         template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;

     };

     // Buffer traits of the rocfft_sin backend, identical to the rocfft ones.
     template<>

     struct buffer_traits<rocfft_sin>{

         using location = tag::gpu;

         template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;

     };

 }


 /*
  * Owner of a rocFFT plan for batched 1-D real transforms.
  *
  * The forward direction uses a real input and a hermitian-interleaved output,
  * the backward direction swaps the two. The plan is created out-of-place.
  * The object owns the rocfft_plan handle, therefore copies are disabled
  * (a copy would destroy the same handle twice).
  */
 template<typename precision_type, direction dir>
 struct plan_rocfft{
     /*
      * Creates the plan.
      *
      * size   - length of each 1-D transform
      * batch  - number of transforms in the batch
      * stride - stride passed to rocFFT for both the input and the output
      * rdist  - distance between consecutive transforms on the real side
      * cdist  - distance between consecutive transforms on the complex side
      *
      * Throws std::runtime_error if any rocFFT call fails; the plan description
      * and any plan created so far are released before the exception leaves.
      */
     plan_rocfft(size_t size, size_t batch, size_t stride, size_t rdist, size_t cdist) : plan(nullptr), worksize(0){
         bool const forward_transform = (dir == direction::forward);

         rocfft_plan_description desc = nullptr;
         rocm::check_error( rocfft_plan_description_create(&desc), "rocm plan create");

         bool plan_created = false;
         try{
             rocm::check_error(
                 rocfft_plan_description_set_data_layout(
                     desc,
                     forward_transform ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved,
                     forward_transform ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real,
                     nullptr, nullptr,
                     1, &stride, forward_transform ? rdist : cdist,
                     1, &stride, forward_transform ? cdist : rdist
                 ),
                 "plan layout"
             );

             rocm::check_error(
                 rocfft_plan_create(&plan, rocfft_placement_notinplace,
                                    forward_transform ? rocfft_transform_type_real_forward : rocfft_transform_type_real_inverse,
                                    (std::is_same<precision_type, float>::value) ? rocfft_precision_single : rocfft_precision_double,
                                    1, &size, batch, desc),
                 "plan create");
             plan_created = true;

             rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), "get_worksize");

             // the description is handed to the destroy call exactly once, even if that call reports an error
             rocfft_plan_description finished = desc;
             desc = nullptr;
             rocm::check_error( rocfft_plan_description_destroy(finished), "rocm plan destroy");
         }catch(...){
             // the destructor does not run when the constructor throws, clean up by hand
             if (desc != nullptr) rocfft_plan_description_destroy(desc);
             if (plan_created) rocfft_plan_destroy(plan);
             plan = nullptr;
             throw;
         }
     }

     // The handle is owned by exactly one object.
     plan_rocfft(plan_rocfft const &) = delete;
     plan_rocfft& operator = (plan_rocfft const &) = delete;

     // Destroys the plan, the status is ignored since a destructor must not throw.
     ~plan_rocfft(){ if (plan != nullptr) rocfft_plan_destroy(plan); }

     // Gives access to the raw rocFFT handle, the object remains the owner.
     operator rocfft_plan() const{ return plan; }

     // Work-buffer size in bytes, as reported by rocfft_plan_get_work_buffer_size().
     size_t size_work() const{ return worksize; }

 private:
     rocfft_plan plan;
     size_t worksize;
 };


 /*
  * Owner of a rocFFT plan for in-place complex-to-complex transforms
  * using the complex-interleaved layout, in one, two or three dimensions.
  *
  * The object owns the rocfft_plan handle, therefore copies are disabled
  * (a copy would destroy the same handle twice).
  * All constructors throw std::runtime_error if a rocFFT call fails, in which case
  * the plan description and any plan created so far are released first.
  */
 template<typename precision_type, direction dir>
 struct plan_rocfft<std::complex<precision_type>, dir>{
     /*
      * Batch of 1-D transforms.
      *
      * size   - length of each transform
      * batch  - number of transforms
      * stride - stride passed to rocFFT for the input and the output
      * dist   - distance between consecutive transforms
      */
     plan_rocfft(size_t size, size_t batch, size_t stride, size_t dist) : plan(nullptr), worksize(0){
         initialize(1, &size, &stride, dist, batch, "plan create");
     }

     /*
      * Batch of 2-D transforms.
      *
      * size1, size2 - lengths of the two dimensions
      * embed        - the two strides passed to rocFFT for the input and the output
      * batch        - number of transforms
      * dist         - distance between consecutive transforms
      */
     plan_rocfft(size_t size1, size_t size2, std::array<size_t, 2> const &embed, size_t batch, size_t dist) : plan(nullptr), worksize(0){
         size_t const size[2] = {size1, size2};
         initialize(2, size, embed.data(), dist, batch, "plan create");
     }

     /*
      * Single 3-D transform with lengths size1, size2 and size3.
      * No stride array is passed to rocFFT (null pointer) and the distance is 1.
      */
     plan_rocfft(size_t size1, size_t size2, size_t size3) : plan(nullptr), worksize(0){
         size_t const size[3] = {size1, size2, size3};
         initialize(3, size, nullptr, 1, 1, "plan create 3d");
     }

     // The handle is owned by exactly one object.
     plan_rocfft(plan_rocfft const &) = delete;
     plan_rocfft& operator = (plan_rocfft const &) = delete;

     // Destroys the plan; the status is ignored since a destructor is implicitly noexcept
     // and throwing here would call std::terminate.
     ~plan_rocfft(){ if (plan != nullptr) rocfft_plan_destroy(plan); }

     // Gives access to the raw rocFFT handle, the object remains the owner.
     operator rocfft_plan() const{ return plan; }

     // Work-buffer size in bytes, as reported by rocfft_plan_get_work_buffer_size().
     size_t size_work() const{ return worksize; }

 private:
     /*
      * Shared implementation of the constructors: builds the description, creates the plan
      * and queries the work-buffer size.
      *
      * dimensions   - number of transform dimensions, also the number of entries in lengths and strides
      * lengths      - transform lengths
      * strides      - strides used for both the input and the output, may be null
      * dist         - distance between consecutive transforms
      * batch        - number of transforms
      * create_label - label used in the exception message if rocfft_plan_create() fails
      */
     void initialize(size_t dimensions, size_t const lengths[], size_t const strides[], size_t dist, size_t batch, char const *create_label){
         rocfft_plan_description desc = nullptr;
         rocm::check_error( rocfft_plan_description_create(&desc), "rocm plan create");

         bool plan_created = false;
         try{
             rocm::check_error(
                 rocfft_plan_description_set_data_layout(
                     desc,
                     rocfft_array_type_complex_interleaved,
                     rocfft_array_type_complex_interleaved,
                     nullptr, nullptr,
                     dimensions, strides, dist, dimensions, strides, dist
                 ),
                 "plan layout"
             );

             rocm::check_error(
                 rocfft_plan_create(&plan, rocfft_placement_inplace,
                                    (dir == direction::forward) ? rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse,
                                    (std::is_same<precision_type, float>::value) ? rocfft_precision_single : rocfft_precision_double,
                                    dimensions, lengths, batch, desc),
                 create_label);
             plan_created = true;

             rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), "get_worksize");

             // the description is handed to the destroy call exactly once, even if that call reports an error
             rocfft_plan_description finished = desc;
             desc = nullptr;
             rocm::check_error( rocfft_plan_description_destroy(finished), "rocm plan destroy");
         }catch(...){
             // the destructor does not run when a constructor throws, clean up by hand
             if (desc != nullptr) rocfft_plan_description_destroy(desc);
             if (plan_created) rocfft_plan_destroy(plan);
             plan = nullptr;
             throw;
         }
     }

     rocfft_plan plan;
     size_t worksize;
 };


 class rocfft_executor : public executor_base{

 public:

     using executor_base::forward;

     using executor_base::backward;

     using executor_base::complex_size;

     template<typename index>

     rocfft_executor(hipStream_t active_stream, box3d<index> const box, int dimension) :

         stream(active_stream),

         size(box.size[dimension]), size2(0),

         howmanyffts(fft1d_get_howmany(box, dimension)),

         stride(fft1d_get_stride(box, dimension)),

         dist((dimension == box.order[0]) ? size : 1),

         blocks((dimension == box.order[1]) ? box.osize(2) : 1),

         block_stride(box.osize(0) * box.osize(1)),

         total_size(box.count()),

         embed({0, 0}),

         worksize(compute_workspace_size())

     {}

     template<typename index>

     rocfft_executor(hipStream_t active_stream, box3d<index> const box, int dir1, int dir2) :

         stream(active_stream),

         size(box.size[std::min(dir1, dir2)]), size2(box.size[std::max(dir1, dir2)]),

         blocks(1), block_stride(0), total_size(box.count()),

         worksize(0)

     {

         int odir1 = box.find_order(dir1);

         int odir2 = box.find_order(dir2);


         if (std::min(odir1, odir2) == 0 and std::max(odir1, odir2) == 1){

             stride = 1;

             dist = size * size2;

             embed = {static_cast<size_t>(stride), static_cast<size_t>(size)};

             howmanyffts = box.size[2];

         }else if (std::min(odir1, odir2) == 1 and std::max(odir1, odir2) == 2){

             stride = box.size[0];

             dist = 1;

             embed = {static_cast<size_t>(stride), static_cast<size_t>(size) * static_cast<size_t>(stride)};

             howmanyffts = box.size[0];

         }else{ // case of directions (0, 2)

             stride = 1;

             dist = size;

             embed = {static_cast<size_t>(stride), static_cast<size_t>(box.size[1]) * static_cast<size_t>(box.size[0])};

             howmanyffts = box.size[1];

         }

         worksize = compute_workspace_size();

     }

     template<typename index>

     rocfft_executor(hipStream_t active_stream, box3d<index> const box) :

         stream(active_stream),

         size(box.size[0]), size2(box.size[1]), howmanyffts(box.size[2]),

         stride(0), dist(0),

         blocks(1), block_stride(0),

         total_size(box.count()),

         embed({0, 0}),

         worksize(compute_workspace_size())

     {}


     template<typename precision_type, direction dir>

     void execute(std::complex<precision_type> data[], std::complex<precision_type> *workspace) const{

         if (std::is_same<precision_type, float>::value){

             if (dir == direction::forward)

                 make_plan(ccomplex_forward);

             else

                 make_plan(ccomplex_backward);

         }else{

             if (dir == direction::forward)

                 make_plan(zcomplex_forward);

             else

                 make_plan(zcomplex_backward);

         }

         rocfft_execution_info info;

         rocfft_execution_info_create(&info);

         rocfft_execution_info_set_stream(info, stream);


         size_t wsize = (std::is_same<precision_type, float>::value) ?

                             ((dir == direction::forward) ? ccomplex_forward->size_work() : ccomplex_backward->size_work()) :

                             ((dir == direction::forward) ? zcomplex_forward->size_work() : zcomplex_backward->size_work());


         if (wsize > 0)

             rocfft_execution_info_set_work_buffer(info, reinterpret_cast<void*>(workspace), wsize);


         for(int i=0; i<blocks; i++){

             void* block_data = reinterpret_cast<void*>(data + i * block_stride);

             rocm::check_error( rocfft_execute(

                 (std::is_same<precision_type, float>::value) ?

                     ((dir == direction::forward) ? *ccomplex_forward : *ccomplex_backward) :

                     ((dir == direction::forward) ? *zcomplex_forward : *zcomplex_backward),

                 &block_data, nullptr, info), "rocfft execute");

         }

         rocfft_execution_info_destroy(info);

     }


     void forward(std::complex<float> data[], std::complex<float> *workspace) const override{

         execute<float, direction::forward>(data, workspace);

     }

     void forward(std::complex<double> data[], std::complex<double> *workspace) const override{

         execute<double, direction::forward>(data, workspace);

     }

     void backward(std::complex<float> data[], std::complex<float> *workspace) const override{

         execute<float, direction::backward>(data, workspace);

     }

     void backward(std::complex<double> data[], std::complex<double> *workspace) const override{

         execute<double, direction::backward>(data, workspace);

     }


     void forward(float const indata[], std::complex<float> outdata[], std::complex<float> *workspace) const override{

         rocm::convert(stream, total_size, indata, outdata);

         forward(outdata, workspace);

     }

     void forward(double const indata[], std::complex<double> outdata[], std::complex<double> *workspace) const override{

         rocm::convert(stream, total_size, indata, outdata);

         forward(outdata, workspace);

     }

     void backward(std::complex<float> indata[], float outdata[], std::complex<float> *workspace) const override{

         backward(indata, workspace);

         rocm::convert(stream, total_size, indata, outdata);

     }

     void backward(std::complex<double> indata[], double outdata[], std::complex<double> *workspace) const override{

         backward(indata, workspace);

         rocm::convert(stream, total_size, indata, outdata);

     }


     int box_size() const override{ return total_size; }

     size_t workspace_size() const override{ return worksize; }

     size_t compute_workspace_size() const{

         make_plan(ccomplex_forward);

         make_plan(ccomplex_backward);

         make_plan(zcomplex_forward);

         make_plan(zcomplex_backward);

         return

         std::max( std::max(ccomplex_forward->size_work(), ccomplex_backward->size_work()) / sizeof(std::complex<float>),

                   std::max(zcomplex_forward->size_work(), zcomplex_backward->size_work()) / sizeof(std::complex<double>) ) + 1;

         return 0;

     }


 private:

     template<typename scalar_type, direction dir>

     void make_plan(std::unique_ptr<plan_rocfft<scalar_type, dir>> &plan) const{

         if (not plan){

             if (dist == 0)

                 plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(new plan_rocfft<scalar_type, dir>(size, size2, howmanyffts));

             else if (size2 == 0)

                 plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(new plan_rocfft<scalar_type, dir>(size, howmanyffts, stride, dist));

             else

                 plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(new plan_rocfft<scalar_type, dir>(size, size2, embed, howmanyffts, dist));

         }

     }


     mutable hipStream_t stream;


     int size, size2, howmanyffts, stride, dist, blocks, block_stride, total_size;

     std::array<size_t, 2> embed;

     mutable std::unique_ptr<plan_rocfft<std::complex<float>, direction::forward>> ccomplex_forward;

     mutable std::unique_ptr<plan_rocfft<std::complex<float>, direction::backward>> ccomplex_backward;

     mutable std::unique_ptr<plan_rocfft<std::complex<double>, direction::forward>> zcomplex_forward;

     mutable std::unique_ptr<plan_rocfft<std::complex<double>, direction::backward>> zcomplex_backward;


     size_t worksize;

 };


 class rocfft_executor_r2c : public executor_base{

 public:

     using executor_base::forward;

     using executor_base::backward;

     template<typename index>

     rocfft_executor_r2c(hipStream_t active_stream, box3d<index> const box, int dimension) :

         stream(active_stream),

         size(box.size[dimension]),

         howmanyffts(fft1d_get_howmany(box, dimension)),

         stride(fft1d_get_stride(box, dimension)),

         blocks((dimension == box.order[1]) ? box.osize(2) : 1),

         rdist((dimension == box.order[0]) ? size : 1),

         cdist((dimension == box.order[0]) ? size/2 + 1 : 1),

         rblock_stride(box.osize(0) * box.osize(1)),

         cblock_stride(box.osize(0) * (box.osize(1)/2 + 1)),

         rsize(box.count()),

         csize(box.r2c(dimension).count()),

         worksize(compute_workspace_size())

     {}


     template<typename precision_type>

     void forward(precision_type const indata[], std::complex<precision_type> outdata[], std::complex<precision_type> *workspace) const{

         if (std::is_same<precision_type, float>::value){

             make_plan(sforward);

         }else{

             make_plan(dforward);

         }


         rocfft_execution_info info;

         rocfft_execution_info_create(&info);

         rocfft_execution_info_set_stream(info, stream);


         size_t wsize = (std::is_same<precision_type, float>::value) ? sforward->size_work() : dforward->size_work();

         if (wsize > 0)

             rocfft_execution_info_set_work_buffer(info, reinterpret_cast<void*>(workspace), wsize);


         precision_type *copy_indata = reinterpret_cast<precision_type*>(

             reinterpret_cast<unsigned char *>(workspace) + wsize);

         backend::data_manipulator<tag::gpu>::copy_n(stream, indata, box_size(), copy_indata);


         for(int i=0; i<blocks; i++){

             void *rdata = const_cast<void*>(reinterpret_cast<void const*>(copy_indata + i * rblock_stride));

             void *cdata = reinterpret_cast<void*>(outdata + i * cblock_stride);

             rocm::check_error( rocfft_execute(

                 (std::is_same<precision_type, float>::value) ? *sforward : *dforward,

                 &rdata, &cdata, info), "rocfft execute");

         }

         rocfft_execution_info_destroy(info);

     }

     template<typename precision_type>

     void backward(std::complex<precision_type> indata[], precision_type outdata[], std::complex<precision_type> *workspace) const{

         if (std::is_same<precision_type, float>::value){

             make_plan(sbackward);

         }else{

             make_plan(dbackward);

         }


         rocfft_execution_info info;

         rocfft_execution_info_create(&info);

         rocfft_execution_info_set_stream(info, stream);


         size_t wsize = (std::is_same<precision_type, float>::value) ? sbackward->size_work() : dbackward->size_work();

         if (wsize > 0)

             rocfft_execution_info_set_work_buffer(info, reinterpret_cast<void*>(workspace), wsize);


         std::complex<precision_type> *copy_indata = reinterpret_cast<std::complex<precision_type>*>(

             reinterpret_cast<unsigned char *>(workspace) + wsize);

         backend::data_manipulator<tag::gpu>::copy_n(stream, indata, complex_size(), copy_indata);


         for(int i=0; i<blocks; i++){

             void *cdata = const_cast<void*>(reinterpret_cast<void const*>(copy_indata + i * cblock_stride));

             void *rdata = reinterpret_cast<void*>(outdata + i * rblock_stride);

             rocm::check_error( rocfft_execute(

                 (std::is_same<precision_type, float>::value) ? *sbackward : *dbackward,

                 &cdata, &rdata, info), "rocfft execute");

         }

         rocfft_execution_info_destroy(info);

     }

     void forward(float const indata[], std::complex<float> outdata[], std::complex<float> *workspace) const override{

         forward<float>(indata, outdata, workspace);

     }

     void backward(std::complex<float> indata[], float outdata[], std::complex<float> *workspace) const override{

         backward<float>(indata, outdata, workspace);

     }

     void forward(double const indata[], std::complex<double> outdata[], std::complex<double> *workspace) const override{

         forward<double>(indata, outdata, workspace);

     }

     void backward(std::complex<double> indata[], double outdata[], std::complex<double> *workspace) const override{

         backward<double>(indata, outdata, workspace);

     }


     int box_size() const override{ return rsize; }

     int complex_size() const override{ return csize; }

     size_t workspace_size() const override{ return worksize; }

     size_t compute_workspace_size() const{

         make_plan(sforward);

         make_plan(dforward);

         make_plan(sbackward);

         make_plan(dbackward);

         // Temporary copies have to be made, request that from user in addition, to what rocFFT requires.

         return

         std::max( std::max(sforward->size_work() + box_size() * sizeof(float),  sbackward->size_work() + complex_size() * sizeof(std::complex<float>))  / sizeof(std::complex<float>),

                   std::max(dforward->size_work() + box_size() * sizeof(double), dbackward->size_work() + complex_size() * sizeof(std::complex<double>)) / sizeof(std::complex<double>) ) + 1;

     }


 private:

     template<typename scalar_type, direction dir>

     void make_plan(std::unique_ptr<plan_rocfft<scalar_type, dir>> &plan) const{

         if (!plan) plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(new plan_rocfft<scalar_type, dir>(size, howmanyffts, stride, rdist, cdist));

     }


     mutable hipStream_t stream;


     int size, howmanyffts, stride, blocks;

     int rdist, cdist, rblock_stride, cblock_stride, rsize, csize;

     mutable std::unique_ptr<plan_rocfft<float, direction::forward>> sforward;

     mutable std::unique_ptr<plan_rocfft<double, direction::forward>> dforward;

     mutable std::unique_ptr<plan_rocfft<float, direction::backward>> sbackward;

     mutable std::unique_ptr<plan_rocfft<double, direction::backward>> dbackward;


     size_t worksize;

 };


 // Executor types of the rocfft backend: complex-to-complex and real-to-complex executors.
 template<> struct one_dim_backend<backend::rocfft>{

     using executor = rocfft_executor;

     using executor_r2c = rocfft_executor_r2c;

 };


 // Executor types of the rocfft_cos backend: the real-to-real executor combines the rocfft backend
 // with the cosine pre/post-processor; no real-to-complex executor is provided (void).
 template<> struct one_dim_backend<backend::rocfft_cos>{

     using executor = real2real_executor<backend::rocfft, rocm::cos_pre_pos_processor>;

     using executor_r2c = void;

 };

 // Executor types of the rocfft_sin backend: the real-to-real executor combines the rocfft backend
 // with the sine pre/post-processor; no real-to-complex executor is provided (void).
 template<> struct one_dim_backend<backend::rocfft_sin>{

     using executor = real2real_executor<backend::rocfft, rocm::sin_pre_pos_processor>;

     using executor_r2c = void;

 };


 /*
  * GPU specialization of the direct packer: forwards the sizes and strides
  * of the packing plan to the rocm::direct_pack() and rocm::direct_unpack() kernels wrappers.
  */
 template<> struct direct_packer<tag::gpu>{
     // Packs the sub-box described by the plan from data into the contiguous buffer.
     template<typename scalar_type, typename index>
     void pack(hipStream_t stream, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
         auto const &extent = plan.size;
         rocm::direct_pack(stream,
                           extent[0], extent[1], extent[2],
                           plan.line_stride, plan.plane_stride,
                           data, buffer);
     }
     // Unpacks the contiguous buffer into the sub-box of data described by the plan.
     template<typename scalar_type, typename index>
     void unpack(hipStream_t stream, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
         auto const &extent = plan.size;
         rocm::direct_unpack(stream,
                             extent[0], extent[1], extent[2],
                             plan.line_stride, plan.plane_stride,
                             buffer, data);
     }
 };


 template<> struct transpose_packer<tag::gpu>{

     template<typename scalar_type, typename index>

     void pack(hipStream_t stream, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{

         direct_packer<tag::gpu>().pack(stream, plan, data, buffer); // packing is done the same way as the direct_packer

     }

     template<typename scalar_type, typename index>

     void unpack(hipStream_t stream, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{

         rocm::transpose_unpack<scalar_type>(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride,

                                             plan.buff_line_stride, plan.buff_plane_stride, plan.map[0], plan.map[1], plan.map[2], buffer, data);

     }

 };


 /*
  * Scaling of data located on the GPU, implemented with rocm::scale_data().
  * The number of entries is always passed to the kernel wrapper as long long.
  */
 namespace data_scaling {
     /*
      * Scales real data.
      *
      * stream       - the stream passed to rocm::scale_data()
      * num_entries  - number of entries in data
      * data         - the entries to scale
      * scale_factor - the factor passed to rocm::scale_data()
      */
     template<typename scalar_type, typename index>
     static void apply(hipStream_t stream, index num_entries, scalar_type *data, double scale_factor){
         rocm::scale_data<scalar_type, long long>(stream, static_cast<long long>(num_entries), data, scale_factor);
     }
     /*
      * Scales complex data by treating it as real data with twice as many entries.
      * The count is widened to long long before doubling, otherwise the multiplication
      * is done in the index type and overflows for 32-bit indexes with more than 2^30 entries.
      */
     template<typename precision_type, typename index>
     static void apply(hipStream_t stream, index num_entries, std::complex<precision_type> *data, double scale_factor){
         long long const num_reals = 2 * static_cast<long long>(num_entries);
         apply<precision_type>(stream, num_reals, reinterpret_cast<precision_type*>(data), scale_factor);
     }
 }


 // Default plan options of the rocfft backend: reordering is enabled.
 template<> struct default_plan_options<backend::rocfft>{

     static const bool use_reorder = true;

 };

 // Default plan options of the rocfft_cos backend: reordering is enabled.
 template<> struct default_plan_options<backend::rocfft_cos>{

     static const bool use_reorder = true;

 };

 // Default plan options of the rocfft_sin backend: reordering is enabled.
 template<> struct default_plan_options<backend::rocfft_sin>{

     static const bool use_reorder = true;

 };


 }


 #endif


 #endif   /* HEFFTE_BACKEND_ROCM_H */

heffte::executor_base
Base class for all backend executors.
Definition: heffte_common.h:486

heffte::executor_base::complex_size
virtual int complex_size() const
Return the size of the complex-box (r2c executors).
Definition: heffte_common.h:519

heffte::executor_base::backward
virtual void backward(float[], float *) const
Backward r2r, single precision.
Definition: heffte_common.h:495

heffte::executor_base::forward
virtual void forward(float[], float *) const
Forward r2r, single precision.
Definition: heffte_common.h:491

heffte::rocfft_executor_r2c
Wrapper to rocFFT API for real-to-complex transform with shortening of the data.
Definition: heffte_backend_rocm.h:674

heffte::rocfft_executor_r2c::complex_size
int complex_size() const override
Returns the size of the box with complex coefficients.
Definition: heffte_backend_rocm.h:781

heffte::rocfft_executor_r2c::backward
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Backward transform, single precision.
Definition: heffte_backend_rocm.h:766

heffte::rocfft_executor_r2c::rocfft_executor_r2c
rocfft_executor_r2c(hipStream_t active_stream, box3d< index > const box, int dimension)
Constructor defines the box and the dimension of reduction.
Definition: heffte_backend_rocm.h:686

heffte::rocfft_executor_r2c::forward
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Forward transform, single precision.
Definition: heffte_backend_rocm.h:762

heffte::rocfft_executor_r2c::box_size
int box_size() const override
Returns the size of the box with real data.
Definition: heffte_backend_rocm.h:779

heffte::rocfft_executor_r2c::forward
void forward(precision_type const indata[], std::complex< precision_type > outdata[], std::complex< precision_type > *workspace) const
Forward transform, templated on the precision type.
Definition: heffte_backend_rocm.h:703

heffte::rocfft_executor_r2c::backward
void backward(std::complex< precision_type > indata[], precision_type outdata[], std::complex< precision_type > *workspace) const
Backward transform, templated on the precision type.
Definition: heffte_backend_rocm.h:733

heffte::rocfft_executor_r2c::compute_workspace_size
size_t compute_workspace_size() const
Computes the size of the needed workspace.
Definition: heffte_backend_rocm.h:785

heffte::rocfft_executor_r2c::backward
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Backward transform, double precision.
Definition: heffte_backend_rocm.h:774

heffte::rocfft_executor_r2c::workspace_size
size_t workspace_size() const override
Return the size of the needed workspace.
Definition: heffte_backend_rocm.h:783

heffte::rocfft_executor_r2c::forward
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Forward transform, double precision.
Definition: heffte_backend_rocm.h:770

heffte::rocfft_executor
Wrapper around the rocFFT API.
Definition: heffte_backend_rocm.h:487

heffte::rocfft_executor::forward
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Converts the real data to complex and performs double-complex forward transform.
Definition: heffte_backend_rocm.h:609

heffte::rocfft_executor::backward
virtual void backward(float[], float *) const
Bring forth methods that have not been overloaded.
Definition: heffte_common.h:495

heffte::rocfft_executor::backward
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Performs backward float-complex transform and truncates the complex part of the result.
Definition: heffte_backend_rocm.h:614

heffte::rocfft_executor::forward
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Converts the real data to complex and performs float-complex forward transform.
Definition: heffte_backend_rocm.h:604

heffte::rocfft_executor::forward
virtual void forward(float[], float *) const
Bring forth methods that have not been overloaded.
Definition: heffte_common.h:491

heffte::rocfft_executor::backward
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Performs backward double-complex transform and truncates the complex part of the result.
Definition: heffte_backend_rocm.h:619

heffte::rocfft_executor::rocfft_executor
rocfft_executor(hipStream_t active_stream, box3d< index > const box, int dimension)
Constructor, specifies the box and dimension.
Definition: heffte_backend_rocm.h:497

heffte::rocfft_executor::compute_workspace_size
size_t compute_workspace_size() const
Computes the size of the needed workspace.
Definition: heffte_backend_rocm.h:629

heffte::rocfft_executor::rocfft_executor
rocfft_executor(hipStream_t active_stream, box3d< index > const box, int dir1, int dir2)
Merges two FFTs into one.
Definition: heffte_backend_rocm.h:511

heffte::rocfft_executor::box_size
int box_size() const override
Returns the size of the box.
Definition: heffte_backend_rocm.h:625

heffte::rocfft_executor::forward
void forward(std::complex< double > data[], std::complex< double > *workspace) const override
Forward fft, double-complex case.
Definition: heffte_backend_rocm.h:591

heffte::rocfft_executor::backward
void backward(std::complex< float > data[], std::complex< float > *workspace) const override
Backward fft, float-complex case.
Definition: heffte_backend_rocm.h:595

heffte::rocfft_executor::workspace_size
size_t workspace_size() const override
Return the size of the needed workspace.
Definition: heffte_backend_rocm.h:627

heffte::rocfft_executor::execute
void execute(std::complex< precision_type > data[], std::complex< precision_type > *workspace) const
Perform an in-place FFT on the data in the given direction.
Definition: heffte_backend_rocm.h:552

heffte::rocfft_executor::rocfft_executor
rocfft_executor(hipStream_t active_stream, box3d< index > const box)
Merges three FFTs into one.
Definition: heffte_backend_rocm.h:540

heffte::rocfft_executor::backward
void backward(std::complex< double > data[], std::complex< double > *workspace) const override
Backward fft, double-complex case.
Definition: heffte_backend_rocm.h:599

heffte::rocfft_executor::forward
void forward(std::complex< float > data[], std::complex< float > *workspace) const override
Forward fft, float-complex case.
Definition: heffte_backend_rocm.h:587

heffte::fft1d_get_howmany
int fft1d_get_howmany(box3d< index > const box, int const dimension)
Return the number of 1-D ffts contained in the box in the given dimension.
Definition: heffte_geometry.h:159

heffte::fft1d_get_stride
int fft1d_get_stride(box3d< index > const box, int const dimension)
Return the stride of the 1-D ffts contained in the box in the given dimension.
Definition: heffte_geometry.h:169

heffte::direction::backward
@ backward
Inverse DFT transform.

heffte::direction::forward
@ forward
Forward DFT transform.

heffte::data_scaling::apply
void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Simply multiply the num_entries in the data by the scale_factor.
Definition: heffte_backend_cuda.h:796

heffte::rocm::check_error
void check_error(hipError_t status, const char *function_name)
Checks the status of a ROCm command and in case of a failure, converts it to a C++ exception.
Definition: heffte_backend_rocm.h:51

heffte::rocm::convert
void convert(hipStream_t stream, index num_entries, precision_type const source[], std::complex< precision_type > destination[])
Convert real numbers to complex when both are located on the GPU device.

heffte::rocm::transpose_unpack
void transpose_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2, scalar_type const source[], scalar_type destination[])
Performs a transpose-unpack operation for data sitting on the GPU device.

heffte::rocm::scale_data
void scale_data(hipStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Scales real data (double or float) by the scaling factor.

heffte::rocm::direct_pack
void direct_pack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-pack operation for data sitting on the GPU device.

heffte::rocm::direct_unpack
void direct_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-unpack operation for data sitting on the GPU device.

heffte
Namespace containing all HeFFTe methods and classes.
Definition: heffte_backend_cuda.h:38

heffte::backend::buffer_traits< rocfft >::container
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition: heffte_backend_rocm.h:272

heffte::backend::buffer_traits< rocfft_cos >::container
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition: heffte_backend_rocm.h:283

heffte::backend::buffer_traits< rocfft_sin >::container
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition: heffte_backend_rocm.h:294

heffte::backend::buffer_traits
Defines the container for the temporary buffers.
Definition: heffte_common.h:212

heffte::backend::cufft
Type-tag for the cuFFT backend.
Definition: heffte_common.h:147

heffte::backend::data_manipulator< tag::gpu >::copy_device_to_device
static void copy_device_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the data from the device to the device.
Definition: heffte_backend_rocm.h:251

heffte::backend::data_manipulator< tag::gpu >::stream_type
cudaStream_t stream_type
The stream type for the device.
Definition: heffte_backend_cuda.h:202

heffte::backend::data_manipulator< tag::gpu >::copy_n
static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Equivalent to std::copy_n() but using ROCm arrays.
Definition: heffte_backend_rocm.h:227

heffte::backend::data_manipulator< tag::gpu >::copy_n
static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, std::complex< scalar_type > destination[])
Copy-convert real-to-complex.
Definition: heffte_backend_rocm.h:240

heffte::backend::data_manipulator< tag::gpu >::copy_host_to_device
static void copy_host_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the data from the host to the device.
Definition: heffte_backend_rocm.h:257

heffte::backend::data_manipulator< tag::gpu >::copy_device_to_host
static void copy_device_to_host(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the data from the device to the host.
Definition: heffte_backend_rocm.h:245

heffte::backend::data_manipulator< tag::gpu >::allocate
static scalar_type * allocate(hipStream_t stream, size_t num_entries)
Allocate memory.
Definition: heffte_backend_rocm.h:212

heffte::backend::data_manipulator< tag::gpu >::free
static void free(hipStream_t stream, scalar_type *pntr)
Free memory.
Definition: heffte_backend_rocm.h:220

heffte::backend::data_manipulator< tag::gpu >::copy_n
static void copy_n(hipStream_t stream, std::complex< scalar_type > const source[], size_t num_entries, scalar_type destination[])
Copy-convert complex-to-real.
Definition: heffte_backend_rocm.h:235

heffte::backend::data_manipulator
Common data-transfer operations, must be specialized for each location (cpu/gpu).
Definition: heffte_common.h:59

heffte::backend::default_backend
Defines inverse mapping from the location tag to a default backend tag.
Definition: heffte_common.h:380

heffte::backend::device_instance< tag::gpu >
The CUDA backend uses a CUDA stream.
Definition: heffte_backend_cuda.h:172

heffte::backend::device_instance< tag::gpu >::device_instance
device_instance(hipStream_t new_stream=nullptr)
Constructor, sets up the stream.
Definition: heffte_backend_rocm.h:179

heffte::backend::device_instance< tag::gpu >::synchronize_device
void synchronize_device() const
Syncs the execution with the queue.
Definition: heffte_backend_rocm.h:185

heffte::backend::device_instance< tag::gpu >::stream
hipStream_t stream() const
Returns the nullptr (const case).
Definition: heffte_backend_rocm.h:183

heffte::backend::device_instance< tag::gpu >::stream_type
cudaStream_t stream_type
The type for the internal stream.
Definition: heffte_backend_cuda.h:184

heffte::backend::device_instance< tag::gpu >::stream
hipStream_t stream()
Returns the nullptr.
Definition: heffte_backend_rocm.h:181

heffte::backend::device_instance< tag::gpu >::_stream
hipStream_t _stream
The HIP stream to be used in all operations.
Definition: heffte_backend_rocm.h:187

heffte::backend::device_instance
Holds the auxiliary variables needed by each backend.
Definition: heffte_common.h:358

heffte::backend::is_enabled
Allows to define whether a specific backend interface has been enabled.
Definition: heffte_common.h:201

heffte::backend::rocfft_cos
Type-tag for the Cosine Transform using the rocFFT backend.
Definition: heffte_common.h:169

heffte::backend::rocfft_sin
Type-tag for the Sine Transform using the rocFFT backend.
Definition: heffte_common.h:174

heffte::backend::rocfft
Type-tag for the rocFFT backend.
Definition: heffte_common.h:164

heffte::box3d
A generic container that describes a 3d box of indexes.
Definition: heffte_geometry.h:67

heffte::box3d::size
std::array< index, 3 > const size
The number of indexes in each direction.
Definition: heffte_geometry.h:129

heffte::box3d::find_order
int find_order(int dir) const
Returns the effective order of the direction (dir), 0 -> fast, 1 -> mid, 2 -> slow.
Definition: heffte_geometry.h:121

heffte::default_plan_options
Defines a set of default plan options for a given backend.
Definition: heffte_common.h:642

heffte::direct_packer< tag::gpu >
Simple packer that copies sub-boxes without transposing the order of the indexes.
Definition: heffte_backend_cuda.h:759

heffte::direct_packer< tag::gpu >::unpack
void unpack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned unpack operation.
Definition: heffte_backend_rocm.h:865

heffte::direct_packer< tag::gpu >::pack
void pack(cudaStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_cuda.h:762

heffte::direct_packer< tag::gpu >::pack
void pack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_rocm.h:860

heffte::direct_packer
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition: heffte_pack3d.h:83

heffte::one_dim_backend< backend::rocfft_cos >::executor_r2c
void executor_r2c
Defines the real-to-complex executor.
Definition: heffte_backend_rocm.h:838

heffte::one_dim_backend< backend::rocfft_sin >::executor_r2c
void executor_r2c
Defines the real-to-complex executor.
Definition: heffte_backend_rocm.h:850

heffte::one_dim_backend
Indicates the structure that will be used by the fft backend.
Definition: heffte_common.h:546

heffte::pack_plan_3d
Holds the plan for a pack/unpack operation.
Definition: heffte_pack3d.h:32

heffte::pack_plan_3d::buff_plane_stride
index buff_plane_stride
Stride of the planes in the received buffer (transpose packing only).
Definition: heffte_pack3d.h:42

heffte::pack_plan_3d::line_stride
index line_stride
Stride of the lines.
Definition: heffte_pack3d.h:36

heffte::pack_plan_3d::plane_stride
index plane_stride
Stride of the planes.
Definition: heffte_pack3d.h:38

heffte::pack_plan_3d::size
std::array< index, 3 > size
Number of elements in the three directions.
Definition: heffte_pack3d.h:34

heffte::pack_plan_3d::map
std::array< int, 3 > map
Maps the i,j,k indexes from input to the output (transpose packing only).
Definition: heffte_pack3d.h:44

heffte::pack_plan_3d::buff_line_stride
index buff_line_stride
Stride of the lines in the received buffer (transpose packing only).
Definition: heffte_pack3d.h:40

heffte::plan_rocfft< std::complex< precision_type >, dir >::~plan_rocfft
~plan_rocfft()
Destructor, deletes the plan.
Definition: heffte_backend_rocm.h:463

heffte::plan_rocfft< std::complex< precision_type >, dir >::plan_rocfft
plan_rocfft(size_t size1, size_t size2, std::array< size_t, 2 > const &embed, size_t batch, size_t dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition: heffte_backend_rocm.h:407

heffte::plan_rocfft< std::complex< precision_type >, dir >::plan_rocfft
plan_rocfft(size_t size, size_t batch, size_t stride, size_t dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition: heffte_backend_rocm.h:372

heffte::plan_rocfft< std::complex< precision_type >, dir >::plan_rocfft
plan_rocfft(size_t size1, size_t size2, size_t size3)
Constructor, takes inputs identical to cufftPlan3d()
Definition: heffte_backend_rocm.h:436

heffte::plan_rocfft< std::complex< precision_type >, dir >::size_work
size_t size_work() const
Return the worksize.
Definition: heffte_backend_rocm.h:467

heffte::plan_rocfft
Plan for the r2c single precision transform.
Definition: heffte_backend_rocm.h:306

heffte::plan_rocfft::~plan_rocfft
~plan_rocfft()
Destructor, deletes the plan.
Definition: heffte_backend_rocm.h:345

heffte::plan_rocfft::size_work
size_t size_work() const
Return the worksize.
Definition: heffte_backend_rocm.h:349

heffte::plan_rocfft::plan_rocfft
plan_rocfft(size_t size, size_t batch, size_t stride, size_t rdist, size_t cdist)
Constructor and initializer of the plan.
Definition: heffte_backend_rocm.h:316

heffte::real2real_executor
Template algorithm for the Sine and Cosine transforms.
Definition: heffte_r2r_executor.h:135

heffte::rocm::cos_pre_pos_processor
Implementation of Cosine Transform pre-post processing methods using ROCm.
Definition: heffte_backend_rocm.h:121

heffte::rocm::cos_pre_pos_processor::post_forward
static void post_forward(hipStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.

heffte::rocm::cos_pre_pos_processor::pre_backward
static void pre_backward(hipStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.

heffte::rocm::cos_pre_pos_processor::pre_forward
static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.

heffte::rocm::cos_pre_pos_processor::post_backward
static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.

heffte::rocm::sin_pre_pos_processor
Implementation of Sine Transform pre-post processing methods using ROCm.
Definition: heffte_backend_rocm.h:139

heffte::rocm::sin_pre_pos_processor::post_backward
static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.

heffte::rocm::sin_pre_pos_processor::pre_backward
static void pre_backward(hipStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.

heffte::rocm::sin_pre_pos_processor::post_forward
static void post_forward(hipStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.

heffte::rocm::sin_pre_pos_processor::pre_forward
static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.

heffte::tag::gpu
Indicates the use of gpu backend and that all input/output data and arrays will be bound to the gpu d...
Definition: heffte_common.h:45

heffte::transpose_packer< tag::gpu >::pack
void pack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_rocm.h:877

heffte::transpose_packer< tag::gpu >::unpack
void unpack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned transpose-unpack operation.
Definition: heffte_backend_rocm.h:882