7 #ifndef HEFFTE_BACKEND_ROCM_H 
    8 #define HEFFTE_BACKEND_ROCM_H 
   10 #include "heffte_r2r_executor.h" 
   12 #ifdef Heffte_ENABLE_ROCM 
   14 #ifndef __HIP_PLATFORM_HCC__ 
   15 #define __HIP_PLATFORM_HCC__ 
   17 #include <hip/hip_runtime.h> 
   19 #include "heffte_backend_vector.h" 
   21 #ifdef Heffte_ENABLE_MAGMA 
   22 #include "heffte_magma_helpers.h" 
   51     inline void check_error(hipError_t status, 
const char *function_name){
 
   52         if (status != hipSuccess)
 
   53             throw std::runtime_error(std::string(function_name) + 
" failed with message: " + std::string(hipGetErrorString(status)));
 
   59     inline void check_error(rocfft_status status, 
const char *function_name){
 
   60         if (status != rocfft_status_success)
 
   61             throw std::runtime_error(std::string(function_name) + 
" failed with error code: " + std::to_string(status));
 
   70     template<
typename precision_type, 
typename index>
 
   71     void convert(hipStream_t stream, index num_entries, precision_type 
const source[], std::complex<precision_type> destination[]);
 
   78     template<
typename precision_type, 
typename index>
 
   79     void convert(hipStream_t stream, index num_entries, std::complex<precision_type> 
const source[], precision_type destination[]);
 
   85     template<
typename scalar_type, 
typename index>
 
   86     void scale_data(hipStream_t stream, index num_entries, scalar_type *data, 
double scale_factor);
 
   94     template<
typename scalar_type, 
typename index>
 
   95     void direct_pack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,
 
   96                     scalar_type 
const source[], scalar_type destination[]);
 
  103     template<
typename scalar_type, 
typename index>
 
  104     void direct_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,
 
  105                     scalar_type 
const source[], scalar_type destination[]);
 
  112     template<
typename scalar_type, 
typename index>
 
  113     void transpose_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,
 
  114                         index buff_line_stride, index buff_plane_stride, 
int map0, 
int map1, 
int map2,
 
  115                         scalar_type 
const source[], scalar_type destination[]);
 
  123         template<
typename precision>
 
  124         static void pre_forward(hipStream_t, 
int length, precision 
const input[], precision fft_signal[]);
 
  126         template<
typename precision>
 
  127         static void post_forward(hipStream_t, 
int length, std::complex<precision> 
const fft_result[], precision result[]);
 
  129         template<
typename precision>
 
  130         static void pre_backward(hipStream_t, 
int length, precision 
const input[], std::complex<precision> fft_signal[]);
 
  132         template<
typename precision>
 
  133         static void post_backward(hipStream_t, 
int length, precision 
const fft_result[], precision result[]);
 
  141         template<
typename precision>
 
  142         static void pre_forward(hipStream_t, 
int length, precision 
const input[], precision fft_signal[]);
 
  144         template<
typename precision>
 
  145         static void post_forward(hipStream_t, 
int length, std::complex<precision> 
const fft_result[], precision result[]);
 
  147         template<
typename precision>
 
  148         static void pre_backward(hipStream_t, 
int length, precision 
const input[], std::complex<precision> fft_signal[]);
 
  150         template<
typename precision>
 
  151         static void post_backward(hipStream_t, 
int length, precision 
const fft_result[], precision result[]);
 
  183         hipStream_t 
stream()
 const{ 
return _stream; }
 
  211         template<
typename scalar_type>
 
  212         static scalar_type* 
allocate(hipStream_t stream, 
size_t num_entries){
 
  214             if (stream != 
nullptr) 
rocm::check_error( hipStreamSynchronize(stream), 
"hipStreamSynchronize()");
 
  215             rocm::check_error(hipMalloc(&new_data, num_entries * 
sizeof(scalar_type)), 
"hipMalloc()");
 
  216             return reinterpret_cast<scalar_type*
>(new_data);
 
  219         template<
typename scalar_type>
 
  220         static void free(hipStream_t stream, scalar_type *pntr){
 
  221             if (pntr == 
nullptr) 
return;
 
  222             if (stream != 
nullptr) 
rocm::check_error( hipStreamSynchronize(stream), 
"hipStreamSynchronize()");
 
  226         template<
typename scalar_type>
 
  227         static void copy_n(hipStream_t stream, scalar_type 
const source[], 
size_t num_entries, scalar_type destination[]){
 
  228             if (stream == 
nullptr)
 
  229                 rocm::check_error(hipMemcpy(destination, source, num_entries * 
sizeof(scalar_type), hipMemcpyDeviceToDevice), 
"data_manipulator::copy_n()");
 
  231                 rocm::check_error(hipMemcpyAsync(destination, source, num_entries * 
sizeof(scalar_type), hipMemcpyDeviceToDevice, stream), 
"data_manipulator::copy_n()");
 
  234         template<
typename scalar_type>
 
  235         static void copy_n(hipStream_t stream, std::complex<scalar_type> 
const source[], 
size_t num_entries, scalar_type destination[]){
 
  236             rocm::convert(stream, 
static_cast<long long>(num_entries), source, destination);
 
  239         template<
typename scalar_type>
 
  240         static void copy_n(hipStream_t stream, scalar_type 
const source[], 
size_t num_entries, std::complex<scalar_type> destination[]){
 
  241             rocm::convert(stream, 
static_cast<long long>(num_entries), source, destination);
 
  244         template<
typename scalar_type>
 
  245         static void copy_device_to_host(hipStream_t stream, scalar_type 
const source[], 
size_t num_entries, scalar_type destination[]){
 
  246             rocm::check_error(hipMemcpyAsync(destination, source, num_entries * 
sizeof(scalar_type), hipMemcpyDeviceToHost, stream),
 
  247                             "device_to_host (rocm)");
 
  250         template<
typename scalar_type>
 
  251         static void copy_device_to_device(hipStream_t stream, scalar_type 
const source[], 
size_t num_entries, scalar_type destination[]){
 
  252                 rocm::check_error(hipMemcpyAsync(destination, source, num_entries * 
sizeof(scalar_type), hipMemcpyDeviceToDevice, stream),
 
  253                                 "device_to_device (rocm)");
 
  256         template<
typename scalar_type>
 
  257         static void copy_host_to_device(hipStream_t stream, scalar_type 
const source[], 
size_t num_entries, scalar_type destination[]){
 
  258             rocm::check_error(hipMemcpyAsync(destination, source, num_entries * 
sizeof(scalar_type), hipMemcpyHostToDevice, stream),
 
  259                             "host_to_device (rocm)");
 
  272         template<
typename T> 
using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
 
  283         template<
typename T> 
using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
 
  294         template<
typename T> 
using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
 
  305 template<
typename precision_type, direction dir>
 
  316     plan_rocfft(
size_t size, 
size_t batch, 
size_t stride, 
size_t rdist, 
size_t cdist){
 
  318         rocfft_plan_description desc = 
nullptr;
 
  322             rocfft_plan_description_set_data_layout(
 
  324                 (dir == 
direction::forward) ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved,
 
  325                 (dir == 
direction::forward) ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real,
 
  334         rocfft_plan_create(&plan, rocfft_placement_notinplace,
 
  335                            (dir == 
direction::forward) ? rocfft_transform_type_real_forward : rocfft_transform_type_real_inverse,
 
  336                            (std::is_same<precision_type, float>::value)? rocfft_precision_single : rocfft_precision_double,
 
  337                            1, &size, batch, desc),
 
  340         rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), 
"get_worksize");
 
  347     operator rocfft_plan()
 const{ 
return plan; }
 
  362 template<
typename precision_type, direction dir>
 
  372     plan_rocfft(
size_t size, 
size_t batch, 
size_t stride, 
size_t dist) : plan(nullptr), worksize(0){
 
  373         rocfft_plan_description desc = 
nullptr;
 
  377             rocfft_plan_description_set_data_layout(
 
  379                 rocfft_array_type_complex_interleaved,
 
  380                 rocfft_array_type_complex_interleaved,
 
  382                 1, &stride, dist, 1, &stride, dist
 
  388         rocfft_plan_create(&plan, rocfft_placement_inplace,
 
  389                            (dir == 
direction::forward) ? rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse,
 
  390                            (std::is_same<precision_type, float>::value)? rocfft_precision_single : rocfft_precision_double,
 
  391                            1, &size, batch, desc),
 
  394         rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), 
"get_worksize");
 
  407     plan_rocfft(
size_t size1, 
size_t size2, std::array<size_t, 2> 
const &embed, 
size_t batch, 
size_t dist) : plan(nullptr), worksize(0){
 
  408         size_t size[2] = {size1, size2};
 
  410         rocfft_plan_description desc = 
nullptr;
 
  414             rocfft_plan_description_set_data_layout(
 
  416                 rocfft_array_type_complex_interleaved,
 
  417                 rocfft_array_type_complex_interleaved,
 
  419                 2, embed.data(), dist, 2, embed.data(), dist
 
  425         rocfft_plan_create(&plan, rocfft_placement_inplace,
 
  426                            (dir == 
direction::forward) ? rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse,
 
  427                            (std::is_same<precision_type, float>::value) ? rocfft_precision_single : rocfft_precision_double,
 
  428                            2, size, batch, desc),
 
  431         rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), 
"get_worksize");
 
  437         std::array<size_t, 3> size = {size1, size2, size3};
 
  438         rocfft_plan_description desc = 
nullptr;
 
  442             rocfft_plan_description_set_data_layout(
 
  444                 rocfft_array_type_complex_interleaved,
 
  445                 rocfft_array_type_complex_interleaved,
 
  446                 nullptr, 
nullptr, 3, 
nullptr, 1, 3, 
nullptr, 1
 
  452         rocfft_plan_create(&plan, rocfft_placement_inplace,
 
  453                            (dir == 
direction::forward) ? rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse,
 
  454                            (std::is_same<precision_type, float>::value) ? rocfft_precision_single : rocfft_precision_double,
 
  455                            3, size.data(), 1, desc),
 
  458         rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), 
"get_worksize");
 
  465     operator rocfft_plan()
 const{ 
return plan; }
 
  496     template<
typename index>
 
  498         stream(active_stream),
 
  499         size(box.size[dimension]), size2(0),
 
  502         dist((dimension == box.order[0]) ? size : 1),
 
  503         blocks((dimension == box.order[1]) ? box.osize(2) : 1),
 
  504         block_stride(box.osize(0) * box.osize(1)),
 
  505         total_size(box.count()),
 
  510     template<
typename index>
 
  512         stream(active_stream),
 
  513         size(box.size[std::min(dir1, dir2)]), size2(box.size[std::max(dir1, dir2)]),
 
  514         blocks(1), block_stride(0), total_size(box.count()),
 
  520         if (std::min(odir1, odir2) == 0 and std::max(odir1, odir2) == 1){
 
  523             embed = {
static_cast<size_t>(stride), 
static_cast<size_t>(size)};
 
  524             howmanyffts = box.
size[2];
 
  525         }
else if (std::min(odir1, odir2) == 1 and std::max(odir1, odir2) == 2){
 
  526             stride = box.
size[0];
 
  528             embed = {
static_cast<size_t>(stride), 
static_cast<size_t>(size) * 
static_cast<size_t>(stride)};
 
  529             howmanyffts = box.
size[0];
 
  533             embed = {
static_cast<size_t>(stride), 
static_cast<size_t>(box.
size[1]) * 
static_cast<size_t>(box.
size[0])};
 
  534             howmanyffts = box.
size[1];
 
  539     template<
typename index>
 
  541         stream(active_stream),
 
  542         size(box.size[0]), size2(box.size[1]), howmanyffts(box.size[2]),
 
  544         blocks(1), block_stride(0),
 
  545         total_size(box.count()),
 
  551     template<
typename precision_type, direction dir>
 
  552     void execute(std::complex<precision_type> data[], std::complex<precision_type> *workspace)
 const{
 
  553         if (std::is_same<precision_type, float>::value){
 
  555                 make_plan(ccomplex_forward);
 
  557                 make_plan(ccomplex_backward);
 
  560                 make_plan(zcomplex_forward);
 
  562                 make_plan(zcomplex_backward);
 
  564         rocfft_execution_info info;
 
  565         rocfft_execution_info_create(&info);
 
  566         rocfft_execution_info_set_stream(info, stream);
 
  568         size_t wsize = (std::is_same<precision_type, float>::value) ?
 
  569                             ((dir == 
direction::forward) ? ccomplex_forward->size_work() : ccomplex_backward->size_work()) :
 
  570                             ((dir == 
direction::forward) ? zcomplex_forward->size_work() : zcomplex_backward->size_work());
 
  573             rocfft_execution_info_set_work_buffer(info, 
reinterpret_cast<void*
>(workspace), wsize);
 
  575         for(
int i=0; i<blocks; i++){
 
  576             void* block_data = 
reinterpret_cast<void*
>(data + i * block_stride);
 
  578                 (std::is_same<precision_type, float>::value) ?
 
  581                 &block_data, 
nullptr, info), 
"rocfft execute");
 
  583         rocfft_execution_info_destroy(info);
 
  587     void forward(std::complex<float> data[], std::complex<float> *workspace)
 const override{
 
  588         execute<float, direction::forward>(data, workspace);
 
  591     void forward(std::complex<double> data[], std::complex<double> *workspace)
 const override{
 
  592         execute<double, direction::forward>(data, workspace);
 
  595     void backward(std::complex<float> data[], std::complex<float> *workspace)
 const override{
 
  596         execute<float, direction::backward>(data, workspace);
 
  599     void backward(std::complex<double> data[], std::complex<double> *workspace)
 const override{
 
  600         execute<double, direction::backward>(data, workspace);
 
  604     void forward(
float const indata[], std::complex<float> outdata[], std::complex<float> *workspace)
 const override{
 
  609     void forward(
double const indata[], std::complex<double> outdata[], std::complex<double> *workspace)
 const override{
 
  614     void backward(std::complex<float> indata[], 
float outdata[], std::complex<float> *workspace)
 const override{
 
  619     void backward(std::complex<double> indata[], 
double outdata[], std::complex<double> *workspace)
 const override{
 
  625     int box_size()
 const override{ 
return total_size; }
 
  630         make_plan(ccomplex_forward);
 
  631         make_plan(ccomplex_backward);
 
  632         make_plan(zcomplex_forward);
 
  633         make_plan(zcomplex_backward);
 
  635         std::max( std::max(ccomplex_forward->size_work(), ccomplex_backward->size_work()) / 
sizeof(std::complex<float>),
 
  636                   std::max(zcomplex_forward->size_work(), zcomplex_backward->size_work()) / 
sizeof(std::complex<double>) ) + 1;
 
  642     template<
typename scalar_type, direction dir>
 
  654     mutable hipStream_t stream;
 
  656     int size, size2, howmanyffts, stride, dist, blocks, block_stride, total_size;
 
  657     std::array<size_t, 2> embed;
 
  658     mutable std::unique_ptr<plan_rocfft<std::complex<float>, 
direction::forward>> ccomplex_forward;
 
  659     mutable std::unique_ptr<plan_rocfft<std::complex<float>, 
direction::backward>> ccomplex_backward;
 
  660     mutable std::unique_ptr<plan_rocfft<std::complex<double>, 
direction::forward>> zcomplex_forward;
 
  661     mutable std::unique_ptr<plan_rocfft<std::complex<double>, 
direction::backward>> zcomplex_backward;
 
  685     template<
typename index>
 
  687         stream(active_stream),
 
  688         size(box.size[dimension]),
 
  691         blocks((dimension == box.order[1]) ? box.osize(2) : 1),
 
  692         rdist((dimension == box.order[0]) ? size : 1),
 
  693         cdist((dimension == box.order[0]) ? size/2 + 1 : 1),
 
  694         rblock_stride(box.osize(0) * box.osize(1)),
 
  695         cblock_stride(box.osize(0) * (box.osize(1)/2 + 1)),
 
  697         csize(box.r2c(dimension).count()),
 
  702     template<
typename precision_type>
 
  703     void forward(precision_type 
const indata[], std::complex<precision_type> outdata[], std::complex<precision_type> *workspace)
 const{
 
  704         if (std::is_same<precision_type, float>::value){
 
  710         rocfft_execution_info info;
 
  711         rocfft_execution_info_create(&info);
 
  712         rocfft_execution_info_set_stream(info, stream);
 
  714         size_t wsize = (std::is_same<precision_type, float>::value) ? sforward->size_work() : dforward->size_work();
 
  716             rocfft_execution_info_set_work_buffer(info, 
reinterpret_cast<void*
>(workspace), wsize);
 
  718         precision_type *copy_indata = 
reinterpret_cast<precision_type*
>(
 
  719             reinterpret_cast<unsigned char *
>(workspace) + wsize);
 
  722         for(
int i=0; i<blocks; i++){
 
  723             void *rdata = 
const_cast<void*
>(
reinterpret_cast<void const*
>(copy_indata + i * rblock_stride));
 
  724             void *cdata = 
reinterpret_cast<void*
>(outdata + i * cblock_stride);
 
  726                 (std::is_same<precision_type, float>::value) ? *sforward : *dforward,
 
  727                 &rdata, &cdata, info), 
"rocfft execute");
 
  729         rocfft_execution_info_destroy(info);
 
  732     template<
typename precision_type>
 
  733     void backward(std::complex<precision_type> indata[], precision_type outdata[], std::complex<precision_type> *workspace)
 const{
 
  734         if (std::is_same<precision_type, float>::value){
 
  735             make_plan(sbackward);
 
  737             make_plan(dbackward);
 
  740         rocfft_execution_info info;
 
  741         rocfft_execution_info_create(&info);
 
  742         rocfft_execution_info_set_stream(info, stream);
 
  744         size_t wsize = (std::is_same<precision_type, float>::value) ? sbackward->size_work() : dbackward->size_work();
 
  746             rocfft_execution_info_set_work_buffer(info, 
reinterpret_cast<void*
>(workspace), wsize);
 
  748         std::complex<precision_type> *copy_indata = 
reinterpret_cast<std::complex<precision_type>*
>(
 
  749             reinterpret_cast<unsigned char *
>(workspace) + wsize);
 
  752         for(
int i=0; i<blocks; i++){
 
  753             void *cdata = 
const_cast<void*
>(
reinterpret_cast<void const*
>(copy_indata + i * cblock_stride));
 
  754             void *rdata = 
reinterpret_cast<void*
>(outdata + i * rblock_stride);
 
  756                 (std::is_same<precision_type, float>::value) ? *sbackward : *dbackward,
 
  757                 &cdata, &rdata, info), 
"rocfft execute");
 
  759         rocfft_execution_info_destroy(info);
 
  762     void forward(
float const indata[], std::complex<float> outdata[], std::complex<float> *workspace)
 const override{
 
  763         forward<float>(indata, outdata, workspace);
 
  766     void backward(std::complex<float> indata[], 
float outdata[], std::complex<float> *workspace)
 const override{
 
  767         backward<float>(indata, outdata, workspace);
 
  770     void forward(
double const indata[], std::complex<double> outdata[], std::complex<double> *workspace)
 const override{
 
  771         forward<double>(indata, outdata, workspace);
 
  774     void backward(std::complex<double> indata[], 
double outdata[], std::complex<double> *workspace)
 const override{
 
  775         backward<double>(indata, outdata, workspace);
 
  788         make_plan(sbackward);
 
  789         make_plan(dbackward);
 
  792         std::max( std::max(sforward->size_work() + 
box_size() * 
sizeof(
float),  sbackward->size_work() + 
complex_size() * 
sizeof(std::complex<float>))  / 
sizeof(std::complex<float>),
 
  793                   std::max(dforward->size_work() + 
box_size() * 
sizeof(
double), dbackward->size_work() + 
complex_size() * 
sizeof(std::complex<double>)) / 
sizeof(std::complex<double>) ) + 1;
 
  798     template<
typename scalar_type, direction dir>
 
  800         if (!plan) plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(
new plan_rocfft<scalar_type, dir>(size, howmanyffts, stride, rdist, cdist));
 
  803     mutable hipStream_t stream;
 
  805     int size, howmanyffts, stride, blocks;
 
  806     int rdist, cdist, rblock_stride, cblock_stride, rsize, csize;
 
  807     mutable std::unique_ptr<plan_rocfft<float, direction::forward>> sforward;
 
  808     mutable std::unique_ptr<plan_rocfft<double, direction::forward>> dforward;
 
  809     mutable std::unique_ptr<plan_rocfft<float, direction::backward>> sbackward;
 
  810     mutable std::unique_ptr<plan_rocfft<double, direction::backward>> dbackward;
 
  859     template<
typename scalar_type, 
typename index>
 
  864     template<
typename scalar_type, 
typename index>
 
  874 template<> 
struct transpose_packer<tag::gpu>{
 
  876     template<
typename scalar_type, 
typename index>
 
  881     template<
typename scalar_type, 
typename index>
 
  888 namespace data_scaling {
 
  893     template<
typename scalar_type, 
typename index>
 
  894     static void apply(hipStream_t stream, index num_entries, scalar_type *data, 
double scale_factor){
 
  895         rocm::scale_data<scalar_type, long long>(stream, 
static_cast<long long>(num_entries), data, scale_factor);
 
  901     template<
typename precision_type, 
typename index>
 
  902     static void apply(hipStream_t stream, index num_entries, std::complex<precision_type> *data, 
double scale_factor){
 
  903         apply<precision_type>(stream, 2*num_entries, 
reinterpret_cast<precision_type*
>(data), scale_factor);
 
  913     static const bool use_reorder = 
true;
 
  921     static const bool use_reorder = 
true;
 
  929     static const bool use_reorder = 
true;
 
Base class for all backend executors.
Definition: heffte_common.h:486
virtual int complex_size() const
Return the size of the complex-box (r2c executors).
Definition: heffte_common.h:519
virtual void backward(float[], float *) const
Backward r2r, single precision.
Definition: heffte_common.h:495
virtual void forward(float[], float *) const
Forward r2r, single precision.
Definition: heffte_common.h:491
Wrapper to rocFFT API for real-to-complex transform with shortening of the data.
Definition: heffte_backend_rocm.h:674
int complex_size() const override
Returns the size of the box with complex coefficients.
Definition: heffte_backend_rocm.h:781
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Backward transform, single precision.
Definition: heffte_backend_rocm.h:766
rocfft_executor_r2c(hipStream_t active_stream, box3d< index > const box, int dimension)
Constructor defines the box and the dimension of reduction.
Definition: heffte_backend_rocm.h:686
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Forward transform, single precision.
Definition: heffte_backend_rocm.h:762
int box_size() const override
Returns the size of the box with real data.
Definition: heffte_backend_rocm.h:779
void forward(precision_type const indata[], std::complex< precision_type > outdata[], std::complex< precision_type > *workspace) const
Forward transform, single precision.
Definition: heffte_backend_rocm.h:703
void backward(std::complex< precision_type > indata[], precision_type outdata[], std::complex< precision_type > *workspace) const
Backward transform, single precision.
Definition: heffte_backend_rocm.h:733
size_t compute_workspace_size() const
Computes the size of the needed workspace.
Definition: heffte_backend_rocm.h:785
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Backward transform, double precision.
Definition: heffte_backend_rocm.h:774
size_t workspace_size() const override
Return the size of the needed workspace.
Definition: heffte_backend_rocm.h:783
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Forward transform, double precision.
Definition: heffte_backend_rocm.h:770
Wrapper around the rocFFT API.
Definition: heffte_backend_rocm.h:487
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Converts the deal data to complex and performs double-complex forward transform.
Definition: heffte_backend_rocm.h:609
virtual void backward(float[], float *) const
Bring forth method that have not been overloaded.
Definition: heffte_common.h:495
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Performs backward float-complex transform and truncates the complex part of the result.
Definition: heffte_backend_rocm.h:614
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Converts the deal data to complex and performs float-complex forward transform.
Definition: heffte_backend_rocm.h:604
virtual void forward(float[], float *) const
Bring forth method that have not been overloaded.
Definition: heffte_common.h:491
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Performs backward double-complex transform and truncates the complex part of the result.
Definition: heffte_backend_rocm.h:619
rocfft_executor(hipStream_t active_stream, box3d< index > const box, int dimension)
Constructor, specifies the box and dimension.
Definition: heffte_backend_rocm.h:497
size_t compute_workspace_size() const
Computes the size of the needed workspace.
Definition: heffte_backend_rocm.h:629
rocfft_executor(hipStream_t active_stream, box3d< index > const box, int dir1, int dir2)
Merges two FFTs into one.
Definition: heffte_backend_rocm.h:511
int box_size() const override
Returns the size of the box.
Definition: heffte_backend_rocm.h:625
void forward(std::complex< double > data[], std::complex< double > *workspace) const override
Forward fft, double-complex case.
Definition: heffte_backend_rocm.h:591
void backward(std::complex< float > data[], std::complex< float > *workspace) const override
Backward fft, float-complex case.
Definition: heffte_backend_rocm.h:595
size_t workspace_size() const override
Return the size of the needed workspace.
Definition: heffte_backend_rocm.h:627
void execute(std::complex< precision_type > data[], std::complex< precision_type > *workspace) const
Perform an in-place FFT on the data in the given direction.
Definition: heffte_backend_rocm.h:552
rocfft_executor(hipStream_t active_stream, box3d< index > const box)
Merges three FFTs into one.
Definition: heffte_backend_rocm.h:540
void backward(std::complex< double > data[], std::complex< double > *workspace) const override
Backward fft, double-complex case.
Definition: heffte_backend_rocm.h:599
void forward(std::complex< float > data[], std::complex< float > *workspace) const override
Forward fft, float-complex case.
Definition: heffte_backend_rocm.h:587
int fft1d_get_howmany(box3d< index > const box, int const dimension)
Return the number of 1-D ffts contained in the box in the given dimension.
Definition: heffte_geometry.h:159
int fft1d_get_stride(box3d< index > const box, int const dimension)
Return the stride of the 1-D ffts contained in the box in the given dimension.
Definition: heffte_geometry.h:169
@ backward
Inverse DFT transform.
@ forward
Forward DFT transform.
void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Simply multiply the num_entries in the data by the scale_factor.
Definition: heffte_backend_cuda.h:796
void check_error(hipError_t status, const char *function_name)
Checks the status of a ROCm command and in case of a failure, converts it to a C++ exception.
Definition: heffte_backend_rocm.h:51
void convert(hipStream_t stream, index num_entries, precision_type const source[], std::complex< precision_type > destination[])
Convert real numbers to complex when both are located on the GPU device.
void transpose_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2, scalar_type const source[], scalar_type destination[])
Performs a transpose-unpack operation for data sitting on the GPU device.
void scale_data(hipStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Scales real data (double or float) by the scaling factor.
void direct_pack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-pack operation for data sitting on the GPU device.
void direct_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-unpack operation for data sitting on the GPU device.
Namespace containing all HeFFTe methods and classes.
Definition: heffte_backend_cuda.h:38
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition: heffte_backend_rocm.h:272
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition: heffte_backend_rocm.h:283
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition: heffte_backend_rocm.h:294
Defines the container for the temporary buffers.
Definition: heffte_common.h:212
Type-tag for the cuFFT backend.
Definition: heffte_common.h:147
static void copy_device_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the device to the device.
Definition: heffte_backend_rocm.h:251
cudaStream_t stream_type
The stream type for the device.
Definition: heffte_backend_cuda.h:202
static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Equivalent to std::copy_n() but using CUDA arrays.
Definition: heffte_backend_rocm.h:227
static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, std::complex< scalar_type > destination[])
Copy-convert real-to-complex.
Definition: heffte_backend_rocm.h:240
static void copy_host_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the host to the device.
Definition: heffte_backend_rocm.h:257
static void copy_device_to_host(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the device to the host.
Definition: heffte_backend_rocm.h:245
static scalar_type * allocate(hipStream_t stream, size_t num_entries)
Allocate memory.
Definition: heffte_backend_rocm.h:212
static void free(hipStream_t stream, scalar_type *pntr)
Free memory.
Definition: heffte_backend_rocm.h:220
static void copy_n(hipStream_t stream, std::complex< scalar_type > const source[], size_t num_entries, scalar_type destination[])
Copy-convert complex-to-real.
Definition: heffte_backend_rocm.h:235
Common data-transfer operations, must be specializes for each location (cpu/gpu).
Definition: heffte_common.h:59
Defines inverse mapping from the location tag to a default backend tag.
Definition: heffte_common.h:380
The CUDA backend uses a CUDA stream.
Definition: heffte_backend_cuda.h:172
device_instance(hipStream_t new_stream=nullptr)
Constructor, sets up the stream.
Definition: heffte_backend_rocm.h:179
void synchronize_device() const
Syncs the execution with the queue.
Definition: heffte_backend_rocm.h:185
hipStream_t stream() const
Returns the nullptr (const case).
Definition: heffte_backend_rocm.h:183
cudaStream_t stream_type
The type for the internal stream.
Definition: heffte_backend_cuda.h:184
hipStream_t stream()
Returns the nullptr.
Definition: heffte_backend_rocm.h:181
hipStream_t _stream
The CUDA stream to be used in all operations.
Definition: heffte_backend_rocm.h:187
Holds the auxiliary variables needed by each backend.
Definition: heffte_common.h:358
Allows to define whether a specific backend interface has been enabled.
Definition: heffte_common.h:201
Type-tag for the Cosine Transform using the rocFFT backend.
Definition: heffte_common.h:169
Type-tag for the Sine Transform using the rocFFT backend.
Definition: heffte_common.h:174
Type-tag for the rocFFT backend.
Definition: heffte_common.h:164
A generic container that describes a 3d box of indexes.
Definition: heffte_geometry.h:67
std::array< index, 3 > const size
The number of indexes in each direction.
Definition: heffte_geometry.h:129
int find_order(int dir) const
Returns the effective order of the direction (dir), 0 -> fast, 1 -> mid, 2 -> slow.
Definition: heffte_geometry.h:121
Defines a set of default plan options for a given backend.
Definition: heffte_common.h:642
Simple packer that copies sub-boxes without transposing the order of the indexes.
Definition: heffte_backend_cuda.h:759
void unpack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned unpack operation.
Definition: heffte_backend_rocm.h:865
void pack(cudaStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_cuda.h:762
void pack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_rocm.h:860
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition: heffte_pack3d.h:83
void executor_r2c
Defines the real-to-complex executor.
Definition: heffte_backend_rocm.h:838
void executor_r2c
Defines the real-to-complex executor.
Definition: heffte_backend_rocm.h:850
Indicates the structure that will be used by the fft backend.
Definition: heffte_common.h:546
Holds the plan for a pack/unpack operation.
Definition: heffte_pack3d.h:32
index buff_plane_stride
Stride of the planes in the received buffer (transpose packing only).
Definition: heffte_pack3d.h:42
index line_stride
Stride of the lines.
Definition: heffte_pack3d.h:36
index plane_stride
Stride of the planes.
Definition: heffte_pack3d.h:38
std::array< index, 3 > size
Number of elements in the three directions.
Definition: heffte_pack3d.h:34
std::array< int, 3 > map
Maps the i,j,k indexes from input to the output (transpose packing only).
Definition: heffte_pack3d.h:44
index buff_line_stride
Stride of the lines in the received buffer (transpose packing only).
Definition: heffte_pack3d.h:40
~plan_rocfft()
Destructor, deletes the plan.
Definition: heffte_backend_rocm.h:463
plan_rocfft(size_t size1, size_t size2, std::array< size_t, 2 > const &embed, size_t batch, size_t dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition: heffte_backend_rocm.h:407
plan_rocfft(size_t size, size_t batch, size_t stride, size_t dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition: heffte_backend_rocm.h:372
plan_rocfft(size_t size1, size_t size2, size_t size3)
Constructor, takes inputs identical to cufftPlan3d()
Definition: heffte_backend_rocm.h:436
size_t size_work() const
Return the worksize.
Definition: heffte_backend_rocm.h:467
Plan for the r2c single precision transform.
Definition: heffte_backend_rocm.h:306
~plan_rocfft()
Destructor, deletes the plan.
Definition: heffte_backend_rocm.h:345
size_t size_work() const
Return the worksize.
Definition: heffte_backend_rocm.h:349
plan_rocfft(size_t size, size_t batch, size_t stride, size_t rdist, size_t cdist)
Constructor and initializer of the plan.
Definition: heffte_backend_rocm.h:316
Template algorithm for the Sine and Cosine transforms.
Definition: heffte_r2r_executor.h:135
Implementation of Cosine Transform pre-post processing methods using CUDA.
Definition: heffte_backend_rocm.h:121
static void post_forward(hipStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void pre_backward(hipStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
Implementation of Sine Transform pre-post processing methods using CUDA.
Definition: heffte_backend_rocm.h:139
static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static void pre_backward(hipStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void post_forward(hipStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
Indicates the use of gpu backend and that all input/output data and arrays will be bound to the gpu d...
Definition: heffte_common.h:45
void pack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_backend_rocm.h:877
void unpack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned transpose-unpack operation.
Definition: heffte_backend_rocm.h:882