Highly Efficient FFT for Exascale: HeFFTe v2.3
heffte_pack3d.h
1 /*
2  -- heFFTe --
3  Univ. of Tennessee, Knoxville
4  @date
5 */
6 
7 #ifndef HEFFTE_PACK3D_H
8 #define HEFFTE_PACK3D_H
9 
10 #include "heffte_common.h"
11 
25 namespace heffte {
26 
/*!
 * \brief Holds the plan for a pack/unpack operation.
 *
 * \tparam index is the integer type used for sizes and strides
 */
template<typename index>
struct pack_plan_3d{
    //! \brief Number of elements in the three directions.
    std::array<index, 3> size;
    //! \brief Stride of the lines.
    index line_stride;
    //! \brief Stride of the planes.
    index plane_stride;
    //! \brief Stride of the lines in the received buffer (transpose packing only).
    index buff_line_stride;
    //! \brief Stride of the planes in the received buffer (transpose packing only).
    index buff_plane_stride;
    //! \brief Maps the i,j,k indexes from input to the output (transpose packing only).
    std::array<int, 3> map;
};
46 
51 template<typename index>
52 inline std::ostream & operator << (std::ostream &os, pack_plan_3d<index> const &plan){
53  os << "nfast = " << plan.size[0] << "\n";
54  os << "nmid = " << plan.size[1] << "\n";
55  os << "nslow = " << plan.size[2] << "\n";
56  os << "line_stride = " << plan.line_stride << "\n";
57  os << "plane_stride = " << plan.plane_stride << "\n";
58  if (plan.buff_line_stride > 0){
59  os << "buff_line_stride = " << plan.buff_line_stride << "\n";
60  os << "buff_plane_stride = " << plan.buff_plane_stride << "\n";
61  os << "map = (" << plan.map[0] << ", " << plan.map[1] << ", " << plan.map[2] << ")\n";
62  }
63  os << "\n";
64  return os;
65 }
66 
/*!
 * \brief The packer needs to know whether the data will be on the CPU or GPU devices.
 *
 * The primary template is intentionally empty.
 */
template<typename backend>
struct packer_backend{
};
76 
// e.g., template<> struct packer_backend<cuda>{ using mode = tag::gpu; }; // a specialization can differentiate between gpu and cpu backends
78 
/*!
 * \brief Defines the direct packer without implementation,
 *        use the specializations to get the CPU or GPU implementation.
 */
template<typename mode>
struct direct_packer{
};
84 
89 template<> struct direct_packer<tag::cpu>{
91  template<typename scalar_type, typename index>
92  void pack(void*, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
93  scalar_type* buffer_iterator = buffer;
94  for(index slow = 0; slow < plan.size[2]; slow++){
95  for(index mid = 0; mid < plan.size[1]; mid++){
96  buffer_iterator = std::copy_n(&data[slow * plan.plane_stride + mid * plan.line_stride], plan.size[0], buffer_iterator);
97  }
98  }
99  }
101  template<typename scalar_type, typename index>
102  void unpack(void*, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
103  for(index slow = 0; slow < plan.size[2]; slow++){
104  for(index mid = 0; mid < plan.size[1]; mid++){
105  std::copy_n(&buffer[(slow * plan.size[1] + mid) * plan.size[0]],
106  plan.size[0], &data[slow * plan.plane_stride + mid * plan.line_stride]);
107  }
108  }
109  }
110 };
111 
/*!
 * \brief Defines the transpose packer without implementation,
 *        use the specializations to get the CPU implementation.
 */
template<typename mode>
struct transpose_packer{
};
117 
122 template<> struct transpose_packer<tag::cpu>{
124  template<typename scalar_type, typename index>
125  void pack(void *q, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
126  direct_packer<tag::cpu>().pack(q, plan, data, buffer); // packing is done the same way as the direct_packer
127  }
134  template<typename scalar_type, typename index>
135  void unpack(void*, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
136  constexpr index stride = 256 / sizeof(scalar_type);
137  if (plan.map[0] == 0 and plan.map[1] == 1){
138  for(index i=0; i<plan.size[2]; i++)
139  for(index j=0; j<plan.size[1]; j++)
140  for(index k=0; k<plan.size[0]; k++)
141  data[i * plan.plane_stride + j * plan.line_stride + k]
142  = buffer[ i * plan.buff_plane_stride + j * plan.buff_line_stride + k ];
143 
144  }else if (plan.map[0] == 0 and plan.map[1] == 2){
145  for(index bi=0; bi<plan.size[2]; bi+=stride)
146  for(index bj=0; bj<plan.size[1]; bj+=stride)
147  for(index bk=0; bk<plan.size[0]; bk+=stride)
148  for(index i=bi; i<std::min(bi + stride, plan.size[2]); i++)
149  for(index j=bj; j<std::min(bj + stride, plan.size[1]); j++)
150  for(index k=bk; k<std::min(bk + stride, plan.size[0]); k++)
151  data[i * plan.plane_stride + j * plan.line_stride + k]
152  = buffer[ j * plan.buff_plane_stride + i * plan.buff_line_stride + k ];
153 
154  }else if (plan.map[0] == 1 and plan.map[1] == 0){
155  for(index bi=0; bi<plan.size[2]; bi+=stride)
156  for(index bj=0; bj<plan.size[1]; bj+=stride)
157  for(index bk=0; bk<plan.size[0]; bk+=stride)
158  for(index i=bi; i<std::min(bi + stride, plan.size[2]); i++)
159  for(index j=bj; j<std::min(bj + stride, plan.size[1]); j++)
160  for(index k=bk; k<std::min(bk + stride, plan.size[0]); k++)
161  data[i * plan.plane_stride + j * plan.line_stride + k]
162  = buffer[ i * plan.buff_plane_stride + k * plan.buff_line_stride + j ];
163 
164  }else if (plan.map[0] == 1 and plan.map[1] == 2){
165  for(index bi=0; bi<plan.size[2]; bi+=stride)
166  for(index bj=0; bj<plan.size[1]; bj+=stride)
167  for(index bk=0; bk<plan.size[0]; bk+=stride)
168  for(index i=bi; i<std::min(bi + stride, plan.size[2]); i++)
169  for(index j=bj; j<std::min(bj + stride, plan.size[1]); j++)
170  for(index k=bk; k<std::min(bk + stride, plan.size[0]); k++)
171  data[i * plan.plane_stride + j * plan.line_stride + k]
172  = buffer[ k * plan.buff_plane_stride + i * plan.buff_line_stride + j ];
173 
174  }else if (plan.map[0] == 2 and plan.map[1] == 0){
175  for(index bi=0; bi<plan.size[2]; bi+=stride)
176  for(index bj=0; bj<plan.size[1]; bj+=stride)
177  for(index bk=0; bk<plan.size[0]; bk+=stride)
178  for(index i=bi; i<std::min(bi + stride, plan.size[2]); i++)
179  for(index j=bj; j<std::min(bj + stride, plan.size[1]); j++)
180  for(index k=bk; k<std::min(bk + stride, plan.size[0]); k++)
181  data[i * plan.plane_stride + j * plan.line_stride + k]
182  = buffer[ j * plan.buff_plane_stride + k * plan.buff_line_stride + i ];
183 
184  }else{ // if (plan.map[0] == 2 and plan.map[1] == 1){
185  for(index bi=0; bi<plan.size[2]; bi+=stride)
186  for(index bj=0; bj<plan.size[1]; bj+=stride)
187  for(index bk=0; bk<plan.size[0]; bk+=stride)
188  for(index i=bi; i<std::min(bi + stride, plan.size[2]); i++)
189  for(index j=bj; j<std::min(bj + stride, plan.size[1]); j++)
190  for(index k=bk; k<std::min(bk + stride, plan.size[0]); k++)
191  data[i * plan.plane_stride + j * plan.line_stride + k]
192  = buffer[ k * plan.buff_plane_stride + j * plan.buff_line_stride + i ];
193 
194  }
195 
196  }
197 };
198 
/*!
 * \brief Methods that scale arrays of real or complex numbers by a constant factor.
 */
namespace data_scaling {
    /*!
     * \brief Simply multiply the \b num_entries in the \b data by the \b scale_factor.
     *
     * The first argument is accepted only to match the signature of the other
     * overloads, it is not used here.
     */
    template<typename scalar_type, typename index>
    void apply(void*, index num_entries, scalar_type *data, double scale_factor){
        for(index i=0; i<num_entries; i++) data[i] *= scale_factor;
    }
    /*!
     * \brief Complex by real scaling.
     *
     * The \b num_entries complex numbers are treated as 2 * num_entries real numbers
     * of type \b precision_type and forwarded to the real overload.
     */
    template<typename precision_type, typename index>
    void apply(void *stream, index num_entries, std::complex<precision_type> *data, double scale_factor){
        // widen before doubling, 2 * num_entries may not fit in a narrow index type
        long long const num_reals = 2 * static_cast<long long>(num_entries);
        apply<precision_type>(stream, num_reals, reinterpret_cast<precision_type*>(data), scale_factor);
    }
    /*!
     * \brief Overload without a stream argument, forwards to the overloads above with a null stream.
     */
    template<typename scalar_type, typename index>
    void apply(index num_entries, scalar_type *data, double scale_factor){
        apply(nullptr, num_entries, data, scale_factor);
    }
}
234 
235 }
236 
237 #endif
std::ostream & operator<<(std::ostream &os, box3d< index > const box)
Debugging info, writes out the box to a stream.
Definition: heffte_geometry.h:146
void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Simply multiply the num_entries in the data by the scale_factor.
Definition: heffte_backend_cuda.h:796
Namespace containing all HeFFTe methods and classes.
Definition: heffte_backend_cuda.h:38
Simple packer that copies sub-boxes without transposing the order of the indexes.
Definition: heffte_pack3d.h:89
void pack(void *, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_pack3d.h:92
void unpack(void *, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned unpack operation.
Definition: heffte_pack3d.h:102
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition: heffte_pack3d.h:83
Holds the plan for a pack/unpack operation.
Definition: heffte_pack3d.h:32
index buff_plane_stride
Stride of the planes in the received buffer (transpose packing only).
Definition: heffte_pack3d.h:42
index line_stride
Stride of the lines.
Definition: heffte_pack3d.h:36
index plane_stride
Stride of the planes.
Definition: heffte_pack3d.h:38
std::array< index, 3 > size
Number of elements in the three directions.
Definition: heffte_pack3d.h:34
std::array< int, 3 > map
Maps the i,j,k indexes from input to the output (transpose packing only).
Definition: heffte_pack3d.h:44
index buff_line_stride
Stride of the lines in the received buffer (transpose packing only).
Definition: heffte_pack3d.h:40
The packer needs to know whether the data will be on the CPU or GPU devices.
Definition: heffte_pack3d.h:75
void pack(void *q, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition: heffte_pack3d.h:125
void unpack(void *, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned unpack operation.
Definition: heffte_pack3d.h:135
Defines the transpose packer without implementation, use the specializations to get the CPU implement...
Definition: heffte_pack3d.h:116