xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 5274ed1b5914009ea1e03c499b73268a814b584e)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/examples/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #include <fftw3-mpi.h>
10 EXTERN_C_END
11 
12 typedef struct {
13   ptrdiff_t   ndim_fftw,*dim_fftw;
14   PetscInt    partial_dim;
15   fftw_plan   p_forward,p_backward;
16   unsigned    p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
17   PetscScalar *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays becaue fftw plan should be
18                                                             executed for the arrays with which the plan was created */
19 } Mat_FFTW;
20 
21 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
22 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
23 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
24 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
25 extern PetscErrorCode MatDestroy_FFTW(Mat);
26 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
27 
28 /* MatMult_SeqFFTW performs forward DFT in parallel
29    Input parameter:
30      A - the matrix
31      x - the vector on which FDFT will be performed
32 
33    Output parameter:
34      y - vector that stores result of FDFT
35 */
36 #undef __FUNCT__
37 #define __FUNCT__ "MatMult_SeqFFTW"
38 PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
39 {
40   PetscErrorCode ierr;
41   Mat_FFT        *fft  = (Mat_FFT*)A->data;
42   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
43   PetscScalar    *x_array,*y_array;
44   PetscInt       ndim=fft->ndim,*dim=fft->dim;
45 
46   PetscFunctionBegin;
47   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
48   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
49   if (!fftw->p_forward) { /* create a plan, then excute it */
50     switch (ndim) {
51     case 1:
52 #if defined(PETSC_USE_COMPLEX)
53       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
54 #else
55       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
56 #endif
57       break;
58     case 2:
59 #if defined(PETSC_USE_COMPLEX)
60       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
61 #else
62       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
63 #endif
64       break;
65     case 3:
66 #if defined(PETSC_USE_COMPLEX)
67       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
68 #else
69       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
70 #endif
71       break;
72     default:
73 #if defined(PETSC_USE_COMPLEX)
74       fftw->p_forward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
75       /* bug: dim[3] changes from 10 to 0 '--with-64-bit-indices=1' */
76 #else
77       fftw->p_forward = fftw_plan_dft_r2c(ndim,dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
78 #endif
79       break;
80     }
81     fftw->finarray  = x_array;
82     fftw->foutarray = y_array;
83     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
84                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
85     fftw_execute(fftw->p_forward);
86   } else { /* use existing plan */
87     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
88       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
89     } else {
90       fftw_execute(fftw->p_forward);
91     }
92   }
93   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
94   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
95   PetscFunctionReturn(0);
96 }
97 
98 /* MatMultTranspose_SeqFFTW performs serial backward DFT
99    Input parameter:
100      A - the matrix
101      x - the vector on which BDFT will be performed
102 
103    Output parameter:
104      y - vector that stores result of BDFT
105 */
106 
107 #undef __FUNCT__
108 #define __FUNCT__ "MatMultTranspose_SeqFFTW"
109 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
110 {
111   PetscErrorCode ierr;
112   Mat_FFT        *fft  = (Mat_FFT*)A->data;
113   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
114   PetscScalar    *x_array,*y_array;
115   PetscInt       ndim=fft->ndim,*dim=fft->dim;
116 
117   PetscFunctionBegin;
118   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
119   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
120   if (!fftw->p_backward) { /* create a plan, then excute it */
121     switch (ndim) {
122     case 1:
123 #if defined(PETSC_USE_COMPLEX)
124       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
125 #else
126       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
127 #endif
128       break;
129     case 2:
130 #if defined(PETSC_USE_COMPLEX)
131       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
132 #else
133       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
134 #endif
135       break;
136     case 3:
137 #if defined(PETSC_USE_COMPLEX)
138       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
139 #else
140       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
141 #endif
142       break;
143     default:
144 #if defined(PETSC_USE_COMPLEX)
145       fftw->p_backward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
146 #else
147       fftw->p_backward= fftw_plan_dft_c2r(ndim,dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
148 #endif
149       break;
150     }
151     fftw->binarray  = x_array;
152     fftw->boutarray = y_array;
153     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
154   } else { /* use existing plan */
155     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
156       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
157     } else {
158       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
159     }
160   }
161   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
162   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
163   PetscFunctionReturn(0);
164 }
165 
166 /* MatMult_MPIFFTW performs forward DFT in parallel
167    Input parameter:
168      A - the matrix
169      x - the vector on which FDFT will be performed
170 
171    Output parameter:
172    y   - vector that stores result of FDFT
173 */
174 #undef __FUNCT__
175 #define __FUNCT__ "MatMult_MPIFFTW"
176 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
177 {
178   PetscErrorCode ierr;
179   Mat_FFT        *fft  = (Mat_FFT*)A->data;
180   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
181   PetscScalar    *x_array,*y_array;
182   PetscInt       ndim=fft->ndim,*dim=fft->dim;
183   MPI_Comm       comm;
184 
185   PetscFunctionBegin;
186   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
187   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
188   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
189   if (!fftw->p_forward) { /* create a plan, then excute it */
190     switch (ndim) {
191     case 1:
192 #if defined(PETSC_USE_COMPLEX)
193       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
194 #else
195       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
196 #endif
197       break;
198     case 2:
199 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
200       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
201 #else
202       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
203 #endif
204       break;
205     case 3:
206 #if defined(PETSC_USE_COMPLEX)
207       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
208 #else
209       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
210 #endif
211       break;
212     default:
213 #if defined(PETSC_USE_COMPLEX)
214       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
215 #else
216       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
217 #endif
218       break;
219     }
220     fftw->finarray  = x_array;
221     fftw->foutarray = y_array;
222     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
223                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
224     fftw_execute(fftw->p_forward);
225   } else { /* use existing plan */
226     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
227       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
228     } else {
229       fftw_execute(fftw->p_forward);
230     }
231   }
232   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
233   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
234   PetscFunctionReturn(0);
235 }
236 
237 /* MatMultTranspose_MPIFFTW performs parallel backward DFT
238    Input parameter:
239      A - the matrix
240      x - the vector on which BDFT will be performed
241 
242    Output parameter:
243      y - vector that stores result of BDFT
244 */
245 #undef __FUNCT__
246 #define __FUNCT__ "MatMultTranspose_MPIFFTW"
247 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
248 {
249   PetscErrorCode ierr;
250   Mat_FFT        *fft  = (Mat_FFT*)A->data;
251   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
252   PetscScalar    *x_array,*y_array;
253   PetscInt       ndim=fft->ndim,*dim=fft->dim;
254   MPI_Comm       comm;
255 
256   PetscFunctionBegin;
257   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
258   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
259   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
260   if (!fftw->p_backward) { /* create a plan, then excute it */
261     switch (ndim) {
262     case 1:
263 #if defined(PETSC_USE_COMPLEX)
264       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
265 #else
266       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
267 #endif
268       break;
269     case 2:
270 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
271       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
272 #else
273       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
274 #endif
275       break;
276     case 3:
277 #if defined(PETSC_USE_COMPLEX)
278       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
279 #else
280       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
281 #endif
282       break;
283     default:
284 #if defined(PETSC_USE_COMPLEX)
285       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
286 #else
287       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
288 #endif
289       break;
290     }
291     fftw->binarray  = x_array;
292     fftw->boutarray = y_array;
293     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
294   } else { /* use existing plan */
295     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
296       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
297     } else {
298       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
299     }
300   }
301   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
302   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
303   PetscFunctionReturn(0);
304 }
305 
306 #undef __FUNCT__
307 #define __FUNCT__ "MatDestroy_FFTW"
308 PetscErrorCode MatDestroy_FFTW(Mat A)
309 {
310   Mat_FFT        *fft  = (Mat_FFT*)A->data;
311   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
312   PetscErrorCode ierr;
313 
314   PetscFunctionBegin;
315   fftw_destroy_plan(fftw->p_forward);
316   fftw_destroy_plan(fftw->p_backward);
317   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
318   ierr = PetscFree(fft->data);CHKERRQ(ierr);
319   fftw_mpi_cleanup();
320   PetscFunctionReturn(0);
321 }
322 
323 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
324 #undef __FUNCT__
325 #define __FUNCT__ "VecDestroy_MPIFFTW"
326 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
327 {
328   PetscErrorCode ierr;
329   PetscScalar    *array;
330 
331   PetscFunctionBegin;
332   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
333   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
334   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
335   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
336   PetscFunctionReturn(0);
337 }
338 
339 #undef __FUNCT__
340 #define __FUNCT__ "MatGetVecsFFTW"
341 /*@
342    MatGetVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
343      parallel layout determined by FFTW
344 
345    Collective on Mat
346 
347    Input Parameter:
348 .   A - the matrix
349 
350    Output Parameter:
351 +   x - (optional) input vector of forward FFTW
352 -   y - (optional) output vector of forward FFTW
353 -   z - (optional) output vector of backward FFTW
354 
355   Level: advanced
356 
357   Note: The parallel layout of output of forward FFTW is always same as the input
358         of the backward FFTW. But parallel layout of the input vector of forward
359         FFTW might not be same as the output of backward FFTW.
360         Also note that we need to provide enough space while doing parallel real transform.
361         We need to pad extra zeros at the end of last dimension. For this reason the one needs to
362         invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
363         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
364         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
365         zeros if it is odd only one zero is needed.
366         Lastly one needs some scratch space at the end of data set in each process. alloc_local
367         figures out how much space is needed, i.e. it figures out the data+scratch space for
368         each processor and returns that.
369 
370 .seealso: MatCreateFFTW()
371 @*/
372 PetscErrorCode MatGetVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
373 {
374   PetscErrorCode ierr;
375 
376   PetscFunctionBegin;
377   ierr = PetscTryMethod(A,"MatGetVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
378   PetscFunctionReturn(0);
379 }
380 
381 #undef __FUNCT__
382 #define __FUNCT__ "MatGetVecsFFTW_FFTW"
383 PetscErrorCode  MatGetVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
384 {
385   PetscErrorCode ierr;
386   PetscMPIInt    size,rank;
387   MPI_Comm       comm;
388   Mat_FFT        *fft  = (Mat_FFT*)A->data;
389   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
390   PetscInt       N     = fft->N;
391   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;
392 
393   PetscFunctionBegin;
394   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
395   PetscValidType(A,1);
396   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
397 
398   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
399   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
400   if (size == 1) { /* sequential case */
401 #if defined(PETSC_USE_COMPLEX)
402     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
403     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
404     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
405 #else
406     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
407     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
408     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
409 #endif
410   } else { /* parallel cases */
411     ptrdiff_t    alloc_local,local_n0,local_0_start;
412     ptrdiff_t    local_n1;
413     fftw_complex *data_fout;
414     ptrdiff_t    local_1_start;
415 #if defined(PETSC_USE_COMPLEX)
416     fftw_complex *data_fin,*data_bout;
417 #else
418     double    *data_finr,*data_boutr;
419     PetscInt  n1,N1;
420     ptrdiff_t temp;
421 #endif
422 
423     switch (ndim) {
424     case 1:
425 #if !defined(PETSC_USE_COMPLEX)
426       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
427 #else
428       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
429       if (fin) {
430         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
431         ierr      = VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
432 
433         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
434       }
435       if (fout) {
436         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
437         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
438 
439         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
440       }
441       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
442       if (bout) {
443         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
444         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
445 
446         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
447       }
448       break;
449 #endif
450     case 2:
451 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
452       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
453       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
454       if (fin) {
455         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
456         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
457 
458         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
459       }
460       if (bout) {
461         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
462         ierr       = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
463 
464         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
465       }
466       if (fout) {
467         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
468         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
469 
470         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
471       }
472 #else
473       /* Get local size */
474       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
475       if (fin) {
476         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
477         ierr     = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
478 
479         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
480       }
481       if (fout) {
482         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
483         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
484 
485         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
486       }
487       if (bout) {
488         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
489         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
490 
491         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
492       }
493 #endif
494       break;
495     case 3:
496 #if !defined(PETSC_USE_COMPLEX)
497       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
498       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
499       if (fin) {
500         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
501         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
502 
503         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
504       }
505       if (bout) {
506         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
507         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
508 
509         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
510       }
511 
512       if (fout) {
513         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
514         ierr = VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
515 
516         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
517       }
518 #else
519       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
520       if (fin) {
521         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
522         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
523 
524         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
525       }
526       if (fout) {
527         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
528         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
529 
530         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
531       }
532       if (bout) {
533         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
534         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
535 
536         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
537       }
538 #endif
539       break;
540     default:
541 #if !defined(PETSC_USE_COMPLEX)
542       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
543 
544       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
545 
546       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
547       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
548 
549       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
550 
551       if (fin) {
552         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
553         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
554 
555         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
556       }
557       if (bout) {
558         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
559         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
560 
561         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
562       }
563 
564       if (fout) {
565         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
566         ierr = VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
567 
568         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
569       }
570 #else
571       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
572       if (fin) {
573         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
574         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
575 
576         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
577       }
578       if (fout) {
579         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
580         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
581 
582         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
583       }
584       if (bout) {
585         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
586         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
587 
588         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
589       }
590 #endif
591       break;
592     }
593   }
594   PetscFunctionReturn(0);
595 }
596 
597 #undef __FUNCT__
598 #define __FUNCT__ "VecScatterPetscToFFTW"
599 /*@
600    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
601 
602    Collective on Mat
603 
604    Input Parameters:
605 +  A - FFTW matrix
606 -  x - the PETSc vector
607 
608    Output Parameters:
609 .  y - the FFTW vector
610 
611   Options Database Keys:
612 . -mat_fftw_plannerflags - set FFTW planner flags
613 
614    Level: intermediate
615 
616    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
617          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
618          zeros. This routine does that job by scattering operation.
619 
620 .seealso: VecScatterFFTWToPetsc()
621 @*/
622 PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
623 {
624   PetscErrorCode ierr;
625 
626   PetscFunctionBegin;
627   ierr = PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
628   PetscFunctionReturn(0);
629 }
630 
631 #undef __FUNCT__
632 #define __FUNCT__ "VecScatterPetscToFFTW_FFTW"
633 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
634 {
635   PetscErrorCode ierr;
636   MPI_Comm       comm;
637   Mat_FFT        *fft  = (Mat_FFT*)A->data;
638   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
639   PetscInt       N     =fft->N;
640   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
641   PetscInt       low;
642   PetscInt       rank,size,vsize,vsize1;
643   ptrdiff_t      alloc_local,local_n0,local_0_start;
644   ptrdiff_t      local_n1,local_1_start;
645   VecScatter     vecscat;
646   IS             list1,list2;
647 #if !defined(PETSC_USE_COMPLEX)
648   PetscInt       i,j,k,partial_dim;
649   PetscInt       *indx1, *indx2, tempindx, tempindx1;
650   PetscInt       N1, n1,NM;
651   ptrdiff_t      temp;
652 #endif
653 
654   PetscFunctionBegin;
655   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
656   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
657   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
658   ierr = VecGetOwnershipRange(y,&low,NULL);
659 
660   if (size==1) {
661     ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
662     ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
663     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
664     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
665     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
666     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
667     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
668     ierr = ISDestroy(&list1);CHKERRQ(ierr);
669   } else {
670     switch (ndim) {
671     case 1:
672 #if defined(PETSC_USE_COMPLEX)
673       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
674 
675       ierr = ISCreateStride(comm,local_n0,local_0_start,1,&list1);
676       ierr = ISCreateStride(comm,local_n0,low,1,&list2);
677       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
678       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
679       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
680       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
681       ierr = ISDestroy(&list1);CHKERRQ(ierr);
682       ierr = ISDestroy(&list2);CHKERRQ(ierr);
683 #else
684       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
685 #endif
686       break;
687     case 2:
688 #if defined(PETSC_USE_COMPLEX)
689       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
690 
691       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
692       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
693       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
694       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
695       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
696       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
697       ierr = ISDestroy(&list1);CHKERRQ(ierr);
698       ierr = ISDestroy(&list2);CHKERRQ(ierr);
699 #else
700       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
701 
702       N1   = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
703       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
704       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
705 
706       if (dim[1]%2==0) {
707         NM = dim[1]+2;
708       } else {
709         NM = dim[1]+1;
710       }
711       for (i=0; i<local_n0; i++) {
712         for (j=0; j<dim[1]; j++) {
713           tempindx  = i*dim[1] + j;
714           tempindx1 = i*NM + j;
715 
716           indx1[tempindx]=local_0_start*dim[1]+tempindx;
717           indx2[tempindx]=low+tempindx1;
718         }
719       }
720 
721       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
722       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
723 
724       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
725       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
726       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
727       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
728       ierr = ISDestroy(&list1);CHKERRQ(ierr);
729       ierr = ISDestroy(&list2);CHKERRQ(ierr);
730       ierr = PetscFree(indx1);CHKERRQ(ierr);
731       ierr = PetscFree(indx2);CHKERRQ(ierr);
732 #endif
733       break;
734 
735     case 3:
736 #if defined(PETSC_USE_COMPLEX)
737       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
738 
739       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
740       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
741       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
742       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
743       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
744       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
745       ierr = ISDestroy(&list1);CHKERRQ(ierr);
746       ierr = ISDestroy(&list2);CHKERRQ(ierr);
747 #else
748       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
749 
750       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
751 
752       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
753       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
754 
755       if (dim[2]%2==0) NM = dim[2]+2;
756       else             NM = dim[2]+1;
757 
758       for (i=0; i<local_n0; i++) {
759         for (j=0; j<dim[1]; j++) {
760           for (k=0; k<dim[2]; k++) {
761             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
762             tempindx1 = i*dim[1]*NM + j*NM + k;
763 
764             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
765             indx2[tempindx]=low+tempindx1;
766           }
767         }
768       }
769 
770       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
771       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
772       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
773       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
774       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
775       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
776       ierr = ISDestroy(&list1);CHKERRQ(ierr);
777       ierr = ISDestroy(&list2);CHKERRQ(ierr);
778       ierr = PetscFree(indx1);CHKERRQ(ierr);
779       ierr = PetscFree(indx2);CHKERRQ(ierr);
780 #endif
781       break;
782 
783     default:
784 #if defined(PETSC_USE_COMPLEX)
785       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
786 
787       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
788       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
789       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
790       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
791       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
792       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
793       ierr = ISDestroy(&list1);CHKERRQ(ierr);
794       ierr = ISDestroy(&list2);CHKERRQ(ierr);
795 #else
796       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
797 
798       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
799 
800       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
801 
802       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
803 
804       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
805 
806       partial_dim = fftw->partial_dim;
807 
808       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
809       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
810 
811       if (dim[ndim-1]%2==0) NM = 2;
812       else                  NM = 1;
813 
814       j = low;
815       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
816         indx1[i] = local_0_start*partial_dim + i;
817         indx2[i] = j;
818         if (k%dim[ndim-1]==0) j+=NM;
819         j++;
820       }
821       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
822       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
823       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
824       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
825       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
826       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
827       ierr = ISDestroy(&list1);CHKERRQ(ierr);
828       ierr = ISDestroy(&list2);CHKERRQ(ierr);
829       ierr = PetscFree(indx1);CHKERRQ(ierr);
830       ierr = PetscFree(indx2);CHKERRQ(ierr);
831 #endif
832       break;
833     }
834   }
835   PetscFunctionReturn(0);
836 }
837 
838 #undef __FUNCT__
839 #define __FUNCT__ "VecScatterFFTWToPetsc"
840 /*@
841    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
842 
843    Collective on Mat
844 
845     Input Parameters:
846 +   A - FFTW matrix
847 -   x - FFTW vector
848 
849    Output Parameters:
850 .  y - PETSc vector
851 
852    Level: intermediate
853 
854    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
855          VecScatterFFTWToPetsc removes those extra zeros.
856 
857 .seealso: VecScatterPetscToFFTW()
858 @*/
859 PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
860 {
861   PetscErrorCode ierr;
862 
863   PetscFunctionBegin;
864   ierr = PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
865   PetscFunctionReturn(0);
866 }
867 
868 #undef __FUNCT__
869 #define __FUNCT__ "VecScatterFFTWToPetsc_FFTW"
870 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
871 {
872   PetscErrorCode ierr;
873   MPI_Comm       comm;
874   Mat_FFT        *fft  = (Mat_FFT*)A->data;
875   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
876   PetscInt       N     = fft->N;
877   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
878   PetscInt       low;
879   PetscInt       rank,size;
880   ptrdiff_t      alloc_local,local_n0,local_0_start;
881   ptrdiff_t      local_n1,local_1_start;
882   VecScatter     vecscat;
883   IS             list1,list2;
884 #if !defined(PETSC_USE_COMPLEX)
885   PetscInt  i,j,k,partial_dim;
886   PetscInt  *indx1, *indx2, tempindx, tempindx1;
887   PetscInt  N1, n1,NM;
888   ptrdiff_t temp;
889 #endif
890 
891   PetscFunctionBegin;
892   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
893   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
894   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
895   ierr = VecGetOwnershipRange(x,&low,NULL);CHKERRQ(ierr);
896 
897   if (size==1) {
898     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
899     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
900     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
901     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
902     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
903     ierr = ISDestroy(&list1);CHKERRQ(ierr);
904 
905   } else {
906     switch (ndim) {
907     case 1:
908 #if defined(PETSC_USE_COMPLEX)
909       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
910 
911       ierr = ISCreateStride(comm,local_n1,local_1_start,1,&list1);
912       ierr = ISCreateStride(comm,local_n1,low,1,&list2);
913       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
914       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
915       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
916       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
917       ierr = ISDestroy(&list1);CHKERRQ(ierr);
918       ierr = ISDestroy(&list2);CHKERRQ(ierr);
919 #else
920       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
921 #endif
922       break;
923     case 2:
924 #if defined(PETSC_USE_COMPLEX)
925       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
926 
927       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
928       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
929       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
930       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
931       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
932       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
933       ierr = ISDestroy(&list1);CHKERRQ(ierr);
934       ierr = ISDestroy(&list2);CHKERRQ(ierr);
935 #else
936       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
937 
938       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
939 
940       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
941       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
942 
943       if (dim[1]%2==0) NM = dim[1]+2;
944       else             NM = dim[1]+1;
945 
946       for (i=0; i<local_n0; i++) {
947         for (j=0; j<dim[1]; j++) {
948           tempindx = i*dim[1] + j;
949           tempindx1 = i*NM + j;
950 
951           indx1[tempindx]=local_0_start*dim[1]+tempindx;
952           indx2[tempindx]=low+tempindx1;
953         }
954       }
955 
956       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
957       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
958 
959       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
960       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
961       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
962       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
963       ierr = ISDestroy(&list1);CHKERRQ(ierr);
964       ierr = ISDestroy(&list2);CHKERRQ(ierr);
965       ierr = PetscFree(indx1);CHKERRQ(ierr);
966       ierr = PetscFree(indx2);CHKERRQ(ierr);
967 #endif
968       break;
969     case 3:
970 #if defined(PETSC_USE_COMPLEX)
971       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
972 
973       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
974       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
975       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
976       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
977       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
978       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
979       ierr = ISDestroy(&list1);CHKERRQ(ierr);
980       ierr = ISDestroy(&list2);CHKERRQ(ierr);
981 #else
982 
983       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
984 
985       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
986 
987       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
988       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
989 
990       if (dim[2]%2==0) NM = dim[2]+2;
991       else             NM = dim[2]+1;
992 
993       for (i=0; i<local_n0; i++) {
994         for (j=0; j<dim[1]; j++) {
995           for (k=0; k<dim[2]; k++) {
996             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
997             tempindx1 = i*dim[1]*NM + j*NM + k;
998 
999             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1000             indx2[tempindx]=low+tempindx1;
1001           }
1002         }
1003       }
1004 
1005       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1006       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1007 
1008       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1009       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1010       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1011       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1012       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1013       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1014       ierr = PetscFree(indx1);CHKERRQ(ierr);
1015       ierr = PetscFree(indx2);CHKERRQ(ierr);
1016 #endif
1017       break;
1018     default:
1019 #if defined(PETSC_USE_COMPLEX)
1020       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
1021 
1022       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1023       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1024       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1025       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1026       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1027       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1028       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1029       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1030 #else
1031       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
1032 
1033       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
1034 
1035       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1036 
1037       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
1038 
1039       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1040 
1041       partial_dim = fftw->partial_dim;
1042 
1043       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
1044       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
1045 
1046       if (dim[ndim-1]%2==0) NM = 2;
1047       else                  NM = 1;
1048 
1049       j = low;
1050       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1051         indx1[i] = local_0_start*partial_dim + i;
1052         indx2[i] = j;
1053         if (k%dim[ndim-1]==0) j+=NM;
1054         j++;
1055       }
1056       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1057       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1058 
1059       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1060       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1061       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1062       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1063       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1064       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1065       ierr = PetscFree(indx1);CHKERRQ(ierr);
1066       ierr = PetscFree(indx2);CHKERRQ(ierr);
1067 #endif
1068       break;
1069     }
1070   }
1071   PetscFunctionReturn(0);
1072 }
1073 
1074 #undef __FUNCT__
1075 #define __FUNCT__ "MatCreate_FFTW"
1076 /*
1077     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1078 
1079   Options Database Keys:
1080 + -mat_fftw_plannerflags - set FFTW planner flags
1081 
1082    Level: intermediate
1083 
1084 */
1085 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1086 {
1087   PetscErrorCode ierr;
1088   MPI_Comm       comm;
1089   Mat_FFT        *fft=(Mat_FFT*)A->data;
1090   Mat_FFTW       *fftw;
1091   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim=fft->dim;
1092   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1093   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1094   PetscBool      flg;
1095   PetscInt       p_flag,partial_dim=1,ctr;
1096   PetscMPIInt    size,rank;
1097   ptrdiff_t      *pdim;
1098   ptrdiff_t      local_n1,local_1_start;
1099 #if !defined(PETSC_USE_COMPLEX)
1100   ptrdiff_t temp;
1101   PetscInt  N1,tot_dim;
1102 #else
1103   PetscInt n1;
1104 #endif
1105 
1106   PetscFunctionBegin;
1107   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1108   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
1109   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1110 
1111   fftw_mpi_init();
1112   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1113   pdim[0] = dim[0];
1114 #if !defined(PETSC_USE_COMPLEX)
1115   tot_dim = 2*dim[0];
1116 #endif
1117   for (ctr=1; ctr<ndim; ctr++) {
1118     partial_dim *= dim[ctr];
1119     pdim[ctr]    = dim[ctr];
1120 #if !defined(PETSC_USE_COMPLEX)
1121     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1122     else             tot_dim *= dim[ctr];
1123 #endif
1124   }
1125 
1126   if (size == 1) {
1127 #if defined(PETSC_USE_COMPLEX)
1128     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1129     n    = N;
1130 #else
1131     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1132     n    = tot_dim;
1133 #endif
1134 
1135   } else {
1136     ptrdiff_t alloc_local,local_n0,local_0_start;
1137     switch (ndim) {
1138     case 1:
1139 #if !defined(PETSC_USE_COMPLEX)
1140       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1141 #else
1142       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1143 
1144       n    = (PetscInt)local_n0;
1145       n1   = (PetscInt)local_n1;
1146       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1147 #endif
1148       break;
1149     case 2:
1150 #if defined(PETSC_USE_COMPLEX)
1151       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1152       /*
1153        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1154        PetscSynchronizedFlush(comm,PETSC_STDOUT);
1155        */
1156       n    = (PetscInt)local_n0*dim[1];
1157       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1158 #else
1159       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1160 
1161       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1162       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1163 #endif
1164       break;
1165     case 3:
1166 #if defined(PETSC_USE_COMPLEX)
1167       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1168 
1169       n    = (PetscInt)local_n0*dim[1]*dim[2];
1170       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1171 #else
1172       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1173 
1174       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1175       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1176 #endif
1177       break;
1178     default:
1179 #if defined(PETSC_USE_COMPLEX)
1180       alloc_local = fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1181 
1182       n    = (PetscInt)local_n0*partial_dim;
1183       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1184 #else
1185       temp = pdim[ndim-1];
1186 
1187       pdim[ndim-1] = temp/2 + 1;
1188 
1189       alloc_local = fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1190 
1191       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1192       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1193 
1194       pdim[ndim-1] = temp;
1195 
1196       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1197 #endif
1198       break;
1199     }
1200   }
1201   ierr      = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1202   ierr      = PetscNewLog(A,&fftw);CHKERRQ(ierr);
1203   fft->data = (void*)fftw;
1204 
1205   fft->n            = n;
1206   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1207   fftw->partial_dim = partial_dim;
1208 
1209   ierr = PetscMalloc1(ndim, &(fftw->dim_fftw));CHKERRQ(ierr);
1210 
1211   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1212 
1213   fftw->p_forward  = 0;
1214   fftw->p_backward = 0;
1215   fftw->p_flag     = FFTW_ESTIMATE;
1216 
1217   if (size == 1) {
1218     A->ops->mult          = MatMult_SeqFFTW;
1219     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1220   } else {
1221     A->ops->mult          = MatMult_MPIFFTW;
1222     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1223   }
1224   fft->matdestroy = MatDestroy_FFTW;
1225   A->assembled    = PETSC_TRUE;
1226   A->preallocated = PETSC_TRUE;
1227 
1228   ierr = PetscObjectComposeFunction((PetscObject)A,"MatGetVecsFFTW_C",MatGetVecsFFTW_FFTW);CHKERRQ(ierr);
1229   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);CHKERRQ(ierr);
1230   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);CHKERRQ(ierr);
1231 
1232   /* get runtime options */
1233   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1234   ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg);CHKERRQ(ierr);
1235   if (flg) {
1236     fftw->p_flag = iplans[p_flag];
1237   }
1238   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1239   PetscFunctionReturn(0);
1240 }
1241 
1242 
1243 
1244 
1245