xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 97504071fdc107a23ca747f96fac89fc69934dc1)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/examples/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #include <fftw3-mpi.h>
10 EXTERN_C_END
11 
12 typedef struct {
13   ptrdiff_t   ndim_fftw,*dim_fftw;
14   PetscInt    partial_dim;
15   fftw_plan   p_forward,p_backward;
16   unsigned    p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
17   PetscScalar *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays becaue fftw plan should be
18                                                             executed for the arrays with which the plan was created */
19 } Mat_FFTW;
20 
21 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
22 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
23 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
24 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
25 extern PetscErrorCode MatDestroy_FFTW(Mat);
26 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
27 
28 /* MatMult_SeqFFTW performs forward DFT in parallel
29 
30    Input parameter:
31    mat - the matrix
32    x   - the vector on which FDFT will be performed
33 
34    Output parameter:
35    y   - vector that stores result of FDFT
36 
37 */
38 #undef __FUNCT__
39 #define __FUNCT__ "MatMult_SeqFFTW"
40 PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
41 {
42   PetscErrorCode ierr;
43   Mat_FFT        *fft  = (Mat_FFT*)A->data;
44   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
45   PetscScalar    *x_array,*y_array;
46   PetscInt       ndim=fft->ndim,*dim=fft->dim;
47 
48   PetscFunctionBegin;
49   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
50   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
51   if (!fftw->p_forward){ /* create a plan, then excute it */
52     switch (ndim){
53     case 1:
54 #if defined(PETSC_USE_COMPLEX)
55       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
56 #else
57       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
58 #endif
59       break;
60     case 2:
61 #if defined(PETSC_USE_COMPLEX)
62       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
63 #else
64       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
65 #endif
66       break;
67     case 3:
68 #if defined(PETSC_USE_COMPLEX)
69       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
70 #else
71       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
72 #endif
73       break;
74     default:
75 #if defined(PETSC_USE_COMPLEX)
76       fftw->p_forward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
77 #else
78       fftw->p_forward = fftw_plan_dft_r2c(ndim,dim,(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
79 #endif
80       break;
81     }
82     fftw->finarray  = x_array;
83     fftw->foutarray = y_array;
84     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
85                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
86     fftw_execute(fftw->p_forward);
87   } else { /* use existing plan */
88     if (fftw->finarray != x_array || fftw->foutarray != y_array){ /* use existing plan on new arrays */
89       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
90     } else {
91       fftw_execute(fftw->p_forward);
92     }
93   }
94   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
95   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
96   PetscFunctionReturn(0);
97 }
98 
99 /* MatMultTranspose_SeqFFTW performs serial backward DFT
100 
101    Input parameter:
102    mat - the matrix
103    x   - the vector on which BDFT will be performed
104 
105    Output parameter:
106    y   - vector that stores result of BDFT
107 
108 */
109 
110 #undef __FUNCT__
111 #define __FUNCT__ "MatMultTranspose_SeqFFTW"
112 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
113 {
114   PetscErrorCode ierr;
115   Mat_FFT        *fft = (Mat_FFT*)A->data;
116   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
117   PetscScalar    *x_array,*y_array;
118   PetscInt       ndim=fft->ndim,*dim=fft->dim;
119 
120   PetscFunctionBegin;
121   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
122   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
123   if (!fftw->p_backward){ /* create a plan, then excute it */
124     switch (ndim){
125     case 1:
126 #if defined(PETSC_USE_COMPLEX)
127       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
128 #else
129       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
130 #endif
131       break;
132     case 2:
133 #if defined(PETSC_USE_COMPLEX)
134       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
135 #else
136       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
137 #endif
138       break;
139     case 3:
140 #if defined(PETSC_USE_COMPLEX)
141       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
142 #else
143       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
144 #endif
145       break;
146     default:
147 #if defined(PETSC_USE_COMPLEX)
148       fftw->p_backward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
149 #else
150       fftw->p_backward= fftw_plan_dft_c2r(ndim,dim,(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
151 #endif
152       break;
153     }
154     fftw->binarray  = x_array;
155     fftw->boutarray = y_array;
156     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
157   } else { /* use existing plan */
158     if (fftw->binarray != x_array || fftw->boutarray != y_array){ /* use existing plan on new arrays */
159       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
160     } else {
161       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
162     }
163   }
164   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
165   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
166   PetscFunctionReturn(0);
167 }
168 
169 /* MatMult_MPIFFTW performs forward DFT in parallel
170 
171    Input parameter:
172    mat - the matrix
173    x   - the vector on which FDFT will be performed
174 
175    Output parameter:
176    y   - vector that stores result of FDFT
177 
178 */
179 
180 #undef __FUNCT__
181 #define __FUNCT__ "MatMult_MPIFFTW"
182 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
183 {
184   PetscErrorCode ierr;
185   Mat_FFT        *fft  = (Mat_FFT*)A->data;
186   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
187   PetscScalar    *x_array,*y_array;
188   PetscInt       ndim=fft->ndim,*dim=fft->dim;
189   MPI_Comm       comm=((PetscObject)A)->comm;
190 
191   PetscFunctionBegin;
192   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
193   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
194   if (!fftw->p_forward){ /* create a plan, then excute it */
195     switch (ndim){
196     case 1:
197 #if defined(PETSC_USE_COMPLEX)
198       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
199 #else
200       SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"not support for real numbers yet");
201 #endif
202       break;
203     case 2:
204 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
205       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
206 #else
207       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
208 #endif
209       break;
210     case 3:
211 #if defined(PETSC_USE_COMPLEX)
212       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
213 #else
214       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
215 #endif
216       break;
217     default:
218 #if defined(PETSC_USE_COMPLEX)
219       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
220 #else
221       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
222 #endif
223       break;
224     }
225     fftw->finarray  = x_array;
226     fftw->foutarray = y_array;
227     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
228                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
229     fftw_execute(fftw->p_forward);
230   } else { /* use existing plan */
231     if (fftw->finarray != x_array || fftw->foutarray != y_array){ /* use existing plan on new arrays */
232       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
233     } else {
234       fftw_execute(fftw->p_forward);
235     }
236   }
237   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
238   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 /* MatMultTranspose_MPIFFTW performs parallel backward DFT
243 
244    Input parameter:
245    mat - the matrix
246    x   - the vector on which BDFT will be performed
247 
248    Output parameter:
249    y   - vector that stores result of BDFT
250 
251 */
252 #undef __FUNCT__
253 #define __FUNCT__ "MatMultTranspose_MPIFFTW"
254 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
255 {
256   PetscErrorCode ierr;
257   Mat_FFT        *fft  = (Mat_FFT*)A->data;
258   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
259   PetscScalar    *x_array,*y_array;
260   PetscInt       ndim=fft->ndim,*dim=fft->dim;
261   MPI_Comm       comm=((PetscObject)A)->comm;
262 
263   PetscFunctionBegin;
264   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
265   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
266   if (!fftw->p_backward){ /* create a plan, then excute it */
267     switch (ndim){
268     case 1:
269 #if defined(PETSC_USE_COMPLEX)
270       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
271 #else
272       SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"not support for real numbers yet");
273 #endif
274       break;
275     case 2:
276 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
277       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
278 #else
279       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
280 #endif
281       break;
282     case 3:
283 #if defined(PETSC_USE_COMPLEX)
284       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
285 #else
286       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
287 #endif
288       break;
289     default:
290 #if defined(PETSC_USE_COMPLEX)
291       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
292 #else
293       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
294 #endif
295       break;
296     }
297     fftw->binarray  = x_array;
298     fftw->boutarray = y_array;
299     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
300   } else { /* use existing plan */
301     if (fftw->binarray != x_array || fftw->boutarray != y_array){ /* use existing plan on new arrays */
302       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
303     } else {
304       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
305     }
306   }
307   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
308   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
309   PetscFunctionReturn(0);
310 }
311 
312 #undef __FUNCT__
313 #define __FUNCT__ "MatDestroy_FFTW"
314 PetscErrorCode MatDestroy_FFTW(Mat A)
315 {
316   Mat_FFT        *fft = (Mat_FFT*)A->data;
317   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
318   PetscErrorCode ierr;
319 
320   PetscFunctionBegin;
321   fftw_destroy_plan(fftw->p_forward);
322   fftw_destroy_plan(fftw->p_backward);
323   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
324   ierr = PetscFree(fft->data);CHKERRQ(ierr);
325   PetscFunctionReturn(0);
326 }
327 
328 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
329 #undef __FUNCT__
330 #define __FUNCT__ "VecDestroy_MPIFFTW"
331 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
332 {
333   PetscErrorCode  ierr;
334   PetscScalar     *array;
335 
336   PetscFunctionBegin;
337   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
338   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
339   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
340   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
341   PetscFunctionReturn(0);
342 }
343 
344 
345 #undef __FUNCT__
346 #define __FUNCT__ "MatGetVecsFFTW"
347 /*@
348    MatGetVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
349      parallel layout determined by FFTW
350 
351    Collective on Mat
352 
353    Input Parameter:
354 .  mat - the matrix
355 
356    Output Parameter:
357 +   fin - (optional) input vector of forward FFTW
358 -   fout - (optional) output vector of forward FFTW
359 -   bout - (optional) output vector of backward FFTW
360 
361   Level: advanced
362   Note: The parallel layout of output of forward FFTW is always same as the input
363         of the backward FFTW. But parallel layout of the input vector of forward
364         FFTW might not be same as the output of backward FFTW.
365         Also note that we need to provide enough space while doing parallel real transform.
366         We need to pad extra zeros at the end of last dimension. For this reason the one needs to
367         invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
368         last dimension from n to n/2+1 while invoking this routine.
369 
370 .seealso: MatCreateFFTW()
371 @*/
372 PetscErrorCode MatGetVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
373 {
374   PetscErrorCode ierr;
375 
376   PetscFunctionBegin;
377   ierr = PetscTryMethod(A,"MatGetVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
378   PetscFunctionReturn(0);
379 }
380 
381 EXTERN_C_BEGIN
382 #undef __FUNCT__
383 #define __FUNCT__ "MatGetVecsFFTW_FFTW"
384 PetscErrorCode  MatGetVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
385 {
386   PetscErrorCode ierr;
387   PetscMPIInt    size,rank;
388   MPI_Comm       comm=((PetscObject)A)->comm;
389   Mat_FFT        *fft = (Mat_FFT*)A->data;
390   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
391   PetscInt       N=fft->N;
392   PetscInt       ndim=fft->ndim,*dim=fft->dim,n=fft->n;
393 
394   PetscFunctionBegin;
395   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
396   PetscValidType(A,1);
397 
398   ierr = MPI_Comm_size(PETSC_COMM_WORLD, &size);CHKERRQ(ierr);
399   ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);CHKERRQ(ierr);
400   if (size == 1){ /* sequential case */
401 #if defined(PETSC_USE_COMPLEX)
402     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
403     if (fout){ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
404     if (bout){ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
405 #else
406     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
407     if (fout){ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
408     if (bout){ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
409 #endif
410   } else { /* parallel cases */
411     ptrdiff_t      alloc_local,local_n0,local_0_start;
412     ptrdiff_t      local_n1;
413     fftw_complex   *data_fout;
414     ptrdiff_t      local_1_start;
415 #if defined(PETSC_USE_COMPLEX)
416     fftw_complex   *data_fin,*data_bout;
417 #else
418     double         *data_finr,*data_boutr;
419     PetscInt       n1,N1,vsize;
420     ptrdiff_t      temp;
421 #endif
422 
423     switch (ndim){
424           case 1:
425 #if !defined(PETSC_USE_COMPLEX)
426                  SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
427 #else
428                  alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
429                  if (fin) {
430                          data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
431                          ierr = VecCreateMPIWithArray(comm,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
432                          (*fin)->ops->destroy = VecDestroy_MPIFFTW;
433                          }
434                  if (fout) {
435                           data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
436                           ierr = VecCreateMPIWithArray(comm,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
437                           (*fout)->ops->destroy = VecDestroy_MPIFFTW;
438                          }
439                  alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
440                  if (bout) {
441                           data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
442                           ierr = VecCreateMPIWithArray(comm,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
443                           (*bout)->ops->destroy = VecDestroy_MPIFFTW;
444                          }
445           break;
446 #endif
447           case 2:
448 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
449                  alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
450                  N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
451                  if (fin) {
452                            data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
453                            ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
454                            (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
455                           }
456                  if (bout) {
457                            data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
458                            ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
459                            ierr = VecGetSize(*bout,&vsize);CHKERRQ(ierr);
460                            (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
461                           }
462                  //n1 = 2*local_n1*dim[0];
463                  if (fout) {
464                             data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
465                             ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
466                             (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
467                            }
468 #else
469       /* Get local size */
470                  alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
471                  if (fin) {
472                            data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
473                            ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
474                            (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
475                           }
476                  if (fout) {
477                             data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
478                             ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
479                             (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
480                            }
481                  if (bout) {
482                            data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
483                            ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
484                            (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
485                           }
486 #endif
487           break;
488           case 3:
489 #if !defined(PETSC_USE_COMPLEX)
490                  alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
491                  N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
492                  if (fin) {
493                          data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
494                          ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
495                          (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
496                          }
497                  if (bout) {
498                          data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
499                          ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
500                          (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
501                           }
502                  //n1 = 2*local_n1*dim[0]*(dim[2]/2+1);
503                  if (fout) {
504                           data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
505                           ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
506                           (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
507                           }
508 #else
509                  alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
510                  if (fin) {
511                          data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
512                          ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
513                          (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
514                          }
515                  if (fout) {
516                           data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
517                           ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
518                           (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
519                          }
520                  if (bout) {
521                          data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
522                          ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
523                          (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
524                          }
525 #endif
526           break;
527           default:
528 #if !defined(PETSC_USE_COMPLEX)
529                  temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
530                  (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
531                  alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
532                  N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
533                  (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
534 
535                  if (fin) {
536                          data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
537                          ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
538                          (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
539                         }
540                  if (bout) {
541                          data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
542                          ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
543                          (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
544                         }
545                  //temp = fftw->partial_dim;
546                  //fftw->partial_dim = fftw->partial_dim * ((fftw->dim_fftw)[fftw->ndim_fftw-1]/2+1)*(fftw->dim_fftw)[1]/((fftw->dim_fftw)[2]*(fftw->dim_fftw)[fftw->ndim_fftw-1]);
547                  //n1 = 2*local_n1*(fftw->partial_dim);  fftw->partial_dim = temp;
548                  if (fout) {
549                           data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
550                           ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
551                           (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
552                         }
553 #else
554                 alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
555                 if (fin) {
556                        data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
557                        ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
558                        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
559                        }
560                 if (fout) {
561                          data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
562                          ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
563                          (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
564                        }
565                 if (bout) {
566                        data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
567                        ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
568                        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
569                        }
570 #endif
571             break;
572           }
573   }
574   PetscFunctionReturn(0);
575 }
576 EXTERN_C_END
577 
578 #undef __FUNCT__
579 #define __FUNCT__ "MatGetVecs_FFTW"
580 /*
581    MatGetVecs_FFTW - Get vector(s) compatible with the matrix, i.e. with the
582      parallel layout determined by FFTW
583 
584    Collective on Mat
585 
586    Input Parameter:
587 .  mat - the matrix
588 
589    Output Parameter:
590 +   fin - (optional) input vector of forward FFTW
591 -   fout - (optional) output vector of forward FFTW
592 
593   Level: advanced
594 
595 .seealso: MatCreateFFTW()
596 */
597 PetscErrorCode  MatGetVecs_FFTW(Mat A,Vec *fin,Vec *fout)
598 {
599   PetscErrorCode ierr;
600   PetscMPIInt    size,rank;
601   MPI_Comm       comm=((PetscObject)A)->comm;
602   Mat_FFT        *fft = (Mat_FFT*)A->data;
603   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
604   PetscInt       N=fft->N;
605   PetscInt       ndim=fft->ndim,*dim=fft->dim,n=fft->n;
606 
607   PetscFunctionBegin;
608   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
609   PetscValidType(A,1);
610 
611   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
612   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
613   if (size == 1){ /* sequential case */
614 #if defined(PETSC_USE_COMPLEX)
615     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
616     if (fout){ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
617 #else
618     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,N*2*(dim[ndim-1]/2+1)/dim[ndim-1],fin);CHKERRQ(ierr);}
619     if (fout){ierr = VecCreateSeq(PETSC_COMM_SELF,2*N*(dim[ndim-1]/2+1)/dim[ndim-1],fout);CHKERRQ(ierr);}
620 #endif
621   } else {        /* mpi case */
622     ptrdiff_t      alloc_local,local_n0,local_0_start;
623     ptrdiff_t      local_n1,local_1_end;
624     fftw_complex   *data_fin,*data_fout;
625 #if !defined(PETSC_USE_COMPLEX)
626     double         *data_finr ;
627     ptrdiff_t      local_1_start,temp;
628     PetscInt       vsize,n1,N1;
629 #endif
630 
631 //    PetscInt ctr;
632 //    ptrdiff_t      ndim1,*pdim;
633 //    ndim1=(ptrdiff_t) ndim;
634 //    pdim = (ptrdiff_t *)calloc(ndim,sizeof(ptrdiff_t));
635 
636 //    for(ctr=0;ctr<ndim;ctr++)
637 //        {k
638 //           pdim[ctr] = dim[ctr];
639 //       }
640 
641 
642 
643     switch (ndim){
644     case 1:
645       /* Get local size */
646       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_end);
647       if (fin) {
648         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
649         ierr = VecCreateMPIWithArray(comm,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
650         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
651       }
652       if (fout) {
653         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
654         ierr = VecCreateMPIWithArray(comm,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
655         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
656       }
657       break;
658     case 2:
659 #if !defined(PETSC_USE_COMPLEX)
660       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
661       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
662       if (fin) {
663         data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
664         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
665         ierr = VecGetSize(*fin,&vsize);CHKERRQ(ierr);
666         //printf("The code comes here with vector size %d\n",vsize);
667         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
668       }
669       n1 = 2*local_n1*(dim[0]);
670       //n1 = 2*local_n1*dim[1];
671       //printf("The values are %ld\n",local_n1);
672       if (fout) {
673         data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
674         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
675         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
676       }
677 
678 #else
679       /* Get local size */
680      //printf("Hope this does not come here");
681       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
682       if (fin) {
683         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
684         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
685         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
686       }
687       if (fout) {
688         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
689         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
690         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
691       }
692      //printf("Hope this does not come here");
693 #endif
694       break;
695     case 3:
696       /* Get local size */
697 #if !defined(PETSC_USE_COMPLEX)
698       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
699       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
700       if (fin) {
701         data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
702         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
703         ierr = VecGetSize(*fin,&vsize);CHKERRQ(ierr);
704         //printf("The code comes here with vector size %d\n",vsize);
705         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
706       }
707       n1 = 2*local_n1*dim[0]*(dim[2]/2+1);
708       if (fout) {
709         data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
710         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
711         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
712       }
713 
714 
715 #else
716       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
717 //      printf("The quantity n is %d",n);
718       if (fin) {
719         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
720         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
721         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
722       }
723       if (fout) {
724         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
725         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
726         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
727       }
728 #endif
729       break;
730     default:
731       /* Get local size */
732 #if !defined(PETSC_USE_COMPLEX)
733       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
734       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
735       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
736       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
737       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
738       if (fin) {
739         data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
740         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
741         ierr = VecGetSize(*fin,&vsize);CHKERRQ(ierr);
742         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
743       }
744       temp = fftw->partial_dim;
745       fftw->partial_dim = fftw->partial_dim * ((fftw->dim_fftw)[fftw->ndim_fftw-1]/2+1)*(fftw->dim_fftw)[1]/((fftw->dim_fftw)[2]*(fftw->dim_fftw)[fftw->ndim_fftw-1]);
746 
747       n1 = 2*local_n1*(fftw->partial_dim);  fftw->partial_dim = temp;
748       if (fout) {
749         data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
750         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
751         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
752       }
753 
754 #else
755       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
756       if (fin) {
757         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
758         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
759         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
760       }
761       if (fout) {
762         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
763         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
764         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
765       }
766 #endif
767       break;
768     }
769   }
770 //  if (fin){
771 //    ierr = PetscLayoutReference(A->cmap,&(*fin)->map);CHKERRQ(ierr);
772 //  }
773 //  if (fout){
774 //    ierr = PetscLayoutReference(A->rmap,&(*fout)->map);CHKERRQ(ierr);
775 //  }
776   PetscFunctionReturn(0);
777 }
778 
779 #undef __FUNCT__
780 #define __FUNCT__ "VecScatterPetscToFFTW"
781 PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
782 {
783   PetscErrorCode ierr;
784   PetscFunctionBegin;
785   ierr = PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
786   PetscFunctionReturn(0);
787 }
788 
789 /*
790       VecScatterPetscToFFTW_FFTW - Copies the user data to the vector that goes into FFTW block.
791   Input A, x, y
792   A - FFTW matrix
793   x - user data
794   Options Database Keys:
795 + -mat_fftw_plannerflags - set FFTW planner flags
796 
797    Level: intermediate
798    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
799          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
800          zeros. This routine does that job by scattering operation.
801 
802 
803 */
804 
805 EXTERN_C_BEGIN
806 #undef __FUNCT__
807 #define __FUNCT__ "VecScatterPetscToFFTW_FTTW"
808 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
809 {
810   PetscErrorCode ierr;
811   MPI_Comm       comm=((PetscObject)A)->comm;
812   Mat_FFT        *fft = (Mat_FFT*)A->data;
813   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
814   PetscInt       N=fft->N;
815   PetscInt       ndim=fft->ndim,*dim=fft->dim;//n=fft->n;
816   PetscInt       low;
817   PetscInt       rank,size,vsize,vsize1;
818   ptrdiff_t      alloc_local,local_n0,local_0_start;
819   ptrdiff_t      local_n1,local_1_start;
820   VecScatter     vecscat;
821   IS             list1,list2;
822 #if !defined(PETSC_USE_COMPLEX)
823   PetscInt       i,j,k,partial_dim;
824   PetscInt       *indx1, *indx2, tempindx, tempindx1;
825   PetscInt       N1, n1 ,NM;
826   ptrdiff_t      temp;
827 #endif
828 
829   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
830   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
831   ierr = VecGetOwnershipRange(y,&low,PETSC_NULL);
832 
833   if (size==1)
834     {
835           ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
836           ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
837           //printf("The values of sizes are %d and %d",vsize,vsize1);
838           ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
839           ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
840           ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
841           ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
842           ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
843           ierr = ISDestroy(&list1);CHKERRQ(ierr);
844     }
845 
846  else{
847 
848  switch (ndim){
849  case 1:
850 #if defined(PETSC_USE_COMPLEX)
851       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
852       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0,local_0_start,1,&list1);
853       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0,low,1,&list2);
854       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
855       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
856       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
857       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
858       ierr = ISDestroy(&list1);CHKERRQ(ierr);
859       ierr = ISDestroy(&list2);CHKERRQ(ierr);
860 #else
861       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
862 #endif
863  break;
864  case 2:
865 #if defined(PETSC_USE_COMPLEX)
866       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
867       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1],local_0_start*dim[1],1,&list1);
868       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1],low,1,&list2);
869       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
870       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
871       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
872       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
873       ierr = ISDestroy(&list1);CHKERRQ(ierr);
874       ierr = ISDestroy(&list2);CHKERRQ(ierr);
875 #else
876       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
877       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
878       //ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*N1,&indx1);CHKERRQ(ierr);
879       //ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*N1,&indx2);CHKERRQ(ierr);
880       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
881       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
882 
883       if (dim[1]%2==0)
884         NM = dim[1]+2;
885       else
886         NM = dim[1]+1;
887 
888       for (i=0;i<local_n0;i++){
889          for (j=0;j<dim[1];j++){
890             tempindx = i*dim[1] + j;
891             tempindx1 = i*NM + j;
892             indx1[tempindx]=local_0_start*dim[1]+tempindx;
893             indx2[tempindx]=low+tempindx1;
894         }
895      }
896 
897       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
898       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
899 
900       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
901       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
902       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
903       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
904       ierr = ISDestroy(&list1);CHKERRQ(ierr);
905       ierr = ISDestroy(&list2);CHKERRQ(ierr);
906       ierr = PetscFree(indx1);CHKERRQ(ierr);
907       ierr = PetscFree(indx2);CHKERRQ(ierr);
908 #endif
909  break;
910 
911  case 3:
912 #if defined(PETSC_USE_COMPLEX)
913       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
914       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
915       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1]*dim[2],low,1,&list2);
916       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
917       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
918       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
919       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
920       ierr = ISDestroy(&list1);CHKERRQ(ierr);
921       ierr = ISDestroy(&list2);CHKERRQ(ierr);
922 #else
923       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
924       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
925 
926       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
927       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
928 
929       if (dim[2]%2==0)
930         NM = dim[2]+2;
931       else
932         NM = dim[2]+1;
933 
934       for (i=0;i<local_n0;i++){
935          for (j=0;j<dim[1];j++){
936             for (k=0;k<dim[2];k++){
937                tempindx = i*dim[1]*dim[2] + j*dim[2] + k;
938                tempindx1 = i*dim[1]*NM + j*NM + k;
939                indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
940                indx2[tempindx]=low+tempindx1;
941             }
942          }
943       }
944 
945       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
946       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
947       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
948       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
949       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
950       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
951       ierr = ISDestroy(&list1);CHKERRQ(ierr);
952       ierr = ISDestroy(&list2);CHKERRQ(ierr);
953       ierr = PetscFree(indx1);CHKERRQ(ierr);
954       ierr = PetscFree(indx2);CHKERRQ(ierr);
955 #endif
956  break;
957 
958  default:
959 #if defined(PETSC_USE_COMPLEX)
960       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
961       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
962       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*(fftw->partial_dim),low,1,&list2);
963       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
964       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
965       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
966       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
967       ierr = ISDestroy(&list1);CHKERRQ(ierr);
968       ierr = ISDestroy(&list2);CHKERRQ(ierr);
969 #else
970       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
971       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
972       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
973       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
974       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
975 
976       partial_dim = fftw->partial_dim;
977 
978       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
979       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
980 
981       if (dim[ndim-1]%2==0)
982         NM = 2;
983       else
984         NM = 1;
985 
986       j = low;
987       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++)
988          {
989           indx1[i] = local_0_start*partial_dim + i;
990           indx2[i] = j;
991           if (k%dim[ndim-1]==0)
992             { j+=NM;}
993           j++;
994          }
995       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
996       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
997       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
998       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
999       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1000       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1001       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1002       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1003       ierr = PetscFree(indx1);CHKERRQ(ierr);
1004       ierr = PetscFree(indx2);CHKERRQ(ierr);
1005 #endif
1006  break;
1007   }
1008  }
1009  return(0);
1010 }
1011 EXTERN_C_END
1012 
1013 #undef __FUNCT__
1014 #define __FUNCT__ "VecScatterFFTWToPetsc"
1015 PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
1016 {
1017   PetscErrorCode ierr;
1018   PetscFunctionBegin;
1019   ierr = PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
1020   PetscFunctionReturn(0);
1021 }
1022 
1023 /*
1024       VecScatterFFTWToPetsc_FFTW - Converts FFTW output to the PETSc vector that user can use.
1025   Input A, x, y
1026   A - FFTW matrix
1027   x - FFTW vector
1028   y - PETSc vector that user can read
1029 
1030    Level: intermediate
1031    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
1032          VecScatterFFTWToPetsc removes those extra zeros.
1033 
1034 */
1035 
1036 EXTERN_C_BEGIN
1037 #undef __FUNCT__
1038 #define __FUNCT__ "VecScatterFFTWToPetsc_FFTW"
1039 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
1040 {
1041   PetscErrorCode ierr;
1042   MPI_Comm       comm=((PetscObject)A)->comm;
1043   Mat_FFT        *fft = (Mat_FFT*)A->data;
1044   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
1045   PetscInt       N=fft->N;
1046   PetscInt       ndim=fft->ndim,*dim=fft->dim;
1047   PetscInt       low;
1048   PetscInt       rank,size;
1049   ptrdiff_t      alloc_local,local_n0,local_0_start;
1050   ptrdiff_t      local_n1,local_1_start;
1051   VecScatter     vecscat;
1052   IS             list1,list2;
1053 #if !defined(PETSC_USE_COMPLEX)
1054   PetscInt       i,j,k,partial_dim;
1055   PetscInt       *indx1, *indx2, tempindx, tempindx1;
1056   PetscInt       N1, n1 ,NM;
1057   ptrdiff_t      temp;
1058 #endif
1059 
1060 
1061   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
1062   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1063   ierr = VecGetOwnershipRange(x,&low,PETSC_NULL);CHKERRQ(ierr);
1064 
1065   if (size==1){
1066     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
1067     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
1068     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1069     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1070     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1071     ierr = ISDestroy(&list1);CHKERRQ(ierr);
1072   }
1073   else{
1074 
1075   switch (ndim){
1076   case 1:
1077 #if defined(PETSC_USE_COMPLEX)
1078       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1079       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n1,local_1_start,1,&list1);
1080       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n1,low,1,&list2);
1081       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1082       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1083       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1084       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1085       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1086       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1087 #else
1088       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
1089 #endif
1090   break;
1091   case 2:
1092 #if defined(PETSC_USE_COMPLEX)
1093       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1094       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1],local_0_start*dim[1],1,&list1);
1095       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1],low,1,&list2);
1096       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1097       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1098       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1099       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1100       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1101       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1102 #else
1103       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1104       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
1105 
1106       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
1107       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
1108 
1109       if (dim[1]%2==0)
1110         NM = dim[1]+2;
1111       else
1112         NM = dim[1]+1;
1113 
1114       for (i=0;i<local_n0;i++){
1115          for (j=0;j<dim[1];j++){
1116             tempindx = i*dim[1] + j;
1117             tempindx1 = i*NM + j;
1118             indx1[tempindx]=local_0_start*dim[1]+tempindx;
1119             indx2[tempindx]=low+tempindx1;
1120          }
1121        }
1122 
1123        ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1124        ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1125 
1126        ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1127        ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1128        ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1129        ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1130        ierr = ISDestroy(&list1);CHKERRQ(ierr);
1131        ierr = ISDestroy(&list2);CHKERRQ(ierr);
1132        ierr = PetscFree(indx1);CHKERRQ(ierr);
1133        ierr = PetscFree(indx2);CHKERRQ(ierr);
1134 #endif
1135   break;
1136   case 3:
1137 #if defined(PETSC_USE_COMPLEX)
1138        alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1139        ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
1140        ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1]*dim[2],low,1,&list2);
1141        ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1142        ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1143        ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1144        ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1145        ierr = ISDestroy(&list1);CHKERRQ(ierr);
1146        ierr = ISDestroy(&list2);CHKERRQ(ierr);
1147 #else
1148 
1149        alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1150        N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
1151 
1152 //     ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*N1,&indx1);CHKERRQ(ierr);
1153 //     ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*N1,&indx2);CHKERRQ(ierr);
1154        ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
1155        ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
1156 
1157        if (dim[2]%2==0)
1158         NM = dim[2]+2;
1159        else
1160         NM = dim[2]+1;
1161 
1162        for (i=0;i<local_n0;i++){
1163           for (j=0;j<dim[1];j++){
1164              for (k=0;k<dim[2];k++){
1165                 tempindx = i*dim[1]*dim[2] + j*dim[2] + k;
1166                 tempindx1 = i*dim[1]*NM + j*NM + k;
1167                 indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1168                 indx2[tempindx]=low+tempindx1;
1169              }
1170           }
1171        }
1172 
1173        ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1174        ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1175 
1176        ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1177        ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1178        ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1179        ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1180        ierr = ISDestroy(&list1);CHKERRQ(ierr);
1181        ierr = ISDestroy(&list2);CHKERRQ(ierr);
1182        ierr = PetscFree(indx1);CHKERRQ(ierr);
1183        ierr = PetscFree(indx2);CHKERRQ(ierr);
1184 #endif
1185   break;
1186   default:
1187 #if defined(PETSC_USE_COMPLEX)
1188        alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
1189        ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1190        ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*(fftw->partial_dim),low,1,&list2);
1191        ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1192        ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1193        ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1194        ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1195        ierr = ISDestroy(&list1);CHKERRQ(ierr);
1196        ierr = ISDestroy(&list2);CHKERRQ(ierr);
1197 #else
1198        temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
1199        (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
1200        alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1201        N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
1202        (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1203 
1204        partial_dim = fftw->partial_dim;
1205 
1206        ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
1207        ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
1208 
1209        if (dim[ndim-1]%2==0)
1210          NM = 2;
1211        else
1212          NM = 1;
1213 
1214        j = low;
1215        for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++)
1216           {
1217            indx1[i] = local_0_start*partial_dim + i;
1218            indx2[i] = j;
1219            if (k%dim[ndim-1]==0)
1220              { j+=NM;}
1221            j++;
1222           }
1223        ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1224        ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1225 
1226        ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1227        ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1228        ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1229        ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1230        ierr = ISDestroy(&list1);CHKERRQ(ierr);
1231        ierr = ISDestroy(&list2);CHKERRQ(ierr);
1232        ierr = PetscFree(indx1);CHKERRQ(ierr);
1233        ierr = PetscFree(indx2);CHKERRQ(ierr);
1234 #endif
1235   break;
1236   }
1237   }
1238   return 0;
1239 }
1240 EXTERN_C_END
1241 
1242 EXTERN_C_BEGIN
1243 #undef __FUNCT__
1244 #define __FUNCT__ "MatCreate_FFTW"
1245 /*
1246       MatCreate_FFTW - Creates a matrix object that provides FFT
1247   via the external package FFTW
1248   Options Database Keys:
1249 + -mat_fftw_plannerflags - set FFTW planner flags
1250 
1251    Level: intermediate
1252 
1253 */
1254 
1255 PetscErrorCode MatCreate_FFTW(Mat A)
1256 {
1257   PetscErrorCode ierr;
1258   MPI_Comm       comm=((PetscObject)A)->comm;
1259   Mat_FFT        *fft=(Mat_FFT*)A->data;
1260   Mat_FFTW       *fftw;
1261   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim = fft->dim;
1262   const char     *p_flags[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1263   PetscBool      flg;
1264   PetscInt       p_flag,partial_dim=1,ctr;
1265   PetscMPIInt    size,rank;
1266   ptrdiff_t      *pdim;
1267   ptrdiff_t      local_n1,local_1_start;
1268 #if !defined(PETSC_USE_COMPLEX)
1269    ptrdiff_t     temp;
1270    PetscInt      N1,tot_dim;
1271 #else
1272    PetscInt n1;
1273 #endif
1274 
1275   PetscFunctionBegin;
1276   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
1277   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1278 
1279   pdim = (ptrdiff_t *)calloc(ndim,sizeof(ptrdiff_t));
1280   pdim[0] = dim[0];
1281 #if !defined(PETSC_USE_COMPLEX)
1282   tot_dim = 2*dim[0];
1283 #endif
1284   for (ctr=1;ctr<ndim;ctr++)
1285      {
1286           partial_dim *= dim[ctr];
1287           pdim[ctr] = dim[ctr];
1288 #if !defined(PETSC_USE_COMPLEX)
1289           if (ctr==ndim-1)
1290             tot_dim *= (dim[ctr]/2+1);
1291           else
1292             tot_dim *= dim[ctr];
1293 #endif
1294      }
1295 
1296   if (size == 1) {
1297 #if defined(PETSC_USE_COMPLEX)
1298     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1299     n = N;
1300 #else
1301     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1302     n = tot_dim;
1303 #endif
1304 
1305   } else {
1306     ptrdiff_t alloc_local,local_n0,local_0_start;
1307     switch (ndim){
1308     case 1:
1309 #if !defined(PETSC_USE_COMPLEX)
1310    SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1311 #else
1312       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1313       n = (PetscInt)local_n0;
1314       n1 = (PetscInt) local_n1;
1315       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1316 #endif
1317       break;
1318     case 2:
1319 #if defined(PETSC_USE_COMPLEX)
1320       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1321       /*
1322        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1323        PetscSynchronizedFlush(comm);
1324        */
1325       n = (PetscInt)local_n0*dim[1];
1326       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1327 #else
1328       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,PETSC_COMM_WORLD,&local_n0,&local_0_start,&local_n1,&local_1_start);
1329       n = 2*(PetscInt)local_n0*(dim[1]/2+1);
1330       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1331 #endif
1332       break;
1333     case 3:
1334 #if defined(PETSC_USE_COMPLEX)
1335       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1336       n = (PetscInt)local_n0*dim[1]*dim[2];
1337       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1338 #else
1339       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,PETSC_COMM_WORLD,&local_n0,&local_0_start,&local_n1,&local_1_start);
1340       n = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1341       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1342 #endif
1343       break;
1344     default:
1345 #if defined(PETSC_USE_COMPLEX)
1346       alloc_local = fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1347       n = (PetscInt)local_n0*partial_dim;
1348       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1349 #else
1350       temp = pdim[ndim-1];
1351       pdim[ndim-1] = temp/2 + 1;
1352       alloc_local = fftw_mpi_local_size_transposed(ndim,pdim,PETSC_COMM_WORLD,&local_n0,&local_0_start,&local_n1,&local_1_start);
1353       n = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1354       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1355       pdim[ndim-1] = temp;
1356       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1357 #endif
1358       break;
1359     }
1360   }
1361   ierr = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1362   ierr = PetscNewLog(A,Mat_FFTW,&fftw);CHKERRQ(ierr);
1363   fft->data = (void*)fftw;
1364 
1365   fft->n           = n;
1366   fftw->ndim_fftw  = (ptrdiff_t)ndim; // This is dimension of fft
1367   fftw->partial_dim = partial_dim;
1368   ierr = PetscMalloc(ndim*sizeof(ptrdiff_t), (ptrdiff_t *)&(fftw->dim_fftw));CHKERRQ(ierr);
1369   for(ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1370 
1371   fftw->p_forward  = 0;
1372   fftw->p_backward = 0;
1373   fftw->p_flag     = FFTW_ESTIMATE;
1374 
1375   if (size == 1){
1376     A->ops->mult          = MatMult_SeqFFTW;
1377     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1378   } else {
1379     A->ops->mult          = MatMult_MPIFFTW;
1380     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1381   }
1382   fft->matdestroy          = MatDestroy_FFTW;
1383   //A->ops->getvecs       = MatGetVecs_FFTW;
1384   A->assembled          = PETSC_TRUE;
1385   PetscObjectComposeFunctionDynamic((PetscObject)A,"MatGetVecsFFTW_C","MatGetVecsFFTW_FFTW",MatGetVecsFFTW_FFTW);
1386   PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterPetscToFFTW_C","VecScatterPetscToFFTW_FFTW",VecScatterPetscToFFTW_FFTW);
1387   PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterFFTWToPetsc_C","VecScatterFFTWToPetsc_FFTW",VecScatterFFTWToPetsc_FFTW);
1388 
1389   /* get runtime options */
1390   ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1391     ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",p_flags,4,p_flags[0],&p_flag,&flg);CHKERRQ(ierr);
1392     if (flg) {fftw->p_flag = (unsigned)p_flag;}
1393   PetscOptionsEnd();
1394   PetscFunctionReturn(0);
1395 }
1396 EXTERN_C_END
1397 
1398 
1399 
1400 
1401