xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision f4fca6d458ed9e13378932da2ba9c0a778827bbd)
1b2b77a04SHong Zhang 
2b2b77a04SHong Zhang /*
3b2b77a04SHong Zhang     Provides an interface to the FFTW package.
4b2b77a04SHong Zhang     Testing examples can be found in ~src/mat/examples/tests
5b2b77a04SHong Zhang */
6b2b77a04SHong Zhang 
7c6db04a5SJed Brown #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8b2b77a04SHong Zhang EXTERN_C_BEGIN
9c6db04a5SJed Brown #include <fftw3-mpi.h>
10b2b77a04SHong Zhang EXTERN_C_END
11b2b77a04SHong Zhang 
12b2b77a04SHong Zhang typedef struct {
13b9ae062cSHong Zhang   ptrdiff_t    ndim_fftw,*dim_fftw;
14a1b6d50cSHong Zhang #if defined(PETSC_USE_64BIT_INDICES)
15a1b6d50cSHong Zhang   fftw_iodim64 *iodims;
16a1b6d50cSHong Zhang #else
178c1d5ab3SHong Zhang   fftw_iodim   *iodims;
18a1b6d50cSHong Zhang #endif
19e5d7f247SAmlan Barua   PetscInt     partial_dim;
20b2b77a04SHong Zhang   fftw_plan    p_forward,p_backward;
21b2b77a04SHong Zhang   unsigned     p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
22b2b77a04SHong Zhang   PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays becaue fftw plan should be
23b2b77a04SHong Zhang                                                             executed for the arrays with which the plan was created */
24b2b77a04SHong Zhang } Mat_FFTW;
25b2b77a04SHong Zhang 
26b2b77a04SHong Zhang extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
27b2b77a04SHong Zhang extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
28b2b77a04SHong Zhang extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
29b2b77a04SHong Zhang extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
30b2b77a04SHong Zhang extern PetscErrorCode MatDestroy_FFTW(Mat);
31b2b77a04SHong Zhang extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
32b2b77a04SHong Zhang 
3397504071SAmlan Barua /* MatMult_SeqFFTW performs forward DFT in parallel
3497504071SAmlan Barua    Input parameter:
353564f093SHong Zhang      A - the matrix
3697504071SAmlan Barua      x - the vector on which FDFT will be performed
3797504071SAmlan Barua 
3897504071SAmlan Barua    Output parameter:
3997504071SAmlan Barua      y - vector that stores result of FDFT
4097504071SAmlan Barua */
41b2b77a04SHong Zhang #undef __FUNCT__
42b2b77a04SHong Zhang #define __FUNCT__ "MatMult_SeqFFTW"
43b2b77a04SHong Zhang PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
44b2b77a04SHong Zhang {
45b2b77a04SHong Zhang   PetscErrorCode ierr;
46b2b77a04SHong Zhang   Mat_FFT        *fft  = (Mat_FFT*)A->data;
47b2b77a04SHong Zhang   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
48*f4fca6d4SMatthew G. Knepley   const PetscScalar *x_array;
49*f4fca6d4SMatthew G. Knepley   PetscScalar    *y_array;
501acd80e4SHong Zhang #if defined(PETSC_USE_COMPLEX)
51a1b6d50cSHong Zhang #if defined(PETSC_USE_64BIT_INDICES)
52a1b6d50cSHong Zhang   fftw_iodim64   *iodims;
53a1b6d50cSHong Zhang #else
548c1d5ab3SHong Zhang   fftw_iodim     *iodims;
55a1b6d50cSHong Zhang #endif
561acd80e4SHong Zhang   PetscInt       i;
571acd80e4SHong Zhang #endif
581acd80e4SHong Zhang   PetscInt       ndim = fft->ndim,*dim = fft->dim;
59b2b77a04SHong Zhang 
60b2b77a04SHong Zhang   PetscFunctionBegin;
61*f4fca6d4SMatthew G. Knepley   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
62b2b77a04SHong Zhang   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
63b2b77a04SHong Zhang   if (!fftw->p_forward) { /* create a plan, then excute it */
64b2b77a04SHong Zhang     switch (ndim) {
65b2b77a04SHong Zhang     case 1:
6658a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
67b2b77a04SHong Zhang       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
6858a912c5SAmlan Barua #else
6958a912c5SAmlan Barua       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
7058a912c5SAmlan Barua #endif
71b2b77a04SHong Zhang       break;
72b2b77a04SHong Zhang     case 2:
7358a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
74b2b77a04SHong Zhang       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
7558a912c5SAmlan Barua #else
7658a912c5SAmlan Barua       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
7758a912c5SAmlan Barua #endif
78b2b77a04SHong Zhang       break;
79b2b77a04SHong Zhang     case 3:
8058a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
81b2b77a04SHong Zhang       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
8258a912c5SAmlan Barua #else
8358a912c5SAmlan Barua       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
8458a912c5SAmlan Barua #endif
85b2b77a04SHong Zhang       break;
86b2b77a04SHong Zhang     default:
8758a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
88a1b6d50cSHong Zhang       iodims = fftw->iodims;
89a1b6d50cSHong Zhang #if defined(PETSC_USE_64BIT_INDICES)
908c1d5ab3SHong Zhang       if (ndim) {
91a1b6d50cSHong Zhang         iodims[ndim-1].n = (ptrdiff_t)dim[ndim-1];
928c1d5ab3SHong Zhang         iodims[ndim-1].is = iodims[ndim-1].os = 1;
938c1d5ab3SHong Zhang         for (i=ndim-2; i>=0; --i) {
94a1b6d50cSHong Zhang           iodims[i].n = (ptrdiff_t)dim[i];
958c1d5ab3SHong Zhang           iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
968c1d5ab3SHong Zhang         }
978c1d5ab3SHong Zhang       }
98a1b6d50cSHong Zhang       fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
99a1b6d50cSHong Zhang #else
100a1b6d50cSHong Zhang       if (ndim) {
101a1b6d50cSHong Zhang         iodims[ndim-1].n = (int)dim[ndim-1];
102a1b6d50cSHong Zhang         iodims[ndim-1].is = iodims[ndim-1].os = 1;
103a1b6d50cSHong Zhang         for (i=ndim-2; i>=0; --i) {
104a1b6d50cSHong Zhang           iodims[i].n = (int)dim[i];
105a1b6d50cSHong Zhang           iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
106a1b6d50cSHong Zhang         }
107a1b6d50cSHong Zhang       }
108a1b6d50cSHong Zhang       fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
109a1b6d50cSHong Zhang #endif
110a1b6d50cSHong Zhang 
11158a912c5SAmlan Barua #else
112a31b9140SHong Zhang       fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
11358a912c5SAmlan Barua #endif
114b2b77a04SHong Zhang       break;
115b2b77a04SHong Zhang     }
116*f4fca6d4SMatthew G. Knepley     fftw->finarray  = (PetscScalar *) x_array;
117b2b77a04SHong Zhang     fftw->foutarray = y_array;
118b2b77a04SHong Zhang     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
119b2b77a04SHong Zhang                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
120b2b77a04SHong Zhang     fftw_execute(fftw->p_forward);
121b2b77a04SHong Zhang   } else { /* use existing plan */
122b2b77a04SHong Zhang     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
123b2b77a04SHong Zhang       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
124b2b77a04SHong Zhang     } else {
125b2b77a04SHong Zhang       fftw_execute(fftw->p_forward);
126b2b77a04SHong Zhang     }
127b2b77a04SHong Zhang   }
128b2b77a04SHong Zhang   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
129*f4fca6d4SMatthew G. Knepley   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
130b2b77a04SHong Zhang   PetscFunctionReturn(0);
131b2b77a04SHong Zhang }
132b2b77a04SHong Zhang 
13397504071SAmlan Barua /* MatMultTranspose_SeqFFTW performs serial backward DFT
13497504071SAmlan Barua    Input parameter:
1353564f093SHong Zhang      A - the matrix
13697504071SAmlan Barua      x - the vector on which BDFT will be performed
13797504071SAmlan Barua 
13897504071SAmlan Barua    Output parameter:
13997504071SAmlan Barua      y - vector that stores result of BDFT
14097504071SAmlan Barua */
14197504071SAmlan Barua 
142b2b77a04SHong Zhang #undef __FUNCT__
143b2b77a04SHong Zhang #define __FUNCT__ "MatMultTranspose_SeqFFTW"
144b2b77a04SHong Zhang PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
145b2b77a04SHong Zhang {
146b2b77a04SHong Zhang   PetscErrorCode ierr;
147b2b77a04SHong Zhang   Mat_FFT        *fft  = (Mat_FFT*)A->data;
148b2b77a04SHong Zhang   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
149*f4fca6d4SMatthew G. Knepley   const PetscScalar *x_array;
150*f4fca6d4SMatthew G. Knepley   PetscScalar    *y_array;
151b2b77a04SHong Zhang   PetscInt       ndim=fft->ndim,*dim=fft->dim;
1521acd80e4SHong Zhang #if defined(PETSC_USE_COMPLEX)
153a1b6d50cSHong Zhang #if defined(PETSC_USE_64BIT_INDICES)
154a1b6d50cSHong Zhang   fftw_iodim64   *iodims=fftw->iodims;
155a1b6d50cSHong Zhang #else
156a1b6d50cSHong Zhang   fftw_iodim     *iodims=fftw->iodims;
157a1b6d50cSHong Zhang #endif
1581acd80e4SHong Zhang #endif
159b2b77a04SHong Zhang 
160b2b77a04SHong Zhang   PetscFunctionBegin;
161*f4fca6d4SMatthew G. Knepley   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
162b2b77a04SHong Zhang   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
163b2b77a04SHong Zhang   if (!fftw->p_backward) { /* create a plan, then excute it */
164b2b77a04SHong Zhang     switch (ndim) {
165b2b77a04SHong Zhang     case 1:
16658a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
167b2b77a04SHong Zhang       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
16858a912c5SAmlan Barua #else
169e81bb599SAmlan Barua       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
17058a912c5SAmlan Barua #endif
171b2b77a04SHong Zhang       break;
172b2b77a04SHong Zhang     case 2:
17358a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
174b2b77a04SHong Zhang       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
17558a912c5SAmlan Barua #else
176e81bb599SAmlan Barua       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
17758a912c5SAmlan Barua #endif
178b2b77a04SHong Zhang       break;
179b2b77a04SHong Zhang     case 3:
18058a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
181b2b77a04SHong Zhang       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
18258a912c5SAmlan Barua #else
183e81bb599SAmlan Barua       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
18458a912c5SAmlan Barua #endif
185b2b77a04SHong Zhang       break;
186b2b77a04SHong Zhang     default:
18758a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
188a1b6d50cSHong Zhang #if defined(PETSC_USE_64BIT_INDICES)
189a1b6d50cSHong Zhang       fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
190a1b6d50cSHong Zhang #else
1918c1d5ab3SHong Zhang       fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
192a1b6d50cSHong Zhang #endif
19358a912c5SAmlan Barua #else
194a31b9140SHong Zhang       fftw->p_backward= fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
19558a912c5SAmlan Barua #endif
196b2b77a04SHong Zhang       break;
197b2b77a04SHong Zhang     }
198*f4fca6d4SMatthew G. Knepley     fftw->binarray  = (PetscScalar *) x_array;
199b2b77a04SHong Zhang     fftw->boutarray = y_array;
200b2b77a04SHong Zhang     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
201b2b77a04SHong Zhang   } else { /* use existing plan */
202b2b77a04SHong Zhang     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
203b2b77a04SHong Zhang       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
204b2b77a04SHong Zhang     } else {
205b2b77a04SHong Zhang       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
206b2b77a04SHong Zhang     }
207b2b77a04SHong Zhang   }
208b2b77a04SHong Zhang   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
209*f4fca6d4SMatthew G. Knepley   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
210b2b77a04SHong Zhang   PetscFunctionReturn(0);
211b2b77a04SHong Zhang }
212b2b77a04SHong Zhang 
21397504071SAmlan Barua /* MatMult_MPIFFTW performs forward DFT in parallel
21497504071SAmlan Barua    Input parameter:
2153564f093SHong Zhang      A - the matrix
21697504071SAmlan Barua      x - the vector on which FDFT will be performed
21797504071SAmlan Barua 
21897504071SAmlan Barua    Output parameter:
21997504071SAmlan Barua    y   - vector that stores result of FDFT
22097504071SAmlan Barua */
221b2b77a04SHong Zhang #undef __FUNCT__
222b2b77a04SHong Zhang #define __FUNCT__ "MatMult_MPIFFTW"
223b2b77a04SHong Zhang PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
224b2b77a04SHong Zhang {
225b2b77a04SHong Zhang   PetscErrorCode ierr;
226b2b77a04SHong Zhang   Mat_FFT        *fft  = (Mat_FFT*)A->data;
227b2b77a04SHong Zhang   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
228*f4fca6d4SMatthew G. Knepley   const PetscScalar *x_array;
229*f4fca6d4SMatthew G. Knepley   PetscScalar    *y_array;
230c92418ccSAmlan Barua   PetscInt       ndim=fft->ndim,*dim=fft->dim;
231ce94432eSBarry Smith   MPI_Comm       comm;
232b2b77a04SHong Zhang 
233b2b77a04SHong Zhang   PetscFunctionBegin;
234ce94432eSBarry Smith   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
235*f4fca6d4SMatthew G. Knepley   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
236b2b77a04SHong Zhang   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
237b2b77a04SHong Zhang   if (!fftw->p_forward) { /* create a plan, then excute it */
238b2b77a04SHong Zhang     switch (ndim) {
239b2b77a04SHong Zhang     case 1:
2403c3a9c75SAmlan Barua #if defined(PETSC_USE_COMPLEX)
241b2b77a04SHong Zhang       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
242ae0a50aaSHong Zhang #else
2434f8276c3SHong Zhang       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
2443c3a9c75SAmlan Barua #endif
245b2b77a04SHong Zhang       break;
246b2b77a04SHong Zhang     case 2:
24797504071SAmlan Barua #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
248b2b77a04SHong Zhang       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
2493c3a9c75SAmlan Barua #else
2503c3a9c75SAmlan Barua       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
2513c3a9c75SAmlan Barua #endif
252b2b77a04SHong Zhang       break;
253b2b77a04SHong Zhang     case 3:
2543c3a9c75SAmlan Barua #if defined(PETSC_USE_COMPLEX)
255b2b77a04SHong Zhang       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
2563c3a9c75SAmlan Barua #else
25751d1eed7SAmlan Barua       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
2583c3a9c75SAmlan Barua #endif
259b2b77a04SHong Zhang       break;
260b2b77a04SHong Zhang     default:
2613c3a9c75SAmlan Barua #if defined(PETSC_USE_COMPLEX)
262c92418ccSAmlan Barua       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
2633c3a9c75SAmlan Barua #else
2643c3a9c75SAmlan Barua       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
2653c3a9c75SAmlan Barua #endif
266b2b77a04SHong Zhang       break;
267b2b77a04SHong Zhang     }
268*f4fca6d4SMatthew G. Knepley     fftw->finarray  = (PetscScalar *) x_array;
269b2b77a04SHong Zhang     fftw->foutarray = y_array;
270b2b77a04SHong Zhang     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
271b2b77a04SHong Zhang                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
272b2b77a04SHong Zhang     fftw_execute(fftw->p_forward);
273b2b77a04SHong Zhang   } else { /* use existing plan */
274b2b77a04SHong Zhang     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
275b2b77a04SHong Zhang       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
276b2b77a04SHong Zhang     } else {
277b2b77a04SHong Zhang       fftw_execute(fftw->p_forward);
278b2b77a04SHong Zhang     }
279b2b77a04SHong Zhang   }
280b2b77a04SHong Zhang   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
281*f4fca6d4SMatthew G. Knepley   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
282b2b77a04SHong Zhang   PetscFunctionReturn(0);
283b2b77a04SHong Zhang }
284b2b77a04SHong Zhang 
28597504071SAmlan Barua /* MatMultTranspose_MPIFFTW performs parallel backward DFT
28697504071SAmlan Barua    Input parameter:
2873564f093SHong Zhang      A - the matrix
28897504071SAmlan Barua      x - the vector on which BDFT will be performed
28997504071SAmlan Barua 
29097504071SAmlan Barua    Output parameter:
29197504071SAmlan Barua      y - vector that stores result of BDFT
29297504071SAmlan Barua */
293b2b77a04SHong Zhang #undef __FUNCT__
294b2b77a04SHong Zhang #define __FUNCT__ "MatMultTranspose_MPIFFTW"
295b2b77a04SHong Zhang PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
296b2b77a04SHong Zhang {
297b2b77a04SHong Zhang   PetscErrorCode ierr;
298b2b77a04SHong Zhang   Mat_FFT        *fft  = (Mat_FFT*)A->data;
299b2b77a04SHong Zhang   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
300*f4fca6d4SMatthew G. Knepley   const PetscScalar *x_array;
301*f4fca6d4SMatthew G. Knepley   PetscScalar    *y_array;
302c92418ccSAmlan Barua   PetscInt       ndim=fft->ndim,*dim=fft->dim;
303ce94432eSBarry Smith   MPI_Comm       comm;
304b2b77a04SHong Zhang 
305b2b77a04SHong Zhang   PetscFunctionBegin;
306ce94432eSBarry Smith   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
307*f4fca6d4SMatthew G. Knepley   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
308b2b77a04SHong Zhang   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
309b2b77a04SHong Zhang   if (!fftw->p_backward) { /* create a plan, then excute it */
310b2b77a04SHong Zhang     switch (ndim) {
311b2b77a04SHong Zhang     case 1:
3123c3a9c75SAmlan Barua #if defined(PETSC_USE_COMPLEX)
313b2b77a04SHong Zhang       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
314ae0a50aaSHong Zhang #else
3154f8276c3SHong Zhang       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
3163c3a9c75SAmlan Barua #endif
317b2b77a04SHong Zhang       break;
318b2b77a04SHong Zhang     case 2:
31997504071SAmlan Barua #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
320b2b77a04SHong Zhang       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
3213c3a9c75SAmlan Barua #else
3223c3a9c75SAmlan Barua       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
3233c3a9c75SAmlan Barua #endif
324b2b77a04SHong Zhang       break;
325b2b77a04SHong Zhang     case 3:
3263c3a9c75SAmlan Barua #if defined(PETSC_USE_COMPLEX)
327b2b77a04SHong Zhang       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
3283c3a9c75SAmlan Barua #else
3293c3a9c75SAmlan Barua       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
3303c3a9c75SAmlan Barua #endif
331b2b77a04SHong Zhang       break;
332b2b77a04SHong Zhang     default:
3333c3a9c75SAmlan Barua #if defined(PETSC_USE_COMPLEX)
334c92418ccSAmlan Barua       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
3353c3a9c75SAmlan Barua #else
3363c3a9c75SAmlan Barua       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
3373c3a9c75SAmlan Barua #endif
338b2b77a04SHong Zhang       break;
339b2b77a04SHong Zhang     }
340*f4fca6d4SMatthew G. Knepley     fftw->binarray  = (PetscScalar *) x_array;
341b2b77a04SHong Zhang     fftw->boutarray = y_array;
342b2b77a04SHong Zhang     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
343b2b77a04SHong Zhang   } else { /* use existing plan */
344b2b77a04SHong Zhang     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
345b2b77a04SHong Zhang       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
346b2b77a04SHong Zhang     } else {
347b2b77a04SHong Zhang       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
348b2b77a04SHong Zhang     }
349b2b77a04SHong Zhang   }
350b2b77a04SHong Zhang   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
351*f4fca6d4SMatthew G. Knepley   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
352b2b77a04SHong Zhang   PetscFunctionReturn(0);
353b2b77a04SHong Zhang }
354b2b77a04SHong Zhang 
355b2b77a04SHong Zhang #undef __FUNCT__
356b2b77a04SHong Zhang #define __FUNCT__ "MatDestroy_FFTW"
357b2b77a04SHong Zhang PetscErrorCode MatDestroy_FFTW(Mat A)
358b2b77a04SHong Zhang {
359b2b77a04SHong Zhang   Mat_FFT        *fft  = (Mat_FFT*)A->data;
360b2b77a04SHong Zhang   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
361b2b77a04SHong Zhang   PetscErrorCode ierr;
362b2b77a04SHong Zhang 
363b2b77a04SHong Zhang   PetscFunctionBegin;
364b2b77a04SHong Zhang   fftw_destroy_plan(fftw->p_forward);
365b2b77a04SHong Zhang   fftw_destroy_plan(fftw->p_backward);
3668c1d5ab3SHong Zhang   if (fftw->iodims) {
3678c1d5ab3SHong Zhang     free(fftw->iodims);
3688c1d5ab3SHong Zhang   }
369bb5bf6f6SHong Zhang   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
370bf0cc555SLisandro Dalcin   ierr = PetscFree(fft->data);CHKERRQ(ierr);
3716ccf0b3eSHong Zhang   fftw_mpi_cleanup();
372b2b77a04SHong Zhang   PetscFunctionReturn(0);
373b2b77a04SHong Zhang }
374b2b77a04SHong Zhang 
375c6db04a5SJed Brown #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
376b2b77a04SHong Zhang #undef __FUNCT__
377b2b77a04SHong Zhang #define __FUNCT__ "VecDestroy_MPIFFTW"
378b2b77a04SHong Zhang PetscErrorCode VecDestroy_MPIFFTW(Vec v)
379b2b77a04SHong Zhang {
380b2b77a04SHong Zhang   PetscErrorCode ierr;
381b2b77a04SHong Zhang   PetscScalar    *array;
382b2b77a04SHong Zhang 
383b2b77a04SHong Zhang   PetscFunctionBegin;
384b2b77a04SHong Zhang   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
385b2b77a04SHong Zhang   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
386b2b77a04SHong Zhang   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
387b2b77a04SHong Zhang   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
388b2b77a04SHong Zhang   PetscFunctionReturn(0);
389b2b77a04SHong Zhang }
390b2b77a04SHong Zhang 
3914f7415efSAmlan Barua #undef __FUNCT__
3922a7a6963SBarry Smith #define __FUNCT__ "MatCreateVecsFFTW"
3934be45526SHong Zhang /*@
3942a7a6963SBarry Smith    MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
3954f7415efSAmlan Barua      parallel layout determined by FFTW
3964f7415efSAmlan Barua 
3974f7415efSAmlan Barua    Collective on Mat
3984f7415efSAmlan Barua 
3994f7415efSAmlan Barua    Input Parameter:
4003564f093SHong Zhang .   A - the matrix
4014f7415efSAmlan Barua 
4024f7415efSAmlan Barua    Output Parameter:
403cc55f3b1SHong Zhang +   x - (optional) input vector of forward FFTW
404cc55f3b1SHong Zhang -   y - (optional) output vector of forward FFTW
405cc55f3b1SHong Zhang -   z - (optional) output vector of backward FFTW
4064f7415efSAmlan Barua 
4074f7415efSAmlan Barua   Level: advanced
4083564f093SHong Zhang 
40997504071SAmlan Barua   Note: The parallel layout of output of forward FFTW is always same as the input
41097504071SAmlan Barua         of the backward FFTW. But parallel layout of the input vector of forward
41197504071SAmlan Barua         FFTW might not be same as the output of backward FFTW.
41297504071SAmlan Barua         Also note that we need to provide enough space while doing parallel real transform.
41397504071SAmlan Barua         We need to pad extra zeros at the end of last dimension. For this reason the one needs to
41497504071SAmlan Barua         invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
415e0ec536eSAmlan Barua         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
416e0ec536eSAmlan Barua         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
417e0ec536eSAmlan Barua         zeros if it is odd only one zero is needed.
418e0ec536eSAmlan Barua         Lastly one needs some scratch space at the end of data set in each process. alloc_local
419e0ec536eSAmlan Barua         figures out how much space is needed, i.e. it figures out the data+scratch space for
420e0ec536eSAmlan Barua         each processor and returns that.
4214f7415efSAmlan Barua 
4224f7415efSAmlan Barua .seealso: MatCreateFFTW()
4234be45526SHong Zhang @*/
4242a7a6963SBarry Smith PetscErrorCode MatCreateVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
4254be45526SHong Zhang {
4264be45526SHong Zhang   PetscErrorCode ierr;
427b4c3921fSHong Zhang 
4284be45526SHong Zhang   PetscFunctionBegin;
4292a7a6963SBarry Smith   ierr = PetscTryMethod(A,"MatCreateVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
4304be45526SHong Zhang   PetscFunctionReturn(0);
4314be45526SHong Zhang }
4324be45526SHong Zhang 
4334be45526SHong Zhang #undef __FUNCT__
4342a7a6963SBarry Smith #define __FUNCT__ "MatCreateVecsFFTW_FFTW"
4352a7a6963SBarry Smith PetscErrorCode  MatCreateVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
4364f7415efSAmlan Barua {
4374f7415efSAmlan Barua   PetscErrorCode ierr;
4384f7415efSAmlan Barua   PetscMPIInt    size,rank;
439ce94432eSBarry Smith   MPI_Comm       comm;
4404f7415efSAmlan Barua   Mat_FFT        *fft  = (Mat_FFT*)A->data;
4414f7415efSAmlan Barua   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
4429496c9aeSAmlan Barua   PetscInt       N     = fft->N;
4434f7415efSAmlan Barua   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;
4444f7415efSAmlan Barua 
4454f7415efSAmlan Barua   PetscFunctionBegin;
4464f7415efSAmlan Barua   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4474f7415efSAmlan Barua   PetscValidType(A,1);
448ce94432eSBarry Smith   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
4494f7415efSAmlan Barua 
4504f8276c3SHong Zhang   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
4514f8276c3SHong Zhang   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
4524f7415efSAmlan Barua   if (size == 1) { /* sequential case */
4534f7415efSAmlan Barua #if defined(PETSC_USE_COMPLEX)
4544f7415efSAmlan Barua     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
4554f7415efSAmlan Barua     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
4564f7415efSAmlan Barua     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
4574f7415efSAmlan Barua #else
4588ca4c034SAmlan Barua     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
4598ca4c034SAmlan Barua     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
4608ca4c034SAmlan Barua     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
4614f7415efSAmlan Barua #endif
46297504071SAmlan Barua   } else { /* parallel cases */
4634f7415efSAmlan Barua     ptrdiff_t    alloc_local,local_n0,local_0_start;
4649496c9aeSAmlan Barua     ptrdiff_t    local_n1;
4659496c9aeSAmlan Barua     fftw_complex *data_fout;
4669496c9aeSAmlan Barua     ptrdiff_t    local_1_start;
4679496c9aeSAmlan Barua #if defined(PETSC_USE_COMPLEX)
4689496c9aeSAmlan Barua     fftw_complex *data_fin,*data_bout;
4699496c9aeSAmlan Barua #else
4704f7415efSAmlan Barua     double    *data_finr,*data_boutr;
4716ccf0b3eSHong Zhang     PetscInt  n1,N1;
4729496c9aeSAmlan Barua     ptrdiff_t temp;
4739496c9aeSAmlan Barua #endif
4749496c9aeSAmlan Barua 
4754f7415efSAmlan Barua     switch (ndim) {
4764f7415efSAmlan Barua     case 1:
47757625b84SAmlan Barua #if !defined(PETSC_USE_COMPLEX)
4784f8276c3SHong Zhang       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
47957625b84SAmlan Barua #else
48057625b84SAmlan Barua       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
48157625b84SAmlan Barua       if (fin) {
48257625b84SAmlan Barua         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
483778a2246SBarry Smith         ierr      = VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
48426fbe8dcSKarl Rupp 
48557625b84SAmlan Barua         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
48657625b84SAmlan Barua       }
48757625b84SAmlan Barua       if (fout) {
48857625b84SAmlan Barua         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
489778a2246SBarry Smith         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
49026fbe8dcSKarl Rupp 
49157625b84SAmlan Barua         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
49257625b84SAmlan Barua       }
49357625b84SAmlan Barua       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
49457625b84SAmlan Barua       if (bout) {
49557625b84SAmlan Barua         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
496778a2246SBarry Smith         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
49726fbe8dcSKarl Rupp 
49857625b84SAmlan Barua         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
49957625b84SAmlan Barua       }
50057625b84SAmlan Barua       break;
50157625b84SAmlan Barua #endif
5024f7415efSAmlan Barua     case 2:
50397504071SAmlan Barua #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
5044f7415efSAmlan Barua       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
5054f7415efSAmlan Barua       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
5064f7415efSAmlan Barua       if (fin) {
5074f7415efSAmlan Barua         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
508778a2246SBarry Smith         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
50926fbe8dcSKarl Rupp 
5104f7415efSAmlan Barua         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
5114f7415efSAmlan Barua       }
5124f7415efSAmlan Barua       if (bout) {
5134f7415efSAmlan Barua         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
514778a2246SBarry Smith         ierr       = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
51526fbe8dcSKarl Rupp 
5164f7415efSAmlan Barua         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
5174f7415efSAmlan Barua       }
51857625b84SAmlan Barua       if (fout) {
51957625b84SAmlan Barua         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
520778a2246SBarry Smith         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
52126fbe8dcSKarl Rupp 
52257625b84SAmlan Barua         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
52357625b84SAmlan Barua       }
5244f7415efSAmlan Barua #else
5254f7415efSAmlan Barua       /* Get local size */
5264f7415efSAmlan Barua       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
5274f7415efSAmlan Barua       if (fin) {
5284f7415efSAmlan Barua         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
529778a2246SBarry Smith         ierr     = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
53026fbe8dcSKarl Rupp 
5314f7415efSAmlan Barua         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
5324f7415efSAmlan Barua       }
5334f7415efSAmlan Barua       if (fout) {
5344f7415efSAmlan Barua         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
535778a2246SBarry Smith         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
53626fbe8dcSKarl Rupp 
5374f7415efSAmlan Barua         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
5384f7415efSAmlan Barua       }
5394f7415efSAmlan Barua       if (bout) {
5404f7415efSAmlan Barua         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
541778a2246SBarry Smith         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
54226fbe8dcSKarl Rupp 
5434f7415efSAmlan Barua         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
5444f7415efSAmlan Barua       }
5454f7415efSAmlan Barua #endif
5464f7415efSAmlan Barua       break;
5474f7415efSAmlan Barua     case 3:
5484f7415efSAmlan Barua #if !defined(PETSC_USE_COMPLEX)
5494f7415efSAmlan Barua       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
5504f7415efSAmlan Barua       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
5514f7415efSAmlan Barua       if (fin) {
5524f7415efSAmlan Barua         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
553778a2246SBarry Smith         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
55426fbe8dcSKarl Rupp 
5554f7415efSAmlan Barua         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
5564f7415efSAmlan Barua       }
5574f7415efSAmlan Barua       if (bout) {
5584f7415efSAmlan Barua         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
559778a2246SBarry Smith         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
56026fbe8dcSKarl Rupp 
5614f7415efSAmlan Barua         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
5624f7415efSAmlan Barua       }
5633564f093SHong Zhang 
56457625b84SAmlan Barua       if (fout) {
56557625b84SAmlan Barua         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
566778a2246SBarry Smith         ierr = VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
56726fbe8dcSKarl Rupp 
56857625b84SAmlan Barua         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
56957625b84SAmlan Barua       }
5704f7415efSAmlan Barua #else
5710c9b18e4SAmlan Barua       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
5720c9b18e4SAmlan Barua       if (fin) {
5730c9b18e4SAmlan Barua         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
574778a2246SBarry Smith         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
57526fbe8dcSKarl Rupp 
5760c9b18e4SAmlan Barua         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
5770c9b18e4SAmlan Barua       }
5780c9b18e4SAmlan Barua       if (fout) {
5790c9b18e4SAmlan Barua         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
580778a2246SBarry Smith         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
58126fbe8dcSKarl Rupp 
5820c9b18e4SAmlan Barua         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
5830c9b18e4SAmlan Barua       }
5840c9b18e4SAmlan Barua       if (bout) {
5850c9b18e4SAmlan Barua         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
586778a2246SBarry Smith         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
58726fbe8dcSKarl Rupp 
5880c9b18e4SAmlan Barua         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
5890c9b18e4SAmlan Barua       }
5904f7415efSAmlan Barua #endif
5914f7415efSAmlan Barua       break;
5924f7415efSAmlan Barua     default:
5934f7415efSAmlan Barua #if !defined(PETSC_USE_COMPLEX)
5944f7415efSAmlan Barua       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
59526fbe8dcSKarl Rupp 
5964f7415efSAmlan Barua       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
59726fbe8dcSKarl Rupp 
5984f7415efSAmlan Barua       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
5994f7415efSAmlan Barua       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
60026fbe8dcSKarl Rupp 
6014f7415efSAmlan Barua       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
6024f7415efSAmlan Barua 
6034f7415efSAmlan Barua       if (fin) {
6044f7415efSAmlan Barua         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
605778a2246SBarry Smith         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
60626fbe8dcSKarl Rupp 
6074f7415efSAmlan Barua         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
6084f7415efSAmlan Barua       }
6094f7415efSAmlan Barua       if (bout) {
6104f7415efSAmlan Barua         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
611778a2246SBarry Smith         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
61226fbe8dcSKarl Rupp 
6139496c9aeSAmlan Barua         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
6144f7415efSAmlan Barua       }
6153564f093SHong Zhang 
61657625b84SAmlan Barua       if (fout) {
61757625b84SAmlan Barua         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
618778a2246SBarry Smith         ierr = VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
61926fbe8dcSKarl Rupp 
62057625b84SAmlan Barua         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
62157625b84SAmlan Barua       }
6224f7415efSAmlan Barua #else
6230c9b18e4SAmlan Barua       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
6240c9b18e4SAmlan Barua       if (fin) {
6250c9b18e4SAmlan Barua         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
626778a2246SBarry Smith         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
62726fbe8dcSKarl Rupp 
6280c9b18e4SAmlan Barua         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
6290c9b18e4SAmlan Barua       }
6300c9b18e4SAmlan Barua       if (fout) {
6310c9b18e4SAmlan Barua         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
632778a2246SBarry Smith         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
63326fbe8dcSKarl Rupp 
6340c9b18e4SAmlan Barua         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
6350c9b18e4SAmlan Barua       }
6360c9b18e4SAmlan Barua       if (bout) {
6370c9b18e4SAmlan Barua         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
638778a2246SBarry Smith         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
63926fbe8dcSKarl Rupp 
6400c9b18e4SAmlan Barua         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
6410c9b18e4SAmlan Barua       }
6424f7415efSAmlan Barua #endif
6434f7415efSAmlan Barua       break;
6444f7415efSAmlan Barua     }
6454f7415efSAmlan Barua   }
6464f7415efSAmlan Barua   PetscFunctionReturn(0);
6474f7415efSAmlan Barua }
6484f7415efSAmlan Barua 
649c92418ccSAmlan Barua #undef __FUNCT__
65074a26884SAmlan Barua #define __FUNCT__ "VecScatterPetscToFFTW"
6513564f093SHong Zhang /*@
6523564f093SHong Zhang    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
65354efbe56SHong Zhang 
6543564f093SHong Zhang    Collective on Mat
6553564f093SHong Zhang 
6563564f093SHong Zhang    Input Parameters:
6573564f093SHong Zhang +  A - FFTW matrix
6583564f093SHong Zhang -  x - the PETSc vector
6593564f093SHong Zhang 
6603564f093SHong Zhang    Output Parameters:
6613564f093SHong Zhang .  y - the FFTW vector
6623564f093SHong Zhang 
663b2b77a04SHong Zhang   Options Database Keys:
6643564f093SHong Zhang . -mat_fftw_plannerflags - set FFTW planner flags
665b2b77a04SHong Zhang 
666b2b77a04SHong Zhang    Level: intermediate
6673564f093SHong Zhang 
66897504071SAmlan Barua    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
66997504071SAmlan Barua          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
67097504071SAmlan Barua          zeros. This routine does that job by scattering operation.
67197504071SAmlan Barua 
6723564f093SHong Zhang .seealso: VecScatterFFTWToPetsc()
6733564f093SHong Zhang @*/
6743564f093SHong Zhang PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
6753564f093SHong Zhang {
6763564f093SHong Zhang   PetscErrorCode ierr;
677b2b77a04SHong Zhang 
6783564f093SHong Zhang   PetscFunctionBegin;
6793564f093SHong Zhang   ierr = PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
6803564f093SHong Zhang   PetscFunctionReturn(0);
6813564f093SHong Zhang }
6823c3a9c75SAmlan Barua 
6833c3a9c75SAmlan Barua #undef __FUNCT__
6841986ecc6SHong Zhang #define __FUNCT__ "VecScatterPetscToFFTW_FFTW"
68574a26884SAmlan Barua PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
6863c3a9c75SAmlan Barua {
6873c3a9c75SAmlan Barua   PetscErrorCode ierr;
688ce94432eSBarry Smith   MPI_Comm       comm;
6893c3a9c75SAmlan Barua   Mat_FFT        *fft  = (Mat_FFT*)A->data;
6903c3a9c75SAmlan Barua   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
6919496c9aeSAmlan Barua   PetscInt       N     =fft->N;
692b5d11533SAmlan Barua   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
6939496c9aeSAmlan Barua   PetscInt       low;
6947a21806cSHong Zhang   PetscMPIInt    rank,size;
6957a21806cSHong Zhang   PetscInt       vsize,vsize1;
6967a21806cSHong Zhang   ptrdiff_t      local_n0,local_0_start;
6979496c9aeSAmlan Barua   ptrdiff_t      local_n1,local_1_start;
6983c3a9c75SAmlan Barua   VecScatter     vecscat;
6993c3a9c75SAmlan Barua   IS             list1,list2;
7009496c9aeSAmlan Barua #if !defined(PETSC_USE_COMPLEX)
7019496c9aeSAmlan Barua   PetscInt       i,j,k,partial_dim;
7029496c9aeSAmlan Barua   PetscInt       *indx1, *indx2, tempindx, tempindx1;
703a31b9140SHong Zhang   PetscInt       NM;
7049496c9aeSAmlan Barua   ptrdiff_t      temp;
7059496c9aeSAmlan Barua #endif
7063c3a9c75SAmlan Barua 
7073564f093SHong Zhang   PetscFunctionBegin;
708ce94432eSBarry Smith   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
709f76f76beSAmlan Barua   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
710f76f76beSAmlan Barua   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
7110298fd71SBarry Smith   ierr = VecGetOwnershipRange(y,&low,NULL);
7123c3a9c75SAmlan Barua 
7133564f093SHong Zhang   if (size==1) {
7148ca4c034SAmlan Barua     ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
7158ca4c034SAmlan Barua     ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
7169496c9aeSAmlan Barua     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
7176971673cSAmlan Barua     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
7186971673cSAmlan Barua     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
7196971673cSAmlan Barua     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
7206971673cSAmlan Barua     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
721b223cc97SAmlan Barua     ierr = ISDestroy(&list1);CHKERRQ(ierr);
7223564f093SHong Zhang   } else {
7233c3a9c75SAmlan Barua     switch (ndim) {
7243c3a9c75SAmlan Barua     case 1:
72564657d84SAmlan Barua #if defined(PETSC_USE_COMPLEX)
7267a21806cSHong Zhang       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
72726fbe8dcSKarl Rupp 
7284f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0,local_0_start,1,&list1);
7294f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0,low,1,&list2);
73064657d84SAmlan Barua       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
73164657d84SAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
73264657d84SAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
73364657d84SAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
73464657d84SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
73564657d84SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
73664657d84SAmlan Barua #else
737e5d7f247SAmlan Barua       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
73864657d84SAmlan Barua #endif
7393c3a9c75SAmlan Barua       break;
7403c3a9c75SAmlan Barua     case 2:
741bd59e6c6SAmlan Barua #if defined(PETSC_USE_COMPLEX)
7427a21806cSHong Zhang       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
74326fbe8dcSKarl Rupp 
7444f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
7454f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
746bd59e6c6SAmlan Barua       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
747bd59e6c6SAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
748bd59e6c6SAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
749bd59e6c6SAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
750bd59e6c6SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
751bd59e6c6SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
752bd59e6c6SAmlan Barua #else
753c3eba89aSHong Zhang       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
75426fbe8dcSKarl Rupp 
755854ce69bSBarry Smith       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
756854ce69bSBarry Smith       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
7573c3a9c75SAmlan Barua 
7583564f093SHong Zhang       if (dim[1]%2==0) {
7593c3a9c75SAmlan Barua         NM = dim[1]+2;
7603564f093SHong Zhang       } else {
7613c3a9c75SAmlan Barua         NM = dim[1]+1;
7623564f093SHong Zhang       }
7633c3a9c75SAmlan Barua       for (i=0; i<local_n0; i++) {
7643c3a9c75SAmlan Barua         for (j=0; j<dim[1]; j++) {
7653c3a9c75SAmlan Barua           tempindx  = i*dim[1] + j;
7663c3a9c75SAmlan Barua           tempindx1 = i*NM + j;
76726fbe8dcSKarl Rupp 
7685b3e41ffSAmlan Barua           indx1[tempindx]=local_0_start*dim[1]+tempindx;
7693c3a9c75SAmlan Barua           indx2[tempindx]=low+tempindx1;
7703c3a9c75SAmlan Barua         }
7713c3a9c75SAmlan Barua       }
7723c3a9c75SAmlan Barua 
7733c3a9c75SAmlan Barua       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
7743c3a9c75SAmlan Barua       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
7753c3a9c75SAmlan Barua 
776f76f76beSAmlan Barua       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
777f76f76beSAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
778f76f76beSAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
779f76f76beSAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
780b223cc97SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
781b223cc97SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
782b223cc97SAmlan Barua       ierr = PetscFree(indx1);CHKERRQ(ierr);
783b223cc97SAmlan Barua       ierr = PetscFree(indx2);CHKERRQ(ierr);
784bd59e6c6SAmlan Barua #endif
7859496c9aeSAmlan Barua       break;
7863c3a9c75SAmlan Barua 
7873c3a9c75SAmlan Barua     case 3:
788bd59e6c6SAmlan Barua #if defined(PETSC_USE_COMPLEX)
7897a21806cSHong Zhang       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
79026fbe8dcSKarl Rupp 
7914f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
7924f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
793bd59e6c6SAmlan Barua       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
794bd59e6c6SAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
795bd59e6c6SAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
796bd59e6c6SAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
797bd59e6c6SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
798bd59e6c6SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
799bd59e6c6SAmlan Barua #else
800f1ade23cSHong Zhang       /* buggy, needs to be fixed. See src/mat/examples/tests/ex158.c */
801f1ade23cSHong Zhang       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
8027a21806cSHong Zhang       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
80326fbe8dcSKarl Rupp 
804854ce69bSBarry Smith       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
805854ce69bSBarry Smith       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
80651d1eed7SAmlan Barua 
807db4deed7SKarl Rupp       if (dim[2]%2==0) NM = dim[2]+2;
808db4deed7SKarl Rupp       else             NM = dim[2]+1;
80951d1eed7SAmlan Barua 
81051d1eed7SAmlan Barua       for (i=0; i<local_n0; i++) {
81151d1eed7SAmlan Barua         for (j=0; j<dim[1]; j++) {
81251d1eed7SAmlan Barua           for (k=0; k<dim[2]; k++) {
81351d1eed7SAmlan Barua             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
81451d1eed7SAmlan Barua             tempindx1 = i*dim[1]*NM + j*NM + k;
81526fbe8dcSKarl Rupp 
81651d1eed7SAmlan Barua             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
81751d1eed7SAmlan Barua             indx2[tempindx]=low+tempindx1;
81851d1eed7SAmlan Barua           }
81951d1eed7SAmlan Barua         }
82051d1eed7SAmlan Barua       }
82151d1eed7SAmlan Barua 
82251d1eed7SAmlan Barua       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
82351d1eed7SAmlan Barua       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
82451d1eed7SAmlan Barua       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
82551d1eed7SAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
82651d1eed7SAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
82751d1eed7SAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
828b223cc97SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
829b223cc97SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
830b223cc97SAmlan Barua       ierr = PetscFree(indx1);CHKERRQ(ierr);
831b223cc97SAmlan Barua       ierr = PetscFree(indx2);CHKERRQ(ierr);
832bd59e6c6SAmlan Barua #endif
8339496c9aeSAmlan Barua       break;
8343c3a9c75SAmlan Barua 
8353c3a9c75SAmlan Barua     default:
836bd59e6c6SAmlan Barua #if defined(PETSC_USE_COMPLEX)
8377a21806cSHong Zhang       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
83826fbe8dcSKarl Rupp 
8394f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
8404f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
841bd59e6c6SAmlan Barua       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
842bd59e6c6SAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
843bd59e6c6SAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
844bd59e6c6SAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
845bd59e6c6SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
846bd59e6c6SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
847bd59e6c6SAmlan Barua #else
848f1ade23cSHong Zhang       /* buggy, needs to be fixed. See src/mat/examples/tests/ex158.c */
849f1ade23cSHong Zhang       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
850e5d7f247SAmlan Barua       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
85126fbe8dcSKarl Rupp 
852e5d7f247SAmlan Barua       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
85326fbe8dcSKarl Rupp 
8547a21806cSHong Zhang       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
85526fbe8dcSKarl Rupp 
856e5d7f247SAmlan Barua       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
857e5d7f247SAmlan Barua 
858e5d7f247SAmlan Barua       partial_dim = fftw->partial_dim;
859e5d7f247SAmlan Barua 
860854ce69bSBarry Smith       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
861854ce69bSBarry Smith       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
862e5d7f247SAmlan Barua 
863db4deed7SKarl Rupp       if (dim[ndim-1]%2==0) NM = 2;
864db4deed7SKarl Rupp       else                  NM = 1;
865e5d7f247SAmlan Barua 
8666971673cSAmlan Barua       j = low;
8673564f093SHong Zhang       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
8686971673cSAmlan Barua         indx1[i] = local_0_start*partial_dim + i;
8696971673cSAmlan Barua         indx2[i] = j;
87026fbe8dcSKarl Rupp         if (k%dim[ndim-1]==0) j+=NM;
8716971673cSAmlan Barua         j++;
8726971673cSAmlan Barua       }
8736971673cSAmlan Barua       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
8746971673cSAmlan Barua       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
8756971673cSAmlan Barua       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
8766971673cSAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
8776971673cSAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
8786971673cSAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
879b223cc97SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
880b223cc97SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
881b223cc97SAmlan Barua       ierr = PetscFree(indx1);CHKERRQ(ierr);
882b223cc97SAmlan Barua       ierr = PetscFree(indx2);CHKERRQ(ierr);
883bd59e6c6SAmlan Barua #endif
8849496c9aeSAmlan Barua       break;
8853c3a9c75SAmlan Barua     }
886e81bb599SAmlan Barua   }
8873564f093SHong Zhang   PetscFunctionReturn(0);
8883c3a9c75SAmlan Barua }
8893c3a9c75SAmlan Barua 
8903c3a9c75SAmlan Barua #undef __FUNCT__
89174a26884SAmlan Barua #define __FUNCT__ "VecScatterFFTWToPetsc"
8923564f093SHong Zhang /*@
8933564f093SHong Zhang    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
8943564f093SHong Zhang 
8953564f093SHong Zhang    Collective on Mat
8963564f093SHong Zhang 
8973564f093SHong Zhang     Input Parameters:
8983564f093SHong Zhang +   A - FFTW matrix
8993564f093SHong Zhang -   x - FFTW vector
9003564f093SHong Zhang 
9013564f093SHong Zhang    Output Parameters:
9023564f093SHong Zhang .  y - PETSc vector
9033564f093SHong Zhang 
9043564f093SHong Zhang    Level: intermediate
9053564f093SHong Zhang 
9063564f093SHong Zhang    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
9073564f093SHong Zhang          VecScatterFFTWToPetsc removes those extra zeros.
9083564f093SHong Zhang 
9093564f093SHong Zhang .seealso: VecScatterPetscToFFTW()
9103564f093SHong Zhang @*/
91174a26884SAmlan Barua PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
9123c3a9c75SAmlan Barua {
9133c3a9c75SAmlan Barua   PetscErrorCode ierr;
9145fd66863SKarl Rupp 
9153c3a9c75SAmlan Barua   PetscFunctionBegin;
91674a26884SAmlan Barua   ierr = PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
9173c3a9c75SAmlan Barua   PetscFunctionReturn(0);
9183c3a9c75SAmlan Barua }
91954efbe56SHong Zhang 
9203c3a9c75SAmlan Barua #undef __FUNCT__
92174a26884SAmlan Barua #define __FUNCT__ "VecScatterFFTWToPetsc_FFTW"
92274a26884SAmlan Barua PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
9235b3e41ffSAmlan Barua {
9245b3e41ffSAmlan Barua   PetscErrorCode ierr;
925ce94432eSBarry Smith   MPI_Comm       comm;
9265b3e41ffSAmlan Barua   Mat_FFT        *fft  = (Mat_FFT*)A->data;
9275b3e41ffSAmlan Barua   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
9289496c9aeSAmlan Barua   PetscInt       N     = fft->N;
929b3655f67SAmlan Barua   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
9309496c9aeSAmlan Barua   PetscInt       low;
9317a21806cSHong Zhang   PetscMPIInt    rank,size;
9327a21806cSHong Zhang   ptrdiff_t      local_n0,local_0_start;
9339496c9aeSAmlan Barua   ptrdiff_t      local_n1,local_1_start;
9345b3e41ffSAmlan Barua   VecScatter     vecscat;
9355b3e41ffSAmlan Barua   IS             list1,list2;
9369496c9aeSAmlan Barua #if !defined(PETSC_USE_COMPLEX)
9379496c9aeSAmlan Barua   PetscInt       i,j,k,partial_dim;
9389496c9aeSAmlan Barua   PetscInt       *indx1, *indx2, tempindx, tempindx1;
939a31b9140SHong Zhang   PetscInt       NM;
9409496c9aeSAmlan Barua   ptrdiff_t      temp;
9419496c9aeSAmlan Barua #endif
9429496c9aeSAmlan Barua 
9433564f093SHong Zhang   PetscFunctionBegin;
944ce94432eSBarry Smith   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
9455b3e41ffSAmlan Barua   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
9465b3e41ffSAmlan Barua   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
9470298fd71SBarry Smith   ierr = VecGetOwnershipRange(x,&low,NULL);CHKERRQ(ierr);
9485b3e41ffSAmlan Barua 
949e81bb599SAmlan Barua   if (size==1) {
950b3655f67SAmlan Barua     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
9516971673cSAmlan Barua     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
9526971673cSAmlan Barua     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
9536971673cSAmlan Barua     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
9546971673cSAmlan Barua     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
955b223cc97SAmlan Barua     ierr = ISDestroy(&list1);CHKERRQ(ierr);
956e81bb599SAmlan Barua 
9573564f093SHong Zhang   } else {
958e81bb599SAmlan Barua     switch (ndim) {
959e81bb599SAmlan Barua     case 1:
96064657d84SAmlan Barua #if defined(PETSC_USE_COMPLEX)
9617a21806cSHong Zhang       fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
96226fbe8dcSKarl Rupp 
9634f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n1,local_1_start,1,&list1);
9644f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n1,low,1,&list2);
96564657d84SAmlan Barua       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
96664657d84SAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
96764657d84SAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
96864657d84SAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
96964657d84SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
97064657d84SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
97164657d84SAmlan Barua #else
9726a506ed8SAmlan Barua       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
97364657d84SAmlan Barua #endif
9745b3e41ffSAmlan Barua       break;
9755b3e41ffSAmlan Barua     case 2:
976bd59e6c6SAmlan Barua #if defined(PETSC_USE_COMPLEX)
9777a21806cSHong Zhang       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
97826fbe8dcSKarl Rupp 
9794f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
9804f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
981bd59e6c6SAmlan Barua       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
982bd59e6c6SAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
983bd59e6c6SAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
984bd59e6c6SAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
985bd59e6c6SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
986bd59e6c6SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
987bd59e6c6SAmlan Barua #else
9887a21806cSHong Zhang       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
98926fbe8dcSKarl Rupp 
990854ce69bSBarry Smith       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
991854ce69bSBarry Smith       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
9925b3e41ffSAmlan Barua 
993db4deed7SKarl Rupp       if (dim[1]%2==0) NM = dim[1]+2;
994db4deed7SKarl Rupp       else             NM = dim[1]+1;
9955b3e41ffSAmlan Barua 
9965b3e41ffSAmlan Barua       for (i=0; i<local_n0; i++) {
9975b3e41ffSAmlan Barua         for (j=0; j<dim[1]; j++) {
9985b3e41ffSAmlan Barua           tempindx = i*dim[1] + j;
9995b3e41ffSAmlan Barua           tempindx1 = i*NM + j;
100026fbe8dcSKarl Rupp 
10015b3e41ffSAmlan Barua           indx1[tempindx]=local_0_start*dim[1]+tempindx;
10025b3e41ffSAmlan Barua           indx2[tempindx]=low+tempindx1;
10035b3e41ffSAmlan Barua         }
10045b3e41ffSAmlan Barua       }
10055b3e41ffSAmlan Barua 
10065b3e41ffSAmlan Barua       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
10075b3e41ffSAmlan Barua       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
10085b3e41ffSAmlan Barua 
10095b3e41ffSAmlan Barua       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
10105b3e41ffSAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
10115b3e41ffSAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
10125b3e41ffSAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1013b223cc97SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1014b223cc97SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1015b223cc97SAmlan Barua       ierr = PetscFree(indx1);CHKERRQ(ierr);
1016b223cc97SAmlan Barua       ierr = PetscFree(indx2);CHKERRQ(ierr);
1017bd59e6c6SAmlan Barua #endif
10189496c9aeSAmlan Barua       break;
10195b3e41ffSAmlan Barua     case 3:
1020bd59e6c6SAmlan Barua #if defined(PETSC_USE_COMPLEX)
10217a21806cSHong Zhang       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
102226fbe8dcSKarl Rupp 
10234f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
10244f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
1025bd59e6c6SAmlan Barua       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1026bd59e6c6SAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1027bd59e6c6SAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1028bd59e6c6SAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1029bd59e6c6SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1030bd59e6c6SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1031bd59e6c6SAmlan Barua #else
10327a21806cSHong Zhang       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
103326fbe8dcSKarl Rupp 
1034854ce69bSBarry Smith       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
1035854ce69bSBarry Smith       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
103651d1eed7SAmlan Barua 
1037db4deed7SKarl Rupp       if (dim[2]%2==0) NM = dim[2]+2;
1038db4deed7SKarl Rupp       else             NM = dim[2]+1;
103951d1eed7SAmlan Barua 
104051d1eed7SAmlan Barua       for (i=0; i<local_n0; i++) {
104151d1eed7SAmlan Barua         for (j=0; j<dim[1]; j++) {
104251d1eed7SAmlan Barua           for (k=0; k<dim[2]; k++) {
104351d1eed7SAmlan Barua             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
104451d1eed7SAmlan Barua             tempindx1 = i*dim[1]*NM + j*NM + k;
104526fbe8dcSKarl Rupp 
104651d1eed7SAmlan Barua             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
104751d1eed7SAmlan Barua             indx2[tempindx]=low+tempindx1;
104851d1eed7SAmlan Barua           }
104951d1eed7SAmlan Barua         }
105051d1eed7SAmlan Barua       }
105151d1eed7SAmlan Barua 
105251d1eed7SAmlan Barua       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
105351d1eed7SAmlan Barua       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
105451d1eed7SAmlan Barua 
105551d1eed7SAmlan Barua       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
105651d1eed7SAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
105751d1eed7SAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
105851d1eed7SAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1059b223cc97SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1060b223cc97SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1061b223cc97SAmlan Barua       ierr = PetscFree(indx1);CHKERRQ(ierr);
1062b223cc97SAmlan Barua       ierr = PetscFree(indx2);CHKERRQ(ierr);
1063bd59e6c6SAmlan Barua #endif
10649496c9aeSAmlan Barua       break;
10655b3e41ffSAmlan Barua     default:
1066bd59e6c6SAmlan Barua #if defined(PETSC_USE_COMPLEX)
10677a21806cSHong Zhang       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
106826fbe8dcSKarl Rupp 
10694f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
10704f8276c3SHong Zhang       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1071bd59e6c6SAmlan Barua       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1072bd59e6c6SAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1073bd59e6c6SAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1074bd59e6c6SAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1075bd59e6c6SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1076bd59e6c6SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1077bd59e6c6SAmlan Barua #else
1078ba6e06d0SAmlan Barua       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
107926fbe8dcSKarl Rupp 
1080ba6e06d0SAmlan Barua       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
108126fbe8dcSKarl Rupp 
1082c3eba89aSHong Zhang       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
108326fbe8dcSKarl Rupp 
1084ba6e06d0SAmlan Barua       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1085ba6e06d0SAmlan Barua 
1086ba6e06d0SAmlan Barua       partial_dim = fftw->partial_dim;
1087ba6e06d0SAmlan Barua 
1088854ce69bSBarry Smith       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
1089854ce69bSBarry Smith       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
1090ba6e06d0SAmlan Barua 
1091db4deed7SKarl Rupp       if (dim[ndim-1]%2==0) NM = 2;
1092db4deed7SKarl Rupp       else                  NM = 1;
1093ba6e06d0SAmlan Barua 
1094ba6e06d0SAmlan Barua       j = low;
10953564f093SHong Zhang       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1096ba6e06d0SAmlan Barua         indx1[i] = local_0_start*partial_dim + i;
1097ba6e06d0SAmlan Barua         indx2[i] = j;
10983564f093SHong Zhang         if (k%dim[ndim-1]==0) j+=NM;
1099ba6e06d0SAmlan Barua         j++;
1100ba6e06d0SAmlan Barua       }
1101ba6e06d0SAmlan Barua       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1102ba6e06d0SAmlan Barua       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1103ba6e06d0SAmlan Barua 
1104ba6e06d0SAmlan Barua       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1105ba6e06d0SAmlan Barua       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1106ba6e06d0SAmlan Barua       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1107ba6e06d0SAmlan Barua       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1108b223cc97SAmlan Barua       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1109b223cc97SAmlan Barua       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1110b223cc97SAmlan Barua       ierr = PetscFree(indx1);CHKERRQ(ierr);
1111b223cc97SAmlan Barua       ierr = PetscFree(indx2);CHKERRQ(ierr);
1112bd59e6c6SAmlan Barua #endif
11139496c9aeSAmlan Barua       break;
11145b3e41ffSAmlan Barua     }
1115e81bb599SAmlan Barua   }
11163564f093SHong Zhang   PetscFunctionReturn(0);
11175b3e41ffSAmlan Barua }
11185b3e41ffSAmlan Barua 
11195b3e41ffSAmlan Barua #undef __FUNCT__
11203c3a9c75SAmlan Barua #define __FUNCT__ "MatCreate_FFTW"
11213c3a9c75SAmlan Barua /*
11223564f093SHong Zhang     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
11233564f093SHong Zhang 
11243c3a9c75SAmlan Barua   Options Database Keys:
11253c3a9c75SAmlan Barua + -mat_fftw_plannerflags - set FFTW planner flags
11263c3a9c75SAmlan Barua 
11273c3a9c75SAmlan Barua    Level: intermediate
11283c3a9c75SAmlan Barua 
11293c3a9c75SAmlan Barua */
11308cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1131b2b77a04SHong Zhang {
1132b2b77a04SHong Zhang   PetscErrorCode ierr;
1133ce94432eSBarry Smith   MPI_Comm       comm;
1134b2b77a04SHong Zhang   Mat_FFT        *fft=(Mat_FFT*)A->data;
1135b2b77a04SHong Zhang   Mat_FFTW       *fftw;
1136b2b77a04SHong Zhang   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim=fft->dim;
11375274ed1bSHong Zhang   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
11385274ed1bSHong Zhang   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1139b2b77a04SHong Zhang   PetscBool      flg;
11404f48bc67SAmlan Barua   PetscInt       p_flag,partial_dim=1,ctr;
114111902ff2SHong Zhang   PetscMPIInt    size,rank;
11429496c9aeSAmlan Barua   ptrdiff_t      *pdim;
11439496c9aeSAmlan Barua   ptrdiff_t      local_n1,local_1_start;
11449496c9aeSAmlan Barua #if !defined(PETSC_USE_COMPLEX)
11459496c9aeSAmlan Barua   ptrdiff_t      temp;
11468ca4c034SAmlan Barua   PetscInt       N1,tot_dim;
11474f48bc67SAmlan Barua #else
11484f48bc67SAmlan Barua   PetscInt       n1;
11499496c9aeSAmlan Barua #endif
11509496c9aeSAmlan Barua 
1151b2b77a04SHong Zhang   PetscFunctionBegin;
1152ce94432eSBarry Smith   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1153b2b77a04SHong Zhang   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
115411902ff2SHong Zhang   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1155c92418ccSAmlan Barua 
11561878d478SAmlan Barua   fftw_mpi_init();
115711902ff2SHong Zhang   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
115811902ff2SHong Zhang   pdim[0] = dim[0];
11598ca4c034SAmlan Barua #if !defined(PETSC_USE_COMPLEX)
11608ca4c034SAmlan Barua   tot_dim = 2*dim[0];
11618ca4c034SAmlan Barua #endif
11623564f093SHong Zhang   for (ctr=1; ctr<ndim; ctr++) {
11636882372aSHong Zhang     partial_dim *= dim[ctr];
116411902ff2SHong Zhang     pdim[ctr]    = dim[ctr];
11658ca4c034SAmlan Barua #if !defined(PETSC_USE_COMPLEX)
1166db4deed7SKarl Rupp     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1167db4deed7SKarl Rupp     else             tot_dim *= dim[ctr];
11688ca4c034SAmlan Barua #endif
11696882372aSHong Zhang   }
11703c3a9c75SAmlan Barua 
1171b2b77a04SHong Zhang   if (size == 1) {
1172e81bb599SAmlan Barua #if defined(PETSC_USE_COMPLEX)
1173b2b77a04SHong Zhang     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1174b2b77a04SHong Zhang     n    = N;
1175e81bb599SAmlan Barua #else
1176e81bb599SAmlan Barua     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1177e81bb599SAmlan Barua     n    = tot_dim;
1178e81bb599SAmlan Barua #endif
1179e81bb599SAmlan Barua 
1180b2b77a04SHong Zhang   } else {
11817a21806cSHong Zhang     ptrdiff_t local_n0,local_0_start;
1182b2b77a04SHong Zhang     switch (ndim) {
1183b2b77a04SHong Zhang     case 1:
11843c3a9c75SAmlan Barua #if !defined(PETSC_USE_COMPLEX)
11853c3a9c75SAmlan Barua       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1186e5d7f247SAmlan Barua #else
11877a21806cSHong Zhang       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
118826fbe8dcSKarl Rupp 
11896882372aSHong Zhang       n    = (PetscInt)local_n0;
11909496c9aeSAmlan Barua       n1   = (PetscInt)local_n1;
11919496c9aeSAmlan Barua       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1192e5d7f247SAmlan Barua #endif
1193b2b77a04SHong Zhang       break;
1194b2b77a04SHong Zhang     case 2:
11955b3e41ffSAmlan Barua #if defined(PETSC_USE_COMPLEX)
11967a21806cSHong Zhang       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1197b2b77a04SHong Zhang       /*
1198b2b77a04SHong Zhang        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
11990ec8b6e3SBarry Smith        PetscSynchronizedFlush(comm,PETSC_STDOUT);
1200b2b77a04SHong Zhang        */
1201b2b77a04SHong Zhang       n    = (PetscInt)local_n0*dim[1];
1202b2b77a04SHong Zhang       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
12035b3e41ffSAmlan Barua #else
1204c3eba89aSHong Zhang       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
120526fbe8dcSKarl Rupp 
12065b3e41ffSAmlan Barua       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1207c8a8a4f0SAmlan Barua       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
12085b3e41ffSAmlan Barua #endif
1209b2b77a04SHong Zhang       break;
1210b2b77a04SHong Zhang     case 3:
121151d1eed7SAmlan Barua #if defined(PETSC_USE_COMPLEX)
12127a21806cSHong Zhang       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
121326fbe8dcSKarl Rupp 
12146882372aSHong Zhang       n    = (PetscInt)local_n0*dim[1]*dim[2];
12156882372aSHong Zhang       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
121651d1eed7SAmlan Barua #else
1217c3eba89aSHong Zhang       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
121826fbe8dcSKarl Rupp 
121951d1eed7SAmlan Barua       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1220c8a8a4f0SAmlan Barua       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
122151d1eed7SAmlan Barua #endif
1222b2b77a04SHong Zhang       break;
1223b2b77a04SHong Zhang     default:
1224b3a17365SAmlan Barua #if defined(PETSC_USE_COMPLEX)
12257a21806cSHong Zhang       fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
122626fbe8dcSKarl Rupp 
12276882372aSHong Zhang       n    = (PetscInt)local_n0*partial_dim;
12286882372aSHong Zhang       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1229b3a17365SAmlan Barua #else
1230b3a17365SAmlan Barua       temp = pdim[ndim-1];
123126fbe8dcSKarl Rupp 
1232b3a17365SAmlan Barua       pdim[ndim-1] = temp/2 + 1;
123326fbe8dcSKarl Rupp 
1234c3eba89aSHong Zhang       fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
123526fbe8dcSKarl Rupp 
1236b3a17365SAmlan Barua       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1237b3a17365SAmlan Barua       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
123826fbe8dcSKarl Rupp 
1239b3a17365SAmlan Barua       pdim[ndim-1] = temp;
124026fbe8dcSKarl Rupp 
1241c8a8a4f0SAmlan Barua       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1242b3a17365SAmlan Barua #endif
1243b2b77a04SHong Zhang       break;
1244b2b77a04SHong Zhang     }
1245b2b77a04SHong Zhang   }
1246b2b77a04SHong Zhang   ierr      = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1247b00a9115SJed Brown   ierr      = PetscNewLog(A,&fftw);CHKERRQ(ierr);
1248b2b77a04SHong Zhang   fft->data = (void*)fftw;
1249b2b77a04SHong Zhang 
1250b2b77a04SHong Zhang   fft->n            = n;
12510c74a584SJed Brown   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1252e5d7f247SAmlan Barua   fftw->partial_dim = partial_dim;
125326fbe8dcSKarl Rupp 
12545e806cb5SMatthew G. Knepley   ierr = PetscMalloc1(ndim, &(fftw->dim_fftw));CHKERRQ(ierr);
12558c1d5ab3SHong Zhang   if (size == 1) {
1256a1b6d50cSHong Zhang #if defined(PETSC_USE_64BIT_INDICES)
1257a1b6d50cSHong Zhang     fftw->iodims = (fftw_iodim64 *) malloc(sizeof(fftw_iodim64) * ndim);
1258a1b6d50cSHong Zhang #else
12598c1d5ab3SHong Zhang     fftw->iodims = (fftw_iodim *) malloc(sizeof(fftw_iodim) * ndim);
1260a1b6d50cSHong Zhang #endif
12618c1d5ab3SHong Zhang   }
126226fbe8dcSKarl Rupp 
1263b1301b2eSHong Zhang   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1264c92418ccSAmlan Barua 
1265b2b77a04SHong Zhang   fftw->p_forward  = 0;
1266b2b77a04SHong Zhang   fftw->p_backward = 0;
1267b2b77a04SHong Zhang   fftw->p_flag     = FFTW_ESTIMATE;
1268b2b77a04SHong Zhang 
1269b2b77a04SHong Zhang   if (size == 1) {
1270b2b77a04SHong Zhang     A->ops->mult          = MatMult_SeqFFTW;
1271b2b77a04SHong Zhang     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1272b2b77a04SHong Zhang   } else {
1273b2b77a04SHong Zhang     A->ops->mult          = MatMult_MPIFFTW;
1274b2b77a04SHong Zhang     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1275b2b77a04SHong Zhang   }
1276b2b77a04SHong Zhang   fft->matdestroy = MatDestroy_FFTW;
1277b2b77a04SHong Zhang   A->assembled    = PETSC_TRUE;
12784a52bad8SHong Zhang   A->preallocated = PETSC_TRUE;
127926fbe8dcSKarl Rupp 
12802a7a6963SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",MatCreateVecsFFTW_FFTW);CHKERRQ(ierr);
1281bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);CHKERRQ(ierr);
1282bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);CHKERRQ(ierr);
1283b2b77a04SHong Zhang 
1284b2b77a04SHong Zhang   /* get runtime options */
1285ce94432eSBarry Smith   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
12865274ed1bSHong Zhang   ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg);CHKERRQ(ierr);
12875274ed1bSHong Zhang   if (flg) {
12885274ed1bSHong Zhang     fftw->p_flag = iplans[p_flag];
12895274ed1bSHong Zhang   }
12904a52bad8SHong Zhang   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1291b2b77a04SHong Zhang   PetscFunctionReturn(0);
1292b2b77a04SHong Zhang }
12933c3a9c75SAmlan Barua 
12943c3a9c75SAmlan Barua 
12953c3a9c75SAmlan Barua 
12963c3a9c75SAmlan Barua 
1297