xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 9371c9d470a9602b6d10a8bf50c9b2280a79e45a)
1b2b77a04SHong Zhang 
2b2b77a04SHong Zhang /*
3b2b77a04SHong Zhang     Provides an interface to the FFTW package.
4c4762a1bSJed Brown     Testing examples can be found in ~src/mat/tests
5b2b77a04SHong Zhang */
6b2b77a04SHong Zhang 
7c6db04a5SJed Brown #include <../src/mat/impls/fft/fft.h> /*I "petscmat.h" I*/
8b2b77a04SHong Zhang EXTERN_C_BEGIN
90cf2e031SBarry Smith #if !PetscDefined(HAVE_MPIUNI)
10c6db04a5SJed Brown #include <fftw3-mpi.h>
110cf2e031SBarry Smith #else
120cf2e031SBarry Smith #include <fftw3.h>
130cf2e031SBarry Smith #endif
14b2b77a04SHong Zhang EXTERN_C_END
15b2b77a04SHong Zhang 
16b2b77a04SHong Zhang typedef struct {
17b9ae062cSHong Zhang   ptrdiff_t ndim_fftw, *dim_fftw;
18a1b6d50cSHong Zhang #if defined(PETSC_USE_64BIT_INDICES)
19a1b6d50cSHong Zhang   fftw_iodim64 *iodims;
20a1b6d50cSHong Zhang #else
218c1d5ab3SHong Zhang   fftw_iodim *iodims;
22a1b6d50cSHong Zhang #endif
23e5d7f247SAmlan Barua   PetscInt     partial_dim;
24b2b77a04SHong Zhang   fftw_plan    p_forward, p_backward;
25b2b77a04SHong Zhang   unsigned     p_flag;                                      /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
26b2b77a04SHong Zhang   PetscScalar *finarray, *foutarray, *binarray, *boutarray; /* keep track of arrays becaue fftw plan should be
27b2b77a04SHong Zhang                                                             executed for the arrays with which the plan was created */
28b2b77a04SHong Zhang } Mat_FFTW;
29b2b77a04SHong Zhang 
30b2b77a04SHong Zhang extern PetscErrorCode MatMult_SeqFFTW(Mat, Vec, Vec);
31b2b77a04SHong Zhang extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat, Vec, Vec);
320cf2e031SBarry Smith extern PetscErrorCode MatDestroy_FFTW(Mat);
330cf2e031SBarry Smith extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat, Vec *, Vec *, Vec *);
340cf2e031SBarry Smith #if !PetscDefined(HAVE_MPIUNI)
35b2b77a04SHong Zhang extern PetscErrorCode MatMult_MPIFFTW(Mat, Vec, Vec);
36b2b77a04SHong Zhang extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat, Vec, Vec);
37b2b77a04SHong Zhang extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
380cf2e031SBarry Smith #endif
39b2b77a04SHong Zhang 
400cf2e031SBarry Smith /*
410cf2e031SBarry Smith    MatMult_SeqFFTW performs forward DFT
4297504071SAmlan Barua    Input parameter:
433564f093SHong Zhang      A - the matrix
4497504071SAmlan Barua      x - the vector on which FDFT will be performed
4597504071SAmlan Barua 
4697504071SAmlan Barua    Output parameter:
4797504071SAmlan Barua      y - vector that stores result of FDFT
4897504071SAmlan Barua */
49*9371c9d4SSatish Balay PetscErrorCode MatMult_SeqFFTW(Mat A, Vec x, Vec y) {
50b2b77a04SHong Zhang   Mat_FFT           *fft  = (Mat_FFT *)A->data;
51b2b77a04SHong Zhang   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
52f4fca6d4SMatthew G. Knepley   const PetscScalar *x_array;
53f4fca6d4SMatthew G. Knepley   PetscScalar       *y_array;
541acd80e4SHong Zhang #if defined(PETSC_USE_COMPLEX)
55a1b6d50cSHong Zhang #if defined(PETSC_USE_64BIT_INDICES)
56a1b6d50cSHong Zhang   fftw_iodim64 *iodims;
57a1b6d50cSHong Zhang #else
588c1d5ab3SHong Zhang   fftw_iodim *iodims;
59a1b6d50cSHong Zhang #endif
601acd80e4SHong Zhang   PetscInt i;
611acd80e4SHong Zhang #endif
621acd80e4SHong Zhang   PetscInt ndim = fft->ndim, *dim = fft->dim;
63b2b77a04SHong Zhang 
64b2b77a04SHong Zhang   PetscFunctionBegin;
659566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(x, &x_array));
669566063dSJacob Faibussowitsch   PetscCall(VecGetArray(y, &y_array));
676aad120cSJose E. Roman   if (!fftw->p_forward) { /* create a plan, then execute it */
68b2b77a04SHong Zhang     switch (ndim) {
69b2b77a04SHong Zhang     case 1:
7058a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
71b2b77a04SHong Zhang       fftw->p_forward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
7258a912c5SAmlan Barua #else
7358a912c5SAmlan Barua       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
7458a912c5SAmlan Barua #endif
75b2b77a04SHong Zhang       break;
76b2b77a04SHong Zhang     case 2:
7758a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
78b2b77a04SHong Zhang       fftw->p_forward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
7958a912c5SAmlan Barua #else
8058a912c5SAmlan Barua       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
8158a912c5SAmlan Barua #endif
82b2b77a04SHong Zhang       break;
83b2b77a04SHong Zhang     case 3:
8458a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
85b2b77a04SHong Zhang       fftw->p_forward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
8658a912c5SAmlan Barua #else
8758a912c5SAmlan Barua       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
8858a912c5SAmlan Barua #endif
89b2b77a04SHong Zhang       break;
90b2b77a04SHong Zhang     default:
9158a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
92a1b6d50cSHong Zhang       iodims = fftw->iodims;
93a1b6d50cSHong Zhang #if defined(PETSC_USE_64BIT_INDICES)
948c1d5ab3SHong Zhang       if (ndim) {
95a1b6d50cSHong Zhang         iodims[ndim - 1].n  = (ptrdiff_t)dim[ndim - 1];
968c1d5ab3SHong Zhang         iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
978c1d5ab3SHong Zhang         for (i = ndim - 2; i >= 0; --i) {
98a1b6d50cSHong Zhang           iodims[i].n  = (ptrdiff_t)dim[i];
998c1d5ab3SHong Zhang           iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
1008c1d5ab3SHong Zhang         }
1018c1d5ab3SHong Zhang       }
102a1b6d50cSHong Zhang       fftw->p_forward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
103a1b6d50cSHong Zhang #else
104a1b6d50cSHong Zhang       if (ndim) {
105a1b6d50cSHong Zhang         iodims[ndim - 1].n  = (int)dim[ndim - 1];
106a1b6d50cSHong Zhang         iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
107a1b6d50cSHong Zhang         for (i = ndim - 2; i >= 0; --i) {
108a1b6d50cSHong Zhang           iodims[i].n  = (int)dim[i];
109a1b6d50cSHong Zhang           iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
110a1b6d50cSHong Zhang         }
111a1b6d50cSHong Zhang       }
112a1b6d50cSHong Zhang       fftw->p_forward = fftw_plan_guru_dft((int)ndim, (fftw_iodim *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
113a1b6d50cSHong Zhang #endif
114a1b6d50cSHong Zhang 
11558a912c5SAmlan Barua #else
116a31b9140SHong Zhang       fftw->p_forward = fftw_plan_dft_r2c(ndim, (int *)dim, (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
11758a912c5SAmlan Barua #endif
118b2b77a04SHong Zhang       break;
119b2b77a04SHong Zhang     }
120f4fca6d4SMatthew G. Knepley     fftw->finarray  = (PetscScalar *)x_array;
121b2b77a04SHong Zhang     fftw->foutarray = y_array;
122b2b77a04SHong Zhang     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
123b2b77a04SHong Zhang                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
124b2b77a04SHong Zhang     fftw_execute(fftw->p_forward);
125b2b77a04SHong Zhang   } else {                                                         /* use existing plan */
126b2b77a04SHong Zhang     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
1277e4bc134SDominic Meiser #if defined(PETSC_USE_COMPLEX)
128b2b77a04SHong Zhang       fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
1297e4bc134SDominic Meiser #else
1307e4bc134SDominic Meiser       fftw_execute_dft_r2c(fftw->p_forward, (double *)x_array, (fftw_complex *)y_array);
1317e4bc134SDominic Meiser #endif
132b2b77a04SHong Zhang     } else {
133b2b77a04SHong Zhang       fftw_execute(fftw->p_forward);
134b2b77a04SHong Zhang     }
135b2b77a04SHong Zhang   }
1369566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(y, &y_array));
1379566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(x, &x_array));
138b2b77a04SHong Zhang   PetscFunctionReturn(0);
139b2b77a04SHong Zhang }
140b2b77a04SHong Zhang 
14197504071SAmlan Barua /* MatMultTranspose_SeqFFTW performs serial backward DFT
14297504071SAmlan Barua    Input parameter:
1433564f093SHong Zhang      A - the matrix
14497504071SAmlan Barua      x - the vector on which BDFT will be performed
14597504071SAmlan Barua 
14697504071SAmlan Barua    Output parameter:
14797504071SAmlan Barua      y - vector that stores result of BDFT
14897504071SAmlan Barua */
14997504071SAmlan Barua 
150*9371c9d4SSatish Balay PetscErrorCode MatMultTranspose_SeqFFTW(Mat A, Vec x, Vec y) {
151b2b77a04SHong Zhang   Mat_FFT           *fft  = (Mat_FFT *)A->data;
152b2b77a04SHong Zhang   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
153f4fca6d4SMatthew G. Knepley   const PetscScalar *x_array;
154f4fca6d4SMatthew G. Knepley   PetscScalar       *y_array;
155b2b77a04SHong Zhang   PetscInt           ndim = fft->ndim, *dim = fft->dim;
1561acd80e4SHong Zhang #if defined(PETSC_USE_COMPLEX)
157a1b6d50cSHong Zhang #if defined(PETSC_USE_64BIT_INDICES)
158a1b6d50cSHong Zhang   fftw_iodim64 *iodims = fftw->iodims;
159a1b6d50cSHong Zhang #else
160a1b6d50cSHong Zhang   fftw_iodim *iodims = fftw->iodims;
161a1b6d50cSHong Zhang #endif
1621acd80e4SHong Zhang #endif
163b2b77a04SHong Zhang 
164b2b77a04SHong Zhang   PetscFunctionBegin;
1659566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(x, &x_array));
1669566063dSJacob Faibussowitsch   PetscCall(VecGetArray(y, &y_array));
1676aad120cSJose E. Roman   if (!fftw->p_backward) { /* create a plan, then execute it */
168b2b77a04SHong Zhang     switch (ndim) {
169b2b77a04SHong Zhang     case 1:
17058a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
171b2b77a04SHong Zhang       fftw->p_backward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
17258a912c5SAmlan Barua #else
173e81bb599SAmlan Barua       fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
17458a912c5SAmlan Barua #endif
175b2b77a04SHong Zhang       break;
176b2b77a04SHong Zhang     case 2:
17758a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
178b2b77a04SHong Zhang       fftw->p_backward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
17958a912c5SAmlan Barua #else
180e81bb599SAmlan Barua       fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
18158a912c5SAmlan Barua #endif
182b2b77a04SHong Zhang       break;
183b2b77a04SHong Zhang     case 3:
18458a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
185b2b77a04SHong Zhang       fftw->p_backward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
18658a912c5SAmlan Barua #else
187e81bb599SAmlan Barua       fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
18858a912c5SAmlan Barua #endif
189b2b77a04SHong Zhang       break;
190b2b77a04SHong Zhang     default:
19158a912c5SAmlan Barua #if defined(PETSC_USE_COMPLEX)
192a1b6d50cSHong Zhang #if defined(PETSC_USE_64BIT_INDICES)
193a1b6d50cSHong Zhang       fftw->p_backward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
194a1b6d50cSHong Zhang #else
1958c1d5ab3SHong Zhang       fftw->p_backward = fftw_plan_guru_dft((int)ndim, iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
196a1b6d50cSHong Zhang #endif
19758a912c5SAmlan Barua #else
198a31b9140SHong Zhang       fftw->p_backward = fftw_plan_dft_c2r((int)ndim, (int *)dim, (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
19958a912c5SAmlan Barua #endif
200b2b77a04SHong Zhang       break;
201b2b77a04SHong Zhang     }
202f4fca6d4SMatthew G. Knepley     fftw->binarray  = (PetscScalar *)x_array;
203b2b77a04SHong Zhang     fftw->boutarray = y_array;
2042f613bf5SBarry Smith     fftw_execute(fftw->p_backward);
205b2b77a04SHong Zhang   } else {                                                         /* use existing plan */
206b2b77a04SHong Zhang     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
2077e4bc134SDominic Meiser #if defined(PETSC_USE_COMPLEX)
208b2b77a04SHong Zhang       fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
2097e4bc134SDominic Meiser #else
2107e4bc134SDominic Meiser       fftw_execute_dft_c2r(fftw->p_backward, (fftw_complex *)x_array, (double *)y_array);
2117e4bc134SDominic Meiser #endif
212b2b77a04SHong Zhang     } else {
2132f613bf5SBarry Smith       fftw_execute(fftw->p_backward);
214b2b77a04SHong Zhang     }
215b2b77a04SHong Zhang   }
2169566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(y, &y_array));
2179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(x, &x_array));
218b2b77a04SHong Zhang   PetscFunctionReturn(0);
219b2b77a04SHong Zhang }
220b2b77a04SHong Zhang 
2210cf2e031SBarry Smith #if !PetscDefined(HAVE_MPIUNI)
22297504071SAmlan Barua /* MatMult_MPIFFTW performs forward DFT in parallel
22397504071SAmlan Barua    Input parameter:
2243564f093SHong Zhang      A - the matrix
22597504071SAmlan Barua      x - the vector on which FDFT will be performed
22697504071SAmlan Barua 
22797504071SAmlan Barua    Output parameter:
22897504071SAmlan Barua    y   - vector that stores result of FDFT
22997504071SAmlan Barua */
230*9371c9d4SSatish Balay PetscErrorCode MatMult_MPIFFTW(Mat A, Vec x, Vec y) {
231b2b77a04SHong Zhang   Mat_FFT           *fft  = (Mat_FFT *)A->data;
232b2b77a04SHong Zhang   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
233f4fca6d4SMatthew G. Knepley   const PetscScalar *x_array;
234f4fca6d4SMatthew G. Knepley   PetscScalar       *y_array;
235c92418ccSAmlan Barua   PetscInt           ndim = fft->ndim, *dim = fft->dim;
236ce94432eSBarry Smith   MPI_Comm           comm;
237b2b77a04SHong Zhang 
238b2b77a04SHong Zhang   PetscFunctionBegin;
2399566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2409566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(x, &x_array));
2419566063dSJacob Faibussowitsch   PetscCall(VecGetArray(y, &y_array));
2426aad120cSJose E. Roman   if (!fftw->p_forward) { /* create a plan, then execute it */
243b2b77a04SHong Zhang     switch (ndim) {
244b2b77a04SHong Zhang     case 1:
2453c3a9c75SAmlan Barua #if defined(PETSC_USE_COMPLEX)
246b2b77a04SHong Zhang       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
247ae0a50aaSHong Zhang #else
2484f8276c3SHong Zhang       SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
2493c3a9c75SAmlan Barua #endif
250b2b77a04SHong Zhang       break;
251b2b77a04SHong Zhang     case 2:
25297504071SAmlan Barua #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
253b2b77a04SHong Zhang       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
2543c3a9c75SAmlan Barua #else
2553c3a9c75SAmlan Barua       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
2563c3a9c75SAmlan Barua #endif
257b2b77a04SHong Zhang       break;
258b2b77a04SHong Zhang     case 3:
2593c3a9c75SAmlan Barua #if defined(PETSC_USE_COMPLEX)
260b2b77a04SHong Zhang       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
2613c3a9c75SAmlan Barua #else
26251d1eed7SAmlan Barua       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
2633c3a9c75SAmlan Barua #endif
264b2b77a04SHong Zhang       break;
265b2b77a04SHong Zhang     default:
2663c3a9c75SAmlan Barua #if defined(PETSC_USE_COMPLEX)
267c92418ccSAmlan Barua       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
2683c3a9c75SAmlan Barua #else
2693c3a9c75SAmlan Barua       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw, fftw->dim_fftw, (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
2703c3a9c75SAmlan Barua #endif
271b2b77a04SHong Zhang       break;
272b2b77a04SHong Zhang     }
273f4fca6d4SMatthew G. Knepley     fftw->finarray  = (PetscScalar *)x_array;
274b2b77a04SHong Zhang     fftw->foutarray = y_array;
275b2b77a04SHong Zhang     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
276b2b77a04SHong Zhang                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
277b2b77a04SHong Zhang     fftw_execute(fftw->p_forward);
278b2b77a04SHong Zhang   } else {                                                         /* use existing plan */
279b2b77a04SHong Zhang     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
280b2b77a04SHong Zhang       fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
281b2b77a04SHong Zhang     } else {
282b2b77a04SHong Zhang       fftw_execute(fftw->p_forward);
283b2b77a04SHong Zhang     }
284b2b77a04SHong Zhang   }
2859566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(y, &y_array));
2869566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(x, &x_array));
287b2b77a04SHong Zhang   PetscFunctionReturn(0);
288b2b77a04SHong Zhang }
289b2b77a04SHong Zhang 
2900cf2e031SBarry Smith /*
2910cf2e031SBarry Smith    MatMultTranspose_MPIFFTW performs parallel backward DFT
29297504071SAmlan Barua    Input parameter:
2933564f093SHong Zhang      A - the matrix
29497504071SAmlan Barua      x - the vector on which BDFT will be performed
29597504071SAmlan Barua 
29697504071SAmlan Barua    Output parameter:
29797504071SAmlan Barua      y - vector that stores result of BDFT
29897504071SAmlan Barua */
299*9371c9d4SSatish Balay PetscErrorCode MatMultTranspose_MPIFFTW(Mat A, Vec x, Vec y) {
300b2b77a04SHong Zhang   Mat_FFT           *fft  = (Mat_FFT *)A->data;
301b2b77a04SHong Zhang   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
302f4fca6d4SMatthew G. Knepley   const PetscScalar *x_array;
303f4fca6d4SMatthew G. Knepley   PetscScalar       *y_array;
304c92418ccSAmlan Barua   PetscInt           ndim = fft->ndim, *dim = fft->dim;
305ce94432eSBarry Smith   MPI_Comm           comm;
306b2b77a04SHong Zhang 
307b2b77a04SHong Zhang   PetscFunctionBegin;
3089566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
3099566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(x, &x_array));
3109566063dSJacob Faibussowitsch   PetscCall(VecGetArray(y, &y_array));
3116aad120cSJose E. Roman   if (!fftw->p_backward) { /* create a plan, then execute it */
312b2b77a04SHong Zhang     switch (ndim) {
313b2b77a04SHong Zhang     case 1:
3143c3a9c75SAmlan Barua #if defined(PETSC_USE_COMPLEX)
315b2b77a04SHong Zhang       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
316ae0a50aaSHong Zhang #else
3174f8276c3SHong Zhang       SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
3183c3a9c75SAmlan Barua #endif
319b2b77a04SHong Zhang       break;
320b2b77a04SHong Zhang     case 2:
32197504071SAmlan Barua #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
322b2b77a04SHong Zhang       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
3233c3a9c75SAmlan Barua #else
3243c3a9c75SAmlan Barua       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
3253c3a9c75SAmlan Barua #endif
326b2b77a04SHong Zhang       break;
327b2b77a04SHong Zhang     case 3:
3283c3a9c75SAmlan Barua #if defined(PETSC_USE_COMPLEX)
329b2b77a04SHong Zhang       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
3303c3a9c75SAmlan Barua #else
3313c3a9c75SAmlan Barua       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
3323c3a9c75SAmlan Barua #endif
333b2b77a04SHong Zhang       break;
334b2b77a04SHong Zhang     default:
3353c3a9c75SAmlan Barua #if defined(PETSC_USE_COMPLEX)
336c92418ccSAmlan Barua       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
3373c3a9c75SAmlan Barua #else
3383c3a9c75SAmlan Barua       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
3393c3a9c75SAmlan Barua #endif
340b2b77a04SHong Zhang       break;
341b2b77a04SHong Zhang     }
342f4fca6d4SMatthew G. Knepley     fftw->binarray  = (PetscScalar *)x_array;
343b2b77a04SHong Zhang     fftw->boutarray = y_array;
3442f613bf5SBarry Smith     fftw_execute(fftw->p_backward);
345b2b77a04SHong Zhang   } else {                                                         /* use existing plan */
346b2b77a04SHong Zhang     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
347b2b77a04SHong Zhang       fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
348b2b77a04SHong Zhang     } else {
3492f613bf5SBarry Smith       fftw_execute(fftw->p_backward);
350b2b77a04SHong Zhang     }
351b2b77a04SHong Zhang   }
3529566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(y, &y_array));
3539566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(x, &x_array));
354b2b77a04SHong Zhang   PetscFunctionReturn(0);
355b2b77a04SHong Zhang }
3560cf2e031SBarry Smith #endif
357b2b77a04SHong Zhang 
358*9371c9d4SSatish Balay PetscErrorCode MatDestroy_FFTW(Mat A) {
359b2b77a04SHong Zhang   Mat_FFT  *fft  = (Mat_FFT *)A->data;
360b2b77a04SHong Zhang   Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
361b2b77a04SHong Zhang 
362b2b77a04SHong Zhang   PetscFunctionBegin;
363b2b77a04SHong Zhang   fftw_destroy_plan(fftw->p_forward);
364b2b77a04SHong Zhang   fftw_destroy_plan(fftw->p_backward);
365*9371c9d4SSatish Balay   if (fftw->iodims) { free(fftw->iodims); }
3669566063dSJacob Faibussowitsch   PetscCall(PetscFree(fftw->dim_fftw));
3679566063dSJacob Faibussowitsch   PetscCall(PetscFree(fft->data));
3682e956fe4SStefano Zampini   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", NULL));
3692e956fe4SStefano Zampini   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", NULL));
3702e956fe4SStefano Zampini   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", NULL));
3710cf2e031SBarry Smith #if !PetscDefined(HAVE_MPIUNI)
3726ccf0b3eSHong Zhang   fftw_mpi_cleanup();
3730cf2e031SBarry Smith #endif
374b2b77a04SHong Zhang   PetscFunctionReturn(0);
375b2b77a04SHong Zhang }
376b2b77a04SHong Zhang 
3770cf2e031SBarry Smith #if !PetscDefined(HAVE_MPIUNI)
378c6db04a5SJed Brown #include <../src/vec/vec/impls/mpi/pvecimpl.h> /*I  "petscvec.h"   I*/
379*9371c9d4SSatish Balay PetscErrorCode VecDestroy_MPIFFTW(Vec v) {
380b2b77a04SHong Zhang   PetscScalar *array;
381b2b77a04SHong Zhang 
382b2b77a04SHong Zhang   PetscFunctionBegin;
3839566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &array));
3842f613bf5SBarry Smith   fftw_free((fftw_complex *)array);
3859566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &array));
3869566063dSJacob Faibussowitsch   PetscCall(VecDestroy_MPI(v));
387b2b77a04SHong Zhang   PetscFunctionReturn(0);
388b2b77a04SHong Zhang }
3890cf2e031SBarry Smith #endif
390b2b77a04SHong Zhang 
3910cf2e031SBarry Smith #if !PetscDefined(HAVE_MPIUNI)
392*9371c9d4SSatish Balay static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin, Vec *fin_new) {
3935b113e21Ss-sajid-ali   Mat A;
3945b113e21Ss-sajid-ali 
3955b113e21Ss-sajid-ali   PetscFunctionBegin;
3969566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)fin, "FFTmatrix", (PetscObject *)&A));
3979566063dSJacob Faibussowitsch   PetscCall(MatCreateVecsFFTW_FFTW(A, fin_new, NULL, NULL));
3985b113e21Ss-sajid-ali   PetscFunctionReturn(0);
3995b113e21Ss-sajid-ali }
4005b113e21Ss-sajid-ali 
401*9371c9d4SSatish Balay static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout, Vec *fout_new) {
4025b113e21Ss-sajid-ali   Mat A;
4035b113e21Ss-sajid-ali 
4045b113e21Ss-sajid-ali   PetscFunctionBegin;
4059566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)fout, "FFTmatrix", (PetscObject *)&A));
4069566063dSJacob Faibussowitsch   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, fout_new, NULL));
4075b113e21Ss-sajid-ali   PetscFunctionReturn(0);
4085b113e21Ss-sajid-ali }
4095b113e21Ss-sajid-ali 
410*9371c9d4SSatish Balay static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new) {
4115b113e21Ss-sajid-ali   Mat A;
4125b113e21Ss-sajid-ali 
4135b113e21Ss-sajid-ali   PetscFunctionBegin;
4149566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)bout, "FFTmatrix", (PetscObject *)&A));
4159566063dSJacob Faibussowitsch   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, NULL, bout_new));
4165b113e21Ss-sajid-ali   PetscFunctionReturn(0);
4175b113e21Ss-sajid-ali }
4180cf2e031SBarry Smith #endif
4195b113e21Ss-sajid-ali 
4204be45526SHong Zhang /*@
4212a7a6963SBarry Smith    MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
4224f7415efSAmlan Barua      parallel layout determined by FFTW
4234f7415efSAmlan Barua 
4244f7415efSAmlan Barua    Collective on Mat
4254f7415efSAmlan Barua 
4264f7415efSAmlan Barua    Input Parameter:
4273564f093SHong Zhang .   A - the matrix
4284f7415efSAmlan Barua 
429d8d19677SJose E. Roman    Output Parameters:
430cc55f3b1SHong Zhang +   x - (optional) input vector of forward FFTW
4316b867d5aSJose E. Roman .   y - (optional) output vector of forward FFTW
432cc55f3b1SHong Zhang -   z - (optional) output vector of backward FFTW
4334f7415efSAmlan Barua 
4344f7415efSAmlan Barua   Level: advanced
4353564f093SHong Zhang 
43697504071SAmlan Barua   Note: The parallel layout of output of forward FFTW is always same as the input
43797504071SAmlan Barua         of the backward FFTW. But parallel layout of the input vector of forward
43897504071SAmlan Barua         FFTW might not be same as the output of backward FFTW.
43997504071SAmlan Barua         Also note that we need to provide enough space while doing parallel real transform.
44097504071SAmlan Barua         We need to pad extra zeros at the end of last dimension. For this reason the one needs to
44197504071SAmlan Barua         invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
442e0ec536eSAmlan Barua         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
443e0ec536eSAmlan Barua         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
444e0ec536eSAmlan Barua         zeros if it is odd only one zero is needed.
445e0ec536eSAmlan Barua         Lastly one needs some scratch space at the end of data set in each process. alloc_local
446e0ec536eSAmlan Barua         figures out how much space is needed, i.e. it figures out the data+scratch space for
447e0ec536eSAmlan Barua         each processor and returns that.
4484f7415efSAmlan Barua 
449db781477SPatrick Sanan .seealso: `MatCreateFFT()`
4504be45526SHong Zhang @*/
451*9371c9d4SSatish Balay PetscErrorCode MatCreateVecsFFTW(Mat A, Vec *x, Vec *y, Vec *z) {
4524be45526SHong Zhang   PetscFunctionBegin;
453cac4c232SBarry Smith   PetscUseMethod(A, "MatCreateVecsFFTW_C", (Mat, Vec *, Vec *, Vec *), (A, x, y, z));
4544be45526SHong Zhang   PetscFunctionReturn(0);
4554be45526SHong Zhang }
4564be45526SHong Zhang 
457*9371c9d4SSatish Balay PetscErrorCode MatCreateVecsFFTW_FFTW(Mat A, Vec *fin, Vec *fout, Vec *bout) {
4584f7415efSAmlan Barua   PetscMPIInt size, rank;
459ce94432eSBarry Smith   MPI_Comm    comm;
4604f7415efSAmlan Barua   Mat_FFT    *fft = (Mat_FFT *)A->data;
4614f7415efSAmlan Barua 
4624f7415efSAmlan Barua   PetscFunctionBegin;
4634f7415efSAmlan Barua   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4644f7415efSAmlan Barua   PetscValidType(A, 1);
4659566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
4664f7415efSAmlan Barua 
4679566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
4689566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4694f7415efSAmlan Barua   if (size == 1) { /* sequential case */
4704f7415efSAmlan Barua #if defined(PETSC_USE_COMPLEX)
4719566063dSJacob Faibussowitsch     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fin));
4729566063dSJacob Faibussowitsch     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fout));
4739566063dSJacob Faibussowitsch     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, bout));
4744f7415efSAmlan Barua #else
4759566063dSJacob Faibussowitsch     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fin));
4769566063dSJacob Faibussowitsch     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fout));
4779566063dSJacob Faibussowitsch     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, bout));
4784f7415efSAmlan Barua #endif
4790cf2e031SBarry Smith #if !PetscDefined(HAVE_MPIUNI)
48097504071SAmlan Barua   } else { /* parallel cases */
4810cf2e031SBarry Smith     Mat_FFTW     *fftw = (Mat_FFTW *)fft->data;
4820cf2e031SBarry Smith     PetscInt      ndim = fft->ndim, *dim = fft->dim;
4834f7415efSAmlan Barua     ptrdiff_t     alloc_local, local_n0, local_0_start;
4849496c9aeSAmlan Barua     ptrdiff_t     local_n1;
4859496c9aeSAmlan Barua     fftw_complex *data_fout;
4869496c9aeSAmlan Barua     ptrdiff_t     local_1_start;
4879496c9aeSAmlan Barua #if defined(PETSC_USE_COMPLEX)
4889496c9aeSAmlan Barua     fftw_complex *data_fin, *data_bout;
4899496c9aeSAmlan Barua #else
4904f7415efSAmlan Barua     double   *data_finr, *data_boutr;
4916ccf0b3eSHong Zhang     PetscInt  n1, N1;
4929496c9aeSAmlan Barua     ptrdiff_t temp;
4939496c9aeSAmlan Barua #endif
4949496c9aeSAmlan Barua 
4954f7415efSAmlan Barua     switch (ndim) {
4964f7415efSAmlan Barua     case 1:
49757625b84SAmlan Barua #if !defined(PETSC_USE_COMPLEX)
4984f8276c3SHong Zhang       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not allow parallel real 1D transform");
49957625b84SAmlan Barua #else
50057625b84SAmlan Barua       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
50157625b84SAmlan Barua       if (fin) {
50257625b84SAmlan Barua         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
5039566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, local_n0, fft->N, (const PetscScalar *)data_fin, fin));
5049566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
5055b113e21Ss-sajid-ali         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
50657625b84SAmlan Barua         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
50757625b84SAmlan Barua       }
50857625b84SAmlan Barua       if (fout) {
50957625b84SAmlan Barua         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
5109566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_fout, fout));
5119566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
5125b113e21Ss-sajid-ali         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
51357625b84SAmlan Barua         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
51457625b84SAmlan Barua       }
51557625b84SAmlan Barua       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
51657625b84SAmlan Barua       if (bout) {
51757625b84SAmlan Barua         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
5189566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_bout, bout));
5199566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
5205b113e21Ss-sajid-ali         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
52157625b84SAmlan Barua         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
52257625b84SAmlan Barua       }
52357625b84SAmlan Barua       break;
52457625b84SAmlan Barua #endif
5254f7415efSAmlan Barua     case 2:
52697504071SAmlan Barua #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
5274f7415efSAmlan Barua       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
528*9371c9d4SSatish Balay       N1          = 2 * dim[0] * (dim[1] / 2 + 1);
529*9371c9d4SSatish Balay       n1          = 2 * local_n0 * (dim[1] / 2 + 1);
5304f7415efSAmlan Barua       if (fin) {
5314f7415efSAmlan Barua         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
5329566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
5339566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
5345b113e21Ss-sajid-ali         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
5354f7415efSAmlan Barua         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
5364f7415efSAmlan Barua       }
53757625b84SAmlan Barua       if (fout) {
53857625b84SAmlan Barua         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
5399566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_fout, fout));
5409566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
5415b113e21Ss-sajid-ali         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
54257625b84SAmlan Barua         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
54357625b84SAmlan Barua       }
5445b113e21Ss-sajid-ali       if (bout) {
5455b113e21Ss-sajid-ali         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
5469566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
5479566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
5485b113e21Ss-sajid-ali         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
5495b113e21Ss-sajid-ali         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
5505b113e21Ss-sajid-ali       }
5514f7415efSAmlan Barua #else
5524f7415efSAmlan Barua       /* Get local size */
5534f7415efSAmlan Barua       alloc_local = fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
5544f7415efSAmlan Barua       if (fin) {
5554f7415efSAmlan Barua         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
5569566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
5579566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
5585b113e21Ss-sajid-ali         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
5594f7415efSAmlan Barua         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
5604f7415efSAmlan Barua       }
5614f7415efSAmlan Barua       if (fout) {
5624f7415efSAmlan Barua         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
5639566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
5649566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
5655b113e21Ss-sajid-ali         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
5664f7415efSAmlan Barua         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
5674f7415efSAmlan Barua       }
5684f7415efSAmlan Barua       if (bout) {
5694f7415efSAmlan Barua         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
5709566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
5719566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
5725b113e21Ss-sajid-ali         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
5734f7415efSAmlan Barua         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
5744f7415efSAmlan Barua       }
5754f7415efSAmlan Barua #endif
5764f7415efSAmlan Barua       break;
5774f7415efSAmlan Barua     case 3:
5784f7415efSAmlan Barua #if !defined(PETSC_USE_COMPLEX)
5794f7415efSAmlan Barua       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
580*9371c9d4SSatish Balay       N1          = 2 * dim[0] * dim[1] * (dim[2] / 2 + 1);
581*9371c9d4SSatish Balay       n1          = 2 * local_n0 * dim[1] * (dim[2] / 2 + 1);
5824f7415efSAmlan Barua       if (fin) {
5834f7415efSAmlan Barua         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
5849566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
5859566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
5865b113e21Ss-sajid-ali         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
5874f7415efSAmlan Barua         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
5884f7415efSAmlan Barua       }
5895b113e21Ss-sajid-ali       if (fout) {
5905b113e21Ss-sajid-ali         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
5919566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (PetscScalar *)data_fout, fout));
5929566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
5935b113e21Ss-sajid-ali         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
5945b113e21Ss-sajid-ali         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
5955b113e21Ss-sajid-ali       }
5964f7415efSAmlan Barua       if (bout) {
5974f7415efSAmlan Barua         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
5989566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
5999566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
6005b113e21Ss-sajid-ali         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
6014f7415efSAmlan Barua         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
6024f7415efSAmlan Barua       }
6034f7415efSAmlan Barua #else
6040c9b18e4SAmlan Barua       alloc_local = fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
6050c9b18e4SAmlan Barua       if (fin) {
6060c9b18e4SAmlan Barua         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
6079566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
6089566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
6095b113e21Ss-sajid-ali         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
6100c9b18e4SAmlan Barua         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
6110c9b18e4SAmlan Barua       }
6120c9b18e4SAmlan Barua       if (fout) {
6130c9b18e4SAmlan Barua         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
6149566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
6159566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
6165b113e21Ss-sajid-ali         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
6170c9b18e4SAmlan Barua         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
6180c9b18e4SAmlan Barua       }
6190c9b18e4SAmlan Barua       if (bout) {
6200c9b18e4SAmlan Barua         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
6219566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
6229566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
6235b113e21Ss-sajid-ali         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
6240c9b18e4SAmlan Barua         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
6250c9b18e4SAmlan Barua       }
6264f7415efSAmlan Barua #endif
6274f7415efSAmlan Barua       break;
6284f7415efSAmlan Barua     default:
6294f7415efSAmlan Barua #if !defined(PETSC_USE_COMPLEX)
6304f7415efSAmlan Barua       temp                                  = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
6314f7415efSAmlan Barua       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
6324f7415efSAmlan Barua       alloc_local                           = fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
6330cf2e031SBarry Smith       N1                                    = 2 * fft->N * (PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw - 1]) / ((PetscInt)temp);
6344f7415efSAmlan Barua       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
6354f7415efSAmlan Barua 
6364f7415efSAmlan Barua       if (fin) {
6374f7415efSAmlan Barua         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
6389566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_finr, fin));
6399566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
6405b113e21Ss-sajid-ali         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
6414f7415efSAmlan Barua         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
6424f7415efSAmlan Barua       }
6435b113e21Ss-sajid-ali       if (fout) {
6445b113e21Ss-sajid-ali         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
6459566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_fout, fout));
6469566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
6475b113e21Ss-sajid-ali         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
6485b113e21Ss-sajid-ali         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
6495b113e21Ss-sajid-ali       }
6504f7415efSAmlan Barua       if (bout) {
6514f7415efSAmlan Barua         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
6529566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_boutr, bout));
6539566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
6545b113e21Ss-sajid-ali         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
6559496c9aeSAmlan Barua         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
6564f7415efSAmlan Barua       }
6574f7415efSAmlan Barua #else
6580c9b18e4SAmlan Barua       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
6590c9b18e4SAmlan Barua       if (fin) {
6600c9b18e4SAmlan Barua         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
6619566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
6629566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
6635b113e21Ss-sajid-ali         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
6640c9b18e4SAmlan Barua         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
6650c9b18e4SAmlan Barua       }
6660c9b18e4SAmlan Barua       if (fout) {
6670c9b18e4SAmlan Barua         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
6689566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
6699566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
6705b113e21Ss-sajid-ali         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
6710c9b18e4SAmlan Barua         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
6720c9b18e4SAmlan Barua       }
6730c9b18e4SAmlan Barua       if (bout) {
6740c9b18e4SAmlan Barua         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
6759566063dSJacob Faibussowitsch         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
6769566063dSJacob Faibussowitsch         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
6775b113e21Ss-sajid-ali         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
6780c9b18e4SAmlan Barua         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
6790c9b18e4SAmlan Barua       }
6804f7415efSAmlan Barua #endif
6814f7415efSAmlan Barua       break;
6824f7415efSAmlan Barua     }
683f9d91177SJunchao Zhang     /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
684f9d91177SJunchao Zhang        v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
685f9d91177SJunchao Zhang        memory leaks. We void these pointers here to avoid acident uses.
686f9d91177SJunchao Zhang     */
687f9d91177SJunchao Zhang     if (fin) (*fin)->ops->replacearray = NULL;
688f9d91177SJunchao Zhang     if (fout) (*fout)->ops->replacearray = NULL;
689f9d91177SJunchao Zhang     if (bout) (*bout)->ops->replacearray = NULL;
6900cf2e031SBarry Smith #endif
6914f7415efSAmlan Barua   }
6924f7415efSAmlan Barua   PetscFunctionReturn(0);
6934f7415efSAmlan Barua }
6944f7415efSAmlan Barua 
6953564f093SHong Zhang /*@
6963564f093SHong Zhang    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
69754efbe56SHong Zhang 
6983564f093SHong Zhang    Collective on Mat
6993564f093SHong Zhang 
7003564f093SHong Zhang    Input Parameters:
7013564f093SHong Zhang +  A - FFTW matrix
7023564f093SHong Zhang -  x - the PETSc vector
7033564f093SHong Zhang 
7043564f093SHong Zhang    Output Parameters:
7053564f093SHong Zhang .  y - the FFTW vector
7063564f093SHong Zhang 
707b2b77a04SHong Zhang   Options Database Keys:
7083564f093SHong Zhang . -mat_fftw_plannerflags - set FFTW planner flags
709b2b77a04SHong Zhang 
710b2b77a04SHong Zhang    Level: intermediate
7113564f093SHong Zhang 
71297504071SAmlan Barua    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
71397504071SAmlan Barua          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
71497504071SAmlan Barua          zeros. This routine does that job by scattering operation.
71597504071SAmlan Barua 
716db781477SPatrick Sanan .seealso: `VecScatterFFTWToPetsc()`
7173564f093SHong Zhang @*/
718*9371c9d4SSatish Balay PetscErrorCode VecScatterPetscToFFTW(Mat A, Vec x, Vec y) {
7193564f093SHong Zhang   PetscFunctionBegin;
720cac4c232SBarry Smith   PetscUseMethod(A, "VecScatterPetscToFFTW_C", (Mat, Vec, Vec), (A, x, y));
7213564f093SHong Zhang   PetscFunctionReturn(0);
7223564f093SHong Zhang }
7233c3a9c75SAmlan Barua 
724*9371c9d4SSatish Balay PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A, Vec x, Vec y) {
725ce94432eSBarry Smith   MPI_Comm    comm;
7263c3a9c75SAmlan Barua   Mat_FFT    *fft = (Mat_FFT *)A->data;
7279496c9aeSAmlan Barua   PetscInt    low;
7287a21806cSHong Zhang   PetscMPIInt rank, size;
7297a21806cSHong Zhang   PetscInt    vsize, vsize1;
7303c3a9c75SAmlan Barua   VecScatter  vecscat;
7310cf2e031SBarry Smith   IS          list1;
7323c3a9c75SAmlan Barua 
7333564f093SHong Zhang   PetscFunctionBegin;
7349566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
7359566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
7369566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm, &rank));
7379566063dSJacob Faibussowitsch   PetscCall(VecGetOwnershipRange(y, &low, NULL));
7383c3a9c75SAmlan Barua 
7393564f093SHong Zhang   if (size == 1) {
7409566063dSJacob Faibussowitsch     PetscCall(VecGetSize(x, &vsize));
7419566063dSJacob Faibussowitsch     PetscCall(VecGetSize(y, &vsize1));
7429566063dSJacob Faibussowitsch     PetscCall(ISCreateStride(PETSC_COMM_SELF, fft->N, 0, 1, &list1));
7439566063dSJacob Faibussowitsch     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
7449566063dSJacob Faibussowitsch     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
7459566063dSJacob Faibussowitsch     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
7469566063dSJacob Faibussowitsch     PetscCall(VecScatterDestroy(&vecscat));
7479566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&list1));
7480cf2e031SBarry Smith #if !PetscDefined(HAVE_MPIUNI)
7493564f093SHong Zhang   } else {
7500cf2e031SBarry Smith     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
7510cf2e031SBarry Smith     PetscInt  ndim = fft->ndim, *dim = fft->dim;
7520cf2e031SBarry Smith     ptrdiff_t local_n0, local_0_start;
7530cf2e031SBarry Smith     ptrdiff_t local_n1, local_1_start;
7540cf2e031SBarry Smith     IS        list2;
7550cf2e031SBarry Smith #if !defined(PETSC_USE_COMPLEX)
7560cf2e031SBarry Smith     PetscInt  i, j, k, partial_dim;
7570cf2e031SBarry Smith     PetscInt *indx1, *indx2, tempindx, tempindx1;
7580cf2e031SBarry Smith     PetscInt  NM;
7590cf2e031SBarry Smith     ptrdiff_t temp;
7600cf2e031SBarry Smith #endif
7610cf2e031SBarry Smith 
7623c3a9c75SAmlan Barua     switch (ndim) {
7633c3a9c75SAmlan Barua     case 1:
76464657d84SAmlan Barua #if defined(PETSC_USE_COMPLEX)
7657a21806cSHong Zhang       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
76626fbe8dcSKarl Rupp 
7679566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0, local_0_start, 1, &list1));
7689566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0, low, 1, &list2));
7699566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
7709566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
7719566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
7729566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
7739566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
7749566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
77564657d84SAmlan Barua #else
776e5d7f247SAmlan Barua       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
77764657d84SAmlan Barua #endif
7783c3a9c75SAmlan Barua       break;
7793c3a9c75SAmlan Barua     case 2:
780bd59e6c6SAmlan Barua #if defined(PETSC_USE_COMPLEX)
7817a21806cSHong Zhang       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
78226fbe8dcSKarl Rupp 
7839566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
7849566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
7859566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
7869566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
7879566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
7889566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
7899566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
7909566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
791bd59e6c6SAmlan Barua #else
792c3eba89aSHong Zhang       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
79326fbe8dcSKarl Rupp 
7949566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1));
7959566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2));
7963c3a9c75SAmlan Barua 
7973564f093SHong Zhang       if (dim[1] % 2 == 0) {
7983c3a9c75SAmlan Barua         NM = dim[1] + 2;
7993564f093SHong Zhang       } else {
8003c3a9c75SAmlan Barua         NM = dim[1] + 1;
8013564f093SHong Zhang       }
8023c3a9c75SAmlan Barua       for (i = 0; i < local_n0; i++) {
8033c3a9c75SAmlan Barua         for (j = 0; j < dim[1]; j++) {
8043c3a9c75SAmlan Barua           tempindx  = i * dim[1] + j;
8053c3a9c75SAmlan Barua           tempindx1 = i * NM + j;
80626fbe8dcSKarl Rupp 
8075b3e41ffSAmlan Barua           indx1[tempindx] = local_0_start * dim[1] + tempindx;
8083c3a9c75SAmlan Barua           indx2[tempindx] = low + tempindx1;
8093c3a9c75SAmlan Barua         }
8103c3a9c75SAmlan Barua       }
8113c3a9c75SAmlan Barua 
8129566063dSJacob Faibussowitsch       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
8139566063dSJacob Faibussowitsch       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
8143c3a9c75SAmlan Barua 
8159566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
8169566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
8179566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
8189566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
8199566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
8209566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
8219566063dSJacob Faibussowitsch       PetscCall(PetscFree(indx1));
8229566063dSJacob Faibussowitsch       PetscCall(PetscFree(indx2));
823bd59e6c6SAmlan Barua #endif
8249496c9aeSAmlan Barua       break;
8253c3a9c75SAmlan Barua 
8263c3a9c75SAmlan Barua     case 3:
827bd59e6c6SAmlan Barua #if defined(PETSC_USE_COMPLEX)
8287a21806cSHong Zhang       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
82926fbe8dcSKarl Rupp 
8309566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
8319566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
8329566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
8339566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
8349566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
8359566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
8369566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
8379566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
838bd59e6c6SAmlan Barua #else
839c4762a1bSJed Brown       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
840f1ade23cSHong Zhang       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 3D real transform");
8417a21806cSHong Zhang       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
84226fbe8dcSKarl Rupp 
8439566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1));
8449566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2));
84551d1eed7SAmlan Barua 
846db4deed7SKarl Rupp       if (dim[2] % 2 == 0) NM = dim[2] + 2;
847db4deed7SKarl Rupp       else NM = dim[2] + 1;
84851d1eed7SAmlan Barua 
84951d1eed7SAmlan Barua       for (i = 0; i < local_n0; i++) {
85051d1eed7SAmlan Barua         for (j = 0; j < dim[1]; j++) {
85151d1eed7SAmlan Barua           for (k = 0; k < dim[2]; k++) {
85251d1eed7SAmlan Barua             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
85351d1eed7SAmlan Barua             tempindx1 = i * dim[1] * NM + j * NM + k;
85426fbe8dcSKarl Rupp 
85551d1eed7SAmlan Barua             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
85651d1eed7SAmlan Barua             indx2[tempindx] = low + tempindx1;
85751d1eed7SAmlan Barua           }
85851d1eed7SAmlan Barua         }
85951d1eed7SAmlan Barua       }
86051d1eed7SAmlan Barua 
8619566063dSJacob Faibussowitsch       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
8629566063dSJacob Faibussowitsch       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
8639566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
8649566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
8659566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
8669566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
8679566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
8689566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
8699566063dSJacob Faibussowitsch       PetscCall(PetscFree(indx1));
8709566063dSJacob Faibussowitsch       PetscCall(PetscFree(indx2));
871bd59e6c6SAmlan Barua #endif
8729496c9aeSAmlan Barua       break;
8733c3a9c75SAmlan Barua 
8743c3a9c75SAmlan Barua     default:
875bd59e6c6SAmlan Barua #if defined(PETSC_USE_COMPLEX)
8767a21806cSHong Zhang       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
87726fbe8dcSKarl Rupp 
8789566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
8799566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
8809566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
8819566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
8829566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
8839566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
8849566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
8859566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
886bd59e6c6SAmlan Barua #else
887c4762a1bSJed Brown       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
888f1ade23cSHong Zhang       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel DIM>3 real transform");
889e5d7f247SAmlan Barua       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
89026fbe8dcSKarl Rupp 
891e5d7f247SAmlan Barua       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
89226fbe8dcSKarl Rupp 
8937a21806cSHong Zhang       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
89426fbe8dcSKarl Rupp 
895e5d7f247SAmlan Barua       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
896e5d7f247SAmlan Barua 
897e5d7f247SAmlan Barua       partial_dim = fftw->partial_dim;
898e5d7f247SAmlan Barua 
8999566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1));
9009566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2));
901e5d7f247SAmlan Barua 
902db4deed7SKarl Rupp       if (dim[ndim - 1] % 2 == 0) NM = 2;
903db4deed7SKarl Rupp       else NM = 1;
904e5d7f247SAmlan Barua 
9056971673cSAmlan Barua       j = low;
9063564f093SHong Zhang       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
9076971673cSAmlan Barua         indx1[i] = local_0_start * partial_dim + i;
9086971673cSAmlan Barua         indx2[i] = j;
90926fbe8dcSKarl Rupp         if (k % dim[ndim - 1] == 0) j += NM;
9106971673cSAmlan Barua         j++;
9116971673cSAmlan Barua       }
9129566063dSJacob Faibussowitsch       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
9139566063dSJacob Faibussowitsch       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
9149566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
9159566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
9169566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
9179566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
9189566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
9199566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
9209566063dSJacob Faibussowitsch       PetscCall(PetscFree(indx1));
9219566063dSJacob Faibussowitsch       PetscCall(PetscFree(indx2));
922bd59e6c6SAmlan Barua #endif
9239496c9aeSAmlan Barua       break;
9243c3a9c75SAmlan Barua     }
9250cf2e031SBarry Smith #endif
926e81bb599SAmlan Barua   }
9273564f093SHong Zhang   PetscFunctionReturn(0);
9283c3a9c75SAmlan Barua }
9293c3a9c75SAmlan Barua 
9303564f093SHong Zhang /*@
9313564f093SHong Zhang    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
9323564f093SHong Zhang 
9333564f093SHong Zhang    Collective on Mat
9343564f093SHong Zhang 
9353564f093SHong Zhang     Input Parameters:
9363564f093SHong Zhang +   A - FFTW matrix
9373564f093SHong Zhang -   x - FFTW vector
9383564f093SHong Zhang 
9393564f093SHong Zhang    Output Parameters:
9403564f093SHong Zhang .  y - PETSc vector
9413564f093SHong Zhang 
9423564f093SHong Zhang    Level: intermediate
9433564f093SHong Zhang 
9443564f093SHong Zhang    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
9453564f093SHong Zhang          VecScatterFFTWToPetsc removes those extra zeros.
9463564f093SHong Zhang 
947db781477SPatrick Sanan .seealso: `VecScatterPetscToFFTW()`
9483564f093SHong Zhang @*/
949*9371c9d4SSatish Balay PetscErrorCode VecScatterFFTWToPetsc(Mat A, Vec x, Vec y) {
9503c3a9c75SAmlan Barua   PetscFunctionBegin;
951cac4c232SBarry Smith   PetscUseMethod(A, "VecScatterFFTWToPetsc_C", (Mat, Vec, Vec), (A, x, y));
9523c3a9c75SAmlan Barua   PetscFunctionReturn(0);
9533c3a9c75SAmlan Barua }
95454efbe56SHong Zhang 
955*9371c9d4SSatish Balay PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A, Vec x, Vec y) {
956ce94432eSBarry Smith   MPI_Comm    comm;
9575b3e41ffSAmlan Barua   Mat_FFT    *fft = (Mat_FFT *)A->data;
9589496c9aeSAmlan Barua   PetscInt    low;
9597a21806cSHong Zhang   PetscMPIInt rank, size;
9605b3e41ffSAmlan Barua   VecScatter  vecscat;
9610cf2e031SBarry Smith   IS          list1;
9629496c9aeSAmlan Barua 
9633564f093SHong Zhang   PetscFunctionBegin;
9649566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
9659566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
9669566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm, &rank));
9679566063dSJacob Faibussowitsch   PetscCall(VecGetOwnershipRange(x, &low, NULL));
9685b3e41ffSAmlan Barua 
969e81bb599SAmlan Barua   if (size == 1) {
9709566063dSJacob Faibussowitsch     PetscCall(ISCreateStride(comm, fft->N, 0, 1, &list1));
9719566063dSJacob Faibussowitsch     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
9729566063dSJacob Faibussowitsch     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
9739566063dSJacob Faibussowitsch     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
9749566063dSJacob Faibussowitsch     PetscCall(VecScatterDestroy(&vecscat));
9759566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&list1));
976e81bb599SAmlan Barua 
9770cf2e031SBarry Smith #if !PetscDefined(HAVE_MPIUNI)
9783564f093SHong Zhang   } else {
9790cf2e031SBarry Smith     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
9800cf2e031SBarry Smith     PetscInt  ndim = fft->ndim, *dim = fft->dim;
9810cf2e031SBarry Smith     ptrdiff_t local_n0, local_0_start;
9820cf2e031SBarry Smith     ptrdiff_t local_n1, local_1_start;
9830cf2e031SBarry Smith     IS        list2;
9840cf2e031SBarry Smith #if !defined(PETSC_USE_COMPLEX)
9850cf2e031SBarry Smith     PetscInt  i, j, k, partial_dim;
9860cf2e031SBarry Smith     PetscInt *indx1, *indx2, tempindx, tempindx1;
9870cf2e031SBarry Smith     PetscInt  NM;
9880cf2e031SBarry Smith     ptrdiff_t temp;
9890cf2e031SBarry Smith #endif
990e81bb599SAmlan Barua     switch (ndim) {
991e81bb599SAmlan Barua     case 1:
99264657d84SAmlan Barua #if defined(PETSC_USE_COMPLEX)
9937a21806cSHong Zhang       fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
99426fbe8dcSKarl Rupp 
9959566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n1, local_1_start, 1, &list1));
9969566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n1, low, 1, &list2));
9979566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
9989566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
9999566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
10009566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
10019566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
10029566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
100364657d84SAmlan Barua #else
10046a506ed8SAmlan Barua       SETERRQ(comm, PETSC_ERR_SUP, "No support for real parallel 1D FFT");
100564657d84SAmlan Barua #endif
10065b3e41ffSAmlan Barua       break;
10075b3e41ffSAmlan Barua     case 2:
1008bd59e6c6SAmlan Barua #if defined(PETSC_USE_COMPLEX)
10097a21806cSHong Zhang       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
101026fbe8dcSKarl Rupp 
10119566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
10129566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
10139566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
10149566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
10159566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
10169566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
10179566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
10189566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
1019bd59e6c6SAmlan Barua #else
10207a21806cSHong Zhang       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
102126fbe8dcSKarl Rupp 
10229566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1));
10239566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2));
10245b3e41ffSAmlan Barua 
1025db4deed7SKarl Rupp       if (dim[1] % 2 == 0) NM = dim[1] + 2;
1026db4deed7SKarl Rupp       else NM = dim[1] + 1;
10275b3e41ffSAmlan Barua 
10285b3e41ffSAmlan Barua       for (i = 0; i < local_n0; i++) {
10295b3e41ffSAmlan Barua         for (j = 0; j < dim[1]; j++) {
10305b3e41ffSAmlan Barua           tempindx  = i * dim[1] + j;
10315b3e41ffSAmlan Barua           tempindx1 = i * NM + j;
103226fbe8dcSKarl Rupp 
10335b3e41ffSAmlan Barua           indx1[tempindx] = local_0_start * dim[1] + tempindx;
10345b3e41ffSAmlan Barua           indx2[tempindx] = low + tempindx1;
10355b3e41ffSAmlan Barua         }
10365b3e41ffSAmlan Barua       }
10375b3e41ffSAmlan Barua 
10389566063dSJacob Faibussowitsch       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
10399566063dSJacob Faibussowitsch       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
10405b3e41ffSAmlan Barua 
10419566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
10429566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
10439566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
10449566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
10459566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
10469566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
10479566063dSJacob Faibussowitsch       PetscCall(PetscFree(indx1));
10489566063dSJacob Faibussowitsch       PetscCall(PetscFree(indx2));
1049bd59e6c6SAmlan Barua #endif
10509496c9aeSAmlan Barua       break;
10515b3e41ffSAmlan Barua     case 3:
1052bd59e6c6SAmlan Barua #if defined(PETSC_USE_COMPLEX)
10537a21806cSHong Zhang       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
105426fbe8dcSKarl Rupp 
10559566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
10569566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
10579566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
10589566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
10599566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
10609566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
10619566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
10629566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
1063bd59e6c6SAmlan Barua #else
10647a21806cSHong Zhang       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
106526fbe8dcSKarl Rupp 
10669566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1));
10679566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2));
106851d1eed7SAmlan Barua 
1069db4deed7SKarl Rupp       if (dim[2] % 2 == 0) NM = dim[2] + 2;
1070db4deed7SKarl Rupp       else NM = dim[2] + 1;
107151d1eed7SAmlan Barua 
107251d1eed7SAmlan Barua       for (i = 0; i < local_n0; i++) {
107351d1eed7SAmlan Barua         for (j = 0; j < dim[1]; j++) {
107451d1eed7SAmlan Barua           for (k = 0; k < dim[2]; k++) {
107551d1eed7SAmlan Barua             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
107651d1eed7SAmlan Barua             tempindx1 = i * dim[1] * NM + j * NM + k;
107726fbe8dcSKarl Rupp 
107851d1eed7SAmlan Barua             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
107951d1eed7SAmlan Barua             indx2[tempindx] = low + tempindx1;
108051d1eed7SAmlan Barua           }
108151d1eed7SAmlan Barua         }
108251d1eed7SAmlan Barua       }
108351d1eed7SAmlan Barua 
10849566063dSJacob Faibussowitsch       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
10859566063dSJacob Faibussowitsch       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
108651d1eed7SAmlan Barua 
10879566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
10889566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
10899566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
10909566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
10919566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
10929566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
10939566063dSJacob Faibussowitsch       PetscCall(PetscFree(indx1));
10949566063dSJacob Faibussowitsch       PetscCall(PetscFree(indx2));
1095bd59e6c6SAmlan Barua #endif
10969496c9aeSAmlan Barua       break;
10975b3e41ffSAmlan Barua     default:
1098bd59e6c6SAmlan Barua #if defined(PETSC_USE_COMPLEX)
10997a21806cSHong Zhang       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
110026fbe8dcSKarl Rupp 
11019566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
11029566063dSJacob Faibussowitsch       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
11039566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
11049566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
11059566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
11069566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
11079566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
11089566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
1109bd59e6c6SAmlan Barua #else
1110ba6e06d0SAmlan Barua       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
111126fbe8dcSKarl Rupp 
1112ba6e06d0SAmlan Barua       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
111326fbe8dcSKarl Rupp 
1114c3eba89aSHong Zhang       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
111526fbe8dcSKarl Rupp 
1116ba6e06d0SAmlan Barua       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
1117ba6e06d0SAmlan Barua 
1118ba6e06d0SAmlan Barua       partial_dim = fftw->partial_dim;
1119ba6e06d0SAmlan Barua 
11209566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1));
11219566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2));
1122ba6e06d0SAmlan Barua 
1123db4deed7SKarl Rupp       if (dim[ndim - 1] % 2 == 0) NM = 2;
1124db4deed7SKarl Rupp       else NM = 1;
1125ba6e06d0SAmlan Barua 
1126ba6e06d0SAmlan Barua       j = low;
11273564f093SHong Zhang       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
1128ba6e06d0SAmlan Barua         indx1[i] = local_0_start * partial_dim + i;
1129ba6e06d0SAmlan Barua         indx2[i] = j;
11303564f093SHong Zhang         if (k % dim[ndim - 1] == 0) j += NM;
1131ba6e06d0SAmlan Barua         j++;
1132ba6e06d0SAmlan Barua       }
11339566063dSJacob Faibussowitsch       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
11349566063dSJacob Faibussowitsch       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
1135ba6e06d0SAmlan Barua 
11369566063dSJacob Faibussowitsch       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
11379566063dSJacob Faibussowitsch       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
11389566063dSJacob Faibussowitsch       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
11399566063dSJacob Faibussowitsch       PetscCall(VecScatterDestroy(&vecscat));
11409566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list1));
11419566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&list2));
11429566063dSJacob Faibussowitsch       PetscCall(PetscFree(indx1));
11439566063dSJacob Faibussowitsch       PetscCall(PetscFree(indx2));
1144bd59e6c6SAmlan Barua #endif
11459496c9aeSAmlan Barua       break;
11465b3e41ffSAmlan Barua     }
11470cf2e031SBarry Smith #endif
1148e81bb599SAmlan Barua   }
11493564f093SHong Zhang   PetscFunctionReturn(0);
11505b3e41ffSAmlan Barua }
11515b3e41ffSAmlan Barua 
11523c3a9c75SAmlan Barua /*
11533564f093SHong Zhang     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
11543564f093SHong Zhang 
11553c3a9c75SAmlan Barua   Options Database Keys:
11563c3a9c75SAmlan Barua + -mat_fftw_plannerflags - set FFTW planner flags
11573c3a9c75SAmlan Barua 
11583c3a9c75SAmlan Barua    Level: intermediate
11593c3a9c75SAmlan Barua 
11603c3a9c75SAmlan Barua */
1161*9371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A) {
1162ce94432eSBarry Smith   MPI_Comm    comm;
1163b2b77a04SHong Zhang   Mat_FFT    *fft = (Mat_FFT *)A->data;
1164b2b77a04SHong Zhang   Mat_FFTW   *fftw;
11650cf2e031SBarry Smith   PetscInt    ndim = fft->ndim, *dim = fft->dim;
11665274ed1bSHong Zhang   const char *plans[]  = {"FFTW_ESTIMATE", "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE"};
11675274ed1bSHong Zhang   unsigned    iplans[] = {FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE};
1168b2b77a04SHong Zhang   PetscBool   flg;
11694f48bc67SAmlan Barua   PetscInt    p_flag, partial_dim = 1, ctr;
117011902ff2SHong Zhang   PetscMPIInt size, rank;
11719496c9aeSAmlan Barua   ptrdiff_t  *pdim;
11729496c9aeSAmlan Barua #if !defined(PETSC_USE_COMPLEX)
11730cf2e031SBarry Smith   PetscInt tot_dim;
11749496c9aeSAmlan Barua #endif
11759496c9aeSAmlan Barua 
1176b2b77a04SHong Zhang   PetscFunctionBegin;
11779566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
11789566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
11799566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1180c92418ccSAmlan Barua 
11810cf2e031SBarry Smith #if !PetscDefined(HAVE_MPIUNI)
11821878d478SAmlan Barua   fftw_mpi_init();
11830cf2e031SBarry Smith #endif
118411902ff2SHong Zhang   pdim    = (ptrdiff_t *)calloc(ndim, sizeof(ptrdiff_t));
118511902ff2SHong Zhang   pdim[0] = dim[0];
11868ca4c034SAmlan Barua #if !defined(PETSC_USE_COMPLEX)
11878ca4c034SAmlan Barua   tot_dim = 2 * dim[0];
11888ca4c034SAmlan Barua #endif
11893564f093SHong Zhang   for (ctr = 1; ctr < ndim; ctr++) {
11906882372aSHong Zhang     partial_dim *= dim[ctr];
119111902ff2SHong Zhang     pdim[ctr] = dim[ctr];
11928ca4c034SAmlan Barua #if !defined(PETSC_USE_COMPLEX)
1193db4deed7SKarl Rupp     if (ctr == ndim - 1) tot_dim *= (dim[ctr] / 2 + 1);
1194db4deed7SKarl Rupp     else tot_dim *= dim[ctr];
11958ca4c034SAmlan Barua #endif
11966882372aSHong Zhang   }
11973c3a9c75SAmlan Barua 
1198b2b77a04SHong Zhang   if (size == 1) {
1199e81bb599SAmlan Barua #if defined(PETSC_USE_COMPLEX)
12009566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(A, fft->N, fft->N, fft->N, fft->N));
12010cf2e031SBarry Smith     fft->n = fft->N;
1202e81bb599SAmlan Barua #else
12039566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(A, tot_dim, tot_dim, tot_dim, tot_dim));
12040cf2e031SBarry Smith     fft->n       = tot_dim;
12050cf2e031SBarry Smith #endif
12060cf2e031SBarry Smith #if !PetscDefined(HAVE_MPIUNI)
12070cf2e031SBarry Smith   } else {
12080cf2e031SBarry Smith     ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;
12090cf2e031SBarry Smith #if !defined(PETSC_USE_COMPLEX)
12100cf2e031SBarry Smith     ptrdiff_t temp;
12110cf2e031SBarry Smith     PetscInt  N1;
1212e81bb599SAmlan Barua #endif
1213e81bb599SAmlan Barua 
1214b2b77a04SHong Zhang     switch (ndim) {
1215b2b77a04SHong Zhang     case 1:
12163c3a9c75SAmlan Barua #if !defined(PETSC_USE_COMPLEX)
12173c3a9c75SAmlan Barua       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
1218e5d7f247SAmlan Barua #else
12197a21806cSHong Zhang       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
12200cf2e031SBarry Smith       fft->n = (PetscInt)local_n0;
12219566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(A, local_n1, fft->n, fft->N, fft->N));
1222e5d7f247SAmlan Barua #endif
1223b2b77a04SHong Zhang       break;
1224b2b77a04SHong Zhang     case 2:
12255b3e41ffSAmlan Barua #if defined(PETSC_USE_COMPLEX)
12267a21806cSHong Zhang       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
12270cf2e031SBarry Smith       fft->n = (PetscInt)local_n0 * dim[1];
12289566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
12295b3e41ffSAmlan Barua #else
1230c3eba89aSHong Zhang       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
123126fbe8dcSKarl Rupp 
12320cf2e031SBarry Smith       fft->n = 2 * (PetscInt)local_n0 * (dim[1] / 2 + 1);
12339566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * (dim[1] / 2 + 1), 2 * dim[0] * (dim[1] / 2 + 1)));
12345b3e41ffSAmlan Barua #endif
1235b2b77a04SHong Zhang       break;
1236b2b77a04SHong Zhang     case 3:
123751d1eed7SAmlan Barua #if defined(PETSC_USE_COMPLEX)
12387a21806cSHong Zhang       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
123926fbe8dcSKarl Rupp 
12400cf2e031SBarry Smith       fft->n = (PetscInt)local_n0 * dim[1] * dim[2];
12419566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
124251d1eed7SAmlan Barua #else
1243c3eba89aSHong Zhang       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
124426fbe8dcSKarl Rupp 
12450cf2e031SBarry Smith       fft->n = 2 * (PetscInt)local_n0 * dim[1] * (dim[2] / 2 + 1);
12469566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * dim[1] * (dim[2] / 2 + 1), 2 * dim[0] * dim[1] * (dim[2] / 2 + 1)));
124751d1eed7SAmlan Barua #endif
1248b2b77a04SHong Zhang       break;
1249b2b77a04SHong Zhang     default:
1250b3a17365SAmlan Barua #if defined(PETSC_USE_COMPLEX)
12517a21806cSHong Zhang       fftw_mpi_local_size(ndim, pdim, comm, &local_n0, &local_0_start);
125226fbe8dcSKarl Rupp 
12530cf2e031SBarry Smith       fft->n = (PetscInt)local_n0 * partial_dim;
12549566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1255b3a17365SAmlan Barua #else
1256b3a17365SAmlan Barua       temp = pdim[ndim - 1];
125726fbe8dcSKarl Rupp 
1258b3a17365SAmlan Barua       pdim[ndim - 1] = temp / 2 + 1;
125926fbe8dcSKarl Rupp 
1260c3eba89aSHong Zhang       fftw_mpi_local_size_transposed(ndim, pdim, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
126126fbe8dcSKarl Rupp 
12620cf2e031SBarry Smith       fft->n = 2 * (PetscInt)local_n0 * partial_dim * pdim[ndim - 1] / temp;
12630cf2e031SBarry Smith       N1     = 2 * fft->N * (PetscInt)pdim[ndim - 1] / ((PetscInt)temp);
126426fbe8dcSKarl Rupp 
1265b3a17365SAmlan Barua       pdim[ndim - 1] = temp;
126626fbe8dcSKarl Rupp 
12679566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(A, fft->n, fft->n, N1, N1));
1268b3a17365SAmlan Barua #endif
1269b2b77a04SHong Zhang       break;
1270b2b77a04SHong Zhang     }
12710cf2e031SBarry Smith #endif
1272b2b77a04SHong Zhang   }
12732277172eSMark Adams   free(pdim);
12749566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATFFTW));
12759566063dSJacob Faibussowitsch   PetscCall(PetscNewLog(A, &fftw));
1276b2b77a04SHong Zhang   fft->data = (void *)fftw;
1277b2b77a04SHong Zhang 
12780c74a584SJed Brown   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1279e5d7f247SAmlan Barua   fftw->partial_dim = partial_dim;
128026fbe8dcSKarl Rupp 
12819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ndim, &(fftw->dim_fftw)));
12828c1d5ab3SHong Zhang   if (size == 1) {
1283a1b6d50cSHong Zhang #if defined(PETSC_USE_64BIT_INDICES)
1284a1b6d50cSHong Zhang     fftw->iodims = (fftw_iodim64 *)malloc(sizeof(fftw_iodim64) * ndim);
1285a1b6d50cSHong Zhang #else
12868c1d5ab3SHong Zhang     fftw->iodims = (fftw_iodim *)malloc(sizeof(fftw_iodim) * ndim);
1287a1b6d50cSHong Zhang #endif
12888c1d5ab3SHong Zhang   }
128926fbe8dcSKarl Rupp 
1290b1301b2eSHong Zhang   for (ctr = 0; ctr < ndim; ctr++) (fftw->dim_fftw)[ctr] = dim[ctr];
1291c92418ccSAmlan Barua 
1292f4259b30SLisandro Dalcin   fftw->p_forward  = NULL;
1293f4259b30SLisandro Dalcin   fftw->p_backward = NULL;
1294b2b77a04SHong Zhang   fftw->p_flag     = FFTW_ESTIMATE;
1295b2b77a04SHong Zhang 
1296b2b77a04SHong Zhang   if (size == 1) {
1297b2b77a04SHong Zhang     A->ops->mult          = MatMult_SeqFFTW;
1298b2b77a04SHong Zhang     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
12990cf2e031SBarry Smith #if !PetscDefined(HAVE_MPIUNI)
1300b2b77a04SHong Zhang   } else {
1301b2b77a04SHong Zhang     A->ops->mult          = MatMult_MPIFFTW;
1302b2b77a04SHong Zhang     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
13030cf2e031SBarry Smith #endif
1304b2b77a04SHong Zhang   }
1305b2b77a04SHong Zhang   fft->matdestroy = MatDestroy_FFTW;
1306b2b77a04SHong Zhang   A->assembled    = PETSC_TRUE;
13074a52bad8SHong Zhang   A->preallocated = PETSC_TRUE;
130826fbe8dcSKarl Rupp 
13099566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", MatCreateVecsFFTW_FFTW));
13109566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", VecScatterPetscToFFTW_FFTW));
13119566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", VecScatterFFTWToPetsc_FFTW));
1312b2b77a04SHong Zhang 
1313b2b77a04SHong Zhang   /* get runtime options */
1314d0609cedSBarry Smith   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "FFTW Options", "Mat");
13159566063dSJacob Faibussowitsch   PetscCall(PetscOptionsEList("-mat_fftw_plannerflags", "Planner Flags", "None", plans, 4, plans[0], &p_flag, &flg));
13165f80ce2aSJacob Faibussowitsch   if (flg) fftw->p_flag = iplans[p_flag];
1317d0609cedSBarry Smith   PetscOptionsEnd();
1318b2b77a04SHong Zhang   PetscFunctionReturn(0);
1319b2b77a04SHong Zhang }
1320