xref: /petsc/src/mat/impls/aij/mpi/kokkos/mpiaijkok.kokkos.cxx (revision 7d5fd1e4d9337468ad3f05b65b7facdcd2dfd2a4)
1 #include <petscvec_kokkos.hpp>
2 #include <../src/mat/impls/aij/mpi/mpiaij.h>
3 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
4 
5 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
6 {
7   PetscErrorCode   ierr;
8   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
9   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
10 
11   PetscFunctionBegin;
12   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
13   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
14     ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr);
15   }
16   if (aijkok && aijkok->device_mat_d.data()) {
17     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
18   }
19 
20   PetscFunctionReturn(0);
21 }
22 
23 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
24 {
25   PetscErrorCode ierr;
26   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
27 
28   PetscFunctionBegin;
29   ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr);
30   ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr);
31 #if defined(PETSC_USE_DEBUG)
32   if (d_nnz) {
33     PetscInt i;
34     for (i=0; i<mat->rmap->n; i++) {
35       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
36     }
37   }
38   if (o_nnz) {
39     PetscInt i;
40     for (i=0; i<mat->rmap->n; i++) {
41       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
42     }
43   }
44 #endif
45 #if defined(PETSC_USE_CTABLE)
46   ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
47 #else
48   ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
49 #endif
50   ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
51   ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
52   ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
53   /* Because the B will have been resized we simply destroy it and create a new one each time */
54   ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr);
55 
56   if (!mpiaij->A) {
57     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
58     ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr);
59     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr);
60   }
61   if (!mpiaij->B) {
62     PetscMPIInt size;
63     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr);
64     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
65     ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr);
66     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr);
67   }
68   ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
69   ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
70   ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
71   ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
72   mat->preallocated = PETSC_TRUE;
73   PetscFunctionReturn(0);
74 }
75 
76 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
77 {
78   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
79   PetscErrorCode ierr;
80   PetscInt       nt;
81 
82   PetscFunctionBegin;
83   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
84   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
85   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
86   ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
87   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
88   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
89   PetscFunctionReturn(0);
90 }
91 
92 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
93 {
94   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
95   PetscErrorCode ierr;
96   PetscInt       nt;
97 
98   PetscFunctionBegin;
99   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
100   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
101   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
102   ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
103   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
104   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
105   PetscFunctionReturn(0);
106 }
107 
108 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
109 {
110   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
111   PetscErrorCode ierr;
112   PetscInt       nt;
113 
114   PetscFunctionBegin;
115   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
116   if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt);
117   ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
118   ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
119   ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
120   ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
121   PetscFunctionReturn(0);
122 }
123 
124 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
125 {
126   PetscErrorCode     ierr;
127   Mat                B;
128 
129   PetscFunctionBegin;
130   if (reuse == MAT_INITIAL_MATRIX) {
131     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
132   } else if (reuse == MAT_REUSE_MATRIX) {
133     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
134   }
135   B = *newmat;
136 
137   B->boundtocpu = PETSC_FALSE;
138   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
139   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
140   ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr);
141 
142   B->ops->assemblyend           = MatAssemblyEnd_MPIAIJKokkos;
143   B->ops->mult                  = MatMult_MPIAIJKokkos;
144   B->ops->multadd               = MatMultAdd_MPIAIJKokkos;
145   B->ops->multtranspose         = MatMultTranspose_MPIAIJKokkos;
146   // Needs an efficient implementation of the COO preallocation routines
147   //B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
148 
149   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr);
150   PetscFunctionReturn(0);
151 }
152 
153 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
154 {
155   PetscErrorCode ierr;
156 
157   PetscFunctionBegin;
158   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
159   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
160   ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
161   PetscFunctionReturn(0);
162 }
163 
164 /*@C
165    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
166    (the default parallel PETSc format).  This matrix will ultimately pushed down
167    to Kokkos for calculations. For good matrix
168    assembly performance the user should preallocate the matrix storage by setting
169    the parameter nz (or the array nnz).  By setting these parameters accurately,
170    performance during matrix assembly can be increased by more than a factor of 50.
171 
172    Collective
173 
174    Input Parameters:
175 +  comm - MPI communicator, set to PETSC_COMM_SELF
176 .  m - number of rows
177 .  n - number of columns
178 .  nz - number of nonzeros per row (same for all rows)
179 -  nnz - array containing the number of nonzeros in the various rows
180          (possibly different for each row) or NULL
181 
182    Output Parameter:
183 .  A - the matrix
184 
185    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
186    MatXXXXSetPreallocation() paradigm instead of this routine directly.
187    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
188 
189    Notes:
190    If nnz is given then nz is ignored
191 
192    The AIJ format (also called the Yale sparse matrix format or
193    compressed row storage), is fully compatible with standard Fortran 77
194    storage.  That is, the stored row and column indices can begin at
195    either one (as in Fortran) or zero.  See the users' manual for details.
196 
197    Specify the preallocated storage with either nz or nnz (not both).
198    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
199    allocation.  For large problems you MUST preallocate memory or you
200    will get TERRIBLE performance, see the users' manual chapter on matrices.
201 
202    By default, this format uses inodes (identical nodes) when possible, to
203    improve numerical efficiency of matrix-vector products and solves. We
204    search for consecutive rows with the same nonzero structure, thereby
205    reusing matrix information to achieve increased efficiency.
206 
207    Level: intermediate
208 
209 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos
210 @*/
211 PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
212 {
213   PetscErrorCode ierr;
214   PetscMPIInt    size;
215 
216   PetscFunctionBegin;
217   ierr = MatCreate(comm,A);CHKERRQ(ierr);
218   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
219   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
220   if (size > 1) {
221     ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr);
222     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
223   } else {
224     ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
225     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
226   }
227   PetscFunctionReturn(0);
228 }
229 
230 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
231 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
232 {
233   PetscMPIInt                size,rank;
234   MPI_Comm                   comm;
235   PetscErrorCode             ierr;
236   PetscSplitCSRDataStructure d_mat=NULL;
237 
238   PetscFunctionBegin;
239   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
240   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
241   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
242   if (size == 1) {
243     ierr   = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr);
244   } else {
245     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
246     ierr   = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr);
247   }
248   // act like MatSetValues because not called on host
249   if (A->assembled) {
250     if (A->was_assembled) {
251       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
252     }
253     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
254   } else {
255     ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr);
256     // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
257   }
258   if (!d_mat) {
259     struct _n_SplitCSRMat h_mat; /* host container */
260     Mat_SeqAIJKokkos      *aijkokA;
261     Mat_SeqAIJ            *jaca;
262     PetscInt              n = A->rmap->n, nnz;
263     Mat                   Amat;
264     PetscInt              *colmap;
265 
266     /* create and copy h_mat */
267     ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr);
268     if (size == 1) {
269       Amat = A;
270       jaca = (Mat_SeqAIJ*)A->data;
271       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
272       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
273       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
274       h_mat.offdiag.a = NULL;
275       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
276       aijkokA->i_uncompressed_d = NULL;
277       aijkokA->colmap_d = NULL;
278     } else {
279       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
280       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
281       PetscInt         ii;
282       Mat_SeqAIJKokkos *aijkokB;
283 
284       Amat = aij->A;
285       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
286       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
287       aijkokA->i_uncompressed_d = NULL;
288       aijkokA->colmap_d = NULL;
289       jaca = (Mat_SeqAIJ*)aij->A->data;
290       if (aij->B->cmap->n && !aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
291       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
292       aij->donotstash = PETSC_TRUE;
293       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
294       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
295       ierr = PetscCalloc1(A->cmap->N,&colmap);CHKERRQ(ierr);
296       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt));CHKERRQ(ierr);
297       for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1;
298       // allocate B copy data
299       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
300       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
301       nnz = jacb->i[n];
302       if (jacb->compressedrow.use) {
303         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
304         aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k));
305         Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k);
306         h_mat.offdiag.i = aijkokB->i_uncompressed_d->data();
307       } else {
308          h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data();
309       }
310       h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data();
311       h_mat.offdiag.a = aijkokB->a_d.data();
312       {
313         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N);
314         aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k));
315         Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k);
316         h_mat.colmap = aijkokB->colmap_d->data();
317         ierr = PetscFree(colmap);CHKERRQ(ierr);
318       }
319       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
320       h_mat.offdiag.n = n;
321     }
322     // allocate A copy data
323     nnz = jaca->i[n];
324     h_mat.diag.n = n;
325     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
326     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
327     if (jaca->compressedrow.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)");
328     else {
329       h_mat.diag.i = (PetscInt*)aijkokA->i_d.data();
330     }
331     h_mat.diag.j = (PetscInt*)aijkokA->j_d.data();
332     h_mat.diag.a = aijkokA->a_d.data();
333     // copy pointers and metdata to device
334     ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr);
335     ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr);
336     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
337   }
338   *B = d_mat; // return it, set it in Mat, and set it up
339   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
340   PetscFunctionReturn(0);
341 }
342