xref: /petsc/src/mat/impls/aij/mpi/kokkos/mpiaijkok.kokkos.cxx (revision cbd2c53bc8d4e32f1aab714f173fd0b4c821fa72)
1 #include <petscvec_kokkos.hpp>
2 #include <../src/mat/impls/aij/mpi/mpiaij.h>
3 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
4 
5 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
6 {
7   PetscErrorCode   ierr;
8   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
9   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
10 
11   PetscFunctionBegin;
12   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
13   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
14     ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr);
15   }
16   if (aijkok && aijkok->device_mat_d.data()) {
17     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
18   }
19 
20   PetscFunctionReturn(0);
21 }
22 
23 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
24 {
25   PetscErrorCode ierr;
26   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
27 
28   PetscFunctionBegin;
29   ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr);
30   ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr);
31 #if defined(PETSC_USE_DEBUG)
32   if (d_nnz) {
33     PetscInt i;
34     for (i=0; i<mat->rmap->n; i++) {
35       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
36     }
37   }
38   if (o_nnz) {
39     PetscInt i;
40     for (i=0; i<mat->rmap->n; i++) {
41       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
42     }
43   }
44 #endif
45 #if defined(PETSC_USE_CTABLE)
46   ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
47 #else
48   ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
49 #endif
50   ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
51   ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
52   ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
53   /* Because the B will have been resized we simply destroy it and create a new one each time */
54   ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr);
55 
56   if (!mpiaij->A) {
57     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
58     ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr);
59     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr);
60   }
61   if (!mpiaij->B) {
62     PetscMPIInt size;
63     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr);
64     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
65     ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr);
66     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr);
67   }
68   ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
69   ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
70   ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
71   ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
72   mat->preallocated = PETSC_TRUE;
73   PetscFunctionReturn(0);
74 }
75 
76 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
77 {
78   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
79   PetscErrorCode ierr;
80   PetscInt       nt;
81 
82   PetscFunctionBegin;
83   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
84   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
85   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
86   ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
87   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
88   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
89   PetscFunctionReturn(0);
90 }
91 
92 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
93 {
94   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
95   PetscErrorCode ierr;
96   PetscInt       nt;
97 
98   PetscFunctionBegin;
99   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
100   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
101   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
102   ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
103   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
104   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
105   PetscFunctionReturn(0);
106 }
107 
108 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
109 {
110   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
111   PetscErrorCode ierr;
112   PetscInt       nt;
113 
114   PetscFunctionBegin;
115   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
116   if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt);
117   ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
118   ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
119   ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
120   ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
121   PetscFunctionReturn(0);
122 }
123 
124 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
125 {
126   PetscErrorCode     ierr;
127   Mat                B;
128 
129   PetscFunctionBegin;
130   if (reuse == MAT_INITIAL_MATRIX) {
131     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
132   } else if (reuse == MAT_REUSE_MATRIX) {
133     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
134   }
135   B = *newmat;
136 
137   B->boundtocpu = PETSC_FALSE;
138   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
139   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
140   ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr);
141 
142   B->ops->assemblyend           = MatAssemblyEnd_MPIAIJKokkos;
143   B->ops->mult                  = MatMult_MPIAIJKokkos;
144   B->ops->multadd               = MatMultAdd_MPIAIJKokkos;
145   B->ops->multtranspose         = MatMultTranspose_MPIAIJKokkos;
146   // Needs an efficient implementation of the COO preallocation routines
147   //B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
148 
149   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr);
150   PetscFunctionReturn(0);
151 }
152 
153 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
154 {
155   PetscErrorCode ierr;
156 
157   PetscFunctionBegin;
158   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
159   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
160   ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
161   PetscFunctionReturn(0);
162 }
163 
164 /*@C
165    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
166    (the default parallel PETSc format).  This matrix will ultimately pushed down
167    to Kokkos for calculations. For good matrix
168    assembly performance the user should preallocate the matrix storage by setting
169    the parameter nz (or the array nnz).  By setting these parameters accurately,
170    performance during matrix assembly can be increased by more than a factor of 50.
171 
172    Collective
173 
174    Input Parameters:
175 +  comm - MPI communicator, set to PETSC_COMM_SELF
176 .  m - number of rows
177 .  n - number of columns
178 .  nz - number of nonzeros per row (same for all rows)
179 -  nnz - array containing the number of nonzeros in the various rows
180          (possibly different for each row) or NULL
181 
182    Output Parameter:
183 .  A - the matrix
184 
185    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
186    MatXXXXSetPreallocation() paradigm instead of this routine directly.
187    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
188 
189    Notes:
190    If nnz is given then nz is ignored
191 
192    The AIJ format (also called the Yale sparse matrix format or
193    compressed row storage), is fully compatible with standard Fortran 77
194    storage.  That is, the stored row and column indices can begin at
195    either one (as in Fortran) or zero.  See the users' manual for details.
196 
197    Specify the preallocated storage with either nz or nnz (not both).
198    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
199    allocation.  For large problems you MUST preallocate memory or you
200    will get TERRIBLE performance, see the users' manual chapter on matrices.
201 
202    By default, this format uses inodes (identical nodes) when possible, to
203    improve numerical efficiency of matrix-vector products and solves. We
204    search for consecutive rows with the same nonzero structure, thereby
205    reusing matrix information to achieve increased efficiency.
206 
207    Level: intermediate
208 
209 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos
210 @*/
211 PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
212 {
213   PetscErrorCode ierr;
214   PetscMPIInt    size;
215 
216   PetscFunctionBegin;
217   ierr = MatCreate(comm,A);CHKERRQ(ierr);
218   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
219   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
220   if (size > 1) {
221     ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr);
222     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
223   } else {
224     ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
225     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
226   }
227   PetscFunctionReturn(0);
228 }
229 
230 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
231 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
232 {
233   #if defined(PETSC_USE_CTABLE)
234   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
235   #else
236   PetscMPIInt                size,rank;
237   MPI_Comm                   comm;
238   PetscErrorCode             ierr;
239   PetscSplitCSRDataStructure *d_mat=NULL;
240 
241   PetscFunctionBegin;
242   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
243   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
244   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
245   if (size == 1) {
246     ierr   = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr);
247   } else {
248     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
249     ierr   = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr);
250   }
251   // act like MatSetValues because not called on host
252   if (A->assembled) {
253     if (A->was_assembled) {
254       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
255     }
256     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
257   } else {
258     ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr);
259     // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
260   }
261   if (!d_mat) {
262     PetscSplitCSRDataStructure  h_mat; /* host container */
263     Mat_SeqAIJKokkos            *aijkokA;
264     Mat_SeqAIJ                  *jaca;
265     PetscInt                    n = A->rmap->n, nnz;
266     Mat                         Amat;
267     // create and copy
268     ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr);
269     if (size == 1) {
270       Amat = A;
271       jaca = (Mat_SeqAIJ*)A->data;
272       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
273       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
274       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
275       h_mat.offdiag.a = NULL;
276       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
277       aijkokA->i_uncompressed_d = NULL;
278       aijkokA->colmap_d = NULL;
279     } else {
280       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
281       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
282       PetscInt         ii;
283       Mat_SeqAIJKokkos *aijkokB;
284       Amat = aij->A;
285       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
286       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
287       aijkokA->i_uncompressed_d = NULL;
288       aijkokA->colmap_d = NULL;
289       jaca = (Mat_SeqAIJ*)aij->A->data;
290       if (!aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
291       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
292       // create colmap - this is ussually done (lazy) in MatSetValues
293       aij->donotstash = PETSC_TRUE;
294       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
295       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
296       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
297       aij->colmap[A->cmap->N] = -9;
298       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
299       for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
300       if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
301       // allocate B copy data
302       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
303       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
304       nnz = jacb->i[n];
305       if (jacb->compressedrow.use) {
306         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
307         aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k));
308         Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k);
309         h_mat.offdiag.i = aijkokB->i_uncompressed_d->data();
310         //err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
311         //err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
312       } else {
313          h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data();
314       }
315       h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data();
316       h_mat.offdiag.a = aijkokB->a_d.data();
317       {
318         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (aij->colmap,A->cmap->N);
319         aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k));
320         Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k);
321         h_mat.colmap = aijkokB->colmap_d->data();
322       }
323       //err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
324       //err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
325       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
326       h_mat.offdiag.n = n;
327     }
328     // allocate A copy data
329     nnz = jaca->i[n];
330     h_mat.diag.n = n;
331     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
332     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
333     if (jaca->compressedrow.use) {
334       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)");
335     } else {
336       h_mat.diag.i = (PetscInt*)aijkokA->i_d.data();
337     }
338     //h_mat.diag.j = aj;
339     //h_mat.diag.a = aa;
340     h_mat.diag.j = (PetscInt*)aijkokA->j_d.data();
341     h_mat.diag.a = aijkokA->a_d.data();
342     // copy pointers and metdata to device
343     ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr);
344     ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr);
345     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
346   }
347   *B = d_mat; // return it, set it in Mat, and set it up
348   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
349   PetscFunctionReturn(0);
350   #endif
351 }
352