xref: /petsc/src/mat/impls/aij/mpi/kokkos/mpiaijkok.kokkos.cxx (revision 6a29ce69e29aaf78b7325cac1ae663c7f0b163da)
1 #include <petscconf.h>
2 #include <../src/mat/impls/aij/mpi/mpiaij.h>
3 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
4 #include <petscveckokkos.hpp>
5 
6 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
7 {
8   PetscErrorCode   ierr;
9   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
10   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
11 
12   PetscFunctionBegin;
13   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
14   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
15     ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr);
16   }
17   if (aijkok && aijkok->device_mat_d.data()) {
18     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
19   }
20 
21   PetscFunctionReturn(0);
22 }
23 
24 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
25 {
26   PetscErrorCode ierr;
27   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
28 
29   PetscFunctionBegin;
30   ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr);
31   ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr);
32 #if defined(PETSC_USE_DEBUG)
33   if (d_nnz) {
34     PetscInt i;
35     for (i=0; i<mat->rmap->n; i++) {
36       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
37     }
38   }
39   if (o_nnz) {
40     PetscInt i;
41     for (i=0; i<mat->rmap->n; i++) {
42       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
43     }
44   }
45 #endif
46 #if defined(PETSC_USE_CTABLE)
47   ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
48 #else
49   ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
50 #endif
51   ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
52   ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
53   ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
54   /* Because the B will have been resized we simply destroy it and create a new one each time */
55   ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr);
56 
57   if (!mpiaij->A) {
58     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
59     ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr);
60     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr);
61   }
62   if (!mpiaij->B) {
63     PetscMPIInt size;
64     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRQ(ierr);
65     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
66     ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr);
67     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr);
68   }
69   ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
70   ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
71   ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
72   ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
73   mat->preallocated = PETSC_TRUE;
74   PetscFunctionReturn(0);
75 }
76 
77 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
78 {
79   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
80   PetscErrorCode ierr;
81   PetscInt       nt;
82 
83   PetscFunctionBegin;
84   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
85   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
86   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
87   ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
88   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
89   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
94 {
95   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
96   PetscErrorCode ierr;
97   PetscInt       nt;
98 
99   PetscFunctionBegin;
100   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
101   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
102   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
103   ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
104   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
105   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
106   PetscFunctionReturn(0);
107 }
108 
109 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
110 {
111   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
112   PetscErrorCode ierr;
113   PetscInt       nt;
114 
115   PetscFunctionBegin;
116   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
117   if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt);
118   ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
119   ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
120   ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
121   ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
122   PetscFunctionReturn(0);
123 }
124 
125 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
126 {
127   PetscErrorCode     ierr;
128   Mat                B;
129 
130   PetscFunctionBegin;
131   if (reuse == MAT_INITIAL_MATRIX) {
132     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
133   } else if (reuse == MAT_REUSE_MATRIX) {
134     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
135   }
136   B = *newmat;
137 
138   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
139   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
140   ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr);
141 
142   B->ops->assemblyend    = MatAssemblyEnd_MPIAIJKokkos;
143   B->ops->mult           = MatMult_MPIAIJKokkos;
144   B->ops->multadd        = MatMultAdd_MPIAIJKokkos;
145   B->ops->multtranspose  = MatMultTranspose_MPIAIJKokkos;
146 
147   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr);
148   PetscFunctionReturn(0);
149 }
150 
151 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
152 {
153   PetscErrorCode ierr;
154 
155   PetscFunctionBegin;
156   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
157   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
158   ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
159   PetscFunctionReturn(0);
160 }
161 
162 /*@C
163    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
164    (the default parallel PETSc format).  This matrix will ultimately pushed down
165    to Kokkos for calculations. For good matrix
166    assembly performance the user should preallocate the matrix storage by setting
167    the parameter nz (or the array nnz).  By setting these parameters accurately,
168    performance during matrix assembly can be increased by more than a factor of 50.
169 
170    Collective
171 
172    Input Parameters:
173 +  comm - MPI communicator, set to PETSC_COMM_SELF
174 .  m - number of rows
175 .  n - number of columns
176 .  nz - number of nonzeros per row (same for all rows)
177 -  nnz - array containing the number of nonzeros in the various rows
178          (possibly different for each row) or NULL
179 
180    Output Parameter:
181 .  A - the matrix
182 
183    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
184    MatXXXXSetPreallocation() paradigm instead of this routine directly.
185    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
186 
187    Notes:
188    If nnz is given then nz is ignored
189 
190    The AIJ format (also called the Yale sparse matrix format or
191    compressed row storage), is fully compatible with standard Fortran 77
192    storage.  That is, the stored row and column indices can begin at
193    either one (as in Fortran) or zero.  See the users' manual for details.
194 
195    Specify the preallocated storage with either nz or nnz (not both).
196    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
197    allocation.  For large problems you MUST preallocate memory or you
198    will get TERRIBLE performance, see the users' manual chapter on matrices.
199 
200    By default, this format uses inodes (identical nodes) when possible, to
201    improve numerical efficiency of matrix-vector products and solves. We
202    search for consecutive rows with the same nonzero structure, thereby
203    reusing matrix information to achieve increased efficiency.
204 
205    Level: intermediate
206 
207 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos
208 @*/
209 PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
210 {
211   PetscErrorCode ierr;
212   PetscMPIInt    size;
213 
214   PetscFunctionBegin;
215   ierr = MatCreate(comm,A);CHKERRQ(ierr);
216   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
217   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
218   if (size > 1) {
219     ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr);
220     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
221   } else {
222     ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
223     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
224   }
225   PetscFunctionReturn(0);
226 }
227 
228 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
229 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
230 {
231   #if defined(PETSC_USE_CTABLE)
232   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
233   #else
234   PetscMPIInt                size,rank;
235   MPI_Comm                   comm;
236   PetscErrorCode             ierr;
237   PetscSplitCSRDataStructure *d_mat=NULL;
238 
239   PetscFunctionBegin;
240   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
241   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
242   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
243   if (size == 1) {
244     ierr   = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr);
245   } else {
246     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
247     ierr   = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr);
248   }
249   // act like MatSetValues because not called on host
250   if (A->assembled) {
251     if (A->was_assembled) {
252       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
253     }
254     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
255   } else {
256     ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr);
257     // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
258   }
259   if (!d_mat) {
260     PetscSplitCSRDataStructure  h_mat; /* host container */
261     Mat_SeqAIJKokkos            *aijkokA;
262     Mat_SeqAIJ                  *jaca;
263     PetscInt                    n = A->rmap->n, nnz;
264     Mat                         Amat;
265     // create and copy
266     ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr);
267     if (size == 1) {
268       Amat = A;
269       jaca = (Mat_SeqAIJ*)A->data;
270       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
271       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
272       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
273       h_mat.offdiag.a = NULL;
274       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
275       aijkokA->i_uncompressed_d = NULL;
276       aijkokA->colmap_d = NULL;
277     } else {
278       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
279       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
280       PetscInt         ii;
281       Mat_SeqAIJKokkos *aijkokB;
282       Amat = aij->A;
283       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
284       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
285       aijkokA->i_uncompressed_d = NULL;
286       aijkokA->colmap_d = NULL;
287       jaca = (Mat_SeqAIJ*)aij->A->data;
288       if (!aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
289       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
290       // create colmap - this is ussually done (lazy) in MatSetValues
291       aij->donotstash = PETSC_TRUE;
292       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
293       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
294       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
295       aij->colmap[A->cmap->N] = -9;
296       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
297       for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
298       if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
299       // allocate B copy data
300       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
301       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
302       nnz = jacb->i[n];
303       if (jacb->compressedrow.use) {
304         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
305         aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_i_k));
306         Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k);
307         h_mat.offdiag.i = aijkokB->i_uncompressed_d->data();
308         //err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
309         //err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
310       } else {
311          h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data();
312       }
313       h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data();
314       h_mat.offdiag.a = aijkokB->a_d.data();
315       {
316         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (aij->colmap,A->cmap->N);
317         aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_colmap_k));
318         Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k);
319         h_mat.colmap = aijkokB->colmap_d->data();
320       }
321       //err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
322       //err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
323       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
324       h_mat.offdiag.n = n;
325     }
326     // allocate A copy data
327     nnz = jaca->i[n];
328     h_mat.diag.n = n;
329     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
330     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
331     if (jaca->compressedrow.use) {
332       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)");
333     } else {
334       h_mat.diag.i = (PetscInt*)aijkokA->i_d.data();
335     }
336     //h_mat.diag.j = aj;
337     //h_mat.diag.a = aa;
338     h_mat.diag.j = (PetscInt*)aijkokA->j_d.data();
339     h_mat.diag.a = aijkokA->a_d.data();
340     // copy pointers and metdata to device
341     ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr);
342     ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr);
343     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
344   }
345   *B = d_mat; // return it, set it in Mat, and set it up
346   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
347   PetscFunctionReturn(0);
348   #endif
349 }
350