xref: /petsc/src/mat/impls/aij/mpi/kokkos/mpiaijkok.kokkos.cxx (revision b2b7ea0180aec965afc953541bac0ca6b1e2155a)
1 #include <petscconf.h>
2 #include <../src/mat/impls/aij/mpi/mpiaij.h>
3 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
4 #include <petscveckokkos.hpp>
5 
6 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
7 {
8   PetscErrorCode   ierr;
9   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
10   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
11 
12   PetscFunctionBegin;
13   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
14   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
15     ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr);
16   }
17   if (aijkok && aijkok->device_mat_d.data()) {
18     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
19   }
20 
21   PetscFunctionReturn(0);
22 }
23 
24 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
25 {
26   PetscErrorCode ierr;
27   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
28 
29   PetscFunctionBegin;
30   ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr);
31   ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr);
32 #if defined(PETSC_USE_DEBUG)
33   if (d_nnz) {
34     PetscInt i;
35     for (i=0; i<mat->rmap->n; i++) {
36       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
37     }
38   }
39   if (o_nnz) {
40     PetscInt i;
41     for (i=0; i<mat->rmap->n; i++) {
42       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
43     }
44   }
45 #endif
46 #if defined(PETSC_USE_CTABLE)
47   ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
48 #else
49   ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
50 #endif
51   ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
52   ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
53   ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
54   /* Because the B will have been resized we simply destroy it and create a new one each time */
55   ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr);
56 
57   if (!mpiaij->A) {
58     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
59     ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr);
60     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr);
61   }
62   if (!mpiaij->B) {
63     PetscMPIInt size;
64     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRQ(ierr);
65     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
66     ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr);
67     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr);
68   }
69   ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
70   ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
71   ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
72   ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
73   mat->preallocated = PETSC_TRUE;
74   PetscFunctionReturn(0);
75 }
76 
77 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
78 {
79   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
80   PetscErrorCode ierr;
81   PetscInt       nt;
82 
83   PetscFunctionBegin;
84   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
85   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
86   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
87   ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
88   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
89   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
94 {
95   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
96   PetscErrorCode ierr;
97   PetscInt       nt;
98 
99   PetscFunctionBegin;
100   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
101   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
102   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
103   ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
104   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
105   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
106   PetscFunctionReturn(0);
107 }
108 
109 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
110 {
111   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
112   PetscErrorCode ierr;
113   PetscInt       nt;
114 
115   PetscFunctionBegin;
116   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
117   if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt);
118   ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
119   ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
120   ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
121   ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
122   PetscFunctionReturn(0);
123 }
124 
125 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
126 {
127   PetscErrorCode     ierr;
128   Mat                B;
129 
130   PetscFunctionBegin;
131   if (reuse == MAT_INITIAL_MATRIX) {
132     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
133   } else if (reuse == MAT_REUSE_MATRIX) {
134     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
135   }
136   B = *newmat;
137 
138   B->boundtocpu = PETSC_FALSE;
139   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
140   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
141   ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr);
142 
143   B->ops->assemblyend           = MatAssemblyEnd_MPIAIJKokkos;
144   B->ops->mult                  = MatMult_MPIAIJKokkos;
145   B->ops->multadd               = MatMultAdd_MPIAIJKokkos;
146   B->ops->multtranspose         = MatMultTranspose_MPIAIJKokkos;
147   // Needs an efficient implementation of the COO preallocation routines
148   //B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
149 
150   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr);
151   PetscFunctionReturn(0);
152 }
153 
154 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
155 {
156   PetscErrorCode ierr;
157 
158   PetscFunctionBegin;
159   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
160   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
161   ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
162   PetscFunctionReturn(0);
163 }
164 
165 /*@C
166    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
167    (the default parallel PETSc format).  This matrix will ultimately pushed down
168    to Kokkos for calculations. For good matrix
169    assembly performance the user should preallocate the matrix storage by setting
170    the parameter nz (or the array nnz).  By setting these parameters accurately,
171    performance during matrix assembly can be increased by more than a factor of 50.
172 
173    Collective
174 
175    Input Parameters:
176 +  comm - MPI communicator, set to PETSC_COMM_SELF
177 .  m - number of rows
178 .  n - number of columns
179 .  nz - number of nonzeros per row (same for all rows)
180 -  nnz - array containing the number of nonzeros in the various rows
181          (possibly different for each row) or NULL
182 
183    Output Parameter:
184 .  A - the matrix
185 
186    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
187    MatXXXXSetPreallocation() paradigm instead of this routine directly.
188    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
189 
190    Notes:
191    If nnz is given then nz is ignored
192 
193    The AIJ format (also called the Yale sparse matrix format or
194    compressed row storage), is fully compatible with standard Fortran 77
195    storage.  That is, the stored row and column indices can begin at
196    either one (as in Fortran) or zero.  See the users' manual for details.
197 
198    Specify the preallocated storage with either nz or nnz (not both).
199    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
200    allocation.  For large problems you MUST preallocate memory or you
201    will get TERRIBLE performance, see the users' manual chapter on matrices.
202 
203    By default, this format uses inodes (identical nodes) when possible, to
204    improve numerical efficiency of matrix-vector products and solves. We
205    search for consecutive rows with the same nonzero structure, thereby
206    reusing matrix information to achieve increased efficiency.
207 
208    Level: intermediate
209 
210 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos
211 @*/
212 PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
213 {
214   PetscErrorCode ierr;
215   PetscMPIInt    size;
216 
217   PetscFunctionBegin;
218   ierr = MatCreate(comm,A);CHKERRQ(ierr);
219   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
220   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
221   if (size > 1) {
222     ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr);
223     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
224   } else {
225     ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
226     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
227   }
228   PetscFunctionReturn(0);
229 }
230 
231 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
232 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
233 {
234   #if defined(PETSC_USE_CTABLE)
235   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
236   #else
237   PetscMPIInt                size,rank;
238   MPI_Comm                   comm;
239   PetscErrorCode             ierr;
240   PetscSplitCSRDataStructure *d_mat=NULL;
241 
242   PetscFunctionBegin;
243   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
244   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
245   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
246   if (size == 1) {
247     ierr   = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr);
248   } else {
249     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
250     ierr   = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr);
251   }
252   // act like MatSetValues because not called on host
253   if (A->assembled) {
254     if (A->was_assembled) {
255       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
256     }
257     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
258   } else {
259     ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr);
260     // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
261   }
262   if (!d_mat) {
263     PetscSplitCSRDataStructure  h_mat; /* host container */
264     Mat_SeqAIJKokkos            *aijkokA;
265     Mat_SeqAIJ                  *jaca;
266     PetscInt                    n = A->rmap->n, nnz;
267     Mat                         Amat;
268     // create and copy
269     ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr);
270     if (size == 1) {
271       Amat = A;
272       jaca = (Mat_SeqAIJ*)A->data;
273       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
274       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
275       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
276       h_mat.offdiag.a = NULL;
277       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
278       aijkokA->i_uncompressed_d = NULL;
279       aijkokA->colmap_d = NULL;
280     } else {
281       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
282       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
283       PetscInt         ii;
284       Mat_SeqAIJKokkos *aijkokB;
285       Amat = aij->A;
286       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
287       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
288       aijkokA->i_uncompressed_d = NULL;
289       aijkokA->colmap_d = NULL;
290       jaca = (Mat_SeqAIJ*)aij->A->data;
291       if (!aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
292       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
293       // create colmap - this is ussually done (lazy) in MatSetValues
294       aij->donotstash = PETSC_TRUE;
295       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
296       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
297       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
298       aij->colmap[A->cmap->N] = -9;
299       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
300       for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
301       if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
302       // allocate B copy data
303       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
304       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
305       nnz = jacb->i[n];
306       if (jacb->compressedrow.use) {
307         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
308         aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_i_k));
309         Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k);
310         h_mat.offdiag.i = aijkokB->i_uncompressed_d->data();
311         //err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
312         //err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
313       } else {
314          h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data();
315       }
316       h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data();
317       h_mat.offdiag.a = aijkokB->a_d.data();
318       {
319         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (aij->colmap,A->cmap->N);
320         aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_colmap_k));
321         Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k);
322         h_mat.colmap = aijkokB->colmap_d->data();
323       }
324       //err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
325       //err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
326       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
327       h_mat.offdiag.n = n;
328     }
329     // allocate A copy data
330     nnz = jaca->i[n];
331     h_mat.diag.n = n;
332     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
333     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
334     if (jaca->compressedrow.use) {
335       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)");
336     } else {
337       h_mat.diag.i = (PetscInt*)aijkokA->i_d.data();
338     }
339     //h_mat.diag.j = aj;
340     //h_mat.diag.a = aa;
341     h_mat.diag.j = (PetscInt*)aijkokA->j_d.data();
342     h_mat.diag.a = aijkokA->a_d.data();
343     // copy pointers and metdata to device
344     ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr);
345     ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr);
346     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
347   }
348   *B = d_mat; // return it, set it in Mat, and set it up
349   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
350   PetscFunctionReturn(0);
351   #endif
352 }
353