xref: /petsc/src/mat/impls/aij/mpi/kokkos/mpiaijkok.kokkos.cxx (revision 0db4d2e0165a0ea245cec0549c2de0bb7b39e2c0)
1 #include <petscconf.h>
2 #include <../src/mat/impls/aij/mpi/mpiaij.h>
3 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
4 #include <petscveckokkos.hpp>
5 
6 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
7 {
8   PetscErrorCode   ierr;
9   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
10   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
11 
12   PetscFunctionBegin;
13   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
14   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
15     ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr);
16   }
17   if (aijkok && aijkok->device_mat_d.data()) {
18     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
19   }
20 
21   PetscFunctionReturn(0);
22 }
23 
24 PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
25 {
26   PetscErrorCode     ierr;
27   PetscInt           i;
28   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)mat->data;
29 
30   PetscFunctionBegin;
31   ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr);
32   ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr);
33   if (d_nnz) {
34     for (i=0; i<mat->rmap->n; i++) {
35       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
36     }
37   }
38   if (o_nnz) {
39     for (i=0; i<mat->rmap->n; i++) {
40       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
41     }
42   }
43   if (!mat->preallocated) {
44     /* Explicitly create 2 MATSEQAIJKOKKOS matrices. */
45     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
46     ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr);
47     ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
48     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr);
49     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
50     ierr = MatSetSizes(mpiaij->B,mat->rmap->n,mat->cmap->N,mat->rmap->n,mat->cmap->N);CHKERRQ(ierr);
51     ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
52     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr);
53   }
54   ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
55   ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
56 
57   mat->preallocated = PETSC_TRUE;
58   PetscFunctionReturn(0);
59 }
60 
61 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
62 {
63   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
64   PetscErrorCode ierr;
65   PetscInt       nt;
66 
67   PetscFunctionBegin;
68   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
69   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
70   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
71   ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
72   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
73   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
74   PetscFunctionReturn(0);
75 }
76 
77 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
78 {
79   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
80   PetscErrorCode ierr;
81   PetscInt       nt;
82 
83   PetscFunctionBegin;
84   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
85   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
86   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
87   ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
88   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
89   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
94 {
95   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
96   PetscErrorCode ierr;
97   PetscInt       nt;
98 
99   PetscFunctionBegin;
100   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
101   if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt);
102   ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
103   ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
104   ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
105   ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
106   PetscFunctionReturn(0);
107 }
108 
109 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
110 {
111   PetscErrorCode     ierr;
112   Mat                B;
113 
114   PetscFunctionBegin;
115   if (reuse == MAT_INITIAL_MATRIX) {
116     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
117   } else if (reuse == MAT_REUSE_MATRIX) {
118     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
119   }
120   B = *newmat;
121 
122   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
123   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
124   ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr);
125 
126   B->ops->assemblyend    = MatAssemblyEnd_MPIAIJKokkos;
127   B->ops->mult           = MatMult_MPIAIJKokkos;
128   B->ops->multadd        = MatMultAdd_MPIAIJKokkos;
129   B->ops->multtranspose  = MatMultTranspose_MPIAIJKokkos;
130 
131   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr);
132   PetscFunctionReturn(0);
133 }
134 
135 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
136 {
137   PetscErrorCode ierr;
138 
139   PetscFunctionBegin;
140   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
141   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
142   ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
143   PetscFunctionReturn(0);
144 }
145 
146 /*@C
147    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
148    (the default parallel PETSc format).  This matrix will ultimately pushed down
149    to Kokkos for calculations. For good matrix
150    assembly performance the user should preallocate the matrix storage by setting
151    the parameter nz (or the array nnz).  By setting these parameters accurately,
152    performance during matrix assembly can be increased by more than a factor of 50.
153 
154    Collective
155 
156    Input Parameters:
157 +  comm - MPI communicator, set to PETSC_COMM_SELF
158 .  m - number of rows
159 .  n - number of columns
160 .  nz - number of nonzeros per row (same for all rows)
161 -  nnz - array containing the number of nonzeros in the various rows
162          (possibly different for each row) or NULL
163 
164    Output Parameter:
165 .  A - the matrix
166 
167    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
168    MatXXXXSetPreallocation() paradigm instead of this routine directly.
169    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
170 
171    Notes:
172    If nnz is given then nz is ignored
173 
174    The AIJ format (also called the Yale sparse matrix format or
175    compressed row storage), is fully compatible with standard Fortran 77
176    storage.  That is, the stored row and column indices can begin at
177    either one (as in Fortran) or zero.  See the users' manual for details.
178 
179    Specify the preallocated storage with either nz or nnz (not both).
180    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
181    allocation.  For large problems you MUST preallocate memory or you
182    will get TERRIBLE performance, see the users' manual chapter on matrices.
183 
184    By default, this format uses inodes (identical nodes) when possible, to
185    improve numerical efficiency of matrix-vector products and solves. We
186    search for consecutive rows with the same nonzero structure, thereby
187    reusing matrix information to achieve increased efficiency.
188 
189    Level: intermediate
190 
191 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos
192 @*/
193 PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
194 {
195   PetscErrorCode ierr;
196   PetscMPIInt    size;
197 
198   PetscFunctionBegin;
199   ierr = MatCreate(comm,A);CHKERRQ(ierr);
200   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
201   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
202   if (size > 1) {
203     ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr);
204     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
205   } else {
206     ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
207     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
208   }
209   PetscFunctionReturn(0);
210 }
211 
212 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
213 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
214 {
215   #if defined(PETSC_USE_CTABLE)
216   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
217   #else
218   PetscMPIInt                size,rank;
219   MPI_Comm                   comm;
220   PetscErrorCode             ierr;
221   PetscSplitCSRDataStructure *d_mat=NULL;
222 
223   PetscFunctionBegin;
224   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
225   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
226   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
227   if (size == 1) {
228     ierr   = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr);
229   } else {
230     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
231     ierr   = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr);
232   }
233   // act like MatSetValues because not called on host
234   if (A->assembled) {
235     if (A->was_assembled) {
236       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
237     }
238     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
239   } else {
240     ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr);
241     // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
242   }
243   if (!d_mat) {
244     PetscSplitCSRDataStructure  h_mat; /* host container */
245     Mat_SeqAIJKokkos            *aijkokA;
246     Mat_SeqAIJ                  *jaca;
247     PetscInt                    n = A->rmap->n, nnz;
248     Mat                         Amat;
249     // create and copy
250     ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr);
251     if (size == 1) {
252       Amat = A;
253       jaca = (Mat_SeqAIJ*)A->data;
254       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
255       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
256       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
257       h_mat.offdiag.a = NULL;
258       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
259       aijkokA->i_uncompressed_d = NULL;
260       aijkokA->colmap_d = NULL;
261     } else {
262       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
263       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
264       PetscInt         ii;
265       Mat_SeqAIJKokkos *aijkokB;
266       Amat = aij->A;
267       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
268       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
269       aijkokA->i_uncompressed_d = NULL;
270       aijkokA->colmap_d = NULL;
271       jaca = (Mat_SeqAIJ*)aij->A->data;
272       if (!aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
273       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
274       // create colmap - this is ussually done (lazy) in MatSetValues
275       aij->donotstash = PETSC_TRUE;
276       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
277       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
278       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
279       aij->colmap[A->cmap->N] = -9;
280       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
281       for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
282       if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
283       // allocate B copy data
284       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
285       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
286       nnz = jacb->i[n];
287       if (jacb->compressedrow.use) {
288         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
289         aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_i_k));
290         Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k);
291         h_mat.offdiag.i = aijkokB->i_uncompressed_d->data();
292         //err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
293         //err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
294       } else {
295          h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data();
296       }
297       h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data();
298       h_mat.offdiag.a = aijkokB->a_d.data();
299       {
300         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (aij->colmap,A->cmap->N);
301         aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_colmap_k));
302         Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k);
303         h_mat.colmap = aijkokB->colmap_d->data();
304       }
305       //err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
306       //err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
307       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
308       h_mat.offdiag.n = n;
309     }
310     // allocate A copy data
311     nnz = jaca->i[n];
312     h_mat.diag.n = n;
313     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
314     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
315     if (jaca->compressedrow.use) {
316       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)");
317     } else {
318       h_mat.diag.i = (PetscInt*)aijkokA->i_d.data();
319     }
320     //h_mat.diag.j = aj;
321     //h_mat.diag.a = aa;
322     h_mat.diag.j = (PetscInt*)aijkokA->j_d.data();
323     h_mat.diag.a = aijkokA->a_d.data();
324     // copy pointers and metdata to device
325     ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr);
326     ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr);
327     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
328   }
329   *B = d_mat; // return it, set it in Mat, and set it up
330   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
331   PetscFunctionReturn(0);
332   #endif
333 }
334