xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 569e28a7502c934530efa6903dfdf087ebd3ebcd)
1 #include "petsc/private/petscimpl.h"
2 #include <petscsystypes.h>
3 #include <petscerror.h>
4 #include <petscveckokkos.hpp>
5 
6 #include <Kokkos_Core.hpp>
7 #include <KokkosSparse_CrsMatrix.hpp>
8 #include <KokkosSparse_spmv.hpp>
9 #include <KokkosSparse_spmv.hpp>
10 #include <../src/mat/impls/aij/seq/aij.h>
11 
12 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
13 
14 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
15 
16 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
17 {
18   PetscErrorCode ierr;
19 
20   PetscFunctionBegin;
21   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
22   A->offloadmask = PETSC_OFFLOAD_CPU;
23   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
24   PetscFunctionReturn(0);
25 }
26 
27 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
28 {
29   Mat_SeqAIJ                *aijseq = (Mat_SeqAIJ*)A->data;
30   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
31   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
32 
33   PetscFunctionBegin;
34   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
35   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
36     delete aijkok;
37     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
38     aijkok->nonzerostate = A->nonzerostate;
39     A->spptr             = aijkok;
40   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
41     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
42   }
43   A->offloadmask = PETSC_OFFLOAD_BOTH;
44   PetscFunctionReturn(0);
45 }
46 
47 /* y = A x */
48 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
49 {
50   PetscErrorCode                   ierr;
51   Mat_SeqAIJKokkos                 *aijkok;
52   ConstPetscScalarViewDevice_t     xv;
53   PetscScalarViewDevice_t          yv;
54 
55   PetscFunctionBegin;
56   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
57   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
58   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
59   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
60   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
61   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
62   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
63   /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
64   ierr   = WaitForKokkos();CHKERRQ(ierr);
65   ierr   = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
66   PetscFunctionReturn(0);
67 }
68 
69 /* y = A^T x */
70 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
71 {
72   PetscErrorCode                   ierr;
73   Mat_SeqAIJKokkos                 *aijkok;
74   ConstPetscScalarViewDevice_t     xv;
75   PetscScalarViewDevice_t          yv;
76 
77   PetscFunctionBegin;
78   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
79   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
80   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
81   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
82   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
83   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
84   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
85   ierr = WaitForKokkos();CHKERRQ(ierr);
86   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
87   PetscFunctionReturn(0);
88 }
89 
90 /* y = A^H x */
91 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
92 {
93   PetscErrorCode                   ierr;
94   Mat_SeqAIJKokkos                 *aijkok;
95   ConstPetscScalarViewDevice_t     xv;
96   PetscScalarViewDevice_t          yv;
97 
98   PetscFunctionBegin;
99   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
100   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
101   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
102   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
103   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
104   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
105   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
106   ierr = WaitForKokkos();CHKERRQ(ierr);
107   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
108   PetscFunctionReturn(0);
109 }
110 
111 /* z = A x + y */
112 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
113 {
114   PetscErrorCode                   ierr;
115   Mat_SeqAIJKokkos                 *aijkok;
116   ConstPetscScalarViewDevice_t     xv,yv;
117   PetscScalarViewDevice_t          zv;
118 
119   PetscFunctionBegin;
120   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
121   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
122   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
123   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
124   if (zz != yy) Kokkos::deep_copy(zv,yv);
125   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
126   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
127   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
128   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
129   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
130   ierr = WaitForKokkos();CHKERRQ(ierr);
131   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
132   PetscFunctionReturn(0);
133 }
134 
135 /* z = A^T x + y */
136 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
137 {
138   PetscErrorCode                   ierr;
139   Mat_SeqAIJKokkos                 *aijkok;
140   ConstPetscScalarViewDevice_t     xv,yv;
141   PetscScalarViewDevice_t          zv;
142 
143   PetscFunctionBegin;
144   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
145   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
146   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
147   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
148   if (zz != yy) Kokkos::deep_copy(zv,yv);
149   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
150   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
151   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
152   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
153   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
154   ierr = WaitForKokkos();CHKERRQ(ierr);
155   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
156   PetscFunctionReturn(0);
157 }
158 
159 /* z = A^H x + y */
160 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
161 {
162   PetscErrorCode                   ierr;
163   Mat_SeqAIJKokkos                 *aijkok;
164   ConstPetscScalarViewDevice_t     xv,yv;
165   PetscScalarViewDevice_t          zv;
166 
167   PetscFunctionBegin;
168   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
169   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
170   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
171   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
172   if (zz != yy) Kokkos::deep_copy(zv,yv);
173   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
174   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
175   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
176   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
177   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
178   ierr = WaitForKokkos();CHKERRQ(ierr);
179   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
180   PetscFunctionReturn(0);
181 }
182 
183 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
184 {
185   PetscErrorCode   ierr;
186   Mat              B;
187   Mat_SeqAIJ       *aij;
188 
189   PetscFunctionBegin;
190   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
191     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
192   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
193     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
194   }
195 
196   B    = *newmat;
197   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
198   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
199   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
200   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
201   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
202   aij  = (Mat_SeqAIJ*)B->data;
203   aij->inode.use = PETSC_FALSE;
204 
205   B->offloadmask = PETSC_OFFLOAD_CPU;
206   PetscFunctionReturn(0);
207 }
208 
209 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
210 {
211   PetscErrorCode ierr;
212 
213   PetscFunctionBegin;
214   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
215   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
216   PetscFunctionReturn(0);
217 }
218 
219 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
220 {
221   PetscErrorCode        ierr;
222   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
223 
224   PetscFunctionBegin;
225   delete aijkok;
226   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
227   PetscFunctionReturn(0);
228 }
229 
230 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
231 {
232   PetscErrorCode ierr;
233 
234   PetscFunctionBegin;
235   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
236   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
237   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
238   PetscFunctionReturn(0);
239 }
240 
241 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
242 {
243   PetscFunctionBegin;
244   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
245   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
246   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
247 
248   A->ops->mult                      = MatMult_SeqAIJKokkos;
249   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
250   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
251   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
252   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
253   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
254   PetscFunctionReturn(0);
255 }
256 
257 /* --------------------------------------------------------------------------------*/
258 /*C
259    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
260    (the default parallel PETSc format). This matrix will ultimately be handled by
261    Kokkos for calculations. For good matrix
262    assembly performance the user should preallocate the matrix storage by setting
263    the parameter nz (or the array nnz).  By setting these parameters accurately,
264    performance during matrix assembly can be increased by more than a factor of 50.
265 
266    Collective
267 
268    Input Parameters:
269 +  comm - MPI communicator, set to PETSC_COMM_SELF
270 .  m - number of rows
271 .  n - number of columns
272 .  nz - number of nonzeros per row (same for all rows)
273 -  nnz - array containing the number of nonzeros in the various rows
274          (possibly different for each row) or NULL
275 
276    Output Parameter:
277 .  A - the matrix
278 
279    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
280    MatXXXXSetPreallocation() paradgm instead of this routine directly.
281    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
282 
283    Notes:
284    If nnz is given then nz is ignored
285 
286    The AIJ format (also called the Yale sparse matrix format or
287    compressed row storage), is fully compatible with standard Fortran 77
288    storage.  That is, the stored row and column indices can begin at
289    either one (as in Fortran) or zero.  See the users' manual for details.
290 
291    Specify the preallocated storage with either nz or nnz (not both).
292    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
293    allocation.  For large problems you MUST preallocate memory or you
294    will get TERRIBLE performance, see the users' manual chapter on matrices.
295 
296    By default, this format uses inodes (identical nodes) when possible, to
297    improve numerical efficiency of matrix-vector products and solves. We
298    search for consecutive rows with the same nonzero structure, thereby
299    reusing matrix information to achieve increased efficiency.
300 
301    Level: intermediate
302 
303 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
304 @*/
305 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
306 {
307   PetscErrorCode ierr;
308 
309   PetscFunctionBegin;
310   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
311   ierr = MatCreate(comm,A);CHKERRQ(ierr);
312   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
313   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
314   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
315   PetscFunctionReturn(0);
316 }
317