xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision a3d0cf85f51e7fdec4f84d2d39b29bdf0224b99b)
1 #include "petsc/private/petscimpl.h"
2 #include <petscsystypes.h>
3 #include <petscerror.h>
4 #include <petscveckokkos.hpp>
5 
6 #include <Kokkos_Core.hpp>
7 #include <KokkosSparse_CrsMatrix.hpp>
8 #include <KokkosSparse_spmv.hpp>
9 #include <KokkosSparse_spmv.hpp>
10 #include <../src/mat/impls/aij/seq/aij.h>
11 
12 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
13 
14 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
15 
16 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
17 {
18   PetscErrorCode ierr;
19 
20   PetscFunctionBegin;
21   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
22   A->offloadmask = PETSC_OFFLOAD_CPU;
23   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
24   PetscFunctionReturn(0);
25 }
26 
27 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
28 {
29   Mat_SeqAIJ                *aijseq = (Mat_SeqAIJ*)A->data;
30   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
31   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
32 
33   PetscFunctionBegin;
34   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
35   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
36     delete aijkok;
37     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
38     aijkok->nonzerostate = A->nonzerostate;
39     A->spptr             = aijkok;
40   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
41     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
42   }
43   A->offloadmask = PETSC_OFFLOAD_BOTH;
44   PetscFunctionReturn(0);
45 }
46 
47 /* y = A x */
48 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
49 {
50   PetscErrorCode                   ierr;
51   Mat_SeqAIJKokkos                 *aijkok;
52   ConstPetscScalarViewDevice_t     xv;
53   PetscScalarViewDevice_t          yv;
54 
55   PetscFunctionBegin;
56   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
57   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
58   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
59   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
60   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
61   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
62   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
63   /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
64   ierr   = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
65   PetscFunctionReturn(0);
66 }
67 
68 /* y = A^T x */
69 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
70 {
71   PetscErrorCode                   ierr;
72   Mat_SeqAIJKokkos                 *aijkok;
73   ConstPetscScalarViewDevice_t     xv;
74   PetscScalarViewDevice_t          yv;
75 
76   PetscFunctionBegin;
77   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
78   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
79   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
80   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
81   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
82   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
83   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
84   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
85   PetscFunctionReturn(0);
86 }
87 
88 /* y = A^H x */
89 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
90 {
91   PetscErrorCode                   ierr;
92   Mat_SeqAIJKokkos                 *aijkok;
93   ConstPetscScalarViewDevice_t     xv;
94   PetscScalarViewDevice_t          yv;
95 
96   PetscFunctionBegin;
97   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
98   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
99   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
100   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
101   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
102   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
103   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
104   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
105   PetscFunctionReturn(0);
106 }
107 
108 /* z = A x + y */
109 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
110 {
111   PetscErrorCode                   ierr;
112   Mat_SeqAIJKokkos                 *aijkok;
113   ConstPetscScalarViewDevice_t     xv,yv;
114   PetscScalarViewDevice_t          zv;
115 
116   PetscFunctionBegin;
117   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
118   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
119   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
120   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
121   if (zz != yy) Kokkos::deep_copy(zv,yv);
122   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
123   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
124   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
125   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
126   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
127   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
128   PetscFunctionReturn(0);
129 }
130 
131 /* z = A^T x + y */
132 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
133 {
134   PetscErrorCode                   ierr;
135   Mat_SeqAIJKokkos                 *aijkok;
136   ConstPetscScalarViewDevice_t     xv,yv;
137   PetscScalarViewDevice_t          zv;
138 
139   PetscFunctionBegin;
140   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
141   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
142   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
143   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
144   if (zz != yy) Kokkos::deep_copy(zv,yv);
145   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
146   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
147   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
148   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
149   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
150   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
151   PetscFunctionReturn(0);
152 }
153 
154 /* z = A^H x + y */
155 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
156 {
157   PetscErrorCode                   ierr;
158   Mat_SeqAIJKokkos                 *aijkok;
159   ConstPetscScalarViewDevice_t     xv,yv;
160   PetscScalarViewDevice_t          zv;
161 
162   PetscFunctionBegin;
163   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
164   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
165   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
166   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
167   if (zz != yy) Kokkos::deep_copy(zv,yv);
168   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
169   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
170   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
171   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
172   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
173   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
174   PetscFunctionReturn(0);
175 }
176 
177 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
178 {
179   PetscErrorCode   ierr;
180   Mat              B;
181   Mat_SeqAIJ       *aij;
182 
183   PetscFunctionBegin;
184   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
185     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
186   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
187     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
188   }
189 
190   B    = *newmat;
191   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
192   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
193   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
194   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
195   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
196   aij  = (Mat_SeqAIJ*)B->data;
197   aij->inode.use = PETSC_FALSE;
198 
199   B->offloadmask = PETSC_OFFLOAD_CPU;
200   PetscFunctionReturn(0);
201 }
202 
203 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
204 {
205   PetscErrorCode ierr;
206 
207   PetscFunctionBegin;
208   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
209   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
210   PetscFunctionReturn(0);
211 }
212 
213 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
214 {
215   PetscErrorCode        ierr;
216   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
217 
218   PetscFunctionBegin;
219   delete aijkok;
220   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
221   PetscFunctionReturn(0);
222 }
223 
224 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
225 {
226   PetscErrorCode ierr;
227 
228   PetscFunctionBegin;
229   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
230   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
231   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
232   PetscFunctionReturn(0);
233 }
234 
235 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
236 {
237   PetscFunctionBegin;
238   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
239   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
240   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
241 
242   A->ops->mult                      = MatMult_SeqAIJKokkos;
243   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
244   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
245   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
246   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
247   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
248   PetscFunctionReturn(0);
249 }
250 
251 /* --------------------------------------------------------------------------------*/
252 /*C
253    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
254    (the default parallel PETSc format). This matrix will ultimately be handled by
255    Kokkos for calculations. For good matrix
256    assembly performance the user should preallocate the matrix storage by setting
257    the parameter nz (or the array nnz).  By setting these parameters accurately,
258    performance during matrix assembly can be increased by more than a factor of 50.
259 
260    Collective
261 
262    Input Parameters:
263 +  comm - MPI communicator, set to PETSC_COMM_SELF
264 .  m - number of rows
265 .  n - number of columns
266 .  nz - number of nonzeros per row (same for all rows)
267 -  nnz - array containing the number of nonzeros in the various rows
268          (possibly different for each row) or NULL
269 
270    Output Parameter:
271 .  A - the matrix
272 
273    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
274    MatXXXXSetPreallocation() paradgm instead of this routine directly.
275    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
276 
277    Notes:
278    If nnz is given then nz is ignored
279 
280    The AIJ format (also called the Yale sparse matrix format or
281    compressed row storage), is fully compatible with standard Fortran 77
282    storage.  That is, the stored row and column indices can begin at
283    either one (as in Fortran) or zero.  See the users' manual for details.
284 
285    Specify the preallocated storage with either nz or nnz (not both).
286    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
287    allocation.  For large problems you MUST preallocate memory or you
288    will get TERRIBLE performance, see the users' manual chapter on matrices.
289 
290    By default, this format uses inodes (identical nodes) when possible, to
291    improve numerical efficiency of matrix-vector products and solves. We
292    search for consecutive rows with the same nonzero structure, thereby
293    reusing matrix information to achieve increased efficiency.
294 
295    Level: intermediate
296 
297 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
298 @*/
299 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
300 {
301   PetscErrorCode ierr;
302 
303   PetscFunctionBegin;
304   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
305   ierr = MatCreate(comm,A);CHKERRQ(ierr);
306   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
307   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
308   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
309   PetscFunctionReturn(0);
310 }
311