xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 56ea09ce3fa184c22e0affd92b2d4e46ff4a1b47)
1 #include "petsc/private/petscimpl.h"
2 #include <petscsystypes.h>
3 #include <petscerror.h>
4 #include <petscvec.hpp>
5 
6 #include <Kokkos_Core.hpp>
7 #include <KokkosSparse_CrsMatrix.hpp>
8 #include <KokkosSparse_spmv.hpp>
9 #include <KokkosSparse_spmv.hpp>
10 #include <../src/mat/impls/aij/seq/aij.h>
11 
12 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
13 
14 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
15 
16 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
17 {
18   PetscErrorCode ierr;
19 
20   PetscFunctionBegin;
21   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
22   A->offloadmask = PETSC_OFFLOAD_CPU;
23   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
24   PetscFunctionReturn(0);
25 }
26 
27 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
28 {
29   Mat_SeqAIJ                *aijseq = (Mat_SeqAIJ*)A->data;
30   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
31   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
32 
33   PetscFunctionBegin;
34   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
35   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
36     delete aijkok;
37     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
38     aijkok->nonzerostate = A->nonzerostate;
39     A->spptr             = aijkok;
40   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
41     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
42   }
43   A->offloadmask = PETSC_OFFLOAD_BOTH;
44   PetscFunctionReturn(0);
45 }
46 
47 /* y = A x */
48 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
49 {
50   PetscErrorCode                   ierr;
51   Mat_SeqAIJKokkos                 *aijkok;
52   ConstPetscScalarViewDevice_t     xv;
53   PetscScalarViewDevice_t          yv;
54 
55   PetscFunctionBegin;
56   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
57   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
58   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
59   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
60   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
61   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
62   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
63   PetscFunctionReturn(0);
64 }
65 
66 /* y = A^T x */
67 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
68 {
69   PetscErrorCode                   ierr;
70   Mat_SeqAIJKokkos                 *aijkok;
71   ConstPetscScalarViewDevice_t     xv;
72   PetscScalarViewDevice_t          yv;
73 
74   PetscFunctionBegin;
75   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
76   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
77   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
78   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
79   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
80   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
81   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
82   PetscFunctionReturn(0);
83 }
84 
85 /* y = A^H x */
86 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
87 {
88   PetscErrorCode                   ierr;
89   Mat_SeqAIJKokkos                 *aijkok;
90   ConstPetscScalarViewDevice_t     xv;
91   PetscScalarViewDevice_t          yv;
92 
93   PetscFunctionBegin;
94   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
95   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
96   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
97   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
98   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
99   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
100   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
101   PetscFunctionReturn(0);
102 }
103 
104 /* z = A x + y */
105 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
106 {
107   PetscErrorCode                   ierr;
108   Mat_SeqAIJKokkos                 *aijkok;
109   ConstPetscScalarViewDevice_t     xv,yv;
110   PetscScalarViewDevice_t          zv;
111 
112   PetscFunctionBegin;
113   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
114   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
115   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
116   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
117   if (zz != yy) Kokkos::deep_copy(zv,yv);
118   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
120   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
121   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
122   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
123   PetscFunctionReturn(0);
124 }
125 
126 /* z = A^T x + y */
127 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
128 {
129   PetscErrorCode                   ierr;
130   Mat_SeqAIJKokkos                 *aijkok;
131   ConstPetscScalarViewDevice_t     xv,yv;
132   PetscScalarViewDevice_t          zv;
133 
134   PetscFunctionBegin;
135   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
136   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
137   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
138   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
139   if (zz != yy) Kokkos::deep_copy(zv,yv);
140   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
141   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
142   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
143   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
144   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
145   PetscFunctionReturn(0);
146 }
147 
148 /* z = A^H x + y */
149 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
150 {
151   PetscErrorCode                   ierr;
152   Mat_SeqAIJKokkos                 *aijkok;
153   ConstPetscScalarViewDevice_t     xv,yv;
154   PetscScalarViewDevice_t          zv;
155 
156   PetscFunctionBegin;
157   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
158   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
159   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
160   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
161   if (zz != yy) Kokkos::deep_copy(zv,yv);
162   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
163   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
164   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
165   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
166   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
167   PetscFunctionReturn(0);
168 }
169 
170 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
171 {
172   PetscErrorCode   ierr;
173   Mat              B;
174   Mat_SeqAIJ       *aij;
175 
176   PetscFunctionBegin;
177   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
178     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
179   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
180     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
181   }
182 
183   B    = *newmat;
184   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
185   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
186   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
187   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
188   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
189   aij  = (Mat_SeqAIJ*)B->data;
190   aij->inode.use = PETSC_FALSE;
191 
192   B->offloadmask = PETSC_OFFLOAD_CPU;
193   PetscFunctionReturn(0);
194 }
195 
196 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
197 {
198   PetscErrorCode ierr;
199 
200   PetscFunctionBegin;
201   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
202   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
203   PetscFunctionReturn(0);
204 }
205 
206 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
207 {
208   PetscErrorCode        ierr;
209   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
210 
211   PetscFunctionBegin;
212   delete aijkok;
213   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
214   PetscFunctionReturn(0);
215 }
216 
217 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
218 {
219   PetscErrorCode ierr;
220 
221   PetscFunctionBegin;
222   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
223   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
224   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
229 {
230   PetscFunctionBegin;
231   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
232   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
233   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
234 
235   A->ops->mult                      = MatMult_SeqAIJKokkos;
236   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
237   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
238   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
239   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
240   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
241   PetscFunctionReturn(0);
242 }
243 
244 /* --------------------------------------------------------------------------------*/
245 /*C
246    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
247    (the default parallel PETSc format). This matrix will ultimately be handled by
248    Kokkos for calculations. For good matrix
249    assembly performance the user should preallocate the matrix storage by setting
250    the parameter nz (or the array nnz).  By setting these parameters accurately,
251    performance during matrix assembly can be increased by more than a factor of 50.
252 
253    Collective
254 
255    Input Parameters:
256 +  comm - MPI communicator, set to PETSC_COMM_SELF
257 .  m - number of rows
258 .  n - number of columns
259 .  nz - number of nonzeros per row (same for all rows)
260 -  nnz - array containing the number of nonzeros in the various rows
261          (possibly different for each row) or NULL
262 
263    Output Parameter:
264 .  A - the matrix
265 
266    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
267    MatXXXXSetPreallocation() paradgm instead of this routine directly.
268    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
269 
270    Notes:
271    If nnz is given then nz is ignored
272 
273    The AIJ format (also called the Yale sparse matrix format or
274    compressed row storage), is fully compatible with standard Fortran 77
275    storage.  That is, the stored row and column indices can begin at
276    either one (as in Fortran) or zero.  See the users' manual for details.
277 
278    Specify the preallocated storage with either nz or nnz (not both).
279    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
280    allocation.  For large problems you MUST preallocate memory or you
281    will get TERRIBLE performance, see the users' manual chapter on matrices.
282 
283    By default, this format uses inodes (identical nodes) when possible, to
284    improve numerical efficiency of matrix-vector products and solves. We
285    search for consecutive rows with the same nonzero structure, thereby
286    reusing matrix information to achieve increased efficiency.
287 
288    Level: intermediate
289 
290 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
291 @*/
292 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
293 {
294   PetscErrorCode ierr;
295 
296   PetscFunctionBegin;
297   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
298   ierr = MatCreate(comm,A);CHKERRQ(ierr);
299   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
300   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
301   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
302   PetscFunctionReturn(0);
303 }
304