1 #include "petsc/private/petscimpl.h" 2 #include <petscsystypes.h> 3 #include <petscerror.h> 4 #include <petscveckokkos.hpp> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosSparse_CrsMatrix.hpp> 8 #include <KokkosSparse_spmv.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <../src/mat/impls/aij/seq/aij.h> 11 12 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 13 14 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 15 16 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 17 { 18 PetscErrorCode ierr; 19 20 PetscFunctionBegin; 21 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 22 A->offloadmask = PETSC_OFFLOAD_CPU; 23 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 24 PetscFunctionReturn(0); 25 } 26 27 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 28 { 29 Mat_SeqAIJ *aijseq = (Mat_SeqAIJ*)A->data; 30 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 31 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 32 33 PetscFunctionBegin; 34 /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */ 35 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 36 delete aijkok; 37 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 38 aijkok->nonzerostate = A->nonzerostate; 39 A->spptr = aijkok; 40 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 41 Kokkos::deep_copy(aijkok->a_d,aijkok->a_h); 42 } 43 A->offloadmask = PETSC_OFFLOAD_BOTH; 44 PetscFunctionReturn(0); 45 } 46 47 /* y = A x */ 48 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 49 { 50 PetscErrorCode ierr; 51 Mat_SeqAIJKokkos *aijkok; 52 ConstPetscScalarViewDevice_t xv; 53 PetscScalarViewDevice_t yv; 54 55 PetscFunctionBegin; 56 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 57 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 58 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 59 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 60 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 61 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 62 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 63 /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 64 ierr = WaitForKokkos();CHKERRQ(ierr); 65 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 66 PetscFunctionReturn(0); 67 } 68 69 /* y = A^T x */ 70 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 71 { 72 PetscErrorCode ierr; 73 Mat_SeqAIJKokkos *aijkok; 74 ConstPetscScalarViewDevice_t xv; 75 PetscScalarViewDevice_t yv; 76 77 PetscFunctionBegin; 78 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 79 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 80 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 81 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 82 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 83 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 84 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 85 ierr = WaitForKokkos();CHKERRQ(ierr); 86 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 87 PetscFunctionReturn(0); 88 } 89 90 /* y = A^H x */ 91 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 92 { 93 PetscErrorCode ierr; 94 Mat_SeqAIJKokkos *aijkok; 95 ConstPetscScalarViewDevice_t xv; 96 PetscScalarViewDevice_t yv; 97 98 PetscFunctionBegin; 99 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 100 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 101 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 102 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 103 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 104 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 105 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 106 ierr = WaitForKokkos();CHKERRQ(ierr); 107 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 108 PetscFunctionReturn(0); 109 } 110 111 /* z = A x + y */ 112 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 113 { 114 PetscErrorCode ierr; 115 Mat_SeqAIJKokkos *aijkok; 116 ConstPetscScalarViewDevice_t xv,yv; 117 PetscScalarViewDevice_t zv; 118 119 PetscFunctionBegin; 120 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 121 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 122 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 123 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 124 if (zz != yy) Kokkos::deep_copy(zv,yv); 125 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 126 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 127 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 128 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 129 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 130 ierr = WaitForKokkos();CHKERRQ(ierr); 131 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 132 PetscFunctionReturn(0); 133 } 134 135 /* z = A^T x + y */ 136 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 137 { 138 PetscErrorCode ierr; 139 Mat_SeqAIJKokkos *aijkok; 140 ConstPetscScalarViewDevice_t xv,yv; 141 PetscScalarViewDevice_t zv; 142 143 PetscFunctionBegin; 144 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 145 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 146 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 147 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 148 if (zz != yy) Kokkos::deep_copy(zv,yv); 149 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 150 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 151 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 152 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 153 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 154 ierr = WaitForKokkos();CHKERRQ(ierr); 155 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 156 PetscFunctionReturn(0); 157 } 158 159 /* z = A^H x + y */ 160 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 161 { 162 PetscErrorCode ierr; 163 Mat_SeqAIJKokkos *aijkok; 164 ConstPetscScalarViewDevice_t xv,yv; 165 PetscScalarViewDevice_t zv; 166 167 PetscFunctionBegin; 168 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 169 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 170 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 171 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 172 if (zz != yy) Kokkos::deep_copy(zv,yv); 173 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 174 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 175 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 176 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 177 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 178 ierr = WaitForKokkos();CHKERRQ(ierr); 179 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 180 PetscFunctionReturn(0); 181 } 182 183 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 184 { 185 PetscErrorCode ierr; 186 Mat B; 187 Mat_SeqAIJ *aij; 188 189 PetscFunctionBegin; 190 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 191 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 192 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 193 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 194 } 195 196 B = *newmat; 197 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 198 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 199 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 200 ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 201 /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 202 aij = (Mat_SeqAIJ*)B->data; 203 aij->inode.use = PETSC_FALSE; 204 205 B->offloadmask = PETSC_OFFLOAD_CPU; 206 PetscFunctionReturn(0); 207 } 208 209 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 210 { 211 PetscErrorCode ierr; 212 213 PetscFunctionBegin; 214 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 215 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 216 PetscFunctionReturn(0); 217 } 218 219 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 220 { 221 PetscErrorCode ierr; 222 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 223 224 PetscFunctionBegin; 225 delete aijkok; 226 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 227 PetscFunctionReturn(0); 228 } 229 230 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 231 { 232 PetscErrorCode ierr; 233 234 PetscFunctionBegin; 235 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 236 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 237 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 238 PetscFunctionReturn(0); 239 } 240 241 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 242 { 243 PetscFunctionBegin; 244 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 245 A->ops->destroy = MatDestroy_SeqAIJKokkos; 246 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 247 248 A->ops->mult = MatMult_SeqAIJKokkos; 249 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 250 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 251 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 252 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 253 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 254 PetscFunctionReturn(0); 255 } 256 257 /* --------------------------------------------------------------------------------*/ 258 /*C 259 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 260 (the default parallel PETSc format). This matrix will ultimately be handled by 261 Kokkos for calculations. For good matrix 262 assembly performance the user should preallocate the matrix storage by setting 263 the parameter nz (or the array nnz). By setting these parameters accurately, 264 performance during matrix assembly can be increased by more than a factor of 50. 265 266 Collective 267 268 Input Parameters: 269 + comm - MPI communicator, set to PETSC_COMM_SELF 270 . m - number of rows 271 . n - number of columns 272 . nz - number of nonzeros per row (same for all rows) 273 - nnz - array containing the number of nonzeros in the various rows 274 (possibly different for each row) or NULL 275 276 Output Parameter: 277 . A - the matrix 278 279 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 280 MatXXXXSetPreallocation() paradgm instead of this routine directly. 281 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 282 283 Notes: 284 If nnz is given then nz is ignored 285 286 The AIJ format (also called the Yale sparse matrix format or 287 compressed row storage), is fully compatible with standard Fortran 77 288 storage. That is, the stored row and column indices can begin at 289 either one (as in Fortran) or zero. See the users' manual for details. 290 291 Specify the preallocated storage with either nz or nnz (not both). 292 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 293 allocation. For large problems you MUST preallocate memory or you 294 will get TERRIBLE performance, see the users' manual chapter on matrices. 295 296 By default, this format uses inodes (identical nodes) when possible, to 297 improve numerical efficiency of matrix-vector products and solves. We 298 search for consecutive rows with the same nonzero structure, thereby 299 reusing matrix information to achieve increased efficiency. 300 301 Level: intermediate 302 303 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 304 @*/ 305 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 306 { 307 PetscErrorCode ierr; 308 309 PetscFunctionBegin; 310 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 311 ierr = MatCreate(comm,A);CHKERRQ(ierr); 312 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 313 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 314 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 315 PetscFunctionReturn(0); 316 } 317