1 #include "petsc/private/petscimpl.h" 2 #include <petscsystypes.h> 3 #include <petscerror.h> 4 #include <petscveckokkos.hpp> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosSparse_CrsMatrix.hpp> 8 #include <KokkosSparse_spmv.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <../src/mat/impls/aij/seq/aij.h> 11 12 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 13 14 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 15 16 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 17 { 18 PetscErrorCode ierr; 19 20 PetscFunctionBegin; 21 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 22 A->offloadmask = PETSC_OFFLOAD_CPU; 23 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 24 PetscFunctionReturn(0); 25 } 26 27 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 28 { 29 Mat_SeqAIJ *aijseq = (Mat_SeqAIJ*)A->data; 30 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 31 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 32 33 PetscFunctionBegin; 34 /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */ 35 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 36 delete aijkok; 37 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 38 aijkok->nonzerostate = A->nonzerostate; 39 A->spptr = aijkok; 40 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 41 Kokkos::deep_copy(aijkok->a_d,aijkok->a_h); 42 } 43 A->offloadmask = PETSC_OFFLOAD_BOTH; 44 PetscFunctionReturn(0); 45 } 46 47 /* y = A x */ 48 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 49 { 50 PetscErrorCode ierr; 51 Mat_SeqAIJKokkos *aijkok; 52 ConstPetscScalarViewDevice_t xv; 53 PetscScalarViewDevice_t yv; 54 55 PetscFunctionBegin; 56 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 57 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 58 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 59 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 60 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 61 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 62 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 63 /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 64 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 65 PetscFunctionReturn(0); 66 } 67 68 /* y = A^T x */ 69 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 70 { 71 PetscErrorCode ierr; 72 Mat_SeqAIJKokkos *aijkok; 73 ConstPetscScalarViewDevice_t xv; 74 PetscScalarViewDevice_t yv; 75 76 PetscFunctionBegin; 77 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 78 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 79 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 80 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 81 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 82 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 83 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 84 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 85 PetscFunctionReturn(0); 86 } 87 88 /* y = A^H x */ 89 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 90 { 91 PetscErrorCode ierr; 92 Mat_SeqAIJKokkos *aijkok; 93 ConstPetscScalarViewDevice_t xv; 94 PetscScalarViewDevice_t yv; 95 96 PetscFunctionBegin; 97 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 98 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 99 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 100 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 101 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 102 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 103 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 104 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 105 PetscFunctionReturn(0); 106 } 107 108 /* z = A x + y */ 109 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 110 { 111 PetscErrorCode ierr; 112 Mat_SeqAIJKokkos *aijkok; 113 ConstPetscScalarViewDevice_t xv,yv; 114 PetscScalarViewDevice_t zv; 115 116 PetscFunctionBegin; 117 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 118 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 119 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 120 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 121 if (zz != yy) Kokkos::deep_copy(zv,yv); 122 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 123 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 124 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 125 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 126 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 127 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 128 PetscFunctionReturn(0); 129 } 130 131 /* z = A^T x + y */ 132 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 133 { 134 PetscErrorCode ierr; 135 Mat_SeqAIJKokkos *aijkok; 136 ConstPetscScalarViewDevice_t xv,yv; 137 PetscScalarViewDevice_t zv; 138 139 PetscFunctionBegin; 140 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 141 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 142 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 143 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 144 if (zz != yy) Kokkos::deep_copy(zv,yv); 145 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 146 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 147 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 148 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 149 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 150 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 151 PetscFunctionReturn(0); 152 } 153 154 /* z = A^H x + y */ 155 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 156 { 157 PetscErrorCode ierr; 158 Mat_SeqAIJKokkos *aijkok; 159 ConstPetscScalarViewDevice_t xv,yv; 160 PetscScalarViewDevice_t zv; 161 162 PetscFunctionBegin; 163 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 164 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 165 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 166 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 167 if (zz != yy) Kokkos::deep_copy(zv,yv); 168 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 169 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 170 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 171 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 172 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 173 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 174 PetscFunctionReturn(0); 175 } 176 177 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 178 { 179 PetscErrorCode ierr; 180 Mat B; 181 Mat_SeqAIJ *aij; 182 183 PetscFunctionBegin; 184 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 185 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 186 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 187 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 188 } 189 190 B = *newmat; 191 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 192 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 193 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 194 ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 195 /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 196 aij = (Mat_SeqAIJ*)B->data; 197 aij->inode.use = PETSC_FALSE; 198 199 B->offloadmask = PETSC_OFFLOAD_CPU; 200 PetscFunctionReturn(0); 201 } 202 203 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 204 { 205 PetscErrorCode ierr; 206 207 PetscFunctionBegin; 208 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 209 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 210 PetscFunctionReturn(0); 211 } 212 213 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 214 { 215 PetscErrorCode ierr; 216 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 217 218 PetscFunctionBegin; 219 delete aijkok; 220 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 221 PetscFunctionReturn(0); 222 } 223 224 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 225 { 226 PetscErrorCode ierr; 227 228 PetscFunctionBegin; 229 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 230 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 231 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 232 PetscFunctionReturn(0); 233 } 234 235 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 236 { 237 PetscFunctionBegin; 238 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 239 A->ops->destroy = MatDestroy_SeqAIJKokkos; 240 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 241 242 A->ops->mult = MatMult_SeqAIJKokkos; 243 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 244 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 245 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 246 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 247 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 248 PetscFunctionReturn(0); 249 } 250 251 /* --------------------------------------------------------------------------------*/ 252 /*C 253 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 254 (the default parallel PETSc format). This matrix will ultimately be handled by 255 Kokkos for calculations. For good matrix 256 assembly performance the user should preallocate the matrix storage by setting 257 the parameter nz (or the array nnz). By setting these parameters accurately, 258 performance during matrix assembly can be increased by more than a factor of 50. 259 260 Collective 261 262 Input Parameters: 263 + comm - MPI communicator, set to PETSC_COMM_SELF 264 . m - number of rows 265 . n - number of columns 266 . nz - number of nonzeros per row (same for all rows) 267 - nnz - array containing the number of nonzeros in the various rows 268 (possibly different for each row) or NULL 269 270 Output Parameter: 271 . A - the matrix 272 273 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 274 MatXXXXSetPreallocation() paradgm instead of this routine directly. 275 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 276 277 Notes: 278 If nnz is given then nz is ignored 279 280 The AIJ format (also called the Yale sparse matrix format or 281 compressed row storage), is fully compatible with standard Fortran 77 282 storage. That is, the stored row and column indices can begin at 283 either one (as in Fortran) or zero. See the users' manual for details. 284 285 Specify the preallocated storage with either nz or nnz (not both). 286 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 287 allocation. For large problems you MUST preallocate memory or you 288 will get TERRIBLE performance, see the users' manual chapter on matrices. 289 290 By default, this format uses inodes (identical nodes) when possible, to 291 improve numerical efficiency of matrix-vector products and solves. We 292 search for consecutive rows with the same nonzero structure, thereby 293 reusing matrix information to achieve increased efficiency. 294 295 Level: intermediate 296 297 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 298 @*/ 299 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 300 { 301 PetscErrorCode ierr; 302 303 PetscFunctionBegin; 304 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 305 ierr = MatCreate(comm,A);CHKERRQ(ierr); 306 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 307 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 308 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 309 PetscFunctionReturn(0); 310 } 311