1 #include "petsc/private/petscimpl.h" 2 #include <petscsystypes.h> 3 #include <petscerror.h> 4 #include <petscvec.hpp> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosSparse_CrsMatrix.hpp> 8 #include <KokkosSparse_spmv.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <../src/mat/impls/aij/seq/aij.h> 11 12 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 13 14 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 15 16 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 17 { 18 PetscErrorCode ierr; 19 20 PetscFunctionBegin; 21 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 22 A->offloadmask = PETSC_OFFLOAD_CPU; 23 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 24 PetscFunctionReturn(0); 25 } 26 27 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 28 { 29 Mat_SeqAIJ *aijseq = (Mat_SeqAIJ*)A->data; 30 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 31 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 32 33 PetscFunctionBegin; 34 /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */ 35 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 36 delete aijkok; 37 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 38 aijkok->nonzerostate = A->nonzerostate; 39 A->spptr = aijkok; 40 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 41 Kokkos::deep_copy(aijkok->a_d,aijkok->a_h); 42 } 43 A->offloadmask = PETSC_OFFLOAD_BOTH; 44 PetscFunctionReturn(0); 45 } 46 47 /* y = A x */ 48 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 49 { 50 PetscErrorCode ierr; 51 Mat_SeqAIJKokkos *aijkok; 52 ConstPetscScalarViewDevice_t xv; 53 PetscScalarViewDevice_t yv; 54 55 PetscFunctionBegin; 56 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 57 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 58 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 59 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 60 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 61 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 62 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 63 PetscFunctionReturn(0); 64 } 65 66 /* y = A^T x */ 67 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 68 { 69 PetscErrorCode ierr; 70 Mat_SeqAIJKokkos *aijkok; 71 ConstPetscScalarViewDevice_t xv; 72 PetscScalarViewDevice_t yv; 73 74 PetscFunctionBegin; 75 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 76 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 77 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 78 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 79 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 80 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 81 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 82 PetscFunctionReturn(0); 83 } 84 85 /* y = A^H x */ 86 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 87 { 88 PetscErrorCode ierr; 89 Mat_SeqAIJKokkos *aijkok; 90 ConstPetscScalarViewDevice_t xv; 91 PetscScalarViewDevice_t yv; 92 93 PetscFunctionBegin; 94 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 95 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 96 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 97 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 98 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 99 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 100 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 101 PetscFunctionReturn(0); 102 } 103 104 /* z = A x + y */ 105 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 106 { 107 PetscErrorCode ierr; 108 Mat_SeqAIJKokkos *aijkok; 109 ConstPetscScalarViewDevice_t xv,yv; 110 PetscScalarViewDevice_t zv; 111 112 PetscFunctionBegin; 113 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 114 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 115 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 116 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 117 if (zz != yy) Kokkos::deep_copy(zv,yv); 118 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 120 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 121 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 122 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 123 PetscFunctionReturn(0); 124 } 125 126 /* z = A^T x + y */ 127 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 128 { 129 PetscErrorCode ierr; 130 Mat_SeqAIJKokkos *aijkok; 131 ConstPetscScalarViewDevice_t xv,yv; 132 PetscScalarViewDevice_t zv; 133 134 PetscFunctionBegin; 135 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 136 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 137 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 138 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 139 if (zz != yy) Kokkos::deep_copy(zv,yv); 140 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 141 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 142 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 143 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 144 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 145 PetscFunctionReturn(0); 146 } 147 148 /* z = A^H x + y */ 149 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 150 { 151 PetscErrorCode ierr; 152 Mat_SeqAIJKokkos *aijkok; 153 ConstPetscScalarViewDevice_t xv,yv; 154 PetscScalarViewDevice_t zv; 155 156 PetscFunctionBegin; 157 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 158 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 159 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 160 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 161 if (zz != yy) Kokkos::deep_copy(zv,yv); 162 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 163 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 164 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 165 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 166 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 167 PetscFunctionReturn(0); 168 } 169 170 PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 171 { 172 PetscErrorCode ierr; 173 Mat B; 174 175 PetscFunctionBegin; 176 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 177 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 178 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 179 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 180 } 181 182 B = *newmat; 183 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 184 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 185 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 186 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 187 188 B->offloadmask = PETSC_OFFLOAD_CPU; 189 PetscFunctionReturn(0); 190 } 191 192 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 193 { 194 PetscErrorCode ierr; 195 196 PetscFunctionBegin; 197 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 198 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 199 PetscFunctionReturn(0); 200 } 201 202 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 203 { 204 PetscErrorCode ierr; 205 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 206 207 PetscFunctionBegin; 208 delete aijkok; 209 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 210 PetscFunctionReturn(0); 211 } 212 213 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 214 { 215 PetscErrorCode ierr; 216 217 PetscFunctionBegin; 218 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 219 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 220 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 221 PetscFunctionReturn(0); 222 } 223 224 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 225 { 226 PetscFunctionBegin; 227 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 228 A->ops->destroy = MatDestroy_SeqAIJKokkos; 229 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 230 231 A->ops->mult = MatMult_SeqAIJKokkos; 232 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 233 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 234 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 235 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 236 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 237 PetscFunctionReturn(0); 238 } 239 240 /* --------------------------------------------------------------------------------*/ 241 /*C 242 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 243 (the default parallel PETSc format). This matrix will ultimately be handled by 244 Kokkos for calculations. For good matrix 245 assembly performance the user should preallocate the matrix storage by setting 246 the parameter nz (or the array nnz). By setting these parameters accurately, 247 performance during matrix assembly can be increased by more than a factor of 50. 248 249 Collective 250 251 Input Parameters: 252 + comm - MPI communicator, set to PETSC_COMM_SELF 253 . m - number of rows 254 . n - number of columns 255 . nz - number of nonzeros per row (same for all rows) 256 - nnz - array containing the number of nonzeros in the various rows 257 (possibly different for each row) or NULL 258 259 Output Parameter: 260 . A - the matrix 261 262 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 263 MatXXXXSetPreallocation() paradgm instead of this routine directly. 264 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 265 266 Notes: 267 If nnz is given then nz is ignored 268 269 The AIJ format (also called the Yale sparse matrix format or 270 compressed row storage), is fully compatible with standard Fortran 77 271 storage. That is, the stored row and column indices can begin at 272 either one (as in Fortran) or zero. See the users' manual for details. 273 274 Specify the preallocated storage with either nz or nnz (not both). 275 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 276 allocation. For large problems you MUST preallocate memory or you 277 will get TERRIBLE performance, see the users' manual chapter on matrices. 278 279 By default, this format uses inodes (identical nodes) when possible, to 280 improve numerical efficiency of matrix-vector products and solves. We 281 search for consecutive rows with the same nonzero structure, thereby 282 reusing matrix information to achieve increased efficiency. 283 284 Level: intermediate 285 286 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 287 @*/ 288 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 289 { 290 PetscErrorCode ierr; 291 292 PetscFunctionBegin; 293 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 294 ierr = MatCreate(comm,A);CHKERRQ(ierr); 295 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 296 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 297 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 298 PetscFunctionReturn(0); 299 } 300