1 #include "petsc/private/petscimpl.h" 2 #include <petscsystypes.h> 3 #include <petscerror.h> 4 #include <petscvec.hpp> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosSparse_CrsMatrix.hpp> 8 #include <KokkosSparse_spmv.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <../src/mat/impls/aij/seq/aij.h> 11 12 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 13 14 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 15 16 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 17 { 18 PetscErrorCode ierr; 19 20 PetscFunctionBegin; 21 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 22 A->offloadmask = PETSC_OFFLOAD_CPU; 23 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 24 PetscFunctionReturn(0); 25 } 26 27 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 28 { 29 Mat_SeqAIJ *aijseq = (Mat_SeqAIJ*)A->data; 30 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 31 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 32 33 PetscFunctionBegin; 34 /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */ 35 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 36 delete aijkok; 37 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 38 aijkok->nonzerostate = A->nonzerostate; 39 A->spptr = aijkok; 40 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 41 Kokkos::deep_copy(aijkok->a_d,aijkok->a_h); 42 } 43 A->offloadmask = PETSC_OFFLOAD_BOTH; 44 PetscFunctionReturn(0); 45 } 46 47 /* y = A x */ 48 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 49 { 50 PetscErrorCode ierr; 51 Mat_SeqAIJKokkos *aijkok; 52 ConstPetscScalarViewDevice_t xv; 53 PetscScalarViewDevice_t yv; 54 55 PetscFunctionBegin; 56 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 57 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 58 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 59 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 60 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 61 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 62 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 63 PetscFunctionReturn(0); 64 } 65 66 /* y = A^T x */ 67 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 68 { 69 PetscErrorCode ierr; 70 Mat_SeqAIJKokkos *aijkok; 71 ConstPetscScalarViewDevice_t xv; 72 PetscScalarViewDevice_t yv; 73 74 PetscFunctionBegin; 75 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 76 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 77 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 78 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 79 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 80 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 81 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 82 PetscFunctionReturn(0); 83 } 84 85 /* y = A^H x */ 86 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 87 { 88 PetscErrorCode ierr; 89 Mat_SeqAIJKokkos *aijkok; 90 ConstPetscScalarViewDevice_t xv; 91 PetscScalarViewDevice_t yv; 92 93 PetscFunctionBegin; 94 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 95 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 96 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 97 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 98 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 99 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 100 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 101 PetscFunctionReturn(0); 102 } 103 104 /* z = A x + y */ 105 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 106 { 107 PetscErrorCode ierr; 108 Mat_SeqAIJKokkos *aijkok; 109 ConstPetscScalarViewDevice_t xv,yv; 110 PetscScalarViewDevice_t zv; 111 112 PetscFunctionBegin; 113 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 114 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 115 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 116 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 117 if (zz != yy) Kokkos::deep_copy(zv,yv); 118 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 120 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 121 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 122 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 123 PetscFunctionReturn(0); 124 } 125 126 /* z = A^T x + y */ 127 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 128 { 129 PetscErrorCode ierr; 130 Mat_SeqAIJKokkos *aijkok; 131 ConstPetscScalarViewDevice_t xv,yv; 132 PetscScalarViewDevice_t zv; 133 134 PetscFunctionBegin; 135 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 136 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 137 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 138 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 139 if (zz != yy) Kokkos::deep_copy(zv,yv); 140 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 141 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 142 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 143 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 144 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 145 PetscFunctionReturn(0); 146 } 147 148 /* z = A^H x + y */ 149 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 150 { 151 PetscErrorCode ierr; 152 Mat_SeqAIJKokkos *aijkok; 153 ConstPetscScalarViewDevice_t xv,yv; 154 PetscScalarViewDevice_t zv; 155 156 PetscFunctionBegin; 157 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 158 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 159 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 160 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 161 if (zz != yy) Kokkos::deep_copy(zv,yv); 162 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 163 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 164 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 165 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 166 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 167 PetscFunctionReturn(0); 168 } 169 170 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 171 { 172 PetscErrorCode ierr; 173 Mat B; 174 Mat_SeqAIJ *aij; 175 176 PetscFunctionBegin; 177 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 178 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 179 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 180 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 181 } 182 183 B = *newmat; 184 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 185 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 186 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 187 ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 188 /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 189 aij = (Mat_SeqAIJ*)B->data; 190 aij->inode.use = PETSC_FALSE; 191 192 B->offloadmask = PETSC_OFFLOAD_CPU; 193 PetscFunctionReturn(0); 194 } 195 196 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 197 { 198 PetscErrorCode ierr; 199 200 PetscFunctionBegin; 201 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 202 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 203 PetscFunctionReturn(0); 204 } 205 206 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 207 { 208 PetscErrorCode ierr; 209 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 210 211 PetscFunctionBegin; 212 delete aijkok; 213 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 214 PetscFunctionReturn(0); 215 } 216 217 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 218 { 219 PetscErrorCode ierr; 220 221 PetscFunctionBegin; 222 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 223 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 224 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 229 { 230 PetscFunctionBegin; 231 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 232 A->ops->destroy = MatDestroy_SeqAIJKokkos; 233 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 234 235 A->ops->mult = MatMult_SeqAIJKokkos; 236 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 237 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 238 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 239 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 240 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 241 PetscFunctionReturn(0); 242 } 243 244 /* --------------------------------------------------------------------------------*/ 245 /*C 246 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 247 (the default parallel PETSc format). This matrix will ultimately be handled by 248 Kokkos for calculations. For good matrix 249 assembly performance the user should preallocate the matrix storage by setting 250 the parameter nz (or the array nnz). By setting these parameters accurately, 251 performance during matrix assembly can be increased by more than a factor of 50. 252 253 Collective 254 255 Input Parameters: 256 + comm - MPI communicator, set to PETSC_COMM_SELF 257 . m - number of rows 258 . n - number of columns 259 . nz - number of nonzeros per row (same for all rows) 260 - nnz - array containing the number of nonzeros in the various rows 261 (possibly different for each row) or NULL 262 263 Output Parameter: 264 . A - the matrix 265 266 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 267 MatXXXXSetPreallocation() paradgm instead of this routine directly. 268 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 269 270 Notes: 271 If nnz is given then nz is ignored 272 273 The AIJ format (also called the Yale sparse matrix format or 274 compressed row storage), is fully compatible with standard Fortran 77 275 storage. That is, the stored row and column indices can begin at 276 either one (as in Fortran) or zero. See the users' manual for details. 277 278 Specify the preallocated storage with either nz or nnz (not both). 279 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 280 allocation. For large problems you MUST preallocate memory or you 281 will get TERRIBLE performance, see the users' manual chapter on matrices. 282 283 By default, this format uses inodes (identical nodes) when possible, to 284 improve numerical efficiency of matrix-vector products and solves. We 285 search for consecutive rows with the same nonzero structure, thereby 286 reusing matrix information to achieve increased efficiency. 287 288 Level: intermediate 289 290 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 291 @*/ 292 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 293 { 294 PetscErrorCode ierr; 295 296 PetscFunctionBegin; 297 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 298 ierr = MatCreate(comm,A);CHKERRQ(ierr); 299 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 300 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 301 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 302 PetscFunctionReturn(0); 303 } 304