1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64 if (aijkok->a_dual.need_sync_device()) { 65 aijkok->a_dual.sync_device(); 66 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 67 aijkok->hermitian_updated = PETSC_FALSE; 68 } 69 PetscFunctionReturn(0); 70 } 71 72 /* Mark the CSR data on device as modified */ 73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 74 { 75 PetscErrorCode ierr; 76 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 77 78 PetscFunctionBegin; 79 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 80 aijkok->a_dual.clear_sync_state(); 81 aijkok->a_dual.modify_device(); 82 aijkok->transpose_updated = PETSC_FALSE; 83 aijkok->hermitian_updated = PETSC_FALSE; 84 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 85 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 86 PetscFunctionReturn(0); 87 } 88 89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 90 { 91 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 92 93 PetscFunctionBegin; 94 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 95 /* We do not expect one needs factors on host */ 96 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 97 PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 98 aijkok->a_dual.sync_host(); 99 PetscFunctionReturn(0); 100 } 101 102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 103 { 104 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 105 106 PetscFunctionBegin; 107 if (aijkok) { 108 aijkok->a_dual.sync_host(); 109 *array = aijkok->a_dual.view_host().data(); 110 } else { /* Happens when calling MatSetValues on a newly created matrix */ 111 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 112 } 113 PetscFunctionReturn(0); 114 } 115 116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 117 { 118 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119 120 PetscFunctionBegin; 121 if (aijkok) aijkok->a_dual.modify_host(); 122 PetscFunctionReturn(0); 123 } 124 125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 126 { 127 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 128 129 PetscFunctionBegin; 130 if (aijkok) { 131 aijkok->a_dual.sync_host(); 132 *array = aijkok->a_dual.view_host().data(); 133 } else { 134 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 135 } 136 PetscFunctionReturn(0); 137 } 138 139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 140 { 141 PetscFunctionBegin; 142 *array = NULL; 143 PetscFunctionReturn(0); 144 } 145 146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 147 { 148 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 149 150 PetscFunctionBegin; 151 if (aijkok) { 152 *array = aijkok->a_dual.view_host().data(); 153 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 154 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 155 } 156 PetscFunctionReturn(0); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 160 { 161 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 162 163 PetscFunctionBegin; 164 if (aijkok) { 165 aijkok->a_dual.clear_sync_state(); 166 aijkok->a_dual.modify_host(); 167 } 168 PetscFunctionReturn(0); 169 } 170 171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 175 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 176 177 PetscFunctionBegin; 178 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 179 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 180 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 181 PetscFunctionReturn(0); 182 } 183 184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 186 { 187 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188 189 PetscFunctionBegin; 190 if (aijkok && aijkok->device_mat_d.data()) { 191 *d_mat = aijkok->device_mat_d.data(); 192 } else { 193 PetscErrorCode ierr; 194 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 195 *d_mat = NULL; 196 } 197 PetscFunctionReturn(0); 198 } 199 200 /* Generate the transpose on device and cache it internally */ 201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 202 { 203 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 204 205 PetscFunctionBegin; 206 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 208 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 209 CHKERRCXX(aijkok->a_dual.sync_device()); 210 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 211 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 212 aijkok->transpose_updated = PETSC_TRUE; 213 } 214 *csrmatT = &aijkok->csrmatT; 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the Hermitian on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 220 { 221 PetscErrorCode ierr; 222 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 223 224 PetscFunctionBegin; 225 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 226 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 227 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 228 CHKERRCXX(aijkok->a_dual.sync_device()); 229 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 230 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 231 #if defined(PETSC_USE_COMPLEX) 232 const auto& a = aijkok->csrmatH.values; 233 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 234 #endif 235 aijkok->hermitian_updated = PETSC_TRUE; 236 } 237 *csrmatH = &aijkok->csrmatH; 238 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 /* y = A x */ 243 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 244 { 245 PetscErrorCode ierr; 246 Mat_SeqAIJKokkos *aijkok; 247 ConstPetscScalarKokkosView xv; 248 PetscScalarKokkosView yv; 249 250 PetscFunctionBegin; 251 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 252 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 253 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 254 ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 255 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 256 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 257 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 258 ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 259 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 260 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 261 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 262 PetscFunctionReturn(0); 263 } 264 265 /* y = A^T x */ 266 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 267 { 268 PetscErrorCode ierr; 269 Mat_SeqAIJKokkos *aijkok; 270 const char *mode; 271 ConstPetscScalarKokkosView xv; 272 PetscScalarKokkosView yv; 273 KokkosCsrMatrix *csrmat; 274 275 PetscFunctionBegin; 276 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 277 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 278 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 279 ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 280 if (A->form_explicit_transpose) { 281 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 282 mode = "N"; 283 } else { 284 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 285 csrmat = &aijkok->csrmat; 286 mode = "T"; 287 } 288 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 289 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 290 ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 291 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 292 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 293 PetscFunctionReturn(0); 294 } 295 296 /* y = A^H x */ 297 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 298 { 299 PetscErrorCode ierr; 300 Mat_SeqAIJKokkos *aijkok; 301 const char *mode; 302 ConstPetscScalarKokkosView xv; 303 PetscScalarKokkosView yv; 304 KokkosCsrMatrix *csrmat; 305 306 PetscFunctionBegin; 307 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 308 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 309 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 310 ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 311 if (A->form_explicit_transpose) { 312 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 313 mode = "N"; 314 } else { 315 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 316 csrmat = &aijkok->csrmat; 317 mode = "C"; 318 } 319 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 320 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 321 ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 322 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 323 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 324 PetscFunctionReturn(0); 325 } 326 327 /* z = A x + y */ 328 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 329 { 330 PetscErrorCode ierr; 331 Mat_SeqAIJKokkos *aijkok; 332 ConstPetscScalarKokkosView xv,yv; 333 PetscScalarKokkosView zv; 334 335 PetscFunctionBegin; 336 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 337 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 338 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 339 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 340 ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 341 if (zz != yy) Kokkos::deep_copy(zv,yv); 342 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 343 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 344 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 346 ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 347 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 349 PetscFunctionReturn(0); 350 } 351 352 /* z = A^T x + y */ 353 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 354 { 355 PetscErrorCode ierr; 356 Mat_SeqAIJKokkos *aijkok; 357 const char *mode; 358 ConstPetscScalarKokkosView xv,yv; 359 PetscScalarKokkosView zv; 360 KokkosCsrMatrix *csrmat; 361 362 PetscFunctionBegin; 363 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 364 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 365 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 366 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 367 ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 368 if (zz != yy) Kokkos::deep_copy(zv,yv); 369 if (A->form_explicit_transpose) { 370 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 371 mode = "N"; 372 } else { 373 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 374 csrmat = &aijkok->csrmat; 375 mode = "T"; 376 } 377 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 378 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 379 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 380 ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 381 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 382 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 /* z = A^H x + y */ 387 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 388 { 389 PetscErrorCode ierr; 390 Mat_SeqAIJKokkos *aijkok; 391 const char *mode; 392 ConstPetscScalarKokkosView xv,yv; 393 PetscScalarKokkosView zv; 394 KokkosCsrMatrix *csrmat; 395 396 PetscFunctionBegin; 397 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 398 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 399 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 400 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 401 ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 402 if (zz != yy) Kokkos::deep_copy(zv,yv); 403 if (A->form_explicit_transpose) { 404 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 405 mode = "N"; 406 } else { 407 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 408 csrmat = &aijkok->csrmat; 409 mode = "C"; 410 } 411 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 412 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 413 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 414 ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 415 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 416 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 417 PetscFunctionReturn(0); 418 } 419 420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 421 { 422 PetscErrorCode ierr; 423 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 424 425 PetscFunctionBegin; 426 switch (op) { 427 case MAT_FORM_EXPLICIT_TRANSPOSE: 428 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 429 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 430 A->form_explicit_transpose = flg; 431 break; 432 default: 433 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 434 break; 435 } 436 PetscFunctionReturn(0); 437 } 438 439 /* Depending on reuse, either build a new mat, or use the existing mat */ 440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 441 { 442 PetscErrorCode ierr; 443 Mat_SeqAIJ *aseq; 444 445 PetscFunctionBegin; 446 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 447 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 448 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 449 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 450 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 451 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 452 PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 453 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 454 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 455 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 456 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 457 aseq = static_cast<Mat_SeqAIJ*>(A->data); 458 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 459 PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 460 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 461 } 462 } 463 PetscFunctionReturn(0); 464 } 465 466 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 467 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 468 */ 469 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 470 { 471 PetscErrorCode ierr; 472 Mat_SeqAIJ *bseq; 473 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 474 Mat mat; 475 476 PetscFunctionBegin; 477 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 478 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 479 mat = *B; 480 if (A->assembled) { 481 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 482 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 483 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 484 /* Now copy values to B if needed */ 485 if (dupOption == MAT_COPY_VALUES) { 486 if (akok->a_dual.need_sync_device()) { 487 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 488 bkok->a_dual.modify_host(); 489 } else { /* If device has the latest data, we only copy data on device */ 490 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 491 bkok->a_dual.modify_device(); 492 } 493 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 494 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 495 bkok->a_dual.modify_host(); 496 } 497 mat->spptr = bkok; 498 } 499 500 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 501 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 502 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 503 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 504 PetscFunctionReturn(0); 505 } 506 507 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 508 { 509 PetscErrorCode ierr; 510 Mat At; 511 KokkosCsrMatrix *internT; 512 Mat_SeqAIJKokkos *atkok,*bkok; 513 514 PetscFunctionBegin; 515 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 516 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 517 /* Deep copy internT, as we want to isolate the internal transpose */ 518 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 519 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 520 if (reuse == MAT_INITIAL_MATRIX) *B = At; 521 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 522 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 523 if ((*B)->assembled) { 524 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 525 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 526 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 527 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 528 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 529 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 530 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 531 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 532 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 533 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 534 } 535 PetscFunctionReturn(0); 536 } 537 538 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 539 { 540 PetscErrorCode ierr; 541 Mat_SeqAIJKokkos *aijkok; 542 543 PetscFunctionBegin; 544 if (A->factortype == MAT_FACTOR_NONE) { 545 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 546 delete aijkok; 547 } else { 548 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 549 } 550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 552 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 553 A->spptr = NULL; 554 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 555 PetscFunctionReturn(0); 556 } 557 558 /*MC 559 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 560 561 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 562 563 Options Database Keys: 564 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 565 566 Level: beginner 567 568 .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS 569 M*/ 570 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 571 { 572 PetscErrorCode ierr; 573 574 PetscFunctionBegin; 575 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 576 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 577 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 578 PetscFunctionReturn(0); 579 } 580 581 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 582 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 583 { 584 PetscErrorCode ierr; 585 Mat_SeqAIJ *a,*b; 586 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 587 MatScalarKokkosView aa,ba,ca; 588 MatRowMapKokkosView ai,bi,ci; 589 MatColIdxKokkosView aj,bj,cj; 590 PetscInt m,n,nnz,aN; 591 592 PetscFunctionBegin; 593 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 594 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 595 PetscValidPointer(C,4); 596 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 597 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 598 PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 599 PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 600 601 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 602 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 603 a = static_cast<Mat_SeqAIJ*>(A->data); 604 b = static_cast<Mat_SeqAIJ*>(B->data); 605 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 606 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 607 aa = akok->a_dual.view_device(); 608 ai = akok->i_dual.view_device(); 609 ba = bkok->a_dual.view_device(); 610 bi = bkok->i_dual.view_device(); 611 m = A->rmap->n; /* M, N and nnz of C */ 612 n = A->cmap->n + B->cmap->n; 613 nnz = a->nz + b->nz; 614 aN = A->cmap->n; /* N of A */ 615 if (reuse == MAT_INITIAL_MATRIX) { 616 aj = akok->j_dual.view_device(); 617 bj = bkok->j_dual.view_device(); 618 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 619 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 620 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 621 ca = ca_dual.view_device(); 622 ci = ci_dual.view_device(); 623 cj = cj_dual.view_device(); 624 625 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 626 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 627 PetscInt i = t.league_rank(); /* row i */ 628 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 629 630 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 631 ci(i) = coffset; 632 if (i == m-1) ci(m) = ai(m) + bi(m); 633 }); 634 635 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 636 if (k < alen) { 637 ca(coffset+k) = aa(ai(i)+k); 638 cj(coffset+k) = aj(ai(i)+k); 639 } else { 640 ca(coffset+k) = ba(bi(i)+k-alen); 641 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 642 } 643 }); 644 }); 645 ca_dual.modify_device(); 646 ci_dual.modify_device(); 647 cj_dual.modify_device(); 648 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 649 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 650 } else if (reuse == MAT_REUSE_MATRIX) { 651 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 652 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 653 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 654 ca = ckok->a_dual.view_device(); 655 ci = ckok->i_dual.view_device(); 656 657 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 658 PetscInt i = t.league_rank(); /* row i */ 659 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 660 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 661 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 662 else ca(ci(i)+k) = ba(bi(i)+k-alen); 663 }); 664 }); 665 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 666 } 667 PetscFunctionReturn(0); 668 } 669 670 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 671 { 672 PetscFunctionBegin; 673 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 674 PetscFunctionReturn(0); 675 } 676 677 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 678 { 679 PetscErrorCode ierr; 680 Mat_Product *product = C->product; 681 Mat A,B; 682 bool transA,transB; /* use bool, since KK needs this type */ 683 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 684 Mat_SeqAIJ *c; 685 MatProductData_SeqAIJKokkos *pdata; 686 KokkosCsrMatrix *csrmatA,*csrmatB; 687 688 PetscFunctionBegin; 689 MatCheckProduct(C,1); 690 PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 691 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 692 693 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 694 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 695 PetscFunctionReturn(0); 696 } 697 698 switch (product->type) { 699 case MATPRODUCT_AB: transA = false; transB = false; break; 700 case MATPRODUCT_AtB: transA = true; transB = false; break; 701 case MATPRODUCT_ABt: transA = false; transB = true; break; 702 default: 703 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 704 } 705 706 A = product->A; 707 B = product->B; 708 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 709 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 710 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 711 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 712 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 713 714 PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 715 716 csrmatA = &akok->csrmat; 717 csrmatB = &bkok->csrmat; 718 719 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 720 if (transA) { 721 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 722 transA = false; 723 } 724 725 if (transB) { 726 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 727 transB = false; 728 } 729 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 730 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 731 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 732 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 733 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 734 /* shorter version of MatAssemblyEnd_SeqAIJ */ 735 c = (Mat_SeqAIJ*)C->data; 736 ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 737 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 738 ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 739 c->reallocs = 0; 740 C->info.mallocs = 0; 741 C->info.nz_unneeded = 0; 742 C->assembled = C->was_assembled = PETSC_TRUE; 743 C->num_ass++; 744 PetscFunctionReturn(0); 745 } 746 747 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 748 { 749 PetscErrorCode ierr; 750 Mat_Product *product = C->product; 751 MatProductType ptype; 752 Mat A,B; 753 bool transA,transB; 754 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 755 MatProductData_SeqAIJKokkos *pdata; 756 MPI_Comm comm; 757 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 758 759 PetscFunctionBegin; 760 MatCheckProduct(C,1); 761 ierr = PetscObjectGetComm((PetscObject)C,&comm); 762 PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 763 A = product->A; 764 B = product->B; 765 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 766 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 767 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 768 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 769 csrmatA = &akok->csrmat; 770 csrmatB = &bkok->csrmat; 771 772 ptype = product->type; 773 switch (ptype) { 774 case MATPRODUCT_AB: transA = false; transB = false; break; 775 case MATPRODUCT_AtB: transA = true; transB = false; break; 776 case MATPRODUCT_ABt: transA = false; transB = true; break; 777 default: 778 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 779 } 780 781 product->data = pdata = new MatProductData_SeqAIJKokkos(); 782 pdata->kh.set_team_work_size(16); 783 pdata->kh.set_dynamic_scheduling(true); 784 pdata->reusesym = product->api_user; 785 786 /* TODO: add command line options to select spgemm algorithms */ 787 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 788 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 789 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 790 */ 791 pdata->kh.create_spgemm_handle(spgemm_alg); 792 793 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 794 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 795 if (transA) { 796 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 797 transA = false; 798 } 799 800 if (transB) { 801 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 802 transB = false; 803 } 804 805 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 806 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 807 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 808 calling new Mat_SeqAIJKokkos(). 809 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 810 */ 811 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 812 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 813 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 814 815 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 816 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 817 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 818 PetscFunctionReturn(0); 819 } 820 821 /* handles sparse matrix matrix ops */ 822 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 823 { 824 PetscErrorCode ierr; 825 Mat_Product *product = mat->product; 826 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 827 828 PetscFunctionBegin; 829 MatCheckProduct(mat,1); 830 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 831 if (product->type == MATPRODUCT_ABC) { 832 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 833 } 834 if (Biskok && Ciskok) { 835 switch (product->type) { 836 case MATPRODUCT_AB: 837 case MATPRODUCT_AtB: 838 case MATPRODUCT_ABt: 839 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 840 break; 841 case MATPRODUCT_PtAP: 842 case MATPRODUCT_RARt: 843 case MATPRODUCT_ABC: 844 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 845 break; 846 default: 847 break; 848 } 849 } else { /* fallback for AIJ */ 850 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 851 } 852 PetscFunctionReturn(0); 853 } 854 855 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 856 { 857 PetscErrorCode ierr; 858 Mat_SeqAIJKokkos *aijkok; 859 860 PetscFunctionBegin; 861 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 862 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 863 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 864 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 865 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 866 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 867 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 868 PetscFunctionReturn(0); 869 } 870 871 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 872 { 873 PetscErrorCode ierr; 874 Mat_SeqAIJKokkos *aijkok; 875 876 PetscFunctionBegin; 877 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 878 if (aijkok) { /* Only zero the device if data is already there */ 879 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 880 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 881 } else { /* Might be preallocated but not assembled */ 882 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 883 } 884 PetscFunctionReturn(0); 885 } 886 887 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 888 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 889 { 890 PetscErrorCode ierr; 891 Mat_SeqAIJKokkos *aijkok; 892 893 PetscFunctionBegin; 894 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 895 PetscValidPointer(kv,2); 896 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 897 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 898 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 899 *kv = aijkok->a_dual.view_device(); 900 PetscFunctionReturn(0); 901 } 902 903 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 904 { 905 PetscFunctionBegin; 906 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 907 PetscValidPointer(kv,2); 908 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 909 PetscFunctionReturn(0); 910 } 911 912 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 913 { 914 PetscErrorCode ierr; 915 Mat_SeqAIJKokkos *aijkok; 916 917 PetscFunctionBegin; 918 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 919 PetscValidPointer(kv,2); 920 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 921 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 922 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 923 *kv = aijkok->a_dual.view_device(); 924 PetscFunctionReturn(0); 925 } 926 927 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 928 { 929 PetscErrorCode ierr; 930 931 PetscFunctionBegin; 932 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 933 PetscValidPointer(kv,2); 934 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 935 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 936 PetscFunctionReturn(0); 937 } 938 939 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 940 { 941 Mat_SeqAIJKokkos *aijkok; 942 943 PetscFunctionBegin; 944 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 945 PetscValidPointer(kv,2); 946 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 947 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 948 *kv = aijkok->a_dual.view_device(); 949 PetscFunctionReturn(0); 950 } 951 952 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 953 { 954 PetscErrorCode ierr; 955 956 PetscFunctionBegin; 957 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 958 PetscValidPointer(kv,2); 959 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 960 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 961 PetscFunctionReturn(0); 962 } 963 964 /* Computes Y += alpha X */ 965 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 966 { 967 PetscErrorCode ierr; 968 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 969 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 970 ConstMatScalarKokkosView Xa; 971 MatScalarKokkosView Ya; 972 973 PetscFunctionBegin; 974 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 975 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 976 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 977 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 978 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 979 980 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 981 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 982 PetscBool e; 983 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 984 if (e) { 985 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 986 if (e) pattern = SAME_NONZERO_PATTERN; 987 } 988 } 989 990 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 991 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 992 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 993 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 994 */ 995 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 996 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 997 Xa = xkok->a_dual.view_device(); 998 Ya = ykok->a_dual.view_device(); 999 1000 if (pattern == SAME_NONZERO_PATTERN) { 1001 KokkosBlas::axpy(alpha,Xa,Ya); 1002 ierr = MatSeqAIJKokkosModifyDevice(Y); 1003 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1004 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 1005 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 1006 1007 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1008 PetscInt i = t.league_rank(); /* row i */ 1009 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1010 PetscInt p,q = Yi(i); 1011 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1012 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1013 if (Xj(p) == Yj(q)) { /* Found it */ 1014 Ya(q) += alpha * Xa(p); 1015 q++; 1016 } else { 1017 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1018 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1019 */ 1020 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1021 } 1022 } 1023 }); 1024 }); 1025 ierr = MatSeqAIJKokkosModifyDevice(Y); 1026 } else { /* different nonzero patterns */ 1027 Mat Z; 1028 KokkosCsrMatrix zcsr; 1029 KernelHandle kh; 1030 kh.create_spadd_handle(false); 1031 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1032 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1033 zkok = new Mat_SeqAIJKokkos(zcsr); 1034 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1035 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1036 kh.destroy_spadd_handle(); 1037 } 1038 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1039 ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1040 PetscFunctionReturn(0); 1041 } 1042 1043 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1044 { 1045 PetscErrorCode ierr; 1046 Mat newmat; 1047 Mat_SeqAIJKokkos *akok; 1048 Mat_SeqAIJ *aseq; 1049 1050 PetscFunctionBegin; 1051 ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr); 1052 ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr); 1053 ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr); 1054 ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 1055 ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 1056 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 1057 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1058 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1059 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1060 akok->SetUpCOO(aseq); 1061 PetscFunctionReturn(0); 1062 } 1063 1064 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1065 { 1066 PetscErrorCode ierr; 1067 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1068 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1069 PetscCount Annz = aseq->nz; 1070 const PetscCountKokkosView& jmap = akok->jmap_d; 1071 const PetscCountKokkosView& perm = akok->perm_d; 1072 MatScalarKokkosView Aa; 1073 ConstMatScalarKokkosView kv; 1074 PetscMemType memtype; 1075 1076 PetscFunctionBegin; 1077 PetscAssert(A->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()"); 1078 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 1079 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1080 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1081 } else { 1082 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1083 } 1084 1085 if (imode == INSERT_VALUES) {ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} /* write matrix values */ 1086 else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */ 1087 1088 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1089 PetscScalar sum = 0.0; 1090 for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k)); 1091 Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 1092 }); 1093 1094 if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} 1095 else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);} 1096 PetscFunctionReturn(0); 1097 } 1098 1099 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1100 { 1101 PetscErrorCode ierr; 1102 1103 PetscFunctionBegin; 1104 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1105 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1106 B->offloadmask = PETSC_OFFLOAD_CPU; 1107 PetscFunctionReturn(0); 1108 } 1109 1110 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1111 { 1112 PetscErrorCode ierr; 1113 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1114 1115 PetscFunctionBegin; 1116 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1117 A->boundtocpu = PETSC_FALSE; 1118 1119 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1120 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1121 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1122 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1123 A->ops->scale = MatScale_SeqAIJKokkos; 1124 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1125 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1126 A->ops->mult = MatMult_SeqAIJKokkos; 1127 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1128 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1129 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1130 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1131 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1132 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1133 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1134 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1135 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1136 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1137 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1138 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1139 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1140 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1141 1142 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 1143 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1144 PetscFunctionReturn(0); 1145 } 1146 1147 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1148 { 1149 PetscErrorCode ierr; 1150 Mat_SeqAIJ *aseq; 1151 PetscInt i,m,n; 1152 1153 PetscFunctionBegin; 1154 PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1155 1156 m = akok->nrows(); 1157 n = akok->ncols(); 1158 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1159 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1160 1161 /* Set up data structures of A as a MATSEQAIJ */ 1162 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1163 aseq = (Mat_SeqAIJ*)(A)->data; 1164 1165 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1166 akok->j_dual.sync_host(); 1167 1168 aseq->i = akok->i_host_data(); 1169 aseq->j = akok->j_host_data(); 1170 aseq->a = akok->a_host_data(); 1171 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1172 aseq->singlemalloc = PETSC_FALSE; 1173 aseq->free_a = PETSC_FALSE; 1174 aseq->free_ij = PETSC_FALSE; 1175 aseq->nz = akok->nnz(); 1176 aseq->maxnz = aseq->nz; 1177 1178 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1179 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1180 for (i=0; i<m; i++) { 1181 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1182 } 1183 1184 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1185 akok->nonzerostate = A->nonzerostate; 1186 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1187 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1188 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1189 PetscFunctionReturn(0); 1190 } 1191 1192 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1193 1194 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1195 */ 1196 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1197 { 1198 PetscErrorCode ierr; 1199 1200 PetscFunctionBegin; 1201 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1202 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1203 PetscFunctionReturn(0); 1204 } 1205 1206 /* --------------------------------------------------------------------------------*/ 1207 /*@C 1208 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1209 (the default parallel PETSc format). This matrix will ultimately be handled by 1210 Kokkos for calculations. For good matrix 1211 assembly performance the user should preallocate the matrix storage by setting 1212 the parameter nz (or the array nnz). By setting these parameters accurately, 1213 performance during matrix assembly can be increased by more than a factor of 50. 1214 1215 Collective 1216 1217 Input Parameters: 1218 + comm - MPI communicator, set to PETSC_COMM_SELF 1219 . m - number of rows 1220 . n - number of columns 1221 . nz - number of nonzeros per row (same for all rows) 1222 - nnz - array containing the number of nonzeros in the various rows 1223 (possibly different for each row) or NULL 1224 1225 Output Parameter: 1226 . A - the matrix 1227 1228 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1229 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1230 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1231 1232 Notes: 1233 If nnz is given then nz is ignored 1234 1235 The AIJ format (also called the Yale sparse matrix format or 1236 compressed row storage), is fully compatible with standard Fortran 77 1237 storage. That is, the stored row and column indices can begin at 1238 either one (as in Fortran) or zero. See the users' manual for details. 1239 1240 Specify the preallocated storage with either nz or nnz (not both). 1241 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1242 allocation. For large problems you MUST preallocate memory or you 1243 will get TERRIBLE performance, see the users' manual chapter on matrices. 1244 1245 By default, this format uses inodes (identical nodes) when possible, to 1246 improve numerical efficiency of matrix-vector products and solves. We 1247 search for consecutive rows with the same nonzero structure, thereby 1248 reusing matrix information to achieve increased efficiency. 1249 1250 Level: intermediate 1251 1252 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1253 @*/ 1254 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1255 { 1256 PetscErrorCode ierr; 1257 1258 PetscFunctionBegin; 1259 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1260 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1261 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1262 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1263 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1264 PetscFunctionReturn(0); 1265 } 1266 1267 typedef Kokkos::TeamPolicy<>::member_type team_member; 1268 // 1269 // This factorization exploits block diagonal matrices with "Nf" (not used). 1270 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1271 // 1272 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1273 { 1274 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1275 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1276 IS isrow = b->row,isicol = b->icol; 1277 PetscErrorCode ierr; 1278 const PetscInt *r_h,*ic_h; 1279 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1280 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1281 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1282 PetscBool row_identity,col_identity; 1283 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1284 1285 PetscFunctionBegin; 1286 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1287 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1288 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1289 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1290 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1291 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1292 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1293 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1294 { 1295 #define KOKKOS_SHARED_LEVEL 1 1296 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1297 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1298 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1299 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1300 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1301 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1302 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1303 size_t flops_h = 0.0; 1304 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1305 Kokkos::View<size_t> d_flops_k ("flops"); 1306 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1307 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1308 Kokkos::deep_copy (d_flops_k, h_flops_k); 1309 Kokkos::deep_copy (d_r_k, h_r_k); 1310 Kokkos::deep_copy (d_ic_k, h_ic_k); 1311 // Fill A --> fact 1312 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1313 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1314 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1315 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1316 // zero rows of B 1317 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1318 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1319 PetscScalar *baL = ba_d + bi_d[rowb]; 1320 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1321 /* zero (unfactored row) */ 1322 for (int j=0;j<nzbL;j++) baL[j] = 0; 1323 for (int j=0;j<nzbU;j++) baU[j] = 0; 1324 }); 1325 // copy A into B 1326 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1327 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1328 const PetscScalar *av = aa_d + ai_d[rowa]; 1329 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1330 /* load in initial (unfactored row) */ 1331 for (int j=0;j<nza;j++) { 1332 PetscInt colb = ic[ajtmp[j]]; 1333 PetscScalar vala = av[j]; 1334 if (colb == rowb) { 1335 *(ba_d + bdiag_d[rowb]) = vala; 1336 } else { 1337 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1338 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1339 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1340 for (int j=0; j<nz ; j++) { 1341 if (pbj[j] == colb) { 1342 pba[j] = vala; 1343 set++; 1344 break; 1345 } 1346 } 1347 #if !defined(PETSC_HAVE_SYCL) 1348 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1349 #endif 1350 } 1351 } 1352 }); 1353 }); 1354 Kokkos::fence(); 1355 1356 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1357 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1358 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1359 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1360 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1361 const PetscInt start = field*nloc, end = start + nloc; 1362 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1363 // A22 panel update for each row A(1,:) and col A(:,1) 1364 for (int ii=start; ii<end-1; ii++) { 1365 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1366 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1367 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1368 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1369 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1370 PetscInt kIdx = j*Ni + field_block_idx; 1371 if (kIdx >= nzUi) /* void */ ; 1372 else { 1373 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1374 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1375 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1376 size_t st_idx; 1377 // find and do L(k,i) = A(:k,i) / A(i,i) 1378 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1379 // get column, there has got to be a better way 1380 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1381 if (pjL[j] == ii) { 1382 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1383 idx = j; // output 1384 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1385 } 1386 }, st_idx); 1387 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1388 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1389 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1390 #endif 1391 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1392 // U(i+1,:end) 1393 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1394 PetscScalar Uij = baUi[uiIdx]; 1395 PetscInt col = bjUi[uiIdx]; 1396 if (col==myk) { 1397 // A_kk = A_kk - L_ki * U_ij(k) 1398 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1399 *Akkv = *Akkv - L_ki() * Uij; // UiK 1400 } else { 1401 PetscScalar *start, *end, *pAkjv=NULL; 1402 PetscInt high, low; 1403 const PetscInt *startj; 1404 if (col<myk) { // L 1405 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1406 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1407 start = pLki+1; // start at pLki+1, A22(myk,1) 1408 startj= bj_d + bi_d[myk] + idx; 1409 end = ba_d + bi_d[myk+1]; 1410 } else { 1411 PetscInt idx = bdiag_d[myk+1]+1; 1412 start = ba_d + idx; 1413 startj= bj_d + idx; 1414 end = ba_d + bdiag_d[myk]; 1415 } 1416 // search for 'col', use bisection search - TODO 1417 low = 0; 1418 high = (PetscInt)(end-start); 1419 while (high-low > 5) { 1420 int t = (low+high)/2; 1421 if (startj[t] > col) high = t; 1422 else low = t; 1423 } 1424 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1425 if (startj[pAkjv-start] == col) break; 1426 } 1427 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1428 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1429 #endif 1430 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1431 } 1432 }); 1433 } 1434 }); 1435 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1436 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1437 } /* endof for (i=0; i<n; i++) { */ 1438 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1439 }); 1440 Kokkos::fence(); 1441 Kokkos::deep_copy (h_flops_k, d_flops_k); 1442 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1443 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1444 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1445 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1446 /* Invert diagonal for simpler triangular solves */ 1447 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1448 int i = start + outer_index*Ni + lg_rank%Ni; 1449 if (i < end) { 1450 PetscScalar *pv = ba_d + bdiag_d[i]; 1451 *pv = 1.0/(*pv); 1452 } 1453 }); 1454 }); 1455 } 1456 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1457 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1458 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1459 1460 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1461 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1462 if (b->inode.size) { 1463 B->ops->solve = MatSolve_SeqAIJ_Inode; 1464 } else if (row_identity && col_identity) { 1465 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1466 } else { 1467 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1468 } 1469 B->offloadmask = PETSC_OFFLOAD_GPU; 1470 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1471 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1472 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1473 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1474 B->ops->matsolve = MatMatSolve_SeqAIJ; 1475 B->assembled = PETSC_TRUE; 1476 B->preallocated = PETSC_TRUE; 1477 1478 PetscFunctionReturn(0); 1479 } 1480 1481 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1482 { 1483 PetscErrorCode ierr; 1484 1485 PetscFunctionBegin; 1486 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1487 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1488 PetscFunctionReturn(0); 1489 } 1490 1491 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1492 { 1493 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1494 1495 PetscFunctionBegin; 1496 if (!factors->sptrsv_symbolic_completed) { 1497 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1498 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1499 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1500 } 1501 PetscFunctionReturn(0); 1502 } 1503 1504 /* Check if we need to update factors etc for transpose solve */ 1505 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1506 { 1507 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1508 MatColIdxType n = A->rmap->n; 1509 1510 PetscFunctionBegin; 1511 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1512 /* Update L^T and do sptrsv symbolic */ 1513 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1514 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1515 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1516 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1517 1518 KokkosKernels::Impl::transpose_matrix< 1519 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1520 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1521 MatRowMapKokkosView,DefaultExecutionSpace>( 1522 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1523 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1524 1525 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1526 We have to sort the indices, until KK provides finer control options. 1527 */ 1528 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1529 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1530 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1531 1532 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1533 1534 /* Update U^T and do sptrsv symbolic */ 1535 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1536 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1537 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1538 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1539 1540 KokkosKernels::Impl::transpose_matrix< 1541 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1542 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1543 MatRowMapKokkosView,DefaultExecutionSpace>( 1544 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1545 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1546 1547 /* Sort indices. See comments above */ 1548 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1549 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1550 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1551 1552 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1553 factors->transpose_updated = PETSC_TRUE; 1554 } 1555 PetscFunctionReturn(0); 1556 } 1557 1558 /* Solve Ax = b, with A = LU */ 1559 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1560 { 1561 PetscErrorCode ierr; 1562 ConstPetscScalarKokkosView bv; 1563 PetscScalarKokkosView xv; 1564 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1565 1566 PetscFunctionBegin; 1567 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1568 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1569 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1570 ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1571 /* Solve L tmpv = b */ 1572 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1573 /* Solve Ux = tmpv */ 1574 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1575 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1576 ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1577 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1578 PetscFunctionReturn(0); 1579 } 1580 1581 /* Solve A^T x = b, where A^T = U^T L^T */ 1582 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1583 { 1584 PetscErrorCode ierr; 1585 ConstPetscScalarKokkosView bv; 1586 PetscScalarKokkosView xv; 1587 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1588 1589 PetscFunctionBegin; 1590 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1591 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1592 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1593 ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1594 /* Solve U^T tmpv = b */ 1595 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1596 1597 /* Solve L^T x = tmpv */ 1598 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1599 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1600 ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1601 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1602 PetscFunctionReturn(0); 1603 } 1604 1605 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1606 { 1607 PetscErrorCode ierr; 1608 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1609 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1610 PetscInt fill_lev = info->levels; 1611 1612 PetscFunctionBegin; 1613 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1614 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1615 1616 auto a_d = aijkok->a_dual.view_device(); 1617 auto i_d = aijkok->i_dual.view_device(); 1618 auto j_d = aijkok->j_dual.view_device(); 1619 1620 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1621 1622 B->assembled = PETSC_TRUE; 1623 B->preallocated = PETSC_TRUE; 1624 B->ops->solve = MatSolve_SeqAIJKokkos; 1625 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1626 B->ops->matsolve = NULL; 1627 B->ops->matsolvetranspose = NULL; 1628 B->offloadmask = PETSC_OFFLOAD_GPU; 1629 1630 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1631 factors->transpose_updated = PETSC_FALSE; 1632 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1633 /* TODO: log flops, but how to know that? */ 1634 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1635 PetscFunctionReturn(0); 1636 } 1637 1638 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1639 { 1640 PetscErrorCode ierr; 1641 Mat_SeqAIJKokkos *aijkok; 1642 Mat_SeqAIJ *b; 1643 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1644 PetscInt fill_lev = info->levels; 1645 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1646 PetscInt n = A->rmap->n; 1647 1648 PetscFunctionBegin; 1649 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1650 /* Rebuild factors */ 1651 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1652 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1653 1654 /* Create a spiluk handle and then do symbolic factorization */ 1655 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1656 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1657 1658 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1659 1660 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1661 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1662 Kokkos::realloc(factors->iU_d,n+1); 1663 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1664 1665 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1666 auto i_d = aijkok->i_dual.view_device(); 1667 auto j_d = aijkok->j_dual.view_device(); 1668 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1669 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1670 1671 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1672 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1673 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1674 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1675 1676 /* TODO: add options to select sptrsv algorithms */ 1677 /* Create sptrsv handles for L, U and their transpose */ 1678 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1679 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1680 #else 1681 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1682 #endif 1683 1684 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1685 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1686 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1687 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1688 1689 /* Fill fields of the factor matrix B */ 1690 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1691 b = (Mat_SeqAIJ*)B->data; 1692 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1693 B->info.fill_ratio_given = info->fill; 1694 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1695 1696 B->offloadmask = PETSC_OFFLOAD_GPU; 1697 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1698 PetscFunctionReturn(0); 1699 } 1700 1701 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1702 { 1703 PetscErrorCode ierr; 1704 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1705 const PetscInt nrows = A->rmap->n; 1706 1707 PetscFunctionBegin; 1708 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1709 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1710 // move B data into Kokkos 1711 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1712 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1713 { 1714 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1715 if (!baijkok->diag_d.extent(0)) { 1716 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1717 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1718 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1719 } 1720 } 1721 PetscFunctionReturn(0); 1722 } 1723 1724 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1725 { 1726 PetscFunctionBegin; 1727 *type = MATSOLVERKOKKOS; 1728 PetscFunctionReturn(0); 1729 } 1730 1731 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1732 { 1733 PetscFunctionBegin; 1734 *type = MATSOLVERKOKKOSDEVICE; 1735 PetscFunctionReturn(0); 1736 } 1737 1738 /*MC 1739 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1740 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1741 1742 Level: beginner 1743 1744 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1745 M*/ 1746 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1747 { 1748 PetscErrorCode ierr; 1749 PetscInt n = A->rmap->n; 1750 1751 PetscFunctionBegin; 1752 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1753 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1754 (*B)->factortype = ftype; 1755 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1756 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1757 1758 if (ftype == MAT_FACTOR_LU) { 1759 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1760 (*B)->canuseordering = PETSC_TRUE; 1761 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1762 } else if (ftype == MAT_FACTOR_ILU) { 1763 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1764 (*B)->canuseordering = PETSC_FALSE; 1765 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1766 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1767 1768 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1769 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1770 PetscFunctionReturn(0); 1771 } 1772 1773 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1774 { 1775 PetscErrorCode ierr; 1776 PetscInt n = A->rmap->n; 1777 1778 PetscFunctionBegin; 1779 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1780 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1781 (*B)->factortype = ftype; 1782 (*B)->canuseordering = PETSC_TRUE; 1783 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1784 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1785 1786 if (ftype == MAT_FACTOR_LU) { 1787 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1788 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1789 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1790 1791 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1792 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1793 PetscFunctionReturn(0); 1794 } 1795 1796 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1797 { 1798 PetscErrorCode ierr; 1799 1800 PetscFunctionBegin; 1801 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1802 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1803 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1804 PetscFunctionReturn(0); 1805 } 1806 1807 /* Utility to print out a KokkosCsrMatrix for debugging */ 1808 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1809 { 1810 PetscErrorCode ierr; 1811 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1812 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1813 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1814 const PetscInt *i = iv.data(); 1815 const PetscInt *j = jv.data(); 1816 const PetscScalar *a = av.data(); 1817 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1818 1819 PetscFunctionBegin; 1820 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1821 for (PetscInt k=0; k<m; k++) { 1822 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1823 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1824 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1825 } 1826 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1827 } 1828 PetscFunctionReturn(0); 1829 } 1830