1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64 if (aijkok->a_dual.need_sync_device()) { 65 aijkok->a_dual.sync_device(); 66 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 67 aijkok->hermitian_updated = PETSC_FALSE; 68 } 69 PetscFunctionReturn(0); 70 } 71 72 /* Mark the CSR data on device as modified */ 73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 74 { 75 PetscErrorCode ierr; 76 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 77 78 PetscFunctionBegin; 79 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 80 aijkok->a_dual.clear_sync_state(); 81 aijkok->a_dual.modify_device(); 82 aijkok->transpose_updated = PETSC_FALSE; 83 aijkok->hermitian_updated = PETSC_FALSE; 84 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 85 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 86 PetscFunctionReturn(0); 87 } 88 89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 90 { 91 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 92 93 PetscFunctionBegin; 94 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 95 /* We do not expect one needs factors on host */ 96 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 97 PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 98 aijkok->a_dual.sync_host(); 99 PetscFunctionReturn(0); 100 } 101 102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 103 { 104 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 105 106 PetscFunctionBegin; 107 if (aijkok) { 108 aijkok->a_dual.sync_host(); 109 *array = aijkok->a_dual.view_host().data(); 110 } else { /* Happens when calling MatSetValues on a newly created matrix */ 111 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 112 } 113 PetscFunctionReturn(0); 114 } 115 116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 117 { 118 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119 120 PetscFunctionBegin; 121 if (aijkok) aijkok->a_dual.modify_host(); 122 PetscFunctionReturn(0); 123 } 124 125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 126 { 127 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 128 129 PetscFunctionBegin; 130 if (aijkok) { 131 aijkok->a_dual.sync_host(); 132 *array = aijkok->a_dual.view_host().data(); 133 } else { 134 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 135 } 136 PetscFunctionReturn(0); 137 } 138 139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 140 { 141 PetscFunctionBegin; 142 *array = NULL; 143 PetscFunctionReturn(0); 144 } 145 146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 147 { 148 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 149 150 PetscFunctionBegin; 151 if (aijkok) { 152 *array = aijkok->a_dual.view_host().data(); 153 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 154 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 155 } 156 PetscFunctionReturn(0); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 160 { 161 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 162 163 PetscFunctionBegin; 164 if (aijkok) { 165 aijkok->a_dual.clear_sync_state(); 166 aijkok->a_dual.modify_host(); 167 } 168 PetscFunctionReturn(0); 169 } 170 171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 175 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 176 177 PetscFunctionBegin; 178 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 179 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 180 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 181 PetscFunctionReturn(0); 182 } 183 184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 186 { 187 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188 189 PetscFunctionBegin; 190 if (aijkok && aijkok->device_mat_d.data()) { 191 *d_mat = aijkok->device_mat_d.data(); 192 } else { 193 PetscErrorCode ierr; 194 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 195 *d_mat = NULL; 196 } 197 PetscFunctionReturn(0); 198 } 199 200 /* Generate the transpose on device and cache it internally */ 201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 202 { 203 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 204 205 PetscFunctionBegin; 206 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 208 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 209 CHKERRCXX(aijkok->a_dual.sync_device()); 210 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 211 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 212 aijkok->transpose_updated = PETSC_TRUE; 213 } 214 *csrmatT = &aijkok->csrmatT; 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the Hermitian on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 220 { 221 PetscErrorCode ierr; 222 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 223 224 PetscFunctionBegin; 225 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 226 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 227 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 228 CHKERRCXX(aijkok->a_dual.sync_device()); 229 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 230 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 231 #if defined(PETSC_USE_COMPLEX) 232 const auto& a = aijkok->csrmatH.values; 233 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 234 #endif 235 aijkok->hermitian_updated = PETSC_TRUE; 236 } 237 *csrmatH = &aijkok->csrmatH; 238 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 /* y = A x */ 243 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 244 { 245 PetscErrorCode ierr; 246 Mat_SeqAIJKokkos *aijkok; 247 ConstPetscScalarKokkosView xv; 248 PetscScalarKokkosView yv; 249 250 PetscFunctionBegin; 251 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 252 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 253 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 254 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 255 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 256 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 257 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 258 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 259 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 260 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 261 PetscFunctionReturn(0); 262 } 263 264 /* y = A^T x */ 265 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 266 { 267 PetscErrorCode ierr; 268 Mat_SeqAIJKokkos *aijkok; 269 const char *mode; 270 ConstPetscScalarKokkosView xv; 271 PetscScalarKokkosView yv; 272 KokkosCsrMatrix *csrmat; 273 274 PetscFunctionBegin; 275 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 276 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 277 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 278 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 279 if (A->form_explicit_transpose) { 280 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 281 mode = "N"; 282 } else { 283 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 284 csrmat = &aijkok->csrmat; 285 mode = "T"; 286 } 287 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 288 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 289 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 290 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 291 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 292 PetscFunctionReturn(0); 293 } 294 295 /* y = A^H x */ 296 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 297 { 298 PetscErrorCode ierr; 299 Mat_SeqAIJKokkos *aijkok; 300 const char *mode; 301 ConstPetscScalarKokkosView xv; 302 PetscScalarKokkosView yv; 303 KokkosCsrMatrix *csrmat; 304 305 PetscFunctionBegin; 306 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 307 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 308 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 309 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 310 if (A->form_explicit_transpose) { 311 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 312 mode = "N"; 313 } else { 314 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 315 csrmat = &aijkok->csrmat; 316 mode = "C"; 317 } 318 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 319 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 320 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 321 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 322 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 323 PetscFunctionReturn(0); 324 } 325 326 /* z = A x + y */ 327 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 328 { 329 PetscErrorCode ierr; 330 Mat_SeqAIJKokkos *aijkok; 331 ConstPetscScalarKokkosView xv,yv; 332 PetscScalarKokkosView zv; 333 334 PetscFunctionBegin; 335 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 336 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 337 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 338 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 339 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 340 if (zz != yy) Kokkos::deep_copy(zv,yv); 341 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 342 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 343 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 344 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 346 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 347 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 348 PetscFunctionReturn(0); 349 } 350 351 /* z = A^T x + y */ 352 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 353 { 354 PetscErrorCode ierr; 355 Mat_SeqAIJKokkos *aijkok; 356 const char *mode; 357 ConstPetscScalarKokkosView xv,yv; 358 PetscScalarKokkosView zv; 359 KokkosCsrMatrix *csrmat; 360 361 PetscFunctionBegin; 362 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 363 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 364 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 365 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 366 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 367 if (zz != yy) Kokkos::deep_copy(zv,yv); 368 if (A->form_explicit_transpose) { 369 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 370 mode = "N"; 371 } else { 372 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 373 csrmat = &aijkok->csrmat; 374 mode = "T"; 375 } 376 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 377 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 378 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 379 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 380 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 381 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 382 PetscFunctionReturn(0); 383 } 384 385 /* z = A^H x + y */ 386 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 387 { 388 PetscErrorCode ierr; 389 Mat_SeqAIJKokkos *aijkok; 390 const char *mode; 391 ConstPetscScalarKokkosView xv,yv; 392 PetscScalarKokkosView zv; 393 KokkosCsrMatrix *csrmat; 394 395 PetscFunctionBegin; 396 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 397 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 398 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 399 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 400 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 401 if (zz != yy) Kokkos::deep_copy(zv,yv); 402 if (A->form_explicit_transpose) { 403 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 404 mode = "N"; 405 } else { 406 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 407 csrmat = &aijkok->csrmat; 408 mode = "C"; 409 } 410 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 411 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 412 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 413 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 414 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 415 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 416 PetscFunctionReturn(0); 417 } 418 419 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 420 { 421 PetscErrorCode ierr; 422 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 423 424 PetscFunctionBegin; 425 switch (op) { 426 case MAT_FORM_EXPLICIT_TRANSPOSE: 427 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 428 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 429 A->form_explicit_transpose = flg; 430 break; 431 default: 432 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 433 break; 434 } 435 PetscFunctionReturn(0); 436 } 437 438 /* Depending on reuse, either build a new mat, or use the existing mat */ 439 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 440 { 441 PetscErrorCode ierr; 442 Mat_SeqAIJ *aseq; 443 444 PetscFunctionBegin; 445 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 446 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 447 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 448 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 449 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 450 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 451 PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 452 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 453 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 454 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 455 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 456 aseq = static_cast<Mat_SeqAIJ*>(A->data); 457 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 458 PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 459 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 460 } 461 } 462 PetscFunctionReturn(0); 463 } 464 465 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 466 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 467 */ 468 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 469 { 470 PetscErrorCode ierr; 471 Mat_SeqAIJ *bseq; 472 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 473 Mat mat; 474 475 PetscFunctionBegin; 476 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 477 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 478 mat = *B; 479 if (A->assembled) { 480 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 481 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 482 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 483 /* Now copy values to B if needed */ 484 if (dupOption == MAT_COPY_VALUES) { 485 if (akok->a_dual.need_sync_device()) { 486 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 487 bkok->a_dual.modify_host(); 488 } else { /* If device has the latest data, we only copy data on device */ 489 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 490 bkok->a_dual.modify_device(); 491 } 492 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 493 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 494 bkok->a_dual.modify_host(); 495 } 496 mat->spptr = bkok; 497 } 498 499 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 500 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 501 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 502 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 503 PetscFunctionReturn(0); 504 } 505 506 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 507 { 508 PetscErrorCode ierr; 509 Mat At; 510 KokkosCsrMatrix *internT; 511 Mat_SeqAIJKokkos *atkok,*bkok; 512 513 PetscFunctionBegin; 514 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 515 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 516 /* Deep copy internT, as we want to isolate the internal transpose */ 517 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 518 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 519 if (reuse == MAT_INITIAL_MATRIX) *B = At; 520 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 521 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 522 if ((*B)->assembled) { 523 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 524 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 525 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 526 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 527 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 528 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 529 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 530 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 531 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 532 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 533 } 534 PetscFunctionReturn(0); 535 } 536 537 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 538 { 539 PetscErrorCode ierr; 540 Mat_SeqAIJKokkos *aijkok; 541 542 PetscFunctionBegin; 543 if (A->factortype == MAT_FACTOR_NONE) { 544 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 545 delete aijkok; 546 } else { 547 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 548 } 549 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 552 A->spptr = NULL; 553 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 554 PetscFunctionReturn(0); 555 } 556 557 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 558 { 559 PetscErrorCode ierr; 560 561 PetscFunctionBegin; 562 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 563 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 564 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 565 PetscFunctionReturn(0); 566 } 567 568 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 569 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 570 { 571 PetscErrorCode ierr; 572 Mat_SeqAIJ *a,*b; 573 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 574 MatScalarKokkosView aa,ba,ca; 575 MatRowMapKokkosView ai,bi,ci; 576 MatColIdxKokkosView aj,bj,cj; 577 PetscInt m,n,nnz,aN; 578 579 PetscFunctionBegin; 580 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 581 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 582 PetscValidPointer(C,4); 583 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 584 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 585 PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 586 PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 587 588 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 589 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 590 a = static_cast<Mat_SeqAIJ*>(A->data); 591 b = static_cast<Mat_SeqAIJ*>(B->data); 592 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 593 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 594 aa = akok->a_dual.view_device(); 595 ai = akok->i_dual.view_device(); 596 ba = bkok->a_dual.view_device(); 597 bi = bkok->i_dual.view_device(); 598 m = A->rmap->n; /* M, N and nnz of C */ 599 n = A->cmap->n + B->cmap->n; 600 nnz = a->nz + b->nz; 601 aN = A->cmap->n; /* N of A */ 602 if (reuse == MAT_INITIAL_MATRIX) { 603 aj = akok->j_dual.view_device(); 604 bj = bkok->j_dual.view_device(); 605 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 606 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 607 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 608 ca = ca_dual.view_device(); 609 ci = ci_dual.view_device(); 610 cj = cj_dual.view_device(); 611 612 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 613 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 614 PetscInt i = t.league_rank(); /* row i */ 615 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 616 617 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 618 ci(i) = coffset; 619 if (i == m-1) ci(m) = ai(m) + bi(m); 620 }); 621 622 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 623 if (k < alen) { 624 ca(coffset+k) = aa(ai(i)+k); 625 cj(coffset+k) = aj(ai(i)+k); 626 } else { 627 ca(coffset+k) = ba(bi(i)+k-alen); 628 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 629 } 630 }); 631 }); 632 ca_dual.modify_device(); 633 ci_dual.modify_device(); 634 cj_dual.modify_device(); 635 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 636 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 637 } else if (reuse == MAT_REUSE_MATRIX) { 638 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 639 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 640 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 641 ca = ckok->a_dual.view_device(); 642 ci = ckok->i_dual.view_device(); 643 644 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 645 PetscInt i = t.league_rank(); /* row i */ 646 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 647 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 648 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 649 else ca(ci(i)+k) = ba(bi(i)+k-alen); 650 }); 651 }); 652 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 653 } 654 PetscFunctionReturn(0); 655 } 656 657 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 658 { 659 PetscFunctionBegin; 660 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 661 PetscFunctionReturn(0); 662 } 663 664 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 665 { 666 PetscErrorCode ierr; 667 Mat_Product *product = C->product; 668 Mat A,B; 669 bool transA,transB; /* use bool, since KK needs this type */ 670 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 671 Mat_SeqAIJ *c; 672 MatProductData_SeqAIJKokkos *pdata; 673 KokkosCsrMatrix *csrmatA,*csrmatB; 674 675 PetscFunctionBegin; 676 MatCheckProduct(C,1); 677 PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 678 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 679 680 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 681 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 682 PetscFunctionReturn(0); 683 } 684 685 switch (product->type) { 686 case MATPRODUCT_AB: transA = false; transB = false; break; 687 case MATPRODUCT_AtB: transA = true; transB = false; break; 688 case MATPRODUCT_ABt: transA = false; transB = true; break; 689 default: 690 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 691 } 692 693 A = product->A; 694 B = product->B; 695 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 696 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 697 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 698 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 699 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 700 701 PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 702 703 csrmatA = &akok->csrmat; 704 csrmatB = &bkok->csrmat; 705 706 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 707 if (transA) { 708 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 709 transA = false; 710 } 711 712 if (transB) { 713 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 714 transB = false; 715 } 716 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 717 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 718 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 719 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 720 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 721 /* shorter version of MatAssemblyEnd_SeqAIJ */ 722 c = (Mat_SeqAIJ*)C->data; 723 ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 724 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 725 ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 726 c->reallocs = 0; 727 C->info.mallocs = 0; 728 C->info.nz_unneeded = 0; 729 C->assembled = C->was_assembled = PETSC_TRUE; 730 C->num_ass++; 731 PetscFunctionReturn(0); 732 } 733 734 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 735 { 736 PetscErrorCode ierr; 737 Mat_Product *product = C->product; 738 MatProductType ptype; 739 Mat A,B; 740 bool transA,transB; 741 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 742 MatProductData_SeqAIJKokkos *pdata; 743 MPI_Comm comm; 744 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 745 746 PetscFunctionBegin; 747 MatCheckProduct(C,1); 748 ierr = PetscObjectGetComm((PetscObject)C,&comm); 749 PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 750 A = product->A; 751 B = product->B; 752 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 753 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 754 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 755 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 756 csrmatA = &akok->csrmat; 757 csrmatB = &bkok->csrmat; 758 759 ptype = product->type; 760 switch (ptype) { 761 case MATPRODUCT_AB: transA = false; transB = false; break; 762 case MATPRODUCT_AtB: transA = true; transB = false; break; 763 case MATPRODUCT_ABt: transA = false; transB = true; break; 764 default: 765 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 766 } 767 768 product->data = pdata = new MatProductData_SeqAIJKokkos(); 769 pdata->kh.set_team_work_size(16); 770 pdata->kh.set_dynamic_scheduling(true); 771 pdata->reusesym = product->api_user; 772 773 /* TODO: add command line options to select spgemm algorithms */ 774 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 775 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 776 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 777 */ 778 pdata->kh.create_spgemm_handle(spgemm_alg); 779 780 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 781 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 782 if (transA) { 783 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 784 transA = false; 785 } 786 787 if (transB) { 788 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 789 transB = false; 790 } 791 792 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 793 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 794 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 795 calling new Mat_SeqAIJKokkos(). 796 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 797 */ 798 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 799 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 800 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 801 802 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 803 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 804 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 805 PetscFunctionReturn(0); 806 } 807 808 /* handles sparse matrix matrix ops */ 809 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 810 { 811 PetscErrorCode ierr; 812 Mat_Product *product = mat->product; 813 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 814 815 PetscFunctionBegin; 816 MatCheckProduct(mat,1); 817 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 818 if (product->type == MATPRODUCT_ABC) { 819 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 820 } 821 if (Biskok && Ciskok) { 822 switch (product->type) { 823 case MATPRODUCT_AB: 824 case MATPRODUCT_AtB: 825 case MATPRODUCT_ABt: 826 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 827 break; 828 case MATPRODUCT_PtAP: 829 case MATPRODUCT_RARt: 830 case MATPRODUCT_ABC: 831 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 832 break; 833 default: 834 break; 835 } 836 } else { /* fallback for AIJ */ 837 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 838 } 839 PetscFunctionReturn(0); 840 } 841 842 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 843 { 844 PetscErrorCode ierr; 845 Mat_SeqAIJKokkos *aijkok; 846 847 PetscFunctionBegin; 848 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 849 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 850 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 851 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 852 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 853 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 854 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 855 PetscFunctionReturn(0); 856 } 857 858 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 859 { 860 PetscErrorCode ierr; 861 Mat_SeqAIJKokkos *aijkok; 862 863 PetscFunctionBegin; 864 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 865 if (aijkok) { /* Only zero the device if data is already there */ 866 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 867 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 868 } else { /* Might be preallocated but not assembled */ 869 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 870 } 871 PetscFunctionReturn(0); 872 } 873 874 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 875 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 876 { 877 PetscErrorCode ierr; 878 Mat_SeqAIJKokkos *aijkok; 879 880 PetscFunctionBegin; 881 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 882 PetscValidPointer(kv,2); 883 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 884 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 885 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 886 *kv = aijkok->a_dual.view_device(); 887 PetscFunctionReturn(0); 888 } 889 890 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 891 { 892 PetscFunctionBegin; 893 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 894 PetscValidPointer(kv,2); 895 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 896 PetscFunctionReturn(0); 897 } 898 899 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 900 { 901 PetscErrorCode ierr; 902 Mat_SeqAIJKokkos *aijkok; 903 904 PetscFunctionBegin; 905 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 906 PetscValidPointer(kv,2); 907 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 908 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 909 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 910 *kv = aijkok->a_dual.view_device(); 911 PetscFunctionReturn(0); 912 } 913 914 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 915 { 916 PetscErrorCode ierr; 917 918 PetscFunctionBegin; 919 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 920 PetscValidPointer(kv,2); 921 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 922 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 923 PetscFunctionReturn(0); 924 } 925 926 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 927 { 928 Mat_SeqAIJKokkos *aijkok; 929 930 PetscFunctionBegin; 931 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 932 PetscValidPointer(kv,2); 933 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 934 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 935 *kv = aijkok->a_dual.view_device(); 936 PetscFunctionReturn(0); 937 } 938 939 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 940 { 941 PetscErrorCode ierr; 942 943 PetscFunctionBegin; 944 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 945 PetscValidPointer(kv,2); 946 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 947 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 948 PetscFunctionReturn(0); 949 } 950 951 /* Computes Y += alpha X */ 952 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 953 { 954 PetscErrorCode ierr; 955 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 956 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 957 ConstMatScalarKokkosView Xa; 958 MatScalarKokkosView Ya; 959 960 PetscFunctionBegin; 961 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 962 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 963 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 964 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 965 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 966 967 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 968 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 969 PetscBool e; 970 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 971 if (e) { 972 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 973 if (e) pattern = SAME_NONZERO_PATTERN; 974 } 975 } 976 977 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 978 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 979 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 980 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 981 */ 982 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 983 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 984 Xa = xkok->a_dual.view_device(); 985 Ya = ykok->a_dual.view_device(); 986 987 if (pattern == SAME_NONZERO_PATTERN) { 988 KokkosBlas::axpy(alpha,Xa,Ya); 989 ierr = MatSeqAIJKokkosModifyDevice(Y); 990 } else if (pattern == SUBSET_NONZERO_PATTERN) { 991 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 992 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 993 994 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 995 PetscInt i = t.league_rank(); /* row i */ 996 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 997 PetscInt p,q = Yi(i); 998 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 999 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1000 if (Xj(p) == Yj(q)) { /* Found it */ 1001 Ya(q) += alpha * Xa(p); 1002 q++; 1003 } else { 1004 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1005 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1006 */ 1007 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1008 } 1009 } 1010 }); 1011 }); 1012 ierr = MatSeqAIJKokkosModifyDevice(Y); 1013 } else { /* different nonzero patterns */ 1014 Mat Z; 1015 KokkosCsrMatrix zcsr; 1016 KernelHandle kh; 1017 kh.create_spadd_handle(false); 1018 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1019 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1020 zkok = new Mat_SeqAIJKokkos(zcsr); 1021 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1022 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1023 kh.destroy_spadd_handle(); 1024 } 1025 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1026 ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1027 PetscFunctionReturn(0); 1028 } 1029 1030 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1031 { 1032 PetscErrorCode ierr; 1033 Mat newmat; 1034 Mat_SeqAIJKokkos *akok; 1035 Mat_SeqAIJ *aseq; 1036 1037 PetscFunctionBegin; 1038 ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr); 1039 ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr); 1040 ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr); 1041 ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 1042 ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 1043 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 1044 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1045 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1046 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1047 akok->SetUpCOO(aseq); 1048 PetscFunctionReturn(0); 1049 } 1050 1051 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1052 { 1053 PetscErrorCode ierr; 1054 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1055 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1056 PetscCount Annz = aseq->nz; 1057 const PetscCountKokkosView& jmap = akok->jmap_d; 1058 const PetscCountKokkosView& perm = akok->perm_d; 1059 MatScalarKokkosView Aa; 1060 ConstMatScalarKokkosView kv; 1061 PetscMemType memtype; 1062 1063 PetscFunctionBegin; 1064 PetscAssert(A->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()"); 1065 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 1066 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1067 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1068 } else { 1069 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1070 } 1071 1072 if (imode == INSERT_VALUES) {ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} /* write matrix values */ 1073 else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */ 1074 1075 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1076 for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k)); 1077 }); 1078 1079 if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} 1080 else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);} 1081 PetscFunctionReturn(0); 1082 } 1083 1084 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1085 { 1086 PetscErrorCode ierr; 1087 1088 PetscFunctionBegin; 1089 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1090 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1091 B->offloadmask = PETSC_OFFLOAD_CPU; 1092 PetscFunctionReturn(0); 1093 } 1094 1095 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1096 { 1097 PetscErrorCode ierr; 1098 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1099 1100 PetscFunctionBegin; 1101 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1102 A->boundtocpu = PETSC_FALSE; 1103 1104 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1105 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1106 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1107 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1108 A->ops->scale = MatScale_SeqAIJKokkos; 1109 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1110 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1111 A->ops->mult = MatMult_SeqAIJKokkos; 1112 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1113 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1114 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1115 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1116 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1117 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1118 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1119 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1120 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1121 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1122 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1123 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1124 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1125 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1126 1127 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 1128 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1129 PetscFunctionReturn(0); 1130 } 1131 1132 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1133 { 1134 PetscErrorCode ierr; 1135 Mat_SeqAIJ *aseq; 1136 PetscInt i,m,n; 1137 1138 PetscFunctionBegin; 1139 PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1140 1141 m = akok->nrows(); 1142 n = akok->ncols(); 1143 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1144 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1145 1146 /* Set up data structures of A as a MATSEQAIJ */ 1147 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1148 aseq = (Mat_SeqAIJ*)(A)->data; 1149 1150 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1151 akok->j_dual.sync_host(); 1152 1153 aseq->i = akok->i_host_data(); 1154 aseq->j = akok->j_host_data(); 1155 aseq->a = akok->a_host_data(); 1156 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1157 aseq->singlemalloc = PETSC_FALSE; 1158 aseq->free_a = PETSC_FALSE; 1159 aseq->free_ij = PETSC_FALSE; 1160 aseq->nz = akok->nnz(); 1161 aseq->maxnz = aseq->nz; 1162 1163 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1164 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1165 for (i=0; i<m; i++) { 1166 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1167 } 1168 1169 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1170 akok->nonzerostate = A->nonzerostate; 1171 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1172 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1173 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1174 PetscFunctionReturn(0); 1175 } 1176 1177 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1178 1179 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1180 */ 1181 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1182 { 1183 PetscErrorCode ierr; 1184 1185 PetscFunctionBegin; 1186 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1187 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1188 PetscFunctionReturn(0); 1189 } 1190 1191 /* --------------------------------------------------------------------------------*/ 1192 /*@C 1193 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1194 (the default parallel PETSc format). This matrix will ultimately be handled by 1195 Kokkos for calculations. For good matrix 1196 assembly performance the user should preallocate the matrix storage by setting 1197 the parameter nz (or the array nnz). By setting these parameters accurately, 1198 performance during matrix assembly can be increased by more than a factor of 50. 1199 1200 Collective 1201 1202 Input Parameters: 1203 + comm - MPI communicator, set to PETSC_COMM_SELF 1204 . m - number of rows 1205 . n - number of columns 1206 . nz - number of nonzeros per row (same for all rows) 1207 - nnz - array containing the number of nonzeros in the various rows 1208 (possibly different for each row) or NULL 1209 1210 Output Parameter: 1211 . A - the matrix 1212 1213 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1214 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1215 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1216 1217 Notes: 1218 If nnz is given then nz is ignored 1219 1220 The AIJ format (also called the Yale sparse matrix format or 1221 compressed row storage), is fully compatible with standard Fortran 77 1222 storage. That is, the stored row and column indices can begin at 1223 either one (as in Fortran) or zero. See the users' manual for details. 1224 1225 Specify the preallocated storage with either nz or nnz (not both). 1226 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1227 allocation. For large problems you MUST preallocate memory or you 1228 will get TERRIBLE performance, see the users' manual chapter on matrices. 1229 1230 By default, this format uses inodes (identical nodes) when possible, to 1231 improve numerical efficiency of matrix-vector products and solves. We 1232 search for consecutive rows with the same nonzero structure, thereby 1233 reusing matrix information to achieve increased efficiency. 1234 1235 Level: intermediate 1236 1237 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1238 @*/ 1239 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1240 { 1241 PetscErrorCode ierr; 1242 1243 PetscFunctionBegin; 1244 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1245 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1246 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1247 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1248 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1249 PetscFunctionReturn(0); 1250 } 1251 1252 typedef Kokkos::TeamPolicy<>::member_type team_member; 1253 // 1254 // This factorization exploits block diagonal matrices with "Nf" (not used). 1255 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1256 // 1257 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1258 { 1259 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1260 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1261 IS isrow = b->row,isicol = b->icol; 1262 PetscErrorCode ierr; 1263 const PetscInt *r_h,*ic_h; 1264 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1265 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1266 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1267 PetscBool row_identity,col_identity; 1268 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1269 1270 PetscFunctionBegin; 1271 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1272 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1273 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1274 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1275 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1276 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1277 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1278 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1279 { 1280 #define KOKKOS_SHARED_LEVEL 1 1281 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1282 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1283 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1284 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1285 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1286 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1287 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1288 size_t flops_h = 0.0; 1289 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1290 Kokkos::View<size_t> d_flops_k ("flops"); 1291 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1292 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1293 Kokkos::deep_copy (d_flops_k, h_flops_k); 1294 Kokkos::deep_copy (d_r_k, h_r_k); 1295 Kokkos::deep_copy (d_ic_k, h_ic_k); 1296 // Fill A --> fact 1297 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1298 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1299 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1300 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1301 // zero rows of B 1302 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1303 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1304 PetscScalar *baL = ba_d + bi_d[rowb]; 1305 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1306 /* zero (unfactored row) */ 1307 for (int j=0;j<nzbL;j++) baL[j] = 0; 1308 for (int j=0;j<nzbU;j++) baU[j] = 0; 1309 }); 1310 // copy A into B 1311 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1312 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1313 const PetscScalar *av = aa_d + ai_d[rowa]; 1314 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1315 /* load in initial (unfactored row) */ 1316 for (int j=0;j<nza;j++) { 1317 PetscInt colb = ic[ajtmp[j]]; 1318 PetscScalar vala = av[j]; 1319 if (colb == rowb) { 1320 *(ba_d + bdiag_d[rowb]) = vala; 1321 } else { 1322 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1323 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1324 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1325 for (int j=0; j<nz ; j++) { 1326 if (pbj[j] == colb) { 1327 pba[j] = vala; 1328 set++; 1329 break; 1330 } 1331 } 1332 #if !defined(PETSC_HAVE_SYCL) 1333 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1334 #endif 1335 } 1336 } 1337 }); 1338 }); 1339 Kokkos::fence(); 1340 1341 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1342 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1343 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1344 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1345 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1346 const PetscInt start = field*nloc, end = start + nloc; 1347 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1348 // A22 panel update for each row A(1,:) and col A(:,1) 1349 for (int ii=start; ii<end-1; ii++) { 1350 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1351 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1352 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1353 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1354 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1355 PetscInt kIdx = j*Ni + field_block_idx; 1356 if (kIdx >= nzUi) /* void */ ; 1357 else { 1358 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1359 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1360 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1361 size_t st_idx; 1362 // find and do L(k,i) = A(:k,i) / A(i,i) 1363 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1364 // get column, there has got to be a better way 1365 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1366 if (pjL[j] == ii) { 1367 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1368 idx = j; // output 1369 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1370 } 1371 }, st_idx); 1372 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1373 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1374 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1375 #endif 1376 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1377 // U(i+1,:end) 1378 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1379 PetscScalar Uij = baUi[uiIdx]; 1380 PetscInt col = bjUi[uiIdx]; 1381 if (col==myk) { 1382 // A_kk = A_kk - L_ki * U_ij(k) 1383 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1384 *Akkv = *Akkv - L_ki() * Uij; // UiK 1385 } else { 1386 PetscScalar *start, *end, *pAkjv=NULL; 1387 PetscInt high, low; 1388 const PetscInt *startj; 1389 if (col<myk) { // L 1390 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1391 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1392 start = pLki+1; // start at pLki+1, A22(myk,1) 1393 startj= bj_d + bi_d[myk] + idx; 1394 end = ba_d + bi_d[myk+1]; 1395 } else { 1396 PetscInt idx = bdiag_d[myk+1]+1; 1397 start = ba_d + idx; 1398 startj= bj_d + idx; 1399 end = ba_d + bdiag_d[myk]; 1400 } 1401 // search for 'col', use bisection search - TODO 1402 low = 0; 1403 high = (PetscInt)(end-start); 1404 while (high-low > 5) { 1405 int t = (low+high)/2; 1406 if (startj[t] > col) high = t; 1407 else low = t; 1408 } 1409 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1410 if (startj[pAkjv-start] == col) break; 1411 } 1412 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1413 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1414 #endif 1415 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1416 } 1417 }); 1418 } 1419 }); 1420 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1421 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1422 } /* endof for (i=0; i<n; i++) { */ 1423 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1424 }); 1425 Kokkos::fence(); 1426 Kokkos::deep_copy (h_flops_k, d_flops_k); 1427 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1428 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1429 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1430 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1431 /* Invert diagonal for simpler triangular solves */ 1432 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1433 int i = start + outer_index*Ni + lg_rank%Ni; 1434 if (i < end) { 1435 PetscScalar *pv = ba_d + bdiag_d[i]; 1436 *pv = 1.0/(*pv); 1437 } 1438 }); 1439 }); 1440 } 1441 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1442 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1443 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1444 1445 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1446 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1447 if (b->inode.size) { 1448 B->ops->solve = MatSolve_SeqAIJ_Inode; 1449 } else if (row_identity && col_identity) { 1450 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1451 } else { 1452 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1453 } 1454 B->offloadmask = PETSC_OFFLOAD_GPU; 1455 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1456 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1457 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1458 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1459 B->ops->matsolve = MatMatSolve_SeqAIJ; 1460 B->assembled = PETSC_TRUE; 1461 B->preallocated = PETSC_TRUE; 1462 1463 PetscFunctionReturn(0); 1464 } 1465 1466 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1467 { 1468 PetscErrorCode ierr; 1469 1470 PetscFunctionBegin; 1471 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1472 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1473 PetscFunctionReturn(0); 1474 } 1475 1476 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1477 { 1478 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1479 1480 PetscFunctionBegin; 1481 if (!factors->sptrsv_symbolic_completed) { 1482 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1483 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1484 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1485 } 1486 PetscFunctionReturn(0); 1487 } 1488 1489 /* Check if we need to update factors etc for transpose solve */ 1490 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1491 { 1492 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1493 MatColIdxType n = A->rmap->n; 1494 1495 PetscFunctionBegin; 1496 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1497 /* Update L^T and do sptrsv symbolic */ 1498 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1499 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1500 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1501 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1502 1503 KokkosKernels::Impl::transpose_matrix< 1504 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1505 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1506 MatRowMapKokkosView,DefaultExecutionSpace>( 1507 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1508 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1509 1510 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1511 We have to sort the indices, until KK provides finer control options. 1512 */ 1513 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1514 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1515 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1516 1517 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1518 1519 /* Update U^T and do sptrsv symbolic */ 1520 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1521 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1522 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1523 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1524 1525 KokkosKernels::Impl::transpose_matrix< 1526 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1527 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1528 MatRowMapKokkosView,DefaultExecutionSpace>( 1529 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1530 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1531 1532 /* Sort indices. See comments above */ 1533 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1534 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1535 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1536 1537 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1538 factors->transpose_updated = PETSC_TRUE; 1539 } 1540 PetscFunctionReturn(0); 1541 } 1542 1543 /* Solve Ax = b, with A = LU */ 1544 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1545 { 1546 PetscErrorCode ierr; 1547 ConstPetscScalarKokkosView bv; 1548 PetscScalarKokkosView xv; 1549 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1550 1551 PetscFunctionBegin; 1552 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1553 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1554 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1555 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1556 /* Solve L tmpv = b */ 1557 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1558 /* Solve Ux = tmpv */ 1559 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1560 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1561 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1562 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1563 PetscFunctionReturn(0); 1564 } 1565 1566 /* Solve A^T x = b, where A^T = U^T L^T */ 1567 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1568 { 1569 PetscErrorCode ierr; 1570 ConstPetscScalarKokkosView bv; 1571 PetscScalarKokkosView xv; 1572 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1573 1574 PetscFunctionBegin; 1575 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1576 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1577 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1578 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1579 /* Solve U^T tmpv = b */ 1580 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1581 1582 /* Solve L^T x = tmpv */ 1583 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1584 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1585 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1586 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1587 PetscFunctionReturn(0); 1588 } 1589 1590 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1591 { 1592 PetscErrorCode ierr; 1593 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1594 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1595 PetscInt fill_lev = info->levels; 1596 1597 PetscFunctionBegin; 1598 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1599 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1600 1601 auto a_d = aijkok->a_dual.view_device(); 1602 auto i_d = aijkok->i_dual.view_device(); 1603 auto j_d = aijkok->j_dual.view_device(); 1604 1605 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1606 1607 B->assembled = PETSC_TRUE; 1608 B->preallocated = PETSC_TRUE; 1609 B->ops->solve = MatSolve_SeqAIJKokkos; 1610 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1611 B->ops->matsolve = NULL; 1612 B->ops->matsolvetranspose = NULL; 1613 B->offloadmask = PETSC_OFFLOAD_GPU; 1614 1615 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1616 factors->transpose_updated = PETSC_FALSE; 1617 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1618 /* TODO: log flops, but how to know that? */ 1619 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1620 PetscFunctionReturn(0); 1621 } 1622 1623 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1624 { 1625 PetscErrorCode ierr; 1626 Mat_SeqAIJKokkos *aijkok; 1627 Mat_SeqAIJ *b; 1628 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1629 PetscInt fill_lev = info->levels; 1630 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1631 PetscInt n = A->rmap->n; 1632 1633 PetscFunctionBegin; 1634 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1635 /* Rebuild factors */ 1636 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1637 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1638 1639 /* Create a spiluk handle and then do symbolic factorization */ 1640 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1641 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1642 1643 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1644 1645 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1646 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1647 Kokkos::realloc(factors->iU_d,n+1); 1648 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1649 1650 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1651 auto i_d = aijkok->i_dual.view_device(); 1652 auto j_d = aijkok->j_dual.view_device(); 1653 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1654 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1655 1656 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1657 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1658 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1659 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1660 1661 /* TODO: add options to select sptrsv algorithms */ 1662 /* Create sptrsv handles for L, U and their transpose */ 1663 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1664 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1665 #else 1666 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1667 #endif 1668 1669 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1670 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1671 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1672 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1673 1674 /* Fill fields of the factor matrix B */ 1675 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1676 b = (Mat_SeqAIJ*)B->data; 1677 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1678 B->info.fill_ratio_given = info->fill; 1679 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1680 1681 B->offloadmask = PETSC_OFFLOAD_GPU; 1682 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1683 PetscFunctionReturn(0); 1684 } 1685 1686 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1687 { 1688 PetscErrorCode ierr; 1689 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1690 const PetscInt nrows = A->rmap->n; 1691 1692 PetscFunctionBegin; 1693 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1694 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1695 // move B data into Kokkos 1696 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1697 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1698 { 1699 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1700 if (!baijkok->diag_d.extent(0)) { 1701 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1702 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1703 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1704 } 1705 } 1706 PetscFunctionReturn(0); 1707 } 1708 1709 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1710 { 1711 PetscFunctionBegin; 1712 *type = MATSOLVERKOKKOS; 1713 PetscFunctionReturn(0); 1714 } 1715 1716 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1717 { 1718 PetscFunctionBegin; 1719 *type = MATSOLVERKOKKOSDEVICE; 1720 PetscFunctionReturn(0); 1721 } 1722 1723 /*MC 1724 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1725 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1726 1727 Level: beginner 1728 1729 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1730 M*/ 1731 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1732 { 1733 PetscErrorCode ierr; 1734 PetscInt n = A->rmap->n; 1735 1736 PetscFunctionBegin; 1737 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1738 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1739 (*B)->factortype = ftype; 1740 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1741 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1742 1743 if (ftype == MAT_FACTOR_LU) { 1744 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1745 (*B)->canuseordering = PETSC_TRUE; 1746 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1747 } else if (ftype == MAT_FACTOR_ILU) { 1748 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1749 (*B)->canuseordering = PETSC_FALSE; 1750 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1751 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1752 1753 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1754 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1755 PetscFunctionReturn(0); 1756 } 1757 1758 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1759 { 1760 PetscErrorCode ierr; 1761 PetscInt n = A->rmap->n; 1762 1763 PetscFunctionBegin; 1764 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1765 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1766 (*B)->factortype = ftype; 1767 (*B)->canuseordering = PETSC_TRUE; 1768 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1769 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1770 1771 if (ftype == MAT_FACTOR_LU) { 1772 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1773 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1774 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1775 1776 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1777 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1778 PetscFunctionReturn(0); 1779 } 1780 1781 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1782 { 1783 PetscErrorCode ierr; 1784 1785 PetscFunctionBegin; 1786 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1787 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1788 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1789 PetscFunctionReturn(0); 1790 } 1791 1792 /* Utility to print out a KokkosCsrMatrix for debugging */ 1793 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1794 { 1795 PetscErrorCode ierr; 1796 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1797 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1798 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1799 const PetscInt *i = iv.data(); 1800 const PetscInt *j = jv.data(); 1801 const PetscScalar *a = av.data(); 1802 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1803 1804 PetscFunctionBegin; 1805 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1806 for (PetscInt k=0; k<m; k++) { 1807 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1808 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1809 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1810 } 1811 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1812 } 1813 PetscFunctionReturn(0); 1814 } 1815