1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petscsystypes.h> 5 #include <petscerror.h> 6 7 #include <Kokkos_Core.hpp> 8 #include <KokkosBlas.hpp> 9 #include <KokkosKernels_Sorting.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/aij.h> 18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64 if (aijkok->a_dual.need_sync_device()) { 65 aijkok->a_dual.sync_device(); 66 aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 67 aijkok->hermitian_updated = PETSC_FALSE; 68 } 69 PetscFunctionReturn(0); 70 } 71 72 /* Mark the CSR data on device as modified */ 73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 74 { 75 PetscErrorCode ierr; 76 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 77 78 PetscFunctionBegin; 79 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 80 aijkok->a_dual.clear_sync_state(); 81 aijkok->a_dual.modify_device(); 82 aijkok->transpose_updated = PETSC_FALSE; 83 aijkok->hermitian_updated = PETSC_FALSE; 84 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 85 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 86 PetscFunctionReturn(0); 87 } 88 89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 90 { 91 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 92 93 PetscFunctionBegin; 94 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 95 /* We do not expect one needs factors on host */ 96 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 97 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 98 aijkok->a_dual.sync_host(); 99 PetscFunctionReturn(0); 100 } 101 102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 103 { 104 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 105 106 PetscFunctionBegin; 107 if (aijkok) { 108 aijkok->a_dual.sync_host(); 109 *array = aijkok->a_dual.view_host().data(); 110 } else { /* Happens when calling MatSetValues on a newly created matrix */ 111 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 112 } 113 PetscFunctionReturn(0); 114 } 115 116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 117 { 118 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119 120 PetscFunctionBegin; 121 if (aijkok) aijkok->a_dual.modify_host(); 122 PetscFunctionReturn(0); 123 } 124 125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 126 { 127 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 128 129 PetscFunctionBegin; 130 if (aijkok) { 131 aijkok->a_dual.sync_host(); 132 *array = aijkok->a_dual.view_host().data(); 133 } else { 134 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 135 } 136 PetscFunctionReturn(0); 137 } 138 139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 140 { 141 PetscFunctionBegin; 142 *array = NULL; 143 PetscFunctionReturn(0); 144 } 145 146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 147 { 148 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 149 150 PetscFunctionBegin; 151 if (aijkok) { 152 *array = aijkok->a_dual.view_host().data(); 153 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 154 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 155 } 156 PetscFunctionReturn(0); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 160 { 161 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 162 163 PetscFunctionBegin; 164 if (aijkok) { 165 aijkok->a_dual.clear_sync_state(); 166 aijkok->a_dual.modify_host(); 167 } 168 PetscFunctionReturn(0); 169 } 170 171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 175 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 176 177 PetscFunctionBegin; 178 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 179 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 180 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 181 PetscFunctionReturn(0); 182 } 183 184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 186 { 187 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188 189 PetscFunctionBegin; 190 if (aijkok && aijkok->device_mat_d.data()) { 191 *d_mat = aijkok->device_mat_d.data(); 192 } else { 193 PetscErrorCode ierr; 194 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 195 *d_mat = NULL; 196 } 197 PetscFunctionReturn(0); 198 } 199 200 /* Generate the transpose on device and cache it internally */ 201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 202 { 203 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 204 205 PetscFunctionBegin; 206 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 208 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 209 CHKERRCXX(aijkok->a_dual.sync_device()); 210 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 211 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 212 aijkok->transpose_updated = PETSC_TRUE; 213 } 214 *csrmatT = &aijkok->csrmatT; 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the Hermitian on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 220 { 221 PetscErrorCode ierr; 222 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 223 224 PetscFunctionBegin; 225 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 226 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 227 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 228 CHKERRCXX(aijkok->a_dual.sync_device()); 229 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 230 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 231 #if defined(PETSC_USE_COMPLEX) 232 const auto& a = aijkok->csrmatH.values; 233 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 234 #endif 235 aijkok->hermitian_updated = PETSC_TRUE; 236 } 237 *csrmatH = &aijkok->csrmatH; 238 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 /* y = A x */ 243 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 244 { 245 PetscErrorCode ierr; 246 Mat_SeqAIJKokkos *aijkok; 247 ConstPetscScalarKokkosView xv; 248 PetscScalarKokkosView yv; 249 250 PetscFunctionBegin; 251 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 252 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 253 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 254 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 255 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 256 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 257 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 258 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 259 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 260 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 261 PetscFunctionReturn(0); 262 } 263 264 /* y = A^T x */ 265 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 266 { 267 PetscErrorCode ierr; 268 Mat_SeqAIJKokkos *aijkok; 269 const char *mode; 270 ConstPetscScalarKokkosView xv; 271 PetscScalarKokkosView yv; 272 KokkosCsrMatrix *csrmat; 273 274 PetscFunctionBegin; 275 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 276 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 277 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 278 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 279 if (A->form_explicit_transpose) { 280 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 281 mode = "N"; 282 } else { 283 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 284 csrmat = &aijkok->csrmat; 285 mode = "T"; 286 } 287 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 288 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 289 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 290 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 291 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 292 PetscFunctionReturn(0); 293 } 294 295 /* y = A^H x */ 296 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 297 { 298 PetscErrorCode ierr; 299 Mat_SeqAIJKokkos *aijkok; 300 const char *mode; 301 ConstPetscScalarKokkosView xv; 302 PetscScalarKokkosView yv; 303 KokkosCsrMatrix *csrmat; 304 305 PetscFunctionBegin; 306 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 307 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 308 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 309 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 310 if (A->form_explicit_transpose) { 311 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 312 mode = "N"; 313 } else { 314 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 315 csrmat = &aijkok->csrmat; 316 mode = "C"; 317 } 318 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 319 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 320 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 321 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 322 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 323 PetscFunctionReturn(0); 324 } 325 326 /* z = A x + y */ 327 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 328 { 329 PetscErrorCode ierr; 330 Mat_SeqAIJKokkos *aijkok; 331 ConstPetscScalarKokkosView xv,yv; 332 PetscScalarKokkosView zv; 333 334 PetscFunctionBegin; 335 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 336 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 337 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 338 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 339 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 340 if (zz != yy) Kokkos::deep_copy(zv,yv); 341 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 342 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 343 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 344 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 346 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 347 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 348 PetscFunctionReturn(0); 349 } 350 351 /* z = A^T x + y */ 352 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 353 { 354 PetscErrorCode ierr; 355 Mat_SeqAIJKokkos *aijkok; 356 const char *mode; 357 ConstPetscScalarKokkosView xv,yv; 358 PetscScalarKokkosView zv; 359 KokkosCsrMatrix *csrmat; 360 361 PetscFunctionBegin; 362 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 363 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 364 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 365 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 366 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 367 if (zz != yy) Kokkos::deep_copy(zv,yv); 368 if (A->form_explicit_transpose) { 369 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 370 mode = "N"; 371 } else { 372 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 373 csrmat = &aijkok->csrmat; 374 mode = "T"; 375 } 376 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 377 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 378 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 379 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 380 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 381 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 382 PetscFunctionReturn(0); 383 } 384 385 /* z = A^H x + y */ 386 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 387 { 388 PetscErrorCode ierr; 389 Mat_SeqAIJKokkos *aijkok; 390 const char *mode; 391 ConstPetscScalarKokkosView xv,yv; 392 PetscScalarKokkosView zv; 393 KokkosCsrMatrix *csrmat; 394 395 PetscFunctionBegin; 396 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 397 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 398 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 399 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 400 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 401 if (zz != yy) Kokkos::deep_copy(zv,yv); 402 if (A->form_explicit_transpose) { 403 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 404 mode = "N"; 405 } else { 406 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 407 csrmat = &aijkok->csrmat; 408 mode = "C"; 409 } 410 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 411 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 412 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 413 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 414 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 415 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 416 PetscFunctionReturn(0); 417 } 418 419 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 420 { 421 PetscErrorCode ierr; 422 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 423 424 PetscFunctionBegin; 425 switch (op) { 426 case MAT_FORM_EXPLICIT_TRANSPOSE: 427 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 428 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 429 A->form_explicit_transpose = flg; 430 break; 431 default: 432 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 433 break; 434 } 435 PetscFunctionReturn(0); 436 } 437 438 /* Depending on reuse, either build a new mat, or use the existing mat */ 439 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 440 { 441 PetscErrorCode ierr; 442 Mat_SeqAIJ *aseq; 443 444 PetscFunctionBegin; 445 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 446 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 447 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 448 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 449 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 450 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 451 if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 452 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 453 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 454 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 455 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 456 aseq = static_cast<Mat_SeqAIJ*>(A->data); 457 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 458 if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 459 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 460 } 461 } 462 PetscFunctionReturn(0); 463 } 464 465 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 466 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 467 */ 468 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 469 { 470 PetscErrorCode ierr; 471 Mat_SeqAIJ *bseq; 472 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 473 Mat mat; 474 475 PetscFunctionBegin; 476 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 477 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 478 mat = *B; 479 if (A->assembled) { 480 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 481 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 482 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 483 /* Now copy values to B if needed */ 484 if (dupOption == MAT_COPY_VALUES) { 485 if (akok->a_dual.need_sync_device()) { 486 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 487 bkok->a_dual.modify_host(); 488 } else { /* If device has the latest data, we only copy data on device */ 489 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 490 bkok->a_dual.modify_device(); 491 } 492 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 493 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 494 bkok->a_dual.modify_host(); 495 } 496 mat->spptr = bkok; 497 } 498 499 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 500 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 501 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 502 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 503 PetscFunctionReturn(0); 504 } 505 506 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 507 { 508 PetscErrorCode ierr; 509 Mat At; 510 KokkosCsrMatrix *internT,*csrmatT; 511 Mat_SeqAIJKokkos *atkok,*bkok; 512 513 PetscFunctionBegin; 514 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 515 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 516 CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 517 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 518 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 519 if (reuse == MAT_INITIAL_MATRIX) *B = At; 520 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 521 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 522 if ((*B)->assembled) { 523 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 524 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 525 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 526 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 527 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 528 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 529 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 530 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 531 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 532 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 533 } 534 PetscFunctionReturn(0); 535 } 536 537 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 538 { 539 PetscErrorCode ierr; 540 Mat_SeqAIJKokkos *aijkok; 541 542 PetscFunctionBegin; 543 if (A->factortype == MAT_FACTOR_NONE) { 544 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 545 if (aijkok) { 546 if (aijkok->device_mat_d.data()) { 547 delete aijkok->colmap_d; 548 delete aijkok->i_uncompressed_d; 549 } 550 if (aijkok->diag_d) delete aijkok->diag_d; 551 } 552 delete aijkok; 553 } else { 554 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 555 } 556 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 557 A->spptr = NULL; 558 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 559 PetscFunctionReturn(0); 560 } 561 562 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 563 { 564 PetscErrorCode ierr; 565 566 PetscFunctionBegin; 567 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 568 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 569 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 570 PetscFunctionReturn(0); 571 } 572 573 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 574 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 575 { 576 PetscErrorCode ierr; 577 Mat_SeqAIJ *a,*b; 578 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 579 MatScalarKokkosView aa,ba,ca; 580 MatRowMapKokkosView ai,bi,ci; 581 MatColIdxKokkosView aj,bj,cj; 582 PetscInt m,n,nnz,aN; 583 584 PetscFunctionBegin; 585 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 586 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 587 PetscValidPointer(C,4); 588 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 589 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 590 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 591 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 592 593 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 594 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 595 a = static_cast<Mat_SeqAIJ*>(A->data); 596 b = static_cast<Mat_SeqAIJ*>(B->data); 597 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 598 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 599 aa = akok->a_dual.view_device(); 600 ai = akok->i_dual.view_device(); 601 ba = bkok->a_dual.view_device(); 602 bi = bkok->i_dual.view_device(); 603 m = A->rmap->n; /* M, N and nnz of C */ 604 n = A->cmap->n + B->cmap->n; 605 nnz = a->nz + b->nz; 606 aN = A->cmap->n; /* N of A */ 607 if (reuse == MAT_INITIAL_MATRIX) { 608 aj = akok->j_dual.view_device(); 609 bj = bkok->j_dual.view_device(); 610 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 611 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 612 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 613 ca = ca_dual.view_device(); 614 ci = ci_dual.view_device(); 615 cj = cj_dual.view_device(); 616 617 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 618 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 619 PetscInt i = t.league_rank(); /* row i */ 620 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 621 622 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 623 ci(i) = coffset; 624 if (i == m-1) ci(m) = ai(m) + bi(m); 625 }); 626 627 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 628 if (k < alen) { 629 ca(coffset+k) = aa(ai(i)+k); 630 cj(coffset+k) = aj(ai(i)+k); 631 } else { 632 ca(coffset+k) = ba(bi(i)+k-alen); 633 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 634 } 635 }); 636 }); 637 ca_dual.modify_device(); 638 ci_dual.modify_device(); 639 cj_dual.modify_device(); 640 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 641 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 642 } else if (reuse == MAT_REUSE_MATRIX) { 643 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 644 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 645 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 646 ca = ckok->a_dual.view_device(); 647 ci = ckok->i_dual.view_device(); 648 649 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 650 PetscInt i = t.league_rank(); /* row i */ 651 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 652 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 653 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 654 else ca(ci(i)+k) = ba(bi(i)+k-alen); 655 }); 656 }); 657 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 658 } 659 PetscFunctionReturn(0); 660 } 661 662 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 663 { 664 PetscFunctionBegin; 665 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 666 PetscFunctionReturn(0); 667 } 668 669 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 670 { 671 PetscErrorCode ierr; 672 Mat_Product *product = C->product; 673 Mat A,B; 674 bool transA,transB; /* use bool, since KK needs this type */ 675 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 676 Mat_SeqAIJ *c; 677 MatProductData_SeqAIJKokkos *pdata; 678 KokkosCsrMatrix *csrmatA,*csrmatB; 679 680 PetscFunctionBegin; 681 MatCheckProduct(C,1); 682 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 683 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 684 685 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 686 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 687 PetscFunctionReturn(0); 688 } 689 690 switch (product->type) { 691 case MATPRODUCT_AB: transA = false; transB = false; break; 692 case MATPRODUCT_AtB: transA = true; transB = false; break; 693 case MATPRODUCT_ABt: transA = false; transB = true; break; 694 default: 695 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 696 } 697 698 A = product->A; 699 B = product->B; 700 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 701 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 702 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 703 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 704 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 705 706 if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 707 708 csrmatA = &akok->csrmat; 709 csrmatB = &bkok->csrmat; 710 711 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 712 if (transA) { 713 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 714 transA = false; 715 } 716 717 if (transB) { 718 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 719 transB = false; 720 } 721 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 722 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 723 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 724 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 725 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 726 /* shorter version of MatAssemblyEnd_SeqAIJ */ 727 c = (Mat_SeqAIJ*)C->data; 728 ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 729 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 730 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 731 c->reallocs = 0; 732 C->info.mallocs = 0; 733 C->info.nz_unneeded = 0; 734 C->assembled = C->was_assembled = PETSC_TRUE; 735 C->num_ass++; 736 PetscFunctionReturn(0); 737 } 738 739 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 740 { 741 PetscErrorCode ierr; 742 Mat_Product *product = C->product; 743 MatProductType ptype; 744 Mat A,B; 745 bool transA,transB; 746 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 747 MatProductData_SeqAIJKokkos *pdata; 748 MPI_Comm comm; 749 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 750 751 PetscFunctionBegin; 752 MatCheckProduct(C,1); 753 ierr = PetscObjectGetComm((PetscObject)C,&comm); 754 if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 755 A = product->A; 756 B = product->B; 757 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 758 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 759 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 760 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 761 csrmatA = &akok->csrmat; 762 csrmatB = &bkok->csrmat; 763 764 ptype = product->type; 765 switch (ptype) { 766 case MATPRODUCT_AB: transA = false; transB = false; break; 767 case MATPRODUCT_AtB: transA = true; transB = false; break; 768 case MATPRODUCT_ABt: transA = false; transB = true; break; 769 default: 770 SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 771 } 772 773 product->data = pdata = new MatProductData_SeqAIJKokkos(); 774 pdata->kh.set_team_work_size(16); 775 pdata->kh.set_dynamic_scheduling(true); 776 pdata->reusesym = product->api_user; 777 778 /* TODO: add command line options to select spgemm algorithms */ 779 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 780 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 781 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 782 */ 783 pdata->kh.create_spgemm_handle(spgemm_alg); 784 785 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 786 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 787 if (transA) { 788 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 789 transA = false; 790 } 791 792 if (transB) { 793 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 794 transB = false; 795 } 796 797 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 798 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 799 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 800 calling new Mat_SeqAIJKokkos(). 801 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 802 */ 803 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 804 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 805 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 806 807 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 808 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 809 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 810 PetscFunctionReturn(0); 811 } 812 813 /* handles sparse matrix matrix ops */ 814 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 815 { 816 PetscErrorCode ierr; 817 Mat_Product *product = mat->product; 818 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 819 820 PetscFunctionBegin; 821 MatCheckProduct(mat,1); 822 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 823 if (product->type == MATPRODUCT_ABC) { 824 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 825 } 826 if (Biskok && Ciskok) { 827 switch (product->type) { 828 case MATPRODUCT_AB: 829 case MATPRODUCT_AtB: 830 case MATPRODUCT_ABt: 831 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 832 break; 833 case MATPRODUCT_PtAP: 834 case MATPRODUCT_RARt: 835 case MATPRODUCT_ABC: 836 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 837 break; 838 default: 839 break; 840 } 841 } else { /* fallback for AIJ */ 842 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 843 } 844 PetscFunctionReturn(0); 845 } 846 847 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 848 { 849 PetscErrorCode ierr; 850 Mat_SeqAIJKokkos *aijkok; 851 852 PetscFunctionBegin; 853 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 854 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 855 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 856 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 857 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 858 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 859 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 860 PetscFunctionReturn(0); 861 } 862 863 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 864 { 865 PetscErrorCode ierr; 866 Mat_SeqAIJKokkos *aijkok; 867 868 PetscFunctionBegin; 869 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 870 if (aijkok) { /* Only zero the device if data is already there */ 871 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 872 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 873 } else { /* Might be preallocated but not assembled */ 874 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 875 } 876 PetscFunctionReturn(0); 877 } 878 879 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 880 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 881 { 882 PetscErrorCode ierr; 883 Mat_SeqAIJKokkos *aijkok; 884 885 PetscFunctionBegin; 886 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 887 PetscValidPointer(kv,2); 888 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 889 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 890 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 891 *kv = aijkok->a_dual.view_device(); 892 PetscFunctionReturn(0); 893 } 894 895 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 896 { 897 PetscFunctionBegin; 898 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 899 PetscValidPointer(kv,2); 900 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 901 PetscFunctionReturn(0); 902 } 903 904 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 905 { 906 PetscErrorCode ierr; 907 Mat_SeqAIJKokkos *aijkok; 908 909 PetscFunctionBegin; 910 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 911 PetscValidPointer(kv,2); 912 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 913 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 914 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 915 *kv = aijkok->a_dual.view_device(); 916 PetscFunctionReturn(0); 917 } 918 919 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 920 { 921 PetscErrorCode ierr; 922 923 PetscFunctionBegin; 924 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 925 PetscValidPointer(kv,2); 926 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 927 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 928 PetscFunctionReturn(0); 929 } 930 931 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 932 { 933 Mat_SeqAIJKokkos *aijkok; 934 935 PetscFunctionBegin; 936 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 937 PetscValidPointer(kv,2); 938 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 939 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 940 *kv = aijkok->a_dual.view_device(); 941 PetscFunctionReturn(0); 942 } 943 944 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 945 { 946 PetscErrorCode ierr; 947 948 PetscFunctionBegin; 949 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 950 PetscValidPointer(kv,2); 951 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 952 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 953 PetscFunctionReturn(0); 954 } 955 956 /* Computes Y += alpha X */ 957 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 958 { 959 PetscErrorCode ierr; 960 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 961 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 962 ConstMatScalarKokkosView Xa; 963 MatScalarKokkosView Ya; 964 965 PetscFunctionBegin; 966 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 967 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 968 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 969 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 970 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 971 972 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 973 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 974 PetscBool e; 975 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 976 if (e) { 977 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 978 if (e) pattern = SAME_NONZERO_PATTERN; 979 } 980 } 981 982 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 983 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 984 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 985 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 986 */ 987 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 988 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 989 Xa = xkok->a_dual.view_device(); 990 Ya = ykok->a_dual.view_device(); 991 992 if (pattern == SAME_NONZERO_PATTERN) { 993 KokkosBlas::axpy(alpha,Xa,Ya); 994 ierr = MatSeqAIJKokkosModifyDevice(Y); 995 } else if (pattern == SUBSET_NONZERO_PATTERN) { 996 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 997 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 998 999 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1000 PetscInt i = t.league_rank(); /* row i */ 1001 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1002 PetscInt p,q = Yi(i); 1003 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1004 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1005 if (Xj(p) == Yj(q)) { /* Found it */ 1006 Ya(q) += alpha * Xa(p); 1007 q++; 1008 } else { 1009 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1010 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1011 */ 1012 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1013 } 1014 } 1015 }); 1016 }); 1017 ierr = MatSeqAIJKokkosModifyDevice(Y); 1018 } else { /* different nonzero patterns */ 1019 Mat Z; 1020 KokkosCsrMatrix zcsr; 1021 KernelHandle kh; 1022 kh.create_spadd_handle(false); 1023 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1024 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1025 zkok = new Mat_SeqAIJKokkos(zcsr); 1026 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1027 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1028 kh.destroy_spadd_handle(); 1029 } 1030 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1031 ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1032 PetscFunctionReturn(0); 1033 } 1034 1035 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1036 { 1037 PetscErrorCode ierr; 1038 1039 PetscFunctionBegin; 1040 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1041 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1042 B->offloadmask = PETSC_OFFLOAD_CPU; 1043 PetscFunctionReturn(0); 1044 } 1045 1046 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1047 { 1048 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1049 1050 PetscFunctionBegin; 1051 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1052 A->boundtocpu = PETSC_FALSE; 1053 1054 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1055 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1056 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1057 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1058 A->ops->scale = MatScale_SeqAIJKokkos; 1059 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1060 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1061 A->ops->mult = MatMult_SeqAIJKokkos; 1062 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1063 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1064 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1065 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1066 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1067 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1068 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1069 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1070 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1071 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1072 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1073 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1074 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1075 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1076 PetscFunctionReturn(0); 1077 } 1078 1079 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1080 { 1081 PetscErrorCode ierr; 1082 Mat_SeqAIJ *aseq; 1083 PetscInt i,m,n; 1084 1085 PetscFunctionBegin; 1086 if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1087 1088 m = akok->nrows(); 1089 n = akok->ncols(); 1090 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1091 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1092 1093 /* Set up data structures of A as a MATSEQAIJ */ 1094 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1095 aseq = (Mat_SeqAIJ*)(A)->data; 1096 1097 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1098 akok->j_dual.sync_host(); 1099 1100 aseq->i = akok->i_host_data(); 1101 aseq->j = akok->j_host_data(); 1102 aseq->a = akok->a_host_data(); 1103 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1104 aseq->singlemalloc = PETSC_FALSE; 1105 aseq->free_a = PETSC_FALSE; 1106 aseq->free_ij = PETSC_FALSE; 1107 aseq->nz = akok->nnz(); 1108 aseq->maxnz = aseq->nz; 1109 1110 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1111 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1112 for (i=0; i<m; i++) { 1113 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1114 } 1115 1116 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1117 akok->nonzerostate = A->nonzerostate; 1118 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1119 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1120 A->spptr = akok; 1121 PetscFunctionReturn(0); 1122 } 1123 1124 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1125 1126 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1127 */ 1128 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1129 { 1130 PetscErrorCode ierr; 1131 1132 PetscFunctionBegin; 1133 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1134 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1135 PetscFunctionReturn(0); 1136 } 1137 1138 /* --------------------------------------------------------------------------------*/ 1139 /*@C 1140 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1141 (the default parallel PETSc format). This matrix will ultimately be handled by 1142 Kokkos for calculations. For good matrix 1143 assembly performance the user should preallocate the matrix storage by setting 1144 the parameter nz (or the array nnz). By setting these parameters accurately, 1145 performance during matrix assembly can be increased by more than a factor of 50. 1146 1147 Collective 1148 1149 Input Parameters: 1150 + comm - MPI communicator, set to PETSC_COMM_SELF 1151 . m - number of rows 1152 . n - number of columns 1153 . nz - number of nonzeros per row (same for all rows) 1154 - nnz - array containing the number of nonzeros in the various rows 1155 (possibly different for each row) or NULL 1156 1157 Output Parameter: 1158 . A - the matrix 1159 1160 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1161 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1162 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1163 1164 Notes: 1165 If nnz is given then nz is ignored 1166 1167 The AIJ format (also called the Yale sparse matrix format or 1168 compressed row storage), is fully compatible with standard Fortran 77 1169 storage. That is, the stored row and column indices can begin at 1170 either one (as in Fortran) or zero. See the users' manual for details. 1171 1172 Specify the preallocated storage with either nz or nnz (not both). 1173 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1174 allocation. For large problems you MUST preallocate memory or you 1175 will get TERRIBLE performance, see the users' manual chapter on matrices. 1176 1177 By default, this format uses inodes (identical nodes) when possible, to 1178 improve numerical efficiency of matrix-vector products and solves. We 1179 search for consecutive rows with the same nonzero structure, thereby 1180 reusing matrix information to achieve increased efficiency. 1181 1182 Level: intermediate 1183 1184 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1185 @*/ 1186 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1187 { 1188 PetscErrorCode ierr; 1189 1190 PetscFunctionBegin; 1191 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1192 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1193 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1194 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1195 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1196 PetscFunctionReturn(0); 1197 } 1198 1199 typedef Kokkos::TeamPolicy<>::member_type team_member; 1200 // 1201 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 1202 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1203 // 1204 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1205 { 1206 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1207 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1208 IS isrow = b->row,isicol = b->icol; 1209 PetscErrorCode ierr; 1210 const PetscInt *r_h,*ic_h; 1211 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1212 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1213 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1214 PetscBool row_identity,col_identity; 1215 PetscInt nc, Nf, nVec=32; // should be a parameter 1216 PetscContainer container; 1217 1218 PetscFunctionBegin; 1219 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1220 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1221 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1222 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 1223 if (container) { 1224 PetscInt *pNf=NULL, nv; 1225 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 1226 Nf = (*pNf)%1000; 1227 nv = (*pNf)/1000; 1228 if (nv>0) nVec = nv; 1229 } else Nf = 1; 1230 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %" PetscInt_FMT " %" PetscInt_FMT,n,Nf); 1231 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1232 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1233 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1234 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1235 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1236 { 1237 #define KOKKOS_SHARED_LEVEL 1 1238 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1239 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1240 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1241 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1242 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1243 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1244 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1245 size_t flops_h = 0.0; 1246 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1247 Kokkos::View<size_t> d_flops_k ("flops"); 1248 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1249 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1250 Kokkos::deep_copy (d_flops_k, h_flops_k); 1251 Kokkos::deep_copy (d_r_k, h_r_k); 1252 Kokkos::deep_copy (d_ic_k, h_ic_k); 1253 // Fill A --> fact 1254 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1255 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1256 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1257 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1258 // zero rows of B 1259 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1260 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1261 PetscScalar *baL = ba_d + bi_d[rowb]; 1262 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1263 /* zero (unfactored row) */ 1264 for (int j=0;j<nzbL;j++) baL[j] = 0; 1265 for (int j=0;j<nzbU;j++) baU[j] = 0; 1266 }); 1267 // copy A into B 1268 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1269 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1270 const PetscScalar *av = aa_d + ai_d[rowa]; 1271 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1272 /* load in initial (unfactored row) */ 1273 for (int j=0;j<nza;j++) { 1274 PetscInt colb = ic[ajtmp[j]]; 1275 PetscScalar vala = av[j]; 1276 if (colb == rowb) { 1277 *(ba_d + bdiag_d[rowb]) = vala; 1278 } else { 1279 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1280 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1281 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1282 for (int j=0; j<nz ; j++) { 1283 if (pbj[j] == colb) { 1284 pba[j] = vala; 1285 set++; 1286 break; 1287 } 1288 } 1289 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1290 } 1291 } 1292 }); 1293 }); 1294 Kokkos::fence(); 1295 1296 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1297 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1298 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1299 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1300 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1301 const PetscInt start = field*nloc, end = start + nloc; 1302 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1303 // A22 panel update for each row A(1,:) and col A(:,1) 1304 for (int ii=start; ii<end-1; ii++) { 1305 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1306 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1307 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1308 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1309 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1310 PetscInt kIdx = j*Ni + field_block_idx; 1311 if (kIdx >= nzUi) /* void */ ; 1312 else { 1313 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1314 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1315 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1316 size_t st_idx; 1317 // find and do L(k,i) = A(:k,i) / A(i,i) 1318 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1319 // get column, there has got to be a better way 1320 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1321 if (pjL[j] == ii) { 1322 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1323 idx = j; // output 1324 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1325 } 1326 }, st_idx); 1327 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1328 #if defined(PETSC_USE_DEBUG) 1329 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1330 #endif 1331 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1332 // U(i+1,:end) 1333 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1334 PetscScalar Uij = baUi[uiIdx]; 1335 PetscInt col = bjUi[uiIdx]; 1336 if (col==myk) { 1337 // A_kk = A_kk - L_ki * U_ij(k) 1338 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1339 *Akkv = *Akkv - L_ki() * Uij; // UiK 1340 } else { 1341 PetscScalar *start, *end, *pAkjv=NULL; 1342 PetscInt high, low; 1343 const PetscInt *startj; 1344 if (col<myk) { // L 1345 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1346 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1347 start = pLki+1; // start at pLki+1, A22(myk,1) 1348 startj= bj_d + bi_d[myk] + idx; 1349 end = ba_d + bi_d[myk+1]; 1350 } else { 1351 PetscInt idx = bdiag_d[myk+1]+1; 1352 start = ba_d + idx; 1353 startj= bj_d + idx; 1354 end = ba_d + bdiag_d[myk]; 1355 } 1356 // search for 'col', use bisection search - TODO 1357 low = 0; 1358 high = (PetscInt)(end-start); 1359 while (high-low > 5) { 1360 int t = (low+high)/2; 1361 if (startj[t] > col) high = t; 1362 else low = t; 1363 } 1364 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1365 if (startj[pAkjv-start] == col) break; 1366 } 1367 #if defined(PETSC_USE_DEBUG) 1368 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1369 #endif 1370 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1371 } 1372 }); 1373 } 1374 }); 1375 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1376 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1377 } /* endof for (i=0; i<n; i++) { */ 1378 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1379 }); 1380 Kokkos::fence(); 1381 Kokkos::deep_copy (h_flops_k, d_flops_k); 1382 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1383 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1384 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1385 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1386 /* Invert diagonal for simpler triangular solves */ 1387 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1388 int i = start + outer_index*Ni + lg_rank%Ni; 1389 if (i < end) { 1390 PetscScalar *pv = ba_d + bdiag_d[i]; 1391 *pv = 1.0/(*pv); 1392 } 1393 }); 1394 }); 1395 } 1396 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1397 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1398 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1399 1400 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1401 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1402 if (b->inode.size) { 1403 B->ops->solve = MatSolve_SeqAIJ_Inode; 1404 } else if (row_identity && col_identity) { 1405 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1406 } else { 1407 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1408 } 1409 B->offloadmask = PETSC_OFFLOAD_GPU; 1410 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1411 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1412 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1413 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1414 B->ops->matsolve = MatMatSolve_SeqAIJ; 1415 B->assembled = PETSC_TRUE; 1416 B->preallocated = PETSC_TRUE; 1417 1418 PetscFunctionReturn(0); 1419 } 1420 1421 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1422 { 1423 PetscErrorCode ierr; 1424 1425 PetscFunctionBegin; 1426 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1427 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1428 PetscFunctionReturn(0); 1429 } 1430 1431 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1432 { 1433 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1434 1435 PetscFunctionBegin; 1436 if (!factors->sptrsv_symbolic_completed) { 1437 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1438 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1439 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1440 } 1441 PetscFunctionReturn(0); 1442 } 1443 1444 /* Check if we need to update factors etc for transpose solve */ 1445 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1446 { 1447 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1448 MatColIdxType n = A->rmap->n; 1449 1450 PetscFunctionBegin; 1451 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1452 /* Update L^T and do sptrsv symbolic */ 1453 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1454 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1455 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1456 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1457 1458 KokkosKernels::Impl::transpose_matrix< 1459 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1460 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1461 MatRowMapKokkosView,DefaultExecutionSpace>( 1462 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1463 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1464 1465 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1466 We have to sort the indices, until KK provides finer control options. 1467 */ 1468 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1469 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1470 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1471 1472 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1473 1474 /* Update U^T and do sptrsv symbolic */ 1475 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1476 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1477 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1478 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1479 1480 KokkosKernels::Impl::transpose_matrix< 1481 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1482 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1483 MatRowMapKokkosView,DefaultExecutionSpace>( 1484 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1485 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1486 1487 /* Sort indices. See comments above */ 1488 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1489 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1490 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1491 1492 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1493 factors->transpose_updated = PETSC_TRUE; 1494 } 1495 PetscFunctionReturn(0); 1496 } 1497 1498 /* Solve Ax = b, with A = LU */ 1499 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1500 { 1501 PetscErrorCode ierr; 1502 ConstPetscScalarKokkosView bv; 1503 PetscScalarKokkosView xv; 1504 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1505 1506 PetscFunctionBegin; 1507 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1508 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1509 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1510 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1511 /* Solve L tmpv = b */ 1512 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1513 /* Solve Ux = tmpv */ 1514 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1515 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1516 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1517 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1518 PetscFunctionReturn(0); 1519 } 1520 1521 /* Solve A^T x = b, where A^T = U^T L^T */ 1522 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1523 { 1524 PetscErrorCode ierr; 1525 ConstPetscScalarKokkosView bv; 1526 PetscScalarKokkosView xv; 1527 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1528 1529 PetscFunctionBegin; 1530 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1531 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1532 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1533 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1534 /* Solve U^T tmpv = b */ 1535 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1536 1537 /* Solve L^T x = tmpv */ 1538 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1539 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1540 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1541 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1542 PetscFunctionReturn(0); 1543 } 1544 1545 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1546 { 1547 PetscErrorCode ierr; 1548 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1549 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1550 PetscInt fill_lev = info->levels; 1551 1552 PetscFunctionBegin; 1553 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1554 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1555 1556 auto a_d = aijkok->a_dual.view_device(); 1557 auto i_d = aijkok->i_dual.view_device(); 1558 auto j_d = aijkok->j_dual.view_device(); 1559 1560 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1561 1562 B->assembled = PETSC_TRUE; 1563 B->preallocated = PETSC_TRUE; 1564 B->ops->solve = MatSolve_SeqAIJKokkos; 1565 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1566 B->ops->matsolve = NULL; 1567 B->ops->matsolvetranspose = NULL; 1568 B->offloadmask = PETSC_OFFLOAD_GPU; 1569 1570 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1571 factors->transpose_updated = PETSC_FALSE; 1572 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1573 /* TODO: log flops, but how to know that? */ 1574 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1575 PetscFunctionReturn(0); 1576 } 1577 1578 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1579 { 1580 PetscErrorCode ierr; 1581 Mat_SeqAIJKokkos *aijkok; 1582 Mat_SeqAIJ *b; 1583 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1584 PetscInt fill_lev = info->levels; 1585 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1586 PetscInt n = A->rmap->n; 1587 1588 PetscFunctionBegin; 1589 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1590 /* Rebuild factors */ 1591 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1592 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1593 1594 /* Create a spiluk handle and then do symbolic factorization */ 1595 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1596 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1597 1598 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1599 1600 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1601 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1602 Kokkos::realloc(factors->iU_d,n+1); 1603 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1604 1605 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1606 auto i_d = aijkok->i_dual.view_device(); 1607 auto j_d = aijkok->j_dual.view_device(); 1608 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1609 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1610 1611 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1612 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1613 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1614 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1615 1616 /* TODO: add options to select sptrsv algorithms */ 1617 /* Create sptrsv handles for L, U and their transpose */ 1618 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1619 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1620 #else 1621 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1622 #endif 1623 1624 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1625 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1626 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1627 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1628 1629 /* Fill fields of the factor matrix B */ 1630 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1631 b = (Mat_SeqAIJ*)B->data; 1632 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1633 B->info.fill_ratio_given = info->fill; 1634 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1635 1636 B->offloadmask = PETSC_OFFLOAD_GPU; 1637 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1638 PetscFunctionReturn(0); 1639 } 1640 1641 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1642 { 1643 PetscErrorCode ierr; 1644 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1645 const PetscInt nrows = A->rmap->n; 1646 1647 PetscFunctionBegin; 1648 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1649 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1650 // move B data into Kokkos 1651 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1652 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1653 { 1654 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1655 if (!baijkok->diag_d) { 1656 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1657 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1658 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1659 } 1660 } 1661 PetscFunctionReturn(0); 1662 } 1663 1664 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1665 { 1666 PetscFunctionBegin; 1667 *type = MATSOLVERKOKKOS; 1668 PetscFunctionReturn(0); 1669 } 1670 1671 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1672 { 1673 PetscFunctionBegin; 1674 *type = MATSOLVERKOKKOSDEVICE; 1675 PetscFunctionReturn(0); 1676 } 1677 1678 /*MC 1679 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1680 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1681 1682 Level: beginner 1683 1684 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1685 M*/ 1686 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1687 { 1688 PetscErrorCode ierr; 1689 PetscInt n = A->rmap->n; 1690 1691 PetscFunctionBegin; 1692 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1693 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1694 (*B)->factortype = ftype; 1695 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1696 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1697 1698 if (ftype == MAT_FACTOR_LU) { 1699 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1700 (*B)->canuseordering = PETSC_TRUE; 1701 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1702 } else if (ftype == MAT_FACTOR_ILU) { 1703 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1704 (*B)->canuseordering = PETSC_FALSE; 1705 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1706 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1707 1708 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1709 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1710 PetscFunctionReturn(0); 1711 } 1712 1713 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1714 { 1715 PetscErrorCode ierr; 1716 PetscInt n = A->rmap->n; 1717 1718 PetscFunctionBegin; 1719 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1720 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1721 (*B)->factortype = ftype; 1722 (*B)->canuseordering = PETSC_TRUE; 1723 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1724 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1725 1726 if (ftype == MAT_FACTOR_LU) { 1727 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1728 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1729 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1730 1731 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1732 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1733 PetscFunctionReturn(0); 1734 } 1735 1736 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1737 { 1738 PetscErrorCode ierr; 1739 1740 PetscFunctionBegin; 1741 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1742 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1743 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1744 PetscFunctionReturn(0); 1745 } 1746 1747 /* Utility to print out a KokkosCsrMatrix for debugging */ 1748 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1749 { 1750 PetscErrorCode ierr; 1751 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1752 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1753 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1754 const PetscInt *i = iv.data(); 1755 const PetscInt *j = jv.data(); 1756 const PetscScalar *a = av.data(); 1757 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1758 1759 PetscFunctionBegin; 1760 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1761 for (PetscInt k=0; k<m; k++) { 1762 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1763 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1764 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1765 } 1766 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1767 } 1768 PetscFunctionReturn(0); 1769 } 1770