1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petscsystypes.h> 5 #include <petscerror.h> 6 7 #include <Kokkos_Core.hpp> 8 #include <KokkosBlas.hpp> 9 #include <KokkosKernels_Sorting.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/aij.h> 18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64 if (aijkok->a_dual.need_sync_device()) { 65 aijkok->a_dual.sync_device(); 66 aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 67 aijkok->hermitian_updated = PETSC_FALSE; 68 } 69 PetscFunctionReturn(0); 70 } 71 72 /* Mark the CSR data on device as modified */ 73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 74 { 75 PetscErrorCode ierr; 76 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 77 78 PetscFunctionBegin; 79 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 80 aijkok->a_dual.clear_sync_state(); 81 aijkok->a_dual.modify_device(); 82 aijkok->transpose_updated = PETSC_FALSE; 83 aijkok->hermitian_updated = PETSC_FALSE; 84 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 85 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 86 PetscFunctionReturn(0); 87 } 88 89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 90 { 91 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 92 93 PetscFunctionBegin; 94 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 95 /* We do not expect one needs factors on host */ 96 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 97 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 98 aijkok->a_dual.sync_host(); 99 PetscFunctionReturn(0); 100 } 101 102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 103 { 104 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 105 106 PetscFunctionBegin; 107 if (aijkok) { 108 aijkok->a_dual.sync_host(); 109 *array = aijkok->a_dual.view_host().data(); 110 } else { /* Happens when calling MatSetValues on a newly created matrix */ 111 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 112 } 113 PetscFunctionReturn(0); 114 } 115 116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 117 { 118 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119 120 PetscFunctionBegin; 121 if (aijkok) aijkok->a_dual.modify_host(); 122 PetscFunctionReturn(0); 123 } 124 125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 126 { 127 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 128 129 PetscFunctionBegin; 130 if (aijkok) { 131 aijkok->a_dual.sync_host(); 132 *array = aijkok->a_dual.view_host().data(); 133 } else { 134 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 135 } 136 PetscFunctionReturn(0); 137 } 138 139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 140 { 141 PetscFunctionBegin; 142 *array = NULL; 143 PetscFunctionReturn(0); 144 } 145 146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 147 { 148 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 149 150 PetscFunctionBegin; 151 if (aijkok) { 152 *array = aijkok->a_dual.view_host().data(); 153 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 154 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 155 } 156 PetscFunctionReturn(0); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 160 { 161 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 162 163 PetscFunctionBegin; 164 if (aijkok) { 165 aijkok->a_dual.clear_sync_state(); 166 aijkok->a_dual.modify_host(); 167 } 168 PetscFunctionReturn(0); 169 } 170 171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 175 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 176 177 PetscFunctionBegin; 178 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 179 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 180 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 181 PetscFunctionReturn(0); 182 } 183 184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 186 { 187 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188 189 PetscFunctionBegin; 190 if (aijkok && aijkok->device_mat_d.data()) { 191 *d_mat = aijkok->device_mat_d.data(); 192 } else { 193 PetscErrorCode ierr; 194 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 195 *d_mat = NULL; 196 } 197 PetscFunctionReturn(0); 198 } 199 200 /* Generate the transpose on device and cache it internally */ 201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 202 { 203 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 204 205 PetscFunctionBegin; 206 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 208 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 209 CHKERRCXX(aijkok->a_dual.sync_device()); 210 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 211 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 212 aijkok->transpose_updated = PETSC_TRUE; 213 } 214 *csrmatT = &aijkok->csrmatT; 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the Hermitian on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 220 { 221 PetscErrorCode ierr; 222 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 223 224 PetscFunctionBegin; 225 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 226 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 227 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 228 CHKERRCXX(aijkok->a_dual.sync_device()); 229 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 230 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 231 #if defined(PETSC_USE_COMPLEX) 232 const auto& a = aijkok->csrmatH.values; 233 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 234 #endif 235 aijkok->hermitian_updated = PETSC_TRUE; 236 } 237 *csrmatH = &aijkok->csrmatH; 238 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 /* y = A x */ 243 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 244 { 245 PetscErrorCode ierr; 246 Mat_SeqAIJKokkos *aijkok; 247 ConstPetscScalarKokkosView xv; 248 PetscScalarKokkosView yv; 249 250 PetscFunctionBegin; 251 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 252 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 253 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 254 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 255 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 256 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 257 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 258 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 259 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 260 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 261 PetscFunctionReturn(0); 262 } 263 264 /* y = A^T x */ 265 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 266 { 267 PetscErrorCode ierr; 268 Mat_SeqAIJKokkos *aijkok; 269 const char *mode; 270 ConstPetscScalarKokkosView xv; 271 PetscScalarKokkosView yv; 272 KokkosCsrMatrix *csrmat; 273 274 PetscFunctionBegin; 275 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 276 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 277 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 278 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 279 if (A->form_explicit_transpose) { 280 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 281 mode = "N"; 282 } else { 283 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 284 csrmat = &aijkok->csrmat; 285 mode = "T"; 286 } 287 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 288 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 289 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 290 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 291 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 292 PetscFunctionReturn(0); 293 } 294 295 /* y = A^H x */ 296 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 297 { 298 PetscErrorCode ierr; 299 Mat_SeqAIJKokkos *aijkok; 300 const char *mode; 301 ConstPetscScalarKokkosView xv; 302 PetscScalarKokkosView yv; 303 KokkosCsrMatrix *csrmat; 304 305 PetscFunctionBegin; 306 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 307 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 308 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 309 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 310 if (A->form_explicit_transpose) { 311 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 312 mode = "N"; 313 } else { 314 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 315 csrmat = &aijkok->csrmat; 316 mode = "C"; 317 } 318 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 319 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 320 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 321 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 322 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 323 PetscFunctionReturn(0); 324 } 325 326 /* z = A x + y */ 327 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 328 { 329 PetscErrorCode ierr; 330 Mat_SeqAIJKokkos *aijkok; 331 ConstPetscScalarKokkosView xv,yv; 332 PetscScalarKokkosView zv; 333 334 PetscFunctionBegin; 335 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 336 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 337 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 338 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 339 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 340 if (zz != yy) Kokkos::deep_copy(zv,yv); 341 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 342 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 343 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 344 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 346 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 347 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 348 PetscFunctionReturn(0); 349 } 350 351 /* z = A^T x + y */ 352 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 353 { 354 PetscErrorCode ierr; 355 Mat_SeqAIJKokkos *aijkok; 356 const char *mode; 357 ConstPetscScalarKokkosView xv,yv; 358 PetscScalarKokkosView zv; 359 KokkosCsrMatrix *csrmat; 360 361 PetscFunctionBegin; 362 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 363 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 364 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 365 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 366 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 367 if (zz != yy) Kokkos::deep_copy(zv,yv); 368 if (A->form_explicit_transpose) { 369 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 370 mode = "N"; 371 } else { 372 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 373 csrmat = &aijkok->csrmat; 374 mode = "T"; 375 } 376 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 377 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 378 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 379 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 380 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 381 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 382 PetscFunctionReturn(0); 383 } 384 385 /* z = A^H x + y */ 386 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 387 { 388 PetscErrorCode ierr; 389 Mat_SeqAIJKokkos *aijkok; 390 const char *mode; 391 ConstPetscScalarKokkosView xv,yv; 392 PetscScalarKokkosView zv; 393 KokkosCsrMatrix *csrmat; 394 395 PetscFunctionBegin; 396 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 397 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 398 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 399 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 400 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 401 if (zz != yy) Kokkos::deep_copy(zv,yv); 402 if (A->form_explicit_transpose) { 403 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 404 mode = "N"; 405 } else { 406 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 407 csrmat = &aijkok->csrmat; 408 mode = "C"; 409 } 410 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 411 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 412 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 413 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 414 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 415 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 416 PetscFunctionReturn(0); 417 } 418 419 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 420 { 421 PetscErrorCode ierr; 422 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 423 424 PetscFunctionBegin; 425 switch (op) { 426 case MAT_FORM_EXPLICIT_TRANSPOSE: 427 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 428 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 429 A->form_explicit_transpose = flg; 430 break; 431 default: 432 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 433 break; 434 } 435 PetscFunctionReturn(0); 436 } 437 438 /* Depending on reuse, either build a new mat, or use the existing mat */ 439 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 440 { 441 PetscErrorCode ierr; 442 Mat_SeqAIJ *aseq; 443 444 PetscFunctionBegin; 445 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 446 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 447 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 448 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 449 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 450 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 451 if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 452 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 453 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 454 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 455 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 456 aseq = static_cast<Mat_SeqAIJ*>(A->data); 457 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 458 if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 459 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 460 } 461 } 462 PetscFunctionReturn(0); 463 } 464 465 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 466 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 467 */ 468 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 469 { 470 PetscErrorCode ierr; 471 Mat_SeqAIJ *bseq; 472 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 473 Mat mat; 474 475 PetscFunctionBegin; 476 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 477 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 478 mat = *B; 479 if (A->assembled) { 480 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 481 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 482 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 483 /* Now copy values to B if needed */ 484 if (dupOption == MAT_COPY_VALUES) { 485 if (akok->a_dual.need_sync_device()) { 486 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 487 bkok->a_dual.modify_host(); 488 } else { /* If device has the latest data, we only copy data on device */ 489 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 490 bkok->a_dual.modify_device(); 491 } 492 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 493 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 494 bkok->a_dual.modify_host(); 495 } 496 mat->spptr = bkok; 497 } 498 499 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 500 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 501 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 502 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 503 PetscFunctionReturn(0); 504 } 505 506 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 507 { 508 PetscErrorCode ierr; 509 Mat At; 510 KokkosCsrMatrix *internT,*csrmatT; 511 Mat_SeqAIJKokkos *atkok,*bkok; 512 513 PetscFunctionBegin; 514 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 515 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 516 CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 517 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 518 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 519 if (reuse == MAT_INITIAL_MATRIX) *B = At; 520 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 521 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 522 if ((*B)->assembled) { 523 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 524 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 525 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 526 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 527 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 528 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 529 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 530 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 531 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 532 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 533 } 534 PetscFunctionReturn(0); 535 } 536 537 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 538 { 539 PetscErrorCode ierr; 540 Mat_SeqAIJKokkos *aijkok; 541 542 PetscFunctionBegin; 543 if (A->factortype == MAT_FACTOR_NONE) { 544 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 545 if (aijkok) { 546 if (aijkok->device_mat_d.data()) { 547 delete aijkok->colmap_d; 548 delete aijkok->i_uncompressed_d; 549 } 550 if (aijkok->diag_d) delete aijkok->diag_d; 551 } 552 delete aijkok; 553 } else { 554 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 555 } 556 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 557 A->spptr = NULL; 558 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 559 PetscFunctionReturn(0); 560 } 561 562 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 563 { 564 PetscErrorCode ierr; 565 566 PetscFunctionBegin; 567 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 568 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 569 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 570 PetscFunctionReturn(0); 571 } 572 573 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 574 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 575 { 576 PetscErrorCode ierr; 577 Mat_SeqAIJ *a,*b; 578 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 579 MatScalarKokkosView aa,ba,ca; 580 MatRowMapKokkosView ai,bi,ci; 581 MatColIdxKokkosView aj,bj,cj; 582 PetscInt m,n,nnz,aN; 583 584 PetscFunctionBegin; 585 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 586 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 587 PetscValidPointer(C,4); 588 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 589 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 590 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 591 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 592 593 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 594 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 595 a = static_cast<Mat_SeqAIJ*>(A->data); 596 b = static_cast<Mat_SeqAIJ*>(B->data); 597 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 598 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 599 aa = akok->a_dual.view_device(); 600 ai = akok->i_dual.view_device(); 601 ba = bkok->a_dual.view_device(); 602 bi = bkok->i_dual.view_device(); 603 m = A->rmap->n; /* M, N and nnz of C */ 604 n = A->cmap->n + B->cmap->n; 605 nnz = a->nz + b->nz; 606 aN = A->cmap->n; /* N of A */ 607 if (reuse == MAT_INITIAL_MATRIX) { 608 aj = akok->j_dual.view_device(); 609 bj = bkok->j_dual.view_device(); 610 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 611 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 612 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 613 ca = ca_dual.view_device(); 614 ci = ci_dual.view_device(); 615 cj = cj_dual.view_device(); 616 617 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 618 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 619 PetscInt i = t.league_rank(); /* row i */ 620 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 621 622 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 623 ci(i) = coffset; 624 if (i == m-1) ci(m) = ai(m) + bi(m); 625 }); 626 627 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 628 if (k < alen) { 629 ca(coffset+k) = aa(ai(i)+k); 630 cj(coffset+k) = aj(ai(i)+k); 631 } else { 632 ca(coffset+k) = ba(bi(i)+k-alen); 633 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 634 } 635 }); 636 }); 637 ca_dual.modify_device(); 638 ci_dual.modify_device(); 639 cj_dual.modify_device(); 640 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 641 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 642 } else if (reuse == MAT_REUSE_MATRIX) { 643 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 644 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 645 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 646 ca = ckok->a_dual.view_device(); 647 ci = ckok->i_dual.view_device(); 648 649 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 650 PetscInt i = t.league_rank(); /* row i */ 651 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 652 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 653 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 654 else ca(ci(i)+k) = ba(bi(i)+k-alen); 655 }); 656 }); 657 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 658 } 659 PetscFunctionReturn(0); 660 } 661 662 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 663 { 664 PetscFunctionBegin; 665 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 666 PetscFunctionReturn(0); 667 } 668 669 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 670 { 671 PetscErrorCode ierr; 672 Mat_Product *product = C->product; 673 Mat A,B; 674 bool transA,transB; /* use bool, since KK needs this type */ 675 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 676 Mat_SeqAIJ *c; 677 MatProductData_SeqAIJKokkos *pdata; 678 KokkosCsrMatrix *csrmatA,*csrmatB; 679 680 PetscFunctionBegin; 681 MatCheckProduct(C,1); 682 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 683 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 684 685 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 686 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 687 PetscFunctionReturn(0); 688 } 689 690 switch (product->type) { 691 case MATPRODUCT_AB: transA = false; transB = false; break; 692 case MATPRODUCT_AtB: transA = true; transB = false; break; 693 case MATPRODUCT_ABt: transA = false; transB = true; break; 694 default: 695 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 696 } 697 698 A = product->A; 699 B = product->B; 700 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 701 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 702 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 703 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 704 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 705 706 if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 707 708 csrmatA = &akok->csrmat; 709 csrmatB = &bkok->csrmat; 710 711 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 712 if (transA) { 713 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 714 transA = false; 715 } 716 717 if (transB) { 718 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 719 transB = false; 720 } 721 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 722 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 723 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 724 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 725 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 726 /* shorter version of MatAssemblyEnd_SeqAIJ */ 727 c = (Mat_SeqAIJ*)C->data; 728 ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 729 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 730 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 731 c->reallocs = 0; 732 C->info.mallocs = 0; 733 C->info.nz_unneeded = 0; 734 C->assembled = C->was_assembled = PETSC_TRUE; 735 C->num_ass++; 736 PetscFunctionReturn(0); 737 } 738 739 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 740 { 741 PetscErrorCode ierr; 742 Mat_Product *product = C->product; 743 MatProductType ptype; 744 Mat A,B; 745 bool transA,transB; 746 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 747 MatProductData_SeqAIJKokkos *pdata; 748 MPI_Comm comm; 749 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 750 751 PetscFunctionBegin; 752 MatCheckProduct(C,1); 753 ierr = PetscObjectGetComm((PetscObject)C,&comm); 754 if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 755 A = product->A; 756 B = product->B; 757 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 758 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 759 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 760 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 761 csrmatA = &akok->csrmat; 762 csrmatB = &bkok->csrmat; 763 764 ptype = product->type; 765 switch (ptype) { 766 case MATPRODUCT_AB: transA = false; transB = false; break; 767 case MATPRODUCT_AtB: transA = true; transB = false; break; 768 case MATPRODUCT_ABt: transA = false; transB = true; break; 769 default: 770 SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 771 } 772 773 product->data = pdata = new MatProductData_SeqAIJKokkos(); 774 pdata->kh.set_team_work_size(16); 775 pdata->kh.set_dynamic_scheduling(true); 776 pdata->reusesym = product->api_user; 777 778 /* TODO: add command line options to select spgemm algorithms */ 779 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 780 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 781 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 782 */ 783 pdata->kh.create_spgemm_handle(spgemm_alg); 784 785 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 786 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 787 if (transA) { 788 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 789 transA = false; 790 } 791 792 if (transB) { 793 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 794 transB = false; 795 } 796 797 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 798 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 799 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 800 calling new Mat_SeqAIJKokkos(). 801 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 802 */ 803 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 804 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 805 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 806 807 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 808 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 809 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 810 PetscFunctionReturn(0); 811 } 812 813 /* handles sparse matrix matrix ops */ 814 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 815 { 816 PetscErrorCode ierr; 817 Mat_Product *product = mat->product; 818 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 819 820 PetscFunctionBegin; 821 MatCheckProduct(mat,1); 822 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 823 if (product->type == MATPRODUCT_ABC) { 824 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 825 } 826 if (Biskok && Ciskok) { 827 switch (product->type) { 828 case MATPRODUCT_AB: 829 case MATPRODUCT_AtB: 830 case MATPRODUCT_ABt: 831 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 832 break; 833 case MATPRODUCT_PtAP: 834 case MATPRODUCT_RARt: 835 case MATPRODUCT_ABC: 836 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 837 break; 838 default: 839 break; 840 } 841 } else { /* fallback for AIJ */ 842 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 843 } 844 PetscFunctionReturn(0); 845 } 846 847 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 848 { 849 PetscErrorCode ierr; 850 Mat_SeqAIJKokkos *aijkok; 851 852 PetscFunctionBegin; 853 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 854 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 855 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 856 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 857 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 858 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 859 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 860 PetscFunctionReturn(0); 861 } 862 863 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 864 { 865 PetscErrorCode ierr; 866 Mat_SeqAIJKokkos *aijkok; 867 868 PetscFunctionBegin; 869 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 870 if (aijkok) { /* Only zero the device if data is already there */ 871 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 872 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 873 } else { /* Might be preallocated but not assembled */ 874 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 875 } 876 PetscFunctionReturn(0); 877 } 878 879 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 880 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 881 { 882 PetscErrorCode ierr; 883 Mat_SeqAIJKokkos *aijkok; 884 885 PetscFunctionBegin; 886 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 887 PetscValidPointer(kv,2); 888 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 889 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 890 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 891 *kv = aijkok->a_dual.view_device(); 892 PetscFunctionReturn(0); 893 } 894 895 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 896 { 897 PetscFunctionBegin; 898 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 899 PetscValidPointer(kv,2); 900 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 901 PetscFunctionReturn(0); 902 } 903 904 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 905 { 906 PetscErrorCode ierr; 907 Mat_SeqAIJKokkos *aijkok; 908 909 PetscFunctionBegin; 910 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 911 PetscValidPointer(kv,2); 912 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 913 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 914 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 915 *kv = aijkok->a_dual.view_device(); 916 PetscFunctionReturn(0); 917 } 918 919 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 920 { 921 PetscErrorCode ierr; 922 923 PetscFunctionBegin; 924 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 925 PetscValidPointer(kv,2); 926 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 927 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 928 PetscFunctionReturn(0); 929 } 930 931 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 932 { 933 Mat_SeqAIJKokkos *aijkok; 934 935 PetscFunctionBegin; 936 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 937 PetscValidPointer(kv,2); 938 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 939 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 940 *kv = aijkok->a_dual.view_device(); 941 PetscFunctionReturn(0); 942 } 943 944 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 945 { 946 PetscErrorCode ierr; 947 948 PetscFunctionBegin; 949 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 950 PetscValidPointer(kv,2); 951 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 952 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 953 PetscFunctionReturn(0); 954 } 955 956 /* Computes Y += alpha X */ 957 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 958 { 959 PetscErrorCode ierr; 960 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 961 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 962 ConstMatScalarKokkosView Xa; 963 MatScalarKokkosView Ya; 964 965 PetscFunctionBegin; 966 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 967 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 968 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 969 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 970 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 971 972 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 973 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 974 PetscBool e; 975 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 976 if (e) { 977 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 978 if (e) pattern = SAME_NONZERO_PATTERN; 979 } 980 } 981 982 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 983 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 984 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 985 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 986 */ 987 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 988 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 989 Xa = xkok->a_dual.view_device(); 990 Ya = ykok->a_dual.view_device(); 991 992 if (pattern == SAME_NONZERO_PATTERN) { 993 KokkosBlas::axpy(alpha,Xa,Ya); 994 ierr = MatSeqAIJKokkosModifyDevice(Y); 995 } else if (pattern == SUBSET_NONZERO_PATTERN) { 996 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 997 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 998 999 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1000 PetscInt i = t.league_rank(); /* row i */ 1001 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1002 PetscInt p,q = Yi(i); 1003 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1004 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1005 if (Xj(p) == Yj(q)) { /* Found it */ 1006 Ya(q) += alpha * Xa(p); 1007 q++; 1008 } else { 1009 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1010 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1011 */ 1012 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1013 } 1014 } 1015 }); 1016 }); 1017 ierr = MatSeqAIJKokkosModifyDevice(Y); 1018 } else { /* different nonzero patterns */ 1019 Mat Z; 1020 KokkosCsrMatrix zcsr; 1021 KernelHandle kh; 1022 kh.create_spadd_handle(false); 1023 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1024 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1025 zkok = new Mat_SeqAIJKokkos(zcsr); 1026 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1027 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1028 kh.destroy_spadd_handle(); 1029 } 1030 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1031 ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1032 PetscFunctionReturn(0); 1033 } 1034 1035 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1036 { 1037 PetscErrorCode ierr; 1038 1039 PetscFunctionBegin; 1040 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1041 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1042 B->offloadmask = PETSC_OFFLOAD_CPU; 1043 PetscFunctionReturn(0); 1044 } 1045 1046 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1047 { 1048 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1049 1050 PetscFunctionBegin; 1051 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1052 A->boundtocpu = PETSC_FALSE; 1053 1054 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1055 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1056 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1057 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1058 A->ops->scale = MatScale_SeqAIJKokkos; 1059 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1060 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1061 A->ops->mult = MatMult_SeqAIJKokkos; 1062 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1063 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1064 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1065 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1066 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1067 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1068 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1069 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1070 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1071 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1072 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1073 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1074 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1075 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1076 PetscFunctionReturn(0); 1077 } 1078 1079 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1080 { 1081 PetscErrorCode ierr; 1082 Mat_SeqAIJ *aseq; 1083 PetscInt i,m,n; 1084 1085 PetscFunctionBegin; 1086 if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1087 1088 m = akok->nrows(); 1089 n = akok->ncols(); 1090 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1091 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1092 1093 /* Set up data structures of A as a MATSEQAIJ */ 1094 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1095 aseq = (Mat_SeqAIJ*)(A)->data; 1096 1097 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1098 akok->j_dual.sync_host(); 1099 1100 aseq->i = akok->i_host_data(); 1101 aseq->j = akok->j_host_data(); 1102 aseq->a = akok->a_host_data(); 1103 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1104 aseq->singlemalloc = PETSC_FALSE; 1105 aseq->free_a = PETSC_FALSE; 1106 aseq->free_ij = PETSC_FALSE; 1107 aseq->nz = akok->nnz(); 1108 aseq->maxnz = aseq->nz; 1109 1110 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1111 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1112 for (i=0; i<m; i++) { 1113 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1114 } 1115 1116 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1117 akok->nonzerostate = A->nonzerostate; 1118 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1119 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1120 A->spptr = akok; 1121 PetscFunctionReturn(0); 1122 } 1123 1124 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1125 1126 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1127 */ 1128 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1129 { 1130 PetscErrorCode ierr; 1131 1132 PetscFunctionBegin; 1133 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1134 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1135 PetscFunctionReturn(0); 1136 } 1137 1138 /* --------------------------------------------------------------------------------*/ 1139 /*@C 1140 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1141 (the default parallel PETSc format). This matrix will ultimately be handled by 1142 Kokkos for calculations. For good matrix 1143 assembly performance the user should preallocate the matrix storage by setting 1144 the parameter nz (or the array nnz). By setting these parameters accurately, 1145 performance during matrix assembly can be increased by more than a factor of 50. 1146 1147 Collective 1148 1149 Input Parameters: 1150 + comm - MPI communicator, set to PETSC_COMM_SELF 1151 . m - number of rows 1152 . n - number of columns 1153 . nz - number of nonzeros per row (same for all rows) 1154 - nnz - array containing the number of nonzeros in the various rows 1155 (possibly different for each row) or NULL 1156 1157 Output Parameter: 1158 . A - the matrix 1159 1160 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1161 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1162 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1163 1164 Notes: 1165 If nnz is given then nz is ignored 1166 1167 The AIJ format (also called the Yale sparse matrix format or 1168 compressed row storage), is fully compatible with standard Fortran 77 1169 storage. That is, the stored row and column indices can begin at 1170 either one (as in Fortran) or zero. See the users' manual for details. 1171 1172 Specify the preallocated storage with either nz or nnz (not both). 1173 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1174 allocation. For large problems you MUST preallocate memory or you 1175 will get TERRIBLE performance, see the users' manual chapter on matrices. 1176 1177 By default, this format uses inodes (identical nodes) when possible, to 1178 improve numerical efficiency of matrix-vector products and solves. We 1179 search for consecutive rows with the same nonzero structure, thereby 1180 reusing matrix information to achieve increased efficiency. 1181 1182 Level: intermediate 1183 1184 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1185 @*/ 1186 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1187 { 1188 PetscErrorCode ierr; 1189 1190 PetscFunctionBegin; 1191 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1192 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1193 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1194 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1195 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1196 PetscFunctionReturn(0); 1197 } 1198 1199 typedef Kokkos::TeamPolicy<>::member_type team_member; 1200 // 1201 // This factorization exploits block diagonal matrices with "Nf" (not used). 1202 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1203 // 1204 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1205 { 1206 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1207 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1208 IS isrow = b->row,isicol = b->icol; 1209 PetscErrorCode ierr; 1210 const PetscInt *r_h,*ic_h; 1211 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1212 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1213 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1214 PetscBool row_identity,col_identity; 1215 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1216 1217 PetscFunctionBegin; 1218 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1219 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1220 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1221 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1222 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1223 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1224 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1225 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1226 { 1227 #define KOKKOS_SHARED_LEVEL 1 1228 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1229 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1230 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1231 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1232 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1233 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1234 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1235 size_t flops_h = 0.0; 1236 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1237 Kokkos::View<size_t> d_flops_k ("flops"); 1238 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1239 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1240 Kokkos::deep_copy (d_flops_k, h_flops_k); 1241 Kokkos::deep_copy (d_r_k, h_r_k); 1242 Kokkos::deep_copy (d_ic_k, h_ic_k); 1243 // Fill A --> fact 1244 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1245 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1246 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1247 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1248 // zero rows of B 1249 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1250 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1251 PetscScalar *baL = ba_d + bi_d[rowb]; 1252 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1253 /* zero (unfactored row) */ 1254 for (int j=0;j<nzbL;j++) baL[j] = 0; 1255 for (int j=0;j<nzbU;j++) baU[j] = 0; 1256 }); 1257 // copy A into B 1258 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1259 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1260 const PetscScalar *av = aa_d + ai_d[rowa]; 1261 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1262 /* load in initial (unfactored row) */ 1263 for (int j=0;j<nza;j++) { 1264 PetscInt colb = ic[ajtmp[j]]; 1265 PetscScalar vala = av[j]; 1266 if (colb == rowb) { 1267 *(ba_d + bdiag_d[rowb]) = vala; 1268 } else { 1269 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1270 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1271 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1272 for (int j=0; j<nz ; j++) { 1273 if (pbj[j] == colb) { 1274 pba[j] = vala; 1275 set++; 1276 break; 1277 } 1278 } 1279 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1280 } 1281 } 1282 }); 1283 }); 1284 Kokkos::fence(); 1285 1286 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1287 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1288 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1289 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1290 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1291 const PetscInt start = field*nloc, end = start + nloc; 1292 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1293 // A22 panel update for each row A(1,:) and col A(:,1) 1294 for (int ii=start; ii<end-1; ii++) { 1295 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1296 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1297 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1298 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1299 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1300 PetscInt kIdx = j*Ni + field_block_idx; 1301 if (kIdx >= nzUi) /* void */ ; 1302 else { 1303 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1304 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1305 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1306 size_t st_idx; 1307 // find and do L(k,i) = A(:k,i) / A(i,i) 1308 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1309 // get column, there has got to be a better way 1310 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1311 if (pjL[j] == ii) { 1312 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1313 idx = j; // output 1314 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1315 } 1316 }, st_idx); 1317 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1318 #if defined(PETSC_USE_DEBUG) 1319 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1320 #endif 1321 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1322 // U(i+1,:end) 1323 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1324 PetscScalar Uij = baUi[uiIdx]; 1325 PetscInt col = bjUi[uiIdx]; 1326 if (col==myk) { 1327 // A_kk = A_kk - L_ki * U_ij(k) 1328 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1329 *Akkv = *Akkv - L_ki() * Uij; // UiK 1330 } else { 1331 PetscScalar *start, *end, *pAkjv=NULL; 1332 PetscInt high, low; 1333 const PetscInt *startj; 1334 if (col<myk) { // L 1335 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1336 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1337 start = pLki+1; // start at pLki+1, A22(myk,1) 1338 startj= bj_d + bi_d[myk] + idx; 1339 end = ba_d + bi_d[myk+1]; 1340 } else { 1341 PetscInt idx = bdiag_d[myk+1]+1; 1342 start = ba_d + idx; 1343 startj= bj_d + idx; 1344 end = ba_d + bdiag_d[myk]; 1345 } 1346 // search for 'col', use bisection search - TODO 1347 low = 0; 1348 high = (PetscInt)(end-start); 1349 while (high-low > 5) { 1350 int t = (low+high)/2; 1351 if (startj[t] > col) high = t; 1352 else low = t; 1353 } 1354 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1355 if (startj[pAkjv-start] == col) break; 1356 } 1357 #if defined(PETSC_USE_DEBUG) 1358 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1359 #endif 1360 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1361 } 1362 }); 1363 } 1364 }); 1365 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1366 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1367 } /* endof for (i=0; i<n; i++) { */ 1368 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1369 }); 1370 Kokkos::fence(); 1371 Kokkos::deep_copy (h_flops_k, d_flops_k); 1372 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1373 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1374 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1375 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1376 /* Invert diagonal for simpler triangular solves */ 1377 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1378 int i = start + outer_index*Ni + lg_rank%Ni; 1379 if (i < end) { 1380 PetscScalar *pv = ba_d + bdiag_d[i]; 1381 *pv = 1.0/(*pv); 1382 } 1383 }); 1384 }); 1385 } 1386 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1387 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1388 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1389 1390 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1391 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1392 if (b->inode.size) { 1393 B->ops->solve = MatSolve_SeqAIJ_Inode; 1394 } else if (row_identity && col_identity) { 1395 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1396 } else { 1397 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1398 } 1399 B->offloadmask = PETSC_OFFLOAD_GPU; 1400 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1401 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1402 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1403 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1404 B->ops->matsolve = MatMatSolve_SeqAIJ; 1405 B->assembled = PETSC_TRUE; 1406 B->preallocated = PETSC_TRUE; 1407 1408 PetscFunctionReturn(0); 1409 } 1410 1411 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1412 { 1413 PetscErrorCode ierr; 1414 1415 PetscFunctionBegin; 1416 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1417 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1418 PetscFunctionReturn(0); 1419 } 1420 1421 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1422 { 1423 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1424 1425 PetscFunctionBegin; 1426 if (!factors->sptrsv_symbolic_completed) { 1427 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1428 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1429 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1430 } 1431 PetscFunctionReturn(0); 1432 } 1433 1434 /* Check if we need to update factors etc for transpose solve */ 1435 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1436 { 1437 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1438 MatColIdxType n = A->rmap->n; 1439 1440 PetscFunctionBegin; 1441 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1442 /* Update L^T and do sptrsv symbolic */ 1443 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1444 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1445 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1446 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1447 1448 KokkosKernels::Impl::transpose_matrix< 1449 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1450 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1451 MatRowMapKokkosView,DefaultExecutionSpace>( 1452 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1453 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1454 1455 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1456 We have to sort the indices, until KK provides finer control options. 1457 */ 1458 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1459 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1460 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1461 1462 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1463 1464 /* Update U^T and do sptrsv symbolic */ 1465 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1466 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1467 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1468 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1469 1470 KokkosKernels::Impl::transpose_matrix< 1471 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1472 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1473 MatRowMapKokkosView,DefaultExecutionSpace>( 1474 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1475 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1476 1477 /* Sort indices. See comments above */ 1478 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1479 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1480 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1481 1482 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1483 factors->transpose_updated = PETSC_TRUE; 1484 } 1485 PetscFunctionReturn(0); 1486 } 1487 1488 /* Solve Ax = b, with A = LU */ 1489 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1490 { 1491 PetscErrorCode ierr; 1492 ConstPetscScalarKokkosView bv; 1493 PetscScalarKokkosView xv; 1494 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1495 1496 PetscFunctionBegin; 1497 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1498 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1499 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1500 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1501 /* Solve L tmpv = b */ 1502 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1503 /* Solve Ux = tmpv */ 1504 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1505 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1506 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1507 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1508 PetscFunctionReturn(0); 1509 } 1510 1511 /* Solve A^T x = b, where A^T = U^T L^T */ 1512 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1513 { 1514 PetscErrorCode ierr; 1515 ConstPetscScalarKokkosView bv; 1516 PetscScalarKokkosView xv; 1517 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1518 1519 PetscFunctionBegin; 1520 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1521 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1522 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1523 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1524 /* Solve U^T tmpv = b */ 1525 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1526 1527 /* Solve L^T x = tmpv */ 1528 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1529 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1530 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1531 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1532 PetscFunctionReturn(0); 1533 } 1534 1535 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1536 { 1537 PetscErrorCode ierr; 1538 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1539 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1540 PetscInt fill_lev = info->levels; 1541 1542 PetscFunctionBegin; 1543 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1544 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1545 1546 auto a_d = aijkok->a_dual.view_device(); 1547 auto i_d = aijkok->i_dual.view_device(); 1548 auto j_d = aijkok->j_dual.view_device(); 1549 1550 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1551 1552 B->assembled = PETSC_TRUE; 1553 B->preallocated = PETSC_TRUE; 1554 B->ops->solve = MatSolve_SeqAIJKokkos; 1555 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1556 B->ops->matsolve = NULL; 1557 B->ops->matsolvetranspose = NULL; 1558 B->offloadmask = PETSC_OFFLOAD_GPU; 1559 1560 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1561 factors->transpose_updated = PETSC_FALSE; 1562 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1563 /* TODO: log flops, but how to know that? */ 1564 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1565 PetscFunctionReturn(0); 1566 } 1567 1568 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1569 { 1570 PetscErrorCode ierr; 1571 Mat_SeqAIJKokkos *aijkok; 1572 Mat_SeqAIJ *b; 1573 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1574 PetscInt fill_lev = info->levels; 1575 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1576 PetscInt n = A->rmap->n; 1577 1578 PetscFunctionBegin; 1579 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1580 /* Rebuild factors */ 1581 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1582 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1583 1584 /* Create a spiluk handle and then do symbolic factorization */ 1585 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1586 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1587 1588 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1589 1590 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1591 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1592 Kokkos::realloc(factors->iU_d,n+1); 1593 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1594 1595 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1596 auto i_d = aijkok->i_dual.view_device(); 1597 auto j_d = aijkok->j_dual.view_device(); 1598 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1599 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1600 1601 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1602 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1603 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1604 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1605 1606 /* TODO: add options to select sptrsv algorithms */ 1607 /* Create sptrsv handles for L, U and their transpose */ 1608 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1609 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1610 #else 1611 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1612 #endif 1613 1614 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1615 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1616 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1617 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1618 1619 /* Fill fields of the factor matrix B */ 1620 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1621 b = (Mat_SeqAIJ*)B->data; 1622 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1623 B->info.fill_ratio_given = info->fill; 1624 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1625 1626 B->offloadmask = PETSC_OFFLOAD_GPU; 1627 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1628 PetscFunctionReturn(0); 1629 } 1630 1631 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1632 { 1633 PetscErrorCode ierr; 1634 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1635 const PetscInt nrows = A->rmap->n; 1636 1637 PetscFunctionBegin; 1638 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1639 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1640 // move B data into Kokkos 1641 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1642 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1643 { 1644 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1645 if (!baijkok->diag_d) { 1646 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1647 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1648 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1649 } 1650 } 1651 PetscFunctionReturn(0); 1652 } 1653 1654 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1655 { 1656 PetscFunctionBegin; 1657 *type = MATSOLVERKOKKOS; 1658 PetscFunctionReturn(0); 1659 } 1660 1661 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1662 { 1663 PetscFunctionBegin; 1664 *type = MATSOLVERKOKKOSDEVICE; 1665 PetscFunctionReturn(0); 1666 } 1667 1668 /*MC 1669 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1670 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1671 1672 Level: beginner 1673 1674 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1675 M*/ 1676 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1677 { 1678 PetscErrorCode ierr; 1679 PetscInt n = A->rmap->n; 1680 1681 PetscFunctionBegin; 1682 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1683 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1684 (*B)->factortype = ftype; 1685 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1686 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1687 1688 if (ftype == MAT_FACTOR_LU) { 1689 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1690 (*B)->canuseordering = PETSC_TRUE; 1691 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1692 } else if (ftype == MAT_FACTOR_ILU) { 1693 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1694 (*B)->canuseordering = PETSC_FALSE; 1695 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1696 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1697 1698 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1699 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1700 PetscFunctionReturn(0); 1701 } 1702 1703 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1704 { 1705 PetscErrorCode ierr; 1706 PetscInt n = A->rmap->n; 1707 1708 PetscFunctionBegin; 1709 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1710 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1711 (*B)->factortype = ftype; 1712 (*B)->canuseordering = PETSC_TRUE; 1713 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1714 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1715 1716 if (ftype == MAT_FACTOR_LU) { 1717 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1718 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1719 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1720 1721 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1722 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1723 PetscFunctionReturn(0); 1724 } 1725 1726 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1727 { 1728 PetscErrorCode ierr; 1729 1730 PetscFunctionBegin; 1731 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1732 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1733 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1734 PetscFunctionReturn(0); 1735 } 1736 1737 /* Utility to print out a KokkosCsrMatrix for debugging */ 1738 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1739 { 1740 PetscErrorCode ierr; 1741 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1742 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1743 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1744 const PetscInt *i = iv.data(); 1745 const PetscInt *j = jv.data(); 1746 const PetscScalar *a = av.data(); 1747 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1748 1749 PetscFunctionBegin; 1750 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1751 for (PetscInt k=0; k<m; k++) { 1752 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1753 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1754 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1755 } 1756 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1757 } 1758 PetscFunctionReturn(0); 1759 } 1760