1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64 if (aijkok->a_dual.need_sync_device()) { 65 aijkok->a_dual.sync_device(); 66 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 67 aijkok->hermitian_updated = PETSC_FALSE; 68 } 69 PetscFunctionReturn(0); 70 } 71 72 /* Mark the CSR data on device as modified */ 73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 74 { 75 PetscErrorCode ierr; 76 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 77 78 PetscFunctionBegin; 79 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 80 aijkok->a_dual.clear_sync_state(); 81 aijkok->a_dual.modify_device(); 82 aijkok->transpose_updated = PETSC_FALSE; 83 aijkok->hermitian_updated = PETSC_FALSE; 84 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 85 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 86 PetscFunctionReturn(0); 87 } 88 89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 90 { 91 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 92 93 PetscFunctionBegin; 94 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 95 /* We do not expect one needs factors on host */ 96 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 97 PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 98 aijkok->a_dual.sync_host(); 99 PetscFunctionReturn(0); 100 } 101 102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 103 { 104 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 105 106 PetscFunctionBegin; 107 if (aijkok) { 108 aijkok->a_dual.sync_host(); 109 *array = aijkok->a_dual.view_host().data(); 110 } else { /* Happens when calling MatSetValues on a newly created matrix */ 111 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 112 } 113 PetscFunctionReturn(0); 114 } 115 116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 117 { 118 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119 120 PetscFunctionBegin; 121 if (aijkok) aijkok->a_dual.modify_host(); 122 PetscFunctionReturn(0); 123 } 124 125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 126 { 127 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 128 129 PetscFunctionBegin; 130 if (aijkok) { 131 aijkok->a_dual.sync_host(); 132 *array = aijkok->a_dual.view_host().data(); 133 } else { 134 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 135 } 136 PetscFunctionReturn(0); 137 } 138 139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 140 { 141 PetscFunctionBegin; 142 *array = NULL; 143 PetscFunctionReturn(0); 144 } 145 146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 147 { 148 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 149 150 PetscFunctionBegin; 151 if (aijkok) { 152 *array = aijkok->a_dual.view_host().data(); 153 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 154 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 155 } 156 PetscFunctionReturn(0); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 160 { 161 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 162 163 PetscFunctionBegin; 164 if (aijkok) { 165 aijkok->a_dual.clear_sync_state(); 166 aijkok->a_dual.modify_host(); 167 } 168 PetscFunctionReturn(0); 169 } 170 171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 175 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 176 177 PetscFunctionBegin; 178 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 179 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 180 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 181 PetscFunctionReturn(0); 182 } 183 184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 186 { 187 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188 189 PetscFunctionBegin; 190 if (aijkok && aijkok->device_mat_d.data()) { 191 *d_mat = aijkok->device_mat_d.data(); 192 } else { 193 PetscErrorCode ierr; 194 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 195 *d_mat = NULL; 196 } 197 PetscFunctionReturn(0); 198 } 199 200 /* Generate the transpose on device and cache it internally */ 201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 202 { 203 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 204 205 PetscFunctionBegin; 206 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 208 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 209 CHKERRCXX(aijkok->a_dual.sync_device()); 210 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 211 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 212 aijkok->transpose_updated = PETSC_TRUE; 213 } 214 *csrmatT = &aijkok->csrmatT; 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the Hermitian on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 220 { 221 PetscErrorCode ierr; 222 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 223 224 PetscFunctionBegin; 225 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 226 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 227 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 228 CHKERRCXX(aijkok->a_dual.sync_device()); 229 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 230 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 231 #if defined(PETSC_USE_COMPLEX) 232 const auto& a = aijkok->csrmatH.values; 233 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 234 #endif 235 aijkok->hermitian_updated = PETSC_TRUE; 236 } 237 *csrmatH = &aijkok->csrmatH; 238 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 /* y = A x */ 243 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 244 { 245 PetscErrorCode ierr; 246 Mat_SeqAIJKokkos *aijkok; 247 ConstPetscScalarKokkosView xv; 248 PetscScalarKokkosView yv; 249 250 PetscFunctionBegin; 251 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 252 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 253 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 254 ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 255 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 256 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 257 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 258 ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 259 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 260 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 261 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 262 PetscFunctionReturn(0); 263 } 264 265 /* y = A^T x */ 266 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 267 { 268 PetscErrorCode ierr; 269 Mat_SeqAIJKokkos *aijkok; 270 const char *mode; 271 ConstPetscScalarKokkosView xv; 272 PetscScalarKokkosView yv; 273 KokkosCsrMatrix *csrmat; 274 275 PetscFunctionBegin; 276 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 277 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 278 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 279 ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 280 if (A->form_explicit_transpose) { 281 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 282 mode = "N"; 283 } else { 284 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 285 csrmat = &aijkok->csrmat; 286 mode = "T"; 287 } 288 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 289 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 290 ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 291 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 292 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 293 PetscFunctionReturn(0); 294 } 295 296 /* y = A^H x */ 297 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 298 { 299 PetscErrorCode ierr; 300 Mat_SeqAIJKokkos *aijkok; 301 const char *mode; 302 ConstPetscScalarKokkosView xv; 303 PetscScalarKokkosView yv; 304 KokkosCsrMatrix *csrmat; 305 306 PetscFunctionBegin; 307 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 308 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 309 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 310 ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 311 if (A->form_explicit_transpose) { 312 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 313 mode = "N"; 314 } else { 315 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 316 csrmat = &aijkok->csrmat; 317 mode = "C"; 318 } 319 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 320 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 321 ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 322 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 323 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 324 PetscFunctionReturn(0); 325 } 326 327 /* z = A x + y */ 328 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 329 { 330 PetscErrorCode ierr; 331 Mat_SeqAIJKokkos *aijkok; 332 ConstPetscScalarKokkosView xv,yv; 333 PetscScalarKokkosView zv; 334 335 PetscFunctionBegin; 336 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 337 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 338 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 339 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 340 ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 341 if (zz != yy) Kokkos::deep_copy(zv,yv); 342 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 343 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 344 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 346 ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 347 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 349 PetscFunctionReturn(0); 350 } 351 352 /* z = A^T x + y */ 353 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 354 { 355 PetscErrorCode ierr; 356 Mat_SeqAIJKokkos *aijkok; 357 const char *mode; 358 ConstPetscScalarKokkosView xv,yv; 359 PetscScalarKokkosView zv; 360 KokkosCsrMatrix *csrmat; 361 362 PetscFunctionBegin; 363 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 364 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 365 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 366 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 367 ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 368 if (zz != yy) Kokkos::deep_copy(zv,yv); 369 if (A->form_explicit_transpose) { 370 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 371 mode = "N"; 372 } else { 373 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 374 csrmat = &aijkok->csrmat; 375 mode = "T"; 376 } 377 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 378 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 379 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 380 ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 381 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 382 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 /* z = A^H x + y */ 387 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 388 { 389 PetscErrorCode ierr; 390 Mat_SeqAIJKokkos *aijkok; 391 const char *mode; 392 ConstPetscScalarKokkosView xv,yv; 393 PetscScalarKokkosView zv; 394 KokkosCsrMatrix *csrmat; 395 396 PetscFunctionBegin; 397 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 398 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 399 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 400 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 401 ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 402 if (zz != yy) Kokkos::deep_copy(zv,yv); 403 if (A->form_explicit_transpose) { 404 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 405 mode = "N"; 406 } else { 407 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 408 csrmat = &aijkok->csrmat; 409 mode = "C"; 410 } 411 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 412 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 413 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 414 ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 415 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 416 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 417 PetscFunctionReturn(0); 418 } 419 420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 421 { 422 PetscErrorCode ierr; 423 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 424 425 PetscFunctionBegin; 426 switch (op) { 427 case MAT_FORM_EXPLICIT_TRANSPOSE: 428 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 429 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 430 A->form_explicit_transpose = flg; 431 break; 432 default: 433 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 434 break; 435 } 436 PetscFunctionReturn(0); 437 } 438 439 /* Depending on reuse, either build a new mat, or use the existing mat */ 440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 441 { 442 PetscErrorCode ierr; 443 Mat_SeqAIJ *aseq; 444 445 PetscFunctionBegin; 446 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 447 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 448 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 449 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 450 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 451 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 452 PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 453 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 454 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 455 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 456 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 457 aseq = static_cast<Mat_SeqAIJ*>(A->data); 458 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 459 PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 460 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 461 } 462 } 463 PetscFunctionReturn(0); 464 } 465 466 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 467 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 468 */ 469 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 470 { 471 PetscErrorCode ierr; 472 Mat_SeqAIJ *bseq; 473 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 474 Mat mat; 475 476 PetscFunctionBegin; 477 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 478 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 479 mat = *B; 480 if (A->assembled) { 481 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 482 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 483 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 484 /* Now copy values to B if needed */ 485 if (dupOption == MAT_COPY_VALUES) { 486 if (akok->a_dual.need_sync_device()) { 487 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 488 bkok->a_dual.modify_host(); 489 } else { /* If device has the latest data, we only copy data on device */ 490 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 491 bkok->a_dual.modify_device(); 492 } 493 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 494 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 495 bkok->a_dual.modify_host(); 496 } 497 mat->spptr = bkok; 498 } 499 500 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 501 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 502 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 503 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 504 PetscFunctionReturn(0); 505 } 506 507 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 508 { 509 PetscErrorCode ierr; 510 Mat At; 511 KokkosCsrMatrix *internT; 512 Mat_SeqAIJKokkos *atkok,*bkok; 513 514 PetscFunctionBegin; 515 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 516 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 517 /* Deep copy internT, as we want to isolate the internal transpose */ 518 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 519 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 520 if (reuse == MAT_INITIAL_MATRIX) *B = At; 521 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 522 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 523 if ((*B)->assembled) { 524 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 525 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 526 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 527 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 528 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 529 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 530 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 531 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 532 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 533 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 534 } 535 PetscFunctionReturn(0); 536 } 537 538 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 539 { 540 PetscErrorCode ierr; 541 Mat_SeqAIJKokkos *aijkok; 542 543 PetscFunctionBegin; 544 if (A->factortype == MAT_FACTOR_NONE) { 545 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 546 delete aijkok; 547 } else { 548 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 549 } 550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 552 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 553 A->spptr = NULL; 554 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 555 PetscFunctionReturn(0); 556 } 557 558 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 559 { 560 PetscErrorCode ierr; 561 562 PetscFunctionBegin; 563 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 564 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 565 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 566 PetscFunctionReturn(0); 567 } 568 569 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 570 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 571 { 572 PetscErrorCode ierr; 573 Mat_SeqAIJ *a,*b; 574 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 575 MatScalarKokkosView aa,ba,ca; 576 MatRowMapKokkosView ai,bi,ci; 577 MatColIdxKokkosView aj,bj,cj; 578 PetscInt m,n,nnz,aN; 579 580 PetscFunctionBegin; 581 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 582 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 583 PetscValidPointer(C,4); 584 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 585 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 586 PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 587 PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 588 589 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 590 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 591 a = static_cast<Mat_SeqAIJ*>(A->data); 592 b = static_cast<Mat_SeqAIJ*>(B->data); 593 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 594 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 595 aa = akok->a_dual.view_device(); 596 ai = akok->i_dual.view_device(); 597 ba = bkok->a_dual.view_device(); 598 bi = bkok->i_dual.view_device(); 599 m = A->rmap->n; /* M, N and nnz of C */ 600 n = A->cmap->n + B->cmap->n; 601 nnz = a->nz + b->nz; 602 aN = A->cmap->n; /* N of A */ 603 if (reuse == MAT_INITIAL_MATRIX) { 604 aj = akok->j_dual.view_device(); 605 bj = bkok->j_dual.view_device(); 606 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 607 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 608 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 609 ca = ca_dual.view_device(); 610 ci = ci_dual.view_device(); 611 cj = cj_dual.view_device(); 612 613 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 614 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 615 PetscInt i = t.league_rank(); /* row i */ 616 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 617 618 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 619 ci(i) = coffset; 620 if (i == m-1) ci(m) = ai(m) + bi(m); 621 }); 622 623 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 624 if (k < alen) { 625 ca(coffset+k) = aa(ai(i)+k); 626 cj(coffset+k) = aj(ai(i)+k); 627 } else { 628 ca(coffset+k) = ba(bi(i)+k-alen); 629 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 630 } 631 }); 632 }); 633 ca_dual.modify_device(); 634 ci_dual.modify_device(); 635 cj_dual.modify_device(); 636 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 637 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 638 } else if (reuse == MAT_REUSE_MATRIX) { 639 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 640 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 641 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 642 ca = ckok->a_dual.view_device(); 643 ci = ckok->i_dual.view_device(); 644 645 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 646 PetscInt i = t.league_rank(); /* row i */ 647 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 648 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 649 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 650 else ca(ci(i)+k) = ba(bi(i)+k-alen); 651 }); 652 }); 653 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 654 } 655 PetscFunctionReturn(0); 656 } 657 658 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 659 { 660 PetscFunctionBegin; 661 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 662 PetscFunctionReturn(0); 663 } 664 665 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 666 { 667 PetscErrorCode ierr; 668 Mat_Product *product = C->product; 669 Mat A,B; 670 bool transA,transB; /* use bool, since KK needs this type */ 671 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 672 Mat_SeqAIJ *c; 673 MatProductData_SeqAIJKokkos *pdata; 674 KokkosCsrMatrix *csrmatA,*csrmatB; 675 676 PetscFunctionBegin; 677 MatCheckProduct(C,1); 678 PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 679 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 680 681 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 682 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 683 PetscFunctionReturn(0); 684 } 685 686 switch (product->type) { 687 case MATPRODUCT_AB: transA = false; transB = false; break; 688 case MATPRODUCT_AtB: transA = true; transB = false; break; 689 case MATPRODUCT_ABt: transA = false; transB = true; break; 690 default: 691 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 692 } 693 694 A = product->A; 695 B = product->B; 696 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 697 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 698 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 699 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 700 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 701 702 PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 703 704 csrmatA = &akok->csrmat; 705 csrmatB = &bkok->csrmat; 706 707 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 708 if (transA) { 709 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 710 transA = false; 711 } 712 713 if (transB) { 714 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 715 transB = false; 716 } 717 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 718 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 719 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 720 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 721 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 722 /* shorter version of MatAssemblyEnd_SeqAIJ */ 723 c = (Mat_SeqAIJ*)C->data; 724 ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 725 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 726 ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 727 c->reallocs = 0; 728 C->info.mallocs = 0; 729 C->info.nz_unneeded = 0; 730 C->assembled = C->was_assembled = PETSC_TRUE; 731 C->num_ass++; 732 PetscFunctionReturn(0); 733 } 734 735 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 736 { 737 PetscErrorCode ierr; 738 Mat_Product *product = C->product; 739 MatProductType ptype; 740 Mat A,B; 741 bool transA,transB; 742 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 743 MatProductData_SeqAIJKokkos *pdata; 744 MPI_Comm comm; 745 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 746 747 PetscFunctionBegin; 748 MatCheckProduct(C,1); 749 ierr = PetscObjectGetComm((PetscObject)C,&comm); 750 PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 751 A = product->A; 752 B = product->B; 753 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 754 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 755 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 756 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 757 csrmatA = &akok->csrmat; 758 csrmatB = &bkok->csrmat; 759 760 ptype = product->type; 761 switch (ptype) { 762 case MATPRODUCT_AB: transA = false; transB = false; break; 763 case MATPRODUCT_AtB: transA = true; transB = false; break; 764 case MATPRODUCT_ABt: transA = false; transB = true; break; 765 default: 766 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 767 } 768 769 product->data = pdata = new MatProductData_SeqAIJKokkos(); 770 pdata->kh.set_team_work_size(16); 771 pdata->kh.set_dynamic_scheduling(true); 772 pdata->reusesym = product->api_user; 773 774 /* TODO: add command line options to select spgemm algorithms */ 775 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 776 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 777 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 778 */ 779 pdata->kh.create_spgemm_handle(spgemm_alg); 780 781 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 782 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 783 if (transA) { 784 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 785 transA = false; 786 } 787 788 if (transB) { 789 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 790 transB = false; 791 } 792 793 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 794 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 795 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 796 calling new Mat_SeqAIJKokkos(). 797 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 798 */ 799 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 800 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 801 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 802 803 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 804 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 805 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 806 PetscFunctionReturn(0); 807 } 808 809 /* handles sparse matrix matrix ops */ 810 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 811 { 812 PetscErrorCode ierr; 813 Mat_Product *product = mat->product; 814 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 815 816 PetscFunctionBegin; 817 MatCheckProduct(mat,1); 818 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 819 if (product->type == MATPRODUCT_ABC) { 820 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 821 } 822 if (Biskok && Ciskok) { 823 switch (product->type) { 824 case MATPRODUCT_AB: 825 case MATPRODUCT_AtB: 826 case MATPRODUCT_ABt: 827 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 828 break; 829 case MATPRODUCT_PtAP: 830 case MATPRODUCT_RARt: 831 case MATPRODUCT_ABC: 832 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 833 break; 834 default: 835 break; 836 } 837 } else { /* fallback for AIJ */ 838 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 839 } 840 PetscFunctionReturn(0); 841 } 842 843 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 844 { 845 PetscErrorCode ierr; 846 Mat_SeqAIJKokkos *aijkok; 847 848 PetscFunctionBegin; 849 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 850 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 851 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 852 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 853 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 854 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 855 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 856 PetscFunctionReturn(0); 857 } 858 859 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 860 { 861 PetscErrorCode ierr; 862 Mat_SeqAIJKokkos *aijkok; 863 864 PetscFunctionBegin; 865 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 866 if (aijkok) { /* Only zero the device if data is already there */ 867 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 868 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 869 } else { /* Might be preallocated but not assembled */ 870 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 871 } 872 PetscFunctionReturn(0); 873 } 874 875 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 876 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 877 { 878 PetscErrorCode ierr; 879 Mat_SeqAIJKokkos *aijkok; 880 881 PetscFunctionBegin; 882 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 883 PetscValidPointer(kv,2); 884 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 885 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 886 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 887 *kv = aijkok->a_dual.view_device(); 888 PetscFunctionReturn(0); 889 } 890 891 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 892 { 893 PetscFunctionBegin; 894 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 895 PetscValidPointer(kv,2); 896 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 897 PetscFunctionReturn(0); 898 } 899 900 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 901 { 902 PetscErrorCode ierr; 903 Mat_SeqAIJKokkos *aijkok; 904 905 PetscFunctionBegin; 906 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 907 PetscValidPointer(kv,2); 908 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 909 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 910 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 911 *kv = aijkok->a_dual.view_device(); 912 PetscFunctionReturn(0); 913 } 914 915 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 916 { 917 PetscErrorCode ierr; 918 919 PetscFunctionBegin; 920 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 921 PetscValidPointer(kv,2); 922 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 923 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 924 PetscFunctionReturn(0); 925 } 926 927 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 928 { 929 Mat_SeqAIJKokkos *aijkok; 930 931 PetscFunctionBegin; 932 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 933 PetscValidPointer(kv,2); 934 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 935 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 936 *kv = aijkok->a_dual.view_device(); 937 PetscFunctionReturn(0); 938 } 939 940 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 941 { 942 PetscErrorCode ierr; 943 944 PetscFunctionBegin; 945 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 946 PetscValidPointer(kv,2); 947 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 948 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 949 PetscFunctionReturn(0); 950 } 951 952 /* Computes Y += alpha X */ 953 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 954 { 955 PetscErrorCode ierr; 956 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 957 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 958 ConstMatScalarKokkosView Xa; 959 MatScalarKokkosView Ya; 960 961 PetscFunctionBegin; 962 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 963 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 964 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 965 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 966 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 967 968 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 969 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 970 PetscBool e; 971 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 972 if (e) { 973 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 974 if (e) pattern = SAME_NONZERO_PATTERN; 975 } 976 } 977 978 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 979 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 980 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 981 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 982 */ 983 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 984 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 985 Xa = xkok->a_dual.view_device(); 986 Ya = ykok->a_dual.view_device(); 987 988 if (pattern == SAME_NONZERO_PATTERN) { 989 KokkosBlas::axpy(alpha,Xa,Ya); 990 ierr = MatSeqAIJKokkosModifyDevice(Y); 991 } else if (pattern == SUBSET_NONZERO_PATTERN) { 992 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 993 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 994 995 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 996 PetscInt i = t.league_rank(); /* row i */ 997 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 998 PetscInt p,q = Yi(i); 999 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1000 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1001 if (Xj(p) == Yj(q)) { /* Found it */ 1002 Ya(q) += alpha * Xa(p); 1003 q++; 1004 } else { 1005 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1006 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1007 */ 1008 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1009 } 1010 } 1011 }); 1012 }); 1013 ierr = MatSeqAIJKokkosModifyDevice(Y); 1014 } else { /* different nonzero patterns */ 1015 Mat Z; 1016 KokkosCsrMatrix zcsr; 1017 KernelHandle kh; 1018 kh.create_spadd_handle(false); 1019 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1020 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1021 zkok = new Mat_SeqAIJKokkos(zcsr); 1022 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1023 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1024 kh.destroy_spadd_handle(); 1025 } 1026 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1027 ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1028 PetscFunctionReturn(0); 1029 } 1030 1031 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1032 { 1033 PetscErrorCode ierr; 1034 Mat newmat; 1035 Mat_SeqAIJKokkos *akok; 1036 Mat_SeqAIJ *aseq; 1037 1038 PetscFunctionBegin; 1039 ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr); 1040 ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr); 1041 ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr); 1042 ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 1043 ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 1044 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 1045 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1046 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1047 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1048 akok->SetUpCOO(aseq); 1049 PetscFunctionReturn(0); 1050 } 1051 1052 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1053 { 1054 PetscErrorCode ierr; 1055 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1056 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1057 PetscCount Annz = aseq->nz; 1058 const PetscCountKokkosView& jmap = akok->jmap_d; 1059 const PetscCountKokkosView& perm = akok->perm_d; 1060 MatScalarKokkosView Aa; 1061 ConstMatScalarKokkosView kv; 1062 PetscMemType memtype; 1063 1064 PetscFunctionBegin; 1065 PetscAssert(A->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()"); 1066 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 1067 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1068 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1069 } else { 1070 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1071 } 1072 1073 if (imode == INSERT_VALUES) {ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} /* write matrix values */ 1074 else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */ 1075 1076 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1077 PetscScalar sum = 0.0; 1078 for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k)); 1079 Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 1080 }); 1081 1082 if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} 1083 else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);} 1084 PetscFunctionReturn(0); 1085 } 1086 1087 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1088 { 1089 PetscErrorCode ierr; 1090 1091 PetscFunctionBegin; 1092 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1093 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1094 B->offloadmask = PETSC_OFFLOAD_CPU; 1095 PetscFunctionReturn(0); 1096 } 1097 1098 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1099 { 1100 PetscErrorCode ierr; 1101 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1102 1103 PetscFunctionBegin; 1104 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1105 A->boundtocpu = PETSC_FALSE; 1106 1107 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1108 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1109 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1110 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1111 A->ops->scale = MatScale_SeqAIJKokkos; 1112 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1113 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1114 A->ops->mult = MatMult_SeqAIJKokkos; 1115 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1116 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1117 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1118 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1119 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1120 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1121 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1122 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1123 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1124 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1125 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1126 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1127 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1128 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1129 1130 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 1131 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1132 PetscFunctionReturn(0); 1133 } 1134 1135 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1136 { 1137 PetscErrorCode ierr; 1138 Mat_SeqAIJ *aseq; 1139 PetscInt i,m,n; 1140 1141 PetscFunctionBegin; 1142 PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1143 1144 m = akok->nrows(); 1145 n = akok->ncols(); 1146 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1147 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1148 1149 /* Set up data structures of A as a MATSEQAIJ */ 1150 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1151 aseq = (Mat_SeqAIJ*)(A)->data; 1152 1153 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1154 akok->j_dual.sync_host(); 1155 1156 aseq->i = akok->i_host_data(); 1157 aseq->j = akok->j_host_data(); 1158 aseq->a = akok->a_host_data(); 1159 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1160 aseq->singlemalloc = PETSC_FALSE; 1161 aseq->free_a = PETSC_FALSE; 1162 aseq->free_ij = PETSC_FALSE; 1163 aseq->nz = akok->nnz(); 1164 aseq->maxnz = aseq->nz; 1165 1166 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1167 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1168 for (i=0; i<m; i++) { 1169 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1170 } 1171 1172 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1173 akok->nonzerostate = A->nonzerostate; 1174 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1175 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1176 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1177 PetscFunctionReturn(0); 1178 } 1179 1180 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1181 1182 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1183 */ 1184 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1185 { 1186 PetscErrorCode ierr; 1187 1188 PetscFunctionBegin; 1189 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1190 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1191 PetscFunctionReturn(0); 1192 } 1193 1194 /* --------------------------------------------------------------------------------*/ 1195 /*@C 1196 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1197 (the default parallel PETSc format). This matrix will ultimately be handled by 1198 Kokkos for calculations. For good matrix 1199 assembly performance the user should preallocate the matrix storage by setting 1200 the parameter nz (or the array nnz). By setting these parameters accurately, 1201 performance during matrix assembly can be increased by more than a factor of 50. 1202 1203 Collective 1204 1205 Input Parameters: 1206 + comm - MPI communicator, set to PETSC_COMM_SELF 1207 . m - number of rows 1208 . n - number of columns 1209 . nz - number of nonzeros per row (same for all rows) 1210 - nnz - array containing the number of nonzeros in the various rows 1211 (possibly different for each row) or NULL 1212 1213 Output Parameter: 1214 . A - the matrix 1215 1216 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1217 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1218 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1219 1220 Notes: 1221 If nnz is given then nz is ignored 1222 1223 The AIJ format (also called the Yale sparse matrix format or 1224 compressed row storage), is fully compatible with standard Fortran 77 1225 storage. That is, the stored row and column indices can begin at 1226 either one (as in Fortran) or zero. See the users' manual for details. 1227 1228 Specify the preallocated storage with either nz or nnz (not both). 1229 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1230 allocation. For large problems you MUST preallocate memory or you 1231 will get TERRIBLE performance, see the users' manual chapter on matrices. 1232 1233 By default, this format uses inodes (identical nodes) when possible, to 1234 improve numerical efficiency of matrix-vector products and solves. We 1235 search for consecutive rows with the same nonzero structure, thereby 1236 reusing matrix information to achieve increased efficiency. 1237 1238 Level: intermediate 1239 1240 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1241 @*/ 1242 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1243 { 1244 PetscErrorCode ierr; 1245 1246 PetscFunctionBegin; 1247 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1248 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1249 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1250 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1251 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1252 PetscFunctionReturn(0); 1253 } 1254 1255 typedef Kokkos::TeamPolicy<>::member_type team_member; 1256 // 1257 // This factorization exploits block diagonal matrices with "Nf" (not used). 1258 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1259 // 1260 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1261 { 1262 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1263 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1264 IS isrow = b->row,isicol = b->icol; 1265 PetscErrorCode ierr; 1266 const PetscInt *r_h,*ic_h; 1267 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1268 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1269 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1270 PetscBool row_identity,col_identity; 1271 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1272 1273 PetscFunctionBegin; 1274 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1275 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1276 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1277 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1278 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1279 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1280 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1281 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1282 { 1283 #define KOKKOS_SHARED_LEVEL 1 1284 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1285 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1286 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1287 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1288 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1289 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1290 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1291 size_t flops_h = 0.0; 1292 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1293 Kokkos::View<size_t> d_flops_k ("flops"); 1294 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1295 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1296 Kokkos::deep_copy (d_flops_k, h_flops_k); 1297 Kokkos::deep_copy (d_r_k, h_r_k); 1298 Kokkos::deep_copy (d_ic_k, h_ic_k); 1299 // Fill A --> fact 1300 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1301 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1302 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1303 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1304 // zero rows of B 1305 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1306 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1307 PetscScalar *baL = ba_d + bi_d[rowb]; 1308 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1309 /* zero (unfactored row) */ 1310 for (int j=0;j<nzbL;j++) baL[j] = 0; 1311 for (int j=0;j<nzbU;j++) baU[j] = 0; 1312 }); 1313 // copy A into B 1314 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1315 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1316 const PetscScalar *av = aa_d + ai_d[rowa]; 1317 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1318 /* load in initial (unfactored row) */ 1319 for (int j=0;j<nza;j++) { 1320 PetscInt colb = ic[ajtmp[j]]; 1321 PetscScalar vala = av[j]; 1322 if (colb == rowb) { 1323 *(ba_d + bdiag_d[rowb]) = vala; 1324 } else { 1325 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1326 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1327 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1328 for (int j=0; j<nz ; j++) { 1329 if (pbj[j] == colb) { 1330 pba[j] = vala; 1331 set++; 1332 break; 1333 } 1334 } 1335 #if !defined(PETSC_HAVE_SYCL) 1336 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1337 #endif 1338 } 1339 } 1340 }); 1341 }); 1342 Kokkos::fence(); 1343 1344 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1345 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1346 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1347 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1348 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1349 const PetscInt start = field*nloc, end = start + nloc; 1350 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1351 // A22 panel update for each row A(1,:) and col A(:,1) 1352 for (int ii=start; ii<end-1; ii++) { 1353 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1354 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1355 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1356 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1357 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1358 PetscInt kIdx = j*Ni + field_block_idx; 1359 if (kIdx >= nzUi) /* void */ ; 1360 else { 1361 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1362 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1363 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1364 size_t st_idx; 1365 // find and do L(k,i) = A(:k,i) / A(i,i) 1366 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1367 // get column, there has got to be a better way 1368 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1369 if (pjL[j] == ii) { 1370 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1371 idx = j; // output 1372 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1373 } 1374 }, st_idx); 1375 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1376 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1377 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1378 #endif 1379 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1380 // U(i+1,:end) 1381 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1382 PetscScalar Uij = baUi[uiIdx]; 1383 PetscInt col = bjUi[uiIdx]; 1384 if (col==myk) { 1385 // A_kk = A_kk - L_ki * U_ij(k) 1386 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1387 *Akkv = *Akkv - L_ki() * Uij; // UiK 1388 } else { 1389 PetscScalar *start, *end, *pAkjv=NULL; 1390 PetscInt high, low; 1391 const PetscInt *startj; 1392 if (col<myk) { // L 1393 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1394 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1395 start = pLki+1; // start at pLki+1, A22(myk,1) 1396 startj= bj_d + bi_d[myk] + idx; 1397 end = ba_d + bi_d[myk+1]; 1398 } else { 1399 PetscInt idx = bdiag_d[myk+1]+1; 1400 start = ba_d + idx; 1401 startj= bj_d + idx; 1402 end = ba_d + bdiag_d[myk]; 1403 } 1404 // search for 'col', use bisection search - TODO 1405 low = 0; 1406 high = (PetscInt)(end-start); 1407 while (high-low > 5) { 1408 int t = (low+high)/2; 1409 if (startj[t] > col) high = t; 1410 else low = t; 1411 } 1412 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1413 if (startj[pAkjv-start] == col) break; 1414 } 1415 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1416 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1417 #endif 1418 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1419 } 1420 }); 1421 } 1422 }); 1423 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1424 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1425 } /* endof for (i=0; i<n; i++) { */ 1426 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1427 }); 1428 Kokkos::fence(); 1429 Kokkos::deep_copy (h_flops_k, d_flops_k); 1430 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1431 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1432 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1433 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1434 /* Invert diagonal for simpler triangular solves */ 1435 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1436 int i = start + outer_index*Ni + lg_rank%Ni; 1437 if (i < end) { 1438 PetscScalar *pv = ba_d + bdiag_d[i]; 1439 *pv = 1.0/(*pv); 1440 } 1441 }); 1442 }); 1443 } 1444 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1445 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1446 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1447 1448 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1449 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1450 if (b->inode.size) { 1451 B->ops->solve = MatSolve_SeqAIJ_Inode; 1452 } else if (row_identity && col_identity) { 1453 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1454 } else { 1455 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1456 } 1457 B->offloadmask = PETSC_OFFLOAD_GPU; 1458 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1459 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1460 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1461 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1462 B->ops->matsolve = MatMatSolve_SeqAIJ; 1463 B->assembled = PETSC_TRUE; 1464 B->preallocated = PETSC_TRUE; 1465 1466 PetscFunctionReturn(0); 1467 } 1468 1469 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1470 { 1471 PetscErrorCode ierr; 1472 1473 PetscFunctionBegin; 1474 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1475 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1476 PetscFunctionReturn(0); 1477 } 1478 1479 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1480 { 1481 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1482 1483 PetscFunctionBegin; 1484 if (!factors->sptrsv_symbolic_completed) { 1485 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1486 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1487 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1488 } 1489 PetscFunctionReturn(0); 1490 } 1491 1492 /* Check if we need to update factors etc for transpose solve */ 1493 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1494 { 1495 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1496 MatColIdxType n = A->rmap->n; 1497 1498 PetscFunctionBegin; 1499 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1500 /* Update L^T and do sptrsv symbolic */ 1501 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1502 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1503 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1504 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1505 1506 KokkosKernels::Impl::transpose_matrix< 1507 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1508 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1509 MatRowMapKokkosView,DefaultExecutionSpace>( 1510 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1511 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1512 1513 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1514 We have to sort the indices, until KK provides finer control options. 1515 */ 1516 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1517 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1518 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1519 1520 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1521 1522 /* Update U^T and do sptrsv symbolic */ 1523 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1524 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1525 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1526 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1527 1528 KokkosKernels::Impl::transpose_matrix< 1529 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1530 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1531 MatRowMapKokkosView,DefaultExecutionSpace>( 1532 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1533 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1534 1535 /* Sort indices. See comments above */ 1536 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1537 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1538 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1539 1540 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1541 factors->transpose_updated = PETSC_TRUE; 1542 } 1543 PetscFunctionReturn(0); 1544 } 1545 1546 /* Solve Ax = b, with A = LU */ 1547 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1548 { 1549 PetscErrorCode ierr; 1550 ConstPetscScalarKokkosView bv; 1551 PetscScalarKokkosView xv; 1552 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1553 1554 PetscFunctionBegin; 1555 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1556 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1557 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1558 ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1559 /* Solve L tmpv = b */ 1560 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1561 /* Solve Ux = tmpv */ 1562 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1563 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1564 ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1565 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1566 PetscFunctionReturn(0); 1567 } 1568 1569 /* Solve A^T x = b, where A^T = U^T L^T */ 1570 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1571 { 1572 PetscErrorCode ierr; 1573 ConstPetscScalarKokkosView bv; 1574 PetscScalarKokkosView xv; 1575 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1576 1577 PetscFunctionBegin; 1578 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1579 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1580 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1581 ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1582 /* Solve U^T tmpv = b */ 1583 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1584 1585 /* Solve L^T x = tmpv */ 1586 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1587 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1588 ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1589 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1590 PetscFunctionReturn(0); 1591 } 1592 1593 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1594 { 1595 PetscErrorCode ierr; 1596 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1597 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1598 PetscInt fill_lev = info->levels; 1599 1600 PetscFunctionBegin; 1601 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1602 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1603 1604 auto a_d = aijkok->a_dual.view_device(); 1605 auto i_d = aijkok->i_dual.view_device(); 1606 auto j_d = aijkok->j_dual.view_device(); 1607 1608 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1609 1610 B->assembled = PETSC_TRUE; 1611 B->preallocated = PETSC_TRUE; 1612 B->ops->solve = MatSolve_SeqAIJKokkos; 1613 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1614 B->ops->matsolve = NULL; 1615 B->ops->matsolvetranspose = NULL; 1616 B->offloadmask = PETSC_OFFLOAD_GPU; 1617 1618 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1619 factors->transpose_updated = PETSC_FALSE; 1620 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1621 /* TODO: log flops, but how to know that? */ 1622 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1623 PetscFunctionReturn(0); 1624 } 1625 1626 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1627 { 1628 PetscErrorCode ierr; 1629 Mat_SeqAIJKokkos *aijkok; 1630 Mat_SeqAIJ *b; 1631 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1632 PetscInt fill_lev = info->levels; 1633 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1634 PetscInt n = A->rmap->n; 1635 1636 PetscFunctionBegin; 1637 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1638 /* Rebuild factors */ 1639 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1640 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1641 1642 /* Create a spiluk handle and then do symbolic factorization */ 1643 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1644 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1645 1646 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1647 1648 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1649 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1650 Kokkos::realloc(factors->iU_d,n+1); 1651 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1652 1653 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1654 auto i_d = aijkok->i_dual.view_device(); 1655 auto j_d = aijkok->j_dual.view_device(); 1656 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1657 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1658 1659 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1660 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1661 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1662 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1663 1664 /* TODO: add options to select sptrsv algorithms */ 1665 /* Create sptrsv handles for L, U and their transpose */ 1666 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1667 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1668 #else 1669 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1670 #endif 1671 1672 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1673 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1674 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1675 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1676 1677 /* Fill fields of the factor matrix B */ 1678 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1679 b = (Mat_SeqAIJ*)B->data; 1680 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1681 B->info.fill_ratio_given = info->fill; 1682 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1683 1684 B->offloadmask = PETSC_OFFLOAD_GPU; 1685 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1686 PetscFunctionReturn(0); 1687 } 1688 1689 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1690 { 1691 PetscErrorCode ierr; 1692 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1693 const PetscInt nrows = A->rmap->n; 1694 1695 PetscFunctionBegin; 1696 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1697 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1698 // move B data into Kokkos 1699 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1700 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1701 { 1702 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1703 if (!baijkok->diag_d.extent(0)) { 1704 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1705 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1706 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1707 } 1708 } 1709 PetscFunctionReturn(0); 1710 } 1711 1712 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1713 { 1714 PetscFunctionBegin; 1715 *type = MATSOLVERKOKKOS; 1716 PetscFunctionReturn(0); 1717 } 1718 1719 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1720 { 1721 PetscFunctionBegin; 1722 *type = MATSOLVERKOKKOSDEVICE; 1723 PetscFunctionReturn(0); 1724 } 1725 1726 /*MC 1727 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1728 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1729 1730 Level: beginner 1731 1732 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1733 M*/ 1734 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1735 { 1736 PetscErrorCode ierr; 1737 PetscInt n = A->rmap->n; 1738 1739 PetscFunctionBegin; 1740 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1741 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1742 (*B)->factortype = ftype; 1743 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1744 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1745 1746 if (ftype == MAT_FACTOR_LU) { 1747 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1748 (*B)->canuseordering = PETSC_TRUE; 1749 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1750 } else if (ftype == MAT_FACTOR_ILU) { 1751 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1752 (*B)->canuseordering = PETSC_FALSE; 1753 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1754 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1755 1756 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1757 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1758 PetscFunctionReturn(0); 1759 } 1760 1761 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1762 { 1763 PetscErrorCode ierr; 1764 PetscInt n = A->rmap->n; 1765 1766 PetscFunctionBegin; 1767 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1768 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1769 (*B)->factortype = ftype; 1770 (*B)->canuseordering = PETSC_TRUE; 1771 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1772 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1773 1774 if (ftype == MAT_FACTOR_LU) { 1775 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1776 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1777 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1778 1779 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1780 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1781 PetscFunctionReturn(0); 1782 } 1783 1784 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1785 { 1786 PetscErrorCode ierr; 1787 1788 PetscFunctionBegin; 1789 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1790 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1791 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1792 PetscFunctionReturn(0); 1793 } 1794 1795 /* Utility to print out a KokkosCsrMatrix for debugging */ 1796 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1797 { 1798 PetscErrorCode ierr; 1799 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1800 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1801 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1802 const PetscInt *i = iv.data(); 1803 const PetscInt *j = jv.data(); 1804 const PetscScalar *a = av.data(); 1805 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1806 1807 PetscFunctionBegin; 1808 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1809 for (PetscInt k=0; k<m; k++) { 1810 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1811 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1812 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1813 } 1814 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1815 } 1816 PetscFunctionReturn(0); 1817 } 1818