1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64 if (aijkok->a_dual.need_sync_device()) { 65 aijkok->a_dual.sync_device(); 66 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 67 aijkok->hermitian_updated = PETSC_FALSE; 68 } 69 PetscFunctionReturn(0); 70 } 71 72 /* Mark the CSR data on device as modified */ 73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 74 { 75 PetscErrorCode ierr; 76 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 77 78 PetscFunctionBegin; 79 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 80 aijkok->a_dual.clear_sync_state(); 81 aijkok->a_dual.modify_device(); 82 aijkok->transpose_updated = PETSC_FALSE; 83 aijkok->hermitian_updated = PETSC_FALSE; 84 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 85 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 86 PetscFunctionReturn(0); 87 } 88 89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 90 { 91 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 92 93 PetscFunctionBegin; 94 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 95 /* We do not expect one needs factors on host */ 96 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 97 PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 98 aijkok->a_dual.sync_host(); 99 PetscFunctionReturn(0); 100 } 101 102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 103 { 104 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 105 106 PetscFunctionBegin; 107 if (aijkok) { 108 aijkok->a_dual.sync_host(); 109 *array = aijkok->a_dual.view_host().data(); 110 } else { /* Happens when calling MatSetValues on a newly created matrix */ 111 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 112 } 113 PetscFunctionReturn(0); 114 } 115 116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 117 { 118 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119 120 PetscFunctionBegin; 121 if (aijkok) aijkok->a_dual.modify_host(); 122 PetscFunctionReturn(0); 123 } 124 125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 126 { 127 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 128 129 PetscFunctionBegin; 130 if (aijkok) { 131 aijkok->a_dual.sync_host(); 132 *array = aijkok->a_dual.view_host().data(); 133 } else { 134 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 135 } 136 PetscFunctionReturn(0); 137 } 138 139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 140 { 141 PetscFunctionBegin; 142 *array = NULL; 143 PetscFunctionReturn(0); 144 } 145 146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 147 { 148 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 149 150 PetscFunctionBegin; 151 if (aijkok) { 152 *array = aijkok->a_dual.view_host().data(); 153 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 154 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 155 } 156 PetscFunctionReturn(0); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 160 { 161 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 162 163 PetscFunctionBegin; 164 if (aijkok) { 165 aijkok->a_dual.clear_sync_state(); 166 aijkok->a_dual.modify_host(); 167 } 168 PetscFunctionReturn(0); 169 } 170 171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 175 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 176 177 PetscFunctionBegin; 178 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 179 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 180 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 181 PetscFunctionReturn(0); 182 } 183 184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 186 { 187 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188 189 PetscFunctionBegin; 190 if (aijkok && aijkok->device_mat_d.data()) { 191 *d_mat = aijkok->device_mat_d.data(); 192 } else { 193 PetscErrorCode ierr; 194 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 195 *d_mat = NULL; 196 } 197 PetscFunctionReturn(0); 198 } 199 200 /* Generate the transpose on device and cache it internally */ 201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 202 { 203 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 204 205 PetscFunctionBegin; 206 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 208 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 209 CHKERRCXX(aijkok->a_dual.sync_device()); 210 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 211 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 212 aijkok->transpose_updated = PETSC_TRUE; 213 } 214 *csrmatT = &aijkok->csrmatT; 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the Hermitian on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 220 { 221 PetscErrorCode ierr; 222 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 223 224 PetscFunctionBegin; 225 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 226 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 227 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 228 CHKERRCXX(aijkok->a_dual.sync_device()); 229 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 230 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 231 #if defined(PETSC_USE_COMPLEX) 232 const auto& a = aijkok->csrmatH.values; 233 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 234 #endif 235 aijkok->hermitian_updated = PETSC_TRUE; 236 } 237 *csrmatH = &aijkok->csrmatH; 238 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 /* y = A x */ 243 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 244 { 245 PetscErrorCode ierr; 246 Mat_SeqAIJKokkos *aijkok; 247 ConstPetscScalarKokkosView xv; 248 PetscScalarKokkosView yv; 249 250 PetscFunctionBegin; 251 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 252 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 253 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 254 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 255 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 256 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 257 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 258 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 259 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 260 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 261 PetscFunctionReturn(0); 262 } 263 264 /* y = A^T x */ 265 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 266 { 267 PetscErrorCode ierr; 268 Mat_SeqAIJKokkos *aijkok; 269 const char *mode; 270 ConstPetscScalarKokkosView xv; 271 PetscScalarKokkosView yv; 272 KokkosCsrMatrix *csrmat; 273 274 PetscFunctionBegin; 275 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 276 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 277 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 278 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 279 if (A->form_explicit_transpose) { 280 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 281 mode = "N"; 282 } else { 283 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 284 csrmat = &aijkok->csrmat; 285 mode = "T"; 286 } 287 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 288 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 289 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 290 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 291 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 292 PetscFunctionReturn(0); 293 } 294 295 /* y = A^H x */ 296 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 297 { 298 PetscErrorCode ierr; 299 Mat_SeqAIJKokkos *aijkok; 300 const char *mode; 301 ConstPetscScalarKokkosView xv; 302 PetscScalarKokkosView yv; 303 KokkosCsrMatrix *csrmat; 304 305 PetscFunctionBegin; 306 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 307 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 308 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 309 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 310 if (A->form_explicit_transpose) { 311 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 312 mode = "N"; 313 } else { 314 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 315 csrmat = &aijkok->csrmat; 316 mode = "C"; 317 } 318 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 319 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 320 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 321 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 322 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 323 PetscFunctionReturn(0); 324 } 325 326 /* z = A x + y */ 327 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 328 { 329 PetscErrorCode ierr; 330 Mat_SeqAIJKokkos *aijkok; 331 ConstPetscScalarKokkosView xv,yv; 332 PetscScalarKokkosView zv; 333 334 PetscFunctionBegin; 335 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 336 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 337 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 338 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 339 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 340 if (zz != yy) Kokkos::deep_copy(zv,yv); 341 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 342 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 343 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 344 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 346 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 347 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 348 PetscFunctionReturn(0); 349 } 350 351 /* z = A^T x + y */ 352 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 353 { 354 PetscErrorCode ierr; 355 Mat_SeqAIJKokkos *aijkok; 356 const char *mode; 357 ConstPetscScalarKokkosView xv,yv; 358 PetscScalarKokkosView zv; 359 KokkosCsrMatrix *csrmat; 360 361 PetscFunctionBegin; 362 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 363 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 364 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 365 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 366 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 367 if (zz != yy) Kokkos::deep_copy(zv,yv); 368 if (A->form_explicit_transpose) { 369 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 370 mode = "N"; 371 } else { 372 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 373 csrmat = &aijkok->csrmat; 374 mode = "T"; 375 } 376 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 377 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 378 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 379 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 380 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 381 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 382 PetscFunctionReturn(0); 383 } 384 385 /* z = A^H x + y */ 386 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 387 { 388 PetscErrorCode ierr; 389 Mat_SeqAIJKokkos *aijkok; 390 const char *mode; 391 ConstPetscScalarKokkosView xv,yv; 392 PetscScalarKokkosView zv; 393 KokkosCsrMatrix *csrmat; 394 395 PetscFunctionBegin; 396 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 397 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 398 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 399 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 400 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 401 if (zz != yy) Kokkos::deep_copy(zv,yv); 402 if (A->form_explicit_transpose) { 403 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 404 mode = "N"; 405 } else { 406 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 407 csrmat = &aijkok->csrmat; 408 mode = "C"; 409 } 410 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 411 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 412 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 413 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 414 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 415 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 416 PetscFunctionReturn(0); 417 } 418 419 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 420 { 421 PetscErrorCode ierr; 422 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 423 424 PetscFunctionBegin; 425 switch (op) { 426 case MAT_FORM_EXPLICIT_TRANSPOSE: 427 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 428 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 429 A->form_explicit_transpose = flg; 430 break; 431 default: 432 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 433 break; 434 } 435 PetscFunctionReturn(0); 436 } 437 438 /* Depending on reuse, either build a new mat, or use the existing mat */ 439 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 440 { 441 PetscErrorCode ierr; 442 Mat_SeqAIJ *aseq; 443 444 PetscFunctionBegin; 445 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 446 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 447 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 448 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 449 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 450 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 451 PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 452 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 453 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 454 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 455 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 456 aseq = static_cast<Mat_SeqAIJ*>(A->data); 457 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 458 PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 459 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 460 } 461 } 462 PetscFunctionReturn(0); 463 } 464 465 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 466 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 467 */ 468 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 469 { 470 PetscErrorCode ierr; 471 Mat_SeqAIJ *bseq; 472 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 473 Mat mat; 474 475 PetscFunctionBegin; 476 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 477 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 478 mat = *B; 479 if (A->assembled) { 480 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 481 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 482 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 483 /* Now copy values to B if needed */ 484 if (dupOption == MAT_COPY_VALUES) { 485 if (akok->a_dual.need_sync_device()) { 486 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 487 bkok->a_dual.modify_host(); 488 } else { /* If device has the latest data, we only copy data on device */ 489 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 490 bkok->a_dual.modify_device(); 491 } 492 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 493 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 494 bkok->a_dual.modify_host(); 495 } 496 mat->spptr = bkok; 497 } 498 499 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 500 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 501 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 502 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 503 PetscFunctionReturn(0); 504 } 505 506 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 507 { 508 PetscErrorCode ierr; 509 Mat At; 510 KokkosCsrMatrix *internT; 511 Mat_SeqAIJKokkos *atkok,*bkok; 512 513 PetscFunctionBegin; 514 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 515 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 516 /* Deep copy internT, as we want to isolate the internal transpose */ 517 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 518 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 519 if (reuse == MAT_INITIAL_MATRIX) *B = At; 520 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 521 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 522 if ((*B)->assembled) { 523 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 524 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 525 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 526 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 527 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 528 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 529 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 530 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 531 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 532 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 533 } 534 PetscFunctionReturn(0); 535 } 536 537 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 538 { 539 PetscErrorCode ierr; 540 Mat_SeqAIJKokkos *aijkok; 541 542 PetscFunctionBegin; 543 if (A->factortype == MAT_FACTOR_NONE) { 544 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 545 delete aijkok; 546 } else { 547 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 548 } 549 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 552 A->spptr = NULL; 553 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 554 PetscFunctionReturn(0); 555 } 556 557 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 558 { 559 PetscErrorCode ierr; 560 561 PetscFunctionBegin; 562 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 563 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 564 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 565 PetscFunctionReturn(0); 566 } 567 568 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 569 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 570 { 571 PetscErrorCode ierr; 572 Mat_SeqAIJ *a,*b; 573 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 574 MatScalarKokkosView aa,ba,ca; 575 MatRowMapKokkosView ai,bi,ci; 576 MatColIdxKokkosView aj,bj,cj; 577 PetscInt m,n,nnz,aN; 578 579 PetscFunctionBegin; 580 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 581 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 582 PetscValidPointer(C,4); 583 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 584 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 585 PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 586 PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 587 588 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 589 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 590 a = static_cast<Mat_SeqAIJ*>(A->data); 591 b = static_cast<Mat_SeqAIJ*>(B->data); 592 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 593 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 594 aa = akok->a_dual.view_device(); 595 ai = akok->i_dual.view_device(); 596 ba = bkok->a_dual.view_device(); 597 bi = bkok->i_dual.view_device(); 598 m = A->rmap->n; /* M, N and nnz of C */ 599 n = A->cmap->n + B->cmap->n; 600 nnz = a->nz + b->nz; 601 aN = A->cmap->n; /* N of A */ 602 if (reuse == MAT_INITIAL_MATRIX) { 603 aj = akok->j_dual.view_device(); 604 bj = bkok->j_dual.view_device(); 605 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 606 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 607 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 608 ca = ca_dual.view_device(); 609 ci = ci_dual.view_device(); 610 cj = cj_dual.view_device(); 611 612 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 613 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 614 PetscInt i = t.league_rank(); /* row i */ 615 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 616 617 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 618 ci(i) = coffset; 619 if (i == m-1) ci(m) = ai(m) + bi(m); 620 }); 621 622 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 623 if (k < alen) { 624 ca(coffset+k) = aa(ai(i)+k); 625 cj(coffset+k) = aj(ai(i)+k); 626 } else { 627 ca(coffset+k) = ba(bi(i)+k-alen); 628 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 629 } 630 }); 631 }); 632 ca_dual.modify_device(); 633 ci_dual.modify_device(); 634 cj_dual.modify_device(); 635 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 636 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 637 } else if (reuse == MAT_REUSE_MATRIX) { 638 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 639 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 640 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 641 ca = ckok->a_dual.view_device(); 642 ci = ckok->i_dual.view_device(); 643 644 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 645 PetscInt i = t.league_rank(); /* row i */ 646 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 647 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 648 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 649 else ca(ci(i)+k) = ba(bi(i)+k-alen); 650 }); 651 }); 652 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 653 } 654 PetscFunctionReturn(0); 655 } 656 657 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 658 { 659 PetscFunctionBegin; 660 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 661 PetscFunctionReturn(0); 662 } 663 664 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 665 { 666 PetscErrorCode ierr; 667 Mat_Product *product = C->product; 668 Mat A,B; 669 bool transA,transB; /* use bool, since KK needs this type */ 670 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 671 Mat_SeqAIJ *c; 672 MatProductData_SeqAIJKokkos *pdata; 673 KokkosCsrMatrix *csrmatA,*csrmatB; 674 675 PetscFunctionBegin; 676 MatCheckProduct(C,1); 677 PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 678 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 679 680 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 681 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 682 PetscFunctionReturn(0); 683 } 684 685 switch (product->type) { 686 case MATPRODUCT_AB: transA = false; transB = false; break; 687 case MATPRODUCT_AtB: transA = true; transB = false; break; 688 case MATPRODUCT_ABt: transA = false; transB = true; break; 689 default: 690 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 691 } 692 693 A = product->A; 694 B = product->B; 695 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 696 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 697 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 698 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 699 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 700 701 PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 702 703 csrmatA = &akok->csrmat; 704 csrmatB = &bkok->csrmat; 705 706 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 707 if (transA) { 708 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 709 transA = false; 710 } 711 712 if (transB) { 713 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 714 transB = false; 715 } 716 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 717 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 718 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 719 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 720 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 721 /* shorter version of MatAssemblyEnd_SeqAIJ */ 722 c = (Mat_SeqAIJ*)C->data; 723 ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 724 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 725 ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 726 c->reallocs = 0; 727 C->info.mallocs = 0; 728 C->info.nz_unneeded = 0; 729 C->assembled = C->was_assembled = PETSC_TRUE; 730 C->num_ass++; 731 PetscFunctionReturn(0); 732 } 733 734 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 735 { 736 PetscErrorCode ierr; 737 Mat_Product *product = C->product; 738 MatProductType ptype; 739 Mat A,B; 740 bool transA,transB; 741 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 742 MatProductData_SeqAIJKokkos *pdata; 743 MPI_Comm comm; 744 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 745 746 PetscFunctionBegin; 747 MatCheckProduct(C,1); 748 ierr = PetscObjectGetComm((PetscObject)C,&comm); 749 PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 750 A = product->A; 751 B = product->B; 752 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 753 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 754 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 755 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 756 csrmatA = &akok->csrmat; 757 csrmatB = &bkok->csrmat; 758 759 ptype = product->type; 760 switch (ptype) { 761 case MATPRODUCT_AB: transA = false; transB = false; break; 762 case MATPRODUCT_AtB: transA = true; transB = false; break; 763 case MATPRODUCT_ABt: transA = false; transB = true; break; 764 default: 765 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 766 } 767 768 product->data = pdata = new MatProductData_SeqAIJKokkos(); 769 pdata->kh.set_team_work_size(16); 770 pdata->kh.set_dynamic_scheduling(true); 771 pdata->reusesym = product->api_user; 772 773 /* TODO: add command line options to select spgemm algorithms */ 774 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 775 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 776 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 777 */ 778 pdata->kh.create_spgemm_handle(spgemm_alg); 779 780 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 781 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 782 if (transA) { 783 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 784 transA = false; 785 } 786 787 if (transB) { 788 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 789 transB = false; 790 } 791 792 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 793 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 794 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 795 calling new Mat_SeqAIJKokkos(). 796 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 797 */ 798 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 799 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 800 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 801 802 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 803 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 804 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 805 PetscFunctionReturn(0); 806 } 807 808 /* handles sparse matrix matrix ops */ 809 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 810 { 811 PetscErrorCode ierr; 812 Mat_Product *product = mat->product; 813 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 814 815 PetscFunctionBegin; 816 MatCheckProduct(mat,1); 817 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 818 if (product->type == MATPRODUCT_ABC) { 819 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 820 } 821 if (Biskok && Ciskok) { 822 switch (product->type) { 823 case MATPRODUCT_AB: 824 case MATPRODUCT_AtB: 825 case MATPRODUCT_ABt: 826 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 827 break; 828 case MATPRODUCT_PtAP: 829 case MATPRODUCT_RARt: 830 case MATPRODUCT_ABC: 831 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 832 break; 833 default: 834 break; 835 } 836 } else { /* fallback for AIJ */ 837 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 838 } 839 PetscFunctionReturn(0); 840 } 841 842 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 843 { 844 PetscErrorCode ierr; 845 Mat_SeqAIJKokkos *aijkok; 846 847 PetscFunctionBegin; 848 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 849 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 850 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 851 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 852 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 853 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 854 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 855 PetscFunctionReturn(0); 856 } 857 858 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 859 { 860 PetscErrorCode ierr; 861 Mat_SeqAIJKokkos *aijkok; 862 863 PetscFunctionBegin; 864 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 865 if (aijkok) { /* Only zero the device if data is already there */ 866 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 867 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 868 } else { /* Might be preallocated but not assembled */ 869 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 870 } 871 PetscFunctionReturn(0); 872 } 873 874 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 875 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 876 { 877 PetscErrorCode ierr; 878 Mat_SeqAIJKokkos *aijkok; 879 880 PetscFunctionBegin; 881 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 882 PetscValidPointer(kv,2); 883 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 884 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 885 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 886 *kv = aijkok->a_dual.view_device(); 887 PetscFunctionReturn(0); 888 } 889 890 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 891 { 892 PetscFunctionBegin; 893 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 894 PetscValidPointer(kv,2); 895 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 896 PetscFunctionReturn(0); 897 } 898 899 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 900 { 901 PetscErrorCode ierr; 902 Mat_SeqAIJKokkos *aijkok; 903 904 PetscFunctionBegin; 905 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 906 PetscValidPointer(kv,2); 907 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 908 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 909 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 910 *kv = aijkok->a_dual.view_device(); 911 PetscFunctionReturn(0); 912 } 913 914 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 915 { 916 PetscErrorCode ierr; 917 918 PetscFunctionBegin; 919 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 920 PetscValidPointer(kv,2); 921 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 922 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 923 PetscFunctionReturn(0); 924 } 925 926 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 927 { 928 Mat_SeqAIJKokkos *aijkok; 929 930 PetscFunctionBegin; 931 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 932 PetscValidPointer(kv,2); 933 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 934 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 935 *kv = aijkok->a_dual.view_device(); 936 PetscFunctionReturn(0); 937 } 938 939 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 940 { 941 PetscErrorCode ierr; 942 943 PetscFunctionBegin; 944 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 945 PetscValidPointer(kv,2); 946 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 947 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 948 PetscFunctionReturn(0); 949 } 950 951 /* Computes Y += alpha X */ 952 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 953 { 954 PetscErrorCode ierr; 955 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 956 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 957 ConstMatScalarKokkosView Xa; 958 MatScalarKokkosView Ya; 959 960 PetscFunctionBegin; 961 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 962 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 963 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 964 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 965 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 966 967 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 968 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 969 PetscBool e; 970 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 971 if (e) { 972 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 973 if (e) pattern = SAME_NONZERO_PATTERN; 974 } 975 } 976 977 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 978 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 979 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 980 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 981 */ 982 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 983 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 984 Xa = xkok->a_dual.view_device(); 985 Ya = ykok->a_dual.view_device(); 986 987 if (pattern == SAME_NONZERO_PATTERN) { 988 KokkosBlas::axpy(alpha,Xa,Ya); 989 ierr = MatSeqAIJKokkosModifyDevice(Y); 990 } else if (pattern == SUBSET_NONZERO_PATTERN) { 991 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 992 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 993 994 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 995 PetscInt i = t.league_rank(); /* row i */ 996 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 997 PetscInt p,q = Yi(i); 998 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 999 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1000 if (Xj(p) == Yj(q)) { /* Found it */ 1001 Ya(q) += alpha * Xa(p); 1002 q++; 1003 } else { 1004 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1005 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1006 */ 1007 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1008 } 1009 } 1010 }); 1011 }); 1012 ierr = MatSeqAIJKokkosModifyDevice(Y); 1013 } else { /* different nonzero patterns */ 1014 Mat Z; 1015 KokkosCsrMatrix zcsr; 1016 KernelHandle kh; 1017 kh.create_spadd_handle(false); 1018 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1019 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1020 zkok = new Mat_SeqAIJKokkos(zcsr); 1021 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1022 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1023 kh.destroy_spadd_handle(); 1024 } 1025 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1026 ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1027 PetscFunctionReturn(0); 1028 } 1029 1030 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1031 { 1032 PetscErrorCode ierr; 1033 Mat newmat; 1034 Mat_SeqAIJKokkos *akok; 1035 Mat_SeqAIJ *aseq; 1036 1037 PetscFunctionBegin; 1038 ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr); 1039 ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr); 1040 ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr); 1041 ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 1042 ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 1043 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 1044 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1045 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1046 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1047 akok->SetUpCOO(aseq); 1048 PetscFunctionReturn(0); 1049 } 1050 1051 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1052 { 1053 PetscErrorCode ierr; 1054 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1055 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1056 PetscCount Annz = aseq->nz; 1057 const PetscCountKokkosView& jmap = akok->jmap_d; 1058 const PetscCountKokkosView& perm = akok->perm_d; 1059 MatScalarKokkosView Aa; 1060 ConstMatScalarKokkosView kv; 1061 PetscMemType memtype; 1062 1063 PetscFunctionBegin; 1064 PetscAssert(A->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()"); 1065 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 1066 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1067 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1068 } else { 1069 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1070 } 1071 1072 if (imode == INSERT_VALUES) {ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} /* write matrix values */ 1073 else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */ 1074 1075 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1076 PetscScalar sum = 0.0; 1077 for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k)); 1078 Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 1079 }); 1080 1081 if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} 1082 else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);} 1083 PetscFunctionReturn(0); 1084 } 1085 1086 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1087 { 1088 PetscErrorCode ierr; 1089 1090 PetscFunctionBegin; 1091 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1092 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1093 B->offloadmask = PETSC_OFFLOAD_CPU; 1094 PetscFunctionReturn(0); 1095 } 1096 1097 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1098 { 1099 PetscErrorCode ierr; 1100 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1101 1102 PetscFunctionBegin; 1103 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1104 A->boundtocpu = PETSC_FALSE; 1105 1106 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1107 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1108 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1109 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1110 A->ops->scale = MatScale_SeqAIJKokkos; 1111 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1112 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1113 A->ops->mult = MatMult_SeqAIJKokkos; 1114 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1115 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1116 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1117 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1118 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1119 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1120 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1121 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1122 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1123 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1124 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1125 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1126 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1127 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1128 1129 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 1130 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1131 PetscFunctionReturn(0); 1132 } 1133 1134 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1135 { 1136 PetscErrorCode ierr; 1137 Mat_SeqAIJ *aseq; 1138 PetscInt i,m,n; 1139 1140 PetscFunctionBegin; 1141 PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1142 1143 m = akok->nrows(); 1144 n = akok->ncols(); 1145 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1146 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1147 1148 /* Set up data structures of A as a MATSEQAIJ */ 1149 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1150 aseq = (Mat_SeqAIJ*)(A)->data; 1151 1152 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1153 akok->j_dual.sync_host(); 1154 1155 aseq->i = akok->i_host_data(); 1156 aseq->j = akok->j_host_data(); 1157 aseq->a = akok->a_host_data(); 1158 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1159 aseq->singlemalloc = PETSC_FALSE; 1160 aseq->free_a = PETSC_FALSE; 1161 aseq->free_ij = PETSC_FALSE; 1162 aseq->nz = akok->nnz(); 1163 aseq->maxnz = aseq->nz; 1164 1165 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1166 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1167 for (i=0; i<m; i++) { 1168 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1169 } 1170 1171 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1172 akok->nonzerostate = A->nonzerostate; 1173 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1174 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1175 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1176 PetscFunctionReturn(0); 1177 } 1178 1179 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1180 1181 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1182 */ 1183 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1184 { 1185 PetscErrorCode ierr; 1186 1187 PetscFunctionBegin; 1188 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1189 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1190 PetscFunctionReturn(0); 1191 } 1192 1193 /* --------------------------------------------------------------------------------*/ 1194 /*@C 1195 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1196 (the default parallel PETSc format). This matrix will ultimately be handled by 1197 Kokkos for calculations. For good matrix 1198 assembly performance the user should preallocate the matrix storage by setting 1199 the parameter nz (or the array nnz). By setting these parameters accurately, 1200 performance during matrix assembly can be increased by more than a factor of 50. 1201 1202 Collective 1203 1204 Input Parameters: 1205 + comm - MPI communicator, set to PETSC_COMM_SELF 1206 . m - number of rows 1207 . n - number of columns 1208 . nz - number of nonzeros per row (same for all rows) 1209 - nnz - array containing the number of nonzeros in the various rows 1210 (possibly different for each row) or NULL 1211 1212 Output Parameter: 1213 . A - the matrix 1214 1215 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1216 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1217 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1218 1219 Notes: 1220 If nnz is given then nz is ignored 1221 1222 The AIJ format (also called the Yale sparse matrix format or 1223 compressed row storage), is fully compatible with standard Fortran 77 1224 storage. That is, the stored row and column indices can begin at 1225 either one (as in Fortran) or zero. See the users' manual for details. 1226 1227 Specify the preallocated storage with either nz or nnz (not both). 1228 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1229 allocation. For large problems you MUST preallocate memory or you 1230 will get TERRIBLE performance, see the users' manual chapter on matrices. 1231 1232 By default, this format uses inodes (identical nodes) when possible, to 1233 improve numerical efficiency of matrix-vector products and solves. We 1234 search for consecutive rows with the same nonzero structure, thereby 1235 reusing matrix information to achieve increased efficiency. 1236 1237 Level: intermediate 1238 1239 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1240 @*/ 1241 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1242 { 1243 PetscErrorCode ierr; 1244 1245 PetscFunctionBegin; 1246 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1247 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1248 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1249 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1250 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1251 PetscFunctionReturn(0); 1252 } 1253 1254 typedef Kokkos::TeamPolicy<>::member_type team_member; 1255 // 1256 // This factorization exploits block diagonal matrices with "Nf" (not used). 1257 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1258 // 1259 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1260 { 1261 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1262 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1263 IS isrow = b->row,isicol = b->icol; 1264 PetscErrorCode ierr; 1265 const PetscInt *r_h,*ic_h; 1266 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1267 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1268 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1269 PetscBool row_identity,col_identity; 1270 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1271 1272 PetscFunctionBegin; 1273 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1274 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1275 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1276 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1277 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1278 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1279 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1280 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1281 { 1282 #define KOKKOS_SHARED_LEVEL 1 1283 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1284 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1285 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1286 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1287 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1288 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1289 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1290 size_t flops_h = 0.0; 1291 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1292 Kokkos::View<size_t> d_flops_k ("flops"); 1293 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1294 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1295 Kokkos::deep_copy (d_flops_k, h_flops_k); 1296 Kokkos::deep_copy (d_r_k, h_r_k); 1297 Kokkos::deep_copy (d_ic_k, h_ic_k); 1298 // Fill A --> fact 1299 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1300 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1301 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1302 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1303 // zero rows of B 1304 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1305 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1306 PetscScalar *baL = ba_d + bi_d[rowb]; 1307 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1308 /* zero (unfactored row) */ 1309 for (int j=0;j<nzbL;j++) baL[j] = 0; 1310 for (int j=0;j<nzbU;j++) baU[j] = 0; 1311 }); 1312 // copy A into B 1313 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1314 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1315 const PetscScalar *av = aa_d + ai_d[rowa]; 1316 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1317 /* load in initial (unfactored row) */ 1318 for (int j=0;j<nza;j++) { 1319 PetscInt colb = ic[ajtmp[j]]; 1320 PetscScalar vala = av[j]; 1321 if (colb == rowb) { 1322 *(ba_d + bdiag_d[rowb]) = vala; 1323 } else { 1324 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1325 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1326 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1327 for (int j=0; j<nz ; j++) { 1328 if (pbj[j] == colb) { 1329 pba[j] = vala; 1330 set++; 1331 break; 1332 } 1333 } 1334 #if !defined(PETSC_HAVE_SYCL) 1335 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1336 #endif 1337 } 1338 } 1339 }); 1340 }); 1341 Kokkos::fence(); 1342 1343 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1344 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1345 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1346 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1347 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1348 const PetscInt start = field*nloc, end = start + nloc; 1349 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1350 // A22 panel update for each row A(1,:) and col A(:,1) 1351 for (int ii=start; ii<end-1; ii++) { 1352 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1353 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1354 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1355 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1356 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1357 PetscInt kIdx = j*Ni + field_block_idx; 1358 if (kIdx >= nzUi) /* void */ ; 1359 else { 1360 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1361 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1362 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1363 size_t st_idx; 1364 // find and do L(k,i) = A(:k,i) / A(i,i) 1365 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1366 // get column, there has got to be a better way 1367 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1368 if (pjL[j] == ii) { 1369 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1370 idx = j; // output 1371 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1372 } 1373 }, st_idx); 1374 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1375 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1376 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1377 #endif 1378 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1379 // U(i+1,:end) 1380 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1381 PetscScalar Uij = baUi[uiIdx]; 1382 PetscInt col = bjUi[uiIdx]; 1383 if (col==myk) { 1384 // A_kk = A_kk - L_ki * U_ij(k) 1385 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1386 *Akkv = *Akkv - L_ki() * Uij; // UiK 1387 } else { 1388 PetscScalar *start, *end, *pAkjv=NULL; 1389 PetscInt high, low; 1390 const PetscInt *startj; 1391 if (col<myk) { // L 1392 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1393 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1394 start = pLki+1; // start at pLki+1, A22(myk,1) 1395 startj= bj_d + bi_d[myk] + idx; 1396 end = ba_d + bi_d[myk+1]; 1397 } else { 1398 PetscInt idx = bdiag_d[myk+1]+1; 1399 start = ba_d + idx; 1400 startj= bj_d + idx; 1401 end = ba_d + bdiag_d[myk]; 1402 } 1403 // search for 'col', use bisection search - TODO 1404 low = 0; 1405 high = (PetscInt)(end-start); 1406 while (high-low > 5) { 1407 int t = (low+high)/2; 1408 if (startj[t] > col) high = t; 1409 else low = t; 1410 } 1411 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1412 if (startj[pAkjv-start] == col) break; 1413 } 1414 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1415 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1416 #endif 1417 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1418 } 1419 }); 1420 } 1421 }); 1422 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1423 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1424 } /* endof for (i=0; i<n; i++) { */ 1425 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1426 }); 1427 Kokkos::fence(); 1428 Kokkos::deep_copy (h_flops_k, d_flops_k); 1429 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1430 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1431 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1432 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1433 /* Invert diagonal for simpler triangular solves */ 1434 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1435 int i = start + outer_index*Ni + lg_rank%Ni; 1436 if (i < end) { 1437 PetscScalar *pv = ba_d + bdiag_d[i]; 1438 *pv = 1.0/(*pv); 1439 } 1440 }); 1441 }); 1442 } 1443 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1444 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1445 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1446 1447 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1448 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1449 if (b->inode.size) { 1450 B->ops->solve = MatSolve_SeqAIJ_Inode; 1451 } else if (row_identity && col_identity) { 1452 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1453 } else { 1454 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1455 } 1456 B->offloadmask = PETSC_OFFLOAD_GPU; 1457 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1458 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1459 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1460 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1461 B->ops->matsolve = MatMatSolve_SeqAIJ; 1462 B->assembled = PETSC_TRUE; 1463 B->preallocated = PETSC_TRUE; 1464 1465 PetscFunctionReturn(0); 1466 } 1467 1468 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1469 { 1470 PetscErrorCode ierr; 1471 1472 PetscFunctionBegin; 1473 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1474 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1475 PetscFunctionReturn(0); 1476 } 1477 1478 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1479 { 1480 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1481 1482 PetscFunctionBegin; 1483 if (!factors->sptrsv_symbolic_completed) { 1484 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1485 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1486 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1487 } 1488 PetscFunctionReturn(0); 1489 } 1490 1491 /* Check if we need to update factors etc for transpose solve */ 1492 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1493 { 1494 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1495 MatColIdxType n = A->rmap->n; 1496 1497 PetscFunctionBegin; 1498 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1499 /* Update L^T and do sptrsv symbolic */ 1500 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1501 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1502 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1503 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1504 1505 KokkosKernels::Impl::transpose_matrix< 1506 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1507 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1508 MatRowMapKokkosView,DefaultExecutionSpace>( 1509 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1510 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1511 1512 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1513 We have to sort the indices, until KK provides finer control options. 1514 */ 1515 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1516 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1517 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1518 1519 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1520 1521 /* Update U^T and do sptrsv symbolic */ 1522 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1523 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1524 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1525 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1526 1527 KokkosKernels::Impl::transpose_matrix< 1528 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1529 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1530 MatRowMapKokkosView,DefaultExecutionSpace>( 1531 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1532 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1533 1534 /* Sort indices. See comments above */ 1535 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1536 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1537 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1538 1539 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1540 factors->transpose_updated = PETSC_TRUE; 1541 } 1542 PetscFunctionReturn(0); 1543 } 1544 1545 /* Solve Ax = b, with A = LU */ 1546 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1547 { 1548 PetscErrorCode ierr; 1549 ConstPetscScalarKokkosView bv; 1550 PetscScalarKokkosView xv; 1551 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1552 1553 PetscFunctionBegin; 1554 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1555 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1556 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1557 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1558 /* Solve L tmpv = b */ 1559 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1560 /* Solve Ux = tmpv */ 1561 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1562 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1563 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1564 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1565 PetscFunctionReturn(0); 1566 } 1567 1568 /* Solve A^T x = b, where A^T = U^T L^T */ 1569 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1570 { 1571 PetscErrorCode ierr; 1572 ConstPetscScalarKokkosView bv; 1573 PetscScalarKokkosView xv; 1574 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1575 1576 PetscFunctionBegin; 1577 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1578 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1579 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1580 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1581 /* Solve U^T tmpv = b */ 1582 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1583 1584 /* Solve L^T x = tmpv */ 1585 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1586 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1587 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1588 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1589 PetscFunctionReturn(0); 1590 } 1591 1592 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1593 { 1594 PetscErrorCode ierr; 1595 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1596 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1597 PetscInt fill_lev = info->levels; 1598 1599 PetscFunctionBegin; 1600 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1601 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1602 1603 auto a_d = aijkok->a_dual.view_device(); 1604 auto i_d = aijkok->i_dual.view_device(); 1605 auto j_d = aijkok->j_dual.view_device(); 1606 1607 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1608 1609 B->assembled = PETSC_TRUE; 1610 B->preallocated = PETSC_TRUE; 1611 B->ops->solve = MatSolve_SeqAIJKokkos; 1612 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1613 B->ops->matsolve = NULL; 1614 B->ops->matsolvetranspose = NULL; 1615 B->offloadmask = PETSC_OFFLOAD_GPU; 1616 1617 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1618 factors->transpose_updated = PETSC_FALSE; 1619 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1620 /* TODO: log flops, but how to know that? */ 1621 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1622 PetscFunctionReturn(0); 1623 } 1624 1625 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1626 { 1627 PetscErrorCode ierr; 1628 Mat_SeqAIJKokkos *aijkok; 1629 Mat_SeqAIJ *b; 1630 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1631 PetscInt fill_lev = info->levels; 1632 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1633 PetscInt n = A->rmap->n; 1634 1635 PetscFunctionBegin; 1636 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1637 /* Rebuild factors */ 1638 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1639 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1640 1641 /* Create a spiluk handle and then do symbolic factorization */ 1642 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1643 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1644 1645 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1646 1647 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1648 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1649 Kokkos::realloc(factors->iU_d,n+1); 1650 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1651 1652 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1653 auto i_d = aijkok->i_dual.view_device(); 1654 auto j_d = aijkok->j_dual.view_device(); 1655 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1656 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1657 1658 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1659 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1660 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1661 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1662 1663 /* TODO: add options to select sptrsv algorithms */ 1664 /* Create sptrsv handles for L, U and their transpose */ 1665 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1666 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1667 #else 1668 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1669 #endif 1670 1671 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1672 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1673 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1674 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1675 1676 /* Fill fields of the factor matrix B */ 1677 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1678 b = (Mat_SeqAIJ*)B->data; 1679 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1680 B->info.fill_ratio_given = info->fill; 1681 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1682 1683 B->offloadmask = PETSC_OFFLOAD_GPU; 1684 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1685 PetscFunctionReturn(0); 1686 } 1687 1688 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1689 { 1690 PetscErrorCode ierr; 1691 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1692 const PetscInt nrows = A->rmap->n; 1693 1694 PetscFunctionBegin; 1695 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1696 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1697 // move B data into Kokkos 1698 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1699 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1700 { 1701 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1702 if (!baijkok->diag_d.extent(0)) { 1703 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1704 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1705 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1706 } 1707 } 1708 PetscFunctionReturn(0); 1709 } 1710 1711 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1712 { 1713 PetscFunctionBegin; 1714 *type = MATSOLVERKOKKOS; 1715 PetscFunctionReturn(0); 1716 } 1717 1718 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1719 { 1720 PetscFunctionBegin; 1721 *type = MATSOLVERKOKKOSDEVICE; 1722 PetscFunctionReturn(0); 1723 } 1724 1725 /*MC 1726 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1727 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1728 1729 Level: beginner 1730 1731 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1732 M*/ 1733 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1734 { 1735 PetscErrorCode ierr; 1736 PetscInt n = A->rmap->n; 1737 1738 PetscFunctionBegin; 1739 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1740 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1741 (*B)->factortype = ftype; 1742 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1743 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1744 1745 if (ftype == MAT_FACTOR_LU) { 1746 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1747 (*B)->canuseordering = PETSC_TRUE; 1748 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1749 } else if (ftype == MAT_FACTOR_ILU) { 1750 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1751 (*B)->canuseordering = PETSC_FALSE; 1752 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1753 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1754 1755 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1756 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1757 PetscFunctionReturn(0); 1758 } 1759 1760 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1761 { 1762 PetscErrorCode ierr; 1763 PetscInt n = A->rmap->n; 1764 1765 PetscFunctionBegin; 1766 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1767 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1768 (*B)->factortype = ftype; 1769 (*B)->canuseordering = PETSC_TRUE; 1770 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1771 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1772 1773 if (ftype == MAT_FACTOR_LU) { 1774 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1775 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1776 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1777 1778 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1779 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1780 PetscFunctionReturn(0); 1781 } 1782 1783 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1784 { 1785 PetscErrorCode ierr; 1786 1787 PetscFunctionBegin; 1788 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1789 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1790 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1791 PetscFunctionReturn(0); 1792 } 1793 1794 /* Utility to print out a KokkosCsrMatrix for debugging */ 1795 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1796 { 1797 PetscErrorCode ierr; 1798 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1799 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1800 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1801 const PetscInt *i = iv.data(); 1802 const PetscInt *j = jv.data(); 1803 const PetscScalar *a = av.data(); 1804 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1805 1806 PetscFunctionBegin; 1807 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1808 for (PetscInt k=0; k<m; k++) { 1809 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1810 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1811 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1812 } 1813 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1814 } 1815 PetscFunctionReturn(0); 1816 } 1817