1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/aij.h> 19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 20 21 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 22 23 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 24 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 25 In the latter case, it is important to set a_dual's sync state correctly. 26 */ 27 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 28 { 29 PetscErrorCode ierr; 30 Mat_SeqAIJ *aijseq; 31 Mat_SeqAIJKokkos *aijkok; 32 33 PetscFunctionBegin; 34 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 35 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 36 37 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 38 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 39 40 /* If aijkok does not exist, we just copy i, j to device. 41 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 42 In both cases, we build a new aijkok structure. 43 */ 44 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 45 delete aijkok; 46 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 47 A->spptr = aijkok; 48 } 49 50 if (aijkok && aijkok->device_mat_d.data()) { 51 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 52 } 53 PetscFunctionReturn(0); 54 } 55 56 /* Sync CSR data to device if not yet */ 57 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 58 { 59 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 60 61 PetscFunctionBegin; 62 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 63 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 64 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 65 if (aijkok->a_dual.need_sync_device()) { 66 aijkok->a_dual.sync_device(); 67 aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 68 aijkok->hermitian_updated = PETSC_FALSE; 69 } 70 PetscFunctionReturn(0); 71 } 72 73 /* Mark the CSR data on device as modified */ 74 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 75 { 76 PetscErrorCode ierr; 77 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 78 79 PetscFunctionBegin; 80 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 81 aijkok->a_dual.clear_sync_state(); 82 aijkok->a_dual.modify_device(); 83 aijkok->transpose_updated = PETSC_FALSE; 84 aijkok->hermitian_updated = PETSC_FALSE; 85 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 86 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 87 PetscFunctionReturn(0); 88 } 89 90 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 91 { 92 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 93 94 PetscFunctionBegin; 95 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 96 /* We do not expect one needs factors on host */ 97 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 98 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 99 aijkok->a_dual.sync_host(); 100 PetscFunctionReturn(0); 101 } 102 103 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 104 { 105 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 106 107 PetscFunctionBegin; 108 if (aijkok) { 109 aijkok->a_dual.sync_host(); 110 *array = aijkok->a_dual.view_host().data(); 111 } else { /* Happens when calling MatSetValues on a newly created matrix */ 112 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 113 } 114 PetscFunctionReturn(0); 115 } 116 117 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 118 { 119 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 120 121 PetscFunctionBegin; 122 if (aijkok) aijkok->a_dual.modify_host(); 123 PetscFunctionReturn(0); 124 } 125 126 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 127 { 128 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 129 130 PetscFunctionBegin; 131 if (aijkok) { 132 aijkok->a_dual.sync_host(); 133 *array = aijkok->a_dual.view_host().data(); 134 } else { 135 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 136 } 137 PetscFunctionReturn(0); 138 } 139 140 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 141 { 142 PetscFunctionBegin; 143 *array = NULL; 144 PetscFunctionReturn(0); 145 } 146 147 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 148 { 149 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 150 151 PetscFunctionBegin; 152 if (aijkok) { 153 *array = aijkok->a_dual.view_host().data(); 154 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 155 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 156 } 157 PetscFunctionReturn(0); 158 } 159 160 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 161 { 162 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 163 164 PetscFunctionBegin; 165 if (aijkok) { 166 aijkok->a_dual.clear_sync_state(); 167 aijkok->a_dual.modify_host(); 168 } 169 PetscFunctionReturn(0); 170 } 171 172 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 173 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 174 { 175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 176 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 177 178 PetscFunctionBegin; 179 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 180 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 181 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 182 PetscFunctionReturn(0); 183 } 184 185 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 186 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 187 { 188 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 189 190 PetscFunctionBegin; 191 if (aijkok && aijkok->device_mat_d.data()) { 192 *d_mat = aijkok->device_mat_d.data(); 193 } else { 194 PetscErrorCode ierr; 195 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 196 *d_mat = NULL; 197 } 198 PetscFunctionReturn(0); 199 } 200 201 /* Generate the transpose on device and cache it internally */ 202 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 203 { 204 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 205 206 PetscFunctionBegin; 207 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 208 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 209 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 210 CHKERRCXX(aijkok->a_dual.sync_device()); 211 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 212 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 213 aijkok->transpose_updated = PETSC_TRUE; 214 } 215 *csrmatT = &aijkok->csrmatT; 216 PetscFunctionReturn(0); 217 } 218 219 /* Generate the Hermitian on device and cache it internally */ 220 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 221 { 222 PetscErrorCode ierr; 223 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 224 225 PetscFunctionBegin; 226 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 227 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 228 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 229 CHKERRCXX(aijkok->a_dual.sync_device()); 230 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 231 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 232 #if defined(PETSC_USE_COMPLEX) 233 const auto& a = aijkok->csrmatH.values; 234 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 235 #endif 236 aijkok->hermitian_updated = PETSC_TRUE; 237 } 238 *csrmatH = &aijkok->csrmatH; 239 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 240 PetscFunctionReturn(0); 241 } 242 243 /* y = A x */ 244 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 245 { 246 PetscErrorCode ierr; 247 Mat_SeqAIJKokkos *aijkok; 248 ConstPetscScalarKokkosView xv; 249 PetscScalarKokkosView yv; 250 251 PetscFunctionBegin; 252 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 253 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 254 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 255 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 256 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 257 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 258 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 259 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 260 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 261 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 262 PetscFunctionReturn(0); 263 } 264 265 /* y = A^T x */ 266 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 267 { 268 PetscErrorCode ierr; 269 Mat_SeqAIJKokkos *aijkok; 270 const char *mode; 271 ConstPetscScalarKokkosView xv; 272 PetscScalarKokkosView yv; 273 KokkosCsrMatrix *csrmat; 274 275 PetscFunctionBegin; 276 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 277 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 278 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 279 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 280 if (A->form_explicit_transpose) { 281 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 282 mode = "N"; 283 } else { 284 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 285 csrmat = &aijkok->csrmat; 286 mode = "T"; 287 } 288 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 289 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 290 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 291 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 292 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 293 PetscFunctionReturn(0); 294 } 295 296 /* y = A^H x */ 297 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 298 { 299 PetscErrorCode ierr; 300 Mat_SeqAIJKokkos *aijkok; 301 const char *mode; 302 ConstPetscScalarKokkosView xv; 303 PetscScalarKokkosView yv; 304 KokkosCsrMatrix *csrmat; 305 306 PetscFunctionBegin; 307 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 308 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 309 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 310 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 311 if (A->form_explicit_transpose) { 312 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 313 mode = "N"; 314 } else { 315 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 316 csrmat = &aijkok->csrmat; 317 mode = "C"; 318 } 319 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 320 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 321 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 322 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 323 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 324 PetscFunctionReturn(0); 325 } 326 327 /* z = A x + y */ 328 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 329 { 330 PetscErrorCode ierr; 331 Mat_SeqAIJKokkos *aijkok; 332 ConstPetscScalarKokkosView xv,yv; 333 PetscScalarKokkosView zv; 334 335 PetscFunctionBegin; 336 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 337 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 338 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 339 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 340 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 341 if (zz != yy) Kokkos::deep_copy(zv,yv); 342 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 343 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 344 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 346 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 347 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 349 PetscFunctionReturn(0); 350 } 351 352 /* z = A^T x + y */ 353 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 354 { 355 PetscErrorCode ierr; 356 Mat_SeqAIJKokkos *aijkok; 357 const char *mode; 358 ConstPetscScalarKokkosView xv,yv; 359 PetscScalarKokkosView zv; 360 KokkosCsrMatrix *csrmat; 361 362 PetscFunctionBegin; 363 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 364 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 365 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 366 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 367 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 368 if (zz != yy) Kokkos::deep_copy(zv,yv); 369 if (A->form_explicit_transpose) { 370 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 371 mode = "N"; 372 } else { 373 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 374 csrmat = &aijkok->csrmat; 375 mode = "T"; 376 } 377 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 378 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 379 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 380 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 381 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 382 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 /* z = A^H x + y */ 387 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 388 { 389 PetscErrorCode ierr; 390 Mat_SeqAIJKokkos *aijkok; 391 const char *mode; 392 ConstPetscScalarKokkosView xv,yv; 393 PetscScalarKokkosView zv; 394 KokkosCsrMatrix *csrmat; 395 396 PetscFunctionBegin; 397 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 398 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 399 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 400 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 401 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 402 if (zz != yy) Kokkos::deep_copy(zv,yv); 403 if (A->form_explicit_transpose) { 404 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 405 mode = "N"; 406 } else { 407 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 408 csrmat = &aijkok->csrmat; 409 mode = "C"; 410 } 411 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 412 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 413 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 414 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 415 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 416 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 417 PetscFunctionReturn(0); 418 } 419 420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 421 { 422 PetscErrorCode ierr; 423 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 424 425 PetscFunctionBegin; 426 switch (op) { 427 case MAT_FORM_EXPLICIT_TRANSPOSE: 428 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 429 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 430 A->form_explicit_transpose = flg; 431 break; 432 default: 433 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 434 break; 435 } 436 PetscFunctionReturn(0); 437 } 438 439 /* Depending on reuse, either build a new mat, or use the existing mat */ 440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 441 { 442 PetscErrorCode ierr; 443 Mat_SeqAIJ *aseq; 444 445 PetscFunctionBegin; 446 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 447 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 448 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 449 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 450 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 451 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 452 if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 453 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 454 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 455 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 456 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 457 aseq = static_cast<Mat_SeqAIJ*>(A->data); 458 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 459 if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 460 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 461 } 462 } 463 PetscFunctionReturn(0); 464 } 465 466 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 467 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 468 */ 469 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 470 { 471 PetscErrorCode ierr; 472 Mat_SeqAIJ *bseq; 473 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 474 Mat mat; 475 476 PetscFunctionBegin; 477 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 478 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 479 mat = *B; 480 if (A->assembled) { 481 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 482 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 483 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 484 /* Now copy values to B if needed */ 485 if (dupOption == MAT_COPY_VALUES) { 486 if (akok->a_dual.need_sync_device()) { 487 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 488 bkok->a_dual.modify_host(); 489 } else { /* If device has the latest data, we only copy data on device */ 490 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 491 bkok->a_dual.modify_device(); 492 } 493 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 494 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 495 bkok->a_dual.modify_host(); 496 } 497 mat->spptr = bkok; 498 } 499 500 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 501 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 502 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 503 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 504 PetscFunctionReturn(0); 505 } 506 507 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 508 { 509 PetscErrorCode ierr; 510 Mat At; 511 KokkosCsrMatrix *internT,*csrmatT; 512 Mat_SeqAIJKokkos *atkok,*bkok; 513 514 PetscFunctionBegin; 515 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 516 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 517 CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 518 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 519 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 520 if (reuse == MAT_INITIAL_MATRIX) *B = At; 521 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 522 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 523 if ((*B)->assembled) { 524 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 525 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 526 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 527 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 528 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 529 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 530 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 531 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 532 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 533 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 534 } 535 PetscFunctionReturn(0); 536 } 537 538 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 539 { 540 PetscErrorCode ierr; 541 Mat_SeqAIJKokkos *aijkok; 542 543 PetscFunctionBegin; 544 if (A->factortype == MAT_FACTOR_NONE) { 545 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 546 if (aijkok) { 547 if (aijkok->device_mat_d.data()) { 548 delete aijkok->colmap_d; 549 delete aijkok->i_uncompressed_d; 550 } 551 if (aijkok->diag_d) delete aijkok->diag_d; 552 } 553 delete aijkok; 554 } else { 555 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 556 } 557 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 558 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 559 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 560 A->spptr = NULL; 561 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 562 PetscFunctionReturn(0); 563 } 564 565 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 566 { 567 PetscErrorCode ierr; 568 569 PetscFunctionBegin; 570 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 571 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 572 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 573 PetscFunctionReturn(0); 574 } 575 576 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 577 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 578 { 579 PetscErrorCode ierr; 580 Mat_SeqAIJ *a,*b; 581 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 582 MatScalarKokkosView aa,ba,ca; 583 MatRowMapKokkosView ai,bi,ci; 584 MatColIdxKokkosView aj,bj,cj; 585 PetscInt m,n,nnz,aN; 586 587 PetscFunctionBegin; 588 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 589 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 590 PetscValidPointer(C,4); 591 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 592 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 593 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 594 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 595 596 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 597 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 598 a = static_cast<Mat_SeqAIJ*>(A->data); 599 b = static_cast<Mat_SeqAIJ*>(B->data); 600 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 601 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 602 aa = akok->a_dual.view_device(); 603 ai = akok->i_dual.view_device(); 604 ba = bkok->a_dual.view_device(); 605 bi = bkok->i_dual.view_device(); 606 m = A->rmap->n; /* M, N and nnz of C */ 607 n = A->cmap->n + B->cmap->n; 608 nnz = a->nz + b->nz; 609 aN = A->cmap->n; /* N of A */ 610 if (reuse == MAT_INITIAL_MATRIX) { 611 aj = akok->j_dual.view_device(); 612 bj = bkok->j_dual.view_device(); 613 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 614 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 615 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 616 ca = ca_dual.view_device(); 617 ci = ci_dual.view_device(); 618 cj = cj_dual.view_device(); 619 620 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 621 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 622 PetscInt i = t.league_rank(); /* row i */ 623 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 624 625 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 626 ci(i) = coffset; 627 if (i == m-1) ci(m) = ai(m) + bi(m); 628 }); 629 630 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 631 if (k < alen) { 632 ca(coffset+k) = aa(ai(i)+k); 633 cj(coffset+k) = aj(ai(i)+k); 634 } else { 635 ca(coffset+k) = ba(bi(i)+k-alen); 636 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 637 } 638 }); 639 }); 640 ca_dual.modify_device(); 641 ci_dual.modify_device(); 642 cj_dual.modify_device(); 643 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 644 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 645 } else if (reuse == MAT_REUSE_MATRIX) { 646 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 647 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 648 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 649 ca = ckok->a_dual.view_device(); 650 ci = ckok->i_dual.view_device(); 651 652 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 653 PetscInt i = t.league_rank(); /* row i */ 654 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 655 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 656 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 657 else ca(ci(i)+k) = ba(bi(i)+k-alen); 658 }); 659 }); 660 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 661 } 662 PetscFunctionReturn(0); 663 } 664 665 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 666 { 667 PetscFunctionBegin; 668 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 669 PetscFunctionReturn(0); 670 } 671 672 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 673 { 674 PetscErrorCode ierr; 675 Mat_Product *product = C->product; 676 Mat A,B; 677 bool transA,transB; /* use bool, since KK needs this type */ 678 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 679 Mat_SeqAIJ *c; 680 MatProductData_SeqAIJKokkos *pdata; 681 KokkosCsrMatrix *csrmatA,*csrmatB; 682 683 PetscFunctionBegin; 684 MatCheckProduct(C,1); 685 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 686 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 687 688 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 689 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 690 PetscFunctionReturn(0); 691 } 692 693 switch (product->type) { 694 case MATPRODUCT_AB: transA = false; transB = false; break; 695 case MATPRODUCT_AtB: transA = true; transB = false; break; 696 case MATPRODUCT_ABt: transA = false; transB = true; break; 697 default: 698 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 699 } 700 701 A = product->A; 702 B = product->B; 703 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 704 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 705 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 706 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 707 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 708 709 if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 710 711 csrmatA = &akok->csrmat; 712 csrmatB = &bkok->csrmat; 713 714 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 715 if (transA) { 716 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 717 transA = false; 718 } 719 720 if (transB) { 721 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 722 transB = false; 723 } 724 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 725 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 726 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 727 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 728 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 729 /* shorter version of MatAssemblyEnd_SeqAIJ */ 730 c = (Mat_SeqAIJ*)C->data; 731 ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 732 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 733 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 734 c->reallocs = 0; 735 C->info.mallocs = 0; 736 C->info.nz_unneeded = 0; 737 C->assembled = C->was_assembled = PETSC_TRUE; 738 C->num_ass++; 739 PetscFunctionReturn(0); 740 } 741 742 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 743 { 744 PetscErrorCode ierr; 745 Mat_Product *product = C->product; 746 MatProductType ptype; 747 Mat A,B; 748 bool transA,transB; 749 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 750 MatProductData_SeqAIJKokkos *pdata; 751 MPI_Comm comm; 752 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 753 754 PetscFunctionBegin; 755 MatCheckProduct(C,1); 756 ierr = PetscObjectGetComm((PetscObject)C,&comm); 757 if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 758 A = product->A; 759 B = product->B; 760 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 761 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 762 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 763 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 764 csrmatA = &akok->csrmat; 765 csrmatB = &bkok->csrmat; 766 767 ptype = product->type; 768 switch (ptype) { 769 case MATPRODUCT_AB: transA = false; transB = false; break; 770 case MATPRODUCT_AtB: transA = true; transB = false; break; 771 case MATPRODUCT_ABt: transA = false; transB = true; break; 772 default: 773 SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 774 } 775 776 product->data = pdata = new MatProductData_SeqAIJKokkos(); 777 pdata->kh.set_team_work_size(16); 778 pdata->kh.set_dynamic_scheduling(true); 779 pdata->reusesym = product->api_user; 780 781 /* TODO: add command line options to select spgemm algorithms */ 782 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 783 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 784 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 785 */ 786 pdata->kh.create_spgemm_handle(spgemm_alg); 787 788 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 789 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 790 if (transA) { 791 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 792 transA = false; 793 } 794 795 if (transB) { 796 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 797 transB = false; 798 } 799 800 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 801 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 802 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 803 calling new Mat_SeqAIJKokkos(). 804 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 805 */ 806 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 807 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 808 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 809 810 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 811 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 812 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 813 PetscFunctionReturn(0); 814 } 815 816 /* handles sparse matrix matrix ops */ 817 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 818 { 819 PetscErrorCode ierr; 820 Mat_Product *product = mat->product; 821 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 822 823 PetscFunctionBegin; 824 MatCheckProduct(mat,1); 825 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 826 if (product->type == MATPRODUCT_ABC) { 827 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 828 } 829 if (Biskok && Ciskok) { 830 switch (product->type) { 831 case MATPRODUCT_AB: 832 case MATPRODUCT_AtB: 833 case MATPRODUCT_ABt: 834 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 835 break; 836 case MATPRODUCT_PtAP: 837 case MATPRODUCT_RARt: 838 case MATPRODUCT_ABC: 839 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 840 break; 841 default: 842 break; 843 } 844 } else { /* fallback for AIJ */ 845 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 846 } 847 PetscFunctionReturn(0); 848 } 849 850 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 851 { 852 PetscErrorCode ierr; 853 Mat_SeqAIJKokkos *aijkok; 854 855 PetscFunctionBegin; 856 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 857 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 858 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 859 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 860 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 861 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 862 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 863 PetscFunctionReturn(0); 864 } 865 866 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 867 { 868 PetscErrorCode ierr; 869 Mat_SeqAIJKokkos *aijkok; 870 871 PetscFunctionBegin; 872 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 873 if (aijkok) { /* Only zero the device if data is already there */ 874 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 875 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 876 } else { /* Might be preallocated but not assembled */ 877 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 878 } 879 PetscFunctionReturn(0); 880 } 881 882 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 883 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 884 { 885 PetscErrorCode ierr; 886 Mat_SeqAIJKokkos *aijkok; 887 888 PetscFunctionBegin; 889 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 890 PetscValidPointer(kv,2); 891 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 892 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 893 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 894 *kv = aijkok->a_dual.view_device(); 895 PetscFunctionReturn(0); 896 } 897 898 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 899 { 900 PetscFunctionBegin; 901 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 902 PetscValidPointer(kv,2); 903 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 904 PetscFunctionReturn(0); 905 } 906 907 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 908 { 909 PetscErrorCode ierr; 910 Mat_SeqAIJKokkos *aijkok; 911 912 PetscFunctionBegin; 913 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 914 PetscValidPointer(kv,2); 915 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 916 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 917 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 918 *kv = aijkok->a_dual.view_device(); 919 PetscFunctionReturn(0); 920 } 921 922 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 923 { 924 PetscErrorCode ierr; 925 926 PetscFunctionBegin; 927 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 928 PetscValidPointer(kv,2); 929 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 930 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 931 PetscFunctionReturn(0); 932 } 933 934 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 935 { 936 Mat_SeqAIJKokkos *aijkok; 937 938 PetscFunctionBegin; 939 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 940 PetscValidPointer(kv,2); 941 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 942 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 943 *kv = aijkok->a_dual.view_device(); 944 PetscFunctionReturn(0); 945 } 946 947 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 948 { 949 PetscErrorCode ierr; 950 951 PetscFunctionBegin; 952 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 953 PetscValidPointer(kv,2); 954 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 955 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 956 PetscFunctionReturn(0); 957 } 958 959 /* Computes Y += alpha X */ 960 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 961 { 962 PetscErrorCode ierr; 963 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 964 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 965 ConstMatScalarKokkosView Xa; 966 MatScalarKokkosView Ya; 967 968 PetscFunctionBegin; 969 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 970 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 971 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 972 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 973 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 974 975 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 976 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 977 PetscBool e; 978 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 979 if (e) { 980 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 981 if (e) pattern = SAME_NONZERO_PATTERN; 982 } 983 } 984 985 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 986 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 987 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 988 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 989 */ 990 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 991 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 992 Xa = xkok->a_dual.view_device(); 993 Ya = ykok->a_dual.view_device(); 994 995 if (pattern == SAME_NONZERO_PATTERN) { 996 KokkosBlas::axpy(alpha,Xa,Ya); 997 ierr = MatSeqAIJKokkosModifyDevice(Y); 998 } else if (pattern == SUBSET_NONZERO_PATTERN) { 999 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 1000 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 1001 1002 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1003 PetscInt i = t.league_rank(); /* row i */ 1004 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1005 PetscInt p,q = Yi(i); 1006 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1007 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1008 if (Xj(p) == Yj(q)) { /* Found it */ 1009 Ya(q) += alpha * Xa(p); 1010 q++; 1011 } else { 1012 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1013 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1014 */ 1015 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1016 } 1017 } 1018 }); 1019 }); 1020 ierr = MatSeqAIJKokkosModifyDevice(Y); 1021 } else { /* different nonzero patterns */ 1022 Mat Z; 1023 KokkosCsrMatrix zcsr; 1024 KernelHandle kh; 1025 kh.create_spadd_handle(false); 1026 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1027 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1028 zkok = new Mat_SeqAIJKokkos(zcsr); 1029 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1030 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1031 kh.destroy_spadd_handle(); 1032 } 1033 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1034 ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1035 PetscFunctionReturn(0); 1036 } 1037 1038 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 1039 { 1040 PetscErrorCode ierr; 1041 MPI_Comm comm; 1042 PetscInt *i,*j,*perm,*jmap; 1043 PetscInt k,M,N,p,q,row,start,end,nnz,nneg; 1044 PetscBool has_repeats = PETSC_FALSE; 1045 PetscInt *Ai,*Aj; 1046 PetscScalar *Aa; 1047 Mat_SeqAIJKokkos *akok; 1048 Mat_SeqAIJ *aseq; 1049 MatRowMapKokkosViewHost perm_h("perm",n),jmap_h; 1050 Mat newmat; 1051 1052 PetscFunctionBegin; 1053 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 1054 ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr); 1055 ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr); 1056 ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */ 1057 ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr); 1058 perm = perm_h.data(); 1059 for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */ 1060 if (j[k] < 0) i[k] = -1; 1061 perm[k] = k; 1062 } 1063 1064 /* Sort by row */ 1065 ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr); 1066 for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */ 1067 nneg = k; 1068 Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */ 1069 nnz = 0; /* Total number of unique nonzeros to be counted */ 1070 jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */ 1071 jmap++; /* Inc jmap by 1 for convinience */ 1072 1073 ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */ 1074 ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */ 1075 1076 /* In each row, sort by column, then unique column indices to get row length */ 1077 Ai++; /* Inc by 1 for convinience */ 1078 q = 0; /* q-th unique nonzero, with q starting from 0 */ 1079 while (k<n) { 1080 row = i[k]; 1081 start = k; /* [start,end) indices for this row */ 1082 while (k<n && i[k] == row) k++; 1083 end = k; 1084 ierr = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr); 1085 /* Find number of unique col entries in this row */ 1086 Aj[q] = j[start]; /* Log the first nonzero in this row */ 1087 jmap[q] = 1; /* Number of repeats of this nozero entry */ 1088 Ai[row] = 1; 1089 nnz++; 1090 1091 for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */ 1092 if (j[p] != j[p-1]) { /* Meet a new nonzero */ 1093 q++; 1094 jmap[q] = 1; 1095 Aj[q] = j[p]; 1096 Ai[row]++; 1097 nnz++; 1098 } else { 1099 jmap[q]++; 1100 has_repeats = PETSC_TRUE; 1101 } 1102 } 1103 q++; /* Move to next row and thus next unique nonzero */ 1104 } 1105 ierr = PetscFree2(i,j);CHKERRQ(ierr); 1106 1107 Ai--; /* Back to the beginning of Ai[] */ 1108 for (k=0; k<M; k++) Ai[k+1] += Ai[k]; 1109 jmap--; /* Back to the beginning of jmap[] */ 1110 jmap[0] = 0; 1111 if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */ 1112 for (k=0; k<nnz; k++) jmap[k+1] += jmap[k]; 1113 Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */ 1114 } 1115 1116 if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */ 1117 PetscInt *Aj_new; 1118 ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr); 1119 ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr); 1120 ierr = PetscFree(Aj);CHKERRQ(ierr); 1121 Aj = Aj_new; 1122 } 1123 1124 ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr); 1125 ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr); 1126 aseq = static_cast<Mat_SeqAIJ*>(newmat->data); 1127 aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */ 1128 aseq->free_a = aseq->free_ij = PETSC_TRUE; 1129 1130 ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 1131 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 1132 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1133 1134 if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */ 1135 MatRowMapKokkosViewHost newperm_h("perm",n-nneg); 1136 Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n))); 1137 perm_h = newperm_h; 1138 } 1139 1140 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1141 akok->SetUpCOO(n,has_repeats,jmap_h,perm_h); 1142 PetscFunctionReturn(0); 1143 } 1144 1145 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1146 { 1147 PetscErrorCode ierr; 1148 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1149 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1150 PetscInt nz = aseq->nz; 1151 const MatRowMapKokkosView& jmap = akok->jmap_d; 1152 const MatRowMapKokkosView& perm = akok->perm_d; 1153 MatScalarKokkosView Aa; 1154 ConstMatScalarKokkosView kv; 1155 PetscMemType memtype; 1156 1157 PetscFunctionBegin; 1158 if (!v) { /* NULL v means an all zero array */ 1159 ierr = MatZeroEntries(A);CHKERRQ(ierr); 1160 PetscFunctionReturn(0); 1161 } 1162 1163 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 1164 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1165 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n)); 1166 } else { 1167 kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */ 1168 } 1169 1170 ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */ 1171 if (imode == INSERT_VALUES) { 1172 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1173 } 1174 1175 if (akok->coo_has_repeats) { 1176 Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) { 1177 for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k)); 1178 }); 1179 } else { 1180 Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));}); 1181 } 1182 ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr); 1183 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1184 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1185 PetscFunctionReturn(0); 1186 } 1187 1188 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1189 { 1190 PetscErrorCode ierr; 1191 1192 PetscFunctionBegin; 1193 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1194 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1195 B->offloadmask = PETSC_OFFLOAD_CPU; 1196 PetscFunctionReturn(0); 1197 } 1198 1199 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1200 { 1201 PetscErrorCode ierr; 1202 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1203 1204 PetscFunctionBegin; 1205 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1206 A->boundtocpu = PETSC_FALSE; 1207 1208 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1209 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1210 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1211 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1212 A->ops->scale = MatScale_SeqAIJKokkos; 1213 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1214 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1215 A->ops->mult = MatMult_SeqAIJKokkos; 1216 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1217 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1218 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1219 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1220 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1221 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1222 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1223 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1224 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1225 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1226 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1227 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1228 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1229 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1230 1231 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 1232 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1233 PetscFunctionReturn(0); 1234 } 1235 1236 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1237 { 1238 PetscErrorCode ierr; 1239 Mat_SeqAIJ *aseq; 1240 PetscInt i,m,n; 1241 1242 PetscFunctionBegin; 1243 if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1244 1245 m = akok->nrows(); 1246 n = akok->ncols(); 1247 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1248 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1249 1250 /* Set up data structures of A as a MATSEQAIJ */ 1251 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1252 aseq = (Mat_SeqAIJ*)(A)->data; 1253 1254 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1255 akok->j_dual.sync_host(); 1256 1257 aseq->i = akok->i_host_data(); 1258 aseq->j = akok->j_host_data(); 1259 aseq->a = akok->a_host_data(); 1260 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1261 aseq->singlemalloc = PETSC_FALSE; 1262 aseq->free_a = PETSC_FALSE; 1263 aseq->free_ij = PETSC_FALSE; 1264 aseq->nz = akok->nnz(); 1265 aseq->maxnz = aseq->nz; 1266 1267 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1268 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1269 for (i=0; i<m; i++) { 1270 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1271 } 1272 1273 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1274 akok->nonzerostate = A->nonzerostate; 1275 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1276 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1277 A->spptr = akok; 1278 PetscFunctionReturn(0); 1279 } 1280 1281 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1282 1283 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1284 */ 1285 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1286 { 1287 PetscErrorCode ierr; 1288 1289 PetscFunctionBegin; 1290 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1291 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1292 PetscFunctionReturn(0); 1293 } 1294 1295 /* --------------------------------------------------------------------------------*/ 1296 /*@C 1297 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1298 (the default parallel PETSc format). This matrix will ultimately be handled by 1299 Kokkos for calculations. For good matrix 1300 assembly performance the user should preallocate the matrix storage by setting 1301 the parameter nz (or the array nnz). By setting these parameters accurately, 1302 performance during matrix assembly can be increased by more than a factor of 50. 1303 1304 Collective 1305 1306 Input Parameters: 1307 + comm - MPI communicator, set to PETSC_COMM_SELF 1308 . m - number of rows 1309 . n - number of columns 1310 . nz - number of nonzeros per row (same for all rows) 1311 - nnz - array containing the number of nonzeros in the various rows 1312 (possibly different for each row) or NULL 1313 1314 Output Parameter: 1315 . A - the matrix 1316 1317 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1318 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1319 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1320 1321 Notes: 1322 If nnz is given then nz is ignored 1323 1324 The AIJ format (also called the Yale sparse matrix format or 1325 compressed row storage), is fully compatible with standard Fortran 77 1326 storage. That is, the stored row and column indices can begin at 1327 either one (as in Fortran) or zero. See the users' manual for details. 1328 1329 Specify the preallocated storage with either nz or nnz (not both). 1330 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1331 allocation. For large problems you MUST preallocate memory or you 1332 will get TERRIBLE performance, see the users' manual chapter on matrices. 1333 1334 By default, this format uses inodes (identical nodes) when possible, to 1335 improve numerical efficiency of matrix-vector products and solves. We 1336 search for consecutive rows with the same nonzero structure, thereby 1337 reusing matrix information to achieve increased efficiency. 1338 1339 Level: intermediate 1340 1341 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1342 @*/ 1343 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1344 { 1345 PetscErrorCode ierr; 1346 1347 PetscFunctionBegin; 1348 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1349 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1350 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1351 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1352 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1353 PetscFunctionReturn(0); 1354 } 1355 1356 typedef Kokkos::TeamPolicy<>::member_type team_member; 1357 // 1358 // This factorization exploits block diagonal matrices with "Nf" (not used). 1359 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1360 // 1361 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1362 { 1363 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1364 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1365 IS isrow = b->row,isicol = b->icol; 1366 PetscErrorCode ierr; 1367 const PetscInt *r_h,*ic_h; 1368 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1369 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1370 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1371 PetscBool row_identity,col_identity; 1372 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1373 1374 PetscFunctionBegin; 1375 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1376 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1377 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1378 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1379 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1380 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1381 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1382 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1383 { 1384 #define KOKKOS_SHARED_LEVEL 1 1385 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1386 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1387 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1388 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1389 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1390 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1391 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1392 size_t flops_h = 0.0; 1393 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1394 Kokkos::View<size_t> d_flops_k ("flops"); 1395 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1396 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1397 Kokkos::deep_copy (d_flops_k, h_flops_k); 1398 Kokkos::deep_copy (d_r_k, h_r_k); 1399 Kokkos::deep_copy (d_ic_k, h_ic_k); 1400 // Fill A --> fact 1401 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1402 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1403 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1404 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1405 // zero rows of B 1406 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1407 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1408 PetscScalar *baL = ba_d + bi_d[rowb]; 1409 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1410 /* zero (unfactored row) */ 1411 for (int j=0;j<nzbL;j++) baL[j] = 0; 1412 for (int j=0;j<nzbU;j++) baU[j] = 0; 1413 }); 1414 // copy A into B 1415 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1416 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1417 const PetscScalar *av = aa_d + ai_d[rowa]; 1418 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1419 /* load in initial (unfactored row) */ 1420 for (int j=0;j<nza;j++) { 1421 PetscInt colb = ic[ajtmp[j]]; 1422 PetscScalar vala = av[j]; 1423 if (colb == rowb) { 1424 *(ba_d + bdiag_d[rowb]) = vala; 1425 } else { 1426 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1427 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1428 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1429 for (int j=0; j<nz ; j++) { 1430 if (pbj[j] == colb) { 1431 pba[j] = vala; 1432 set++; 1433 break; 1434 } 1435 } 1436 #if !defined(PETSC_HAVE_SYCL) 1437 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1438 #endif 1439 } 1440 } 1441 }); 1442 }); 1443 Kokkos::fence(); 1444 1445 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1446 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1447 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1448 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1449 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1450 const PetscInt start = field*nloc, end = start + nloc; 1451 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1452 // A22 panel update for each row A(1,:) and col A(:,1) 1453 for (int ii=start; ii<end-1; ii++) { 1454 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1455 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1456 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1457 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1458 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1459 PetscInt kIdx = j*Ni + field_block_idx; 1460 if (kIdx >= nzUi) /* void */ ; 1461 else { 1462 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1463 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1464 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1465 size_t st_idx; 1466 // find and do L(k,i) = A(:k,i) / A(i,i) 1467 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1468 // get column, there has got to be a better way 1469 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1470 if (pjL[j] == ii) { 1471 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1472 idx = j; // output 1473 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1474 } 1475 }, st_idx); 1476 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1477 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1478 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1479 #endif 1480 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1481 // U(i+1,:end) 1482 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1483 PetscScalar Uij = baUi[uiIdx]; 1484 PetscInt col = bjUi[uiIdx]; 1485 if (col==myk) { 1486 // A_kk = A_kk - L_ki * U_ij(k) 1487 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1488 *Akkv = *Akkv - L_ki() * Uij; // UiK 1489 } else { 1490 PetscScalar *start, *end, *pAkjv=NULL; 1491 PetscInt high, low; 1492 const PetscInt *startj; 1493 if (col<myk) { // L 1494 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1495 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1496 start = pLki+1; // start at pLki+1, A22(myk,1) 1497 startj= bj_d + bi_d[myk] + idx; 1498 end = ba_d + bi_d[myk+1]; 1499 } else { 1500 PetscInt idx = bdiag_d[myk+1]+1; 1501 start = ba_d + idx; 1502 startj= bj_d + idx; 1503 end = ba_d + bdiag_d[myk]; 1504 } 1505 // search for 'col', use bisection search - TODO 1506 low = 0; 1507 high = (PetscInt)(end-start); 1508 while (high-low > 5) { 1509 int t = (low+high)/2; 1510 if (startj[t] > col) high = t; 1511 else low = t; 1512 } 1513 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1514 if (startj[pAkjv-start] == col) break; 1515 } 1516 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1517 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1518 #endif 1519 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1520 } 1521 }); 1522 } 1523 }); 1524 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1525 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1526 } /* endof for (i=0; i<n; i++) { */ 1527 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1528 }); 1529 Kokkos::fence(); 1530 Kokkos::deep_copy (h_flops_k, d_flops_k); 1531 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1532 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1533 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1534 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1535 /* Invert diagonal for simpler triangular solves */ 1536 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1537 int i = start + outer_index*Ni + lg_rank%Ni; 1538 if (i < end) { 1539 PetscScalar *pv = ba_d + bdiag_d[i]; 1540 *pv = 1.0/(*pv); 1541 } 1542 }); 1543 }); 1544 } 1545 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1546 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1547 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1548 1549 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1550 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1551 if (b->inode.size) { 1552 B->ops->solve = MatSolve_SeqAIJ_Inode; 1553 } else if (row_identity && col_identity) { 1554 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1555 } else { 1556 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1557 } 1558 B->offloadmask = PETSC_OFFLOAD_GPU; 1559 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1560 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1561 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1562 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1563 B->ops->matsolve = MatMatSolve_SeqAIJ; 1564 B->assembled = PETSC_TRUE; 1565 B->preallocated = PETSC_TRUE; 1566 1567 PetscFunctionReturn(0); 1568 } 1569 1570 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1571 { 1572 PetscErrorCode ierr; 1573 1574 PetscFunctionBegin; 1575 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1576 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1577 PetscFunctionReturn(0); 1578 } 1579 1580 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1581 { 1582 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1583 1584 PetscFunctionBegin; 1585 if (!factors->sptrsv_symbolic_completed) { 1586 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1587 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1588 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1589 } 1590 PetscFunctionReturn(0); 1591 } 1592 1593 /* Check if we need to update factors etc for transpose solve */ 1594 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1595 { 1596 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1597 MatColIdxType n = A->rmap->n; 1598 1599 PetscFunctionBegin; 1600 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1601 /* Update L^T and do sptrsv symbolic */ 1602 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1603 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1604 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1605 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1606 1607 KokkosKernels::Impl::transpose_matrix< 1608 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1609 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1610 MatRowMapKokkosView,DefaultExecutionSpace>( 1611 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1612 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1613 1614 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1615 We have to sort the indices, until KK provides finer control options. 1616 */ 1617 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1618 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1619 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1620 1621 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1622 1623 /* Update U^T and do sptrsv symbolic */ 1624 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1625 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1626 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1627 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1628 1629 KokkosKernels::Impl::transpose_matrix< 1630 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1631 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1632 MatRowMapKokkosView,DefaultExecutionSpace>( 1633 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1634 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1635 1636 /* Sort indices. See comments above */ 1637 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1638 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1639 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1640 1641 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1642 factors->transpose_updated = PETSC_TRUE; 1643 } 1644 PetscFunctionReturn(0); 1645 } 1646 1647 /* Solve Ax = b, with A = LU */ 1648 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1649 { 1650 PetscErrorCode ierr; 1651 ConstPetscScalarKokkosView bv; 1652 PetscScalarKokkosView xv; 1653 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1654 1655 PetscFunctionBegin; 1656 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1657 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1658 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1659 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1660 /* Solve L tmpv = b */ 1661 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1662 /* Solve Ux = tmpv */ 1663 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1664 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1665 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1666 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1667 PetscFunctionReturn(0); 1668 } 1669 1670 /* Solve A^T x = b, where A^T = U^T L^T */ 1671 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1672 { 1673 PetscErrorCode ierr; 1674 ConstPetscScalarKokkosView bv; 1675 PetscScalarKokkosView xv; 1676 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1677 1678 PetscFunctionBegin; 1679 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1680 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1681 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1682 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1683 /* Solve U^T tmpv = b */ 1684 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1685 1686 /* Solve L^T x = tmpv */ 1687 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1688 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1689 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1690 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1691 PetscFunctionReturn(0); 1692 } 1693 1694 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1695 { 1696 PetscErrorCode ierr; 1697 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1698 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1699 PetscInt fill_lev = info->levels; 1700 1701 PetscFunctionBegin; 1702 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1703 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1704 1705 auto a_d = aijkok->a_dual.view_device(); 1706 auto i_d = aijkok->i_dual.view_device(); 1707 auto j_d = aijkok->j_dual.view_device(); 1708 1709 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1710 1711 B->assembled = PETSC_TRUE; 1712 B->preallocated = PETSC_TRUE; 1713 B->ops->solve = MatSolve_SeqAIJKokkos; 1714 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1715 B->ops->matsolve = NULL; 1716 B->ops->matsolvetranspose = NULL; 1717 B->offloadmask = PETSC_OFFLOAD_GPU; 1718 1719 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1720 factors->transpose_updated = PETSC_FALSE; 1721 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1722 /* TODO: log flops, but how to know that? */ 1723 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1724 PetscFunctionReturn(0); 1725 } 1726 1727 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1728 { 1729 PetscErrorCode ierr; 1730 Mat_SeqAIJKokkos *aijkok; 1731 Mat_SeqAIJ *b; 1732 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1733 PetscInt fill_lev = info->levels; 1734 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1735 PetscInt n = A->rmap->n; 1736 1737 PetscFunctionBegin; 1738 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1739 /* Rebuild factors */ 1740 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1741 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1742 1743 /* Create a spiluk handle and then do symbolic factorization */ 1744 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1745 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1746 1747 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1748 1749 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1750 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1751 Kokkos::realloc(factors->iU_d,n+1); 1752 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1753 1754 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1755 auto i_d = aijkok->i_dual.view_device(); 1756 auto j_d = aijkok->j_dual.view_device(); 1757 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1758 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1759 1760 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1761 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1762 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1763 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1764 1765 /* TODO: add options to select sptrsv algorithms */ 1766 /* Create sptrsv handles for L, U and their transpose */ 1767 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1768 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1769 #else 1770 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1771 #endif 1772 1773 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1774 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1775 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1776 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1777 1778 /* Fill fields of the factor matrix B */ 1779 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1780 b = (Mat_SeqAIJ*)B->data; 1781 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1782 B->info.fill_ratio_given = info->fill; 1783 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1784 1785 B->offloadmask = PETSC_OFFLOAD_GPU; 1786 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1787 PetscFunctionReturn(0); 1788 } 1789 1790 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1791 { 1792 PetscErrorCode ierr; 1793 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1794 const PetscInt nrows = A->rmap->n; 1795 1796 PetscFunctionBegin; 1797 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1798 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1799 // move B data into Kokkos 1800 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1801 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1802 { 1803 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1804 if (!baijkok->diag_d) { 1805 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1806 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1807 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1808 } 1809 } 1810 PetscFunctionReturn(0); 1811 } 1812 1813 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1814 { 1815 PetscFunctionBegin; 1816 *type = MATSOLVERKOKKOS; 1817 PetscFunctionReturn(0); 1818 } 1819 1820 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1821 { 1822 PetscFunctionBegin; 1823 *type = MATSOLVERKOKKOSDEVICE; 1824 PetscFunctionReturn(0); 1825 } 1826 1827 /*MC 1828 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1829 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1830 1831 Level: beginner 1832 1833 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1834 M*/ 1835 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1836 { 1837 PetscErrorCode ierr; 1838 PetscInt n = A->rmap->n; 1839 1840 PetscFunctionBegin; 1841 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1842 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1843 (*B)->factortype = ftype; 1844 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1845 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1846 1847 if (ftype == MAT_FACTOR_LU) { 1848 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1849 (*B)->canuseordering = PETSC_TRUE; 1850 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1851 } else if (ftype == MAT_FACTOR_ILU) { 1852 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1853 (*B)->canuseordering = PETSC_FALSE; 1854 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1855 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1856 1857 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1858 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1859 PetscFunctionReturn(0); 1860 } 1861 1862 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1863 { 1864 PetscErrorCode ierr; 1865 PetscInt n = A->rmap->n; 1866 1867 PetscFunctionBegin; 1868 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1869 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1870 (*B)->factortype = ftype; 1871 (*B)->canuseordering = PETSC_TRUE; 1872 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1873 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1874 1875 if (ftype == MAT_FACTOR_LU) { 1876 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1877 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1878 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1879 1880 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1881 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1882 PetscFunctionReturn(0); 1883 } 1884 1885 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1886 { 1887 PetscErrorCode ierr; 1888 1889 PetscFunctionBegin; 1890 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1891 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1892 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1893 PetscFunctionReturn(0); 1894 } 1895 1896 /* Utility to print out a KokkosCsrMatrix for debugging */ 1897 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1898 { 1899 PetscErrorCode ierr; 1900 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1901 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1902 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1903 const PetscInt *i = iv.data(); 1904 const PetscInt *j = jv.data(); 1905 const PetscScalar *a = av.data(); 1906 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1907 1908 PetscFunctionBegin; 1909 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1910 for (PetscInt k=0; k<m; k++) { 1911 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1912 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1913 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1914 } 1915 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1916 } 1917 PetscFunctionReturn(0); 1918 } 1919