1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/aij.h> 19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 20 21 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 22 23 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 24 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 25 In the latter case, it is important to set a_dual's sync state correctly. 26 */ 27 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 28 { 29 PetscErrorCode ierr; 30 Mat_SeqAIJ *aijseq; 31 Mat_SeqAIJKokkos *aijkok; 32 33 PetscFunctionBegin; 34 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 35 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 36 37 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 38 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 39 40 /* If aijkok does not exist, we just copy i, j to device. 41 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 42 In both cases, we build a new aijkok structure. 43 */ 44 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 45 delete aijkok; 46 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 47 A->spptr = aijkok; 48 } 49 50 if (aijkok && aijkok->device_mat_d.data()) { 51 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 52 } 53 PetscFunctionReturn(0); 54 } 55 56 /* Sync CSR data to device if not yet */ 57 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 58 { 59 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 60 61 PetscFunctionBegin; 62 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 63 PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 64 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 65 if (aijkok->a_dual.need_sync_device()) { 66 aijkok->a_dual.sync_device(); 67 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 68 aijkok->hermitian_updated = PETSC_FALSE; 69 } 70 PetscFunctionReturn(0); 71 } 72 73 /* Mark the CSR data on device as modified */ 74 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 75 { 76 PetscErrorCode ierr; 77 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 78 79 PetscFunctionBegin; 80 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 81 aijkok->a_dual.clear_sync_state(); 82 aijkok->a_dual.modify_device(); 83 aijkok->transpose_updated = PETSC_FALSE; 84 aijkok->hermitian_updated = PETSC_FALSE; 85 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 86 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 87 PetscFunctionReturn(0); 88 } 89 90 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 91 { 92 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 93 94 PetscFunctionBegin; 95 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 96 /* We do not expect one needs factors on host */ 97 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 98 PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 99 aijkok->a_dual.sync_host(); 100 PetscFunctionReturn(0); 101 } 102 103 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 104 { 105 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 106 107 PetscFunctionBegin; 108 if (aijkok) { 109 aijkok->a_dual.sync_host(); 110 *array = aijkok->a_dual.view_host().data(); 111 } else { /* Happens when calling MatSetValues on a newly created matrix */ 112 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 113 } 114 PetscFunctionReturn(0); 115 } 116 117 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 118 { 119 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 120 121 PetscFunctionBegin; 122 if (aijkok) aijkok->a_dual.modify_host(); 123 PetscFunctionReturn(0); 124 } 125 126 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 127 { 128 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 129 130 PetscFunctionBegin; 131 if (aijkok) { 132 aijkok->a_dual.sync_host(); 133 *array = aijkok->a_dual.view_host().data(); 134 } else { 135 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 136 } 137 PetscFunctionReturn(0); 138 } 139 140 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 141 { 142 PetscFunctionBegin; 143 *array = NULL; 144 PetscFunctionReturn(0); 145 } 146 147 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 148 { 149 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 150 151 PetscFunctionBegin; 152 if (aijkok) { 153 *array = aijkok->a_dual.view_host().data(); 154 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 155 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 156 } 157 PetscFunctionReturn(0); 158 } 159 160 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 161 { 162 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 163 164 PetscFunctionBegin; 165 if (aijkok) { 166 aijkok->a_dual.clear_sync_state(); 167 aijkok->a_dual.modify_host(); 168 } 169 PetscFunctionReturn(0); 170 } 171 172 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 173 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 174 { 175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 176 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 177 178 PetscFunctionBegin; 179 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 180 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 181 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 182 PetscFunctionReturn(0); 183 } 184 185 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 186 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 187 { 188 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 189 190 PetscFunctionBegin; 191 if (aijkok && aijkok->device_mat_d.data()) { 192 *d_mat = aijkok->device_mat_d.data(); 193 } else { 194 PetscErrorCode ierr; 195 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 196 *d_mat = NULL; 197 } 198 PetscFunctionReturn(0); 199 } 200 201 /* Generate the transpose on device and cache it internally */ 202 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 203 { 204 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 205 206 PetscFunctionBegin; 207 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 208 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 209 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 210 CHKERRCXX(aijkok->a_dual.sync_device()); 211 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 212 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 213 aijkok->transpose_updated = PETSC_TRUE; 214 } 215 *csrmatT = &aijkok->csrmatT; 216 PetscFunctionReturn(0); 217 } 218 219 /* Generate the Hermitian on device and cache it internally */ 220 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 221 { 222 PetscErrorCode ierr; 223 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 224 225 PetscFunctionBegin; 226 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 227 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 228 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 229 CHKERRCXX(aijkok->a_dual.sync_device()); 230 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 231 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 232 #if defined(PETSC_USE_COMPLEX) 233 const auto& a = aijkok->csrmatH.values; 234 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 235 #endif 236 aijkok->hermitian_updated = PETSC_TRUE; 237 } 238 *csrmatH = &aijkok->csrmatH; 239 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 240 PetscFunctionReturn(0); 241 } 242 243 /* y = A x */ 244 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 245 { 246 PetscErrorCode ierr; 247 Mat_SeqAIJKokkos *aijkok; 248 ConstPetscScalarKokkosView xv; 249 PetscScalarKokkosView yv; 250 251 PetscFunctionBegin; 252 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 253 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 254 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 255 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 256 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 257 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 258 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 259 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 260 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 261 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 262 PetscFunctionReturn(0); 263 } 264 265 /* y = A^T x */ 266 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 267 { 268 PetscErrorCode ierr; 269 Mat_SeqAIJKokkos *aijkok; 270 const char *mode; 271 ConstPetscScalarKokkosView xv; 272 PetscScalarKokkosView yv; 273 KokkosCsrMatrix *csrmat; 274 275 PetscFunctionBegin; 276 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 277 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 278 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 279 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 280 if (A->form_explicit_transpose) { 281 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 282 mode = "N"; 283 } else { 284 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 285 csrmat = &aijkok->csrmat; 286 mode = "T"; 287 } 288 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 289 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 290 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 291 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 292 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 293 PetscFunctionReturn(0); 294 } 295 296 /* y = A^H x */ 297 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 298 { 299 PetscErrorCode ierr; 300 Mat_SeqAIJKokkos *aijkok; 301 const char *mode; 302 ConstPetscScalarKokkosView xv; 303 PetscScalarKokkosView yv; 304 KokkosCsrMatrix *csrmat; 305 306 PetscFunctionBegin; 307 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 308 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 309 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 310 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 311 if (A->form_explicit_transpose) { 312 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 313 mode = "N"; 314 } else { 315 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 316 csrmat = &aijkok->csrmat; 317 mode = "C"; 318 } 319 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 320 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 321 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 322 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 323 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 324 PetscFunctionReturn(0); 325 } 326 327 /* z = A x + y */ 328 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 329 { 330 PetscErrorCode ierr; 331 Mat_SeqAIJKokkos *aijkok; 332 ConstPetscScalarKokkosView xv,yv; 333 PetscScalarKokkosView zv; 334 335 PetscFunctionBegin; 336 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 337 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 338 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 339 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 340 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 341 if (zz != yy) Kokkos::deep_copy(zv,yv); 342 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 343 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 344 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 346 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 347 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 349 PetscFunctionReturn(0); 350 } 351 352 /* z = A^T x + y */ 353 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 354 { 355 PetscErrorCode ierr; 356 Mat_SeqAIJKokkos *aijkok; 357 const char *mode; 358 ConstPetscScalarKokkosView xv,yv; 359 PetscScalarKokkosView zv; 360 KokkosCsrMatrix *csrmat; 361 362 PetscFunctionBegin; 363 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 364 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 365 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 366 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 367 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 368 if (zz != yy) Kokkos::deep_copy(zv,yv); 369 if (A->form_explicit_transpose) { 370 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 371 mode = "N"; 372 } else { 373 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 374 csrmat = &aijkok->csrmat; 375 mode = "T"; 376 } 377 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 378 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 379 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 380 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 381 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 382 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 /* z = A^H x + y */ 387 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 388 { 389 PetscErrorCode ierr; 390 Mat_SeqAIJKokkos *aijkok; 391 const char *mode; 392 ConstPetscScalarKokkosView xv,yv; 393 PetscScalarKokkosView zv; 394 KokkosCsrMatrix *csrmat; 395 396 PetscFunctionBegin; 397 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 398 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 399 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 400 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 401 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 402 if (zz != yy) Kokkos::deep_copy(zv,yv); 403 if (A->form_explicit_transpose) { 404 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 405 mode = "N"; 406 } else { 407 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 408 csrmat = &aijkok->csrmat; 409 mode = "C"; 410 } 411 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 412 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 413 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 414 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 415 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 416 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 417 PetscFunctionReturn(0); 418 } 419 420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 421 { 422 PetscErrorCode ierr; 423 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 424 425 PetscFunctionBegin; 426 switch (op) { 427 case MAT_FORM_EXPLICIT_TRANSPOSE: 428 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 429 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 430 A->form_explicit_transpose = flg; 431 break; 432 default: 433 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 434 break; 435 } 436 PetscFunctionReturn(0); 437 } 438 439 /* Depending on reuse, either build a new mat, or use the existing mat */ 440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 441 { 442 PetscErrorCode ierr; 443 Mat_SeqAIJ *aseq; 444 445 PetscFunctionBegin; 446 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 447 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 448 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 449 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 450 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 451 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 452 PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 453 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 454 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 455 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 456 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 457 aseq = static_cast<Mat_SeqAIJ*>(A->data); 458 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 459 PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 460 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 461 } 462 } 463 PetscFunctionReturn(0); 464 } 465 466 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 467 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 468 */ 469 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 470 { 471 PetscErrorCode ierr; 472 Mat_SeqAIJ *bseq; 473 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 474 Mat mat; 475 476 PetscFunctionBegin; 477 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 478 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 479 mat = *B; 480 if (A->assembled) { 481 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 482 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 483 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 484 /* Now copy values to B if needed */ 485 if (dupOption == MAT_COPY_VALUES) { 486 if (akok->a_dual.need_sync_device()) { 487 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 488 bkok->a_dual.modify_host(); 489 } else { /* If device has the latest data, we only copy data on device */ 490 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 491 bkok->a_dual.modify_device(); 492 } 493 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 494 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 495 bkok->a_dual.modify_host(); 496 } 497 mat->spptr = bkok; 498 } 499 500 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 501 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 502 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 503 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 504 PetscFunctionReturn(0); 505 } 506 507 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 508 { 509 PetscErrorCode ierr; 510 Mat At; 511 KokkosCsrMatrix *internT; 512 Mat_SeqAIJKokkos *atkok,*bkok; 513 514 PetscFunctionBegin; 515 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 516 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 517 /* Deep copy internT, as we want to isolate the internal transpose */ 518 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 519 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 520 if (reuse == MAT_INITIAL_MATRIX) *B = At; 521 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 522 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 523 if ((*B)->assembled) { 524 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 525 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 526 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 527 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 528 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 529 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 530 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 531 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 532 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 533 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 534 } 535 PetscFunctionReturn(0); 536 } 537 538 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 539 { 540 PetscErrorCode ierr; 541 Mat_SeqAIJKokkos *aijkok; 542 543 PetscFunctionBegin; 544 if (A->factortype == MAT_FACTOR_NONE) { 545 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 546 delete aijkok; 547 } else { 548 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 549 } 550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 552 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 553 A->spptr = NULL; 554 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 555 PetscFunctionReturn(0); 556 } 557 558 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 559 { 560 PetscErrorCode ierr; 561 562 PetscFunctionBegin; 563 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 564 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 565 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 566 PetscFunctionReturn(0); 567 } 568 569 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 570 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 571 { 572 PetscErrorCode ierr; 573 Mat_SeqAIJ *a,*b; 574 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 575 MatScalarKokkosView aa,ba,ca; 576 MatRowMapKokkosView ai,bi,ci; 577 MatColIdxKokkosView aj,bj,cj; 578 PetscInt m,n,nnz,aN; 579 580 PetscFunctionBegin; 581 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 582 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 583 PetscValidPointer(C,4); 584 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 585 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 586 PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 587 PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 588 589 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 590 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 591 a = static_cast<Mat_SeqAIJ*>(A->data); 592 b = static_cast<Mat_SeqAIJ*>(B->data); 593 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 594 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 595 aa = akok->a_dual.view_device(); 596 ai = akok->i_dual.view_device(); 597 ba = bkok->a_dual.view_device(); 598 bi = bkok->i_dual.view_device(); 599 m = A->rmap->n; /* M, N and nnz of C */ 600 n = A->cmap->n + B->cmap->n; 601 nnz = a->nz + b->nz; 602 aN = A->cmap->n; /* N of A */ 603 if (reuse == MAT_INITIAL_MATRIX) { 604 aj = akok->j_dual.view_device(); 605 bj = bkok->j_dual.view_device(); 606 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 607 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 608 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 609 ca = ca_dual.view_device(); 610 ci = ci_dual.view_device(); 611 cj = cj_dual.view_device(); 612 613 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 614 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 615 PetscInt i = t.league_rank(); /* row i */ 616 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 617 618 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 619 ci(i) = coffset; 620 if (i == m-1) ci(m) = ai(m) + bi(m); 621 }); 622 623 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 624 if (k < alen) { 625 ca(coffset+k) = aa(ai(i)+k); 626 cj(coffset+k) = aj(ai(i)+k); 627 } else { 628 ca(coffset+k) = ba(bi(i)+k-alen); 629 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 630 } 631 }); 632 }); 633 ca_dual.modify_device(); 634 ci_dual.modify_device(); 635 cj_dual.modify_device(); 636 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 637 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 638 } else if (reuse == MAT_REUSE_MATRIX) { 639 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 640 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 641 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 642 ca = ckok->a_dual.view_device(); 643 ci = ckok->i_dual.view_device(); 644 645 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 646 PetscInt i = t.league_rank(); /* row i */ 647 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 648 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 649 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 650 else ca(ci(i)+k) = ba(bi(i)+k-alen); 651 }); 652 }); 653 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 654 } 655 PetscFunctionReturn(0); 656 } 657 658 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 659 { 660 PetscFunctionBegin; 661 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 662 PetscFunctionReturn(0); 663 } 664 665 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 666 { 667 PetscErrorCode ierr; 668 Mat_Product *product = C->product; 669 Mat A,B; 670 bool transA,transB; /* use bool, since KK needs this type */ 671 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 672 Mat_SeqAIJ *c; 673 MatProductData_SeqAIJKokkos *pdata; 674 KokkosCsrMatrix *csrmatA,*csrmatB; 675 676 PetscFunctionBegin; 677 MatCheckProduct(C,1); 678 PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 679 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 680 681 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 682 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 683 PetscFunctionReturn(0); 684 } 685 686 switch (product->type) { 687 case MATPRODUCT_AB: transA = false; transB = false; break; 688 case MATPRODUCT_AtB: transA = true; transB = false; break; 689 case MATPRODUCT_ABt: transA = false; transB = true; break; 690 default: 691 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 692 } 693 694 A = product->A; 695 B = product->B; 696 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 697 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 698 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 699 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 700 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 701 702 PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 703 704 csrmatA = &akok->csrmat; 705 csrmatB = &bkok->csrmat; 706 707 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 708 if (transA) { 709 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 710 transA = false; 711 } 712 713 if (transB) { 714 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 715 transB = false; 716 } 717 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 718 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 719 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 720 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 721 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 722 /* shorter version of MatAssemblyEnd_SeqAIJ */ 723 c = (Mat_SeqAIJ*)C->data; 724 ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 725 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 726 ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 727 c->reallocs = 0; 728 C->info.mallocs = 0; 729 C->info.nz_unneeded = 0; 730 C->assembled = C->was_assembled = PETSC_TRUE; 731 C->num_ass++; 732 PetscFunctionReturn(0); 733 } 734 735 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 736 { 737 PetscErrorCode ierr; 738 Mat_Product *product = C->product; 739 MatProductType ptype; 740 Mat A,B; 741 bool transA,transB; 742 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 743 MatProductData_SeqAIJKokkos *pdata; 744 MPI_Comm comm; 745 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 746 747 PetscFunctionBegin; 748 MatCheckProduct(C,1); 749 ierr = PetscObjectGetComm((PetscObject)C,&comm); 750 PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 751 A = product->A; 752 B = product->B; 753 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 754 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 755 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 756 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 757 csrmatA = &akok->csrmat; 758 csrmatB = &bkok->csrmat; 759 760 ptype = product->type; 761 switch (ptype) { 762 case MATPRODUCT_AB: transA = false; transB = false; break; 763 case MATPRODUCT_AtB: transA = true; transB = false; break; 764 case MATPRODUCT_ABt: transA = false; transB = true; break; 765 default: 766 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 767 } 768 769 product->data = pdata = new MatProductData_SeqAIJKokkos(); 770 pdata->kh.set_team_work_size(16); 771 pdata->kh.set_dynamic_scheduling(true); 772 pdata->reusesym = product->api_user; 773 774 /* TODO: add command line options to select spgemm algorithms */ 775 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 776 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 777 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 778 */ 779 pdata->kh.create_spgemm_handle(spgemm_alg); 780 781 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 782 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 783 if (transA) { 784 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 785 transA = false; 786 } 787 788 if (transB) { 789 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 790 transB = false; 791 } 792 793 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 794 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 795 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 796 calling new Mat_SeqAIJKokkos(). 797 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 798 */ 799 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 800 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 801 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 802 803 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 804 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 805 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 806 PetscFunctionReturn(0); 807 } 808 809 /* handles sparse matrix matrix ops */ 810 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 811 { 812 PetscErrorCode ierr; 813 Mat_Product *product = mat->product; 814 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 815 816 PetscFunctionBegin; 817 MatCheckProduct(mat,1); 818 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 819 if (product->type == MATPRODUCT_ABC) { 820 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 821 } 822 if (Biskok && Ciskok) { 823 switch (product->type) { 824 case MATPRODUCT_AB: 825 case MATPRODUCT_AtB: 826 case MATPRODUCT_ABt: 827 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 828 break; 829 case MATPRODUCT_PtAP: 830 case MATPRODUCT_RARt: 831 case MATPRODUCT_ABC: 832 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 833 break; 834 default: 835 break; 836 } 837 } else { /* fallback for AIJ */ 838 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 839 } 840 PetscFunctionReturn(0); 841 } 842 843 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 844 { 845 PetscErrorCode ierr; 846 Mat_SeqAIJKokkos *aijkok; 847 848 PetscFunctionBegin; 849 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 850 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 851 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 852 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 853 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 854 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 855 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 856 PetscFunctionReturn(0); 857 } 858 859 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 860 { 861 PetscErrorCode ierr; 862 Mat_SeqAIJKokkos *aijkok; 863 864 PetscFunctionBegin; 865 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 866 if (aijkok) { /* Only zero the device if data is already there */ 867 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 868 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 869 } else { /* Might be preallocated but not assembled */ 870 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 871 } 872 PetscFunctionReturn(0); 873 } 874 875 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 876 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 877 { 878 PetscErrorCode ierr; 879 Mat_SeqAIJKokkos *aijkok; 880 881 PetscFunctionBegin; 882 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 883 PetscValidPointer(kv,2); 884 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 885 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 886 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 887 *kv = aijkok->a_dual.view_device(); 888 PetscFunctionReturn(0); 889 } 890 891 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 892 { 893 PetscFunctionBegin; 894 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 895 PetscValidPointer(kv,2); 896 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 897 PetscFunctionReturn(0); 898 } 899 900 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 901 { 902 PetscErrorCode ierr; 903 Mat_SeqAIJKokkos *aijkok; 904 905 PetscFunctionBegin; 906 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 907 PetscValidPointer(kv,2); 908 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 909 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 910 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 911 *kv = aijkok->a_dual.view_device(); 912 PetscFunctionReturn(0); 913 } 914 915 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 916 { 917 PetscErrorCode ierr; 918 919 PetscFunctionBegin; 920 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 921 PetscValidPointer(kv,2); 922 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 923 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 924 PetscFunctionReturn(0); 925 } 926 927 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 928 { 929 Mat_SeqAIJKokkos *aijkok; 930 931 PetscFunctionBegin; 932 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 933 PetscValidPointer(kv,2); 934 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 935 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 936 *kv = aijkok->a_dual.view_device(); 937 PetscFunctionReturn(0); 938 } 939 940 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 941 { 942 PetscErrorCode ierr; 943 944 PetscFunctionBegin; 945 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 946 PetscValidPointer(kv,2); 947 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 948 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 949 PetscFunctionReturn(0); 950 } 951 952 /* Computes Y += alpha X */ 953 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 954 { 955 PetscErrorCode ierr; 956 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 957 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 958 ConstMatScalarKokkosView Xa; 959 MatScalarKokkosView Ya; 960 961 PetscFunctionBegin; 962 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 963 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 964 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 965 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 966 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 967 968 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 969 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 970 PetscBool e; 971 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 972 if (e) { 973 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 974 if (e) pattern = SAME_NONZERO_PATTERN; 975 } 976 } 977 978 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 979 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 980 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 981 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 982 */ 983 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 984 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 985 Xa = xkok->a_dual.view_device(); 986 Ya = ykok->a_dual.view_device(); 987 988 if (pattern == SAME_NONZERO_PATTERN) { 989 KokkosBlas::axpy(alpha,Xa,Ya); 990 ierr = MatSeqAIJKokkosModifyDevice(Y); 991 } else if (pattern == SUBSET_NONZERO_PATTERN) { 992 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 993 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 994 995 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 996 PetscInt i = t.league_rank(); /* row i */ 997 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 998 PetscInt p,q = Yi(i); 999 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1000 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1001 if (Xj(p) == Yj(q)) { /* Found it */ 1002 Ya(q) += alpha * Xa(p); 1003 q++; 1004 } else { 1005 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1006 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1007 */ 1008 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1009 } 1010 } 1011 }); 1012 }); 1013 ierr = MatSeqAIJKokkosModifyDevice(Y); 1014 } else { /* different nonzero patterns */ 1015 Mat Z; 1016 KokkosCsrMatrix zcsr; 1017 KernelHandle kh; 1018 kh.create_spadd_handle(false); 1019 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1020 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1021 zkok = new Mat_SeqAIJKokkos(zcsr); 1022 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1023 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1024 kh.destroy_spadd_handle(); 1025 } 1026 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1027 ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1028 PetscFunctionReturn(0); 1029 } 1030 1031 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 1032 { 1033 PetscErrorCode ierr; 1034 MPI_Comm comm; 1035 PetscInt *i,*j,*perm,*jmap; 1036 PetscInt k,M,N,p,q,row,start,end,nnz,nneg; 1037 PetscBool has_repeats = PETSC_FALSE; 1038 PetscInt *Ai,*Aj; 1039 PetscScalar *Aa; 1040 Mat_SeqAIJKokkos *akok; 1041 Mat_SeqAIJ *aseq; 1042 MatRowMapKokkosViewHost perm_h("perm",n),jmap_h; 1043 Mat newmat; 1044 1045 PetscFunctionBegin; 1046 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 1047 ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr); 1048 ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr); 1049 ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */ 1050 ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr); 1051 perm = perm_h.data(); 1052 for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */ 1053 if (j[k] < 0) i[k] = -1; 1054 perm[k] = k; 1055 } 1056 1057 /* Sort by row */ 1058 ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr); 1059 for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */ 1060 nneg = k; 1061 Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */ 1062 nnz = 0; /* Total number of unique nonzeros to be counted */ 1063 jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */ 1064 jmap++; /* Inc jmap by 1 for convinience */ 1065 1066 ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */ 1067 ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */ 1068 1069 /* In each row, sort by column, then unique column indices to get row length */ 1070 Ai++; /* Inc by 1 for convinience */ 1071 q = 0; /* q-th unique nonzero, with q starting from 0 */ 1072 while (k<n) { 1073 row = i[k]; 1074 start = k; /* [start,end) indices for this row */ 1075 while (k<n && i[k] == row) k++; 1076 end = k; 1077 ierr = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr); 1078 /* Find number of unique col entries in this row */ 1079 Aj[q] = j[start]; /* Log the first nonzero in this row */ 1080 jmap[q] = 1; /* Number of repeats of this nozero entry */ 1081 Ai[row] = 1; 1082 nnz++; 1083 1084 for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */ 1085 if (j[p] != j[p-1]) { /* Meet a new nonzero */ 1086 q++; 1087 jmap[q] = 1; 1088 Aj[q] = j[p]; 1089 Ai[row]++; 1090 nnz++; 1091 } else { 1092 jmap[q]++; 1093 has_repeats = PETSC_TRUE; 1094 } 1095 } 1096 q++; /* Move to next row and thus next unique nonzero */ 1097 } 1098 ierr = PetscFree2(i,j);CHKERRQ(ierr); 1099 1100 Ai--; /* Back to the beginning of Ai[] */ 1101 for (k=0; k<M; k++) Ai[k+1] += Ai[k]; 1102 jmap--; /* Back to the beginning of jmap[] */ 1103 jmap[0] = 0; 1104 if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */ 1105 for (k=0; k<nnz; k++) jmap[k+1] += jmap[k]; 1106 Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */ 1107 } 1108 1109 if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */ 1110 PetscInt *Aj_new; 1111 ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr); 1112 ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr); 1113 ierr = PetscFree(Aj);CHKERRQ(ierr); 1114 Aj = Aj_new; 1115 } 1116 1117 ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr); 1118 ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr); 1119 aseq = static_cast<Mat_SeqAIJ*>(newmat->data); 1120 aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */ 1121 aseq->free_a = aseq->free_ij = PETSC_TRUE; 1122 1123 ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 1124 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 1125 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1126 1127 if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */ 1128 MatRowMapKokkosViewHost newperm_h("perm",n-nneg); 1129 Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n))); 1130 perm_h = newperm_h; 1131 } 1132 1133 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1134 akok->SetUpCOO(n,has_repeats,jmap_h,perm_h); 1135 PetscFunctionReturn(0); 1136 } 1137 1138 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1139 { 1140 PetscErrorCode ierr; 1141 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1142 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1143 PetscInt nz = aseq->nz; 1144 const MatRowMapKokkosView& jmap = akok->jmap_d; 1145 const MatRowMapKokkosView& perm = akok->perm_d; 1146 MatScalarKokkosView Aa; 1147 ConstMatScalarKokkosView kv; 1148 PetscMemType memtype; 1149 1150 PetscFunctionBegin; 1151 if (!v) { /* NULL v means an all zero array */ 1152 ierr = MatZeroEntries(A);CHKERRQ(ierr); 1153 PetscFunctionReturn(0); 1154 } 1155 1156 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 1157 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1158 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n)); 1159 } else { 1160 kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */ 1161 } 1162 1163 ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */ 1164 if (imode == INSERT_VALUES) { 1165 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1166 } 1167 1168 if (akok->coo_has_repeats) { 1169 Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) { 1170 for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k)); 1171 }); 1172 } else { 1173 Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));}); 1174 } 1175 ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr); 1176 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1177 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1182 { 1183 PetscErrorCode ierr; 1184 1185 PetscFunctionBegin; 1186 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1187 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1188 B->offloadmask = PETSC_OFFLOAD_CPU; 1189 PetscFunctionReturn(0); 1190 } 1191 1192 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1193 { 1194 PetscErrorCode ierr; 1195 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1196 1197 PetscFunctionBegin; 1198 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1199 A->boundtocpu = PETSC_FALSE; 1200 1201 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1202 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1203 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1204 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1205 A->ops->scale = MatScale_SeqAIJKokkos; 1206 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1207 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1208 A->ops->mult = MatMult_SeqAIJKokkos; 1209 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1210 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1211 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1212 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1213 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1214 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1215 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1216 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1217 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1218 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1219 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1220 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1221 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1222 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1223 1224 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 1225 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1226 PetscFunctionReturn(0); 1227 } 1228 1229 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1230 { 1231 PetscErrorCode ierr; 1232 Mat_SeqAIJ *aseq; 1233 PetscInt i,m,n; 1234 1235 PetscFunctionBegin; 1236 PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1237 1238 m = akok->nrows(); 1239 n = akok->ncols(); 1240 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1241 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1242 1243 /* Set up data structures of A as a MATSEQAIJ */ 1244 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1245 aseq = (Mat_SeqAIJ*)(A)->data; 1246 1247 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1248 akok->j_dual.sync_host(); 1249 1250 aseq->i = akok->i_host_data(); 1251 aseq->j = akok->j_host_data(); 1252 aseq->a = akok->a_host_data(); 1253 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1254 aseq->singlemalloc = PETSC_FALSE; 1255 aseq->free_a = PETSC_FALSE; 1256 aseq->free_ij = PETSC_FALSE; 1257 aseq->nz = akok->nnz(); 1258 aseq->maxnz = aseq->nz; 1259 1260 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1261 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1262 for (i=0; i<m; i++) { 1263 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1264 } 1265 1266 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1267 akok->nonzerostate = A->nonzerostate; 1268 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1269 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1270 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1271 PetscFunctionReturn(0); 1272 } 1273 1274 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1275 1276 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1277 */ 1278 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1279 { 1280 PetscErrorCode ierr; 1281 1282 PetscFunctionBegin; 1283 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1284 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1285 PetscFunctionReturn(0); 1286 } 1287 1288 /* --------------------------------------------------------------------------------*/ 1289 /*@C 1290 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1291 (the default parallel PETSc format). This matrix will ultimately be handled by 1292 Kokkos for calculations. For good matrix 1293 assembly performance the user should preallocate the matrix storage by setting 1294 the parameter nz (or the array nnz). By setting these parameters accurately, 1295 performance during matrix assembly can be increased by more than a factor of 50. 1296 1297 Collective 1298 1299 Input Parameters: 1300 + comm - MPI communicator, set to PETSC_COMM_SELF 1301 . m - number of rows 1302 . n - number of columns 1303 . nz - number of nonzeros per row (same for all rows) 1304 - nnz - array containing the number of nonzeros in the various rows 1305 (possibly different for each row) or NULL 1306 1307 Output Parameter: 1308 . A - the matrix 1309 1310 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1311 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1312 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1313 1314 Notes: 1315 If nnz is given then nz is ignored 1316 1317 The AIJ format (also called the Yale sparse matrix format or 1318 compressed row storage), is fully compatible with standard Fortran 77 1319 storage. That is, the stored row and column indices can begin at 1320 either one (as in Fortran) or zero. See the users' manual for details. 1321 1322 Specify the preallocated storage with either nz or nnz (not both). 1323 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1324 allocation. For large problems you MUST preallocate memory or you 1325 will get TERRIBLE performance, see the users' manual chapter on matrices. 1326 1327 By default, this format uses inodes (identical nodes) when possible, to 1328 improve numerical efficiency of matrix-vector products and solves. We 1329 search for consecutive rows with the same nonzero structure, thereby 1330 reusing matrix information to achieve increased efficiency. 1331 1332 Level: intermediate 1333 1334 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1335 @*/ 1336 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1337 { 1338 PetscErrorCode ierr; 1339 1340 PetscFunctionBegin; 1341 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1342 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1343 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1344 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1345 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1346 PetscFunctionReturn(0); 1347 } 1348 1349 typedef Kokkos::TeamPolicy<>::member_type team_member; 1350 // 1351 // This factorization exploits block diagonal matrices with "Nf" (not used). 1352 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1353 // 1354 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1355 { 1356 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1357 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1358 IS isrow = b->row,isicol = b->icol; 1359 PetscErrorCode ierr; 1360 const PetscInt *r_h,*ic_h; 1361 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1362 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1363 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1364 PetscBool row_identity,col_identity; 1365 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1366 1367 PetscFunctionBegin; 1368 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1369 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1370 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1371 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1372 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1373 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1374 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1375 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1376 { 1377 #define KOKKOS_SHARED_LEVEL 1 1378 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1379 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1380 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1381 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1382 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1383 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1384 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1385 size_t flops_h = 0.0; 1386 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1387 Kokkos::View<size_t> d_flops_k ("flops"); 1388 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1389 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1390 Kokkos::deep_copy (d_flops_k, h_flops_k); 1391 Kokkos::deep_copy (d_r_k, h_r_k); 1392 Kokkos::deep_copy (d_ic_k, h_ic_k); 1393 // Fill A --> fact 1394 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1395 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1396 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1397 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1398 // zero rows of B 1399 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1400 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1401 PetscScalar *baL = ba_d + bi_d[rowb]; 1402 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1403 /* zero (unfactored row) */ 1404 for (int j=0;j<nzbL;j++) baL[j] = 0; 1405 for (int j=0;j<nzbU;j++) baU[j] = 0; 1406 }); 1407 // copy A into B 1408 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1409 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1410 const PetscScalar *av = aa_d + ai_d[rowa]; 1411 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1412 /* load in initial (unfactored row) */ 1413 for (int j=0;j<nza;j++) { 1414 PetscInt colb = ic[ajtmp[j]]; 1415 PetscScalar vala = av[j]; 1416 if (colb == rowb) { 1417 *(ba_d + bdiag_d[rowb]) = vala; 1418 } else { 1419 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1420 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1421 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1422 for (int j=0; j<nz ; j++) { 1423 if (pbj[j] == colb) { 1424 pba[j] = vala; 1425 set++; 1426 break; 1427 } 1428 } 1429 #if !defined(PETSC_HAVE_SYCL) 1430 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1431 #endif 1432 } 1433 } 1434 }); 1435 }); 1436 Kokkos::fence(); 1437 1438 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1439 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1440 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1441 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1442 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1443 const PetscInt start = field*nloc, end = start + nloc; 1444 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1445 // A22 panel update for each row A(1,:) and col A(:,1) 1446 for (int ii=start; ii<end-1; ii++) { 1447 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1448 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1449 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1450 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1451 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1452 PetscInt kIdx = j*Ni + field_block_idx; 1453 if (kIdx >= nzUi) /* void */ ; 1454 else { 1455 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1456 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1457 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1458 size_t st_idx; 1459 // find and do L(k,i) = A(:k,i) / A(i,i) 1460 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1461 // get column, there has got to be a better way 1462 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1463 if (pjL[j] == ii) { 1464 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1465 idx = j; // output 1466 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1467 } 1468 }, st_idx); 1469 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1470 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1471 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1472 #endif 1473 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1474 // U(i+1,:end) 1475 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1476 PetscScalar Uij = baUi[uiIdx]; 1477 PetscInt col = bjUi[uiIdx]; 1478 if (col==myk) { 1479 // A_kk = A_kk - L_ki * U_ij(k) 1480 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1481 *Akkv = *Akkv - L_ki() * Uij; // UiK 1482 } else { 1483 PetscScalar *start, *end, *pAkjv=NULL; 1484 PetscInt high, low; 1485 const PetscInt *startj; 1486 if (col<myk) { // L 1487 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1488 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1489 start = pLki+1; // start at pLki+1, A22(myk,1) 1490 startj= bj_d + bi_d[myk] + idx; 1491 end = ba_d + bi_d[myk+1]; 1492 } else { 1493 PetscInt idx = bdiag_d[myk+1]+1; 1494 start = ba_d + idx; 1495 startj= bj_d + idx; 1496 end = ba_d + bdiag_d[myk]; 1497 } 1498 // search for 'col', use bisection search - TODO 1499 low = 0; 1500 high = (PetscInt)(end-start); 1501 while (high-low > 5) { 1502 int t = (low+high)/2; 1503 if (startj[t] > col) high = t; 1504 else low = t; 1505 } 1506 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1507 if (startj[pAkjv-start] == col) break; 1508 } 1509 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1510 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1511 #endif 1512 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1513 } 1514 }); 1515 } 1516 }); 1517 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1518 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1519 } /* endof for (i=0; i<n; i++) { */ 1520 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1521 }); 1522 Kokkos::fence(); 1523 Kokkos::deep_copy (h_flops_k, d_flops_k); 1524 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1525 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1526 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1527 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1528 /* Invert diagonal for simpler triangular solves */ 1529 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1530 int i = start + outer_index*Ni + lg_rank%Ni; 1531 if (i < end) { 1532 PetscScalar *pv = ba_d + bdiag_d[i]; 1533 *pv = 1.0/(*pv); 1534 } 1535 }); 1536 }); 1537 } 1538 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1539 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1540 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1541 1542 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1543 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1544 if (b->inode.size) { 1545 B->ops->solve = MatSolve_SeqAIJ_Inode; 1546 } else if (row_identity && col_identity) { 1547 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1548 } else { 1549 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1550 } 1551 B->offloadmask = PETSC_OFFLOAD_GPU; 1552 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1553 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1554 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1555 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1556 B->ops->matsolve = MatMatSolve_SeqAIJ; 1557 B->assembled = PETSC_TRUE; 1558 B->preallocated = PETSC_TRUE; 1559 1560 PetscFunctionReturn(0); 1561 } 1562 1563 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1564 { 1565 PetscErrorCode ierr; 1566 1567 PetscFunctionBegin; 1568 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1569 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1570 PetscFunctionReturn(0); 1571 } 1572 1573 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1574 { 1575 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1576 1577 PetscFunctionBegin; 1578 if (!factors->sptrsv_symbolic_completed) { 1579 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1580 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1581 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1582 } 1583 PetscFunctionReturn(0); 1584 } 1585 1586 /* Check if we need to update factors etc for transpose solve */ 1587 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1588 { 1589 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1590 MatColIdxType n = A->rmap->n; 1591 1592 PetscFunctionBegin; 1593 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1594 /* Update L^T and do sptrsv symbolic */ 1595 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1596 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1597 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1598 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1599 1600 KokkosKernels::Impl::transpose_matrix< 1601 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1602 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1603 MatRowMapKokkosView,DefaultExecutionSpace>( 1604 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1605 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1606 1607 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1608 We have to sort the indices, until KK provides finer control options. 1609 */ 1610 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1611 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1612 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1613 1614 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1615 1616 /* Update U^T and do sptrsv symbolic */ 1617 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1618 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1619 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1620 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1621 1622 KokkosKernels::Impl::transpose_matrix< 1623 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1624 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1625 MatRowMapKokkosView,DefaultExecutionSpace>( 1626 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1627 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1628 1629 /* Sort indices. See comments above */ 1630 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1631 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1632 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1633 1634 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1635 factors->transpose_updated = PETSC_TRUE; 1636 } 1637 PetscFunctionReturn(0); 1638 } 1639 1640 /* Solve Ax = b, with A = LU */ 1641 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1642 { 1643 PetscErrorCode ierr; 1644 ConstPetscScalarKokkosView bv; 1645 PetscScalarKokkosView xv; 1646 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1647 1648 PetscFunctionBegin; 1649 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1650 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1651 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1652 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1653 /* Solve L tmpv = b */ 1654 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1655 /* Solve Ux = tmpv */ 1656 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1657 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1658 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1659 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1660 PetscFunctionReturn(0); 1661 } 1662 1663 /* Solve A^T x = b, where A^T = U^T L^T */ 1664 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1665 { 1666 PetscErrorCode ierr; 1667 ConstPetscScalarKokkosView bv; 1668 PetscScalarKokkosView xv; 1669 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1670 1671 PetscFunctionBegin; 1672 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1673 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1674 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1675 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1676 /* Solve U^T tmpv = b */ 1677 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1678 1679 /* Solve L^T x = tmpv */ 1680 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1681 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1682 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1683 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1684 PetscFunctionReturn(0); 1685 } 1686 1687 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1688 { 1689 PetscErrorCode ierr; 1690 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1691 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1692 PetscInt fill_lev = info->levels; 1693 1694 PetscFunctionBegin; 1695 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1696 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1697 1698 auto a_d = aijkok->a_dual.view_device(); 1699 auto i_d = aijkok->i_dual.view_device(); 1700 auto j_d = aijkok->j_dual.view_device(); 1701 1702 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1703 1704 B->assembled = PETSC_TRUE; 1705 B->preallocated = PETSC_TRUE; 1706 B->ops->solve = MatSolve_SeqAIJKokkos; 1707 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1708 B->ops->matsolve = NULL; 1709 B->ops->matsolvetranspose = NULL; 1710 B->offloadmask = PETSC_OFFLOAD_GPU; 1711 1712 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1713 factors->transpose_updated = PETSC_FALSE; 1714 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1715 /* TODO: log flops, but how to know that? */ 1716 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1717 PetscFunctionReturn(0); 1718 } 1719 1720 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1721 { 1722 PetscErrorCode ierr; 1723 Mat_SeqAIJKokkos *aijkok; 1724 Mat_SeqAIJ *b; 1725 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1726 PetscInt fill_lev = info->levels; 1727 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1728 PetscInt n = A->rmap->n; 1729 1730 PetscFunctionBegin; 1731 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1732 /* Rebuild factors */ 1733 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1734 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1735 1736 /* Create a spiluk handle and then do symbolic factorization */ 1737 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1738 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1739 1740 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1741 1742 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1743 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1744 Kokkos::realloc(factors->iU_d,n+1); 1745 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1746 1747 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1748 auto i_d = aijkok->i_dual.view_device(); 1749 auto j_d = aijkok->j_dual.view_device(); 1750 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1751 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1752 1753 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1754 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1755 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1756 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1757 1758 /* TODO: add options to select sptrsv algorithms */ 1759 /* Create sptrsv handles for L, U and their transpose */ 1760 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1761 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1762 #else 1763 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1764 #endif 1765 1766 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1767 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1768 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1769 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1770 1771 /* Fill fields of the factor matrix B */ 1772 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1773 b = (Mat_SeqAIJ*)B->data; 1774 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1775 B->info.fill_ratio_given = info->fill; 1776 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1777 1778 B->offloadmask = PETSC_OFFLOAD_GPU; 1779 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1780 PetscFunctionReturn(0); 1781 } 1782 1783 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1784 { 1785 PetscErrorCode ierr; 1786 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1787 const PetscInt nrows = A->rmap->n; 1788 1789 PetscFunctionBegin; 1790 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1791 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1792 // move B data into Kokkos 1793 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1794 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1795 { 1796 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1797 if (!baijkok->diag_d.extent(0)) { 1798 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1799 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1800 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1801 } 1802 } 1803 PetscFunctionReturn(0); 1804 } 1805 1806 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1807 { 1808 PetscFunctionBegin; 1809 *type = MATSOLVERKOKKOS; 1810 PetscFunctionReturn(0); 1811 } 1812 1813 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1814 { 1815 PetscFunctionBegin; 1816 *type = MATSOLVERKOKKOSDEVICE; 1817 PetscFunctionReturn(0); 1818 } 1819 1820 /*MC 1821 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1822 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1823 1824 Level: beginner 1825 1826 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1827 M*/ 1828 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1829 { 1830 PetscErrorCode ierr; 1831 PetscInt n = A->rmap->n; 1832 1833 PetscFunctionBegin; 1834 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1835 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1836 (*B)->factortype = ftype; 1837 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1838 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1839 1840 if (ftype == MAT_FACTOR_LU) { 1841 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1842 (*B)->canuseordering = PETSC_TRUE; 1843 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1844 } else if (ftype == MAT_FACTOR_ILU) { 1845 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1846 (*B)->canuseordering = PETSC_FALSE; 1847 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1848 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1849 1850 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1851 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1852 PetscFunctionReturn(0); 1853 } 1854 1855 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1856 { 1857 PetscErrorCode ierr; 1858 PetscInt n = A->rmap->n; 1859 1860 PetscFunctionBegin; 1861 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1862 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1863 (*B)->factortype = ftype; 1864 (*B)->canuseordering = PETSC_TRUE; 1865 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1866 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1867 1868 if (ftype == MAT_FACTOR_LU) { 1869 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1870 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1871 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1872 1873 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1874 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1875 PetscFunctionReturn(0); 1876 } 1877 1878 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1879 { 1880 PetscErrorCode ierr; 1881 1882 PetscFunctionBegin; 1883 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1884 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1885 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1886 PetscFunctionReturn(0); 1887 } 1888 1889 /* Utility to print out a KokkosCsrMatrix for debugging */ 1890 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1891 { 1892 PetscErrorCode ierr; 1893 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1894 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1895 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1896 const PetscInt *i = iv.data(); 1897 const PetscInt *j = jv.data(); 1898 const PetscScalar *a = av.data(); 1899 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1900 1901 PetscFunctionBegin; 1902 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1903 for (PetscInt k=0; k<m; k++) { 1904 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1905 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1906 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1907 } 1908 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1909 } 1910 PetscFunctionReturn(0); 1911 } 1912