1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/aij.h> 19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 20 21 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 22 23 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 24 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 25 In the latter case, it is important to set a_dual's sync state correctly. 26 */ 27 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 28 { 29 PetscErrorCode ierr; 30 Mat_SeqAIJ *aijseq; 31 Mat_SeqAIJKokkos *aijkok; 32 33 PetscFunctionBegin; 34 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 35 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 36 37 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 38 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 39 40 /* If aijkok does not exist, we just copy i, j to device. 41 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 42 In both cases, we build a new aijkok structure. 43 */ 44 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 45 delete aijkok; 46 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 47 A->spptr = aijkok; 48 } 49 50 if (aijkok && aijkok->device_mat_d.data()) { 51 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 52 } 53 PetscFunctionReturn(0); 54 } 55 56 /* Sync CSR data to device if not yet */ 57 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 58 { 59 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 60 61 PetscFunctionBegin; 62 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 63 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 64 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 65 if (aijkok->a_dual.need_sync_device()) { 66 aijkok->a_dual.sync_device(); 67 aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 68 aijkok->hermitian_updated = PETSC_FALSE; 69 } 70 PetscFunctionReturn(0); 71 } 72 73 /* Mark the CSR data on device as modified */ 74 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 75 { 76 PetscErrorCode ierr; 77 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 78 79 PetscFunctionBegin; 80 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 81 aijkok->a_dual.clear_sync_state(); 82 aijkok->a_dual.modify_device(); 83 aijkok->transpose_updated = PETSC_FALSE; 84 aijkok->hermitian_updated = PETSC_FALSE; 85 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 86 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 87 PetscFunctionReturn(0); 88 } 89 90 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 91 { 92 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 93 94 PetscFunctionBegin; 95 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 96 /* We do not expect one needs factors on host */ 97 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 98 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 99 aijkok->a_dual.sync_host(); 100 PetscFunctionReturn(0); 101 } 102 103 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 104 { 105 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 106 107 PetscFunctionBegin; 108 if (aijkok) { 109 aijkok->a_dual.sync_host(); 110 *array = aijkok->a_dual.view_host().data(); 111 } else { /* Happens when calling MatSetValues on a newly created matrix */ 112 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 113 } 114 PetscFunctionReturn(0); 115 } 116 117 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 118 { 119 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 120 121 PetscFunctionBegin; 122 if (aijkok) aijkok->a_dual.modify_host(); 123 PetscFunctionReturn(0); 124 } 125 126 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 127 { 128 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 129 130 PetscFunctionBegin; 131 if (aijkok) { 132 aijkok->a_dual.sync_host(); 133 *array = aijkok->a_dual.view_host().data(); 134 } else { 135 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 136 } 137 PetscFunctionReturn(0); 138 } 139 140 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 141 { 142 PetscFunctionBegin; 143 *array = NULL; 144 PetscFunctionReturn(0); 145 } 146 147 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 148 { 149 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 150 151 PetscFunctionBegin; 152 if (aijkok) { 153 *array = aijkok->a_dual.view_host().data(); 154 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 155 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 156 } 157 PetscFunctionReturn(0); 158 } 159 160 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 161 { 162 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 163 164 PetscFunctionBegin; 165 if (aijkok) { 166 aijkok->a_dual.clear_sync_state(); 167 aijkok->a_dual.modify_host(); 168 } 169 PetscFunctionReturn(0); 170 } 171 172 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 173 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 174 { 175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 176 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 177 178 PetscFunctionBegin; 179 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 180 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 181 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 182 PetscFunctionReturn(0); 183 } 184 185 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 186 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 187 { 188 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 189 190 PetscFunctionBegin; 191 if (aijkok && aijkok->device_mat_d.data()) { 192 *d_mat = aijkok->device_mat_d.data(); 193 } else { 194 PetscErrorCode ierr; 195 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 196 *d_mat = NULL; 197 } 198 PetscFunctionReturn(0); 199 } 200 201 /* Generate the transpose on device and cache it internally */ 202 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 203 { 204 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 205 206 PetscFunctionBegin; 207 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 208 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 209 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 210 CHKERRCXX(aijkok->a_dual.sync_device()); 211 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 212 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 213 aijkok->transpose_updated = PETSC_TRUE; 214 } 215 *csrmatT = &aijkok->csrmatT; 216 PetscFunctionReturn(0); 217 } 218 219 /* Generate the Hermitian on device and cache it internally */ 220 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 221 { 222 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 223 224 PetscFunctionBegin; 225 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 226 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 227 CHKERRCXX(aijkok->a_dual.sync_device()); 228 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 229 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 230 #if defined(PETSC_USE_COMPLEX) 231 const auto& a = aijkok->csrmatH.values; 232 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 233 #endif 234 aijkok->hermitian_updated = PETSC_TRUE; 235 } 236 *csrmatH = &aijkok->csrmatH; 237 PetscFunctionReturn(0); 238 } 239 240 /* y = A x */ 241 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 242 { 243 PetscErrorCode ierr; 244 Mat_SeqAIJKokkos *aijkok; 245 ConstPetscScalarKokkosView xv; 246 PetscScalarKokkosView yv; 247 248 PetscFunctionBegin; 249 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 250 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 251 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 252 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 253 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 254 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 255 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 256 ierr = WaitForKokkos();CHKERRQ(ierr); 257 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 258 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 259 PetscFunctionReturn(0); 260 } 261 262 /* y = A^T x */ 263 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 264 { 265 PetscErrorCode ierr; 266 Mat_SeqAIJKokkos *aijkok; 267 const char *mode; 268 ConstPetscScalarKokkosView xv; 269 PetscScalarKokkosView yv; 270 KokkosCsrMatrix *csrmat; 271 272 PetscFunctionBegin; 273 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 274 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 275 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 276 if (A->form_explicit_transpose) { 277 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 278 mode = "N"; 279 } else { 280 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 281 csrmat = &aijkok->csrmat; 282 mode = "T"; 283 } 284 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 285 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 286 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 287 ierr = WaitForKokkos();CHKERRQ(ierr); 288 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 289 PetscFunctionReturn(0); 290 } 291 292 /* y = A^H x */ 293 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 294 { 295 PetscErrorCode ierr; 296 Mat_SeqAIJKokkos *aijkok; 297 const char *mode; 298 ConstPetscScalarKokkosView xv; 299 PetscScalarKokkosView yv; 300 KokkosCsrMatrix *csrmat; 301 302 PetscFunctionBegin; 303 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 304 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 305 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 306 if (A->form_explicit_transpose) { 307 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 308 mode = "N"; 309 } else { 310 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 311 csrmat = &aijkok->csrmat; 312 mode = "C"; 313 } 314 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 315 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 316 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 317 ierr = WaitForKokkos();CHKERRQ(ierr); 318 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 319 PetscFunctionReturn(0); 320 } 321 322 /* z = A x + y */ 323 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 324 { 325 PetscErrorCode ierr; 326 Mat_SeqAIJKokkos *aijkok; 327 ConstPetscScalarKokkosView xv,yv; 328 PetscScalarKokkosView zv; 329 330 PetscFunctionBegin; 331 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 332 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 333 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 334 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 335 if (zz != yy) Kokkos::deep_copy(zv,yv); 336 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 337 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 338 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 339 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 340 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 341 ierr = WaitForKokkos();CHKERRQ(ierr); 342 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 343 PetscFunctionReturn(0); 344 } 345 346 /* z = A^T x + y */ 347 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 348 { 349 PetscErrorCode ierr; 350 Mat_SeqAIJKokkos *aijkok; 351 const char *mode; 352 ConstPetscScalarKokkosView xv,yv; 353 PetscScalarKokkosView zv; 354 KokkosCsrMatrix *csrmat; 355 356 PetscFunctionBegin; 357 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 358 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 359 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 360 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 361 if (zz != yy) Kokkos::deep_copy(zv,yv); 362 if (A->form_explicit_transpose) { 363 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 364 mode = "N"; 365 } else { 366 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 367 csrmat = &aijkok->csrmat; 368 mode = "T"; 369 } 370 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 371 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 372 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 373 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 374 ierr = WaitForKokkos();CHKERRQ(ierr); 375 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 376 PetscFunctionReturn(0); 377 } 378 379 /* z = A^H x + y */ 380 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 381 { 382 PetscErrorCode ierr; 383 Mat_SeqAIJKokkos *aijkok; 384 const char *mode; 385 ConstPetscScalarKokkosView xv,yv; 386 PetscScalarKokkosView zv; 387 KokkosCsrMatrix *csrmat; 388 389 PetscFunctionBegin; 390 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 391 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 392 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 393 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 394 if (zz != yy) Kokkos::deep_copy(zv,yv); 395 if (A->form_explicit_transpose) { 396 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 397 mode = "N"; 398 } else { 399 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 400 csrmat = &aijkok->csrmat; 401 mode = "C"; 402 } 403 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 404 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 405 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 406 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 407 ierr = WaitForKokkos();CHKERRQ(ierr); 408 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 409 PetscFunctionReturn(0); 410 } 411 412 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 413 { 414 PetscErrorCode ierr; 415 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 416 417 PetscFunctionBegin; 418 switch (op) { 419 case MAT_FORM_EXPLICIT_TRANSPOSE: 420 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 421 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 422 A->form_explicit_transpose = flg; 423 break; 424 default: 425 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 426 break; 427 } 428 PetscFunctionReturn(0); 429 } 430 431 /* Depending on reuse, either build a new mat, or use the existing mat */ 432 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 433 { 434 PetscErrorCode ierr; 435 Mat_SeqAIJ *aseq; 436 437 PetscFunctionBegin; 438 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 439 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 440 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 441 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 442 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 443 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 444 if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 445 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 446 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 447 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 448 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 449 aseq = static_cast<Mat_SeqAIJ*>(A->data); 450 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 451 if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 452 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 453 } 454 } 455 PetscFunctionReturn(0); 456 } 457 458 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 459 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 460 */ 461 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 462 { 463 PetscErrorCode ierr; 464 Mat_SeqAIJ *bseq; 465 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 466 Mat mat; 467 468 PetscFunctionBegin; 469 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 470 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 471 mat = *B; 472 if (A->assembled) { 473 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 474 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 475 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 476 /* Now copy values to B if needed */ 477 if (dupOption == MAT_COPY_VALUES) { 478 if (akok->a_dual.need_sync_device()) { 479 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 480 bkok->a_dual.modify_host(); 481 } else { /* If device has the latest data, we only copy data on device */ 482 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 483 bkok->a_dual.modify_device(); 484 } 485 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 486 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 487 bkok->a_dual.modify_host(); 488 } 489 mat->spptr = bkok; 490 } 491 492 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 493 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 494 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 495 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 496 PetscFunctionReturn(0); 497 } 498 499 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 500 { 501 PetscErrorCode ierr; 502 Mat At; 503 KokkosCsrMatrix *internT,*csrmatT; 504 Mat_SeqAIJKokkos *atkok,*bkok; 505 506 PetscFunctionBegin; 507 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 508 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 509 CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 510 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 511 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 512 if (reuse == MAT_INITIAL_MATRIX) *B = At; 513 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 514 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 515 if ((*B)->assembled) { 516 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 517 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 518 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 519 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 520 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 521 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 522 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 523 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 524 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 525 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 526 } 527 PetscFunctionReturn(0); 528 } 529 530 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 531 { 532 PetscErrorCode ierr; 533 Mat_SeqAIJKokkos *aijkok; 534 535 PetscFunctionBegin; 536 if (A->factortype == MAT_FACTOR_NONE) { 537 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 538 if (aijkok) { 539 if (aijkok->device_mat_d.data()) { 540 delete aijkok->colmap_d; 541 delete aijkok->i_uncompressed_d; 542 } 543 if (aijkok->diag_d) delete aijkok->diag_d; 544 } 545 delete aijkok; 546 } else { 547 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 548 } 549 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 552 A->spptr = NULL; 553 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 554 PetscFunctionReturn(0); 555 } 556 557 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 558 { 559 PetscErrorCode ierr; 560 561 PetscFunctionBegin; 562 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 563 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 564 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 565 PetscFunctionReturn(0); 566 } 567 568 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 569 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 570 { 571 PetscErrorCode ierr; 572 Mat_SeqAIJ *a,*b; 573 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 574 MatScalarKokkosView aa,ba,ca; 575 MatRowMapKokkosView ai,bi,ci; 576 MatColIdxKokkosView aj,bj,cj; 577 PetscInt m,n,nnz,aN; 578 579 PetscFunctionBegin; 580 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 581 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 582 PetscValidPointer(C,4); 583 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 584 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 585 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 586 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 587 588 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 589 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 590 a = static_cast<Mat_SeqAIJ*>(A->data); 591 b = static_cast<Mat_SeqAIJ*>(B->data); 592 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 593 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 594 aa = akok->a_dual.view_device(); 595 ai = akok->i_dual.view_device(); 596 ba = bkok->a_dual.view_device(); 597 bi = bkok->i_dual.view_device(); 598 m = A->rmap->n; /* M, N and nnz of C */ 599 n = A->cmap->n + B->cmap->n; 600 nnz = a->nz + b->nz; 601 aN = A->cmap->n; /* N of A */ 602 if (reuse == MAT_INITIAL_MATRIX) { 603 aj = akok->j_dual.view_device(); 604 bj = bkok->j_dual.view_device(); 605 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 606 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 607 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 608 ca = ca_dual.view_device(); 609 ci = ci_dual.view_device(); 610 cj = cj_dual.view_device(); 611 612 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 613 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 614 PetscInt i = t.league_rank(); /* row i */ 615 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 616 617 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 618 ci(i) = coffset; 619 if (i == m-1) ci(m) = ai(m) + bi(m); 620 }); 621 622 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 623 if (k < alen) { 624 ca(coffset+k) = aa(ai(i)+k); 625 cj(coffset+k) = aj(ai(i)+k); 626 } else { 627 ca(coffset+k) = ba(bi(i)+k-alen); 628 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 629 } 630 }); 631 }); 632 ca_dual.modify_device(); 633 ci_dual.modify_device(); 634 cj_dual.modify_device(); 635 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 636 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 637 } else if (reuse == MAT_REUSE_MATRIX) { 638 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 639 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 640 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 641 ca = ckok->a_dual.view_device(); 642 ci = ckok->i_dual.view_device(); 643 644 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 645 PetscInt i = t.league_rank(); /* row i */ 646 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 647 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 648 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 649 else ca(ci(i)+k) = ba(bi(i)+k-alen); 650 }); 651 }); 652 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 653 } 654 PetscFunctionReturn(0); 655 } 656 657 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 658 { 659 PetscFunctionBegin; 660 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 661 PetscFunctionReturn(0); 662 } 663 664 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 665 { 666 PetscErrorCode ierr; 667 Mat_Product *product = C->product; 668 Mat A,B; 669 bool transA,transB; /* use bool, since KK needs this type */ 670 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 671 Mat_SeqAIJ *c; 672 MatProductData_SeqAIJKokkos *pdata; 673 KokkosCsrMatrix *csrmatA,*csrmatB; 674 675 PetscFunctionBegin; 676 MatCheckProduct(C,1); 677 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 678 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 679 680 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 681 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 682 PetscFunctionReturn(0); 683 } 684 685 switch (product->type) { 686 case MATPRODUCT_AB: transA = false; transB = false; break; 687 case MATPRODUCT_AtB: transA = true; transB = false; break; 688 case MATPRODUCT_ABt: transA = false; transB = true; break; 689 default: 690 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 691 } 692 693 A = product->A; 694 B = product->B; 695 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 696 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 697 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 698 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 699 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 700 701 if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 702 703 csrmatA = &akok->csrmat; 704 csrmatB = &bkok->csrmat; 705 706 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 707 if (transA) { 708 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 709 transA = false; 710 } 711 712 if (transB) { 713 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 714 transB = false; 715 } 716 717 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 718 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 719 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 720 /* shorter version of MatAssemblyEnd_SeqAIJ */ 721 c = (Mat_SeqAIJ*)C->data; 722 ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 723 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 724 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 725 c->reallocs = 0; 726 C->info.mallocs = 0; 727 C->info.nz_unneeded = 0; 728 C->assembled = C->was_assembled = PETSC_TRUE; 729 C->num_ass++; 730 PetscFunctionReturn(0); 731 } 732 733 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 734 { 735 PetscErrorCode ierr; 736 Mat_Product *product = C->product; 737 MatProductType ptype; 738 Mat A,B; 739 bool transA,transB; 740 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 741 MatProductData_SeqAIJKokkos *pdata; 742 MPI_Comm comm; 743 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 744 745 PetscFunctionBegin; 746 MatCheckProduct(C,1); 747 ierr = PetscObjectGetComm((PetscObject)C,&comm); 748 if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 749 A = product->A; 750 B = product->B; 751 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 752 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 753 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 754 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 755 csrmatA = &akok->csrmat; 756 csrmatB = &bkok->csrmat; 757 758 ptype = product->type; 759 switch (ptype) { 760 case MATPRODUCT_AB: transA = false; transB = false; break; 761 case MATPRODUCT_AtB: transA = true; transB = false; break; 762 case MATPRODUCT_ABt: transA = false; transB = true; break; 763 default: 764 SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 765 } 766 767 product->data = pdata = new MatProductData_SeqAIJKokkos(); 768 pdata->kh.set_team_work_size(16); 769 pdata->kh.set_dynamic_scheduling(true); 770 pdata->reusesym = product->api_user; 771 772 /* TODO: add command line options to select spgemm algorithms */ 773 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 774 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 775 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 776 */ 777 pdata->kh.create_spgemm_handle(spgemm_alg); 778 779 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 780 if (transA) { 781 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 782 transA = false; 783 } 784 785 if (transB) { 786 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 787 transB = false; 788 } 789 790 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 791 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 792 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 793 calling new Mat_SeqAIJKokkos(). 794 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 795 */ 796 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 797 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 798 799 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 800 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 801 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 802 PetscFunctionReturn(0); 803 } 804 805 /* handles sparse matrix matrix ops */ 806 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 807 { 808 PetscErrorCode ierr; 809 Mat_Product *product = mat->product; 810 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 811 812 PetscFunctionBegin; 813 MatCheckProduct(mat,1); 814 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 815 if (product->type == MATPRODUCT_ABC) { 816 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 817 } 818 if (Biskok && Ciskok) { 819 switch (product->type) { 820 case MATPRODUCT_AB: 821 case MATPRODUCT_AtB: 822 case MATPRODUCT_ABt: 823 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 824 break; 825 case MATPRODUCT_PtAP: 826 case MATPRODUCT_RARt: 827 case MATPRODUCT_ABC: 828 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 829 break; 830 default: 831 break; 832 } 833 } else { /* fallback for AIJ */ 834 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 835 } 836 PetscFunctionReturn(0); 837 } 838 839 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 840 { 841 PetscErrorCode ierr; 842 Mat_SeqAIJKokkos *aijkok; 843 844 PetscFunctionBegin; 845 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 846 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 847 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 848 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 849 ierr = WaitForKokkos();CHKERRQ(ierr); 850 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 851 PetscFunctionReturn(0); 852 } 853 854 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 855 { 856 PetscErrorCode ierr; 857 Mat_SeqAIJKokkos *aijkok; 858 859 PetscFunctionBegin; 860 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 861 if (aijkok) { /* Only zero the device if data is already there */ 862 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 863 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 864 } else { /* Might be preallocated but not assembled */ 865 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 866 } 867 PetscFunctionReturn(0); 868 } 869 870 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 871 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 872 { 873 PetscErrorCode ierr; 874 Mat_SeqAIJKokkos *aijkok; 875 876 PetscFunctionBegin; 877 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 878 PetscValidPointer(kv,2); 879 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 880 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 881 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 882 *kv = aijkok->a_dual.view_device(); 883 PetscFunctionReturn(0); 884 } 885 886 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 887 { 888 PetscFunctionBegin; 889 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 890 PetscValidPointer(kv,2); 891 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 892 PetscFunctionReturn(0); 893 } 894 895 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 896 { 897 PetscErrorCode ierr; 898 Mat_SeqAIJKokkos *aijkok; 899 900 PetscFunctionBegin; 901 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 902 PetscValidPointer(kv,2); 903 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 904 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 905 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 906 *kv = aijkok->a_dual.view_device(); 907 PetscFunctionReturn(0); 908 } 909 910 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 911 { 912 PetscErrorCode ierr; 913 914 PetscFunctionBegin; 915 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 916 PetscValidPointer(kv,2); 917 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 918 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 919 PetscFunctionReturn(0); 920 } 921 922 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 923 { 924 Mat_SeqAIJKokkos *aijkok; 925 926 PetscFunctionBegin; 927 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 928 PetscValidPointer(kv,2); 929 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 930 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 931 *kv = aijkok->a_dual.view_device(); 932 PetscFunctionReturn(0); 933 } 934 935 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 936 { 937 PetscErrorCode ierr; 938 939 PetscFunctionBegin; 940 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 941 PetscValidPointer(kv,2); 942 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 943 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 944 PetscFunctionReturn(0); 945 } 946 947 /* Computes Y += alpha X */ 948 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 949 { 950 PetscErrorCode ierr; 951 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 952 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 953 ConstMatScalarKokkosView Xa; 954 MatScalarKokkosView Ya; 955 956 PetscFunctionBegin; 957 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 958 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 959 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 960 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 961 962 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 963 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 964 PetscBool e; 965 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 966 if (e) { 967 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 968 if (e) pattern = SAME_NONZERO_PATTERN; 969 } 970 } 971 972 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 973 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 974 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 975 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 976 */ 977 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 978 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 979 Xa = xkok->a_dual.view_device(); 980 Ya = ykok->a_dual.view_device(); 981 982 if (pattern == SAME_NONZERO_PATTERN) { 983 KokkosBlas::axpy(alpha,Xa,Ya); 984 ierr = MatSeqAIJKokkosModifyDevice(Y); 985 } else if (pattern == SUBSET_NONZERO_PATTERN) { 986 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 987 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 988 989 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 990 PetscInt i = t.league_rank(); /* row i */ 991 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 992 PetscInt p,q = Yi(i); 993 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 994 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 995 if (Xj(p) == Yj(q)) { /* Found it */ 996 Ya(q) += alpha * Xa(p); 997 q++; 998 } else { 999 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1000 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1001 */ 1002 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1003 } 1004 } 1005 }); 1006 }); 1007 ierr = MatSeqAIJKokkosModifyDevice(Y); 1008 } else { /* different nonzero patterns */ 1009 Mat Z; 1010 KokkosCsrMatrix zcsr; 1011 KernelHandle kh; 1012 kh.create_spadd_handle(false); 1013 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1014 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1015 zkok = new Mat_SeqAIJKokkos(zcsr); 1016 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1017 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1018 kh.destroy_spadd_handle(); 1019 } 1020 1021 PetscFunctionReturn(0); 1022 } 1023 1024 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 1025 { 1026 PetscErrorCode ierr; 1027 MPI_Comm comm; 1028 PetscInt *i,*j,*perm,*jmap; 1029 PetscInt k,M,N,p,q,row,start,end,nnz,nneg; 1030 PetscBool has_repeats = PETSC_FALSE; 1031 PetscInt *Ai,*Aj; 1032 PetscScalar *Aa; 1033 Mat_SeqAIJKokkos *akok; 1034 Mat_SeqAIJ *aseq; 1035 MatRowMapKokkosViewHost perm_h("perm",n),jmap_h; 1036 Mat newmat; 1037 1038 PetscFunctionBegin; 1039 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 1040 ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr); 1041 ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr); 1042 ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */ 1043 ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr); 1044 perm = perm_h.data(); 1045 for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */ 1046 if (j[k] < 0) i[k] = -1; 1047 perm[k] = k; 1048 } 1049 1050 /* Sort by row */ 1051 ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr); 1052 for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */ 1053 nneg = k; 1054 Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */ 1055 nnz = 0; /* Total number of unique nonzeros to be counted */ 1056 jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */ 1057 jmap++; /* Inc jmap by 1 for convinience */ 1058 1059 ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */ 1060 ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */ 1061 1062 /* In each row, sort by column, then unique column indices to get row length */ 1063 Ai++; /* Inc by 1 for convinience */ 1064 q = 0; /* q-th unique nonzero, with q starting from 0 */ 1065 while (k<n) { 1066 row = i[k]; 1067 start = k; /* [start,end) indices for this row */ 1068 while (k<n && i[k] == row) k++; 1069 end = k; 1070 ierr = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr); 1071 /* Find number of unique col entries in this row */ 1072 Aj[q] = j[start]; /* Log the first nonzero in this row */ 1073 jmap[q] = 1; /* Number of repeats of this nozero entry */ 1074 Ai[row] = 1; 1075 nnz++; 1076 1077 for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */ 1078 if (j[p] != j[p-1]) { /* Meet a new nonzero */ 1079 q++; 1080 jmap[q] = 1; 1081 Aj[q] = j[p]; 1082 Ai[row]++; 1083 nnz++; 1084 } else { 1085 jmap[q]++; 1086 has_repeats = PETSC_TRUE; 1087 } 1088 } 1089 q++; /* Move to next row and thus next unique nonzero */ 1090 } 1091 ierr = PetscFree2(i,j);CHKERRQ(ierr); 1092 1093 Ai--; /* Back to the beginning of Ai[] */ 1094 for (k=0; k<M; k++) Ai[k+1] += Ai[k]; 1095 jmap--; /* Back to the beginning of jmap[] */ 1096 jmap[0] = 0; 1097 if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */ 1098 for (k=0; k<nnz; k++) jmap[k+1] += jmap[k]; 1099 Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */ 1100 } 1101 1102 if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */ 1103 PetscInt *Aj_new; 1104 ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr); 1105 ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr); 1106 ierr = PetscFree(Aj);CHKERRQ(ierr); 1107 Aj = Aj_new; 1108 } 1109 1110 ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr); 1111 ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr); 1112 aseq = static_cast<Mat_SeqAIJ*>(newmat->data); 1113 aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */ 1114 aseq->free_a = aseq->free_ij = PETSC_TRUE; 1115 1116 ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 1117 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 1118 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1119 1120 if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */ 1121 MatRowMapKokkosViewHost newperm_h("perm",n-nneg); 1122 Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n))); 1123 perm_h = newperm_h; 1124 } 1125 1126 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1127 akok->SetUpCOO(n,has_repeats,jmap_h,perm_h); 1128 PetscFunctionReturn(0); 1129 } 1130 1131 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1132 { 1133 PetscErrorCode ierr; 1134 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1135 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1136 PetscInt nz = aseq->nz; 1137 const MatRowMapKokkosView& jmap = akok->jmap_d; 1138 const MatRowMapKokkosView& perm = akok->perm_d; 1139 MatScalarKokkosView Aa; 1140 ConstMatScalarKokkosView kv; 1141 PetscMemType memtype; 1142 1143 PetscFunctionBegin; 1144 if (!v) { /* NULL v means an all zero array */ 1145 ierr = MatZeroEntries(A);CHKERRQ(ierr); 1146 PetscFunctionReturn(0); 1147 } 1148 1149 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 1150 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1151 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n)); 1152 } else { 1153 kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */ 1154 } 1155 1156 ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */ 1157 if (imode == INSERT_VALUES) { 1158 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1159 } 1160 1161 if (akok->coo_has_repeats) { 1162 Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) { 1163 for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k)); 1164 }); 1165 } else { 1166 Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));}); 1167 } 1168 ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr); 1169 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1170 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1171 PetscFunctionReturn(0); 1172 } 1173 1174 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1175 { 1176 PetscErrorCode ierr; 1177 1178 PetscFunctionBegin; 1179 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1180 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1181 B->offloadmask = PETSC_OFFLOAD_CPU; 1182 PetscFunctionReturn(0); 1183 } 1184 1185 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1186 { 1187 PetscErrorCode ierr; 1188 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1189 1190 PetscFunctionBegin; 1191 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1192 A->boundtocpu = PETSC_FALSE; 1193 1194 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1195 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1196 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1197 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1198 A->ops->scale = MatScale_SeqAIJKokkos; 1199 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1200 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1201 A->ops->mult = MatMult_SeqAIJKokkos; 1202 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1203 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1204 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1205 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1206 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1207 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1208 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1209 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1210 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1211 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1212 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1213 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1214 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1215 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1216 1217 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 1218 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1219 PetscFunctionReturn(0); 1220 } 1221 1222 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1223 { 1224 PetscErrorCode ierr; 1225 Mat_SeqAIJ *aseq; 1226 PetscInt i,m,n; 1227 1228 PetscFunctionBegin; 1229 if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1230 1231 m = akok->nrows(); 1232 n = akok->ncols(); 1233 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1234 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1235 1236 /* Set up data structures of A as a MATSEQAIJ */ 1237 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1238 aseq = (Mat_SeqAIJ*)(A)->data; 1239 1240 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1241 akok->j_dual.sync_host(); 1242 1243 aseq->i = akok->i_host_data(); 1244 aseq->j = akok->j_host_data(); 1245 aseq->a = akok->a_host_data(); 1246 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1247 aseq->singlemalloc = PETSC_FALSE; 1248 aseq->free_a = PETSC_FALSE; 1249 aseq->free_ij = PETSC_FALSE; 1250 aseq->nz = akok->nnz(); 1251 aseq->maxnz = aseq->nz; 1252 1253 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1254 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1255 for (i=0; i<m; i++) { 1256 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1257 } 1258 1259 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1260 akok->nonzerostate = A->nonzerostate; 1261 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1262 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1263 A->spptr = akok; 1264 PetscFunctionReturn(0); 1265 } 1266 1267 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1268 1269 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1270 */ 1271 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1272 { 1273 PetscErrorCode ierr; 1274 1275 PetscFunctionBegin; 1276 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1277 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1278 PetscFunctionReturn(0); 1279 } 1280 1281 /* --------------------------------------------------------------------------------*/ 1282 /*@C 1283 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1284 (the default parallel PETSc format). This matrix will ultimately be handled by 1285 Kokkos for calculations. For good matrix 1286 assembly performance the user should preallocate the matrix storage by setting 1287 the parameter nz (or the array nnz). By setting these parameters accurately, 1288 performance during matrix assembly can be increased by more than a factor of 50. 1289 1290 Collective 1291 1292 Input Parameters: 1293 + comm - MPI communicator, set to PETSC_COMM_SELF 1294 . m - number of rows 1295 . n - number of columns 1296 . nz - number of nonzeros per row (same for all rows) 1297 - nnz - array containing the number of nonzeros in the various rows 1298 (possibly different for each row) or NULL 1299 1300 Output Parameter: 1301 . A - the matrix 1302 1303 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1304 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1305 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1306 1307 Notes: 1308 If nnz is given then nz is ignored 1309 1310 The AIJ format (also called the Yale sparse matrix format or 1311 compressed row storage), is fully compatible with standard Fortran 77 1312 storage. That is, the stored row and column indices can begin at 1313 either one (as in Fortran) or zero. See the users' manual for details. 1314 1315 Specify the preallocated storage with either nz or nnz (not both). 1316 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1317 allocation. For large problems you MUST preallocate memory or you 1318 will get TERRIBLE performance, see the users' manual chapter on matrices. 1319 1320 By default, this format uses inodes (identical nodes) when possible, to 1321 improve numerical efficiency of matrix-vector products and solves. We 1322 search for consecutive rows with the same nonzero structure, thereby 1323 reusing matrix information to achieve increased efficiency. 1324 1325 Level: intermediate 1326 1327 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1328 @*/ 1329 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1330 { 1331 PetscErrorCode ierr; 1332 1333 PetscFunctionBegin; 1334 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1335 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1336 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1337 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1338 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1339 PetscFunctionReturn(0); 1340 } 1341 1342 typedef Kokkos::TeamPolicy<>::member_type team_member; 1343 // 1344 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 1345 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1346 // 1347 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1348 { 1349 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1350 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1351 IS isrow = b->row,isicol = b->icol; 1352 PetscErrorCode ierr; 1353 const PetscInt *r_h,*ic_h; 1354 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1355 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1356 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1357 PetscBool row_identity,col_identity; 1358 PetscInt nc, Nf, nVec=32; // should be a parameter 1359 PetscContainer container; 1360 1361 PetscFunctionBegin; 1362 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1363 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1364 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1365 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 1366 if (container) { 1367 PetscInt *pNf=NULL, nv; 1368 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 1369 Nf = (*pNf)%1000; 1370 nv = (*pNf)/1000; 1371 if (nv>0) nVec = nv; 1372 } else Nf = 1; 1373 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %" PetscInt_FMT " %" PetscInt_FMT,n,Nf); 1374 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1375 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1376 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1377 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1378 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1379 #endif 1380 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1381 { 1382 #define KOKKOS_SHARED_LEVEL 1 1383 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1384 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1385 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1386 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1387 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1388 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1389 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1390 size_t flops_h = 0.0; 1391 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1392 Kokkos::View<size_t> d_flops_k ("flops"); 1393 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1394 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1395 Kokkos::deep_copy (d_flops_k, h_flops_k); 1396 Kokkos::deep_copy (d_r_k, h_r_k); 1397 Kokkos::deep_copy (d_ic_k, h_ic_k); 1398 // Fill A --> fact 1399 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1400 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1401 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1402 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1403 // zero rows of B 1404 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1405 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1406 PetscScalar *baL = ba_d + bi_d[rowb]; 1407 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1408 /* zero (unfactored row) */ 1409 for (int j=0;j<nzbL;j++) baL[j] = 0; 1410 for (int j=0;j<nzbU;j++) baU[j] = 0; 1411 }); 1412 // copy A into B 1413 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1414 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1415 const PetscScalar *av = aa_d + ai_d[rowa]; 1416 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1417 /* load in initial (unfactored row) */ 1418 for (int j=0;j<nza;j++) { 1419 PetscInt colb = ic[ajtmp[j]]; 1420 PetscScalar vala = av[j]; 1421 if (colb == rowb) { 1422 *(ba_d + bdiag_d[rowb]) = vala; 1423 } else { 1424 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1425 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1426 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1427 for (int j=0; j<nz ; j++) { 1428 if (pbj[j] == colb) { 1429 pba[j] = vala; 1430 set++; 1431 break; 1432 } 1433 } 1434 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1435 } 1436 } 1437 }); 1438 }); 1439 Kokkos::fence(); 1440 1441 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1442 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1443 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1444 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1445 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1446 const PetscInt start = field*nloc, end = start + nloc; 1447 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1448 // A22 panel update for each row A(1,:) and col A(:,1) 1449 for (int ii=start; ii<end-1; ii++) { 1450 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1451 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1452 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1453 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1454 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1455 PetscInt kIdx = j*Ni + field_block_idx; 1456 if (kIdx >= nzUi) /* void */ ; 1457 else { 1458 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1459 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1460 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1461 size_t st_idx; 1462 // find and do L(k,i) = A(:k,i) / A(i,i) 1463 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1464 // get column, there has got to be a better way 1465 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1466 if (pjL[j] == ii) { 1467 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1468 idx = j; // output 1469 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1470 } 1471 }, st_idx); 1472 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1473 #if defined(PETSC_USE_DEBUG) 1474 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1475 #endif 1476 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1477 // U(i+1,:end) 1478 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1479 PetscScalar Uij = baUi[uiIdx]; 1480 PetscInt col = bjUi[uiIdx]; 1481 if (col==myk) { 1482 // A_kk = A_kk - L_ki * U_ij(k) 1483 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1484 *Akkv = *Akkv - L_ki() * Uij; // UiK 1485 } else { 1486 PetscScalar *start, *end, *pAkjv=NULL; 1487 PetscInt high, low; 1488 const PetscInt *startj; 1489 if (col<myk) { // L 1490 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1491 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1492 start = pLki+1; // start at pLki+1, A22(myk,1) 1493 startj= bj_d + bi_d[myk] + idx; 1494 end = ba_d + bi_d[myk+1]; 1495 } else { 1496 PetscInt idx = bdiag_d[myk+1]+1; 1497 start = ba_d + idx; 1498 startj= bj_d + idx; 1499 end = ba_d + bdiag_d[myk]; 1500 } 1501 // search for 'col', use bisection search - TODO 1502 low = 0; 1503 high = (PetscInt)(end-start); 1504 while (high-low > 5) { 1505 int t = (low+high)/2; 1506 if (startj[t] > col) high = t; 1507 else low = t; 1508 } 1509 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1510 if (startj[pAkjv-start] == col) break; 1511 } 1512 #if defined(PETSC_USE_DEBUG) 1513 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1514 #endif 1515 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1516 } 1517 }); 1518 } 1519 }); 1520 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1521 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1522 } /* endof for (i=0; i<n; i++) { */ 1523 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1524 }); 1525 Kokkos::fence(); 1526 Kokkos::deep_copy (h_flops_k, d_flops_k); 1527 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1528 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1529 #elif defined(PETSC_USE_LOG) 1530 ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1531 #endif 1532 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1533 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1534 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1535 /* Invert diagonal for simpler triangular solves */ 1536 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1537 int i = start + outer_index*Ni + lg_rank%Ni; 1538 if (i < end) { 1539 PetscScalar *pv = ba_d + bdiag_d[i]; 1540 *pv = 1.0/(*pv); 1541 } 1542 }); 1543 }); 1544 } 1545 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1546 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1547 #endif 1548 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1549 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1550 1551 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1552 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1553 if (b->inode.size) { 1554 B->ops->solve = MatSolve_SeqAIJ_Inode; 1555 } else if (row_identity && col_identity) { 1556 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1557 } else { 1558 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1559 } 1560 B->offloadmask = PETSC_OFFLOAD_GPU; 1561 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1562 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1563 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1564 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1565 B->ops->matsolve = MatMatSolve_SeqAIJ; 1566 B->assembled = PETSC_TRUE; 1567 B->preallocated = PETSC_TRUE; 1568 1569 PetscFunctionReturn(0); 1570 } 1571 1572 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1573 { 1574 PetscErrorCode ierr; 1575 1576 PetscFunctionBegin; 1577 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1578 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1579 PetscFunctionReturn(0); 1580 } 1581 1582 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1583 { 1584 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1585 1586 PetscFunctionBegin; 1587 if (!factors->sptrsv_symbolic_completed) { 1588 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1589 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1590 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1591 } 1592 PetscFunctionReturn(0); 1593 } 1594 1595 /* Check if we need to update factors etc for transpose solve */ 1596 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1597 { 1598 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1599 MatColIdxType n = A->rmap->n; 1600 1601 PetscFunctionBegin; 1602 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1603 /* Update L^T and do sptrsv symbolic */ 1604 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1605 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1606 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1607 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1608 1609 KokkosKernels::Impl::transpose_matrix< 1610 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1611 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1612 MatRowMapKokkosView,DefaultExecutionSpace>( 1613 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1614 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1615 1616 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1617 We have to sort the indices, until KK provides finer control options. 1618 */ 1619 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1620 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1621 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1622 1623 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1624 1625 /* Update U^T and do sptrsv symbolic */ 1626 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1627 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1628 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1629 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1630 1631 KokkosKernels::Impl::transpose_matrix< 1632 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1633 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1634 MatRowMapKokkosView,DefaultExecutionSpace>( 1635 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1636 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1637 1638 /* Sort indices. See comments above */ 1639 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1640 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1641 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1642 1643 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1644 factors->transpose_updated = PETSC_TRUE; 1645 } 1646 PetscFunctionReturn(0); 1647 } 1648 1649 /* Solve Ax = b, with A = LU */ 1650 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1651 { 1652 PetscErrorCode ierr; 1653 ConstPetscScalarKokkosView bv; 1654 PetscScalarKokkosView xv; 1655 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1656 1657 PetscFunctionBegin; 1658 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1659 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1660 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1661 /* Solve L tmpv = b */ 1662 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1663 /* Solve Ux = tmpv */ 1664 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1665 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1666 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1667 PetscFunctionReturn(0); 1668 } 1669 1670 /* Solve A^T x = b, where A^T = U^T L^T */ 1671 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1672 { 1673 PetscErrorCode ierr; 1674 ConstPetscScalarKokkosView bv; 1675 PetscScalarKokkosView xv; 1676 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1677 1678 PetscFunctionBegin; 1679 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1680 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1681 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1682 /* Solve U^T tmpv = b */ 1683 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1684 1685 /* Solve L^T x = tmpv */ 1686 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1687 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1688 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1689 PetscFunctionReturn(0); 1690 } 1691 1692 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1693 { 1694 PetscErrorCode ierr; 1695 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1696 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1697 PetscInt fill_lev = info->levels; 1698 1699 PetscFunctionBegin; 1700 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1701 1702 auto a_d = aijkok->a_dual.view_device(); 1703 auto i_d = aijkok->i_dual.view_device(); 1704 auto j_d = aijkok->j_dual.view_device(); 1705 1706 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1707 1708 B->assembled = PETSC_TRUE; 1709 B->preallocated = PETSC_TRUE; 1710 B->ops->solve = MatSolve_SeqAIJKokkos; 1711 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1712 B->ops->matsolve = NULL; 1713 B->ops->matsolvetranspose = NULL; 1714 B->offloadmask = PETSC_OFFLOAD_GPU; 1715 1716 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1717 factors->transpose_updated = PETSC_FALSE; 1718 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1719 PetscFunctionReturn(0); 1720 } 1721 1722 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1723 { 1724 PetscErrorCode ierr; 1725 Mat_SeqAIJKokkos *aijkok; 1726 Mat_SeqAIJ *b; 1727 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1728 PetscInt fill_lev = info->levels; 1729 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1730 PetscInt n = A->rmap->n; 1731 1732 PetscFunctionBegin; 1733 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1734 /* Rebuild factors */ 1735 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1736 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1737 1738 /* Create a spiluk handle and then do symbolic factorization */ 1739 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1740 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1741 1742 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1743 1744 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1745 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1746 Kokkos::realloc(factors->iU_d,n+1); 1747 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1748 1749 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1750 auto i_d = aijkok->i_dual.view_device(); 1751 auto j_d = aijkok->j_dual.view_device(); 1752 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1753 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1754 1755 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1756 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1757 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1758 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1759 1760 /* TODO: add options to select sptrsv algorithms */ 1761 /* Create sptrsv handles for L, U and their transpose */ 1762 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1763 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1764 #else 1765 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1766 #endif 1767 1768 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1769 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1770 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1771 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1772 1773 /* Fill fields of the factor matrix B */ 1774 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1775 b = (Mat_SeqAIJ*)B->data; 1776 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1777 B->info.fill_ratio_given = info->fill; 1778 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1779 1780 B->offloadmask = PETSC_OFFLOAD_GPU; 1781 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1782 PetscFunctionReturn(0); 1783 } 1784 1785 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1786 { 1787 PetscErrorCode ierr; 1788 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1789 const PetscInt nrows = A->rmap->n; 1790 1791 PetscFunctionBegin; 1792 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1793 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1794 // move B data into Kokkos 1795 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1796 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1797 { 1798 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1799 if (!baijkok->diag_d) { 1800 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1801 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1802 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1803 } 1804 } 1805 PetscFunctionReturn(0); 1806 } 1807 1808 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1809 { 1810 PetscFunctionBegin; 1811 *type = MATSOLVERKOKKOS; 1812 PetscFunctionReturn(0); 1813 } 1814 1815 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1816 { 1817 PetscFunctionBegin; 1818 *type = MATSOLVERKOKKOSDEVICE; 1819 PetscFunctionReturn(0); 1820 } 1821 1822 /*MC 1823 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1824 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1825 1826 Level: beginner 1827 1828 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1829 M*/ 1830 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1831 { 1832 PetscErrorCode ierr; 1833 PetscInt n = A->rmap->n; 1834 1835 PetscFunctionBegin; 1836 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1837 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1838 (*B)->factortype = ftype; 1839 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1840 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1841 1842 if (ftype == MAT_FACTOR_LU) { 1843 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1844 (*B)->canuseordering = PETSC_TRUE; 1845 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1846 } else if (ftype == MAT_FACTOR_ILU) { 1847 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1848 (*B)->canuseordering = PETSC_FALSE; 1849 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1850 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1851 1852 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1853 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1854 PetscFunctionReturn(0); 1855 } 1856 1857 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1858 { 1859 PetscErrorCode ierr; 1860 PetscInt n = A->rmap->n; 1861 1862 PetscFunctionBegin; 1863 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1864 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1865 (*B)->factortype = ftype; 1866 (*B)->canuseordering = PETSC_TRUE; 1867 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1868 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1869 1870 if (ftype == MAT_FACTOR_LU) { 1871 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1872 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1873 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1874 1875 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1876 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1877 PetscFunctionReturn(0); 1878 } 1879 1880 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1881 { 1882 PetscErrorCode ierr; 1883 1884 PetscFunctionBegin; 1885 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1886 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1887 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1888 PetscFunctionReturn(0); 1889 } 1890 1891 /* Utility to print out a KokkosCsrMatrix for debugging */ 1892 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1893 { 1894 PetscErrorCode ierr; 1895 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1896 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1897 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1898 const PetscInt *i = iv.data(); 1899 const PetscInt *j = jv.data(); 1900 const PetscScalar *a = av.data(); 1901 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1902 1903 PetscFunctionBegin; 1904 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1905 for (PetscInt k=0; k<m; k++) { 1906 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1907 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1908 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1909 } 1910 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1911 } 1912 PetscFunctionReturn(0); 1913 } 1914