1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petscsystypes.h> 5 #include <petscerror.h> 6 7 #include <Kokkos_Core.hpp> 8 #include <KokkosBlas.hpp> 9 #include <KokkosKernels_Sorting.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/aij.h> 18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64 if (aijkok->a_dual.need_sync_device()) { 65 aijkok->a_dual.sync_device(); 66 aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 67 aijkok->hermitian_updated = PETSC_FALSE; 68 } 69 PetscFunctionReturn(0); 70 } 71 72 /* Mark the CSR data on device as modified */ 73 static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 74 { 75 PetscErrorCode ierr; 76 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 77 78 PetscFunctionBegin; 79 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 80 aijkok->a_dual.clear_sync_state(); 81 aijkok->a_dual.modify_device(); 82 aijkok->transpose_updated = PETSC_FALSE; 83 aijkok->hermitian_updated = PETSC_FALSE; 84 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 85 PetscFunctionReturn(0); 86 } 87 88 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 89 { 90 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 91 92 PetscFunctionBegin; 93 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 94 /* We do not expect one needs factors on host */ 95 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 96 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 97 aijkok->a_dual.sync_host(); 98 PetscFunctionReturn(0); 99 } 100 101 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 102 { 103 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 104 105 PetscFunctionBegin; 106 if (aijkok) { 107 aijkok->a_dual.sync_host(); 108 *array = aijkok->a_dual.view_host().data(); 109 } else { /* Happens when calling MatSetValues on a newly created matrix */ 110 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 111 } 112 PetscFunctionReturn(0); 113 } 114 115 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 116 { 117 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 118 119 PetscFunctionBegin; 120 if (aijkok) aijkok->a_dual.modify_host(); 121 PetscFunctionReturn(0); 122 } 123 124 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 127 128 PetscFunctionBegin; 129 aijkok->a_dual.sync_host(); 130 *array = aijkok->a_dual.view_host().data(); 131 PetscFunctionReturn(0); 132 } 133 134 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 135 { 136 PetscFunctionBegin; 137 *array = NULL; 138 PetscFunctionReturn(0); 139 } 140 141 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 142 { 143 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 144 145 PetscFunctionBegin; 146 *array = aijkok->a_dual.view_host().data(); 147 PetscFunctionReturn(0); 148 } 149 150 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 151 { 152 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 153 154 PetscFunctionBegin; 155 aijkok->a_dual.clear_sync_state(); 156 aijkok->a_dual.modify_host(); 157 PetscFunctionReturn(0); 158 } 159 160 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 161 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 162 { 163 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 165 166 PetscFunctionBegin; 167 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 168 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 169 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 170 PetscFunctionReturn(0); 171 } 172 173 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 174 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 175 { 176 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 177 178 PetscFunctionBegin; 179 if (aijkok && aijkok->device_mat_d.data()) { 180 *d_mat = aijkok->device_mat_d.data(); 181 } else { 182 PetscErrorCode ierr; 183 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 184 *d_mat = NULL; 185 } 186 PetscFunctionReturn(0); 187 } 188 189 /* Generate the transpose on device and cache it internally */ 190 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 191 { 192 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 193 194 PetscFunctionBegin; 195 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 196 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 197 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 198 CHKERRCXX(aijkok->a_dual.sync_device()); 199 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 200 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 201 aijkok->transpose_updated = PETSC_TRUE; 202 } 203 *csrmatT = &aijkok->csrmatT; 204 PetscFunctionReturn(0); 205 } 206 207 /* Generate the Hermitian on device and cache it internally */ 208 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 209 { 210 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 211 212 PetscFunctionBegin; 213 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 214 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 215 CHKERRCXX(aijkok->a_dual.sync_device()); 216 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 217 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 218 #if defined(PETSC_USE_COMPLEX) 219 const auto& a = aijkok->csrmatH.values; 220 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 221 #endif 222 aijkok->hermitian_updated = PETSC_TRUE; 223 } 224 *csrmatH = &aijkok->csrmatH; 225 PetscFunctionReturn(0); 226 } 227 228 /* y = A x */ 229 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 230 { 231 PetscErrorCode ierr; 232 Mat_SeqAIJKokkos *aijkok; 233 ConstPetscScalarKokkosView xv; 234 PetscScalarKokkosView yv; 235 236 PetscFunctionBegin; 237 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 238 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 239 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 240 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 241 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 242 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 243 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 244 ierr = WaitForKokkos();CHKERRQ(ierr); 245 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 246 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 247 PetscFunctionReturn(0); 248 } 249 250 /* y = A^T x */ 251 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 252 { 253 PetscErrorCode ierr; 254 Mat_SeqAIJKokkos *aijkok; 255 const char *mode; 256 ConstPetscScalarKokkosView xv; 257 PetscScalarKokkosView yv; 258 KokkosCsrMatrix *csrmat; 259 260 PetscFunctionBegin; 261 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 262 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 263 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 264 if (A->form_explicit_transpose) { 265 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 266 mode = "N"; 267 } else { 268 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 269 csrmat = &aijkok->csrmat; 270 mode = "T"; 271 } 272 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 273 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 274 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 275 ierr = WaitForKokkos();CHKERRQ(ierr); 276 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 277 PetscFunctionReturn(0); 278 } 279 280 /* y = A^H x */ 281 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 282 { 283 PetscErrorCode ierr; 284 Mat_SeqAIJKokkos *aijkok; 285 const char *mode; 286 ConstPetscScalarKokkosView xv; 287 PetscScalarKokkosView yv; 288 KokkosCsrMatrix *csrmat; 289 290 PetscFunctionBegin; 291 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 292 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 293 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 294 if (A->form_explicit_transpose) { 295 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 296 mode = "N"; 297 } else { 298 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 299 csrmat = &aijkok->csrmat; 300 mode = "C"; 301 } 302 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 303 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 304 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 305 ierr = WaitForKokkos();CHKERRQ(ierr); 306 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 307 PetscFunctionReturn(0); 308 } 309 310 /* z = A x + y */ 311 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 312 { 313 PetscErrorCode ierr; 314 Mat_SeqAIJKokkos *aijkok; 315 ConstPetscScalarKokkosView xv,yv; 316 PetscScalarKokkosView zv; 317 318 PetscFunctionBegin; 319 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 320 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 321 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 322 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 323 if (zz != yy) Kokkos::deep_copy(zv,yv); 324 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 325 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 326 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 327 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 328 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 329 ierr = WaitForKokkos();CHKERRQ(ierr); 330 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 331 PetscFunctionReturn(0); 332 } 333 334 /* z = A^T x + y */ 335 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 336 { 337 PetscErrorCode ierr; 338 Mat_SeqAIJKokkos *aijkok; 339 const char *mode; 340 ConstPetscScalarKokkosView xv,yv; 341 PetscScalarKokkosView zv; 342 KokkosCsrMatrix *csrmat; 343 344 PetscFunctionBegin; 345 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 346 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 347 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 348 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 349 if (zz != yy) Kokkos::deep_copy(zv,yv); 350 if (A->form_explicit_transpose) { 351 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 352 mode = "N"; 353 } else { 354 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 355 csrmat = &aijkok->csrmat; 356 mode = "T"; 357 } 358 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 359 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 360 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 361 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 362 ierr = WaitForKokkos();CHKERRQ(ierr); 363 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 364 PetscFunctionReturn(0); 365 } 366 367 /* z = A^H x + y */ 368 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 369 { 370 PetscErrorCode ierr; 371 Mat_SeqAIJKokkos *aijkok; 372 const char *mode; 373 ConstPetscScalarKokkosView xv,yv; 374 PetscScalarKokkosView zv; 375 KokkosCsrMatrix *csrmat; 376 377 PetscFunctionBegin; 378 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 379 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 380 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 381 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 382 if (zz != yy) Kokkos::deep_copy(zv,yv); 383 if (A->form_explicit_transpose) { 384 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 385 mode = "N"; 386 } else { 387 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 388 csrmat = &aijkok->csrmat; 389 mode = "C"; 390 } 391 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 392 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 393 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 394 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 395 ierr = WaitForKokkos();CHKERRQ(ierr); 396 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 397 PetscFunctionReturn(0); 398 } 399 400 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 401 { 402 PetscErrorCode ierr; 403 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 404 405 PetscFunctionBegin; 406 switch (op) { 407 case MAT_FORM_EXPLICIT_TRANSPOSE: 408 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 409 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 410 A->form_explicit_transpose = flg; 411 break; 412 default: 413 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 414 break; 415 } 416 PetscFunctionReturn(0); 417 } 418 419 /* Depending on reuse, either build a new mat, or use the existing mat */ 420 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 421 { 422 PetscErrorCode ierr; 423 Mat_SeqAIJ *aseq; 424 425 PetscFunctionBegin; 426 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 427 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 428 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 429 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 430 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 431 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 432 if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 433 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 434 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 435 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 436 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 437 aseq = static_cast<Mat_SeqAIJ*>(A->data); 438 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 439 if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 440 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 441 } 442 } 443 PetscFunctionReturn(0); 444 } 445 446 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 447 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 448 */ 449 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 450 { 451 PetscErrorCode ierr; 452 Mat_SeqAIJ *bseq; 453 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 454 Mat mat; 455 456 PetscFunctionBegin; 457 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 458 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 459 mat = *B; 460 if (A->assembled) { 461 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 462 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 463 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 464 /* Now copy values to B if needed */ 465 if (dupOption == MAT_COPY_VALUES) { 466 if (akok->a_dual.need_sync_device()) { 467 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 468 bkok->a_dual.modify_host(); 469 } else { /* If device has the latest data, we only copy data on device */ 470 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 471 bkok->a_dual.modify_device(); 472 } 473 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 474 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 475 bkok->a_dual.modify_host(); 476 } 477 mat->spptr = bkok; 478 } 479 480 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 481 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 482 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 483 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 484 PetscFunctionReturn(0); 485 } 486 487 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 488 { 489 PetscErrorCode ierr; 490 Mat_SeqAIJKokkos *aijkok; 491 492 PetscFunctionBegin; 493 if (A->factortype == MAT_FACTOR_NONE) { 494 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 495 if (aijkok) { 496 if (aijkok->device_mat_d.data()) { 497 delete aijkok->colmap_d; 498 delete aijkok->i_uncompressed_d; 499 } 500 if (aijkok->diag_d) delete aijkok->diag_d; 501 } 502 delete aijkok; 503 } else { 504 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 505 } 506 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 507 A->spptr = NULL; 508 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 509 PetscFunctionReturn(0); 510 } 511 512 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 513 { 514 PetscErrorCode ierr; 515 516 PetscFunctionBegin; 517 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 518 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 519 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 520 PetscFunctionReturn(0); 521 } 522 523 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 524 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 525 { 526 PetscErrorCode ierr; 527 Mat_SeqAIJ *a,*b; 528 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 529 MatScalarKokkosView aa,ba,ca; 530 MatRowMapKokkosView ai,bi,ci; 531 MatColIdxKokkosView aj,bj,cj; 532 PetscInt m,n,nnz,aN; 533 534 PetscFunctionBegin; 535 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 536 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 537 PetscValidPointer(C,4); 538 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 539 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 540 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 541 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 542 543 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 544 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 545 a = static_cast<Mat_SeqAIJ*>(A->data); 546 b = static_cast<Mat_SeqAIJ*>(B->data); 547 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 548 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 549 aa = akok->a_dual.view_device(); 550 ai = akok->i_dual.view_device(); 551 ba = bkok->a_dual.view_device(); 552 bi = bkok->i_dual.view_device(); 553 m = A->rmap->n; /* M, N and nnz of C */ 554 n = A->cmap->n + B->cmap->n; 555 nnz = a->nz + b->nz; 556 aN = A->cmap->n; /* N of A */ 557 if (reuse == MAT_INITIAL_MATRIX) { 558 aj = akok->j_dual.view_device(); 559 bj = bkok->j_dual.view_device(); 560 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 561 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 562 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 563 ca = ca_dual.view_device(); 564 ci = ci_dual.view_device(); 565 cj = cj_dual.view_device(); 566 567 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 568 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 569 PetscInt i = t.league_rank(); /* row i */ 570 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 571 572 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 573 ci(i) = coffset; 574 if (i == m-1) ci(m) = ai(m) + bi(m); 575 }); 576 577 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 578 if (k < alen) { 579 ca(coffset+k) = aa(ai(i)+k); 580 cj(coffset+k) = aj(ai(i)+k); 581 } else { 582 ca(coffset+k) = ba(bi(i)+k-alen); 583 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 584 } 585 }); 586 }); 587 ca_dual.modify_device(); 588 ci_dual.modify_device(); 589 cj_dual.modify_device(); 590 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 591 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 592 } else if (reuse == MAT_REUSE_MATRIX) { 593 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 594 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 595 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 596 ca = ckok->a_dual.view_device(); 597 ci = ckok->i_dual.view_device(); 598 599 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 600 PetscInt i = t.league_rank(); /* row i */ 601 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 602 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 603 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 604 else ca(ci(i)+k) = ba(bi(i)+k-alen); 605 }); 606 }); 607 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 608 } 609 PetscFunctionReturn(0); 610 } 611 612 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 613 { 614 PetscFunctionBegin; 615 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 616 PetscFunctionReturn(0); 617 } 618 619 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 620 { 621 PetscErrorCode ierr; 622 Mat_Product *product = C->product; 623 Mat A,B; 624 bool transA,transB; /* use bool, since KK needs this type */ 625 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 626 Mat_SeqAIJ *c; 627 MatProductData_SeqAIJKokkos *pdata; 628 KokkosCsrMatrix *csrmatA,*csrmatB; 629 630 PetscFunctionBegin; 631 MatCheckProduct(C,1); 632 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 633 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 634 635 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 636 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 637 PetscFunctionReturn(0); 638 } 639 640 switch (product->type) { 641 case MATPRODUCT_AB: transA = false; transB = false; break; 642 case MATPRODUCT_AtB: transA = true; transB = false; break; 643 case MATPRODUCT_ABt: transA = false; transB = true; break; 644 default: 645 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 646 } 647 648 A = product->A; 649 B = product->B; 650 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 651 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 652 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 653 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 654 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 655 656 if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 657 658 csrmatA = &akok->csrmat; 659 csrmatB = &bkok->csrmat; 660 661 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 662 if (transA) { 663 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 664 transA = false; 665 } 666 667 if (transB) { 668 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 669 transB = false; 670 } 671 672 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 673 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 674 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 675 /* shorter version of MatAssemblyEnd_SeqAIJ */ 676 c = (Mat_SeqAIJ*)C->data; 677 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 678 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 679 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 680 c->reallocs = 0; 681 C->info.mallocs = 0; 682 C->info.nz_unneeded = 0; 683 C->assembled = C->was_assembled = PETSC_TRUE; 684 C->num_ass++; 685 PetscFunctionReturn(0); 686 } 687 688 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 689 { 690 PetscErrorCode ierr; 691 Mat_Product *product = C->product; 692 MatProductType ptype; 693 Mat A,B; 694 bool transA,transB; 695 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 696 MatProductData_SeqAIJKokkos *pdata; 697 MPI_Comm comm; 698 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 699 700 PetscFunctionBegin; 701 MatCheckProduct(C,1); 702 ierr = PetscObjectGetComm((PetscObject)C,&comm); 703 if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 704 A = product->A; 705 B = product->B; 706 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 707 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 708 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 709 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 710 csrmatA = &akok->csrmat; 711 csrmatB = &bkok->csrmat; 712 713 ptype = product->type; 714 switch (ptype) { 715 case MATPRODUCT_AB: transA = false; transB = false; break; 716 case MATPRODUCT_AtB: transA = true; transB = false; break; 717 case MATPRODUCT_ABt: transA = false; transB = true; break; 718 default: 719 SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 720 } 721 722 product->data = pdata = new MatProductData_SeqAIJKokkos(); 723 pdata->kh.set_team_work_size(16); 724 pdata->kh.set_dynamic_scheduling(true); 725 pdata->reusesym = product->api_user; 726 727 /* TODO: add command line options to select spgemm algorithms */ 728 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 729 #if defined(PETSC_HAVE_CUDA) 730 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 731 /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */ 732 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE; 733 #endif 734 #endif 735 pdata->kh.create_spgemm_handle(spgemm_alg); 736 737 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 738 if (transA) { 739 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 740 transA = false; 741 } 742 743 if (transB) { 744 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 745 transB = false; 746 } 747 748 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 749 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 750 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 751 calling new Mat_SeqAIJKokkos(). 752 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 753 */ 754 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 755 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 756 757 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 758 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 759 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 760 PetscFunctionReturn(0); 761 } 762 763 /* handles sparse matrix matrix ops */ 764 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 765 { 766 PetscErrorCode ierr; 767 Mat_Product *product = mat->product; 768 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 769 770 PetscFunctionBegin; 771 MatCheckProduct(mat,1); 772 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 773 if (product->type == MATPRODUCT_ABC) { 774 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 775 } 776 if (Biskok && Ciskok) { 777 switch (product->type) { 778 case MATPRODUCT_AB: 779 case MATPRODUCT_AtB: 780 case MATPRODUCT_ABt: 781 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 782 break; 783 case MATPRODUCT_PtAP: 784 case MATPRODUCT_RARt: 785 case MATPRODUCT_ABC: 786 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 787 break; 788 default: 789 break; 790 } 791 } else { /* fallback for AIJ */ 792 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 793 } 794 PetscFunctionReturn(0); 795 } 796 797 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 798 { 799 PetscErrorCode ierr; 800 Mat_SeqAIJKokkos *aijkok; 801 802 PetscFunctionBegin; 803 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 804 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 805 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 806 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 807 ierr = WaitForKokkos();CHKERRQ(ierr); 808 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 809 PetscFunctionReturn(0); 810 } 811 812 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 813 { 814 PetscErrorCode ierr; 815 Mat_SeqAIJKokkos *aijkok; 816 817 PetscFunctionBegin; 818 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 819 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 820 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 821 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */ 822 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 823 PetscFunctionReturn(0); 824 } 825 826 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 827 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 828 { 829 PetscErrorCode ierr; 830 Mat_SeqAIJKokkos *aijkok; 831 832 PetscFunctionBegin; 833 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 834 PetscValidPointer(kv,2); 835 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 836 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 837 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 838 *kv = aijkok->a_dual.view_device(); 839 PetscFunctionReturn(0); 840 } 841 842 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 843 { 844 PetscFunctionBegin; 845 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 846 PetscValidPointer(kv,2); 847 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 848 PetscFunctionReturn(0); 849 } 850 851 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 852 { 853 PetscErrorCode ierr; 854 Mat_SeqAIJKokkos *aijkok; 855 856 PetscFunctionBegin; 857 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 858 PetscValidPointer(kv,2); 859 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 860 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 861 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 862 *kv = aijkok->a_dual.view_device(); 863 PetscFunctionReturn(0); 864 } 865 866 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 867 { 868 PetscErrorCode ierr; 869 870 PetscFunctionBegin; 871 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 872 PetscValidPointer(kv,2); 873 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 874 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 875 PetscFunctionReturn(0); 876 } 877 878 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 879 { 880 Mat_SeqAIJKokkos *aijkok; 881 882 PetscFunctionBegin; 883 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 884 PetscValidPointer(kv,2); 885 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 886 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 887 *kv = aijkok->a_dual.view_device(); 888 PetscFunctionReturn(0); 889 } 890 891 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 892 { 893 PetscErrorCode ierr; 894 895 PetscFunctionBegin; 896 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 897 PetscValidPointer(kv,2); 898 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 899 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 900 PetscFunctionReturn(0); 901 } 902 903 /* Computes Y += a*X */ 904 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 905 { 906 PetscErrorCode ierr; 907 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 908 909 PetscFunctionBegin; 910 if (X->ops->axpy != Y->ops->axpy) { 911 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 912 PetscFunctionReturn(0); 913 } 914 915 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 916 PetscBool e; 917 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 918 if (e) { 919 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 920 if (e) str = SAME_NONZERO_PATTERN; 921 } 922 } 923 924 if (str != SAME_NONZERO_PATTERN) { 925 /* TODO: do MatAXPY on device */ 926 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 927 PetscFunctionReturn(0); 928 } else { 929 ConstPetscScalarKokkosView xv; 930 PetscScalarKokkosView yv; 931 932 ierr = MatSeqAIJGetKokkosView(X,&xv);CHKERRQ(ierr); 933 ierr = MatSeqAIJGetKokkosView(Y,&yv);CHKERRQ(ierr); 934 KokkosBlas::axpy(a,xv,yv); 935 ierr = MatSeqAIJRestoreKokkosView(X,&xv);CHKERRQ(ierr); 936 ierr = MatSeqAIJRestoreKokkosView(Y,&yv);CHKERRQ(ierr); 937 } 938 PetscFunctionReturn(0); 939 } 940 941 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 942 { 943 PetscErrorCode ierr; 944 945 PetscFunctionBegin; 946 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 947 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 948 B->offloadmask = PETSC_OFFLOAD_CPU; 949 PetscFunctionReturn(0); 950 } 951 952 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 953 { 954 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 955 956 PetscFunctionBegin; 957 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 958 A->boundtocpu = PETSC_FALSE; 959 960 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 961 A->ops->destroy = MatDestroy_SeqAIJKokkos; 962 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 963 A->ops->axpy = MatAXPY_SeqAIJKokkos; 964 A->ops->scale = MatScale_SeqAIJKokkos; 965 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 966 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 967 A->ops->mult = MatMult_SeqAIJKokkos; 968 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 969 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 970 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 971 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 972 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 973 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 974 A->ops->setoption = MatSetOption_SeqAIJKokkos; 975 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 976 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 977 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 978 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 979 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 980 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 981 PetscFunctionReturn(0); 982 } 983 984 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 985 { 986 PetscErrorCode ierr; 987 Mat_SeqAIJ *aseq; 988 PetscInt i,m,n; 989 990 PetscFunctionBegin; 991 if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 992 993 m = akok->nrows(); 994 n = akok->ncols(); 995 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 996 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 997 998 /* Set up data structures of A as a MATSEQAIJ */ 999 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1000 aseq = (Mat_SeqAIJ*)(A)->data; 1001 1002 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1003 akok->j_dual.sync_host(); 1004 1005 aseq->i = akok->i_host_data(); 1006 aseq->j = akok->j_host_data(); 1007 aseq->a = akok->a_host_data(); 1008 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1009 aseq->singlemalloc = PETSC_FALSE; 1010 aseq->free_a = PETSC_FALSE; 1011 aseq->free_ij = PETSC_FALSE; 1012 aseq->nz = akok->nnz(); 1013 aseq->maxnz = aseq->nz; 1014 1015 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1016 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1017 for (i=0; i<m; i++) { 1018 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1019 } 1020 1021 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1022 akok->nonzerostate = A->nonzerostate; 1023 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1024 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1025 A->spptr = akok; 1026 PetscFunctionReturn(0); 1027 } 1028 1029 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1030 1031 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1032 */ 1033 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1034 { 1035 PetscErrorCode ierr; 1036 1037 PetscFunctionBegin; 1038 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1039 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1040 PetscFunctionReturn(0); 1041 } 1042 1043 /* --------------------------------------------------------------------------------*/ 1044 /*@C 1045 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1046 (the default parallel PETSc format). This matrix will ultimately be handled by 1047 Kokkos for calculations. For good matrix 1048 assembly performance the user should preallocate the matrix storage by setting 1049 the parameter nz (or the array nnz). By setting these parameters accurately, 1050 performance during matrix assembly can be increased by more than a factor of 50. 1051 1052 Collective 1053 1054 Input Parameters: 1055 + comm - MPI communicator, set to PETSC_COMM_SELF 1056 . m - number of rows 1057 . n - number of columns 1058 . nz - number of nonzeros per row (same for all rows) 1059 - nnz - array containing the number of nonzeros in the various rows 1060 (possibly different for each row) or NULL 1061 1062 Output Parameter: 1063 . A - the matrix 1064 1065 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1066 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1067 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1068 1069 Notes: 1070 If nnz is given then nz is ignored 1071 1072 The AIJ format (also called the Yale sparse matrix format or 1073 compressed row storage), is fully compatible with standard Fortran 77 1074 storage. That is, the stored row and column indices can begin at 1075 either one (as in Fortran) or zero. See the users' manual for details. 1076 1077 Specify the preallocated storage with either nz or nnz (not both). 1078 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1079 allocation. For large problems you MUST preallocate memory or you 1080 will get TERRIBLE performance, see the users' manual chapter on matrices. 1081 1082 By default, this format uses inodes (identical nodes) when possible, to 1083 improve numerical efficiency of matrix-vector products and solves. We 1084 search for consecutive rows with the same nonzero structure, thereby 1085 reusing matrix information to achieve increased efficiency. 1086 1087 Level: intermediate 1088 1089 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1090 @*/ 1091 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1092 { 1093 PetscErrorCode ierr; 1094 1095 PetscFunctionBegin; 1096 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1097 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1098 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1099 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1100 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1101 PetscFunctionReturn(0); 1102 } 1103 1104 typedef Kokkos::TeamPolicy<>::member_type team_member; 1105 // 1106 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 1107 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1108 // 1109 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1110 { 1111 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1112 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1113 IS isrow = b->row,isicol = b->icol; 1114 PetscErrorCode ierr; 1115 const PetscInt *r_h,*ic_h; 1116 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1117 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1118 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1119 PetscBool row_identity,col_identity; 1120 PetscInt nc, Nf, nVec=32; // should be a parameter 1121 PetscContainer container; 1122 1123 PetscFunctionBegin; 1124 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 1125 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1126 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1127 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 1128 if (container) { 1129 PetscInt *pNf=NULL, nv; 1130 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 1131 Nf = (*pNf)%1000; 1132 nv = (*pNf)/1000; 1133 if (nv>0) nVec = nv; 1134 } else Nf = 1; 1135 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 1136 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1137 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1138 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1139 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1140 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1141 #endif 1142 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1143 { 1144 #define KOKKOS_SHARED_LEVEL 1 1145 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1146 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1147 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1148 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1149 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1150 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1151 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1152 size_t flops_h = 0.0; 1153 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1154 Kokkos::View<size_t> d_flops_k ("flops"); 1155 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1156 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1157 Kokkos::deep_copy (d_flops_k, h_flops_k); 1158 Kokkos::deep_copy (d_r_k, h_r_k); 1159 Kokkos::deep_copy (d_ic_k, h_ic_k); 1160 // Fill A --> fact 1161 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1162 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1163 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1164 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1165 // zero rows of B 1166 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1167 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1168 PetscScalar *baL = ba_d + bi_d[rowb]; 1169 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1170 /* zero (unfactored row) */ 1171 for (int j=0;j<nzbL;j++) baL[j] = 0; 1172 for (int j=0;j<nzbU;j++) baU[j] = 0; 1173 }); 1174 // copy A into B 1175 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1176 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1177 const PetscScalar *av = aa_d + ai_d[rowa]; 1178 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1179 /* load in initial (unfactored row) */ 1180 for (int j=0;j<nza;j++) { 1181 PetscInt colb = ic[ajtmp[j]]; 1182 PetscScalar vala = av[j]; 1183 if (colb == rowb) { 1184 *(ba_d + bdiag_d[rowb]) = vala; 1185 } else { 1186 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1187 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1188 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1189 for (int j=0; j<nz ; j++) { 1190 if (pbj[j] == colb) { 1191 pba[j] = vala; 1192 set++; 1193 break; 1194 } 1195 } 1196 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1197 } 1198 } 1199 }); 1200 }); 1201 Kokkos::fence(); 1202 1203 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1204 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1205 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1206 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1207 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1208 const PetscInt start = field*nloc, end = start + nloc; 1209 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1210 // A22 panel update for each row A(1,:) and col A(:,1) 1211 for (int ii=start; ii<end-1; ii++) { 1212 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1213 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1214 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1215 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1216 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1217 PetscInt kIdx = j*Ni + field_block_idx; 1218 if (kIdx >= nzUi) /* void */ ; 1219 else { 1220 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1221 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1222 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1223 size_t st_idx; 1224 // find and do L(k,i) = A(:k,i) / A(i,i) 1225 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1226 // get column, there has got to be a better way 1227 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1228 //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 1229 if (pjL[j] == ii) { 1230 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1231 idx = j; // output 1232 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1233 } 1234 }, st_idx); 1235 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1236 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 1237 else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1238 // U(i+1,:end) 1239 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1240 PetscScalar Uij = baUi[uiIdx]; 1241 PetscInt col = bjUi[uiIdx]; 1242 if (col==myk) { 1243 // A_kk = A_kk - L_ki * U_ij(k) 1244 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1245 *Akkv = *Akkv - L_ki() * Uij; // UiK 1246 } else { 1247 PetscScalar *start, *end, *pAkjv=NULL; 1248 PetscInt high, low; 1249 const PetscInt *startj; 1250 if (col<myk) { // L 1251 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1252 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1253 start = pLki+1; // start at pLki+1, A22(myk,1) 1254 startj= bj_d + bi_d[myk] + idx; 1255 end = ba_d + bi_d[myk+1]; 1256 } else { 1257 PetscInt idx = bdiag_d[myk+1]+1; 1258 start = ba_d + idx; 1259 startj= bj_d + idx; 1260 end = ba_d + bdiag_d[myk]; 1261 } 1262 // search for 'col', use bisection search - TODO 1263 low = 0; 1264 high = (PetscInt)(end-start); 1265 while (high-low > 5) { 1266 int t = (low+high)/2; 1267 if (startj[t] > col) high = t; 1268 else low = t; 1269 } 1270 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1271 if (startj[pAkjv-start] == col) break; 1272 } 1273 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 1274 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1275 } 1276 }); 1277 } 1278 } 1279 }); 1280 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1281 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1282 } /* endof for (i=0; i<n; i++) { */ 1283 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1284 }); 1285 Kokkos::fence(); 1286 Kokkos::deep_copy (h_flops_k, d_flops_k); 1287 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1288 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1289 #elif defined(PETSC_USE_LOG) 1290 ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1291 #endif 1292 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1293 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1294 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1295 /* Invert diagonal for simpler triangular solves */ 1296 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1297 int i = start + outer_index*Ni + lg_rank%Ni; 1298 if (i < end) { 1299 PetscScalar *pv = ba_d + bdiag_d[i]; 1300 *pv = 1.0/(*pv); 1301 } 1302 }); 1303 }); 1304 } 1305 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1306 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1307 #endif 1308 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1309 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1310 1311 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1312 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1313 if (b->inode.size) { 1314 B->ops->solve = MatSolve_SeqAIJ_Inode; 1315 } else if (row_identity && col_identity) { 1316 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1317 } else { 1318 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1319 } 1320 B->offloadmask = PETSC_OFFLOAD_GPU; 1321 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1322 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1323 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1324 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1325 B->ops->matsolve = MatMatSolve_SeqAIJ; 1326 B->assembled = PETSC_TRUE; 1327 B->preallocated = PETSC_TRUE; 1328 1329 PetscFunctionReturn(0); 1330 } 1331 1332 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1333 { 1334 PetscErrorCode ierr; 1335 1336 PetscFunctionBegin; 1337 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1338 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1339 PetscFunctionReturn(0); 1340 } 1341 1342 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1343 { 1344 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1345 1346 PetscFunctionBegin; 1347 if (!factors->sptrsv_symbolic_completed) { 1348 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1349 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1350 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1351 } 1352 PetscFunctionReturn(0); 1353 } 1354 1355 /* Check if we need to update factors etc for transpose solve */ 1356 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1357 { 1358 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1359 MatColIdxType n = A->rmap->n; 1360 1361 PetscFunctionBegin; 1362 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1363 /* Update L^T and do sptrsv symbolic */ 1364 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1365 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1366 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1367 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1368 1369 KokkosKernels::Impl::transpose_matrix< 1370 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1371 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1372 MatRowMapKokkosView,DefaultExecutionSpace>( 1373 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1374 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1375 1376 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1377 We have to sort the indices, until KK provides finer control options. 1378 */ 1379 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1380 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1381 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1382 1383 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1384 1385 /* Update U^T and do sptrsv symbolic */ 1386 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1387 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1388 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1389 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1390 1391 KokkosKernels::Impl::transpose_matrix< 1392 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1393 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1394 MatRowMapKokkosView,DefaultExecutionSpace>( 1395 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1396 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1397 1398 /* Sort indices. See comments above */ 1399 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1400 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1401 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1402 1403 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1404 factors->transpose_updated = PETSC_TRUE; 1405 } 1406 PetscFunctionReturn(0); 1407 } 1408 1409 /* Solve Ax = b, with A = LU */ 1410 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1411 { 1412 PetscErrorCode ierr; 1413 ConstPetscScalarKokkosView bv; 1414 PetscScalarKokkosView xv; 1415 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1416 1417 PetscFunctionBegin; 1418 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1419 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1420 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1421 /* Solve L tmpv = b */ 1422 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1423 /* Solve Ux = tmpv */ 1424 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1425 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1426 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1427 PetscFunctionReturn(0); 1428 } 1429 1430 /* Solve A^T x = b, where A^T = U^T L^T */ 1431 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1432 { 1433 PetscErrorCode ierr; 1434 ConstPetscScalarKokkosView bv; 1435 PetscScalarKokkosView xv; 1436 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1437 1438 PetscFunctionBegin; 1439 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1440 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1441 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1442 /* Solve U^T tmpv = b */ 1443 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1444 1445 /* Solve L^T x = tmpv */ 1446 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1447 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1448 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1449 PetscFunctionReturn(0); 1450 } 1451 1452 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1453 { 1454 PetscErrorCode ierr; 1455 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1456 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1457 PetscInt fill_lev = info->levels; 1458 1459 PetscFunctionBegin; 1460 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1461 1462 auto a_d = aijkok->a_dual.view_device(); 1463 auto i_d = aijkok->i_dual.view_device(); 1464 auto j_d = aijkok->j_dual.view_device(); 1465 1466 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1467 1468 B->assembled = PETSC_TRUE; 1469 B->preallocated = PETSC_TRUE; 1470 B->ops->solve = MatSolve_SeqAIJKokkos; 1471 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1472 B->ops->matsolve = NULL; 1473 B->ops->matsolvetranspose = NULL; 1474 B->offloadmask = PETSC_OFFLOAD_GPU; 1475 1476 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1477 factors->transpose_updated = PETSC_FALSE; 1478 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1479 PetscFunctionReturn(0); 1480 } 1481 1482 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1483 { 1484 PetscErrorCode ierr; 1485 Mat_SeqAIJKokkos *aijkok; 1486 Mat_SeqAIJ *b; 1487 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1488 PetscInt fill_lev = info->levels; 1489 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1490 PetscInt n = A->rmap->n; 1491 1492 PetscFunctionBegin; 1493 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1494 /* Rebuild factors */ 1495 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1496 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1497 1498 /* Create a spiluk handle and then do symbolic factorization */ 1499 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1500 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1501 1502 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1503 1504 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1505 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1506 Kokkos::realloc(factors->iU_d,n+1); 1507 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1508 1509 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1510 auto i_d = aijkok->i_dual.view_device(); 1511 auto j_d = aijkok->j_dual.view_device(); 1512 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1513 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1514 1515 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1516 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1517 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1518 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1519 1520 /* TODO: add options to select sptrsv algorithms */ 1521 /* Create sptrsv handles for L, U and their transpose */ 1522 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1523 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1524 #else 1525 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1526 #endif 1527 1528 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1529 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1530 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1531 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1532 1533 /* Fill fields of the factor matrix B */ 1534 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1535 b = (Mat_SeqAIJ*)B->data; 1536 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1537 B->info.fill_ratio_given = info->fill; 1538 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1539 1540 B->offloadmask = PETSC_OFFLOAD_GPU; 1541 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1542 PetscFunctionReturn(0); 1543 } 1544 1545 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1546 { 1547 PetscErrorCode ierr; 1548 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1549 const PetscInt nrows = A->rmap->n; 1550 1551 PetscFunctionBegin; 1552 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1553 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1554 // move B data into Kokkos 1555 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1556 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1557 { 1558 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1559 if (!baijkok->diag_d) { 1560 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1561 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1562 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1563 } 1564 } 1565 PetscFunctionReturn(0); 1566 } 1567 1568 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1569 { 1570 PetscFunctionBegin; 1571 *type = MATSOLVERKOKKOS; 1572 PetscFunctionReturn(0); 1573 } 1574 1575 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1576 { 1577 PetscFunctionBegin; 1578 *type = MATSOLVERKOKKOSDEVICE; 1579 PetscFunctionReturn(0); 1580 } 1581 1582 /*MC 1583 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1584 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1585 1586 Level: beginner 1587 1588 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1589 M*/ 1590 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1591 { 1592 PetscErrorCode ierr; 1593 PetscInt n = A->rmap->n; 1594 1595 PetscFunctionBegin; 1596 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1597 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1598 (*B)->factortype = ftype; 1599 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1600 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1601 1602 if (ftype == MAT_FACTOR_LU) { 1603 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1604 (*B)->canuseordering = PETSC_TRUE; 1605 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1606 } else if (ftype == MAT_FACTOR_ILU) { 1607 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1608 (*B)->canuseordering = PETSC_FALSE; 1609 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1610 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1611 1612 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1613 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1614 PetscFunctionReturn(0); 1615 } 1616 1617 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1618 { 1619 PetscErrorCode ierr; 1620 PetscInt n = A->rmap->n; 1621 1622 PetscFunctionBegin; 1623 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1624 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1625 (*B)->factortype = ftype; 1626 (*B)->canuseordering = PETSC_TRUE; 1627 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1628 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1629 1630 if (ftype == MAT_FACTOR_LU) { 1631 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1632 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1633 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1634 1635 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1636 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1637 PetscFunctionReturn(0); 1638 } 1639 1640 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1641 { 1642 PetscErrorCode ierr; 1643 1644 PetscFunctionBegin; 1645 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1646 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1647 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1648 PetscFunctionReturn(0); 1649 } 1650 1651 /* Utility to print out a KokkosCsrMatrix for debugging */ 1652 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1653 { 1654 PetscErrorCode ierr; 1655 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1656 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1657 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1658 const PetscInt *i = iv.data(); 1659 const PetscInt *j = jv.data(); 1660 const PetscScalar *a = av.data(); 1661 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1662 1663 PetscFunctionBegin; 1664 ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1665 for (PetscInt k=0; k<m; k++) { 1666 ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr); 1667 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1668 ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1669 } 1670 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1671 } 1672 PetscFunctionReturn(0); 1673 } 1674