1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petscsystypes.h> 5 #include <petscerror.h> 6 7 #include <Kokkos_Core.hpp> 8 #include <KokkosBlas.hpp> 9 #include <KokkosKernels_Sorting.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/aij.h> 18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64 if (aijkok->a_dual.need_sync_device()) { 65 aijkok->a_dual.sync_device(); 66 aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 67 aijkok->hermitian_updated = PETSC_FALSE; 68 } 69 PetscFunctionReturn(0); 70 } 71 72 /* Mark the CSR data on device as modified */ 73 static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 74 { 75 PetscErrorCode ierr; 76 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 77 78 PetscFunctionBegin; 79 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 80 aijkok->a_dual.clear_sync_state(); 81 aijkok->a_dual.modify_device(); 82 aijkok->transpose_updated = PETSC_FALSE; 83 aijkok->hermitian_updated = PETSC_FALSE; 84 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 85 PetscFunctionReturn(0); 86 } 87 88 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 89 { 90 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 91 92 PetscFunctionBegin; 93 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 94 /* We do not expect one needs factors on host */ 95 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 96 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 97 aijkok->a_dual.sync_host(); 98 PetscFunctionReturn(0); 99 } 100 101 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 102 { 103 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 104 105 PetscFunctionBegin; 106 if (aijkok) { 107 aijkok->a_dual.sync_host(); 108 *array = aijkok->a_dual.view_host().data(); 109 } else { /* Happens when calling MatSetValues on a newly created matrix */ 110 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 111 } 112 PetscFunctionReturn(0); 113 } 114 115 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 116 { 117 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 118 119 PetscFunctionBegin; 120 if (aijkok) aijkok->a_dual.modify_host(); 121 PetscFunctionReturn(0); 122 } 123 124 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 127 128 PetscFunctionBegin; 129 aijkok->a_dual.sync_host(); 130 *array = aijkok->a_dual.view_host().data(); 131 PetscFunctionReturn(0); 132 } 133 134 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 135 { 136 PetscFunctionBegin; 137 *array = NULL; 138 PetscFunctionReturn(0); 139 } 140 141 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 142 { 143 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 144 145 PetscFunctionBegin; 146 *array = aijkok->a_dual.view_host().data(); 147 PetscFunctionReturn(0); 148 } 149 150 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 151 { 152 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 153 154 PetscFunctionBegin; 155 aijkok->a_dual.clear_sync_state(); 156 aijkok->a_dual.modify_host(); 157 PetscFunctionReturn(0); 158 } 159 160 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 161 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 162 { 163 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 165 166 PetscFunctionBegin; 167 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 168 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 169 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 170 PetscFunctionReturn(0); 171 } 172 173 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 174 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 175 { 176 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 177 178 PetscFunctionBegin; 179 if (aijkok && aijkok->device_mat_d.data()) { 180 *d_mat = aijkok->device_mat_d.data(); 181 } else { 182 PetscErrorCode ierr; 183 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 184 *d_mat = NULL; 185 } 186 PetscFunctionReturn(0); 187 } 188 189 /* Generate the transpose on device and cache it internally */ 190 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 191 { 192 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 193 194 PetscFunctionBegin; 195 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 196 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 197 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 198 CHKERRCXX(aijkok->a_dual.sync_device()); 199 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 200 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 201 aijkok->transpose_updated = PETSC_TRUE; 202 } 203 *csrmatT = &aijkok->csrmatT; 204 PetscFunctionReturn(0); 205 } 206 207 /* Generate the Hermitian on device and cache it internally */ 208 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 209 { 210 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 211 212 PetscFunctionBegin; 213 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 214 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 215 CHKERRCXX(aijkok->a_dual.sync_device()); 216 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 217 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 218 #if defined(PETSC_USE_COMPLEX) 219 const auto& a = aijkok->csrmatH.values; 220 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 221 #endif 222 aijkok->hermitian_updated = PETSC_TRUE; 223 } 224 *csrmatH = &aijkok->csrmatH; 225 PetscFunctionReturn(0); 226 } 227 228 /* y = A x */ 229 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 230 { 231 PetscErrorCode ierr; 232 Mat_SeqAIJKokkos *aijkok; 233 ConstPetscScalarKokkosView xv; 234 PetscScalarKokkosView yv; 235 236 PetscFunctionBegin; 237 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 238 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 239 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 240 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 241 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 242 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 243 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 244 ierr = WaitForKokkos();CHKERRQ(ierr); 245 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 246 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 247 PetscFunctionReturn(0); 248 } 249 250 /* y = A^T x */ 251 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 252 { 253 PetscErrorCode ierr; 254 Mat_SeqAIJKokkos *aijkok; 255 const char *mode; 256 ConstPetscScalarKokkosView xv; 257 PetscScalarKokkosView yv; 258 KokkosCsrMatrix *csrmat; 259 260 PetscFunctionBegin; 261 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 262 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 263 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 264 if (A->form_explicit_transpose) { 265 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 266 mode = "N"; 267 } else { 268 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 269 csrmat = &aijkok->csrmat; 270 mode = "T"; 271 } 272 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 273 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 274 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 275 ierr = WaitForKokkos();CHKERRQ(ierr); 276 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 277 PetscFunctionReturn(0); 278 } 279 280 /* y = A^H x */ 281 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 282 { 283 PetscErrorCode ierr; 284 Mat_SeqAIJKokkos *aijkok; 285 const char *mode; 286 ConstPetscScalarKokkosView xv; 287 PetscScalarKokkosView yv; 288 KokkosCsrMatrix *csrmat; 289 290 PetscFunctionBegin; 291 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 292 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 293 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 294 if (A->form_explicit_transpose) { 295 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 296 mode = "N"; 297 } else { 298 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 299 csrmat = &aijkok->csrmat; 300 mode = "C"; 301 } 302 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 303 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 304 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 305 ierr = WaitForKokkos();CHKERRQ(ierr); 306 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 307 PetscFunctionReturn(0); 308 } 309 310 /* z = A x + y */ 311 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 312 { 313 PetscErrorCode ierr; 314 Mat_SeqAIJKokkos *aijkok; 315 ConstPetscScalarKokkosView xv,yv; 316 PetscScalarKokkosView zv; 317 318 PetscFunctionBegin; 319 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 320 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 321 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 322 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 323 if (zz != yy) Kokkos::deep_copy(zv,yv); 324 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 325 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 326 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 327 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 328 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 329 ierr = WaitForKokkos();CHKERRQ(ierr); 330 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 331 PetscFunctionReturn(0); 332 } 333 334 /* z = A^T x + y */ 335 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 336 { 337 PetscErrorCode ierr; 338 Mat_SeqAIJKokkos *aijkok; 339 const char *mode; 340 ConstPetscScalarKokkosView xv,yv; 341 PetscScalarKokkosView zv; 342 KokkosCsrMatrix *csrmat; 343 344 PetscFunctionBegin; 345 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 346 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 347 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 348 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 349 if (zz != yy) Kokkos::deep_copy(zv,yv); 350 if (A->form_explicit_transpose) { 351 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 352 mode = "N"; 353 } else { 354 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 355 csrmat = &aijkok->csrmat; 356 mode = "T"; 357 } 358 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 359 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 360 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 361 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 362 ierr = WaitForKokkos();CHKERRQ(ierr); 363 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 364 PetscFunctionReturn(0); 365 } 366 367 /* z = A^H x + y */ 368 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 369 { 370 PetscErrorCode ierr; 371 Mat_SeqAIJKokkos *aijkok; 372 const char *mode; 373 ConstPetscScalarKokkosView xv,yv; 374 PetscScalarKokkosView zv; 375 KokkosCsrMatrix *csrmat; 376 377 PetscFunctionBegin; 378 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 379 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 380 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 381 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 382 if (zz != yy) Kokkos::deep_copy(zv,yv); 383 if (A->form_explicit_transpose) { 384 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 385 mode = "N"; 386 } else { 387 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 388 csrmat = &aijkok->csrmat; 389 mode = "C"; 390 } 391 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 392 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 393 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 394 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 395 ierr = WaitForKokkos();CHKERRQ(ierr); 396 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 397 PetscFunctionReturn(0); 398 } 399 400 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 401 { 402 PetscErrorCode ierr; 403 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 404 405 PetscFunctionBegin; 406 switch (op) { 407 case MAT_FORM_EXPLICIT_TRANSPOSE: 408 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 409 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 410 A->form_explicit_transpose = flg; 411 break; 412 default: 413 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 414 break; 415 } 416 PetscFunctionReturn(0); 417 } 418 419 /* Depending on reuse, either build a new mat, or use the existing mat */ 420 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 421 { 422 PetscErrorCode ierr; 423 Mat_SeqAIJ *aseq; 424 425 PetscFunctionBegin; 426 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 427 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 428 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 429 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 430 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 431 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 432 if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 433 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 434 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 435 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 436 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 437 aseq = static_cast<Mat_SeqAIJ*>(A->data); 438 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 439 if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 440 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 441 } 442 } 443 PetscFunctionReturn(0); 444 } 445 446 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 447 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 448 */ 449 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 450 { 451 PetscErrorCode ierr; 452 Mat_SeqAIJ *bseq; 453 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 454 Mat mat; 455 456 PetscFunctionBegin; 457 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 458 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 459 mat = *B; 460 if (A->assembled) { 461 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 462 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 463 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 464 /* Now copy values to B if needed */ 465 if (dupOption == MAT_COPY_VALUES) { 466 if (akok->a_dual.need_sync_device()) { 467 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 468 bkok->a_dual.modify_host(); 469 } else { /* If device has the latest data, we only copy data on device */ 470 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 471 bkok->a_dual.modify_device(); 472 } 473 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 474 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 475 bkok->a_dual.modify_host(); 476 } 477 mat->spptr = bkok; 478 } 479 480 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 481 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 482 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 483 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 484 PetscFunctionReturn(0); 485 } 486 487 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 488 { 489 PetscErrorCode ierr; 490 Mat_SeqAIJKokkos *aijkok; 491 492 PetscFunctionBegin; 493 if (A->factortype == MAT_FACTOR_NONE) { 494 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 495 if (aijkok) { 496 if (aijkok->device_mat_d.data()) { 497 delete aijkok->colmap_d; 498 delete aijkok->i_uncompressed_d; 499 } 500 if (aijkok->diag_d) delete aijkok->diag_d; 501 } 502 delete aijkok; 503 } else { 504 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 505 } 506 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 507 A->spptr = NULL; 508 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 509 PetscFunctionReturn(0); 510 } 511 512 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 513 { 514 PetscErrorCode ierr; 515 516 PetscFunctionBegin; 517 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 518 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 519 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 520 PetscFunctionReturn(0); 521 } 522 523 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 524 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 525 { 526 PetscErrorCode ierr; 527 Mat_SeqAIJ *a,*b; 528 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 529 MatScalarKokkosView aa,ba,ca; 530 MatRowMapKokkosView ai,bi,ci; 531 MatColIdxKokkosView aj,bj,cj; 532 PetscInt m,n,nnz,aN; 533 534 PetscFunctionBegin; 535 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 536 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 537 PetscValidPointer(C,4); 538 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 539 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 540 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 541 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 542 543 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 544 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 545 a = static_cast<Mat_SeqAIJ*>(A->data); 546 b = static_cast<Mat_SeqAIJ*>(B->data); 547 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 548 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 549 aa = akok->a_dual.view_device(); 550 ai = akok->i_dual.view_device(); 551 ba = bkok->a_dual.view_device(); 552 bi = bkok->i_dual.view_device(); 553 m = A->rmap->n; /* M, N and nnz of C */ 554 n = A->cmap->n + B->cmap->n; 555 nnz = a->nz + b->nz; 556 aN = A->cmap->n; /* N of A */ 557 if (reuse == MAT_INITIAL_MATRIX) { 558 aj = akok->j_dual.view_device(); 559 bj = bkok->j_dual.view_device(); 560 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 561 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 562 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 563 ca = ca_dual.view_device(); 564 ci = ci_dual.view_device(); 565 cj = cj_dual.view_device(); 566 567 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 568 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 569 PetscInt i = t.league_rank(); /* row i */ 570 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 571 572 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 573 ci(i) = coffset; 574 if (i == m-1) ci(m) = ai(m) + bi(m); 575 }); 576 577 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 578 if (k < alen) { 579 ca(coffset+k) = aa(ai(i)+k); 580 cj(coffset+k) = aj(ai(i)+k); 581 } else { 582 ca(coffset+k) = ba(bi(i)+k-alen); 583 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 584 } 585 }); 586 }); 587 ca_dual.modify_device(); 588 ci_dual.modify_device(); 589 cj_dual.modify_device(); 590 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 591 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 592 } else if (reuse == MAT_REUSE_MATRIX) { 593 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 594 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 595 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 596 ca = ckok->a_dual.view_device(); 597 ci = ckok->i_dual.view_device(); 598 599 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 600 PetscInt i = t.league_rank(); /* row i */ 601 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 602 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 603 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 604 else ca(ci(i)+k) = ba(bi(i)+k-alen); 605 }); 606 }); 607 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 608 } 609 PetscFunctionReturn(0); 610 } 611 612 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 613 { 614 PetscFunctionBegin; 615 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 616 PetscFunctionReturn(0); 617 } 618 619 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 620 { 621 PetscErrorCode ierr; 622 Mat_Product *product = C->product; 623 Mat A,B; 624 bool transA,transB; /* use bool, since KK needs this type */ 625 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 626 Mat_SeqAIJ *c; 627 MatProductData_SeqAIJKokkos *pdata; 628 KokkosCsrMatrix *csrmatA,*csrmatB; 629 630 PetscFunctionBegin; 631 MatCheckProduct(C,1); 632 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 633 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 634 635 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 636 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 637 PetscFunctionReturn(0); 638 } 639 640 switch (product->type) { 641 case MATPRODUCT_AB: transA = false; transB = false; break; 642 case MATPRODUCT_AtB: transA = true; transB = false; break; 643 case MATPRODUCT_ABt: transA = false; transB = true; break; 644 default: 645 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 646 } 647 648 A = product->A; 649 B = product->B; 650 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 651 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 652 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 653 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 654 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 655 656 if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 657 658 csrmatA = &akok->csrmat; 659 csrmatB = &bkok->csrmat; 660 661 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 662 if (transA) { 663 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 664 transA = false; 665 } 666 667 if (transB) { 668 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 669 transB = false; 670 } 671 672 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 673 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 674 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 675 /* shorter version of MatAssemblyEnd_SeqAIJ */ 676 c = (Mat_SeqAIJ*)C->data; 677 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 678 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 679 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 680 c->reallocs = 0; 681 C->info.mallocs = 0; 682 C->info.nz_unneeded = 0; 683 C->assembled = C->was_assembled = PETSC_TRUE; 684 C->num_ass++; 685 PetscFunctionReturn(0); 686 } 687 688 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 689 { 690 PetscErrorCode ierr; 691 Mat_Product *product = C->product; 692 MatProductType ptype; 693 Mat A,B; 694 bool transA,transB; 695 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 696 MatProductData_SeqAIJKokkos *pdata; 697 MPI_Comm comm; 698 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 699 700 PetscFunctionBegin; 701 MatCheckProduct(C,1); 702 ierr = PetscObjectGetComm((PetscObject)C,&comm); 703 if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 704 A = product->A; 705 B = product->B; 706 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 707 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 708 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 709 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 710 csrmatA = &akok->csrmat; 711 csrmatB = &bkok->csrmat; 712 713 ptype = product->type; 714 switch (ptype) { 715 case MATPRODUCT_AB: transA = false; transB = false; break; 716 case MATPRODUCT_AtB: transA = true; transB = false; break; 717 case MATPRODUCT_ABt: transA = false; transB = true; break; 718 default: 719 SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 720 } 721 722 product->data = pdata = new MatProductData_SeqAIJKokkos(); 723 pdata->kh.set_team_work_size(16); 724 pdata->kh.set_dynamic_scheduling(true); 725 pdata->reusesym = product->api_user; 726 727 /* TODO: add command line options to select spgemm algorithms */ 728 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 729 #if defined(PETSC_HAVE_CUDA) 730 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 731 /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */ 732 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE; 733 #endif 734 #endif 735 pdata->kh.create_spgemm_handle(spgemm_alg); 736 737 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 738 if (transA) { 739 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 740 transA = false; 741 } 742 743 if (transB) { 744 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 745 transB = false; 746 } 747 748 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 749 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 750 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 751 calling new Mat_SeqAIJKokkos(). 752 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 753 */ 754 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 755 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 756 757 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 758 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 759 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 760 PetscFunctionReturn(0); 761 } 762 763 /* handles sparse matrix matrix ops */ 764 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 765 { 766 PetscErrorCode ierr; 767 Mat_Product *product = mat->product; 768 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 769 770 PetscFunctionBegin; 771 MatCheckProduct(mat,1); 772 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 773 if (product->type == MATPRODUCT_ABC) { 774 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 775 } 776 if (Biskok && Ciskok) { 777 switch (product->type) { 778 case MATPRODUCT_AB: 779 case MATPRODUCT_AtB: 780 case MATPRODUCT_ABt: 781 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 782 break; 783 case MATPRODUCT_PtAP: 784 case MATPRODUCT_RARt: 785 case MATPRODUCT_ABC: 786 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 787 break; 788 default: 789 break; 790 } 791 } else { /* fallback for AIJ */ 792 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 793 } 794 PetscFunctionReturn(0); 795 } 796 797 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 798 { 799 PetscErrorCode ierr; 800 Mat_SeqAIJKokkos *aijkok; 801 802 PetscFunctionBegin; 803 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 804 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 805 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 806 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 807 ierr = WaitForKokkos();CHKERRQ(ierr); 808 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 809 PetscFunctionReturn(0); 810 } 811 812 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 813 { 814 PetscErrorCode ierr; 815 Mat_SeqAIJKokkos *aijkok; 816 817 PetscFunctionBegin; 818 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 819 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 820 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 821 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */ 822 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 823 PetscFunctionReturn(0); 824 } 825 826 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 827 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 828 { 829 PetscErrorCode ierr; 830 Mat_SeqAIJKokkos *aijkok; 831 832 PetscFunctionBegin; 833 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 834 PetscValidPointer(kv,2); 835 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 836 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 837 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 838 *kv = aijkok->a_dual.view_device(); 839 PetscFunctionReturn(0); 840 } 841 842 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 843 { 844 PetscFunctionBegin; 845 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 846 PetscValidPointer(kv,2); 847 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 848 PetscFunctionReturn(0); 849 } 850 851 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 852 { 853 PetscErrorCode ierr; 854 Mat_SeqAIJKokkos *aijkok; 855 856 PetscFunctionBegin; 857 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 858 PetscValidPointer(kv,2); 859 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 860 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 861 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 862 *kv = aijkok->a_dual.view_device(); 863 PetscFunctionReturn(0); 864 } 865 866 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 867 { 868 PetscErrorCode ierr; 869 870 PetscFunctionBegin; 871 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 872 PetscValidPointer(kv,2); 873 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 874 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 875 PetscFunctionReturn(0); 876 } 877 878 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 879 { 880 Mat_SeqAIJKokkos *aijkok; 881 882 PetscFunctionBegin; 883 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 884 PetscValidPointer(kv,2); 885 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 886 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 887 *kv = aijkok->a_dual.view_device(); 888 PetscFunctionReturn(0); 889 } 890 891 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 892 { 893 PetscErrorCode ierr; 894 895 PetscFunctionBegin; 896 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 897 PetscValidPointer(kv,2); 898 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 899 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 900 PetscFunctionReturn(0); 901 } 902 903 /* Computes Y += alpha X */ 904 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 905 { 906 PetscErrorCode ierr; 907 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 908 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 909 ConstMatScalarKokkosView Xa; 910 MatScalarKokkosView Ya; 911 912 PetscFunctionBegin; 913 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 914 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 915 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 916 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 917 918 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 919 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 920 PetscBool e; 921 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 922 if (e) { 923 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 924 if (e) pattern = SAME_NONZERO_PATTERN; 925 } 926 } 927 928 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 929 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 930 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 931 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 932 */ 933 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 934 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 935 Xa = xkok->a_dual.view_device(); 936 Ya = ykok->a_dual.view_device(); 937 938 if (pattern == SAME_NONZERO_PATTERN) { 939 KokkosBlas::axpy(alpha,Xa,Ya); 940 ierr = MatSeqAIJKokkosModifyDevice(Y); 941 } else if (pattern == SUBSET_NONZERO_PATTERN) { 942 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 943 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 944 945 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 946 PetscInt i = t.league_rank(); /* row i */ 947 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 948 PetscInt p,q = Yi(i); 949 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 950 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 951 if (Xj(p) == Yj(q)) { /* Found it */ 952 Ya(q) += alpha * Xa(p); 953 q++; 954 } else { 955 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 956 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 957 */ 958 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 959 } 960 } 961 }); 962 }); 963 ierr = MatSeqAIJKokkosModifyDevice(Y); 964 } else { /* different nonzero patterns */ 965 Mat Z; 966 KokkosCsrMatrix zcsr; 967 KernelHandle kh; 968 kh.create_spadd_handle(false); 969 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 970 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 971 zkok = new Mat_SeqAIJKokkos(zcsr); 972 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 973 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 974 kh.destroy_spadd_handle(); 975 } 976 977 PetscFunctionReturn(0); 978 } 979 980 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 981 { 982 PetscErrorCode ierr; 983 984 PetscFunctionBegin; 985 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 986 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 987 B->offloadmask = PETSC_OFFLOAD_CPU; 988 PetscFunctionReturn(0); 989 } 990 991 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 992 { 993 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 994 995 PetscFunctionBegin; 996 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 997 A->boundtocpu = PETSC_FALSE; 998 999 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1000 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1001 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1002 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1003 A->ops->scale = MatScale_SeqAIJKokkos; 1004 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1005 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1006 A->ops->mult = MatMult_SeqAIJKokkos; 1007 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1008 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1009 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1010 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1011 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1012 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1013 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1014 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1015 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1016 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1017 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1018 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1019 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1020 PetscFunctionReturn(0); 1021 } 1022 1023 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1024 { 1025 PetscErrorCode ierr; 1026 Mat_SeqAIJ *aseq; 1027 PetscInt i,m,n; 1028 1029 PetscFunctionBegin; 1030 if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1031 1032 m = akok->nrows(); 1033 n = akok->ncols(); 1034 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1035 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1036 1037 /* Set up data structures of A as a MATSEQAIJ */ 1038 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1039 aseq = (Mat_SeqAIJ*)(A)->data; 1040 1041 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1042 akok->j_dual.sync_host(); 1043 1044 aseq->i = akok->i_host_data(); 1045 aseq->j = akok->j_host_data(); 1046 aseq->a = akok->a_host_data(); 1047 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1048 aseq->singlemalloc = PETSC_FALSE; 1049 aseq->free_a = PETSC_FALSE; 1050 aseq->free_ij = PETSC_FALSE; 1051 aseq->nz = akok->nnz(); 1052 aseq->maxnz = aseq->nz; 1053 1054 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1055 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1056 for (i=0; i<m; i++) { 1057 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1058 } 1059 1060 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1061 akok->nonzerostate = A->nonzerostate; 1062 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1063 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1064 A->spptr = akok; 1065 PetscFunctionReturn(0); 1066 } 1067 1068 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1069 1070 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1071 */ 1072 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1073 { 1074 PetscErrorCode ierr; 1075 1076 PetscFunctionBegin; 1077 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1078 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1079 PetscFunctionReturn(0); 1080 } 1081 1082 /* --------------------------------------------------------------------------------*/ 1083 /*@C 1084 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1085 (the default parallel PETSc format). This matrix will ultimately be handled by 1086 Kokkos for calculations. For good matrix 1087 assembly performance the user should preallocate the matrix storage by setting 1088 the parameter nz (or the array nnz). By setting these parameters accurately, 1089 performance during matrix assembly can be increased by more than a factor of 50. 1090 1091 Collective 1092 1093 Input Parameters: 1094 + comm - MPI communicator, set to PETSC_COMM_SELF 1095 . m - number of rows 1096 . n - number of columns 1097 . nz - number of nonzeros per row (same for all rows) 1098 - nnz - array containing the number of nonzeros in the various rows 1099 (possibly different for each row) or NULL 1100 1101 Output Parameter: 1102 . A - the matrix 1103 1104 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1105 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1106 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1107 1108 Notes: 1109 If nnz is given then nz is ignored 1110 1111 The AIJ format (also called the Yale sparse matrix format or 1112 compressed row storage), is fully compatible with standard Fortran 77 1113 storage. That is, the stored row and column indices can begin at 1114 either one (as in Fortran) or zero. See the users' manual for details. 1115 1116 Specify the preallocated storage with either nz or nnz (not both). 1117 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1118 allocation. For large problems you MUST preallocate memory or you 1119 will get TERRIBLE performance, see the users' manual chapter on matrices. 1120 1121 By default, this format uses inodes (identical nodes) when possible, to 1122 improve numerical efficiency of matrix-vector products and solves. We 1123 search for consecutive rows with the same nonzero structure, thereby 1124 reusing matrix information to achieve increased efficiency. 1125 1126 Level: intermediate 1127 1128 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1129 @*/ 1130 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1131 { 1132 PetscErrorCode ierr; 1133 1134 PetscFunctionBegin; 1135 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1136 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1137 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1138 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1139 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1140 PetscFunctionReturn(0); 1141 } 1142 1143 typedef Kokkos::TeamPolicy<>::member_type team_member; 1144 // 1145 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 1146 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1147 // 1148 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1149 { 1150 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1151 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1152 IS isrow = b->row,isicol = b->icol; 1153 PetscErrorCode ierr; 1154 const PetscInt *r_h,*ic_h; 1155 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1156 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1157 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1158 PetscBool row_identity,col_identity; 1159 PetscInt nc, Nf, nVec=32; // should be a parameter 1160 PetscContainer container; 1161 1162 PetscFunctionBegin; 1163 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 1164 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1165 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1166 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 1167 if (container) { 1168 PetscInt *pNf=NULL, nv; 1169 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 1170 Nf = (*pNf)%1000; 1171 nv = (*pNf)/1000; 1172 if (nv>0) nVec = nv; 1173 } else Nf = 1; 1174 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 1175 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1176 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1177 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1178 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1179 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1180 #endif 1181 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1182 { 1183 #define KOKKOS_SHARED_LEVEL 1 1184 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1185 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1186 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1187 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1188 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1189 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1190 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1191 size_t flops_h = 0.0; 1192 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1193 Kokkos::View<size_t> d_flops_k ("flops"); 1194 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1195 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1196 Kokkos::deep_copy (d_flops_k, h_flops_k); 1197 Kokkos::deep_copy (d_r_k, h_r_k); 1198 Kokkos::deep_copy (d_ic_k, h_ic_k); 1199 // Fill A --> fact 1200 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1201 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1202 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1203 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1204 // zero rows of B 1205 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1206 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1207 PetscScalar *baL = ba_d + bi_d[rowb]; 1208 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1209 /* zero (unfactored row) */ 1210 for (int j=0;j<nzbL;j++) baL[j] = 0; 1211 for (int j=0;j<nzbU;j++) baU[j] = 0; 1212 }); 1213 // copy A into B 1214 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1215 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1216 const PetscScalar *av = aa_d + ai_d[rowa]; 1217 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1218 /* load in initial (unfactored row) */ 1219 for (int j=0;j<nza;j++) { 1220 PetscInt colb = ic[ajtmp[j]]; 1221 PetscScalar vala = av[j]; 1222 if (colb == rowb) { 1223 *(ba_d + bdiag_d[rowb]) = vala; 1224 } else { 1225 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1226 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1227 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1228 for (int j=0; j<nz ; j++) { 1229 if (pbj[j] == colb) { 1230 pba[j] = vala; 1231 set++; 1232 break; 1233 } 1234 } 1235 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1236 } 1237 } 1238 }); 1239 }); 1240 Kokkos::fence(); 1241 1242 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1243 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1244 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1245 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1246 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1247 const PetscInt start = field*nloc, end = start + nloc; 1248 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1249 // A22 panel update for each row A(1,:) and col A(:,1) 1250 for (int ii=start; ii<end-1; ii++) { 1251 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1252 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1253 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1254 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1255 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1256 PetscInt kIdx = j*Ni + field_block_idx; 1257 if (kIdx >= nzUi) /* void */ ; 1258 else { 1259 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1260 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1261 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1262 size_t st_idx; 1263 // find and do L(k,i) = A(:k,i) / A(i,i) 1264 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1265 // get column, there has got to be a better way 1266 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1267 //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 1268 if (pjL[j] == ii) { 1269 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1270 idx = j; // output 1271 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1272 } 1273 }, st_idx); 1274 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1275 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 1276 else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1277 // U(i+1,:end) 1278 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1279 PetscScalar Uij = baUi[uiIdx]; 1280 PetscInt col = bjUi[uiIdx]; 1281 if (col==myk) { 1282 // A_kk = A_kk - L_ki * U_ij(k) 1283 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1284 *Akkv = *Akkv - L_ki() * Uij; // UiK 1285 } else { 1286 PetscScalar *start, *end, *pAkjv=NULL; 1287 PetscInt high, low; 1288 const PetscInt *startj; 1289 if (col<myk) { // L 1290 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1291 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1292 start = pLki+1; // start at pLki+1, A22(myk,1) 1293 startj= bj_d + bi_d[myk] + idx; 1294 end = ba_d + bi_d[myk+1]; 1295 } else { 1296 PetscInt idx = bdiag_d[myk+1]+1; 1297 start = ba_d + idx; 1298 startj= bj_d + idx; 1299 end = ba_d + bdiag_d[myk]; 1300 } 1301 // search for 'col', use bisection search - TODO 1302 low = 0; 1303 high = (PetscInt)(end-start); 1304 while (high-low > 5) { 1305 int t = (low+high)/2; 1306 if (startj[t] > col) high = t; 1307 else low = t; 1308 } 1309 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1310 if (startj[pAkjv-start] == col) break; 1311 } 1312 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 1313 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1314 } 1315 }); 1316 } 1317 } 1318 }); 1319 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1320 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1321 } /* endof for (i=0; i<n; i++) { */ 1322 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1323 }); 1324 Kokkos::fence(); 1325 Kokkos::deep_copy (h_flops_k, d_flops_k); 1326 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1327 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1328 #elif defined(PETSC_USE_LOG) 1329 ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1330 #endif 1331 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1332 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1333 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1334 /* Invert diagonal for simpler triangular solves */ 1335 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1336 int i = start + outer_index*Ni + lg_rank%Ni; 1337 if (i < end) { 1338 PetscScalar *pv = ba_d + bdiag_d[i]; 1339 *pv = 1.0/(*pv); 1340 } 1341 }); 1342 }); 1343 } 1344 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1345 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1346 #endif 1347 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1348 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1349 1350 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1351 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1352 if (b->inode.size) { 1353 B->ops->solve = MatSolve_SeqAIJ_Inode; 1354 } else if (row_identity && col_identity) { 1355 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1356 } else { 1357 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1358 } 1359 B->offloadmask = PETSC_OFFLOAD_GPU; 1360 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1361 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1362 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1363 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1364 B->ops->matsolve = MatMatSolve_SeqAIJ; 1365 B->assembled = PETSC_TRUE; 1366 B->preallocated = PETSC_TRUE; 1367 1368 PetscFunctionReturn(0); 1369 } 1370 1371 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1372 { 1373 PetscErrorCode ierr; 1374 1375 PetscFunctionBegin; 1376 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1377 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1378 PetscFunctionReturn(0); 1379 } 1380 1381 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1382 { 1383 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1384 1385 PetscFunctionBegin; 1386 if (!factors->sptrsv_symbolic_completed) { 1387 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1388 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1389 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1390 } 1391 PetscFunctionReturn(0); 1392 } 1393 1394 /* Check if we need to update factors etc for transpose solve */ 1395 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1396 { 1397 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1398 MatColIdxType n = A->rmap->n; 1399 1400 PetscFunctionBegin; 1401 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1402 /* Update L^T and do sptrsv symbolic */ 1403 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1404 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1405 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1406 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1407 1408 KokkosKernels::Impl::transpose_matrix< 1409 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1410 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1411 MatRowMapKokkosView,DefaultExecutionSpace>( 1412 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1413 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1414 1415 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1416 We have to sort the indices, until KK provides finer control options. 1417 */ 1418 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1419 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1420 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1421 1422 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1423 1424 /* Update U^T and do sptrsv symbolic */ 1425 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1426 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1427 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1428 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1429 1430 KokkosKernels::Impl::transpose_matrix< 1431 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1432 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1433 MatRowMapKokkosView,DefaultExecutionSpace>( 1434 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1435 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1436 1437 /* Sort indices. See comments above */ 1438 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1439 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1440 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1441 1442 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1443 factors->transpose_updated = PETSC_TRUE; 1444 } 1445 PetscFunctionReturn(0); 1446 } 1447 1448 /* Solve Ax = b, with A = LU */ 1449 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1450 { 1451 PetscErrorCode ierr; 1452 ConstPetscScalarKokkosView bv; 1453 PetscScalarKokkosView xv; 1454 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1455 1456 PetscFunctionBegin; 1457 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1458 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1459 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1460 /* Solve L tmpv = b */ 1461 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1462 /* Solve Ux = tmpv */ 1463 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1464 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1465 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1466 PetscFunctionReturn(0); 1467 } 1468 1469 /* Solve A^T x = b, where A^T = U^T L^T */ 1470 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1471 { 1472 PetscErrorCode ierr; 1473 ConstPetscScalarKokkosView bv; 1474 PetscScalarKokkosView xv; 1475 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1476 1477 PetscFunctionBegin; 1478 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1479 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1480 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1481 /* Solve U^T tmpv = b */ 1482 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1483 1484 /* Solve L^T x = tmpv */ 1485 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1486 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1487 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1488 PetscFunctionReturn(0); 1489 } 1490 1491 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1492 { 1493 PetscErrorCode ierr; 1494 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1495 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1496 PetscInt fill_lev = info->levels; 1497 1498 PetscFunctionBegin; 1499 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1500 1501 auto a_d = aijkok->a_dual.view_device(); 1502 auto i_d = aijkok->i_dual.view_device(); 1503 auto j_d = aijkok->j_dual.view_device(); 1504 1505 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1506 1507 B->assembled = PETSC_TRUE; 1508 B->preallocated = PETSC_TRUE; 1509 B->ops->solve = MatSolve_SeqAIJKokkos; 1510 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1511 B->ops->matsolve = NULL; 1512 B->ops->matsolvetranspose = NULL; 1513 B->offloadmask = PETSC_OFFLOAD_GPU; 1514 1515 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1516 factors->transpose_updated = PETSC_FALSE; 1517 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1518 PetscFunctionReturn(0); 1519 } 1520 1521 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1522 { 1523 PetscErrorCode ierr; 1524 Mat_SeqAIJKokkos *aijkok; 1525 Mat_SeqAIJ *b; 1526 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1527 PetscInt fill_lev = info->levels; 1528 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1529 PetscInt n = A->rmap->n; 1530 1531 PetscFunctionBegin; 1532 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1533 /* Rebuild factors */ 1534 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1535 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1536 1537 /* Create a spiluk handle and then do symbolic factorization */ 1538 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1539 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1540 1541 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1542 1543 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1544 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1545 Kokkos::realloc(factors->iU_d,n+1); 1546 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1547 1548 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1549 auto i_d = aijkok->i_dual.view_device(); 1550 auto j_d = aijkok->j_dual.view_device(); 1551 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1552 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1553 1554 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1555 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1556 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1557 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1558 1559 /* TODO: add options to select sptrsv algorithms */ 1560 /* Create sptrsv handles for L, U and their transpose */ 1561 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1562 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1563 #else 1564 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1565 #endif 1566 1567 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1568 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1569 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1570 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1571 1572 /* Fill fields of the factor matrix B */ 1573 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1574 b = (Mat_SeqAIJ*)B->data; 1575 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1576 B->info.fill_ratio_given = info->fill; 1577 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1578 1579 B->offloadmask = PETSC_OFFLOAD_GPU; 1580 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1581 PetscFunctionReturn(0); 1582 } 1583 1584 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1585 { 1586 PetscErrorCode ierr; 1587 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1588 const PetscInt nrows = A->rmap->n; 1589 1590 PetscFunctionBegin; 1591 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1592 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1593 // move B data into Kokkos 1594 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1595 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1596 { 1597 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1598 if (!baijkok->diag_d) { 1599 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1600 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1601 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1602 } 1603 } 1604 PetscFunctionReturn(0); 1605 } 1606 1607 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1608 { 1609 PetscFunctionBegin; 1610 *type = MATSOLVERKOKKOS; 1611 PetscFunctionReturn(0); 1612 } 1613 1614 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1615 { 1616 PetscFunctionBegin; 1617 *type = MATSOLVERKOKKOSDEVICE; 1618 PetscFunctionReturn(0); 1619 } 1620 1621 /*MC 1622 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1623 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1624 1625 Level: beginner 1626 1627 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1628 M*/ 1629 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1630 { 1631 PetscErrorCode ierr; 1632 PetscInt n = A->rmap->n; 1633 1634 PetscFunctionBegin; 1635 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1636 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1637 (*B)->factortype = ftype; 1638 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1639 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1640 1641 if (ftype == MAT_FACTOR_LU) { 1642 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1643 (*B)->canuseordering = PETSC_TRUE; 1644 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1645 } else if (ftype == MAT_FACTOR_ILU) { 1646 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1647 (*B)->canuseordering = PETSC_FALSE; 1648 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1649 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1650 1651 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1652 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1653 PetscFunctionReturn(0); 1654 } 1655 1656 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1657 { 1658 PetscErrorCode ierr; 1659 PetscInt n = A->rmap->n; 1660 1661 PetscFunctionBegin; 1662 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1663 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1664 (*B)->factortype = ftype; 1665 (*B)->canuseordering = PETSC_TRUE; 1666 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1667 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1668 1669 if (ftype == MAT_FACTOR_LU) { 1670 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1671 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1672 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1673 1674 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1675 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1676 PetscFunctionReturn(0); 1677 } 1678 1679 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1680 { 1681 PetscErrorCode ierr; 1682 1683 PetscFunctionBegin; 1684 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1685 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1686 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1687 PetscFunctionReturn(0); 1688 } 1689 1690 /* Utility to print out a KokkosCsrMatrix for debugging */ 1691 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1692 { 1693 PetscErrorCode ierr; 1694 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1695 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1696 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1697 const PetscInt *i = iv.data(); 1698 const PetscInt *j = jv.data(); 1699 const PetscScalar *a = av.data(); 1700 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1701 1702 PetscFunctionBegin; 1703 ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1704 for (PetscInt k=0; k<m; k++) { 1705 ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr); 1706 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1707 ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1708 } 1709 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1710 } 1711 PetscFunctionReturn(0); 1712 } 1713