1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petscsystypes.h> 5 #include <petscerror.h> 6 7 #include <Kokkos_Core.hpp> 8 #include <KokkosBlas.hpp> 9 #include <KokkosKernels_Sorting.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/aij.h> 18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64 if (aijkok->a_dual.need_sync_device()) { 65 aijkok->a_dual.sync_device(); 66 aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 67 aijkok->hermitian_updated = PETSC_FALSE; 68 } 69 PetscFunctionReturn(0); 70 } 71 72 /* Mark the CSR data on device as modified */ 73 static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 74 { 75 PetscErrorCode ierr; 76 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 77 78 PetscFunctionBegin; 79 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 80 aijkok->a_dual.clear_sync_state(); 81 aijkok->a_dual.modify_device(); 82 aijkok->transpose_updated = PETSC_FALSE; 83 aijkok->hermitian_updated = PETSC_FALSE; 84 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 85 PetscFunctionReturn(0); 86 } 87 88 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 89 { 90 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 91 92 PetscFunctionBegin; 93 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 94 /* We do not expect one needs factors on host */ 95 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 96 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 97 aijkok->a_dual.sync_host(); 98 PetscFunctionReturn(0); 99 } 100 101 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 102 { 103 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 104 105 PetscFunctionBegin; 106 if (aijkok) { 107 aijkok->a_dual.sync_host(); 108 *array = aijkok->a_dual.view_host().data(); 109 } else { /* Happens when calling MatSetValues on a newly created matrix */ 110 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 111 } 112 PetscFunctionReturn(0); 113 } 114 115 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 116 { 117 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 118 119 PetscFunctionBegin; 120 if (aijkok) aijkok->a_dual.modify_host(); 121 PetscFunctionReturn(0); 122 } 123 124 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 127 128 PetscFunctionBegin; 129 aijkok->a_dual.sync_host(); 130 *array = aijkok->a_dual.view_host().data(); 131 PetscFunctionReturn(0); 132 } 133 134 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 135 { 136 PetscFunctionBegin; 137 *array = NULL; 138 PetscFunctionReturn(0); 139 } 140 141 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 142 { 143 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 144 145 PetscFunctionBegin; 146 *array = aijkok->a_dual.view_host().data(); 147 PetscFunctionReturn(0); 148 } 149 150 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 151 { 152 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 153 154 PetscFunctionBegin; 155 aijkok->a_dual.clear_sync_state(); 156 aijkok->a_dual.modify_host(); 157 PetscFunctionReturn(0); 158 } 159 160 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 161 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 162 { 163 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 165 166 PetscFunctionBegin; 167 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 168 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 169 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 170 PetscFunctionReturn(0); 171 } 172 173 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 174 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 175 { 176 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 177 178 PetscFunctionBegin; 179 if (aijkok && aijkok->device_mat_d.data()) { 180 *d_mat = aijkok->device_mat_d.data(); 181 } else { 182 PetscErrorCode ierr; 183 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 184 *d_mat = NULL; 185 } 186 PetscFunctionReturn(0); 187 } 188 189 /* Generate the transpose on device and cache it internally */ 190 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 191 { 192 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 193 194 PetscFunctionBegin; 195 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 196 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 197 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 198 CHKERRCXX(aijkok->a_dual.sync_device()); 199 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 200 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 201 aijkok->transpose_updated = PETSC_TRUE; 202 } 203 *csrmatT = &aijkok->csrmatT; 204 PetscFunctionReturn(0); 205 } 206 207 /* Generate the Hermitian on device and cache it internally */ 208 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 209 { 210 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 211 212 PetscFunctionBegin; 213 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 214 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 215 CHKERRCXX(aijkok->a_dual.sync_device()); 216 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 217 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 218 #if defined(PETSC_USE_COMPLEX) 219 const auto& a = aijkok->csrmatH.values; 220 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 221 #endif 222 aijkok->hermitian_updated = PETSC_TRUE; 223 } 224 *csrmatH = &aijkok->csrmatH; 225 PetscFunctionReturn(0); 226 } 227 228 /* y = A x */ 229 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 230 { 231 PetscErrorCode ierr; 232 Mat_SeqAIJKokkos *aijkok; 233 ConstPetscScalarKokkosView xv; 234 PetscScalarKokkosView yv; 235 236 PetscFunctionBegin; 237 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 238 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 239 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 240 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 241 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 242 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 243 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 244 ierr = WaitForKokkos();CHKERRQ(ierr); 245 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 246 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 247 PetscFunctionReturn(0); 248 } 249 250 /* y = A^T x */ 251 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 252 { 253 PetscErrorCode ierr; 254 Mat_SeqAIJKokkos *aijkok; 255 const char *mode; 256 ConstPetscScalarKokkosView xv; 257 PetscScalarKokkosView yv; 258 KokkosCsrMatrix *csrmat; 259 260 PetscFunctionBegin; 261 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 262 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 263 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 264 if (A->form_explicit_transpose) { 265 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 266 mode = "N"; 267 } else { 268 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 269 csrmat = &aijkok->csrmat; 270 mode = "T"; 271 } 272 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 273 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 274 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 275 ierr = WaitForKokkos();CHKERRQ(ierr); 276 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 277 PetscFunctionReturn(0); 278 } 279 280 /* y = A^H x */ 281 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 282 { 283 PetscErrorCode ierr; 284 Mat_SeqAIJKokkos *aijkok; 285 const char *mode; 286 ConstPetscScalarKokkosView xv; 287 PetscScalarKokkosView yv; 288 KokkosCsrMatrix *csrmat; 289 290 PetscFunctionBegin; 291 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 292 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 293 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 294 if (A->form_explicit_transpose) { 295 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 296 mode = "N"; 297 } else { 298 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 299 csrmat = &aijkok->csrmat; 300 mode = "C"; 301 } 302 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 303 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 304 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 305 ierr = WaitForKokkos();CHKERRQ(ierr); 306 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 307 PetscFunctionReturn(0); 308 } 309 310 /* z = A x + y */ 311 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 312 { 313 PetscErrorCode ierr; 314 Mat_SeqAIJKokkos *aijkok; 315 ConstPetscScalarKokkosView xv,yv; 316 PetscScalarKokkosView zv; 317 318 PetscFunctionBegin; 319 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 320 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 321 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 322 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 323 if (zz != yy) Kokkos::deep_copy(zv,yv); 324 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 325 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 326 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 327 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 328 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 329 ierr = WaitForKokkos();CHKERRQ(ierr); 330 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 331 PetscFunctionReturn(0); 332 } 333 334 /* z = A^T x + y */ 335 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 336 { 337 PetscErrorCode ierr; 338 Mat_SeqAIJKokkos *aijkok; 339 const char *mode; 340 ConstPetscScalarKokkosView xv,yv; 341 PetscScalarKokkosView zv; 342 KokkosCsrMatrix *csrmat; 343 344 PetscFunctionBegin; 345 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 346 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 347 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 348 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 349 if (zz != yy) Kokkos::deep_copy(zv,yv); 350 if (A->form_explicit_transpose) { 351 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 352 mode = "N"; 353 } else { 354 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 355 csrmat = &aijkok->csrmat; 356 mode = "T"; 357 } 358 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 359 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 360 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 361 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 362 ierr = WaitForKokkos();CHKERRQ(ierr); 363 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 364 PetscFunctionReturn(0); 365 } 366 367 /* z = A^H x + y */ 368 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 369 { 370 PetscErrorCode ierr; 371 Mat_SeqAIJKokkos *aijkok; 372 const char *mode; 373 ConstPetscScalarKokkosView xv,yv; 374 PetscScalarKokkosView zv; 375 KokkosCsrMatrix *csrmat; 376 377 PetscFunctionBegin; 378 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 379 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 380 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 381 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 382 if (zz != yy) Kokkos::deep_copy(zv,yv); 383 if (A->form_explicit_transpose) { 384 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 385 mode = "N"; 386 } else { 387 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 388 csrmat = &aijkok->csrmat; 389 mode = "C"; 390 } 391 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 392 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 393 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 394 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 395 ierr = WaitForKokkos();CHKERRQ(ierr); 396 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 397 PetscFunctionReturn(0); 398 } 399 400 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 401 { 402 PetscErrorCode ierr; 403 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 404 405 PetscFunctionBegin; 406 switch (op) { 407 case MAT_FORM_EXPLICIT_TRANSPOSE: 408 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 409 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 410 A->form_explicit_transpose = flg; 411 break; 412 default: 413 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 414 break; 415 } 416 PetscFunctionReturn(0); 417 } 418 419 /* Depending on reuse, either build a new mat, or use the existing mat */ 420 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 421 { 422 PetscErrorCode ierr; 423 Mat_SeqAIJ *aseq; 424 425 PetscFunctionBegin; 426 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 427 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 428 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 429 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 430 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 431 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 432 if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 433 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 434 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 435 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 436 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 437 aseq = static_cast<Mat_SeqAIJ*>(A->data); 438 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 439 if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 440 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 441 } 442 } 443 PetscFunctionReturn(0); 444 } 445 446 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 447 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 448 */ 449 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 450 { 451 PetscErrorCode ierr; 452 Mat_SeqAIJ *bseq; 453 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 454 Mat mat; 455 456 PetscFunctionBegin; 457 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 458 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 459 mat = *B; 460 if (A->assembled) { 461 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 462 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 463 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 464 /* Now copy values to B if needed */ 465 if (dupOption == MAT_COPY_VALUES) { 466 if (akok->a_dual.need_sync_device()) { 467 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 468 bkok->a_dual.modify_host(); 469 } else { /* If device has the latest data, we only copy data on device */ 470 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 471 bkok->a_dual.modify_device(); 472 } 473 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 474 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 475 bkok->a_dual.modify_host(); 476 } 477 mat->spptr = bkok; 478 } 479 480 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 481 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 482 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 483 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 484 PetscFunctionReturn(0); 485 } 486 487 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 488 { 489 PetscErrorCode ierr; 490 Mat At; 491 KokkosCsrMatrix *internT,*csrmatT; 492 Mat_SeqAIJKokkos *atkok,*bkok; 493 494 PetscFunctionBegin; 495 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 496 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 497 CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 498 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 499 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 500 if (reuse == MAT_INITIAL_MATRIX) *B = At; 501 else {ierr = MatHeaderMerge(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 502 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 503 if ((*B)->assembled) { 504 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 505 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 506 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 507 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 508 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 509 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 510 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 511 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 512 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 513 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 514 } 515 PetscFunctionReturn(0); 516 } 517 518 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 519 { 520 PetscErrorCode ierr; 521 Mat_SeqAIJKokkos *aijkok; 522 523 PetscFunctionBegin; 524 if (A->factortype == MAT_FACTOR_NONE) { 525 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 526 if (aijkok) { 527 if (aijkok->device_mat_d.data()) { 528 delete aijkok->colmap_d; 529 delete aijkok->i_uncompressed_d; 530 } 531 if (aijkok->diag_d) delete aijkok->diag_d; 532 } 533 delete aijkok; 534 } else { 535 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 536 } 537 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 538 A->spptr = NULL; 539 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 540 PetscFunctionReturn(0); 541 } 542 543 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 544 { 545 PetscErrorCode ierr; 546 547 PetscFunctionBegin; 548 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 549 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 550 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 551 PetscFunctionReturn(0); 552 } 553 554 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 555 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 556 { 557 PetscErrorCode ierr; 558 Mat_SeqAIJ *a,*b; 559 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 560 MatScalarKokkosView aa,ba,ca; 561 MatRowMapKokkosView ai,bi,ci; 562 MatColIdxKokkosView aj,bj,cj; 563 PetscInt m,n,nnz,aN; 564 565 PetscFunctionBegin; 566 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 567 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 568 PetscValidPointer(C,4); 569 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 570 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 571 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 572 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 573 574 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 575 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 576 a = static_cast<Mat_SeqAIJ*>(A->data); 577 b = static_cast<Mat_SeqAIJ*>(B->data); 578 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 579 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 580 aa = akok->a_dual.view_device(); 581 ai = akok->i_dual.view_device(); 582 ba = bkok->a_dual.view_device(); 583 bi = bkok->i_dual.view_device(); 584 m = A->rmap->n; /* M, N and nnz of C */ 585 n = A->cmap->n + B->cmap->n; 586 nnz = a->nz + b->nz; 587 aN = A->cmap->n; /* N of A */ 588 if (reuse == MAT_INITIAL_MATRIX) { 589 aj = akok->j_dual.view_device(); 590 bj = bkok->j_dual.view_device(); 591 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 592 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 593 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 594 ca = ca_dual.view_device(); 595 ci = ci_dual.view_device(); 596 cj = cj_dual.view_device(); 597 598 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 599 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 600 PetscInt i = t.league_rank(); /* row i */ 601 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 602 603 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 604 ci(i) = coffset; 605 if (i == m-1) ci(m) = ai(m) + bi(m); 606 }); 607 608 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 609 if (k < alen) { 610 ca(coffset+k) = aa(ai(i)+k); 611 cj(coffset+k) = aj(ai(i)+k); 612 } else { 613 ca(coffset+k) = ba(bi(i)+k-alen); 614 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 615 } 616 }); 617 }); 618 ca_dual.modify_device(); 619 ci_dual.modify_device(); 620 cj_dual.modify_device(); 621 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 622 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 623 } else if (reuse == MAT_REUSE_MATRIX) { 624 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 625 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 626 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 627 ca = ckok->a_dual.view_device(); 628 ci = ckok->i_dual.view_device(); 629 630 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 631 PetscInt i = t.league_rank(); /* row i */ 632 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 633 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 634 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 635 else ca(ci(i)+k) = ba(bi(i)+k-alen); 636 }); 637 }); 638 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 639 } 640 PetscFunctionReturn(0); 641 } 642 643 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 644 { 645 PetscFunctionBegin; 646 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 647 PetscFunctionReturn(0); 648 } 649 650 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 651 { 652 PetscErrorCode ierr; 653 Mat_Product *product = C->product; 654 Mat A,B; 655 bool transA,transB; /* use bool, since KK needs this type */ 656 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 657 Mat_SeqAIJ *c; 658 MatProductData_SeqAIJKokkos *pdata; 659 KokkosCsrMatrix *csrmatA,*csrmatB; 660 661 PetscFunctionBegin; 662 MatCheckProduct(C,1); 663 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 664 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 665 666 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 667 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 668 PetscFunctionReturn(0); 669 } 670 671 switch (product->type) { 672 case MATPRODUCT_AB: transA = false; transB = false; break; 673 case MATPRODUCT_AtB: transA = true; transB = false; break; 674 case MATPRODUCT_ABt: transA = false; transB = true; break; 675 default: 676 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 677 } 678 679 A = product->A; 680 B = product->B; 681 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 682 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 683 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 684 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 685 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 686 687 if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 688 689 csrmatA = &akok->csrmat; 690 csrmatB = &bkok->csrmat; 691 692 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 693 if (transA) { 694 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 695 transA = false; 696 } 697 698 if (transB) { 699 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 700 transB = false; 701 } 702 703 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 704 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 705 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 706 /* shorter version of MatAssemblyEnd_SeqAIJ */ 707 c = (Mat_SeqAIJ*)C->data; 708 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 709 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 710 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 711 c->reallocs = 0; 712 C->info.mallocs = 0; 713 C->info.nz_unneeded = 0; 714 C->assembled = C->was_assembled = PETSC_TRUE; 715 C->num_ass++; 716 PetscFunctionReturn(0); 717 } 718 719 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 720 { 721 PetscErrorCode ierr; 722 Mat_Product *product = C->product; 723 MatProductType ptype; 724 Mat A,B; 725 bool transA,transB; 726 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 727 MatProductData_SeqAIJKokkos *pdata; 728 MPI_Comm comm; 729 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 730 731 PetscFunctionBegin; 732 MatCheckProduct(C,1); 733 ierr = PetscObjectGetComm((PetscObject)C,&comm); 734 if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 735 A = product->A; 736 B = product->B; 737 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 738 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 739 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 740 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 741 csrmatA = &akok->csrmat; 742 csrmatB = &bkok->csrmat; 743 744 ptype = product->type; 745 switch (ptype) { 746 case MATPRODUCT_AB: transA = false; transB = false; break; 747 case MATPRODUCT_AtB: transA = true; transB = false; break; 748 case MATPRODUCT_ABt: transA = false; transB = true; break; 749 default: 750 SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 751 } 752 753 product->data = pdata = new MatProductData_SeqAIJKokkos(); 754 pdata->kh.set_team_work_size(16); 755 pdata->kh.set_dynamic_scheduling(true); 756 pdata->reusesym = product->api_user; 757 758 /* TODO: add command line options to select spgemm algorithms */ 759 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 760 #if defined(PETSC_HAVE_CUDA) 761 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 762 /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */ 763 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE; 764 #endif 765 #endif 766 pdata->kh.create_spgemm_handle(spgemm_alg); 767 768 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 769 if (transA) { 770 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 771 transA = false; 772 } 773 774 if (transB) { 775 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 776 transB = false; 777 } 778 779 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 780 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 781 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 782 calling new Mat_SeqAIJKokkos(). 783 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 784 */ 785 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 786 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 787 788 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 789 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 790 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 791 PetscFunctionReturn(0); 792 } 793 794 /* handles sparse matrix matrix ops */ 795 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 796 { 797 PetscErrorCode ierr; 798 Mat_Product *product = mat->product; 799 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 800 801 PetscFunctionBegin; 802 MatCheckProduct(mat,1); 803 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 804 if (product->type == MATPRODUCT_ABC) { 805 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 806 } 807 if (Biskok && Ciskok) { 808 switch (product->type) { 809 case MATPRODUCT_AB: 810 case MATPRODUCT_AtB: 811 case MATPRODUCT_ABt: 812 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 813 break; 814 case MATPRODUCT_PtAP: 815 case MATPRODUCT_RARt: 816 case MATPRODUCT_ABC: 817 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 818 break; 819 default: 820 break; 821 } 822 } else { /* fallback for AIJ */ 823 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 824 } 825 PetscFunctionReturn(0); 826 } 827 828 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 829 { 830 PetscErrorCode ierr; 831 Mat_SeqAIJKokkos *aijkok; 832 833 PetscFunctionBegin; 834 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 835 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 836 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 837 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 838 ierr = WaitForKokkos();CHKERRQ(ierr); 839 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 840 PetscFunctionReturn(0); 841 } 842 843 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 844 { 845 PetscErrorCode ierr; 846 Mat_SeqAIJKokkos *aijkok; 847 848 PetscFunctionBegin; 849 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 850 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 851 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 852 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */ 853 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 854 PetscFunctionReturn(0); 855 } 856 857 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 858 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 859 { 860 PetscErrorCode ierr; 861 Mat_SeqAIJKokkos *aijkok; 862 863 PetscFunctionBegin; 864 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 865 PetscValidPointer(kv,2); 866 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 867 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 868 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 869 *kv = aijkok->a_dual.view_device(); 870 PetscFunctionReturn(0); 871 } 872 873 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 874 { 875 PetscFunctionBegin; 876 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 877 PetscValidPointer(kv,2); 878 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 879 PetscFunctionReturn(0); 880 } 881 882 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 883 { 884 PetscErrorCode ierr; 885 Mat_SeqAIJKokkos *aijkok; 886 887 PetscFunctionBegin; 888 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 889 PetscValidPointer(kv,2); 890 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 891 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 892 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 893 *kv = aijkok->a_dual.view_device(); 894 PetscFunctionReturn(0); 895 } 896 897 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 898 { 899 PetscErrorCode ierr; 900 901 PetscFunctionBegin; 902 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 903 PetscValidPointer(kv,2); 904 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 905 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 906 PetscFunctionReturn(0); 907 } 908 909 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 910 { 911 Mat_SeqAIJKokkos *aijkok; 912 913 PetscFunctionBegin; 914 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 915 PetscValidPointer(kv,2); 916 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 917 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 918 *kv = aijkok->a_dual.view_device(); 919 PetscFunctionReturn(0); 920 } 921 922 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 923 { 924 PetscErrorCode ierr; 925 926 PetscFunctionBegin; 927 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 928 PetscValidPointer(kv,2); 929 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 930 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 931 PetscFunctionReturn(0); 932 } 933 934 /* Computes Y += alpha X */ 935 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 936 { 937 PetscErrorCode ierr; 938 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 939 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 940 ConstMatScalarKokkosView Xa; 941 MatScalarKokkosView Ya; 942 943 PetscFunctionBegin; 944 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 945 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 946 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 947 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 948 949 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 950 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 951 PetscBool e; 952 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 953 if (e) { 954 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 955 if (e) pattern = SAME_NONZERO_PATTERN; 956 } 957 } 958 959 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 960 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 961 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 962 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 963 */ 964 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 965 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 966 Xa = xkok->a_dual.view_device(); 967 Ya = ykok->a_dual.view_device(); 968 969 if (pattern == SAME_NONZERO_PATTERN) { 970 KokkosBlas::axpy(alpha,Xa,Ya); 971 ierr = MatSeqAIJKokkosModifyDevice(Y); 972 } else if (pattern == SUBSET_NONZERO_PATTERN) { 973 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 974 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 975 976 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 977 PetscInt i = t.league_rank(); /* row i */ 978 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 979 PetscInt p,q = Yi(i); 980 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 981 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 982 if (Xj(p) == Yj(q)) { /* Found it */ 983 Ya(q) += alpha * Xa(p); 984 q++; 985 } else { 986 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 987 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 988 */ 989 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 990 } 991 } 992 }); 993 }); 994 ierr = MatSeqAIJKokkosModifyDevice(Y); 995 } else { /* different nonzero patterns */ 996 Mat Z; 997 KokkosCsrMatrix zcsr; 998 KernelHandle kh; 999 kh.create_spadd_handle(false); 1000 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1001 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1002 zkok = new Mat_SeqAIJKokkos(zcsr); 1003 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1004 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1005 kh.destroy_spadd_handle(); 1006 } 1007 1008 PetscFunctionReturn(0); 1009 } 1010 1011 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1012 { 1013 PetscErrorCode ierr; 1014 1015 PetscFunctionBegin; 1016 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1017 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1018 B->offloadmask = PETSC_OFFLOAD_CPU; 1019 PetscFunctionReturn(0); 1020 } 1021 1022 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1023 { 1024 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1025 1026 PetscFunctionBegin; 1027 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1028 A->boundtocpu = PETSC_FALSE; 1029 1030 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1031 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1032 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1033 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1034 A->ops->scale = MatScale_SeqAIJKokkos; 1035 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1036 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1037 A->ops->mult = MatMult_SeqAIJKokkos; 1038 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1039 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1040 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1041 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1042 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1043 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1044 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1045 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1046 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1047 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1048 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1049 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1050 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1051 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1052 PetscFunctionReturn(0); 1053 } 1054 1055 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1056 { 1057 PetscErrorCode ierr; 1058 Mat_SeqAIJ *aseq; 1059 PetscInt i,m,n; 1060 1061 PetscFunctionBegin; 1062 if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1063 1064 m = akok->nrows(); 1065 n = akok->ncols(); 1066 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1067 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1068 1069 /* Set up data structures of A as a MATSEQAIJ */ 1070 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1071 aseq = (Mat_SeqAIJ*)(A)->data; 1072 1073 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1074 akok->j_dual.sync_host(); 1075 1076 aseq->i = akok->i_host_data(); 1077 aseq->j = akok->j_host_data(); 1078 aseq->a = akok->a_host_data(); 1079 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1080 aseq->singlemalloc = PETSC_FALSE; 1081 aseq->free_a = PETSC_FALSE; 1082 aseq->free_ij = PETSC_FALSE; 1083 aseq->nz = akok->nnz(); 1084 aseq->maxnz = aseq->nz; 1085 1086 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1087 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1088 for (i=0; i<m; i++) { 1089 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1090 } 1091 1092 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1093 akok->nonzerostate = A->nonzerostate; 1094 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1095 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1096 A->spptr = akok; 1097 PetscFunctionReturn(0); 1098 } 1099 1100 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1101 1102 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1103 */ 1104 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1105 { 1106 PetscErrorCode ierr; 1107 1108 PetscFunctionBegin; 1109 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1110 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1111 PetscFunctionReturn(0); 1112 } 1113 1114 /* --------------------------------------------------------------------------------*/ 1115 /*@C 1116 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1117 (the default parallel PETSc format). This matrix will ultimately be handled by 1118 Kokkos for calculations. For good matrix 1119 assembly performance the user should preallocate the matrix storage by setting 1120 the parameter nz (or the array nnz). By setting these parameters accurately, 1121 performance during matrix assembly can be increased by more than a factor of 50. 1122 1123 Collective 1124 1125 Input Parameters: 1126 + comm - MPI communicator, set to PETSC_COMM_SELF 1127 . m - number of rows 1128 . n - number of columns 1129 . nz - number of nonzeros per row (same for all rows) 1130 - nnz - array containing the number of nonzeros in the various rows 1131 (possibly different for each row) or NULL 1132 1133 Output Parameter: 1134 . A - the matrix 1135 1136 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1137 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1138 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1139 1140 Notes: 1141 If nnz is given then nz is ignored 1142 1143 The AIJ format (also called the Yale sparse matrix format or 1144 compressed row storage), is fully compatible with standard Fortran 77 1145 storage. That is, the stored row and column indices can begin at 1146 either one (as in Fortran) or zero. See the users' manual for details. 1147 1148 Specify the preallocated storage with either nz or nnz (not both). 1149 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1150 allocation. For large problems you MUST preallocate memory or you 1151 will get TERRIBLE performance, see the users' manual chapter on matrices. 1152 1153 By default, this format uses inodes (identical nodes) when possible, to 1154 improve numerical efficiency of matrix-vector products and solves. We 1155 search for consecutive rows with the same nonzero structure, thereby 1156 reusing matrix information to achieve increased efficiency. 1157 1158 Level: intermediate 1159 1160 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1161 @*/ 1162 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1163 { 1164 PetscErrorCode ierr; 1165 1166 PetscFunctionBegin; 1167 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1168 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1169 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1170 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1171 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1172 PetscFunctionReturn(0); 1173 } 1174 1175 typedef Kokkos::TeamPolicy<>::member_type team_member; 1176 // 1177 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 1178 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1179 // 1180 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1181 { 1182 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1183 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1184 IS isrow = b->row,isicol = b->icol; 1185 PetscErrorCode ierr; 1186 const PetscInt *r_h,*ic_h; 1187 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1188 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1189 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1190 PetscBool row_identity,col_identity; 1191 PetscInt nc, Nf, nVec=32; // should be a parameter 1192 PetscContainer container; 1193 1194 PetscFunctionBegin; 1195 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 1196 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1197 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1198 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 1199 if (container) { 1200 PetscInt *pNf=NULL, nv; 1201 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 1202 Nf = (*pNf)%1000; 1203 nv = (*pNf)/1000; 1204 if (nv>0) nVec = nv; 1205 } else Nf = 1; 1206 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 1207 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1208 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1209 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1210 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1211 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1212 #endif 1213 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1214 { 1215 #define KOKKOS_SHARED_LEVEL 1 1216 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1217 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1218 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1219 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1220 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1221 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1222 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1223 size_t flops_h = 0.0; 1224 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1225 Kokkos::View<size_t> d_flops_k ("flops"); 1226 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1227 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1228 Kokkos::deep_copy (d_flops_k, h_flops_k); 1229 Kokkos::deep_copy (d_r_k, h_r_k); 1230 Kokkos::deep_copy (d_ic_k, h_ic_k); 1231 // Fill A --> fact 1232 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1233 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1234 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1235 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1236 // zero rows of B 1237 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1238 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1239 PetscScalar *baL = ba_d + bi_d[rowb]; 1240 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1241 /* zero (unfactored row) */ 1242 for (int j=0;j<nzbL;j++) baL[j] = 0; 1243 for (int j=0;j<nzbU;j++) baU[j] = 0; 1244 }); 1245 // copy A into B 1246 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1247 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1248 const PetscScalar *av = aa_d + ai_d[rowa]; 1249 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1250 /* load in initial (unfactored row) */ 1251 for (int j=0;j<nza;j++) { 1252 PetscInt colb = ic[ajtmp[j]]; 1253 PetscScalar vala = av[j]; 1254 if (colb == rowb) { 1255 *(ba_d + bdiag_d[rowb]) = vala; 1256 } else { 1257 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1258 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1259 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1260 for (int j=0; j<nz ; j++) { 1261 if (pbj[j] == colb) { 1262 pba[j] = vala; 1263 set++; 1264 break; 1265 } 1266 } 1267 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1268 } 1269 } 1270 }); 1271 }); 1272 Kokkos::fence(); 1273 1274 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1275 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1276 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1277 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1278 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1279 const PetscInt start = field*nloc, end = start + nloc; 1280 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1281 // A22 panel update for each row A(1,:) and col A(:,1) 1282 for (int ii=start; ii<end-1; ii++) { 1283 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1284 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1285 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1286 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1287 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1288 PetscInt kIdx = j*Ni + field_block_idx; 1289 if (kIdx >= nzUi) /* void */ ; 1290 else { 1291 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1292 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1293 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1294 size_t st_idx; 1295 // find and do L(k,i) = A(:k,i) / A(i,i) 1296 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1297 // get column, there has got to be a better way 1298 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1299 if (pjL[j] == ii) { 1300 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1301 idx = j; // output 1302 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1303 } 1304 }, st_idx); 1305 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1306 #if defined(PETSC_USE_DEBUG) 1307 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1308 #endif 1309 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1310 // U(i+1,:end) 1311 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1312 PetscScalar Uij = baUi[uiIdx]; 1313 PetscInt col = bjUi[uiIdx]; 1314 if (col==myk) { 1315 // A_kk = A_kk - L_ki * U_ij(k) 1316 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1317 *Akkv = *Akkv - L_ki() * Uij; // UiK 1318 } else { 1319 PetscScalar *start, *end, *pAkjv=NULL; 1320 PetscInt high, low; 1321 const PetscInt *startj; 1322 if (col<myk) { // L 1323 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1324 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1325 start = pLki+1; // start at pLki+1, A22(myk,1) 1326 startj= bj_d + bi_d[myk] + idx; 1327 end = ba_d + bi_d[myk+1]; 1328 } else { 1329 PetscInt idx = bdiag_d[myk+1]+1; 1330 start = ba_d + idx; 1331 startj= bj_d + idx; 1332 end = ba_d + bdiag_d[myk]; 1333 } 1334 // search for 'col', use bisection search - TODO 1335 low = 0; 1336 high = (PetscInt)(end-start); 1337 while (high-low > 5) { 1338 int t = (low+high)/2; 1339 if (startj[t] > col) high = t; 1340 else low = t; 1341 } 1342 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1343 if (startj[pAkjv-start] == col) break; 1344 } 1345 #if defined(PETSC_USE_DEBUG) 1346 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1347 #endif 1348 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1349 } 1350 }); 1351 } 1352 }); 1353 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1354 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1355 } /* endof for (i=0; i<n; i++) { */ 1356 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1357 }); 1358 Kokkos::fence(); 1359 Kokkos::deep_copy (h_flops_k, d_flops_k); 1360 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1361 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1362 #elif defined(PETSC_USE_LOG) 1363 ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1364 #endif 1365 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1366 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1367 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1368 /* Invert diagonal for simpler triangular solves */ 1369 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1370 int i = start + outer_index*Ni + lg_rank%Ni; 1371 if (i < end) { 1372 PetscScalar *pv = ba_d + bdiag_d[i]; 1373 *pv = 1.0/(*pv); 1374 } 1375 }); 1376 }); 1377 } 1378 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1379 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1380 #endif 1381 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1382 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1383 1384 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1385 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1386 if (b->inode.size) { 1387 B->ops->solve = MatSolve_SeqAIJ_Inode; 1388 } else if (row_identity && col_identity) { 1389 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1390 } else { 1391 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1392 } 1393 B->offloadmask = PETSC_OFFLOAD_GPU; 1394 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1395 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1396 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1397 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1398 B->ops->matsolve = MatMatSolve_SeqAIJ; 1399 B->assembled = PETSC_TRUE; 1400 B->preallocated = PETSC_TRUE; 1401 1402 PetscFunctionReturn(0); 1403 } 1404 1405 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1406 { 1407 PetscErrorCode ierr; 1408 1409 PetscFunctionBegin; 1410 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1411 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1412 PetscFunctionReturn(0); 1413 } 1414 1415 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1416 { 1417 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1418 1419 PetscFunctionBegin; 1420 if (!factors->sptrsv_symbolic_completed) { 1421 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1422 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1423 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1424 } 1425 PetscFunctionReturn(0); 1426 } 1427 1428 /* Check if we need to update factors etc for transpose solve */ 1429 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1430 { 1431 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1432 MatColIdxType n = A->rmap->n; 1433 1434 PetscFunctionBegin; 1435 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1436 /* Update L^T and do sptrsv symbolic */ 1437 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1438 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1439 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1440 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1441 1442 KokkosKernels::Impl::transpose_matrix< 1443 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1444 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1445 MatRowMapKokkosView,DefaultExecutionSpace>( 1446 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1447 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1448 1449 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1450 We have to sort the indices, until KK provides finer control options. 1451 */ 1452 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1453 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1454 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1455 1456 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1457 1458 /* Update U^T and do sptrsv symbolic */ 1459 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1460 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1461 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1462 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1463 1464 KokkosKernels::Impl::transpose_matrix< 1465 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1466 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1467 MatRowMapKokkosView,DefaultExecutionSpace>( 1468 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1469 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1470 1471 /* Sort indices. See comments above */ 1472 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1473 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1474 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1475 1476 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1477 factors->transpose_updated = PETSC_TRUE; 1478 } 1479 PetscFunctionReturn(0); 1480 } 1481 1482 /* Solve Ax = b, with A = LU */ 1483 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1484 { 1485 PetscErrorCode ierr; 1486 ConstPetscScalarKokkosView bv; 1487 PetscScalarKokkosView xv; 1488 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1489 1490 PetscFunctionBegin; 1491 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1492 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1493 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1494 /* Solve L tmpv = b */ 1495 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1496 /* Solve Ux = tmpv */ 1497 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1498 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1499 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1500 PetscFunctionReturn(0); 1501 } 1502 1503 /* Solve A^T x = b, where A^T = U^T L^T */ 1504 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1505 { 1506 PetscErrorCode ierr; 1507 ConstPetscScalarKokkosView bv; 1508 PetscScalarKokkosView xv; 1509 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1510 1511 PetscFunctionBegin; 1512 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1513 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1514 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1515 /* Solve U^T tmpv = b */ 1516 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1517 1518 /* Solve L^T x = tmpv */ 1519 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1520 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1521 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1522 PetscFunctionReturn(0); 1523 } 1524 1525 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1526 { 1527 PetscErrorCode ierr; 1528 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1529 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1530 PetscInt fill_lev = info->levels; 1531 1532 PetscFunctionBegin; 1533 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1534 1535 auto a_d = aijkok->a_dual.view_device(); 1536 auto i_d = aijkok->i_dual.view_device(); 1537 auto j_d = aijkok->j_dual.view_device(); 1538 1539 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1540 1541 B->assembled = PETSC_TRUE; 1542 B->preallocated = PETSC_TRUE; 1543 B->ops->solve = MatSolve_SeqAIJKokkos; 1544 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1545 B->ops->matsolve = NULL; 1546 B->ops->matsolvetranspose = NULL; 1547 B->offloadmask = PETSC_OFFLOAD_GPU; 1548 1549 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1550 factors->transpose_updated = PETSC_FALSE; 1551 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1552 PetscFunctionReturn(0); 1553 } 1554 1555 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1556 { 1557 PetscErrorCode ierr; 1558 Mat_SeqAIJKokkos *aijkok; 1559 Mat_SeqAIJ *b; 1560 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1561 PetscInt fill_lev = info->levels; 1562 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1563 PetscInt n = A->rmap->n; 1564 1565 PetscFunctionBegin; 1566 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1567 /* Rebuild factors */ 1568 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1569 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1570 1571 /* Create a spiluk handle and then do symbolic factorization */ 1572 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1573 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1574 1575 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1576 1577 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1578 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1579 Kokkos::realloc(factors->iU_d,n+1); 1580 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1581 1582 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1583 auto i_d = aijkok->i_dual.view_device(); 1584 auto j_d = aijkok->j_dual.view_device(); 1585 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1586 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1587 1588 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1589 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1590 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1591 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1592 1593 /* TODO: add options to select sptrsv algorithms */ 1594 /* Create sptrsv handles for L, U and their transpose */ 1595 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1596 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1597 #else 1598 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1599 #endif 1600 1601 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1602 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1603 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1604 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1605 1606 /* Fill fields of the factor matrix B */ 1607 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1608 b = (Mat_SeqAIJ*)B->data; 1609 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1610 B->info.fill_ratio_given = info->fill; 1611 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1612 1613 B->offloadmask = PETSC_OFFLOAD_GPU; 1614 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1615 PetscFunctionReturn(0); 1616 } 1617 1618 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1619 { 1620 PetscErrorCode ierr; 1621 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1622 const PetscInt nrows = A->rmap->n; 1623 1624 PetscFunctionBegin; 1625 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1626 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1627 // move B data into Kokkos 1628 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1629 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1630 { 1631 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1632 if (!baijkok->diag_d) { 1633 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1634 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1635 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1636 } 1637 } 1638 PetscFunctionReturn(0); 1639 } 1640 1641 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1642 { 1643 PetscFunctionBegin; 1644 *type = MATSOLVERKOKKOS; 1645 PetscFunctionReturn(0); 1646 } 1647 1648 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1649 { 1650 PetscFunctionBegin; 1651 *type = MATSOLVERKOKKOSDEVICE; 1652 PetscFunctionReturn(0); 1653 } 1654 1655 /*MC 1656 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1657 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1658 1659 Level: beginner 1660 1661 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1662 M*/ 1663 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1664 { 1665 PetscErrorCode ierr; 1666 PetscInt n = A->rmap->n; 1667 1668 PetscFunctionBegin; 1669 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1670 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1671 (*B)->factortype = ftype; 1672 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1673 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1674 1675 if (ftype == MAT_FACTOR_LU) { 1676 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1677 (*B)->canuseordering = PETSC_TRUE; 1678 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1679 } else if (ftype == MAT_FACTOR_ILU) { 1680 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1681 (*B)->canuseordering = PETSC_FALSE; 1682 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1683 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1684 1685 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1686 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1687 PetscFunctionReturn(0); 1688 } 1689 1690 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1691 { 1692 PetscErrorCode ierr; 1693 PetscInt n = A->rmap->n; 1694 1695 PetscFunctionBegin; 1696 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1697 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1698 (*B)->factortype = ftype; 1699 (*B)->canuseordering = PETSC_TRUE; 1700 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1701 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1702 1703 if (ftype == MAT_FACTOR_LU) { 1704 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1705 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1706 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1707 1708 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1709 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1710 PetscFunctionReturn(0); 1711 } 1712 1713 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1714 { 1715 PetscErrorCode ierr; 1716 1717 PetscFunctionBegin; 1718 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1719 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1720 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1721 PetscFunctionReturn(0); 1722 } 1723 1724 /* Utility to print out a KokkosCsrMatrix for debugging */ 1725 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1726 { 1727 PetscErrorCode ierr; 1728 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1729 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1730 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1731 const PetscInt *i = iv.data(); 1732 const PetscInt *j = jv.data(); 1733 const PetscScalar *a = av.data(); 1734 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1735 1736 PetscFunctionBegin; 1737 ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1738 for (PetscInt k=0; k<m; k++) { 1739 ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr); 1740 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1741 ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1742 } 1743 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1744 } 1745 PetscFunctionReturn(0); 1746 } 1747