1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petscsystypes.h> 5 #include <petscerror.h> 6 7 #include <Kokkos_Core.hpp> 8 #include <KokkosBlas.hpp> 9 #include <KokkosKernels_Sorting.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/aij.h> 18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64 if (aijkok->a_dual.need_sync_device()) { 65 aijkok->a_dual.sync_device(); 66 aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 67 aijkok->hermitian_updated = PETSC_FALSE; 68 } 69 PetscFunctionReturn(0); 70 } 71 72 /* Mark the CSR data on device as modified */ 73 static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 74 { 75 PetscErrorCode ierr; 76 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 77 78 PetscFunctionBegin; 79 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 80 aijkok->a_dual.clear_sync_state(); 81 aijkok->a_dual.modify_device(); 82 aijkok->transpose_updated = PETSC_FALSE; 83 aijkok->hermitian_updated = PETSC_FALSE; 84 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 85 PetscFunctionReturn(0); 86 } 87 88 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 89 { 90 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 91 92 PetscFunctionBegin; 93 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 94 /* We do not expect one needs factors on host */ 95 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 96 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 97 aijkok->a_dual.sync_host(); 98 PetscFunctionReturn(0); 99 } 100 101 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 102 { 103 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 104 105 PetscFunctionBegin; 106 if (aijkok) { 107 aijkok->a_dual.sync_host(); 108 *array = aijkok->a_dual.view_host().data(); 109 } else { /* Happens when calling MatSetValues on a newly created matrix */ 110 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 111 } 112 PetscFunctionReturn(0); 113 } 114 115 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 116 { 117 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 118 119 PetscFunctionBegin; 120 if (aijkok) aijkok->a_dual.modify_host(); 121 PetscFunctionReturn(0); 122 } 123 124 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 127 128 PetscFunctionBegin; 129 aijkok->a_dual.sync_host(); 130 *array = aijkok->a_dual.view_host().data(); 131 PetscFunctionReturn(0); 132 } 133 134 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 135 { 136 PetscFunctionBegin; 137 *array = NULL; 138 PetscFunctionReturn(0); 139 } 140 141 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 142 { 143 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 144 145 PetscFunctionBegin; 146 *array = aijkok->a_dual.view_host().data(); 147 PetscFunctionReturn(0); 148 } 149 150 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 151 { 152 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 153 154 PetscFunctionBegin; 155 aijkok->a_dual.clear_sync_state(); 156 aijkok->a_dual.modify_host(); 157 PetscFunctionReturn(0); 158 } 159 160 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 161 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 162 { 163 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 165 166 PetscFunctionBegin; 167 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 168 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 169 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 170 PetscFunctionReturn(0); 171 } 172 173 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 174 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 175 { 176 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 177 178 PetscFunctionBegin; 179 if (aijkok && aijkok->device_mat_d.data()) { 180 *d_mat = aijkok->device_mat_d.data(); 181 } else { 182 PetscErrorCode ierr; 183 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 184 *d_mat = NULL; 185 } 186 PetscFunctionReturn(0); 187 } 188 189 /* Generate the transpose on device and cache it internally */ 190 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 191 { 192 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 193 194 PetscFunctionBegin; 195 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 196 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 197 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 198 CHKERRCXX(aijkok->a_dual.sync_device()); 199 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 200 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 201 aijkok->transpose_updated = PETSC_TRUE; 202 } 203 *csrmatT = &aijkok->csrmatT; 204 PetscFunctionReturn(0); 205 } 206 207 /* Generate the Hermitian on device and cache it internally */ 208 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 209 { 210 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 211 212 PetscFunctionBegin; 213 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 214 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 215 CHKERRCXX(aijkok->a_dual.sync_device()); 216 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 217 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 218 #if defined(PETSC_USE_COMPLEX) 219 const auto& a = aijkok->csrmatH.values; 220 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 221 #endif 222 aijkok->hermitian_updated = PETSC_TRUE; 223 } 224 *csrmatH = &aijkok->csrmatH; 225 PetscFunctionReturn(0); 226 } 227 228 /* y = A x */ 229 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 230 { 231 PetscErrorCode ierr; 232 Mat_SeqAIJKokkos *aijkok; 233 ConstPetscScalarKokkosView xv; 234 PetscScalarKokkosView yv; 235 236 PetscFunctionBegin; 237 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 238 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 239 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 240 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 241 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 242 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 243 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 244 ierr = WaitForKokkos();CHKERRQ(ierr); 245 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 246 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 247 PetscFunctionReturn(0); 248 } 249 250 /* y = A^T x */ 251 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 252 { 253 PetscErrorCode ierr; 254 Mat_SeqAIJKokkos *aijkok; 255 const char *mode; 256 ConstPetscScalarKokkosView xv; 257 PetscScalarKokkosView yv; 258 KokkosCsrMatrix *csrmat; 259 260 PetscFunctionBegin; 261 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 262 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 263 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 264 if (A->form_explicit_transpose) { 265 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 266 mode = "N"; 267 } else { 268 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 269 csrmat = &aijkok->csrmat; 270 mode = "T"; 271 } 272 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 273 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 274 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 275 ierr = WaitForKokkos();CHKERRQ(ierr); 276 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 277 PetscFunctionReturn(0); 278 } 279 280 /* y = A^H x */ 281 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 282 { 283 PetscErrorCode ierr; 284 Mat_SeqAIJKokkos *aijkok; 285 const char *mode; 286 ConstPetscScalarKokkosView xv; 287 PetscScalarKokkosView yv; 288 KokkosCsrMatrix *csrmat; 289 290 PetscFunctionBegin; 291 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 292 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 293 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 294 if (A->form_explicit_transpose) { 295 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 296 mode = "N"; 297 } else { 298 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 299 csrmat = &aijkok->csrmat; 300 mode = "C"; 301 } 302 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 303 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 304 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 305 ierr = WaitForKokkos();CHKERRQ(ierr); 306 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 307 PetscFunctionReturn(0); 308 } 309 310 /* z = A x + y */ 311 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 312 { 313 PetscErrorCode ierr; 314 Mat_SeqAIJKokkos *aijkok; 315 ConstPetscScalarKokkosView xv,yv; 316 PetscScalarKokkosView zv; 317 318 PetscFunctionBegin; 319 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 320 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 321 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 322 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 323 if (zz != yy) Kokkos::deep_copy(zv,yv); 324 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 325 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 326 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 327 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 328 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 329 ierr = WaitForKokkos();CHKERRQ(ierr); 330 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 331 PetscFunctionReturn(0); 332 } 333 334 /* z = A^T x + y */ 335 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 336 { 337 PetscErrorCode ierr; 338 Mat_SeqAIJKokkos *aijkok; 339 const char *mode; 340 ConstPetscScalarKokkosView xv,yv; 341 PetscScalarKokkosView zv; 342 KokkosCsrMatrix *csrmat; 343 344 PetscFunctionBegin; 345 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 346 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 347 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 348 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 349 if (zz != yy) Kokkos::deep_copy(zv,yv); 350 if (A->form_explicit_transpose) { 351 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 352 mode = "N"; 353 } else { 354 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 355 csrmat = &aijkok->csrmat; 356 mode = "T"; 357 } 358 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 359 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 360 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 361 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 362 ierr = WaitForKokkos();CHKERRQ(ierr); 363 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 364 PetscFunctionReturn(0); 365 } 366 367 /* z = A^H x + y */ 368 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 369 { 370 PetscErrorCode ierr; 371 Mat_SeqAIJKokkos *aijkok; 372 const char *mode; 373 ConstPetscScalarKokkosView xv,yv; 374 PetscScalarKokkosView zv; 375 KokkosCsrMatrix *csrmat; 376 377 PetscFunctionBegin; 378 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 379 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 380 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 381 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 382 if (zz != yy) Kokkos::deep_copy(zv,yv); 383 if (A->form_explicit_transpose) { 384 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 385 mode = "N"; 386 } else { 387 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 388 csrmat = &aijkok->csrmat; 389 mode = "C"; 390 } 391 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 392 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 393 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 394 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 395 ierr = WaitForKokkos();CHKERRQ(ierr); 396 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 397 PetscFunctionReturn(0); 398 } 399 400 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 401 { 402 PetscErrorCode ierr; 403 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 404 405 PetscFunctionBegin; 406 switch (op) { 407 case MAT_FORM_EXPLICIT_TRANSPOSE: 408 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 409 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 410 A->form_explicit_transpose = flg; 411 break; 412 default: 413 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 414 break; 415 } 416 PetscFunctionReturn(0); 417 } 418 419 /* Depending on reuse, either build a new mat, or use the existing mat */ 420 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 421 { 422 PetscErrorCode ierr; 423 Mat_SeqAIJ *aseq; 424 425 PetscFunctionBegin; 426 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 427 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 428 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 429 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 430 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 431 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 432 if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 433 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 434 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 435 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 436 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 437 aseq = static_cast<Mat_SeqAIJ*>(A->data); 438 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 439 if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 440 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 441 } 442 } 443 PetscFunctionReturn(0); 444 } 445 446 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 447 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 448 */ 449 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 450 { 451 PetscErrorCode ierr; 452 Mat_SeqAIJ *bseq; 453 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 454 Mat mat; 455 456 PetscFunctionBegin; 457 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 458 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 459 mat = *B; 460 if (A->assembled) { 461 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 462 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 463 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 464 /* Now copy values to B if needed */ 465 if (dupOption == MAT_COPY_VALUES) { 466 if (akok->a_dual.need_sync_device()) { 467 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 468 bkok->a_dual.modify_host(); 469 } else { /* If device has the latest data, we only copy data on device */ 470 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 471 bkok->a_dual.modify_device(); 472 } 473 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 474 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 475 bkok->a_dual.modify_host(); 476 } 477 mat->spptr = bkok; 478 } 479 480 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 481 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 482 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 483 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 484 PetscFunctionReturn(0); 485 } 486 487 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 488 { 489 PetscErrorCode ierr; 490 Mat At; 491 KokkosCsrMatrix *internT,*csrmatT; 492 Mat_SeqAIJKokkos *atkok,*bkok; 493 494 PetscFunctionBegin; 495 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 496 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 497 CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 498 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 499 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 500 if (reuse == MAT_INITIAL_MATRIX) *B = At; 501 else {ierr = MatHeaderMerge(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 502 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 503 if ((*B)->assembled) { 504 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 505 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 506 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 507 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 508 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 509 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 510 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 511 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 512 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 513 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 514 } 515 PetscFunctionReturn(0); 516 } 517 518 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 519 { 520 PetscErrorCode ierr; 521 Mat_SeqAIJKokkos *aijkok; 522 523 PetscFunctionBegin; 524 if (A->factortype == MAT_FACTOR_NONE) { 525 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 526 if (aijkok) { 527 if (aijkok->device_mat_d.data()) { 528 delete aijkok->colmap_d; 529 delete aijkok->i_uncompressed_d; 530 } 531 if (aijkok->diag_d) delete aijkok->diag_d; 532 } 533 delete aijkok; 534 } else { 535 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 536 } 537 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 538 A->spptr = NULL; 539 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 540 PetscFunctionReturn(0); 541 } 542 543 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 544 { 545 PetscErrorCode ierr; 546 547 PetscFunctionBegin; 548 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 549 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 550 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 551 PetscFunctionReturn(0); 552 } 553 554 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 555 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 556 { 557 PetscErrorCode ierr; 558 Mat_SeqAIJ *a,*b; 559 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 560 MatScalarKokkosView aa,ba,ca; 561 MatRowMapKokkosView ai,bi,ci; 562 MatColIdxKokkosView aj,bj,cj; 563 PetscInt m,n,nnz,aN; 564 565 PetscFunctionBegin; 566 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 567 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 568 PetscValidPointer(C,4); 569 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 570 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 571 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 572 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 573 574 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 575 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 576 a = static_cast<Mat_SeqAIJ*>(A->data); 577 b = static_cast<Mat_SeqAIJ*>(B->data); 578 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 579 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 580 aa = akok->a_dual.view_device(); 581 ai = akok->i_dual.view_device(); 582 ba = bkok->a_dual.view_device(); 583 bi = bkok->i_dual.view_device(); 584 m = A->rmap->n; /* M, N and nnz of C */ 585 n = A->cmap->n + B->cmap->n; 586 nnz = a->nz + b->nz; 587 aN = A->cmap->n; /* N of A */ 588 if (reuse == MAT_INITIAL_MATRIX) { 589 aj = akok->j_dual.view_device(); 590 bj = bkok->j_dual.view_device(); 591 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 592 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 593 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 594 ca = ca_dual.view_device(); 595 ci = ci_dual.view_device(); 596 cj = cj_dual.view_device(); 597 598 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 599 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 600 PetscInt i = t.league_rank(); /* row i */ 601 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 602 603 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 604 ci(i) = coffset; 605 if (i == m-1) ci(m) = ai(m) + bi(m); 606 }); 607 608 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 609 if (k < alen) { 610 ca(coffset+k) = aa(ai(i)+k); 611 cj(coffset+k) = aj(ai(i)+k); 612 } else { 613 ca(coffset+k) = ba(bi(i)+k-alen); 614 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 615 } 616 }); 617 }); 618 ca_dual.modify_device(); 619 ci_dual.modify_device(); 620 cj_dual.modify_device(); 621 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 622 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 623 } else if (reuse == MAT_REUSE_MATRIX) { 624 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 625 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 626 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 627 ca = ckok->a_dual.view_device(); 628 ci = ckok->i_dual.view_device(); 629 630 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 631 PetscInt i = t.league_rank(); /* row i */ 632 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 633 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 634 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 635 else ca(ci(i)+k) = ba(bi(i)+k-alen); 636 }); 637 }); 638 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 639 } 640 PetscFunctionReturn(0); 641 } 642 643 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 644 { 645 PetscFunctionBegin; 646 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 647 PetscFunctionReturn(0); 648 } 649 650 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 651 { 652 PetscErrorCode ierr; 653 Mat_Product *product = C->product; 654 Mat A,B; 655 bool transA,transB; /* use bool, since KK needs this type */ 656 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 657 Mat_SeqAIJ *c; 658 MatProductData_SeqAIJKokkos *pdata; 659 KokkosCsrMatrix *csrmatA,*csrmatB; 660 661 PetscFunctionBegin; 662 MatCheckProduct(C,1); 663 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 664 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 665 666 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 667 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 668 PetscFunctionReturn(0); 669 } 670 671 switch (product->type) { 672 case MATPRODUCT_AB: transA = false; transB = false; break; 673 case MATPRODUCT_AtB: transA = true; transB = false; break; 674 case MATPRODUCT_ABt: transA = false; transB = true; break; 675 default: 676 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 677 } 678 679 A = product->A; 680 B = product->B; 681 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 682 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 683 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 684 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 685 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 686 687 if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 688 689 csrmatA = &akok->csrmat; 690 csrmatB = &bkok->csrmat; 691 692 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 693 if (transA) { 694 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 695 transA = false; 696 } 697 698 if (transB) { 699 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 700 transB = false; 701 } 702 703 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 704 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 705 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 706 /* shorter version of MatAssemblyEnd_SeqAIJ */ 707 c = (Mat_SeqAIJ*)C->data; 708 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 709 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 710 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 711 c->reallocs = 0; 712 C->info.mallocs = 0; 713 C->info.nz_unneeded = 0; 714 C->assembled = C->was_assembled = PETSC_TRUE; 715 C->num_ass++; 716 PetscFunctionReturn(0); 717 } 718 719 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 720 { 721 PetscErrorCode ierr; 722 Mat_Product *product = C->product; 723 MatProductType ptype; 724 Mat A,B; 725 bool transA,transB; 726 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 727 MatProductData_SeqAIJKokkos *pdata; 728 MPI_Comm comm; 729 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 730 731 PetscFunctionBegin; 732 MatCheckProduct(C,1); 733 ierr = PetscObjectGetComm((PetscObject)C,&comm); 734 if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 735 A = product->A; 736 B = product->B; 737 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 738 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 739 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 740 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 741 csrmatA = &akok->csrmat; 742 csrmatB = &bkok->csrmat; 743 744 ptype = product->type; 745 switch (ptype) { 746 case MATPRODUCT_AB: transA = false; transB = false; break; 747 case MATPRODUCT_AtB: transA = true; transB = false; break; 748 case MATPRODUCT_ABt: transA = false; transB = true; break; 749 default: 750 SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 751 } 752 753 product->data = pdata = new MatProductData_SeqAIJKokkos(); 754 pdata->kh.set_team_work_size(16); 755 pdata->kh.set_dynamic_scheduling(true); 756 pdata->reusesym = product->api_user; 757 758 /* TODO: add command line options to select spgemm algorithms */ 759 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 760 #if defined(PETSC_HAVE_CUDA) 761 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 762 /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */ 763 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE; 764 #endif 765 #endif 766 pdata->kh.create_spgemm_handle(spgemm_alg); 767 768 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 769 if (transA) { 770 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 771 transA = false; 772 } 773 774 if (transB) { 775 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 776 transB = false; 777 } 778 779 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 780 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 781 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 782 calling new Mat_SeqAIJKokkos(). 783 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 784 */ 785 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 786 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 787 788 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 789 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 790 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 791 PetscFunctionReturn(0); 792 } 793 794 /* handles sparse matrix matrix ops */ 795 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 796 { 797 PetscErrorCode ierr; 798 Mat_Product *product = mat->product; 799 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 800 801 PetscFunctionBegin; 802 MatCheckProduct(mat,1); 803 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 804 if (product->type == MATPRODUCT_ABC) { 805 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 806 } 807 if (Biskok && Ciskok) { 808 switch (product->type) { 809 case MATPRODUCT_AB: 810 case MATPRODUCT_AtB: 811 case MATPRODUCT_ABt: 812 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 813 break; 814 case MATPRODUCT_PtAP: 815 case MATPRODUCT_RARt: 816 case MATPRODUCT_ABC: 817 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 818 break; 819 default: 820 break; 821 } 822 } else { /* fallback for AIJ */ 823 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 824 } 825 PetscFunctionReturn(0); 826 } 827 828 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 829 { 830 PetscErrorCode ierr; 831 Mat_SeqAIJKokkos *aijkok; 832 833 PetscFunctionBegin; 834 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 835 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 836 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 837 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 838 ierr = WaitForKokkos();CHKERRQ(ierr); 839 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 840 PetscFunctionReturn(0); 841 } 842 843 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 844 { 845 PetscErrorCode ierr; 846 Mat_SeqAIJKokkos *aijkok; 847 848 PetscFunctionBegin; 849 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 850 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 851 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 852 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */ 853 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 854 PetscFunctionReturn(0); 855 } 856 857 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 858 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 859 { 860 PetscErrorCode ierr; 861 Mat_SeqAIJKokkos *aijkok; 862 863 PetscFunctionBegin; 864 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 865 PetscValidPointer(kv,2); 866 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 867 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 868 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 869 *kv = aijkok->a_dual.view_device(); 870 PetscFunctionReturn(0); 871 } 872 873 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 874 { 875 PetscFunctionBegin; 876 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 877 PetscValidPointer(kv,2); 878 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 879 PetscFunctionReturn(0); 880 } 881 882 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 883 { 884 PetscErrorCode ierr; 885 Mat_SeqAIJKokkos *aijkok; 886 887 PetscFunctionBegin; 888 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 889 PetscValidPointer(kv,2); 890 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 891 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 892 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 893 *kv = aijkok->a_dual.view_device(); 894 PetscFunctionReturn(0); 895 } 896 897 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 898 { 899 PetscErrorCode ierr; 900 901 PetscFunctionBegin; 902 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 903 PetscValidPointer(kv,2); 904 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 905 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 906 PetscFunctionReturn(0); 907 } 908 909 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 910 { 911 Mat_SeqAIJKokkos *aijkok; 912 913 PetscFunctionBegin; 914 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 915 PetscValidPointer(kv,2); 916 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 917 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 918 *kv = aijkok->a_dual.view_device(); 919 PetscFunctionReturn(0); 920 } 921 922 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 923 { 924 PetscErrorCode ierr; 925 926 PetscFunctionBegin; 927 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 928 PetscValidPointer(kv,2); 929 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 930 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 931 PetscFunctionReturn(0); 932 } 933 934 /* Computes Y += alpha X */ 935 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 936 { 937 PetscErrorCode ierr; 938 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 939 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 940 ConstMatScalarKokkosView Xa; 941 MatScalarKokkosView Ya; 942 943 PetscFunctionBegin; 944 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 945 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 946 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 947 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 948 949 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 950 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 951 PetscBool e; 952 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 953 if (e) { 954 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 955 if (e) pattern = SAME_NONZERO_PATTERN; 956 } 957 } 958 959 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 960 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 961 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 962 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 963 */ 964 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 965 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 966 Xa = xkok->a_dual.view_device(); 967 Ya = ykok->a_dual.view_device(); 968 969 if (pattern == SAME_NONZERO_PATTERN) { 970 KokkosBlas::axpy(alpha,Xa,Ya); 971 ierr = MatSeqAIJKokkosModifyDevice(Y); 972 } else if (pattern == SUBSET_NONZERO_PATTERN) { 973 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 974 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 975 976 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 977 PetscInt i = t.league_rank(); /* row i */ 978 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 979 PetscInt p,q = Yi(i); 980 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 981 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 982 if (Xj(p) == Yj(q)) { /* Found it */ 983 Ya(q) += alpha * Xa(p); 984 q++; 985 } else { 986 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 987 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 988 */ 989 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 990 } 991 } 992 }); 993 }); 994 ierr = MatSeqAIJKokkosModifyDevice(Y); 995 } else { /* different nonzero patterns */ 996 Mat Z; 997 KokkosCsrMatrix zcsr; 998 KernelHandle kh; 999 kh.create_spadd_handle(false); 1000 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1001 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1002 zkok = new Mat_SeqAIJKokkos(zcsr); 1003 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1004 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1005 kh.destroy_spadd_handle(); 1006 } 1007 1008 PetscFunctionReturn(0); 1009 } 1010 1011 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1012 { 1013 PetscErrorCode ierr; 1014 1015 PetscFunctionBegin; 1016 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1017 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1018 B->offloadmask = PETSC_OFFLOAD_CPU; 1019 PetscFunctionReturn(0); 1020 } 1021 1022 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1023 { 1024 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1025 1026 PetscFunctionBegin; 1027 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1028 A->boundtocpu = PETSC_FALSE; 1029 1030 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1031 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1032 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1033 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1034 A->ops->scale = MatScale_SeqAIJKokkos; 1035 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1036 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1037 A->ops->mult = MatMult_SeqAIJKokkos; 1038 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1039 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1040 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1041 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1042 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1043 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1044 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1045 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1046 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1047 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1048 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1049 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1050 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1051 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1052 PetscFunctionReturn(0); 1053 } 1054 1055 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1056 { 1057 PetscErrorCode ierr; 1058 Mat_SeqAIJ *aseq; 1059 PetscInt i,m,n; 1060 1061 PetscFunctionBegin; 1062 if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1063 1064 m = akok->nrows(); 1065 n = akok->ncols(); 1066 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1067 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1068 1069 /* Set up data structures of A as a MATSEQAIJ */ 1070 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1071 aseq = (Mat_SeqAIJ*)(A)->data; 1072 1073 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1074 akok->j_dual.sync_host(); 1075 1076 aseq->i = akok->i_host_data(); 1077 aseq->j = akok->j_host_data(); 1078 aseq->a = akok->a_host_data(); 1079 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1080 aseq->singlemalloc = PETSC_FALSE; 1081 aseq->free_a = PETSC_FALSE; 1082 aseq->free_ij = PETSC_FALSE; 1083 aseq->nz = akok->nnz(); 1084 aseq->maxnz = aseq->nz; 1085 1086 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1087 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1088 for (i=0; i<m; i++) { 1089 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1090 } 1091 1092 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1093 akok->nonzerostate = A->nonzerostate; 1094 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1095 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1096 A->spptr = akok; 1097 PetscFunctionReturn(0); 1098 } 1099 1100 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1101 1102 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1103 */ 1104 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1105 { 1106 PetscErrorCode ierr; 1107 1108 PetscFunctionBegin; 1109 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1110 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1111 PetscFunctionReturn(0); 1112 } 1113 1114 /* --------------------------------------------------------------------------------*/ 1115 /*@C 1116 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1117 (the default parallel PETSc format). This matrix will ultimately be handled by 1118 Kokkos for calculations. For good matrix 1119 assembly performance the user should preallocate the matrix storage by setting 1120 the parameter nz (or the array nnz). By setting these parameters accurately, 1121 performance during matrix assembly can be increased by more than a factor of 50. 1122 1123 Collective 1124 1125 Input Parameters: 1126 + comm - MPI communicator, set to PETSC_COMM_SELF 1127 . m - number of rows 1128 . n - number of columns 1129 . nz - number of nonzeros per row (same for all rows) 1130 - nnz - array containing the number of nonzeros in the various rows 1131 (possibly different for each row) or NULL 1132 1133 Output Parameter: 1134 . A - the matrix 1135 1136 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1137 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1138 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1139 1140 Notes: 1141 If nnz is given then nz is ignored 1142 1143 The AIJ format (also called the Yale sparse matrix format or 1144 compressed row storage), is fully compatible with standard Fortran 77 1145 storage. That is, the stored row and column indices can begin at 1146 either one (as in Fortran) or zero. See the users' manual for details. 1147 1148 Specify the preallocated storage with either nz or nnz (not both). 1149 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1150 allocation. For large problems you MUST preallocate memory or you 1151 will get TERRIBLE performance, see the users' manual chapter on matrices. 1152 1153 By default, this format uses inodes (identical nodes) when possible, to 1154 improve numerical efficiency of matrix-vector products and solves. We 1155 search for consecutive rows with the same nonzero structure, thereby 1156 reusing matrix information to achieve increased efficiency. 1157 1158 Level: intermediate 1159 1160 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1161 @*/ 1162 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1163 { 1164 PetscErrorCode ierr; 1165 1166 PetscFunctionBegin; 1167 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1168 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1169 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1170 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1171 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1172 PetscFunctionReturn(0); 1173 } 1174 1175 typedef Kokkos::TeamPolicy<>::member_type team_member; 1176 // 1177 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 1178 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1179 // 1180 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1181 { 1182 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1183 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1184 IS isrow = b->row,isicol = b->icol; 1185 PetscErrorCode ierr; 1186 const PetscInt *r_h,*ic_h; 1187 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1188 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1189 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1190 PetscBool row_identity,col_identity; 1191 PetscInt nc, Nf, nVec=32; // should be a parameter 1192 PetscContainer container; 1193 1194 PetscFunctionBegin; 1195 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 1196 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1197 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1198 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 1199 if (container) { 1200 PetscInt *pNf=NULL, nv; 1201 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 1202 Nf = (*pNf)%1000; 1203 nv = (*pNf)/1000; 1204 if (nv>0) nVec = nv; 1205 } else Nf = 1; 1206 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 1207 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1208 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1209 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1210 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1211 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1212 #endif 1213 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1214 { 1215 #define KOKKOS_SHARED_LEVEL 1 1216 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1217 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1218 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1219 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1220 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1221 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1222 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1223 size_t flops_h = 0.0; 1224 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1225 Kokkos::View<size_t> d_flops_k ("flops"); 1226 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1227 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1228 Kokkos::deep_copy (d_flops_k, h_flops_k); 1229 Kokkos::deep_copy (d_r_k, h_r_k); 1230 Kokkos::deep_copy (d_ic_k, h_ic_k); 1231 // Fill A --> fact 1232 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1233 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1234 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1235 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1236 // zero rows of B 1237 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1238 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1239 PetscScalar *baL = ba_d + bi_d[rowb]; 1240 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1241 /* zero (unfactored row) */ 1242 for (int j=0;j<nzbL;j++) baL[j] = 0; 1243 for (int j=0;j<nzbU;j++) baU[j] = 0; 1244 }); 1245 // copy A into B 1246 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1247 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1248 const PetscScalar *av = aa_d + ai_d[rowa]; 1249 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1250 /* load in initial (unfactored row) */ 1251 for (int j=0;j<nza;j++) { 1252 PetscInt colb = ic[ajtmp[j]]; 1253 PetscScalar vala = av[j]; 1254 if (colb == rowb) { 1255 *(ba_d + bdiag_d[rowb]) = vala; 1256 } else { 1257 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1258 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1259 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1260 for (int j=0; j<nz ; j++) { 1261 if (pbj[j] == colb) { 1262 pba[j] = vala; 1263 set++; 1264 break; 1265 } 1266 } 1267 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1268 } 1269 } 1270 }); 1271 }); 1272 Kokkos::fence(); 1273 1274 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1275 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1276 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1277 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1278 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1279 const PetscInt start = field*nloc, end = start + nloc; 1280 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1281 // A22 panel update for each row A(1,:) and col A(:,1) 1282 for (int ii=start; ii<end-1; ii++) { 1283 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1284 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1285 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1286 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1287 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1288 PetscInt kIdx = j*Ni + field_block_idx; 1289 if (kIdx >= nzUi) /* void */ ; 1290 else { 1291 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1292 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1293 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1294 size_t st_idx; 1295 // find and do L(k,i) = A(:k,i) / A(i,i) 1296 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1297 // get column, there has got to be a better way 1298 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1299 //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 1300 if (pjL[j] == ii) { 1301 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1302 idx = j; // output 1303 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1304 } 1305 }, st_idx); 1306 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1307 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 1308 else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1309 // U(i+1,:end) 1310 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1311 PetscScalar Uij = baUi[uiIdx]; 1312 PetscInt col = bjUi[uiIdx]; 1313 if (col==myk) { 1314 // A_kk = A_kk - L_ki * U_ij(k) 1315 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1316 *Akkv = *Akkv - L_ki() * Uij; // UiK 1317 } else { 1318 PetscScalar *start, *end, *pAkjv=NULL; 1319 PetscInt high, low; 1320 const PetscInt *startj; 1321 if (col<myk) { // L 1322 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1323 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1324 start = pLki+1; // start at pLki+1, A22(myk,1) 1325 startj= bj_d + bi_d[myk] + idx; 1326 end = ba_d + bi_d[myk+1]; 1327 } else { 1328 PetscInt idx = bdiag_d[myk+1]+1; 1329 start = ba_d + idx; 1330 startj= bj_d + idx; 1331 end = ba_d + bdiag_d[myk]; 1332 } 1333 // search for 'col', use bisection search - TODO 1334 low = 0; 1335 high = (PetscInt)(end-start); 1336 while (high-low > 5) { 1337 int t = (low+high)/2; 1338 if (startj[t] > col) high = t; 1339 else low = t; 1340 } 1341 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1342 if (startj[pAkjv-start] == col) break; 1343 } 1344 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 1345 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1346 } 1347 }); 1348 } 1349 } 1350 }); 1351 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1352 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1353 } /* endof for (i=0; i<n; i++) { */ 1354 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1355 }); 1356 Kokkos::fence(); 1357 Kokkos::deep_copy (h_flops_k, d_flops_k); 1358 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1359 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1360 #elif defined(PETSC_USE_LOG) 1361 ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1362 #endif 1363 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1364 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1365 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1366 /* Invert diagonal for simpler triangular solves */ 1367 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1368 int i = start + outer_index*Ni + lg_rank%Ni; 1369 if (i < end) { 1370 PetscScalar *pv = ba_d + bdiag_d[i]; 1371 *pv = 1.0/(*pv); 1372 } 1373 }); 1374 }); 1375 } 1376 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1377 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1378 #endif 1379 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1380 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1381 1382 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1383 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1384 if (b->inode.size) { 1385 B->ops->solve = MatSolve_SeqAIJ_Inode; 1386 } else if (row_identity && col_identity) { 1387 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1388 } else { 1389 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1390 } 1391 B->offloadmask = PETSC_OFFLOAD_GPU; 1392 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1393 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1394 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1395 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1396 B->ops->matsolve = MatMatSolve_SeqAIJ; 1397 B->assembled = PETSC_TRUE; 1398 B->preallocated = PETSC_TRUE; 1399 1400 PetscFunctionReturn(0); 1401 } 1402 1403 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1404 { 1405 PetscErrorCode ierr; 1406 1407 PetscFunctionBegin; 1408 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1409 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1410 PetscFunctionReturn(0); 1411 } 1412 1413 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1414 { 1415 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1416 1417 PetscFunctionBegin; 1418 if (!factors->sptrsv_symbolic_completed) { 1419 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1420 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1421 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1422 } 1423 PetscFunctionReturn(0); 1424 } 1425 1426 /* Check if we need to update factors etc for transpose solve */ 1427 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1428 { 1429 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1430 MatColIdxType n = A->rmap->n; 1431 1432 PetscFunctionBegin; 1433 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1434 /* Update L^T and do sptrsv symbolic */ 1435 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1436 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1437 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1438 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1439 1440 KokkosKernels::Impl::transpose_matrix< 1441 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1442 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1443 MatRowMapKokkosView,DefaultExecutionSpace>( 1444 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1445 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1446 1447 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1448 We have to sort the indices, until KK provides finer control options. 1449 */ 1450 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1451 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1452 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1453 1454 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1455 1456 /* Update U^T and do sptrsv symbolic */ 1457 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1458 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1459 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1460 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1461 1462 KokkosKernels::Impl::transpose_matrix< 1463 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1464 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1465 MatRowMapKokkosView,DefaultExecutionSpace>( 1466 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1467 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1468 1469 /* Sort indices. See comments above */ 1470 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1471 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1472 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1473 1474 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1475 factors->transpose_updated = PETSC_TRUE; 1476 } 1477 PetscFunctionReturn(0); 1478 } 1479 1480 /* Solve Ax = b, with A = LU */ 1481 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1482 { 1483 PetscErrorCode ierr; 1484 ConstPetscScalarKokkosView bv; 1485 PetscScalarKokkosView xv; 1486 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1487 1488 PetscFunctionBegin; 1489 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1490 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1491 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1492 /* Solve L tmpv = b */ 1493 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1494 /* Solve Ux = tmpv */ 1495 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1496 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1497 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1498 PetscFunctionReturn(0); 1499 } 1500 1501 /* Solve A^T x = b, where A^T = U^T L^T */ 1502 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1503 { 1504 PetscErrorCode ierr; 1505 ConstPetscScalarKokkosView bv; 1506 PetscScalarKokkosView xv; 1507 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1508 1509 PetscFunctionBegin; 1510 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1511 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1512 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1513 /* Solve U^T tmpv = b */ 1514 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1515 1516 /* Solve L^T x = tmpv */ 1517 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1518 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1519 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1520 PetscFunctionReturn(0); 1521 } 1522 1523 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1524 { 1525 PetscErrorCode ierr; 1526 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1527 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1528 PetscInt fill_lev = info->levels; 1529 1530 PetscFunctionBegin; 1531 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1532 1533 auto a_d = aijkok->a_dual.view_device(); 1534 auto i_d = aijkok->i_dual.view_device(); 1535 auto j_d = aijkok->j_dual.view_device(); 1536 1537 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1538 1539 B->assembled = PETSC_TRUE; 1540 B->preallocated = PETSC_TRUE; 1541 B->ops->solve = MatSolve_SeqAIJKokkos; 1542 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1543 B->ops->matsolve = NULL; 1544 B->ops->matsolvetranspose = NULL; 1545 B->offloadmask = PETSC_OFFLOAD_GPU; 1546 1547 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1548 factors->transpose_updated = PETSC_FALSE; 1549 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1550 PetscFunctionReturn(0); 1551 } 1552 1553 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1554 { 1555 PetscErrorCode ierr; 1556 Mat_SeqAIJKokkos *aijkok; 1557 Mat_SeqAIJ *b; 1558 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1559 PetscInt fill_lev = info->levels; 1560 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1561 PetscInt n = A->rmap->n; 1562 1563 PetscFunctionBegin; 1564 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1565 /* Rebuild factors */ 1566 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1567 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1568 1569 /* Create a spiluk handle and then do symbolic factorization */ 1570 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1571 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1572 1573 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1574 1575 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1576 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1577 Kokkos::realloc(factors->iU_d,n+1); 1578 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1579 1580 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1581 auto i_d = aijkok->i_dual.view_device(); 1582 auto j_d = aijkok->j_dual.view_device(); 1583 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1584 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1585 1586 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1587 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1588 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1589 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1590 1591 /* TODO: add options to select sptrsv algorithms */ 1592 /* Create sptrsv handles for L, U and their transpose */ 1593 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1594 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1595 #else 1596 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1597 #endif 1598 1599 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1600 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1601 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1602 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1603 1604 /* Fill fields of the factor matrix B */ 1605 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1606 b = (Mat_SeqAIJ*)B->data; 1607 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1608 B->info.fill_ratio_given = info->fill; 1609 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1610 1611 B->offloadmask = PETSC_OFFLOAD_GPU; 1612 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1613 PetscFunctionReturn(0); 1614 } 1615 1616 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1617 { 1618 PetscErrorCode ierr; 1619 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1620 const PetscInt nrows = A->rmap->n; 1621 1622 PetscFunctionBegin; 1623 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1624 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1625 // move B data into Kokkos 1626 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1627 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1628 { 1629 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1630 if (!baijkok->diag_d) { 1631 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1632 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1633 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1634 } 1635 } 1636 PetscFunctionReturn(0); 1637 } 1638 1639 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1640 { 1641 PetscFunctionBegin; 1642 *type = MATSOLVERKOKKOS; 1643 PetscFunctionReturn(0); 1644 } 1645 1646 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1647 { 1648 PetscFunctionBegin; 1649 *type = MATSOLVERKOKKOSDEVICE; 1650 PetscFunctionReturn(0); 1651 } 1652 1653 /*MC 1654 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1655 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1656 1657 Level: beginner 1658 1659 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1660 M*/ 1661 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1662 { 1663 PetscErrorCode ierr; 1664 PetscInt n = A->rmap->n; 1665 1666 PetscFunctionBegin; 1667 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1668 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1669 (*B)->factortype = ftype; 1670 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1671 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1672 1673 if (ftype == MAT_FACTOR_LU) { 1674 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1675 (*B)->canuseordering = PETSC_TRUE; 1676 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1677 } else if (ftype == MAT_FACTOR_ILU) { 1678 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1679 (*B)->canuseordering = PETSC_FALSE; 1680 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1681 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1682 1683 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1684 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1685 PetscFunctionReturn(0); 1686 } 1687 1688 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1689 { 1690 PetscErrorCode ierr; 1691 PetscInt n = A->rmap->n; 1692 1693 PetscFunctionBegin; 1694 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1695 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1696 (*B)->factortype = ftype; 1697 (*B)->canuseordering = PETSC_TRUE; 1698 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1699 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1700 1701 if (ftype == MAT_FACTOR_LU) { 1702 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1703 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1704 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1705 1706 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1707 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1708 PetscFunctionReturn(0); 1709 } 1710 1711 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1712 { 1713 PetscErrorCode ierr; 1714 1715 PetscFunctionBegin; 1716 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1717 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1718 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1719 PetscFunctionReturn(0); 1720 } 1721 1722 /* Utility to print out a KokkosCsrMatrix for debugging */ 1723 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1724 { 1725 PetscErrorCode ierr; 1726 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1727 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1728 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1729 const PetscInt *i = iv.data(); 1730 const PetscInt *j = jv.data(); 1731 const PetscScalar *a = av.data(); 1732 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1733 1734 PetscFunctionBegin; 1735 ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1736 for (PetscInt k=0; k<m; k++) { 1737 ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr); 1738 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1739 ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1740 } 1741 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1742 } 1743 PetscFunctionReturn(0); 1744 } 1745