1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 63 if (aijkok->a_dual.need_sync_device()) { 64 aijkok->a_dual.sync_device(); 65 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 66 aijkok->hermitian_updated = PETSC_FALSE; 67 } 68 PetscFunctionReturn(0); 69 } 70 71 /* Mark the CSR data on device as modified */ 72 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 73 { 74 PetscErrorCode ierr; 75 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 76 77 PetscFunctionBegin; 78 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 79 aijkok->a_dual.clear_sync_state(); 80 aijkok->a_dual.modify_device(); 81 aijkok->transpose_updated = PETSC_FALSE; 82 aijkok->hermitian_updated = PETSC_FALSE; 83 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 84 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 85 PetscFunctionReturn(0); 86 } 87 88 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 89 { 90 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 91 92 PetscFunctionBegin; 93 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 94 /* We do not expect one needs factors on host */ 95 PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 96 PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 97 aijkok->a_dual.sync_host(); 98 PetscFunctionReturn(0); 99 } 100 101 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 102 { 103 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 104 105 PetscFunctionBegin; 106 if (aijkok) { 107 aijkok->a_dual.sync_host(); 108 *array = aijkok->a_dual.view_host().data(); 109 } else { /* Happens when calling MatSetValues on a newly created matrix */ 110 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 111 } 112 PetscFunctionReturn(0); 113 } 114 115 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 116 { 117 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 118 119 PetscFunctionBegin; 120 if (aijkok) aijkok->a_dual.modify_host(); 121 PetscFunctionReturn(0); 122 } 123 124 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 127 128 PetscFunctionBegin; 129 if (aijkok) { 130 aijkok->a_dual.sync_host(); 131 *array = aijkok->a_dual.view_host().data(); 132 } else { 133 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 134 } 135 PetscFunctionReturn(0); 136 } 137 138 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 139 { 140 PetscFunctionBegin; 141 *array = NULL; 142 PetscFunctionReturn(0); 143 } 144 145 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 146 { 147 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 148 149 PetscFunctionBegin; 150 if (aijkok) { 151 *array = aijkok->a_dual.view_host().data(); 152 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 153 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 154 } 155 PetscFunctionReturn(0); 156 } 157 158 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 159 { 160 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 161 162 PetscFunctionBegin; 163 if (aijkok) { 164 aijkok->a_dual.clear_sync_state(); 165 aijkok->a_dual.modify_host(); 166 } 167 PetscFunctionReturn(0); 168 } 169 170 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 171 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 172 { 173 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 174 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 175 176 PetscFunctionBegin; 177 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 178 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 179 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 180 PetscFunctionReturn(0); 181 } 182 183 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 184 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 185 { 186 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 187 188 PetscFunctionBegin; 189 if (aijkok && aijkok->device_mat_d.data()) { 190 *d_mat = aijkok->device_mat_d.data(); 191 } else { 192 PetscErrorCode ierr; 193 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 194 *d_mat = NULL; 195 } 196 PetscFunctionReturn(0); 197 } 198 199 /* Generate the transpose on device and cache it internally */ 200 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 201 { 202 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 203 204 PetscFunctionBegin; 205 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 206 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 207 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 208 CHKERRCXX(aijkok->a_dual.sync_device()); 209 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 210 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 211 aijkok->transpose_updated = PETSC_TRUE; 212 } 213 *csrmatT = &aijkok->csrmatT; 214 PetscFunctionReturn(0); 215 } 216 217 /* Generate the Hermitian on device and cache it internally */ 218 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 219 { 220 PetscErrorCode ierr; 221 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 222 223 PetscFunctionBegin; 224 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 225 PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 226 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 227 CHKERRCXX(aijkok->a_dual.sync_device()); 228 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 229 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 230 #if defined(PETSC_USE_COMPLEX) 231 const auto& a = aijkok->csrmatH.values; 232 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 233 #endif 234 aijkok->hermitian_updated = PETSC_TRUE; 235 } 236 *csrmatH = &aijkok->csrmatH; 237 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 238 PetscFunctionReturn(0); 239 } 240 241 /* y = A x */ 242 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 243 { 244 PetscErrorCode ierr; 245 Mat_SeqAIJKokkos *aijkok; 246 ConstPetscScalarKokkosView xv; 247 PetscScalarKokkosView yv; 248 249 PetscFunctionBegin; 250 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 251 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 252 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 253 ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 254 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 255 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 256 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 257 ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 258 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 259 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 260 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 261 PetscFunctionReturn(0); 262 } 263 264 /* y = A^T x */ 265 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 266 { 267 PetscErrorCode ierr; 268 Mat_SeqAIJKokkos *aijkok; 269 const char *mode; 270 ConstPetscScalarKokkosView xv; 271 PetscScalarKokkosView yv; 272 KokkosCsrMatrix *csrmat; 273 274 PetscFunctionBegin; 275 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 276 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 277 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 278 ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 279 if (A->form_explicit_transpose) { 280 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 281 mode = "N"; 282 } else { 283 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 284 csrmat = &aijkok->csrmat; 285 mode = "T"; 286 } 287 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 288 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 289 ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 290 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 291 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 292 PetscFunctionReturn(0); 293 } 294 295 /* y = A^H x */ 296 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 297 { 298 PetscErrorCode ierr; 299 Mat_SeqAIJKokkos *aijkok; 300 const char *mode; 301 ConstPetscScalarKokkosView xv; 302 PetscScalarKokkosView yv; 303 KokkosCsrMatrix *csrmat; 304 305 PetscFunctionBegin; 306 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 307 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 308 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 309 ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 310 if (A->form_explicit_transpose) { 311 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 312 mode = "N"; 313 } else { 314 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 315 csrmat = &aijkok->csrmat; 316 mode = "C"; 317 } 318 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 319 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 320 ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 321 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 322 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 323 PetscFunctionReturn(0); 324 } 325 326 /* z = A x + y */ 327 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 328 { 329 PetscErrorCode ierr; 330 Mat_SeqAIJKokkos *aijkok; 331 ConstPetscScalarKokkosView xv,yv; 332 PetscScalarKokkosView zv; 333 334 PetscFunctionBegin; 335 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 336 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 337 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 338 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 339 ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 340 if (zz != yy) Kokkos::deep_copy(zv,yv); 341 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 342 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 343 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 344 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 346 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 347 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 348 PetscFunctionReturn(0); 349 } 350 351 /* z = A^T x + y */ 352 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 353 { 354 PetscErrorCode ierr; 355 Mat_SeqAIJKokkos *aijkok; 356 const char *mode; 357 ConstPetscScalarKokkosView xv,yv; 358 PetscScalarKokkosView zv; 359 KokkosCsrMatrix *csrmat; 360 361 PetscFunctionBegin; 362 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 363 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 364 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 365 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 366 ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 367 if (zz != yy) Kokkos::deep_copy(zv,yv); 368 if (A->form_explicit_transpose) { 369 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 370 mode = "N"; 371 } else { 372 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 373 csrmat = &aijkok->csrmat; 374 mode = "T"; 375 } 376 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 377 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 378 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 379 ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 380 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 381 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 382 PetscFunctionReturn(0); 383 } 384 385 /* z = A^H x + y */ 386 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 387 { 388 PetscErrorCode ierr; 389 Mat_SeqAIJKokkos *aijkok; 390 const char *mode; 391 ConstPetscScalarKokkosView xv,yv; 392 PetscScalarKokkosView zv; 393 KokkosCsrMatrix *csrmat; 394 395 PetscFunctionBegin; 396 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 397 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 398 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 399 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 400 ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 401 if (zz != yy) Kokkos::deep_copy(zv,yv); 402 if (A->form_explicit_transpose) { 403 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 404 mode = "N"; 405 } else { 406 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 407 csrmat = &aijkok->csrmat; 408 mode = "C"; 409 } 410 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 411 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 412 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 413 ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 414 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 415 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 416 PetscFunctionReturn(0); 417 } 418 419 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 420 { 421 PetscErrorCode ierr; 422 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 423 424 PetscFunctionBegin; 425 switch (op) { 426 case MAT_FORM_EXPLICIT_TRANSPOSE: 427 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 428 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 429 A->form_explicit_transpose = flg; 430 break; 431 default: 432 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 433 break; 434 } 435 PetscFunctionReturn(0); 436 } 437 438 /* Depending on reuse, either build a new mat, or use the existing mat */ 439 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 440 { 441 PetscErrorCode ierr; 442 Mat_SeqAIJ *aseq; 443 444 PetscFunctionBegin; 445 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 446 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 447 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 448 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 449 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 450 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 451 PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 452 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 453 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 454 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 455 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 456 aseq = static_cast<Mat_SeqAIJ*>(A->data); 457 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 458 PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 459 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 460 } 461 } 462 PetscFunctionReturn(0); 463 } 464 465 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 466 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 467 */ 468 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 469 { 470 PetscErrorCode ierr; 471 Mat_SeqAIJ *bseq; 472 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 473 Mat mat; 474 475 PetscFunctionBegin; 476 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 477 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 478 mat = *B; 479 if (A->assembled) { 480 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 481 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 482 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 483 /* Now copy values to B if needed */ 484 if (dupOption == MAT_COPY_VALUES) { 485 if (akok->a_dual.need_sync_device()) { 486 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 487 bkok->a_dual.modify_host(); 488 } else { /* If device has the latest data, we only copy data on device */ 489 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 490 bkok->a_dual.modify_device(); 491 } 492 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 493 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 494 bkok->a_dual.modify_host(); 495 } 496 mat->spptr = bkok; 497 } 498 499 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 500 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 501 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 502 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 503 PetscFunctionReturn(0); 504 } 505 506 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 507 { 508 PetscErrorCode ierr; 509 Mat At; 510 KokkosCsrMatrix *internT; 511 Mat_SeqAIJKokkos *atkok,*bkok; 512 513 PetscFunctionBegin; 514 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 515 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 516 /* Deep copy internT, as we want to isolate the internal transpose */ 517 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 518 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 519 if (reuse == MAT_INITIAL_MATRIX) *B = At; 520 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 521 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 522 if ((*B)->assembled) { 523 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 524 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 525 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 526 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 527 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 528 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 529 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 530 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 531 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 532 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 533 } 534 PetscFunctionReturn(0); 535 } 536 537 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 538 { 539 PetscErrorCode ierr; 540 Mat_SeqAIJKokkos *aijkok; 541 542 PetscFunctionBegin; 543 if (A->factortype == MAT_FACTOR_NONE) { 544 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 545 delete aijkok; 546 } else { 547 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 548 } 549 A->spptr = NULL; 550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 552 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 553 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 554 PetscFunctionReturn(0); 555 } 556 557 /*MC 558 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 559 560 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 561 562 Options Database Keys: 563 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 564 565 Level: beginner 566 567 .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS 568 M*/ 569 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 570 { 571 PetscErrorCode ierr; 572 573 PetscFunctionBegin; 574 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 575 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 576 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 577 PetscFunctionReturn(0); 578 } 579 580 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 581 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 582 { 583 PetscErrorCode ierr; 584 Mat_SeqAIJ *a,*b; 585 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 586 MatScalarKokkosView aa,ba,ca; 587 MatRowMapKokkosView ai,bi,ci; 588 MatColIdxKokkosView aj,bj,cj; 589 PetscInt m,n,nnz,aN; 590 591 PetscFunctionBegin; 592 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 593 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 594 PetscValidPointer(C,4); 595 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 596 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 597 PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 598 PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 599 600 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 601 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 602 a = static_cast<Mat_SeqAIJ*>(A->data); 603 b = static_cast<Mat_SeqAIJ*>(B->data); 604 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 605 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 606 aa = akok->a_dual.view_device(); 607 ai = akok->i_dual.view_device(); 608 ba = bkok->a_dual.view_device(); 609 bi = bkok->i_dual.view_device(); 610 m = A->rmap->n; /* M, N and nnz of C */ 611 n = A->cmap->n + B->cmap->n; 612 nnz = a->nz + b->nz; 613 aN = A->cmap->n; /* N of A */ 614 if (reuse == MAT_INITIAL_MATRIX) { 615 aj = akok->j_dual.view_device(); 616 bj = bkok->j_dual.view_device(); 617 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 618 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 619 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 620 ca = ca_dual.view_device(); 621 ci = ci_dual.view_device(); 622 cj = cj_dual.view_device(); 623 624 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 625 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 626 PetscInt i = t.league_rank(); /* row i */ 627 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 628 629 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 630 ci(i) = coffset; 631 if (i == m-1) ci(m) = ai(m) + bi(m); 632 }); 633 634 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 635 if (k < alen) { 636 ca(coffset+k) = aa(ai(i)+k); 637 cj(coffset+k) = aj(ai(i)+k); 638 } else { 639 ca(coffset+k) = ba(bi(i)+k-alen); 640 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 641 } 642 }); 643 }); 644 ca_dual.modify_device(); 645 ci_dual.modify_device(); 646 cj_dual.modify_device(); 647 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 648 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 649 } else if (reuse == MAT_REUSE_MATRIX) { 650 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 651 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 652 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 653 ca = ckok->a_dual.view_device(); 654 ci = ckok->i_dual.view_device(); 655 656 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 657 PetscInt i = t.league_rank(); /* row i */ 658 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 659 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 660 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 661 else ca(ci(i)+k) = ba(bi(i)+k-alen); 662 }); 663 }); 664 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 665 } 666 PetscFunctionReturn(0); 667 } 668 669 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 670 { 671 PetscFunctionBegin; 672 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 673 PetscFunctionReturn(0); 674 } 675 676 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 677 { 678 PetscErrorCode ierr; 679 Mat_Product *product = C->product; 680 Mat A,B; 681 bool transA,transB; /* use bool, since KK needs this type */ 682 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 683 Mat_SeqAIJ *c; 684 MatProductData_SeqAIJKokkos *pdata; 685 KokkosCsrMatrix *csrmatA,*csrmatB; 686 687 PetscFunctionBegin; 688 MatCheckProduct(C,1); 689 PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 690 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 691 692 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 693 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 694 PetscFunctionReturn(0); 695 } 696 697 switch (product->type) { 698 case MATPRODUCT_AB: transA = false; transB = false; break; 699 case MATPRODUCT_AtB: transA = true; transB = false; break; 700 case MATPRODUCT_ABt: transA = false; transB = true; break; 701 default: 702 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 703 } 704 705 A = product->A; 706 B = product->B; 707 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 708 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 709 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 710 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 711 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 712 713 PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 714 715 csrmatA = &akok->csrmat; 716 csrmatB = &bkok->csrmat; 717 718 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 719 if (transA) { 720 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 721 transA = false; 722 } 723 724 if (transB) { 725 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 726 transB = false; 727 } 728 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 729 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 730 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 731 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 732 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 733 /* shorter version of MatAssemblyEnd_SeqAIJ */ 734 c = (Mat_SeqAIJ*)C->data; 735 ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 736 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 737 ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 738 c->reallocs = 0; 739 C->info.mallocs = 0; 740 C->info.nz_unneeded = 0; 741 C->assembled = C->was_assembled = PETSC_TRUE; 742 C->num_ass++; 743 PetscFunctionReturn(0); 744 } 745 746 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 747 { 748 PetscErrorCode ierr; 749 Mat_Product *product = C->product; 750 MatProductType ptype; 751 Mat A,B; 752 bool transA,transB; 753 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 754 MatProductData_SeqAIJKokkos *pdata; 755 MPI_Comm comm; 756 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 757 758 PetscFunctionBegin; 759 MatCheckProduct(C,1); 760 ierr = PetscObjectGetComm((PetscObject)C,&comm); 761 PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 762 A = product->A; 763 B = product->B; 764 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 765 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 766 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 767 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 768 csrmatA = &akok->csrmat; 769 csrmatB = &bkok->csrmat; 770 771 ptype = product->type; 772 switch (ptype) { 773 case MATPRODUCT_AB: transA = false; transB = false; break; 774 case MATPRODUCT_AtB: transA = true; transB = false; break; 775 case MATPRODUCT_ABt: transA = false; transB = true; break; 776 default: 777 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 778 } 779 780 product->data = pdata = new MatProductData_SeqAIJKokkos(); 781 pdata->kh.set_team_work_size(16); 782 pdata->kh.set_dynamic_scheduling(true); 783 pdata->reusesym = product->api_user; 784 785 /* TODO: add command line options to select spgemm algorithms */ 786 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 787 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 788 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 789 */ 790 pdata->kh.create_spgemm_handle(spgemm_alg); 791 792 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 793 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 794 if (transA) { 795 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 796 transA = false; 797 } 798 799 if (transB) { 800 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 801 transB = false; 802 } 803 804 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 805 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 806 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 807 calling new Mat_SeqAIJKokkos(). 808 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 809 */ 810 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 811 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 812 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 813 814 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 815 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 816 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 817 PetscFunctionReturn(0); 818 } 819 820 /* handles sparse matrix matrix ops */ 821 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 822 { 823 PetscErrorCode ierr; 824 Mat_Product *product = mat->product; 825 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 826 827 PetscFunctionBegin; 828 MatCheckProduct(mat,1); 829 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 830 if (product->type == MATPRODUCT_ABC) { 831 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 832 } 833 if (Biskok && Ciskok) { 834 switch (product->type) { 835 case MATPRODUCT_AB: 836 case MATPRODUCT_AtB: 837 case MATPRODUCT_ABt: 838 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 839 break; 840 case MATPRODUCT_PtAP: 841 case MATPRODUCT_RARt: 842 case MATPRODUCT_ABC: 843 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 844 break; 845 default: 846 break; 847 } 848 } else { /* fallback for AIJ */ 849 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 850 } 851 PetscFunctionReturn(0); 852 } 853 854 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 855 { 856 PetscErrorCode ierr; 857 Mat_SeqAIJKokkos *aijkok; 858 859 PetscFunctionBegin; 860 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 861 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 862 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 863 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 864 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 865 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 866 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 867 PetscFunctionReturn(0); 868 } 869 870 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 871 { 872 PetscErrorCode ierr; 873 Mat_SeqAIJKokkos *aijkok; 874 875 PetscFunctionBegin; 876 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 877 if (aijkok) { /* Only zero the device if data is already there */ 878 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 879 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 880 } else { /* Might be preallocated but not assembled */ 881 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 882 } 883 PetscFunctionReturn(0); 884 } 885 886 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 887 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 888 { 889 PetscErrorCode ierr; 890 Mat_SeqAIJKokkos *aijkok; 891 892 PetscFunctionBegin; 893 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 894 PetscValidPointer(kv,2); 895 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 896 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 897 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 898 *kv = aijkok->a_dual.view_device(); 899 PetscFunctionReturn(0); 900 } 901 902 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 903 { 904 PetscFunctionBegin; 905 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 906 PetscValidPointer(kv,2); 907 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 908 PetscFunctionReturn(0); 909 } 910 911 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 912 { 913 PetscErrorCode ierr; 914 Mat_SeqAIJKokkos *aijkok; 915 916 PetscFunctionBegin; 917 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 918 PetscValidPointer(kv,2); 919 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 920 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 921 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 922 *kv = aijkok->a_dual.view_device(); 923 PetscFunctionReturn(0); 924 } 925 926 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 927 { 928 PetscErrorCode ierr; 929 930 PetscFunctionBegin; 931 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 932 PetscValidPointer(kv,2); 933 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 934 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 935 PetscFunctionReturn(0); 936 } 937 938 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 939 { 940 Mat_SeqAIJKokkos *aijkok; 941 942 PetscFunctionBegin; 943 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 944 PetscValidPointer(kv,2); 945 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 946 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 947 *kv = aijkok->a_dual.view_device(); 948 PetscFunctionReturn(0); 949 } 950 951 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 952 { 953 PetscErrorCode ierr; 954 955 PetscFunctionBegin; 956 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 957 PetscValidPointer(kv,2); 958 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 959 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 960 PetscFunctionReturn(0); 961 } 962 963 /* Computes Y += alpha X */ 964 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 965 { 966 PetscErrorCode ierr; 967 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 968 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 969 ConstMatScalarKokkosView Xa; 970 MatScalarKokkosView Ya; 971 972 PetscFunctionBegin; 973 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 974 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 975 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 976 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 977 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 978 979 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 980 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 981 PetscBool e; 982 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 983 if (e) { 984 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 985 if (e) pattern = SAME_NONZERO_PATTERN; 986 } 987 } 988 989 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 990 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 991 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 992 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 993 */ 994 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 995 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 996 Xa = xkok->a_dual.view_device(); 997 Ya = ykok->a_dual.view_device(); 998 999 if (pattern == SAME_NONZERO_PATTERN) { 1000 KokkosBlas::axpy(alpha,Xa,Ya); 1001 ierr = MatSeqAIJKokkosModifyDevice(Y); 1002 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1003 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 1004 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 1005 1006 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1007 PetscInt i = t.league_rank(); /* row i */ 1008 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1009 PetscInt p,q = Yi(i); 1010 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1011 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1012 if (Xj(p) == Yj(q)) { /* Found it */ 1013 Ya(q) += alpha * Xa(p); 1014 q++; 1015 } else { 1016 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1017 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1018 */ 1019 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1020 } 1021 } 1022 }); 1023 }); 1024 ierr = MatSeqAIJKokkosModifyDevice(Y); 1025 } else { /* different nonzero patterns */ 1026 Mat Z; 1027 KokkosCsrMatrix zcsr; 1028 KernelHandle kh; 1029 kh.create_spadd_handle(false); 1030 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1031 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1032 zkok = new Mat_SeqAIJKokkos(zcsr); 1033 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1034 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1035 kh.destroy_spadd_handle(); 1036 } 1037 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1038 ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1039 PetscFunctionReturn(0); 1040 } 1041 1042 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1043 { 1044 PetscErrorCode ierr; 1045 Mat_SeqAIJKokkos *akok; 1046 Mat_SeqAIJ *aseq; 1047 1048 PetscFunctionBegin; 1049 ierr = MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 1050 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1051 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1052 delete akok; 1053 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1054 ierr = MatZeroEntries_SeqAIJKokkos(mat);CHKERRQ(ierr); 1055 akok->SetUpCOO(aseq); 1056 PetscFunctionReturn(0); 1057 } 1058 1059 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1060 { 1061 PetscErrorCode ierr; 1062 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1063 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1064 PetscCount Annz = aseq->nz; 1065 const PetscCountKokkosView& jmap = akok->jmap_d; 1066 const PetscCountKokkosView& perm = akok->perm_d; 1067 MatScalarKokkosView Aa; 1068 ConstMatScalarKokkosView kv; 1069 PetscMemType memtype; 1070 1071 PetscFunctionBegin; 1072 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 1073 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1074 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1075 } else { 1076 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1077 } 1078 1079 if (imode == INSERT_VALUES) { 1080 ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */ 1081 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1082 } else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */ 1083 1084 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));}); 1085 1086 if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} 1087 else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);} 1088 PetscFunctionReturn(0); 1089 } 1090 1091 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1092 { 1093 PetscErrorCode ierr; 1094 1095 PetscFunctionBegin; 1096 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1097 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1098 B->offloadmask = PETSC_OFFLOAD_CPU; 1099 PetscFunctionReturn(0); 1100 } 1101 1102 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1103 { 1104 PetscErrorCode ierr; 1105 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1106 1107 PetscFunctionBegin; 1108 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1109 A->boundtocpu = PETSC_FALSE; 1110 1111 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1112 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1113 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1114 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1115 A->ops->scale = MatScale_SeqAIJKokkos; 1116 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1117 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1118 A->ops->mult = MatMult_SeqAIJKokkos; 1119 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1120 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1121 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1122 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1123 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1124 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1125 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1126 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1127 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1128 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1129 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1130 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1131 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1132 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1133 1134 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 1135 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1136 PetscFunctionReturn(0); 1137 } 1138 1139 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1140 { 1141 PetscErrorCode ierr; 1142 Mat_SeqAIJ *aseq; 1143 PetscInt i,m,n; 1144 1145 PetscFunctionBegin; 1146 PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1147 1148 m = akok->nrows(); 1149 n = akok->ncols(); 1150 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1151 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1152 1153 /* Set up data structures of A as a MATSEQAIJ */ 1154 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1155 aseq = (Mat_SeqAIJ*)(A)->data; 1156 1157 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1158 akok->j_dual.sync_host(); 1159 1160 aseq->i = akok->i_host_data(); 1161 aseq->j = akok->j_host_data(); 1162 aseq->a = akok->a_host_data(); 1163 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1164 aseq->singlemalloc = PETSC_FALSE; 1165 aseq->free_a = PETSC_FALSE; 1166 aseq->free_ij = PETSC_FALSE; 1167 aseq->nz = akok->nnz(); 1168 aseq->maxnz = aseq->nz; 1169 1170 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1171 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1172 for (i=0; i<m; i++) { 1173 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1174 } 1175 1176 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1177 akok->nonzerostate = A->nonzerostate; 1178 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1179 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1180 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1181 PetscFunctionReturn(0); 1182 } 1183 1184 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1185 1186 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1187 */ 1188 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1189 { 1190 PetscErrorCode ierr; 1191 1192 PetscFunctionBegin; 1193 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1194 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1195 PetscFunctionReturn(0); 1196 } 1197 1198 /* --------------------------------------------------------------------------------*/ 1199 /*@C 1200 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1201 (the default parallel PETSc format). This matrix will ultimately be handled by 1202 Kokkos for calculations. For good matrix 1203 assembly performance the user should preallocate the matrix storage by setting 1204 the parameter nz (or the array nnz). By setting these parameters accurately, 1205 performance during matrix assembly can be increased by more than a factor of 50. 1206 1207 Collective 1208 1209 Input Parameters: 1210 + comm - MPI communicator, set to PETSC_COMM_SELF 1211 . m - number of rows 1212 . n - number of columns 1213 . nz - number of nonzeros per row (same for all rows) 1214 - nnz - array containing the number of nonzeros in the various rows 1215 (possibly different for each row) or NULL 1216 1217 Output Parameter: 1218 . A - the matrix 1219 1220 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1221 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1222 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1223 1224 Notes: 1225 If nnz is given then nz is ignored 1226 1227 The AIJ format (also called the Yale sparse matrix format or 1228 compressed row storage), is fully compatible with standard Fortran 77 1229 storage. That is, the stored row and column indices can begin at 1230 either one (as in Fortran) or zero. See the users' manual for details. 1231 1232 Specify the preallocated storage with either nz or nnz (not both). 1233 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1234 allocation. For large problems you MUST preallocate memory or you 1235 will get TERRIBLE performance, see the users' manual chapter on matrices. 1236 1237 By default, this format uses inodes (identical nodes) when possible, to 1238 improve numerical efficiency of matrix-vector products and solves. We 1239 search for consecutive rows with the same nonzero structure, thereby 1240 reusing matrix information to achieve increased efficiency. 1241 1242 Level: intermediate 1243 1244 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1245 @*/ 1246 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1247 { 1248 PetscErrorCode ierr; 1249 1250 PetscFunctionBegin; 1251 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1252 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1253 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1254 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1255 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1256 PetscFunctionReturn(0); 1257 } 1258 1259 typedef Kokkos::TeamPolicy<>::member_type team_member; 1260 // 1261 // This factorization exploits block diagonal matrices with "Nf" (not used). 1262 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1263 // 1264 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1265 { 1266 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1267 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1268 IS isrow = b->row,isicol = b->icol; 1269 PetscErrorCode ierr; 1270 const PetscInt *r_h,*ic_h; 1271 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1272 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1273 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1274 PetscBool row_identity,col_identity; 1275 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1276 1277 PetscFunctionBegin; 1278 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1279 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1280 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1281 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1282 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1283 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1284 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1285 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1286 { 1287 #define KOKKOS_SHARED_LEVEL 1 1288 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1289 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1290 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1291 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1292 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1293 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1294 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1295 size_t flops_h = 0.0; 1296 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1297 Kokkos::View<size_t> d_flops_k ("flops"); 1298 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1299 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1300 Kokkos::deep_copy (d_flops_k, h_flops_k); 1301 Kokkos::deep_copy (d_r_k, h_r_k); 1302 Kokkos::deep_copy (d_ic_k, h_ic_k); 1303 // Fill A --> fact 1304 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1305 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1306 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1307 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1308 // zero rows of B 1309 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1310 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1311 PetscScalar *baL = ba_d + bi_d[rowb]; 1312 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1313 /* zero (unfactored row) */ 1314 for (int j=0;j<nzbL;j++) baL[j] = 0; 1315 for (int j=0;j<nzbU;j++) baU[j] = 0; 1316 }); 1317 // copy A into B 1318 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1319 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1320 const PetscScalar *av = aa_d + ai_d[rowa]; 1321 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1322 /* load in initial (unfactored row) */ 1323 for (int j=0;j<nza;j++) { 1324 PetscInt colb = ic[ajtmp[j]]; 1325 PetscScalar vala = av[j]; 1326 if (colb == rowb) { 1327 *(ba_d + bdiag_d[rowb]) = vala; 1328 } else { 1329 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1330 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1331 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1332 for (int j=0; j<nz ; j++) { 1333 if (pbj[j] == colb) { 1334 pba[j] = vala; 1335 set++; 1336 break; 1337 } 1338 } 1339 #if !defined(PETSC_HAVE_SYCL) 1340 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1341 #endif 1342 } 1343 } 1344 }); 1345 }); 1346 Kokkos::fence(); 1347 1348 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1349 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1350 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1351 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1352 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1353 const PetscInt start = field*nloc, end = start + nloc; 1354 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1355 // A22 panel update for each row A(1,:) and col A(:,1) 1356 for (int ii=start; ii<end-1; ii++) { 1357 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1358 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1359 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1360 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1361 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1362 PetscInt kIdx = j*Ni + field_block_idx; 1363 if (kIdx >= nzUi) /* void */ ; 1364 else { 1365 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1366 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1367 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1368 size_t st_idx; 1369 // find and do L(k,i) = A(:k,i) / A(i,i) 1370 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1371 // get column, there has got to be a better way 1372 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1373 if (pjL[j] == ii) { 1374 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1375 idx = j; // output 1376 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1377 } 1378 }, st_idx); 1379 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1380 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1381 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1382 #endif 1383 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1384 // U(i+1,:end) 1385 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1386 PetscScalar Uij = baUi[uiIdx]; 1387 PetscInt col = bjUi[uiIdx]; 1388 if (col==myk) { 1389 // A_kk = A_kk - L_ki * U_ij(k) 1390 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1391 *Akkv = *Akkv - L_ki() * Uij; // UiK 1392 } else { 1393 PetscScalar *start, *end, *pAkjv=NULL; 1394 PetscInt high, low; 1395 const PetscInt *startj; 1396 if (col<myk) { // L 1397 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1398 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1399 start = pLki+1; // start at pLki+1, A22(myk,1) 1400 startj= bj_d + bi_d[myk] + idx; 1401 end = ba_d + bi_d[myk+1]; 1402 } else { 1403 PetscInt idx = bdiag_d[myk+1]+1; 1404 start = ba_d + idx; 1405 startj= bj_d + idx; 1406 end = ba_d + bdiag_d[myk]; 1407 } 1408 // search for 'col', use bisection search - TODO 1409 low = 0; 1410 high = (PetscInt)(end-start); 1411 while (high-low > 5) { 1412 int t = (low+high)/2; 1413 if (startj[t] > col) high = t; 1414 else low = t; 1415 } 1416 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1417 if (startj[pAkjv-start] == col) break; 1418 } 1419 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1420 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1421 #endif 1422 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1423 } 1424 }); 1425 } 1426 }); 1427 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1428 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1429 } /* endof for (i=0; i<n; i++) { */ 1430 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1431 }); 1432 Kokkos::fence(); 1433 Kokkos::deep_copy (h_flops_k, d_flops_k); 1434 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1435 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1436 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1437 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1438 /* Invert diagonal for simpler triangular solves */ 1439 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1440 int i = start + outer_index*Ni + lg_rank%Ni; 1441 if (i < end) { 1442 PetscScalar *pv = ba_d + bdiag_d[i]; 1443 *pv = 1.0/(*pv); 1444 } 1445 }); 1446 }); 1447 } 1448 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1449 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1450 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1451 1452 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1453 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1454 if (b->inode.size) { 1455 B->ops->solve = MatSolve_SeqAIJ_Inode; 1456 } else if (row_identity && col_identity) { 1457 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1458 } else { 1459 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1460 } 1461 B->offloadmask = PETSC_OFFLOAD_GPU; 1462 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1463 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1464 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1465 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1466 B->ops->matsolve = MatMatSolve_SeqAIJ; 1467 B->assembled = PETSC_TRUE; 1468 B->preallocated = PETSC_TRUE; 1469 1470 PetscFunctionReturn(0); 1471 } 1472 1473 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1474 { 1475 PetscErrorCode ierr; 1476 1477 PetscFunctionBegin; 1478 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1479 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1480 PetscFunctionReturn(0); 1481 } 1482 1483 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1484 { 1485 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1486 1487 PetscFunctionBegin; 1488 if (!factors->sptrsv_symbolic_completed) { 1489 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1490 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1491 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1492 } 1493 PetscFunctionReturn(0); 1494 } 1495 1496 /* Check if we need to update factors etc for transpose solve */ 1497 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1498 { 1499 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1500 MatColIdxType n = A->rmap->n; 1501 1502 PetscFunctionBegin; 1503 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1504 /* Update L^T and do sptrsv symbolic */ 1505 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1506 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1507 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1508 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1509 1510 KokkosKernels::Impl::transpose_matrix< 1511 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1512 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1513 MatRowMapKokkosView,DefaultExecutionSpace>( 1514 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1515 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1516 1517 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1518 We have to sort the indices, until KK provides finer control options. 1519 */ 1520 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1521 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1522 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1523 1524 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1525 1526 /* Update U^T and do sptrsv symbolic */ 1527 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1528 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1529 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1530 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1531 1532 KokkosKernels::Impl::transpose_matrix< 1533 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1534 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1535 MatRowMapKokkosView,DefaultExecutionSpace>( 1536 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1537 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1538 1539 /* Sort indices. See comments above */ 1540 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1541 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1542 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1543 1544 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1545 factors->transpose_updated = PETSC_TRUE; 1546 } 1547 PetscFunctionReturn(0); 1548 } 1549 1550 /* Solve Ax = b, with A = LU */ 1551 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1552 { 1553 PetscErrorCode ierr; 1554 ConstPetscScalarKokkosView bv; 1555 PetscScalarKokkosView xv; 1556 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1557 1558 PetscFunctionBegin; 1559 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1560 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1561 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1562 ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1563 /* Solve L tmpv = b */ 1564 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1565 /* Solve Ux = tmpv */ 1566 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1567 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1568 ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1569 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1570 PetscFunctionReturn(0); 1571 } 1572 1573 /* Solve A^T x = b, where A^T = U^T L^T */ 1574 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1575 { 1576 PetscErrorCode ierr; 1577 ConstPetscScalarKokkosView bv; 1578 PetscScalarKokkosView xv; 1579 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1580 1581 PetscFunctionBegin; 1582 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1583 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1584 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1585 ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1586 /* Solve U^T tmpv = b */ 1587 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1588 1589 /* Solve L^T x = tmpv */ 1590 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1591 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1592 ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1593 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1594 PetscFunctionReturn(0); 1595 } 1596 1597 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1598 { 1599 PetscErrorCode ierr; 1600 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1601 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1602 PetscInt fill_lev = info->levels; 1603 1604 PetscFunctionBegin; 1605 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1606 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1607 1608 auto a_d = aijkok->a_dual.view_device(); 1609 auto i_d = aijkok->i_dual.view_device(); 1610 auto j_d = aijkok->j_dual.view_device(); 1611 1612 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1613 1614 B->assembled = PETSC_TRUE; 1615 B->preallocated = PETSC_TRUE; 1616 B->ops->solve = MatSolve_SeqAIJKokkos; 1617 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1618 B->ops->matsolve = NULL; 1619 B->ops->matsolvetranspose = NULL; 1620 B->offloadmask = PETSC_OFFLOAD_GPU; 1621 1622 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1623 factors->transpose_updated = PETSC_FALSE; 1624 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1625 /* TODO: log flops, but how to know that? */ 1626 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1627 PetscFunctionReturn(0); 1628 } 1629 1630 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1631 { 1632 PetscErrorCode ierr; 1633 Mat_SeqAIJKokkos *aijkok; 1634 Mat_SeqAIJ *b; 1635 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1636 PetscInt fill_lev = info->levels; 1637 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1638 PetscInt n = A->rmap->n; 1639 1640 PetscFunctionBegin; 1641 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1642 /* Rebuild factors */ 1643 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1644 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1645 1646 /* Create a spiluk handle and then do symbolic factorization */ 1647 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1648 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1649 1650 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1651 1652 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1653 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1654 Kokkos::realloc(factors->iU_d,n+1); 1655 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1656 1657 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1658 auto i_d = aijkok->i_dual.view_device(); 1659 auto j_d = aijkok->j_dual.view_device(); 1660 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1661 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1662 1663 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1664 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1665 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1666 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1667 1668 /* TODO: add options to select sptrsv algorithms */ 1669 /* Create sptrsv handles for L, U and their transpose */ 1670 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1671 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1672 #else 1673 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1674 #endif 1675 1676 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1677 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1678 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1679 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1680 1681 /* Fill fields of the factor matrix B */ 1682 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1683 b = (Mat_SeqAIJ*)B->data; 1684 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1685 B->info.fill_ratio_given = info->fill; 1686 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1687 1688 B->offloadmask = PETSC_OFFLOAD_GPU; 1689 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1690 PetscFunctionReturn(0); 1691 } 1692 1693 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1694 { 1695 PetscErrorCode ierr; 1696 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1697 const PetscInt nrows = A->rmap->n; 1698 1699 PetscFunctionBegin; 1700 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1701 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1702 // move B data into Kokkos 1703 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1704 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1705 { 1706 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1707 if (!baijkok->diag_d.extent(0)) { 1708 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1709 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1710 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1711 } 1712 } 1713 PetscFunctionReturn(0); 1714 } 1715 1716 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1717 { 1718 PetscFunctionBegin; 1719 *type = MATSOLVERKOKKOS; 1720 PetscFunctionReturn(0); 1721 } 1722 1723 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1724 { 1725 PetscFunctionBegin; 1726 *type = MATSOLVERKOKKOSDEVICE; 1727 PetscFunctionReturn(0); 1728 } 1729 1730 /*MC 1731 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1732 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1733 1734 Level: beginner 1735 1736 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1737 M*/ 1738 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1739 { 1740 PetscErrorCode ierr; 1741 PetscInt n = A->rmap->n; 1742 1743 PetscFunctionBegin; 1744 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1745 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1746 (*B)->factortype = ftype; 1747 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1748 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1749 1750 if (ftype == MAT_FACTOR_LU) { 1751 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1752 (*B)->canuseordering = PETSC_TRUE; 1753 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1754 } else if (ftype == MAT_FACTOR_ILU) { 1755 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1756 (*B)->canuseordering = PETSC_FALSE; 1757 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1758 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1759 1760 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1761 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1762 PetscFunctionReturn(0); 1763 } 1764 1765 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1766 { 1767 PetscErrorCode ierr; 1768 PetscInt n = A->rmap->n; 1769 1770 PetscFunctionBegin; 1771 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1772 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1773 (*B)->factortype = ftype; 1774 (*B)->canuseordering = PETSC_TRUE; 1775 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1776 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1777 1778 if (ftype == MAT_FACTOR_LU) { 1779 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1780 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1781 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1782 1783 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1784 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1785 PetscFunctionReturn(0); 1786 } 1787 1788 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1789 { 1790 PetscErrorCode ierr; 1791 1792 PetscFunctionBegin; 1793 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1794 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1795 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1796 PetscFunctionReturn(0); 1797 } 1798 1799 /* Utility to print out a KokkosCsrMatrix for debugging */ 1800 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1801 { 1802 PetscErrorCode ierr; 1803 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1804 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1805 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1806 const PetscInt *i = iv.data(); 1807 const PetscInt *j = jv.data(); 1808 const PetscScalar *a = av.data(); 1809 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1810 1811 PetscFunctionBegin; 1812 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1813 for (PetscInt k=0; k<m; k++) { 1814 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1815 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1816 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1817 } 1818 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1819 } 1820 PetscFunctionReturn(0); 1821 } 1822