1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/aij.h> 19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 20 21 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 22 23 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 24 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 25 In the latter case, it is important to set a_dual's sync state correctly. 26 */ 27 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 28 { 29 PetscErrorCode ierr; 30 Mat_SeqAIJ *aijseq; 31 Mat_SeqAIJKokkos *aijkok; 32 33 PetscFunctionBegin; 34 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 35 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 36 37 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 38 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 39 40 /* If aijkok does not exist, we just copy i, j to device. 41 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 42 In both cases, we build a new aijkok structure. 43 */ 44 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 45 delete aijkok; 46 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 47 A->spptr = aijkok; 48 } 49 50 if (aijkok && aijkok->device_mat_d.data()) { 51 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 52 } 53 PetscFunctionReturn(0); 54 } 55 56 /* Sync CSR data to device if not yet */ 57 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 58 { 59 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 60 61 PetscFunctionBegin; 62 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 63 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 64 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 65 if (aijkok->a_dual.need_sync_device()) { 66 aijkok->a_dual.sync_device(); 67 aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 68 aijkok->hermitian_updated = PETSC_FALSE; 69 } 70 PetscFunctionReturn(0); 71 } 72 73 /* Mark the CSR data on device as modified */ 74 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 75 { 76 PetscErrorCode ierr; 77 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 78 79 PetscFunctionBegin; 80 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 81 aijkok->a_dual.clear_sync_state(); 82 aijkok->a_dual.modify_device(); 83 aijkok->transpose_updated = PETSC_FALSE; 84 aijkok->hermitian_updated = PETSC_FALSE; 85 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 86 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 87 PetscFunctionReturn(0); 88 } 89 90 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 91 { 92 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 93 94 PetscFunctionBegin; 95 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 96 /* We do not expect one needs factors on host */ 97 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 98 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 99 aijkok->a_dual.sync_host(); 100 PetscFunctionReturn(0); 101 } 102 103 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 104 { 105 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 106 107 PetscFunctionBegin; 108 if (aijkok) { 109 aijkok->a_dual.sync_host(); 110 *array = aijkok->a_dual.view_host().data(); 111 } else { /* Happens when calling MatSetValues on a newly created matrix */ 112 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 113 } 114 PetscFunctionReturn(0); 115 } 116 117 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 118 { 119 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 120 121 PetscFunctionBegin; 122 if (aijkok) aijkok->a_dual.modify_host(); 123 PetscFunctionReturn(0); 124 } 125 126 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 127 { 128 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 129 130 PetscFunctionBegin; 131 if (aijkok) { 132 aijkok->a_dual.sync_host(); 133 *array = aijkok->a_dual.view_host().data(); 134 } else { 135 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 136 } 137 PetscFunctionReturn(0); 138 } 139 140 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 141 { 142 PetscFunctionBegin; 143 *array = NULL; 144 PetscFunctionReturn(0); 145 } 146 147 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 148 { 149 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 150 151 PetscFunctionBegin; 152 if (aijkok) { 153 *array = aijkok->a_dual.view_host().data(); 154 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 155 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 156 } 157 PetscFunctionReturn(0); 158 } 159 160 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 161 { 162 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 163 164 PetscFunctionBegin; 165 if (aijkok) { 166 aijkok->a_dual.clear_sync_state(); 167 aijkok->a_dual.modify_host(); 168 } 169 PetscFunctionReturn(0); 170 } 171 172 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 173 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 174 { 175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 176 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 177 178 PetscFunctionBegin; 179 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 180 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 181 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 182 PetscFunctionReturn(0); 183 } 184 185 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 186 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 187 { 188 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 189 190 PetscFunctionBegin; 191 if (aijkok && aijkok->device_mat_d.data()) { 192 *d_mat = aijkok->device_mat_d.data(); 193 } else { 194 PetscErrorCode ierr; 195 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 196 *d_mat = NULL; 197 } 198 PetscFunctionReturn(0); 199 } 200 201 /* Generate the transpose on device and cache it internally */ 202 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 203 { 204 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 205 206 PetscFunctionBegin; 207 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 208 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 209 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 210 CHKERRCXX(aijkok->a_dual.sync_device()); 211 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 212 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 213 aijkok->transpose_updated = PETSC_TRUE; 214 } 215 *csrmatT = &aijkok->csrmatT; 216 PetscFunctionReturn(0); 217 } 218 219 /* Generate the Hermitian on device and cache it internally */ 220 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 221 { 222 PetscErrorCode ierr; 223 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 224 225 PetscFunctionBegin; 226 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 227 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 228 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 229 CHKERRCXX(aijkok->a_dual.sync_device()); 230 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 231 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 232 #if defined(PETSC_USE_COMPLEX) 233 const auto& a = aijkok->csrmatH.values; 234 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 235 #endif 236 aijkok->hermitian_updated = PETSC_TRUE; 237 } 238 *csrmatH = &aijkok->csrmatH; 239 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 240 PetscFunctionReturn(0); 241 } 242 243 /* y = A x */ 244 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 245 { 246 PetscErrorCode ierr; 247 Mat_SeqAIJKokkos *aijkok; 248 ConstPetscScalarKokkosView xv; 249 PetscScalarKokkosView yv; 250 251 PetscFunctionBegin; 252 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 253 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 254 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 255 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 256 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 257 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 258 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 259 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 260 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 261 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 262 PetscFunctionReturn(0); 263 } 264 265 /* y = A^T x */ 266 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 267 { 268 PetscErrorCode ierr; 269 Mat_SeqAIJKokkos *aijkok; 270 const char *mode; 271 ConstPetscScalarKokkosView xv; 272 PetscScalarKokkosView yv; 273 KokkosCsrMatrix *csrmat; 274 275 PetscFunctionBegin; 276 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 277 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 278 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 279 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 280 if (A->form_explicit_transpose) { 281 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 282 mode = "N"; 283 } else { 284 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 285 csrmat = &aijkok->csrmat; 286 mode = "T"; 287 } 288 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 289 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 290 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 291 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 292 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 293 PetscFunctionReturn(0); 294 } 295 296 /* y = A^H x */ 297 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 298 { 299 PetscErrorCode ierr; 300 Mat_SeqAIJKokkos *aijkok; 301 const char *mode; 302 ConstPetscScalarKokkosView xv; 303 PetscScalarKokkosView yv; 304 KokkosCsrMatrix *csrmat; 305 306 PetscFunctionBegin; 307 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 308 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 309 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 310 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 311 if (A->form_explicit_transpose) { 312 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 313 mode = "N"; 314 } else { 315 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 316 csrmat = &aijkok->csrmat; 317 mode = "C"; 318 } 319 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 320 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 321 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 322 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 323 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 324 PetscFunctionReturn(0); 325 } 326 327 /* z = A x + y */ 328 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 329 { 330 PetscErrorCode ierr; 331 Mat_SeqAIJKokkos *aijkok; 332 ConstPetscScalarKokkosView xv,yv; 333 PetscScalarKokkosView zv; 334 335 PetscFunctionBegin; 336 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 337 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 338 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 339 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 340 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 341 if (zz != yy) Kokkos::deep_copy(zv,yv); 342 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 343 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 344 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 346 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 347 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 349 PetscFunctionReturn(0); 350 } 351 352 /* z = A^T x + y */ 353 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 354 { 355 PetscErrorCode ierr; 356 Mat_SeqAIJKokkos *aijkok; 357 const char *mode; 358 ConstPetscScalarKokkosView xv,yv; 359 PetscScalarKokkosView zv; 360 KokkosCsrMatrix *csrmat; 361 362 PetscFunctionBegin; 363 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 364 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 365 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 366 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 367 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 368 if (zz != yy) Kokkos::deep_copy(zv,yv); 369 if (A->form_explicit_transpose) { 370 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 371 mode = "N"; 372 } else { 373 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 374 csrmat = &aijkok->csrmat; 375 mode = "T"; 376 } 377 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 378 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 379 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 380 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 381 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 382 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 /* z = A^H x + y */ 387 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 388 { 389 PetscErrorCode ierr; 390 Mat_SeqAIJKokkos *aijkok; 391 const char *mode; 392 ConstPetscScalarKokkosView xv,yv; 393 PetscScalarKokkosView zv; 394 KokkosCsrMatrix *csrmat; 395 396 PetscFunctionBegin; 397 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 398 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 399 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 400 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 401 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 402 if (zz != yy) Kokkos::deep_copy(zv,yv); 403 if (A->form_explicit_transpose) { 404 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 405 mode = "N"; 406 } else { 407 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 408 csrmat = &aijkok->csrmat; 409 mode = "C"; 410 } 411 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 412 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 413 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 414 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 415 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 416 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 417 PetscFunctionReturn(0); 418 } 419 420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 421 { 422 PetscErrorCode ierr; 423 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 424 425 PetscFunctionBegin; 426 switch (op) { 427 case MAT_FORM_EXPLICIT_TRANSPOSE: 428 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 429 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 430 A->form_explicit_transpose = flg; 431 break; 432 default: 433 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 434 break; 435 } 436 PetscFunctionReturn(0); 437 } 438 439 /* Depending on reuse, either build a new mat, or use the existing mat */ 440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 441 { 442 PetscErrorCode ierr; 443 Mat_SeqAIJ *aseq; 444 445 PetscFunctionBegin; 446 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 447 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 448 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 449 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 450 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 451 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 452 if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 453 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 454 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 455 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 456 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 457 aseq = static_cast<Mat_SeqAIJ*>(A->data); 458 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 459 if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 460 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 461 } 462 } 463 PetscFunctionReturn(0); 464 } 465 466 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 467 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 468 */ 469 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 470 { 471 PetscErrorCode ierr; 472 Mat_SeqAIJ *bseq; 473 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 474 Mat mat; 475 476 PetscFunctionBegin; 477 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 478 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 479 mat = *B; 480 if (A->assembled) { 481 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 482 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 483 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 484 /* Now copy values to B if needed */ 485 if (dupOption == MAT_COPY_VALUES) { 486 if (akok->a_dual.need_sync_device()) { 487 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 488 bkok->a_dual.modify_host(); 489 } else { /* If device has the latest data, we only copy data on device */ 490 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 491 bkok->a_dual.modify_device(); 492 } 493 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 494 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 495 bkok->a_dual.modify_host(); 496 } 497 mat->spptr = bkok; 498 } 499 500 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 501 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 502 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 503 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 504 PetscFunctionReturn(0); 505 } 506 507 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 508 { 509 PetscErrorCode ierr; 510 Mat At; 511 KokkosCsrMatrix *internT,*csrmatT; 512 Mat_SeqAIJKokkos *atkok,*bkok; 513 514 PetscFunctionBegin; 515 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 516 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 517 CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 518 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 519 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 520 if (reuse == MAT_INITIAL_MATRIX) *B = At; 521 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 522 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 523 if ((*B)->assembled) { 524 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 525 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 526 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 527 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 528 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 529 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 530 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 531 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 532 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 533 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 534 } 535 PetscFunctionReturn(0); 536 } 537 538 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 539 { 540 PetscErrorCode ierr; 541 Mat_SeqAIJKokkos *aijkok; 542 543 PetscFunctionBegin; 544 if (A->factortype == MAT_FACTOR_NONE) { 545 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 546 if (aijkok) { 547 if (aijkok->device_mat_d.data()) { 548 delete aijkok->colmap_d; 549 delete aijkok->i_uncompressed_d; 550 } 551 if (aijkok->diag_d) delete aijkok->diag_d; 552 } 553 delete aijkok; 554 } else { 555 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 556 } 557 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 558 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 559 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 560 A->spptr = NULL; 561 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 562 PetscFunctionReturn(0); 563 } 564 565 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 566 { 567 PetscErrorCode ierr; 568 569 PetscFunctionBegin; 570 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 571 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 572 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 573 PetscFunctionReturn(0); 574 } 575 576 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 577 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 578 { 579 PetscErrorCode ierr; 580 Mat_SeqAIJ *a,*b; 581 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 582 MatScalarKokkosView aa,ba,ca; 583 MatRowMapKokkosView ai,bi,ci; 584 MatColIdxKokkosView aj,bj,cj; 585 PetscInt m,n,nnz,aN; 586 587 PetscFunctionBegin; 588 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 589 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 590 PetscValidPointer(C,4); 591 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 592 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 593 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 594 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 595 596 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 597 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 598 a = static_cast<Mat_SeqAIJ*>(A->data); 599 b = static_cast<Mat_SeqAIJ*>(B->data); 600 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 601 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 602 aa = akok->a_dual.view_device(); 603 ai = akok->i_dual.view_device(); 604 ba = bkok->a_dual.view_device(); 605 bi = bkok->i_dual.view_device(); 606 m = A->rmap->n; /* M, N and nnz of C */ 607 n = A->cmap->n + B->cmap->n; 608 nnz = a->nz + b->nz; 609 aN = A->cmap->n; /* N of A */ 610 if (reuse == MAT_INITIAL_MATRIX) { 611 aj = akok->j_dual.view_device(); 612 bj = bkok->j_dual.view_device(); 613 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 614 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 615 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 616 ca = ca_dual.view_device(); 617 ci = ci_dual.view_device(); 618 cj = cj_dual.view_device(); 619 620 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 621 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 622 PetscInt i = t.league_rank(); /* row i */ 623 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 624 625 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 626 ci(i) = coffset; 627 if (i == m-1) ci(m) = ai(m) + bi(m); 628 }); 629 630 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 631 if (k < alen) { 632 ca(coffset+k) = aa(ai(i)+k); 633 cj(coffset+k) = aj(ai(i)+k); 634 } else { 635 ca(coffset+k) = ba(bi(i)+k-alen); 636 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 637 } 638 }); 639 }); 640 ca_dual.modify_device(); 641 ci_dual.modify_device(); 642 cj_dual.modify_device(); 643 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 644 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 645 } else if (reuse == MAT_REUSE_MATRIX) { 646 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 647 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 648 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 649 ca = ckok->a_dual.view_device(); 650 ci = ckok->i_dual.view_device(); 651 652 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 653 PetscInt i = t.league_rank(); /* row i */ 654 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 655 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 656 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 657 else ca(ci(i)+k) = ba(bi(i)+k-alen); 658 }); 659 }); 660 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 661 } 662 PetscFunctionReturn(0); 663 } 664 665 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 666 { 667 PetscFunctionBegin; 668 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 669 PetscFunctionReturn(0); 670 } 671 672 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 673 { 674 PetscErrorCode ierr; 675 Mat_Product *product = C->product; 676 Mat A,B; 677 bool transA,transB; /* use bool, since KK needs this type */ 678 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 679 Mat_SeqAIJ *c; 680 MatProductData_SeqAIJKokkos *pdata; 681 KokkosCsrMatrix *csrmatA,*csrmatB; 682 683 PetscFunctionBegin; 684 MatCheckProduct(C,1); 685 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 686 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 687 688 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 689 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 690 PetscFunctionReturn(0); 691 } 692 693 switch (product->type) { 694 case MATPRODUCT_AB: transA = false; transB = false; break; 695 case MATPRODUCT_AtB: transA = true; transB = false; break; 696 case MATPRODUCT_ABt: transA = false; transB = true; break; 697 default: 698 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 699 } 700 701 A = product->A; 702 B = product->B; 703 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 704 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 705 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 706 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 707 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 708 709 if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 710 711 csrmatA = &akok->csrmat; 712 csrmatB = &bkok->csrmat; 713 714 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 715 if (transA) { 716 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 717 transA = false; 718 } 719 720 if (transB) { 721 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 722 transB = false; 723 } 724 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 725 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 726 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 727 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 728 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 729 /* shorter version of MatAssemblyEnd_SeqAIJ */ 730 c = (Mat_SeqAIJ*)C->data; 731 ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 732 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 733 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 734 c->reallocs = 0; 735 C->info.mallocs = 0; 736 C->info.nz_unneeded = 0; 737 C->assembled = C->was_assembled = PETSC_TRUE; 738 C->num_ass++; 739 PetscFunctionReturn(0); 740 } 741 742 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 743 { 744 PetscErrorCode ierr; 745 Mat_Product *product = C->product; 746 MatProductType ptype; 747 Mat A,B; 748 bool transA,transB; 749 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 750 MatProductData_SeqAIJKokkos *pdata; 751 MPI_Comm comm; 752 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 753 754 PetscFunctionBegin; 755 MatCheckProduct(C,1); 756 ierr = PetscObjectGetComm((PetscObject)C,&comm); 757 if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 758 A = product->A; 759 B = product->B; 760 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 761 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 762 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 763 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 764 csrmatA = &akok->csrmat; 765 csrmatB = &bkok->csrmat; 766 767 ptype = product->type; 768 switch (ptype) { 769 case MATPRODUCT_AB: transA = false; transB = false; break; 770 case MATPRODUCT_AtB: transA = true; transB = false; break; 771 case MATPRODUCT_ABt: transA = false; transB = true; break; 772 default: 773 SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 774 } 775 776 product->data = pdata = new MatProductData_SeqAIJKokkos(); 777 pdata->kh.set_team_work_size(16); 778 pdata->kh.set_dynamic_scheduling(true); 779 pdata->reusesym = product->api_user; 780 781 /* TODO: add command line options to select spgemm algorithms */ 782 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 783 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 784 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 785 */ 786 pdata->kh.create_spgemm_handle(spgemm_alg); 787 788 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 789 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 790 if (transA) { 791 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 792 transA = false; 793 } 794 795 if (transB) { 796 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 797 transB = false; 798 } 799 800 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 801 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 802 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 803 calling new Mat_SeqAIJKokkos(). 804 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 805 */ 806 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 807 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 808 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 809 810 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 811 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 812 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 813 PetscFunctionReturn(0); 814 } 815 816 /* handles sparse matrix matrix ops */ 817 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 818 { 819 PetscErrorCode ierr; 820 Mat_Product *product = mat->product; 821 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 822 823 PetscFunctionBegin; 824 MatCheckProduct(mat,1); 825 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 826 if (product->type == MATPRODUCT_ABC) { 827 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 828 } 829 if (Biskok && Ciskok) { 830 switch (product->type) { 831 case MATPRODUCT_AB: 832 case MATPRODUCT_AtB: 833 case MATPRODUCT_ABt: 834 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 835 break; 836 case MATPRODUCT_PtAP: 837 case MATPRODUCT_RARt: 838 case MATPRODUCT_ABC: 839 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 840 break; 841 default: 842 break; 843 } 844 } else { /* fallback for AIJ */ 845 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 846 } 847 PetscFunctionReturn(0); 848 } 849 850 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 851 { 852 PetscErrorCode ierr; 853 Mat_SeqAIJKokkos *aijkok; 854 855 PetscFunctionBegin; 856 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 857 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 858 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 859 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 860 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 861 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 862 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 863 PetscFunctionReturn(0); 864 } 865 866 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 867 { 868 PetscErrorCode ierr; 869 Mat_SeqAIJKokkos *aijkok; 870 871 PetscFunctionBegin; 872 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 873 if (aijkok) { /* Only zero the device if data is already there */ 874 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 875 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 876 } else { /* Might be preallocated but not assembled */ 877 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 878 } 879 PetscFunctionReturn(0); 880 } 881 882 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 883 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 884 { 885 PetscErrorCode ierr; 886 Mat_SeqAIJKokkos *aijkok; 887 888 PetscFunctionBegin; 889 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 890 PetscValidPointer(kv,2); 891 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 892 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 893 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 894 *kv = aijkok->a_dual.view_device(); 895 PetscFunctionReturn(0); 896 } 897 898 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 899 { 900 PetscFunctionBegin; 901 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 902 PetscValidPointer(kv,2); 903 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 904 PetscFunctionReturn(0); 905 } 906 907 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 908 { 909 PetscErrorCode ierr; 910 Mat_SeqAIJKokkos *aijkok; 911 912 PetscFunctionBegin; 913 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 914 PetscValidPointer(kv,2); 915 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 916 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 917 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 918 *kv = aijkok->a_dual.view_device(); 919 PetscFunctionReturn(0); 920 } 921 922 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 923 { 924 PetscErrorCode ierr; 925 926 PetscFunctionBegin; 927 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 928 PetscValidPointer(kv,2); 929 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 930 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 931 PetscFunctionReturn(0); 932 } 933 934 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 935 { 936 Mat_SeqAIJKokkos *aijkok; 937 938 PetscFunctionBegin; 939 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 940 PetscValidPointer(kv,2); 941 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 942 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 943 *kv = aijkok->a_dual.view_device(); 944 PetscFunctionReturn(0); 945 } 946 947 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 948 { 949 PetscErrorCode ierr; 950 951 PetscFunctionBegin; 952 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 953 PetscValidPointer(kv,2); 954 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 955 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 956 PetscFunctionReturn(0); 957 } 958 959 /* Computes Y += alpha X */ 960 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 961 { 962 PetscErrorCode ierr; 963 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 964 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 965 ConstMatScalarKokkosView Xa; 966 MatScalarKokkosView Ya; 967 968 PetscFunctionBegin; 969 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 970 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 971 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 972 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 973 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 974 975 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 976 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 977 PetscBool e; 978 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 979 if (e) { 980 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 981 if (e) pattern = SAME_NONZERO_PATTERN; 982 } 983 } 984 985 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 986 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 987 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 988 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 989 */ 990 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 991 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 992 Xa = xkok->a_dual.view_device(); 993 Ya = ykok->a_dual.view_device(); 994 995 if (pattern == SAME_NONZERO_PATTERN) { 996 KokkosBlas::axpy(alpha,Xa,Ya); 997 ierr = MatSeqAIJKokkosModifyDevice(Y); 998 } else if (pattern == SUBSET_NONZERO_PATTERN) { 999 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 1000 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 1001 1002 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1003 PetscInt i = t.league_rank(); /* row i */ 1004 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1005 PetscInt p,q = Yi(i); 1006 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1007 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1008 if (Xj(p) == Yj(q)) { /* Found it */ 1009 Ya(q) += alpha * Xa(p); 1010 q++; 1011 } else { 1012 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1013 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1014 */ 1015 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1016 } 1017 } 1018 }); 1019 }); 1020 ierr = MatSeqAIJKokkosModifyDevice(Y); 1021 } else { /* different nonzero patterns */ 1022 Mat Z; 1023 KokkosCsrMatrix zcsr; 1024 KernelHandle kh; 1025 kh.create_spadd_handle(false); 1026 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1027 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1028 zkok = new Mat_SeqAIJKokkos(zcsr); 1029 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1030 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1031 kh.destroy_spadd_handle(); 1032 } 1033 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1034 ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1035 PetscFunctionReturn(0); 1036 } 1037 1038 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 1039 { 1040 PetscErrorCode ierr; 1041 MPI_Comm comm; 1042 PetscInt *i,*j,*perm,*jmap; 1043 PetscInt k,M,N,p,q,row,start,end,nnz,nneg; 1044 PetscBool has_repeats = PETSC_FALSE; 1045 PetscInt *Ai,*Aj; 1046 PetscScalar *Aa; 1047 Mat_SeqAIJKokkos *akok; 1048 Mat_SeqAIJ *aseq; 1049 MatRowMapKokkosViewHost perm_h("perm",n),jmap_h; 1050 Mat newmat; 1051 1052 PetscFunctionBegin; 1053 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 1054 ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr); 1055 ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr); 1056 ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */ 1057 ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr); 1058 perm = perm_h.data(); 1059 for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */ 1060 if (j[k] < 0) i[k] = -1; 1061 perm[k] = k; 1062 } 1063 1064 /* Sort by row */ 1065 ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr); 1066 for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */ 1067 nneg = k; 1068 Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */ 1069 nnz = 0; /* Total number of unique nonzeros to be counted */ 1070 jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */ 1071 jmap++; /* Inc jmap by 1 for convinience */ 1072 1073 ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */ 1074 ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */ 1075 1076 /* In each row, sort by column, then unique column indices to get row length */ 1077 Ai++; /* Inc by 1 for convinience */ 1078 q = 0; /* q-th unique nonzero, with q starting from 0 */ 1079 while (k<n) { 1080 row = i[k]; 1081 start = k; /* [start,end) indices for this row */ 1082 while (k<n && i[k] == row) k++; 1083 end = k; 1084 ierr = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr); 1085 /* Find number of unique col entries in this row */ 1086 Aj[q] = j[start]; /* Log the first nonzero in this row */ 1087 jmap[q] = 1; /* Number of repeats of this nozero entry */ 1088 Ai[row] = 1; 1089 nnz++; 1090 1091 for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */ 1092 if (j[p] != j[p-1]) { /* Meet a new nonzero */ 1093 q++; 1094 jmap[q] = 1; 1095 Aj[q] = j[p]; 1096 Ai[row]++; 1097 nnz++; 1098 } else { 1099 jmap[q]++; 1100 has_repeats = PETSC_TRUE; 1101 } 1102 } 1103 q++; /* Move to next row and thus next unique nonzero */ 1104 } 1105 ierr = PetscFree2(i,j);CHKERRQ(ierr); 1106 1107 Ai--; /* Back to the beginning of Ai[] */ 1108 for (k=0; k<M; k++) Ai[k+1] += Ai[k]; 1109 jmap--; /* Back to the beginning of jmap[] */ 1110 jmap[0] = 0; 1111 if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */ 1112 for (k=0; k<nnz; k++) jmap[k+1] += jmap[k]; 1113 Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */ 1114 } 1115 1116 if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */ 1117 PetscInt *Aj_new; 1118 ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr); 1119 ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr); 1120 ierr = PetscFree(Aj);CHKERRQ(ierr); 1121 Aj = Aj_new; 1122 } 1123 1124 ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr); 1125 ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr); 1126 aseq = static_cast<Mat_SeqAIJ*>(newmat->data); 1127 aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */ 1128 aseq->free_a = aseq->free_ij = PETSC_TRUE; 1129 1130 ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 1131 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 1132 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1133 1134 if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */ 1135 MatRowMapKokkosViewHost newperm_h("perm",n-nneg); 1136 Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n))); 1137 perm_h = newperm_h; 1138 } 1139 1140 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1141 akok->SetUpCOO(n,has_repeats,jmap_h,perm_h); 1142 PetscFunctionReturn(0); 1143 } 1144 1145 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1146 { 1147 PetscErrorCode ierr; 1148 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1149 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1150 PetscInt nz = aseq->nz; 1151 const MatRowMapKokkosView& jmap = akok->jmap_d; 1152 const MatRowMapKokkosView& perm = akok->perm_d; 1153 MatScalarKokkosView Aa; 1154 ConstMatScalarKokkosView kv; 1155 PetscMemType memtype; 1156 1157 PetscFunctionBegin; 1158 if (!v) { /* NULL v means an all zero array */ 1159 ierr = MatZeroEntries(A);CHKERRQ(ierr); 1160 PetscFunctionReturn(0); 1161 } 1162 1163 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 1164 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1165 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n)); 1166 } else { 1167 kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */ 1168 } 1169 1170 ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */ 1171 if (imode == INSERT_VALUES) { 1172 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1173 } 1174 1175 if (akok->coo_has_repeats) { 1176 Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) { 1177 for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k)); 1178 }); 1179 } else { 1180 Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));}); 1181 } 1182 ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr); 1183 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1184 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1185 PetscFunctionReturn(0); 1186 } 1187 1188 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1189 { 1190 PetscErrorCode ierr; 1191 1192 PetscFunctionBegin; 1193 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1194 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1195 B->offloadmask = PETSC_OFFLOAD_CPU; 1196 PetscFunctionReturn(0); 1197 } 1198 1199 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1200 { 1201 PetscErrorCode ierr; 1202 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1203 1204 PetscFunctionBegin; 1205 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1206 A->boundtocpu = PETSC_FALSE; 1207 1208 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1209 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1210 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1211 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1212 A->ops->scale = MatScale_SeqAIJKokkos; 1213 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1214 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1215 A->ops->mult = MatMult_SeqAIJKokkos; 1216 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1217 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1218 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1219 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1220 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1221 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1222 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1223 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1224 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1225 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1226 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1227 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1228 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1229 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1230 1231 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 1232 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1233 PetscFunctionReturn(0); 1234 } 1235 1236 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1237 { 1238 PetscErrorCode ierr; 1239 Mat_SeqAIJ *aseq; 1240 PetscInt i,m,n; 1241 1242 PetscFunctionBegin; 1243 if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1244 1245 m = akok->nrows(); 1246 n = akok->ncols(); 1247 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1248 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1249 1250 /* Set up data structures of A as a MATSEQAIJ */ 1251 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1252 aseq = (Mat_SeqAIJ*)(A)->data; 1253 1254 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1255 akok->j_dual.sync_host(); 1256 1257 aseq->i = akok->i_host_data(); 1258 aseq->j = akok->j_host_data(); 1259 aseq->a = akok->a_host_data(); 1260 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1261 aseq->singlemalloc = PETSC_FALSE; 1262 aseq->free_a = PETSC_FALSE; 1263 aseq->free_ij = PETSC_FALSE; 1264 aseq->nz = akok->nnz(); 1265 aseq->maxnz = aseq->nz; 1266 1267 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1268 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1269 for (i=0; i<m; i++) { 1270 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1271 } 1272 1273 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1274 akok->nonzerostate = A->nonzerostate; 1275 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1276 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1277 A->spptr = akok; 1278 PetscFunctionReturn(0); 1279 } 1280 1281 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1282 1283 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1284 */ 1285 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1286 { 1287 PetscErrorCode ierr; 1288 1289 PetscFunctionBegin; 1290 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1291 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1292 PetscFunctionReturn(0); 1293 } 1294 1295 /* --------------------------------------------------------------------------------*/ 1296 /*@C 1297 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1298 (the default parallel PETSc format). This matrix will ultimately be handled by 1299 Kokkos for calculations. For good matrix 1300 assembly performance the user should preallocate the matrix storage by setting 1301 the parameter nz (or the array nnz). By setting these parameters accurately, 1302 performance during matrix assembly can be increased by more than a factor of 50. 1303 1304 Collective 1305 1306 Input Parameters: 1307 + comm - MPI communicator, set to PETSC_COMM_SELF 1308 . m - number of rows 1309 . n - number of columns 1310 . nz - number of nonzeros per row (same for all rows) 1311 - nnz - array containing the number of nonzeros in the various rows 1312 (possibly different for each row) or NULL 1313 1314 Output Parameter: 1315 . A - the matrix 1316 1317 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1318 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1319 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1320 1321 Notes: 1322 If nnz is given then nz is ignored 1323 1324 The AIJ format (also called the Yale sparse matrix format or 1325 compressed row storage), is fully compatible with standard Fortran 77 1326 storage. That is, the stored row and column indices can begin at 1327 either one (as in Fortran) or zero. See the users' manual for details. 1328 1329 Specify the preallocated storage with either nz or nnz (not both). 1330 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1331 allocation. For large problems you MUST preallocate memory or you 1332 will get TERRIBLE performance, see the users' manual chapter on matrices. 1333 1334 By default, this format uses inodes (identical nodes) when possible, to 1335 improve numerical efficiency of matrix-vector products and solves. We 1336 search for consecutive rows with the same nonzero structure, thereby 1337 reusing matrix information to achieve increased efficiency. 1338 1339 Level: intermediate 1340 1341 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1342 @*/ 1343 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1344 { 1345 PetscErrorCode ierr; 1346 1347 PetscFunctionBegin; 1348 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1349 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1350 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1351 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1352 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1353 PetscFunctionReturn(0); 1354 } 1355 1356 typedef Kokkos::TeamPolicy<>::member_type team_member; 1357 // 1358 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 1359 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1360 // 1361 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1362 { 1363 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1364 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1365 IS isrow = b->row,isicol = b->icol; 1366 PetscErrorCode ierr; 1367 const PetscInt *r_h,*ic_h; 1368 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1369 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1370 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1371 PetscBool row_identity,col_identity; 1372 PetscInt nc, Nf, nVec=32; // should be a parameter 1373 PetscContainer container; 1374 1375 PetscFunctionBegin; 1376 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1377 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1378 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1379 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 1380 if (container) { 1381 PetscInt *pNf=NULL, nv; 1382 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 1383 Nf = (*pNf)%1000; 1384 nv = (*pNf)/1000; 1385 if (nv>0) nVec = nv; 1386 } else Nf = 1; 1387 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %" PetscInt_FMT " %" PetscInt_FMT,n,Nf); 1388 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1389 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1390 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1391 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1392 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1393 { 1394 #define KOKKOS_SHARED_LEVEL 1 1395 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1396 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1397 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1398 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1399 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1400 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1401 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1402 size_t flops_h = 0.0; 1403 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1404 Kokkos::View<size_t> d_flops_k ("flops"); 1405 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1406 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1407 Kokkos::deep_copy (d_flops_k, h_flops_k); 1408 Kokkos::deep_copy (d_r_k, h_r_k); 1409 Kokkos::deep_copy (d_ic_k, h_ic_k); 1410 // Fill A --> fact 1411 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1412 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1413 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1414 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1415 // zero rows of B 1416 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1417 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1418 PetscScalar *baL = ba_d + bi_d[rowb]; 1419 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1420 /* zero (unfactored row) */ 1421 for (int j=0;j<nzbL;j++) baL[j] = 0; 1422 for (int j=0;j<nzbU;j++) baU[j] = 0; 1423 }); 1424 // copy A into B 1425 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1426 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1427 const PetscScalar *av = aa_d + ai_d[rowa]; 1428 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1429 /* load in initial (unfactored row) */ 1430 for (int j=0;j<nza;j++) { 1431 PetscInt colb = ic[ajtmp[j]]; 1432 PetscScalar vala = av[j]; 1433 if (colb == rowb) { 1434 *(ba_d + bdiag_d[rowb]) = vala; 1435 } else { 1436 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1437 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1438 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1439 for (int j=0; j<nz ; j++) { 1440 if (pbj[j] == colb) { 1441 pba[j] = vala; 1442 set++; 1443 break; 1444 } 1445 } 1446 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1447 } 1448 } 1449 }); 1450 }); 1451 Kokkos::fence(); 1452 1453 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1454 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1455 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1456 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1457 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1458 const PetscInt start = field*nloc, end = start + nloc; 1459 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1460 // A22 panel update for each row A(1,:) and col A(:,1) 1461 for (int ii=start; ii<end-1; ii++) { 1462 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1463 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1464 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1465 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1466 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1467 PetscInt kIdx = j*Ni + field_block_idx; 1468 if (kIdx >= nzUi) /* void */ ; 1469 else { 1470 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1471 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1472 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1473 size_t st_idx; 1474 // find and do L(k,i) = A(:k,i) / A(i,i) 1475 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1476 // get column, there has got to be a better way 1477 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1478 if (pjL[j] == ii) { 1479 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1480 idx = j; // output 1481 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1482 } 1483 }, st_idx); 1484 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1485 #if defined(PETSC_USE_DEBUG) 1486 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1487 #endif 1488 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1489 // U(i+1,:end) 1490 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1491 PetscScalar Uij = baUi[uiIdx]; 1492 PetscInt col = bjUi[uiIdx]; 1493 if (col==myk) { 1494 // A_kk = A_kk - L_ki * U_ij(k) 1495 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1496 *Akkv = *Akkv - L_ki() * Uij; // UiK 1497 } else { 1498 PetscScalar *start, *end, *pAkjv=NULL; 1499 PetscInt high, low; 1500 const PetscInt *startj; 1501 if (col<myk) { // L 1502 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1503 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1504 start = pLki+1; // start at pLki+1, A22(myk,1) 1505 startj= bj_d + bi_d[myk] + idx; 1506 end = ba_d + bi_d[myk+1]; 1507 } else { 1508 PetscInt idx = bdiag_d[myk+1]+1; 1509 start = ba_d + idx; 1510 startj= bj_d + idx; 1511 end = ba_d + bdiag_d[myk]; 1512 } 1513 // search for 'col', use bisection search - TODO 1514 low = 0; 1515 high = (PetscInt)(end-start); 1516 while (high-low > 5) { 1517 int t = (low+high)/2; 1518 if (startj[t] > col) high = t; 1519 else low = t; 1520 } 1521 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1522 if (startj[pAkjv-start] == col) break; 1523 } 1524 #if defined(PETSC_USE_DEBUG) 1525 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1526 #endif 1527 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1528 } 1529 }); 1530 } 1531 }); 1532 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1533 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1534 } /* endof for (i=0; i<n; i++) { */ 1535 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1536 }); 1537 Kokkos::fence(); 1538 Kokkos::deep_copy (h_flops_k, d_flops_k); 1539 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1540 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1541 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1542 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1543 /* Invert diagonal for simpler triangular solves */ 1544 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1545 int i = start + outer_index*Ni + lg_rank%Ni; 1546 if (i < end) { 1547 PetscScalar *pv = ba_d + bdiag_d[i]; 1548 *pv = 1.0/(*pv); 1549 } 1550 }); 1551 }); 1552 } 1553 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1554 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1555 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1556 1557 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1558 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1559 if (b->inode.size) { 1560 B->ops->solve = MatSolve_SeqAIJ_Inode; 1561 } else if (row_identity && col_identity) { 1562 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1563 } else { 1564 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1565 } 1566 B->offloadmask = PETSC_OFFLOAD_GPU; 1567 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1568 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1569 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1570 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1571 B->ops->matsolve = MatMatSolve_SeqAIJ; 1572 B->assembled = PETSC_TRUE; 1573 B->preallocated = PETSC_TRUE; 1574 1575 PetscFunctionReturn(0); 1576 } 1577 1578 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1579 { 1580 PetscErrorCode ierr; 1581 1582 PetscFunctionBegin; 1583 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1584 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1585 PetscFunctionReturn(0); 1586 } 1587 1588 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1589 { 1590 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1591 1592 PetscFunctionBegin; 1593 if (!factors->sptrsv_symbolic_completed) { 1594 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1595 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1596 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1597 } 1598 PetscFunctionReturn(0); 1599 } 1600 1601 /* Check if we need to update factors etc for transpose solve */ 1602 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1603 { 1604 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1605 MatColIdxType n = A->rmap->n; 1606 1607 PetscFunctionBegin; 1608 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1609 /* Update L^T and do sptrsv symbolic */ 1610 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1611 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1612 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1613 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1614 1615 KokkosKernels::Impl::transpose_matrix< 1616 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1617 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1618 MatRowMapKokkosView,DefaultExecutionSpace>( 1619 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1620 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1621 1622 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1623 We have to sort the indices, until KK provides finer control options. 1624 */ 1625 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1626 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1627 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1628 1629 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1630 1631 /* Update U^T and do sptrsv symbolic */ 1632 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1633 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1634 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1635 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1636 1637 KokkosKernels::Impl::transpose_matrix< 1638 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1639 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1640 MatRowMapKokkosView,DefaultExecutionSpace>( 1641 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1642 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1643 1644 /* Sort indices. See comments above */ 1645 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1646 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1647 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1648 1649 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1650 factors->transpose_updated = PETSC_TRUE; 1651 } 1652 PetscFunctionReturn(0); 1653 } 1654 1655 /* Solve Ax = b, with A = LU */ 1656 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1657 { 1658 PetscErrorCode ierr; 1659 ConstPetscScalarKokkosView bv; 1660 PetscScalarKokkosView xv; 1661 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1662 1663 PetscFunctionBegin; 1664 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1665 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1666 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1667 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1668 /* Solve L tmpv = b */ 1669 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1670 /* Solve Ux = tmpv */ 1671 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1672 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1673 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1674 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1675 PetscFunctionReturn(0); 1676 } 1677 1678 /* Solve A^T x = b, where A^T = U^T L^T */ 1679 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1680 { 1681 PetscErrorCode ierr; 1682 ConstPetscScalarKokkosView bv; 1683 PetscScalarKokkosView xv; 1684 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1685 1686 PetscFunctionBegin; 1687 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1688 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1689 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1690 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1691 /* Solve U^T tmpv = b */ 1692 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1693 1694 /* Solve L^T x = tmpv */ 1695 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1696 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1697 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1698 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1699 PetscFunctionReturn(0); 1700 } 1701 1702 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1703 { 1704 PetscErrorCode ierr; 1705 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1706 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1707 PetscInt fill_lev = info->levels; 1708 1709 PetscFunctionBegin; 1710 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1711 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1712 1713 auto a_d = aijkok->a_dual.view_device(); 1714 auto i_d = aijkok->i_dual.view_device(); 1715 auto j_d = aijkok->j_dual.view_device(); 1716 1717 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1718 1719 B->assembled = PETSC_TRUE; 1720 B->preallocated = PETSC_TRUE; 1721 B->ops->solve = MatSolve_SeqAIJKokkos; 1722 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1723 B->ops->matsolve = NULL; 1724 B->ops->matsolvetranspose = NULL; 1725 B->offloadmask = PETSC_OFFLOAD_GPU; 1726 1727 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1728 factors->transpose_updated = PETSC_FALSE; 1729 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1730 /* TODO: log flops, but how to know that? */ 1731 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1732 PetscFunctionReturn(0); 1733 } 1734 1735 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1736 { 1737 PetscErrorCode ierr; 1738 Mat_SeqAIJKokkos *aijkok; 1739 Mat_SeqAIJ *b; 1740 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1741 PetscInt fill_lev = info->levels; 1742 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1743 PetscInt n = A->rmap->n; 1744 1745 PetscFunctionBegin; 1746 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1747 /* Rebuild factors */ 1748 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1749 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1750 1751 /* Create a spiluk handle and then do symbolic factorization */ 1752 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1753 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1754 1755 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1756 1757 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1758 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1759 Kokkos::realloc(factors->iU_d,n+1); 1760 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1761 1762 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1763 auto i_d = aijkok->i_dual.view_device(); 1764 auto j_d = aijkok->j_dual.view_device(); 1765 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1766 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1767 1768 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1769 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1770 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1771 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1772 1773 /* TODO: add options to select sptrsv algorithms */ 1774 /* Create sptrsv handles for L, U and their transpose */ 1775 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1776 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1777 #else 1778 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1779 #endif 1780 1781 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1782 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1783 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1784 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1785 1786 /* Fill fields of the factor matrix B */ 1787 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1788 b = (Mat_SeqAIJ*)B->data; 1789 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1790 B->info.fill_ratio_given = info->fill; 1791 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1792 1793 B->offloadmask = PETSC_OFFLOAD_GPU; 1794 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1795 PetscFunctionReturn(0); 1796 } 1797 1798 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1799 { 1800 PetscErrorCode ierr; 1801 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1802 const PetscInt nrows = A->rmap->n; 1803 1804 PetscFunctionBegin; 1805 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1806 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1807 // move B data into Kokkos 1808 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1809 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1810 { 1811 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1812 if (!baijkok->diag_d) { 1813 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1814 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1815 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1816 } 1817 } 1818 PetscFunctionReturn(0); 1819 } 1820 1821 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1822 { 1823 PetscFunctionBegin; 1824 *type = MATSOLVERKOKKOS; 1825 PetscFunctionReturn(0); 1826 } 1827 1828 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1829 { 1830 PetscFunctionBegin; 1831 *type = MATSOLVERKOKKOSDEVICE; 1832 PetscFunctionReturn(0); 1833 } 1834 1835 /*MC 1836 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1837 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1838 1839 Level: beginner 1840 1841 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1842 M*/ 1843 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1844 { 1845 PetscErrorCode ierr; 1846 PetscInt n = A->rmap->n; 1847 1848 PetscFunctionBegin; 1849 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1850 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1851 (*B)->factortype = ftype; 1852 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1853 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1854 1855 if (ftype == MAT_FACTOR_LU) { 1856 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1857 (*B)->canuseordering = PETSC_TRUE; 1858 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1859 } else if (ftype == MAT_FACTOR_ILU) { 1860 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1861 (*B)->canuseordering = PETSC_FALSE; 1862 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1863 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1864 1865 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1866 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1867 PetscFunctionReturn(0); 1868 } 1869 1870 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1871 { 1872 PetscErrorCode ierr; 1873 PetscInt n = A->rmap->n; 1874 1875 PetscFunctionBegin; 1876 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1877 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1878 (*B)->factortype = ftype; 1879 (*B)->canuseordering = PETSC_TRUE; 1880 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1881 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1882 1883 if (ftype == MAT_FACTOR_LU) { 1884 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1885 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1886 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1887 1888 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1889 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1890 PetscFunctionReturn(0); 1891 } 1892 1893 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1894 { 1895 PetscErrorCode ierr; 1896 1897 PetscFunctionBegin; 1898 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1899 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1900 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1901 PetscFunctionReturn(0); 1902 } 1903 1904 /* Utility to print out a KokkosCsrMatrix for debugging */ 1905 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1906 { 1907 PetscErrorCode ierr; 1908 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1909 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1910 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1911 const PetscInt *i = iv.data(); 1912 const PetscInt *j = jv.data(); 1913 const PetscScalar *a = av.data(); 1914 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1915 1916 PetscFunctionBegin; 1917 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1918 for (PetscInt k=0; k<m; k++) { 1919 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1920 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1921 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1922 } 1923 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1924 } 1925 PetscFunctionReturn(0); 1926 } 1927