1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/aij.h> 19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 20 21 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 22 23 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 24 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 25 In the latter case, it is important to set a_dual's sync state correctly. 26 */ 27 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 28 { 29 PetscErrorCode ierr; 30 Mat_SeqAIJ *aijseq; 31 Mat_SeqAIJKokkos *aijkok; 32 33 PetscFunctionBegin; 34 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 35 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 36 37 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 38 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 39 40 /* If aijkok does not exist, we just copy i, j to device. 41 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 42 In both cases, we build a new aijkok structure. 43 */ 44 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 45 delete aijkok; 46 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 47 A->spptr = aijkok; 48 } 49 50 if (aijkok && aijkok->device_mat_d.data()) { 51 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 52 } 53 PetscFunctionReturn(0); 54 } 55 56 /* Sync CSR data to device if not yet */ 57 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 58 { 59 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 60 61 PetscFunctionBegin; 62 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 63 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 64 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 65 if (aijkok->a_dual.need_sync_device()) { 66 aijkok->a_dual.sync_device(); 67 aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 68 aijkok->hermitian_updated = PETSC_FALSE; 69 } 70 PetscFunctionReturn(0); 71 } 72 73 /* Mark the CSR data on device as modified */ 74 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 75 { 76 PetscErrorCode ierr; 77 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 78 79 PetscFunctionBegin; 80 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 81 aijkok->a_dual.clear_sync_state(); 82 aijkok->a_dual.modify_device(); 83 aijkok->transpose_updated = PETSC_FALSE; 84 aijkok->hermitian_updated = PETSC_FALSE; 85 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 86 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 87 PetscFunctionReturn(0); 88 } 89 90 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 91 { 92 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 93 94 PetscFunctionBegin; 95 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 96 /* We do not expect one needs factors on host */ 97 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 98 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 99 aijkok->a_dual.sync_host(); 100 PetscFunctionReturn(0); 101 } 102 103 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 104 { 105 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 106 107 PetscFunctionBegin; 108 if (aijkok) { 109 aijkok->a_dual.sync_host(); 110 *array = aijkok->a_dual.view_host().data(); 111 } else { /* Happens when calling MatSetValues on a newly created matrix */ 112 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 113 } 114 PetscFunctionReturn(0); 115 } 116 117 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 118 { 119 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 120 121 PetscFunctionBegin; 122 if (aijkok) aijkok->a_dual.modify_host(); 123 PetscFunctionReturn(0); 124 } 125 126 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 127 { 128 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 129 130 PetscFunctionBegin; 131 if (aijkok) { 132 aijkok->a_dual.sync_host(); 133 *array = aijkok->a_dual.view_host().data(); 134 } else { 135 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 136 } 137 PetscFunctionReturn(0); 138 } 139 140 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 141 { 142 PetscFunctionBegin; 143 *array = NULL; 144 PetscFunctionReturn(0); 145 } 146 147 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 148 { 149 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 150 151 PetscFunctionBegin; 152 if (aijkok) { 153 *array = aijkok->a_dual.view_host().data(); 154 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 155 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 156 } 157 PetscFunctionReturn(0); 158 } 159 160 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 161 { 162 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 163 164 PetscFunctionBegin; 165 if (aijkok) { 166 aijkok->a_dual.clear_sync_state(); 167 aijkok->a_dual.modify_host(); 168 } 169 PetscFunctionReturn(0); 170 } 171 172 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 173 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 174 { 175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 176 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 177 178 PetscFunctionBegin; 179 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 180 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 181 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 182 PetscFunctionReturn(0); 183 } 184 185 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 186 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 187 { 188 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 189 190 PetscFunctionBegin; 191 if (aijkok && aijkok->device_mat_d.data()) { 192 *d_mat = aijkok->device_mat_d.data(); 193 } else { 194 PetscErrorCode ierr; 195 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 196 *d_mat = NULL; 197 } 198 PetscFunctionReturn(0); 199 } 200 201 /* Generate the transpose on device and cache it internally */ 202 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 203 { 204 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 205 206 PetscFunctionBegin; 207 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 208 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 209 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 210 CHKERRCXX(aijkok->a_dual.sync_device()); 211 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 212 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 213 aijkok->transpose_updated = PETSC_TRUE; 214 } 215 *csrmatT = &aijkok->csrmatT; 216 PetscFunctionReturn(0); 217 } 218 219 /* Generate the Hermitian on device and cache it internally */ 220 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 221 { 222 PetscErrorCode ierr; 223 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 224 225 PetscFunctionBegin; 226 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 227 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 228 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 229 CHKERRCXX(aijkok->a_dual.sync_device()); 230 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 231 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 232 #if defined(PETSC_USE_COMPLEX) 233 const auto& a = aijkok->csrmatH.values; 234 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 235 #endif 236 aijkok->hermitian_updated = PETSC_TRUE; 237 } 238 *csrmatH = &aijkok->csrmatH; 239 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 240 PetscFunctionReturn(0); 241 } 242 243 /* y = A x */ 244 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 245 { 246 PetscErrorCode ierr; 247 Mat_SeqAIJKokkos *aijkok; 248 ConstPetscScalarKokkosView xv; 249 PetscScalarKokkosView yv; 250 251 PetscFunctionBegin; 252 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 253 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 254 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 255 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 256 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 257 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 258 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 259 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 260 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 261 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 262 PetscFunctionReturn(0); 263 } 264 265 /* y = A^T x */ 266 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 267 { 268 PetscErrorCode ierr; 269 Mat_SeqAIJKokkos *aijkok; 270 const char *mode; 271 ConstPetscScalarKokkosView xv; 272 PetscScalarKokkosView yv; 273 KokkosCsrMatrix *csrmat; 274 275 PetscFunctionBegin; 276 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 277 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 278 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 279 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 280 if (A->form_explicit_transpose) { 281 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 282 mode = "N"; 283 } else { 284 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 285 csrmat = &aijkok->csrmat; 286 mode = "T"; 287 } 288 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 289 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 290 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 291 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 292 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 293 PetscFunctionReturn(0); 294 } 295 296 /* y = A^H x */ 297 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 298 { 299 PetscErrorCode ierr; 300 Mat_SeqAIJKokkos *aijkok; 301 const char *mode; 302 ConstPetscScalarKokkosView xv; 303 PetscScalarKokkosView yv; 304 KokkosCsrMatrix *csrmat; 305 306 PetscFunctionBegin; 307 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 308 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 309 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 310 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 311 if (A->form_explicit_transpose) { 312 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 313 mode = "N"; 314 } else { 315 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 316 csrmat = &aijkok->csrmat; 317 mode = "C"; 318 } 319 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 320 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 321 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 322 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 323 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 324 PetscFunctionReturn(0); 325 } 326 327 /* z = A x + y */ 328 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 329 { 330 PetscErrorCode ierr; 331 Mat_SeqAIJKokkos *aijkok; 332 ConstPetscScalarKokkosView xv,yv; 333 PetscScalarKokkosView zv; 334 335 PetscFunctionBegin; 336 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 337 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 338 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 339 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 340 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 341 if (zz != yy) Kokkos::deep_copy(zv,yv); 342 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 343 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 344 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 346 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 347 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 349 PetscFunctionReturn(0); 350 } 351 352 /* z = A^T x + y */ 353 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 354 { 355 PetscErrorCode ierr; 356 Mat_SeqAIJKokkos *aijkok; 357 const char *mode; 358 ConstPetscScalarKokkosView xv,yv; 359 PetscScalarKokkosView zv; 360 KokkosCsrMatrix *csrmat; 361 362 PetscFunctionBegin; 363 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 364 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 365 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 366 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 367 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 368 if (zz != yy) Kokkos::deep_copy(zv,yv); 369 if (A->form_explicit_transpose) { 370 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 371 mode = "N"; 372 } else { 373 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 374 csrmat = &aijkok->csrmat; 375 mode = "T"; 376 } 377 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 378 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 379 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 380 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 381 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 382 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 /* z = A^H x + y */ 387 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 388 { 389 PetscErrorCode ierr; 390 Mat_SeqAIJKokkos *aijkok; 391 const char *mode; 392 ConstPetscScalarKokkosView xv,yv; 393 PetscScalarKokkosView zv; 394 KokkosCsrMatrix *csrmat; 395 396 PetscFunctionBegin; 397 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 398 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 399 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 400 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 401 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 402 if (zz != yy) Kokkos::deep_copy(zv,yv); 403 if (A->form_explicit_transpose) { 404 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 405 mode = "N"; 406 } else { 407 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 408 csrmat = &aijkok->csrmat; 409 mode = "C"; 410 } 411 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 412 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 413 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 414 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 415 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 416 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 417 PetscFunctionReturn(0); 418 } 419 420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 421 { 422 PetscErrorCode ierr; 423 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 424 425 PetscFunctionBegin; 426 switch (op) { 427 case MAT_FORM_EXPLICIT_TRANSPOSE: 428 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 429 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 430 A->form_explicit_transpose = flg; 431 break; 432 default: 433 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 434 break; 435 } 436 PetscFunctionReturn(0); 437 } 438 439 /* Depending on reuse, either build a new mat, or use the existing mat */ 440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 441 { 442 PetscErrorCode ierr; 443 Mat_SeqAIJ *aseq; 444 445 PetscFunctionBegin; 446 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 447 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 448 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 449 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 450 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 451 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 452 if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 453 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 454 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 455 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 456 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 457 aseq = static_cast<Mat_SeqAIJ*>(A->data); 458 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 459 if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 460 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 461 } 462 } 463 PetscFunctionReturn(0); 464 } 465 466 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 467 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 468 */ 469 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 470 { 471 PetscErrorCode ierr; 472 Mat_SeqAIJ *bseq; 473 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 474 Mat mat; 475 476 PetscFunctionBegin; 477 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 478 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 479 mat = *B; 480 if (A->assembled) { 481 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 482 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 483 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 484 /* Now copy values to B if needed */ 485 if (dupOption == MAT_COPY_VALUES) { 486 if (akok->a_dual.need_sync_device()) { 487 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 488 bkok->a_dual.modify_host(); 489 } else { /* If device has the latest data, we only copy data on device */ 490 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 491 bkok->a_dual.modify_device(); 492 } 493 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 494 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 495 bkok->a_dual.modify_host(); 496 } 497 mat->spptr = bkok; 498 } 499 500 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 501 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 502 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 503 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 504 PetscFunctionReturn(0); 505 } 506 507 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 508 { 509 PetscErrorCode ierr; 510 Mat At; 511 KokkosCsrMatrix *internT,*csrmatT; 512 Mat_SeqAIJKokkos *atkok,*bkok; 513 514 PetscFunctionBegin; 515 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 516 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 517 CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 518 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 519 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 520 if (reuse == MAT_INITIAL_MATRIX) *B = At; 521 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 522 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 523 if ((*B)->assembled) { 524 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 525 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 526 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 527 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 528 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 529 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 530 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 531 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 532 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 533 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 534 } 535 PetscFunctionReturn(0); 536 } 537 538 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 539 { 540 PetscErrorCode ierr; 541 Mat_SeqAIJKokkos *aijkok; 542 543 PetscFunctionBegin; 544 if (A->factortype == MAT_FACTOR_NONE) { 545 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 546 if (aijkok) { 547 if (aijkok->device_mat_d.data()) { 548 delete aijkok->colmap_d; 549 delete aijkok->i_uncompressed_d; 550 } 551 if (aijkok->diag_d) delete aijkok->diag_d; 552 } 553 delete aijkok; 554 } else { 555 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 556 } 557 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 558 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 559 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 560 A->spptr = NULL; 561 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 562 PetscFunctionReturn(0); 563 } 564 565 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 566 { 567 PetscErrorCode ierr; 568 569 PetscFunctionBegin; 570 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 571 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 572 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 573 PetscFunctionReturn(0); 574 } 575 576 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 577 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 578 { 579 PetscErrorCode ierr; 580 Mat_SeqAIJ *a,*b; 581 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 582 MatScalarKokkosView aa,ba,ca; 583 MatRowMapKokkosView ai,bi,ci; 584 MatColIdxKokkosView aj,bj,cj; 585 PetscInt m,n,nnz,aN; 586 587 PetscFunctionBegin; 588 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 589 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 590 PetscValidPointer(C,4); 591 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 592 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 593 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 594 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 595 596 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 597 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 598 a = static_cast<Mat_SeqAIJ*>(A->data); 599 b = static_cast<Mat_SeqAIJ*>(B->data); 600 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 601 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 602 aa = akok->a_dual.view_device(); 603 ai = akok->i_dual.view_device(); 604 ba = bkok->a_dual.view_device(); 605 bi = bkok->i_dual.view_device(); 606 m = A->rmap->n; /* M, N and nnz of C */ 607 n = A->cmap->n + B->cmap->n; 608 nnz = a->nz + b->nz; 609 aN = A->cmap->n; /* N of A */ 610 if (reuse == MAT_INITIAL_MATRIX) { 611 aj = akok->j_dual.view_device(); 612 bj = bkok->j_dual.view_device(); 613 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 614 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 615 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 616 ca = ca_dual.view_device(); 617 ci = ci_dual.view_device(); 618 cj = cj_dual.view_device(); 619 620 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 621 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 622 PetscInt i = t.league_rank(); /* row i */ 623 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 624 625 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 626 ci(i) = coffset; 627 if (i == m-1) ci(m) = ai(m) + bi(m); 628 }); 629 630 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 631 if (k < alen) { 632 ca(coffset+k) = aa(ai(i)+k); 633 cj(coffset+k) = aj(ai(i)+k); 634 } else { 635 ca(coffset+k) = ba(bi(i)+k-alen); 636 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 637 } 638 }); 639 }); 640 ca_dual.modify_device(); 641 ci_dual.modify_device(); 642 cj_dual.modify_device(); 643 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 644 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 645 } else if (reuse == MAT_REUSE_MATRIX) { 646 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 647 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 648 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 649 ca = ckok->a_dual.view_device(); 650 ci = ckok->i_dual.view_device(); 651 652 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 653 PetscInt i = t.league_rank(); /* row i */ 654 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 655 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 656 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 657 else ca(ci(i)+k) = ba(bi(i)+k-alen); 658 }); 659 }); 660 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 661 } 662 PetscFunctionReturn(0); 663 } 664 665 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 666 { 667 PetscFunctionBegin; 668 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 669 PetscFunctionReturn(0); 670 } 671 672 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 673 { 674 PetscErrorCode ierr; 675 Mat_Product *product = C->product; 676 Mat A,B; 677 bool transA,transB; /* use bool, since KK needs this type */ 678 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 679 Mat_SeqAIJ *c; 680 MatProductData_SeqAIJKokkos *pdata; 681 KokkosCsrMatrix *csrmatA,*csrmatB; 682 683 PetscFunctionBegin; 684 MatCheckProduct(C,1); 685 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 686 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 687 688 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 689 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 690 PetscFunctionReturn(0); 691 } 692 693 switch (product->type) { 694 case MATPRODUCT_AB: transA = false; transB = false; break; 695 case MATPRODUCT_AtB: transA = true; transB = false; break; 696 case MATPRODUCT_ABt: transA = false; transB = true; break; 697 default: 698 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 699 } 700 701 A = product->A; 702 B = product->B; 703 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 704 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 705 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 706 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 707 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 708 709 if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 710 711 csrmatA = &akok->csrmat; 712 csrmatB = &bkok->csrmat; 713 714 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 715 if (transA) { 716 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 717 transA = false; 718 } 719 720 if (transB) { 721 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 722 transB = false; 723 } 724 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 725 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 726 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 727 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 728 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 729 /* shorter version of MatAssemblyEnd_SeqAIJ */ 730 c = (Mat_SeqAIJ*)C->data; 731 ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 732 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 733 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 734 c->reallocs = 0; 735 C->info.mallocs = 0; 736 C->info.nz_unneeded = 0; 737 C->assembled = C->was_assembled = PETSC_TRUE; 738 C->num_ass++; 739 PetscFunctionReturn(0); 740 } 741 742 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 743 { 744 PetscErrorCode ierr; 745 Mat_Product *product = C->product; 746 MatProductType ptype; 747 Mat A,B; 748 bool transA,transB; 749 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 750 MatProductData_SeqAIJKokkos *pdata; 751 MPI_Comm comm; 752 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 753 754 PetscFunctionBegin; 755 MatCheckProduct(C,1); 756 ierr = PetscObjectGetComm((PetscObject)C,&comm); 757 if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 758 A = product->A; 759 B = product->B; 760 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 761 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 762 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 763 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 764 csrmatA = &akok->csrmat; 765 csrmatB = &bkok->csrmat; 766 767 ptype = product->type; 768 switch (ptype) { 769 case MATPRODUCT_AB: transA = false; transB = false; break; 770 case MATPRODUCT_AtB: transA = true; transB = false; break; 771 case MATPRODUCT_ABt: transA = false; transB = true; break; 772 default: 773 SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 774 } 775 776 product->data = pdata = new MatProductData_SeqAIJKokkos(); 777 pdata->kh.set_team_work_size(16); 778 pdata->kh.set_dynamic_scheduling(true); 779 pdata->reusesym = product->api_user; 780 781 /* TODO: add command line options to select spgemm algorithms */ 782 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 783 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 784 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 785 */ 786 pdata->kh.create_spgemm_handle(spgemm_alg); 787 788 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 789 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 790 if (transA) { 791 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 792 transA = false; 793 } 794 795 if (transB) { 796 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 797 transB = false; 798 } 799 800 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 801 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 802 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 803 calling new Mat_SeqAIJKokkos(). 804 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 805 */ 806 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 807 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 808 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 809 810 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 811 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 812 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 813 PetscFunctionReturn(0); 814 } 815 816 /* handles sparse matrix matrix ops */ 817 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 818 { 819 PetscErrorCode ierr; 820 Mat_Product *product = mat->product; 821 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 822 823 PetscFunctionBegin; 824 MatCheckProduct(mat,1); 825 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 826 if (product->type == MATPRODUCT_ABC) { 827 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 828 } 829 if (Biskok && Ciskok) { 830 switch (product->type) { 831 case MATPRODUCT_AB: 832 case MATPRODUCT_AtB: 833 case MATPRODUCT_ABt: 834 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 835 break; 836 case MATPRODUCT_PtAP: 837 case MATPRODUCT_RARt: 838 case MATPRODUCT_ABC: 839 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 840 break; 841 default: 842 break; 843 } 844 } else { /* fallback for AIJ */ 845 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 846 } 847 PetscFunctionReturn(0); 848 } 849 850 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 851 { 852 PetscErrorCode ierr; 853 Mat_SeqAIJKokkos *aijkok; 854 855 PetscFunctionBegin; 856 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 857 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 858 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 859 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 860 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 861 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 862 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 863 PetscFunctionReturn(0); 864 } 865 866 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 867 { 868 PetscErrorCode ierr; 869 Mat_SeqAIJKokkos *aijkok; 870 871 PetscFunctionBegin; 872 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 873 if (aijkok) { /* Only zero the device if data is already there */ 874 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 875 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 876 } else { /* Might be preallocated but not assembled */ 877 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 878 } 879 PetscFunctionReturn(0); 880 } 881 882 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 883 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 884 { 885 PetscErrorCode ierr; 886 Mat_SeqAIJKokkos *aijkok; 887 888 PetscFunctionBegin; 889 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 890 PetscValidPointer(kv,2); 891 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 892 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 893 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 894 *kv = aijkok->a_dual.view_device(); 895 PetscFunctionReturn(0); 896 } 897 898 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 899 { 900 PetscFunctionBegin; 901 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 902 PetscValidPointer(kv,2); 903 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 904 PetscFunctionReturn(0); 905 } 906 907 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 908 { 909 PetscErrorCode ierr; 910 Mat_SeqAIJKokkos *aijkok; 911 912 PetscFunctionBegin; 913 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 914 PetscValidPointer(kv,2); 915 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 916 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 917 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 918 *kv = aijkok->a_dual.view_device(); 919 PetscFunctionReturn(0); 920 } 921 922 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 923 { 924 PetscErrorCode ierr; 925 926 PetscFunctionBegin; 927 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 928 PetscValidPointer(kv,2); 929 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 930 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 931 PetscFunctionReturn(0); 932 } 933 934 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 935 { 936 Mat_SeqAIJKokkos *aijkok; 937 938 PetscFunctionBegin; 939 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 940 PetscValidPointer(kv,2); 941 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 942 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 943 *kv = aijkok->a_dual.view_device(); 944 PetscFunctionReturn(0); 945 } 946 947 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 948 { 949 PetscErrorCode ierr; 950 951 PetscFunctionBegin; 952 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 953 PetscValidPointer(kv,2); 954 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 955 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 956 PetscFunctionReturn(0); 957 } 958 959 /* Computes Y += alpha X */ 960 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 961 { 962 PetscErrorCode ierr; 963 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 964 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 965 ConstMatScalarKokkosView Xa; 966 MatScalarKokkosView Ya; 967 968 PetscFunctionBegin; 969 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 970 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 971 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 972 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 973 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 974 975 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 976 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 977 PetscBool e; 978 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 979 if (e) { 980 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 981 if (e) pattern = SAME_NONZERO_PATTERN; 982 } 983 } 984 985 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 986 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 987 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 988 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 989 */ 990 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 991 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 992 Xa = xkok->a_dual.view_device(); 993 Ya = ykok->a_dual.view_device(); 994 995 if (pattern == SAME_NONZERO_PATTERN) { 996 KokkosBlas::axpy(alpha,Xa,Ya); 997 ierr = MatSeqAIJKokkosModifyDevice(Y); 998 } else if (pattern == SUBSET_NONZERO_PATTERN) { 999 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 1000 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 1001 1002 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1003 PetscInt i = t.league_rank(); /* row i */ 1004 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1005 PetscInt p,q = Yi(i); 1006 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1007 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1008 if (Xj(p) == Yj(q)) { /* Found it */ 1009 Ya(q) += alpha * Xa(p); 1010 q++; 1011 } else { 1012 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1013 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1014 */ 1015 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1016 } 1017 } 1018 }); 1019 }); 1020 ierr = MatSeqAIJKokkosModifyDevice(Y); 1021 } else { /* different nonzero patterns */ 1022 Mat Z; 1023 KokkosCsrMatrix zcsr; 1024 KernelHandle kh; 1025 kh.create_spadd_handle(false); 1026 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1027 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1028 zkok = new Mat_SeqAIJKokkos(zcsr); 1029 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1030 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1031 kh.destroy_spadd_handle(); 1032 } 1033 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1034 ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1035 PetscFunctionReturn(0); 1036 } 1037 1038 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 1039 { 1040 PetscErrorCode ierr; 1041 MPI_Comm comm; 1042 PetscInt *i,*j,*perm,*jmap; 1043 PetscInt k,M,N,p,q,row,start,end,nnz,nneg; 1044 PetscBool has_repeats = PETSC_FALSE; 1045 PetscInt *Ai,*Aj; 1046 PetscScalar *Aa; 1047 Mat_SeqAIJKokkos *akok; 1048 Mat_SeqAIJ *aseq; 1049 MatRowMapKokkosViewHost perm_h("perm",n),jmap_h; 1050 Mat newmat; 1051 1052 PetscFunctionBegin; 1053 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 1054 ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr); 1055 ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr); 1056 ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */ 1057 ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr); 1058 perm = perm_h.data(); 1059 for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */ 1060 if (j[k] < 0) i[k] = -1; 1061 perm[k] = k; 1062 } 1063 1064 /* Sort by row */ 1065 ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr); 1066 for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */ 1067 nneg = k; 1068 Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */ 1069 nnz = 0; /* Total number of unique nonzeros to be counted */ 1070 jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */ 1071 jmap++; /* Inc jmap by 1 for convinience */ 1072 1073 ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */ 1074 ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */ 1075 1076 /* In each row, sort by column, then unique column indices to get row length */ 1077 Ai++; /* Inc by 1 for convinience */ 1078 q = 0; /* q-th unique nonzero, with q starting from 0 */ 1079 while (k<n) { 1080 row = i[k]; 1081 start = k; /* [start,end) indices for this row */ 1082 while (k<n && i[k] == row) k++; 1083 end = k; 1084 ierr = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr); 1085 /* Find number of unique col entries in this row */ 1086 Aj[q] = j[start]; /* Log the first nonzero in this row */ 1087 jmap[q] = 1; /* Number of repeats of this nozero entry */ 1088 Ai[row] = 1; 1089 nnz++; 1090 1091 for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */ 1092 if (j[p] != j[p-1]) { /* Meet a new nonzero */ 1093 q++; 1094 jmap[q] = 1; 1095 Aj[q] = j[p]; 1096 Ai[row]++; 1097 nnz++; 1098 } else { 1099 jmap[q]++; 1100 has_repeats = PETSC_TRUE; 1101 } 1102 } 1103 q++; /* Move to next row and thus next unique nonzero */ 1104 } 1105 ierr = PetscFree2(i,j);CHKERRQ(ierr); 1106 1107 Ai--; /* Back to the beginning of Ai[] */ 1108 for (k=0; k<M; k++) Ai[k+1] += Ai[k]; 1109 jmap--; /* Back to the beginning of jmap[] */ 1110 jmap[0] = 0; 1111 if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */ 1112 for (k=0; k<nnz; k++) jmap[k+1] += jmap[k]; 1113 Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */ 1114 } 1115 1116 if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */ 1117 PetscInt *Aj_new; 1118 ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr); 1119 ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr); 1120 ierr = PetscFree(Aj);CHKERRQ(ierr); 1121 Aj = Aj_new; 1122 } 1123 1124 ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr); 1125 ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr); 1126 aseq = static_cast<Mat_SeqAIJ*>(newmat->data); 1127 aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */ 1128 aseq->free_a = aseq->free_ij = PETSC_TRUE; 1129 1130 ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 1131 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 1132 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1133 1134 if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */ 1135 MatRowMapKokkosViewHost newperm_h("perm",n-nneg); 1136 Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n))); 1137 perm_h = newperm_h; 1138 } 1139 1140 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1141 akok->SetUpCOO(n,has_repeats,jmap_h,perm_h); 1142 PetscFunctionReturn(0); 1143 } 1144 1145 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1146 { 1147 PetscErrorCode ierr; 1148 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1149 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1150 PetscInt nz = aseq->nz; 1151 const MatRowMapKokkosView& jmap = akok->jmap_d; 1152 const MatRowMapKokkosView& perm = akok->perm_d; 1153 MatScalarKokkosView Aa; 1154 ConstMatScalarKokkosView kv; 1155 PetscMemType memtype; 1156 1157 PetscFunctionBegin; 1158 if (!v) { /* NULL v means an all zero array */ 1159 ierr = MatZeroEntries(A);CHKERRQ(ierr); 1160 PetscFunctionReturn(0); 1161 } 1162 1163 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 1164 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1165 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n)); 1166 } else { 1167 kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */ 1168 } 1169 1170 ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */ 1171 if (imode == INSERT_VALUES) { 1172 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1173 } 1174 1175 if (akok->coo_has_repeats) { 1176 Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) { 1177 for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k)); 1178 }); 1179 } else { 1180 Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));}); 1181 } 1182 ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr); 1183 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1184 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1185 PetscFunctionReturn(0); 1186 } 1187 1188 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1189 { 1190 PetscErrorCode ierr; 1191 1192 PetscFunctionBegin; 1193 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1194 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1195 B->offloadmask = PETSC_OFFLOAD_CPU; 1196 PetscFunctionReturn(0); 1197 } 1198 1199 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1200 { 1201 PetscErrorCode ierr; 1202 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1203 1204 PetscFunctionBegin; 1205 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1206 A->boundtocpu = PETSC_FALSE; 1207 1208 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1209 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1210 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1211 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1212 A->ops->scale = MatScale_SeqAIJKokkos; 1213 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1214 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1215 A->ops->mult = MatMult_SeqAIJKokkos; 1216 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1217 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1218 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1219 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1220 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1221 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1222 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1223 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1224 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1225 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1226 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1227 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1228 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1229 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1230 1231 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 1232 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1233 PetscFunctionReturn(0); 1234 } 1235 1236 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1237 { 1238 PetscErrorCode ierr; 1239 Mat_SeqAIJ *aseq; 1240 PetscInt i,m,n; 1241 1242 PetscFunctionBegin; 1243 if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1244 1245 m = akok->nrows(); 1246 n = akok->ncols(); 1247 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1248 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1249 1250 /* Set up data structures of A as a MATSEQAIJ */ 1251 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1252 aseq = (Mat_SeqAIJ*)(A)->data; 1253 1254 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1255 akok->j_dual.sync_host(); 1256 1257 aseq->i = akok->i_host_data(); 1258 aseq->j = akok->j_host_data(); 1259 aseq->a = akok->a_host_data(); 1260 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1261 aseq->singlemalloc = PETSC_FALSE; 1262 aseq->free_a = PETSC_FALSE; 1263 aseq->free_ij = PETSC_FALSE; 1264 aseq->nz = akok->nnz(); 1265 aseq->maxnz = aseq->nz; 1266 1267 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1268 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1269 for (i=0; i<m; i++) { 1270 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1271 } 1272 1273 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1274 akok->nonzerostate = A->nonzerostate; 1275 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1276 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1277 A->spptr = akok; 1278 PetscFunctionReturn(0); 1279 } 1280 1281 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1282 1283 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1284 */ 1285 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1286 { 1287 PetscErrorCode ierr; 1288 1289 PetscFunctionBegin; 1290 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1291 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1292 PetscFunctionReturn(0); 1293 } 1294 1295 /* --------------------------------------------------------------------------------*/ 1296 /*@C 1297 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1298 (the default parallel PETSc format). This matrix will ultimately be handled by 1299 Kokkos for calculations. For good matrix 1300 assembly performance the user should preallocate the matrix storage by setting 1301 the parameter nz (or the array nnz). By setting these parameters accurately, 1302 performance during matrix assembly can be increased by more than a factor of 50. 1303 1304 Collective 1305 1306 Input Parameters: 1307 + comm - MPI communicator, set to PETSC_COMM_SELF 1308 . m - number of rows 1309 . n - number of columns 1310 . nz - number of nonzeros per row (same for all rows) 1311 - nnz - array containing the number of nonzeros in the various rows 1312 (possibly different for each row) or NULL 1313 1314 Output Parameter: 1315 . A - the matrix 1316 1317 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1318 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1319 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1320 1321 Notes: 1322 If nnz is given then nz is ignored 1323 1324 The AIJ format (also called the Yale sparse matrix format or 1325 compressed row storage), is fully compatible with standard Fortran 77 1326 storage. That is, the stored row and column indices can begin at 1327 either one (as in Fortran) or zero. See the users' manual for details. 1328 1329 Specify the preallocated storage with either nz or nnz (not both). 1330 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1331 allocation. For large problems you MUST preallocate memory or you 1332 will get TERRIBLE performance, see the users' manual chapter on matrices. 1333 1334 By default, this format uses inodes (identical nodes) when possible, to 1335 improve numerical efficiency of matrix-vector products and solves. We 1336 search for consecutive rows with the same nonzero structure, thereby 1337 reusing matrix information to achieve increased efficiency. 1338 1339 Level: intermediate 1340 1341 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1342 @*/ 1343 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1344 { 1345 PetscErrorCode ierr; 1346 1347 PetscFunctionBegin; 1348 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1349 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1350 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1351 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1352 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1353 PetscFunctionReturn(0); 1354 } 1355 1356 typedef Kokkos::TeamPolicy<>::member_type team_member; 1357 // 1358 // This factorization exploits block diagonal matrices with "Nf" (not used). 1359 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1360 // 1361 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1362 { 1363 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1364 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1365 IS isrow = b->row,isicol = b->icol; 1366 PetscErrorCode ierr; 1367 const PetscInt *r_h,*ic_h; 1368 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1369 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1370 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1371 PetscBool row_identity,col_identity; 1372 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1373 1374 PetscFunctionBegin; 1375 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1376 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1377 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1378 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1379 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1380 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1381 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1382 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1383 { 1384 #define KOKKOS_SHARED_LEVEL 1 1385 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1386 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1387 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1388 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1389 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1390 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1391 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1392 size_t flops_h = 0.0; 1393 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1394 Kokkos::View<size_t> d_flops_k ("flops"); 1395 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1396 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1397 Kokkos::deep_copy (d_flops_k, h_flops_k); 1398 Kokkos::deep_copy (d_r_k, h_r_k); 1399 Kokkos::deep_copy (d_ic_k, h_ic_k); 1400 // Fill A --> fact 1401 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1402 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1403 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1404 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1405 // zero rows of B 1406 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1407 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1408 PetscScalar *baL = ba_d + bi_d[rowb]; 1409 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1410 /* zero (unfactored row) */ 1411 for (int j=0;j<nzbL;j++) baL[j] = 0; 1412 for (int j=0;j<nzbU;j++) baU[j] = 0; 1413 }); 1414 // copy A into B 1415 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1416 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1417 const PetscScalar *av = aa_d + ai_d[rowa]; 1418 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1419 /* load in initial (unfactored row) */ 1420 for (int j=0;j<nza;j++) { 1421 PetscInt colb = ic[ajtmp[j]]; 1422 PetscScalar vala = av[j]; 1423 if (colb == rowb) { 1424 *(ba_d + bdiag_d[rowb]) = vala; 1425 } else { 1426 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1427 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1428 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1429 for (int j=0; j<nz ; j++) { 1430 if (pbj[j] == colb) { 1431 pba[j] = vala; 1432 set++; 1433 break; 1434 } 1435 } 1436 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1437 } 1438 } 1439 }); 1440 }); 1441 Kokkos::fence(); 1442 1443 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1444 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1445 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1446 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1447 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1448 const PetscInt start = field*nloc, end = start + nloc; 1449 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1450 // A22 panel update for each row A(1,:) and col A(:,1) 1451 for (int ii=start; ii<end-1; ii++) { 1452 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1453 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1454 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1455 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1456 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1457 PetscInt kIdx = j*Ni + field_block_idx; 1458 if (kIdx >= nzUi) /* void */ ; 1459 else { 1460 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1461 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1462 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1463 size_t st_idx; 1464 // find and do L(k,i) = A(:k,i) / A(i,i) 1465 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1466 // get column, there has got to be a better way 1467 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1468 if (pjL[j] == ii) { 1469 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1470 idx = j; // output 1471 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1472 } 1473 }, st_idx); 1474 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1475 #if defined(PETSC_USE_DEBUG) 1476 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1477 #endif 1478 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1479 // U(i+1,:end) 1480 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1481 PetscScalar Uij = baUi[uiIdx]; 1482 PetscInt col = bjUi[uiIdx]; 1483 if (col==myk) { 1484 // A_kk = A_kk - L_ki * U_ij(k) 1485 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1486 *Akkv = *Akkv - L_ki() * Uij; // UiK 1487 } else { 1488 PetscScalar *start, *end, *pAkjv=NULL; 1489 PetscInt high, low; 1490 const PetscInt *startj; 1491 if (col<myk) { // L 1492 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1493 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1494 start = pLki+1; // start at pLki+1, A22(myk,1) 1495 startj= bj_d + bi_d[myk] + idx; 1496 end = ba_d + bi_d[myk+1]; 1497 } else { 1498 PetscInt idx = bdiag_d[myk+1]+1; 1499 start = ba_d + idx; 1500 startj= bj_d + idx; 1501 end = ba_d + bdiag_d[myk]; 1502 } 1503 // search for 'col', use bisection search - TODO 1504 low = 0; 1505 high = (PetscInt)(end-start); 1506 while (high-low > 5) { 1507 int t = (low+high)/2; 1508 if (startj[t] > col) high = t; 1509 else low = t; 1510 } 1511 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1512 if (startj[pAkjv-start] == col) break; 1513 } 1514 #if defined(PETSC_USE_DEBUG) 1515 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1516 #endif 1517 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1518 } 1519 }); 1520 } 1521 }); 1522 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1523 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1524 } /* endof for (i=0; i<n; i++) { */ 1525 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1526 }); 1527 Kokkos::fence(); 1528 Kokkos::deep_copy (h_flops_k, d_flops_k); 1529 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1530 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1531 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1532 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1533 /* Invert diagonal for simpler triangular solves */ 1534 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1535 int i = start + outer_index*Ni + lg_rank%Ni; 1536 if (i < end) { 1537 PetscScalar *pv = ba_d + bdiag_d[i]; 1538 *pv = 1.0/(*pv); 1539 } 1540 }); 1541 }); 1542 } 1543 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1544 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1545 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1546 1547 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1548 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1549 if (b->inode.size) { 1550 B->ops->solve = MatSolve_SeqAIJ_Inode; 1551 } else if (row_identity && col_identity) { 1552 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1553 } else { 1554 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1555 } 1556 B->offloadmask = PETSC_OFFLOAD_GPU; 1557 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1558 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1559 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1560 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1561 B->ops->matsolve = MatMatSolve_SeqAIJ; 1562 B->assembled = PETSC_TRUE; 1563 B->preallocated = PETSC_TRUE; 1564 1565 PetscFunctionReturn(0); 1566 } 1567 1568 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1569 { 1570 PetscErrorCode ierr; 1571 1572 PetscFunctionBegin; 1573 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1574 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1575 PetscFunctionReturn(0); 1576 } 1577 1578 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1579 { 1580 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1581 1582 PetscFunctionBegin; 1583 if (!factors->sptrsv_symbolic_completed) { 1584 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1585 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1586 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1587 } 1588 PetscFunctionReturn(0); 1589 } 1590 1591 /* Check if we need to update factors etc for transpose solve */ 1592 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1593 { 1594 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1595 MatColIdxType n = A->rmap->n; 1596 1597 PetscFunctionBegin; 1598 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1599 /* Update L^T and do sptrsv symbolic */ 1600 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1601 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1602 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1603 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1604 1605 KokkosKernels::Impl::transpose_matrix< 1606 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1607 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1608 MatRowMapKokkosView,DefaultExecutionSpace>( 1609 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1610 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1611 1612 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1613 We have to sort the indices, until KK provides finer control options. 1614 */ 1615 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1616 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1617 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1618 1619 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1620 1621 /* Update U^T and do sptrsv symbolic */ 1622 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1623 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1624 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1625 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1626 1627 KokkosKernels::Impl::transpose_matrix< 1628 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1629 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1630 MatRowMapKokkosView,DefaultExecutionSpace>( 1631 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1632 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1633 1634 /* Sort indices. See comments above */ 1635 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1636 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1637 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1638 1639 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1640 factors->transpose_updated = PETSC_TRUE; 1641 } 1642 PetscFunctionReturn(0); 1643 } 1644 1645 /* Solve Ax = b, with A = LU */ 1646 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1647 { 1648 PetscErrorCode ierr; 1649 ConstPetscScalarKokkosView bv; 1650 PetscScalarKokkosView xv; 1651 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1652 1653 PetscFunctionBegin; 1654 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1655 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1656 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1657 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1658 /* Solve L tmpv = b */ 1659 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1660 /* Solve Ux = tmpv */ 1661 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1662 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1663 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1664 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1665 PetscFunctionReturn(0); 1666 } 1667 1668 /* Solve A^T x = b, where A^T = U^T L^T */ 1669 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1670 { 1671 PetscErrorCode ierr; 1672 ConstPetscScalarKokkosView bv; 1673 PetscScalarKokkosView xv; 1674 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1675 1676 PetscFunctionBegin; 1677 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1678 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1679 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1680 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1681 /* Solve U^T tmpv = b */ 1682 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1683 1684 /* Solve L^T x = tmpv */ 1685 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1686 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1687 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1688 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1689 PetscFunctionReturn(0); 1690 } 1691 1692 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1693 { 1694 PetscErrorCode ierr; 1695 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1696 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1697 PetscInt fill_lev = info->levels; 1698 1699 PetscFunctionBegin; 1700 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1701 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1702 1703 auto a_d = aijkok->a_dual.view_device(); 1704 auto i_d = aijkok->i_dual.view_device(); 1705 auto j_d = aijkok->j_dual.view_device(); 1706 1707 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1708 1709 B->assembled = PETSC_TRUE; 1710 B->preallocated = PETSC_TRUE; 1711 B->ops->solve = MatSolve_SeqAIJKokkos; 1712 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1713 B->ops->matsolve = NULL; 1714 B->ops->matsolvetranspose = NULL; 1715 B->offloadmask = PETSC_OFFLOAD_GPU; 1716 1717 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1718 factors->transpose_updated = PETSC_FALSE; 1719 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1720 /* TODO: log flops, but how to know that? */ 1721 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1722 PetscFunctionReturn(0); 1723 } 1724 1725 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1726 { 1727 PetscErrorCode ierr; 1728 Mat_SeqAIJKokkos *aijkok; 1729 Mat_SeqAIJ *b; 1730 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1731 PetscInt fill_lev = info->levels; 1732 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1733 PetscInt n = A->rmap->n; 1734 1735 PetscFunctionBegin; 1736 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1737 /* Rebuild factors */ 1738 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1739 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1740 1741 /* Create a spiluk handle and then do symbolic factorization */ 1742 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1743 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1744 1745 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1746 1747 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1748 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1749 Kokkos::realloc(factors->iU_d,n+1); 1750 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1751 1752 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1753 auto i_d = aijkok->i_dual.view_device(); 1754 auto j_d = aijkok->j_dual.view_device(); 1755 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1756 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1757 1758 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1759 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1760 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1761 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1762 1763 /* TODO: add options to select sptrsv algorithms */ 1764 /* Create sptrsv handles for L, U and their transpose */ 1765 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1766 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1767 #else 1768 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1769 #endif 1770 1771 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1772 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1773 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1774 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1775 1776 /* Fill fields of the factor matrix B */ 1777 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1778 b = (Mat_SeqAIJ*)B->data; 1779 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1780 B->info.fill_ratio_given = info->fill; 1781 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1782 1783 B->offloadmask = PETSC_OFFLOAD_GPU; 1784 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1785 PetscFunctionReturn(0); 1786 } 1787 1788 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1789 { 1790 PetscErrorCode ierr; 1791 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1792 const PetscInt nrows = A->rmap->n; 1793 1794 PetscFunctionBegin; 1795 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1796 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1797 // move B data into Kokkos 1798 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1799 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1800 { 1801 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1802 if (!baijkok->diag_d) { 1803 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1804 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1805 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1806 } 1807 } 1808 PetscFunctionReturn(0); 1809 } 1810 1811 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1812 { 1813 PetscFunctionBegin; 1814 *type = MATSOLVERKOKKOS; 1815 PetscFunctionReturn(0); 1816 } 1817 1818 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1819 { 1820 PetscFunctionBegin; 1821 *type = MATSOLVERKOKKOSDEVICE; 1822 PetscFunctionReturn(0); 1823 } 1824 1825 /*MC 1826 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1827 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1828 1829 Level: beginner 1830 1831 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1832 M*/ 1833 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1834 { 1835 PetscErrorCode ierr; 1836 PetscInt n = A->rmap->n; 1837 1838 PetscFunctionBegin; 1839 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1840 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1841 (*B)->factortype = ftype; 1842 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1843 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1844 1845 if (ftype == MAT_FACTOR_LU) { 1846 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1847 (*B)->canuseordering = PETSC_TRUE; 1848 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1849 } else if (ftype == MAT_FACTOR_ILU) { 1850 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1851 (*B)->canuseordering = PETSC_FALSE; 1852 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1853 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1854 1855 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1856 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1857 PetscFunctionReturn(0); 1858 } 1859 1860 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1861 { 1862 PetscErrorCode ierr; 1863 PetscInt n = A->rmap->n; 1864 1865 PetscFunctionBegin; 1866 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1867 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1868 (*B)->factortype = ftype; 1869 (*B)->canuseordering = PETSC_TRUE; 1870 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1871 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1872 1873 if (ftype == MAT_FACTOR_LU) { 1874 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1875 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1876 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1877 1878 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1879 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1880 PetscFunctionReturn(0); 1881 } 1882 1883 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1884 { 1885 PetscErrorCode ierr; 1886 1887 PetscFunctionBegin; 1888 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1889 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1890 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1891 PetscFunctionReturn(0); 1892 } 1893 1894 /* Utility to print out a KokkosCsrMatrix for debugging */ 1895 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1896 { 1897 PetscErrorCode ierr; 1898 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1899 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1900 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1901 const PetscInt *i = iv.data(); 1902 const PetscInt *j = jv.data(); 1903 const PetscScalar *a = av.data(); 1904 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1905 1906 PetscFunctionBegin; 1907 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1908 for (PetscInt k=0; k<m; k++) { 1909 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1910 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1911 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1912 } 1913 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1914 } 1915 PetscFunctionReturn(0); 1916 } 1917