1 #include <petscvec_kokkos.hpp> 2 #include <petsc/private/petscimpl.h> 3 #include <petscsystypes.h> 4 #include <petscerror.h> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosBlas.hpp> 8 #include <KokkosSparse_CrsMatrix.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <KokkosSparse_spiluk.hpp> 11 #include <KokkosSparse_sptrsv.hpp> 12 13 #include <../src/mat/impls/aij/seq/aij.h> 14 15 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 16 #include <petscmat.h> 17 18 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 19 20 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 21 { 22 PetscErrorCode ierr; 23 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 24 25 PetscFunctionBegin; 26 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 27 A->offloadmask = PETSC_OFFLOAD_CPU; 28 if (aijkok && aijkok->device_mat_d.data()) { 29 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 30 } 31 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 32 PetscFunctionReturn(0); 33 } 34 35 /* Sync CSR data to device if not yet */ 36 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 37 { 38 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ*>(A->data); 39 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 40 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 41 42 PetscFunctionBegin; 43 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 44 /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */ 45 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 46 delete aijkok; 47 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 48 aijkok->nonzerostate = A->nonzerostate; 49 A->spptr = aijkok; 50 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 51 aijkok->a_dual.clear_sync_state(); 52 aijkok->a_dual.modify_host(); /* Mark host is modified */ 53 aijkok->a_dual.sync_device(); /* Sync the device */ 54 aijkok->transpose_updated = PETSC_FALSE; 55 aijkok->hermitian_updated = PETSC_FALSE; 56 } 57 A->offloadmask = PETSC_OFFLOAD_BOTH; 58 PetscFunctionReturn(0); 59 } 60 61 /* Mark the CSR data on device is modified */ 62 static PetscErrorCode MatSeqAIJKokkosSetDeviceModified(Mat A) 63 { 64 PetscErrorCode ierr; 65 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 66 67 PetscFunctionBegin; 68 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 69 aijkok->a_dual.clear_sync_state(); 70 aijkok->a_dual.modify_device(); 71 aijkok->transpose_updated = PETSC_FALSE; 72 aijkok->hermitian_updated = PETSC_FALSE; 73 A->offloadmask = PETSC_OFFLOAD_GPU; 74 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 75 PetscFunctionReturn(0); 76 } 77 78 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 79 { 80 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 81 82 PetscFunctionBegin; 83 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 84 /* We do not expect one needs factors on host */ 85 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 86 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 87 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 88 aijkok->a_dual.clear_sync_state(); 89 aijkok->a_dual.modify_device(); /* Mark device is modified */ 90 aijkok->a_dual.sync_host(); /* Sync the host */ 91 A->offloadmask = PETSC_OFFLOAD_BOTH; 92 } 93 PetscFunctionReturn(0); 94 } 95 96 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 97 { 98 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 99 PetscErrorCode ierr; 100 101 PetscFunctionBegin; 102 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 103 *array = a->a; 104 A->offloadmask = PETSC_OFFLOAD_CPU; 105 PetscFunctionReturn(0); 106 } 107 108 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 109 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat) 110 { 111 Mat_SeqAIJKokkos *aijkok; 112 Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat); 113 114 PetscFunctionBegin; 115 // ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 116 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 117 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 118 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 119 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 120 PetscFunctionReturn(0); 121 } 122 123 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 124 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat) 125 { 126 Mat_SeqAIJKokkos *aijkok; 127 128 PetscFunctionBegin; 129 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 130 if (aijkok && aijkok->device_mat_d.data()) { 131 *d_mat = aijkok->device_mat_d.data(); 132 } else { 133 PetscErrorCode ierr; 134 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 135 *d_mat = NULL; 136 } 137 PetscFunctionReturn(0); 138 } 139 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A) 140 { 141 PetscErrorCode ierr; 142 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 143 144 PetscFunctionBegin; 145 if (!aijkok->At) { /* Generate At for the first time */ 146 ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr); 147 ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 148 } else if (!aijkok->transpose_updated) { /* Only update At values */ 149 ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr); 150 ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 151 } 152 aijkok->transpose_updated = PETSC_TRUE; 153 PetscFunctionReturn(0); 154 } 155 156 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian(Mat A) 157 { 158 PetscErrorCode ierr; 159 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 160 161 PetscFunctionBegin; 162 if (!aijkok->Ah) { /* Generate Ah for the first time */ 163 ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 164 ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 165 ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 166 } else if (!aijkok->hermitian_updated) { /* Only update Ah values */ 167 ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 168 ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 169 ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 170 } 171 aijkok->hermitian_updated = PETSC_TRUE; 172 PetscFunctionReturn(0); 173 } 174 175 /* y = A x */ 176 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 177 { 178 PetscErrorCode ierr; 179 Mat_SeqAIJKokkos *aijkok; 180 ConstPetscScalarKokkosView xv; 181 PetscScalarKokkosView yv; 182 183 PetscFunctionBegin; 184 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 185 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 186 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 187 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 189 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 190 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 191 ierr = WaitForKokkos();CHKERRQ(ierr); 192 /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 193 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 194 PetscFunctionReturn(0); 195 } 196 197 /* y = A^T x */ 198 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 199 { 200 PetscErrorCode ierr; 201 Mat_SeqAIJKokkos *aijkok; 202 Mat B; 203 const char *mode; 204 ConstPetscScalarKokkosView xv; 205 PetscScalarKokkosView yv; 206 207 PetscFunctionBegin; 208 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 209 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 210 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 211 if (A->form_explicit_transpose) { 212 ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 213 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 214 mode = "N"; 215 } else { 216 B = A; 217 mode = "T"; 218 } 219 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 220 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 221 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 222 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 223 ierr = WaitForKokkos();CHKERRQ(ierr); 224 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 /* y = A^H x */ 229 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 230 { 231 PetscErrorCode ierr; 232 Mat_SeqAIJKokkos *aijkok; 233 Mat B; 234 const char *mode; 235 ConstPetscScalarKokkosView xv; 236 PetscScalarKokkosView yv; 237 238 PetscFunctionBegin; 239 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 240 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 241 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 242 if (A->form_explicit_transpose) { 243 ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr); 244 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 245 mode = "N"; 246 } else { 247 B = A; 248 mode = "C"; 249 } 250 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 251 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 252 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 253 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 254 ierr = WaitForKokkos();CHKERRQ(ierr); 255 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 256 PetscFunctionReturn(0); 257 } 258 259 /* z = A x + y */ 260 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 261 { 262 PetscErrorCode ierr; 263 Mat_SeqAIJKokkos *aijkok; 264 ConstPetscScalarKokkosView xv,yv; 265 PetscScalarKokkosView zv; 266 267 PetscFunctionBegin; 268 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 269 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 270 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 271 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 272 if (zz != yy) Kokkos::deep_copy(zv,yv); 273 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 274 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 275 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 276 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 277 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 278 ierr = WaitForKokkos();CHKERRQ(ierr); 279 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 280 PetscFunctionReturn(0); 281 } 282 283 /* z = A^T x + y */ 284 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 285 { 286 PetscErrorCode ierr; 287 Mat_SeqAIJKokkos *aijkok; 288 Mat B; 289 const char *mode; 290 ConstPetscScalarKokkosView xv,yv; 291 PetscScalarKokkosView zv; 292 293 PetscFunctionBegin; 294 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 295 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 296 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 297 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 298 if (zz != yy) Kokkos::deep_copy(zv,yv); 299 if (A->form_explicit_transpose) { 300 ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 301 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 302 mode = "N"; 303 } else { 304 B = A; 305 mode = "T"; 306 } 307 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 308 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 309 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 310 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 311 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 312 ierr = WaitForKokkos();CHKERRQ(ierr); 313 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 314 PetscFunctionReturn(0); 315 } 316 317 /* z = A^H x + y */ 318 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 319 { 320 PetscErrorCode ierr; 321 Mat_SeqAIJKokkos *aijkok; 322 Mat B; 323 const char *mode; 324 ConstPetscScalarKokkosView xv,yv; 325 PetscScalarKokkosView zv; 326 327 PetscFunctionBegin; 328 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 329 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 330 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 331 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 332 if (zz != yy) Kokkos::deep_copy(zv,yv); 333 if (A->form_explicit_transpose) { 334 ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr); 335 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 336 mode = "N"; 337 } else { 338 B = A; 339 mode = "C"; 340 } 341 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 342 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 343 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 344 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 346 ierr = WaitForKokkos();CHKERRQ(ierr); 347 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348 PetscFunctionReturn(0); 349 } 350 351 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 352 { 353 PetscErrorCode ierr; 354 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 355 356 PetscFunctionBegin; 357 switch (op) { 358 case MAT_FORM_EXPLICIT_TRANSPOSE: 359 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 360 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 361 A->form_explicit_transpose = flg; 362 break; 363 default: 364 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 365 break; 366 } 367 PetscFunctionReturn(0); 368 } 369 370 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 371 { 372 PetscErrorCode ierr; 373 Mat B; 374 Mat_SeqAIJ *aij; 375 376 PetscFunctionBegin; 377 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 378 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 379 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 380 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 381 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 382 } 383 384 B = *newmat; 385 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 386 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 387 388 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 389 ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 390 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 391 /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 392 aij = (Mat_SeqAIJ*)B->data; 393 aij->inode.use = PETSC_FALSE; 394 395 PetscFunctionReturn(0); 396 } 397 398 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 399 { 400 PetscErrorCode ierr; 401 402 PetscFunctionBegin; 403 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 404 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 405 PetscFunctionReturn(0); 406 } 407 408 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 409 { 410 PetscErrorCode ierr; 411 Mat_SeqAIJKokkos *aijkok; 412 413 PetscFunctionBegin; 414 if (A->factortype == MAT_FACTOR_NONE) { 415 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 416 if (aijkok) { 417 if (aijkok->device_mat_d.data()) { 418 delete aijkok->colmap_d; 419 delete aijkok->i_uncompressed_d; 420 } 421 if (aijkok->diag_d) delete aijkok->diag_d; 422 } 423 delete aijkok; 424 } else { 425 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 426 } 427 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 428 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 429 A->spptr = NULL; 430 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 431 PetscFunctionReturn(0); 432 } 433 434 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 435 { 436 PetscErrorCode ierr; 437 438 PetscFunctionBegin; 439 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 440 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 441 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 442 PetscFunctionReturn(0); 443 } 444 445 #if 0 446 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 447 { 448 MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 449 450 PetscFunctionBegin; 451 delete kh; 452 PetscFunctionReturn(0); 453 } 454 455 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 456 { 457 Mat_Product *product = C->product; 458 Mat A,B; 459 MatProductType ptype; 460 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 461 bool tA,tB; 462 PetscErrorCode ierr; 463 MatMatKernelHandle_t *kh; 464 Mat_SeqAIJ *c; 465 466 PetscFunctionBegin; 467 MatCheckProduct(C,1); 468 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 469 A = product->A; 470 B = product->B; 471 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 472 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 473 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 474 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 475 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 476 kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 477 ptype = product->type; 478 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 479 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 480 switch (ptype) { 481 case MATPRODUCT_AB: 482 tA = false; 483 tB = false; 484 break; 485 case MATPRODUCT_AtB: 486 tA = true; 487 tB = false; 488 break; 489 case MATPRODUCT_ABt: 490 tA = false; 491 tB = true; 492 break; 493 default: 494 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 495 } 496 497 KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 498 C->offloadmask = PETSC_OFFLOAD_GPU; 499 /* shorter version of MatAssemblyEnd_SeqAIJ */ 500 c = (Mat_SeqAIJ*)C->data; 501 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 502 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 503 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 504 c->reallocs = 0; 505 C->info.mallocs += 0; 506 C->info.nz_unneeded = 0; 507 C->assembled = C->was_assembled = PETSC_TRUE; 508 C->num_ass++; 509 /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 510 // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 511 C->offloadmask = PETSC_OFFLOAD_BOTH; 512 // Also, we should add support to copy back from device to host 513 PetscFunctionReturn(0); 514 } 515 516 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 517 { 518 Mat_Product *product = C->product; 519 Mat A,B; 520 MatProductType ptype; 521 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 522 PetscInt m,n,k; 523 bool tA,tB; 524 PetscErrorCode ierr; 525 Mat_SeqAIJ *c; 526 MatMatKernelHandle_t *kh; 527 528 PetscFunctionBegin; 529 MatCheckProduct(C,1); 530 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 531 A = product->A; 532 B = product->B; 533 // TODO only copy the i,j data, not the values 534 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 535 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 536 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 537 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 538 ptype = product->type; 539 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 540 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 541 switch (ptype) { 542 case MATPRODUCT_AB: 543 tA = false; 544 tB = false; 545 m = A->rmap->n; 546 n = B->cmap->n; 547 break; 548 case MATPRODUCT_AtB: 549 tA = true; 550 tB = false; 551 m = A->cmap->n; 552 n = B->cmap->n; 553 break; 554 case MATPRODUCT_ABt: 555 tA = false; 556 tB = true; 557 m = A->rmap->n; 558 n = B->rmap->n; 559 break; 560 default: 561 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 562 } 563 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 564 ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 565 c = (Mat_SeqAIJ*)C->data; 566 567 kh = new MatMatKernelHandle_t; 568 // TODO SZ: ADD RUNTIME SELECTION OF THESE 569 kh->set_team_work_size(16); 570 kh->set_dynamic_scheduling(true); 571 // Select an spgemm algorithm, limited by configuration at compile-time and 572 // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 573 // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 574 std::string myalg("SPGEMM_KK_MEMORY"); 575 kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 576 577 ///////////////////////////////////// 578 // TODO JZ 579 ckok = NULL; //new Mat_SeqAIJKokkos(); 580 C->spptr = ckok; 581 KokkosCsrMatrix_t ccsr; // here only to have the code compile 582 KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 583 //cerr = WaitForKOKKOS();CHKERRCUDA(cerr); 584 //c->nz = get_nnz_from_ccsr 585 ////////////////////////////////////// 586 c->singlemalloc = PETSC_FALSE; 587 c->free_a = PETSC_TRUE; 588 c->free_ij = PETSC_TRUE; 589 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 590 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 591 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 592 //////////////////////////////////// 593 // TODO JZ copy from device to c->i and c->j 594 //////////////////////////////////// 595 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 596 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 597 c->maxnz = c->nz; 598 c->nonzerorowcnt = 0; 599 c->rmax = 0; 600 for (k = 0; k < m; k++) { 601 const PetscInt nn = c->i[k+1] - c->i[k]; 602 c->ilen[k] = c->imax[k] = nn; 603 c->nonzerorowcnt += (PetscInt)!!nn; 604 c->rmax = PetscMax(c->rmax,nn); 605 } 606 607 C->nonzerostate++; 608 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 609 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 610 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 611 ckok->nonzerostate = C->nonzerostate; 612 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 613 C->preallocated = PETSC_TRUE; 614 C->assembled = PETSC_FALSE; 615 C->was_assembled = PETSC_FALSE; 616 617 C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 618 C->product->data = kh; 619 C->product->destroy = MatMatKernelHandleDestroy_Private; 620 PetscFunctionReturn(0); 621 } 622 623 /* handles sparse matrix matrix ops */ 624 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 625 { 626 Mat_Product *product = mat->product; 627 PetscErrorCode ierr; 628 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 629 630 PetscFunctionBegin; 631 MatCheckProduct(mat,1); 632 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 633 if (product->type == MATPRODUCT_ABC) { 634 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 635 } 636 if (Biskok && Ciskok) { 637 switch (product->type) { 638 case MATPRODUCT_AB: 639 case MATPRODUCT_AtB: 640 case MATPRODUCT_ABt: 641 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 642 break; 643 case MATPRODUCT_PtAP: 644 case MATPRODUCT_RARt: 645 case MATPRODUCT_ABC: 646 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 647 break; 648 default: 649 break; 650 } 651 } else { /* fallback for AIJ */ 652 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 653 } 654 PetscFunctionReturn(0); 655 } 656 #endif 657 658 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 659 { 660 PetscErrorCode ierr; 661 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 662 PetscFunctionBegin; 663 if (aijkok && aijkok->device_mat_d.data()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 664 ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 665 PetscFunctionReturn(0); 666 } 667 668 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 669 { 670 PetscErrorCode ierr; 671 Mat_SeqAIJKokkos *aijkok; 672 673 PetscFunctionBegin; 674 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 675 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 676 KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 677 A->offloadmask = PETSC_OFFLOAD_GPU; 678 ierr = WaitForKokkos();CHKERRQ(ierr); 679 ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 680 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 681 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 682 PetscFunctionReturn(0); 683 } 684 685 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 686 { 687 PetscErrorCode ierr; 688 PetscBool both = PETSC_FALSE; 689 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 690 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 691 692 PetscFunctionBegin; 693 if (aijkok && aijkok->a_d.data()) { 694 KokkosBlas::fill(aijkok->a_d,0.0); 695 both = PETSC_TRUE; 696 } 697 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 698 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 699 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 700 else A->offloadmask = PETSC_OFFLOAD_CPU; 701 PetscFunctionReturn(0); 702 } 703 704 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 705 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 706 { 707 PetscErrorCode ierr; 708 Mat_SeqAIJKokkos *aijkok; 709 710 PetscFunctionBegin; 711 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 712 PetscValidPointer(kv,2); 713 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 714 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 715 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 716 *kv = aijkok->a_d; 717 PetscFunctionReturn(0); 718 } 719 720 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 721 { 722 PetscFunctionBegin; 723 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 724 PetscValidPointer(kv,2); 725 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 726 PetscFunctionReturn(0); 727 } 728 729 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 730 { 731 PetscErrorCode ierr; 732 Mat_SeqAIJKokkos *aijkok; 733 734 PetscFunctionBegin; 735 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 736 PetscValidPointer(kv,2); 737 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 738 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 739 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 740 *kv = aijkok->a_d; 741 PetscFunctionReturn(0); 742 } 743 744 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 745 { 746 PetscErrorCode ierr; 747 748 PetscFunctionBegin; 749 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 750 PetscValidPointer(kv,2); 751 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 752 ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr); 753 PetscFunctionReturn(0); 754 } 755 756 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 757 { 758 Mat_SeqAIJKokkos *aijkok; 759 760 PetscFunctionBegin; 761 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 762 PetscValidPointer(kv,2); 763 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 764 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 765 *kv = aijkok->a_d; 766 PetscFunctionReturn(0); 767 } 768 769 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 770 { 771 PetscErrorCode ierr; 772 773 PetscFunctionBegin; 774 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 775 PetscValidPointer(kv,2); 776 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 777 ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr); 778 PetscFunctionReturn(0); 779 } 780 781 /* Computes Y += a*X */ 782 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 783 { 784 PetscErrorCode ierr; 785 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 786 787 PetscFunctionBegin; 788 if (X->ops->axpy != Y->ops->axpy) { 789 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 790 PetscFunctionReturn(0); 791 } 792 793 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 794 PetscBool e; 795 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 796 if (e) { 797 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 798 if (e) str = SAME_NONZERO_PATTERN; 799 } 800 } 801 802 if (str != SAME_NONZERO_PATTERN) { 803 /* TODO: do MatAXPY on device */ 804 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 805 PetscFunctionReturn(0); 806 } else { 807 ConstPetscScalarKokkosView xv; 808 PetscScalarKokkosView yv; 809 810 ierr = MatSeqAIJGetKokkosView(X,&xv);CHKERRQ(ierr); 811 ierr = MatSeqAIJGetKokkosView(Y,&yv);CHKERRQ(ierr); 812 KokkosBlas::axpy(a,xv,yv); 813 ierr = MatSeqAIJRestoreKokkosView(X,&xv);CHKERRQ(ierr); 814 ierr = MatSeqAIJRestoreKokkosView(Y,&yv);CHKERRQ(ierr); 815 } 816 PetscFunctionReturn(0); 817 } 818 819 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 820 { 821 PetscErrorCode ierr; 822 823 PetscFunctionBegin; 824 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 825 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 826 B->offloadmask = PETSC_OFFLOAD_CPU; 827 PetscFunctionReturn(0); 828 } 829 830 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 831 { 832 PetscFunctionBegin; 833 A->boundtocpu = PETSC_FALSE; 834 835 A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 836 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 837 A->ops->destroy = MatDestroy_SeqAIJKokkos; 838 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 839 A->ops->axpy = MatAXPY_SeqAIJKokkos; 840 A->ops->scale = MatScale_SeqAIJKokkos; 841 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 842 //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 843 A->ops->mult = MatMult_SeqAIJKokkos; 844 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 845 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 846 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 847 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 848 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 849 A->ops->setoption = MatSetOption_SeqAIJKokkos; 850 PetscFunctionReturn(0); 851 } 852 853 /* --------------------------------------------------------------------------------*/ 854 /*@C 855 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 856 (the default parallel PETSc format). This matrix will ultimately be handled by 857 Kokkos for calculations. For good matrix 858 assembly performance the user should preallocate the matrix storage by setting 859 the parameter nz (or the array nnz). By setting these parameters accurately, 860 performance during matrix assembly can be increased by more than a factor of 50. 861 862 Collective 863 864 Input Parameters: 865 + comm - MPI communicator, set to PETSC_COMM_SELF 866 . m - number of rows 867 . n - number of columns 868 . nz - number of nonzeros per row (same for all rows) 869 - nnz - array containing the number of nonzeros in the various rows 870 (possibly different for each row) or NULL 871 872 Output Parameter: 873 . A - the matrix 874 875 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 876 MatXXXXSetPreallocation() paradgm instead of this routine directly. 877 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 878 879 Notes: 880 If nnz is given then nz is ignored 881 882 The AIJ format (also called the Yale sparse matrix format or 883 compressed row storage), is fully compatible with standard Fortran 77 884 storage. That is, the stored row and column indices can begin at 885 either one (as in Fortran) or zero. See the users' manual for details. 886 887 Specify the preallocated storage with either nz or nnz (not both). 888 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 889 allocation. For large problems you MUST preallocate memory or you 890 will get TERRIBLE performance, see the users' manual chapter on matrices. 891 892 By default, this format uses inodes (identical nodes) when possible, to 893 improve numerical efficiency of matrix-vector products and solves. We 894 search for consecutive rows with the same nonzero structure, thereby 895 reusing matrix information to achieve increased efficiency. 896 897 Level: intermediate 898 899 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 900 @*/ 901 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 902 { 903 PetscErrorCode ierr; 904 905 PetscFunctionBegin; 906 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 907 ierr = MatCreate(comm,A);CHKERRQ(ierr); 908 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 909 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 910 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 911 PetscFunctionReturn(0); 912 } 913 914 // factorizations 915 typedef Kokkos::TeamPolicy<>::member_type team_member; 916 // 917 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 918 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 919 // 920 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 921 { 922 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 923 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 924 IS isrow = b->row,isicol = b->icol; 925 PetscErrorCode ierr; 926 const PetscInt *r_h,*ic_h; 927 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data(); 928 const PetscScalar *aa_d = aijkok->a_d.data(); 929 PetscScalar *ba_d = baijkok->a_d.data(); 930 PetscBool row_identity,col_identity; 931 PetscInt nc, Nf, nVec=32; // should be a parameter 932 PetscContainer container; 933 934 PetscFunctionBegin; 935 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 936 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 937 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 938 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 939 if (container) { 940 PetscInt *pNf=NULL, nv; 941 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 942 Nf = (*pNf)%1000; 943 nv = (*pNf)/1000; 944 if (nv>0) nVec = nv; 945 } else Nf = 1; 946 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 947 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 948 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 949 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 950 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 951 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 952 #endif 953 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 954 { 955 #define KOKKOS_SHARED_LEVEL 1 956 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 957 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 958 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 959 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 960 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 961 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 962 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 963 size_t flops_h = 0.0; 964 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 965 Kokkos::View<size_t> d_flops_k ("flops"); 966 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 967 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 968 Kokkos::deep_copy (d_flops_k, h_flops_k); 969 Kokkos::deep_copy (d_r_k, h_r_k); 970 Kokkos::deep_copy (d_ic_k, h_ic_k); 971 // Fill A --> fact 972 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 973 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda 974 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 975 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 976 // zero rows of B 977 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 978 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 979 PetscScalar *baL = ba_d + bi_d[rowb]; 980 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 981 /* zero (unfactored row) */ 982 for (int j=0;j<nzbL;j++) baL[j] = 0; 983 for (int j=0;j<nzbU;j++) baU[j] = 0; 984 }); 985 // copy A into B 986 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 987 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 988 const PetscScalar *av = aa_d + ai_d[rowa]; 989 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 990 /* load in initial (unfactored row) */ 991 for (int j=0;j<nza;j++) { 992 PetscInt colb = ic[ajtmp[j]]; 993 PetscScalar vala = av[j]; 994 if (colb == rowb) { 995 *(ba_d + bdiag_d[rowb]) = vala; 996 } else { 997 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 998 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 999 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1000 for (int j=0; j<nz ; j++) { 1001 if (pbj[j] == colb) { 1002 pba[j] = vala; 1003 set++; 1004 break; 1005 } 1006 } 1007 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1008 } 1009 } 1010 }); 1011 }); 1012 Kokkos::fence(); 1013 1014 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1015 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1016 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1017 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1018 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda 1019 const PetscInt start = field*nloc, end = start + nloc; 1020 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1021 // A22 panel update for each row A(1,:) and col A(:,1) 1022 for (int ii=start; ii<end-1; ii++) { 1023 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1024 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1025 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1026 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1027 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1028 PetscInt kIdx = j*Ni + field_block_idx; 1029 if (kIdx >= nzUi) /* void */ ; 1030 else { 1031 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1032 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1033 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1034 size_t st_idx; 1035 // find and do L(k,i) = A(:k,i) / A(i,i) 1036 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1037 // get column, there has got to be a better way 1038 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1039 //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 1040 if (pjL[j] == ii) { 1041 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1042 idx = j; // output 1043 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1044 } 1045 }, st_idx); 1046 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1047 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 1048 else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1049 // U(i+1,:end) 1050 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1051 PetscScalar Uij = baUi[uiIdx]; 1052 PetscInt col = bjUi[uiIdx]; 1053 if (col==myk) { 1054 // A_kk = A_kk - L_ki * U_ij(k) 1055 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1056 *Akkv = *Akkv - L_ki() * Uij; // UiK 1057 } else { 1058 PetscScalar *start, *end, *pAkjv=NULL; 1059 PetscInt high, low; 1060 const PetscInt *startj; 1061 if (col<myk) { // L 1062 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1063 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1064 start = pLki+1; // start at pLki+1, A22(myk,1) 1065 startj= bj_d + bi_d[myk] + idx; 1066 end = ba_d + bi_d[myk+1]; 1067 } else { 1068 PetscInt idx = bdiag_d[myk+1]+1; 1069 start = ba_d + idx; 1070 startj= bj_d + idx; 1071 end = ba_d + bdiag_d[myk]; 1072 } 1073 // search for 'col', use bisection search - TODO 1074 low = 0; 1075 high = (PetscInt)(end-start); 1076 while (high-low > 5) { 1077 int t = (low+high)/2; 1078 if (startj[t] > col) high = t; 1079 else low = t; 1080 } 1081 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1082 if (startj[pAkjv-start] == col) break; 1083 } 1084 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 1085 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1086 } 1087 }); 1088 } 1089 } 1090 }); 1091 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1092 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1093 } /* endof for (i=0; i<n; i++) { */ 1094 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1095 }); 1096 Kokkos::fence(); 1097 Kokkos::deep_copy (h_flops_k, d_flops_k); 1098 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1099 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1100 #elif defined(PETSC_USE_LOG) 1101 ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1102 #endif 1103 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1104 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1105 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1106 /* Invert diagonal for simpler triangular solves */ 1107 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1108 int i = start + outer_index*Ni + lg_rank%Ni; 1109 if (i < end) { 1110 PetscScalar *pv = ba_d + bdiag_d[i]; 1111 *pv = 1.0/(*pv); 1112 } 1113 }); 1114 }); 1115 } 1116 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1117 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1118 #endif 1119 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1120 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1121 1122 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1123 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1124 if (b->inode.size) { 1125 B->ops->solve = MatSolve_SeqAIJ_Inode; 1126 } else if (row_identity && col_identity) { 1127 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1128 } else { 1129 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1130 } 1131 B->offloadmask = PETSC_OFFLOAD_GPU; 1132 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1133 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1134 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1135 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1136 B->ops->matsolve = MatMatSolve_SeqAIJ; 1137 B->assembled = PETSC_TRUE; 1138 B->preallocated = PETSC_TRUE; 1139 1140 PetscFunctionReturn(0); 1141 } 1142 1143 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1144 { 1145 PetscErrorCode ierr; 1146 1147 PetscFunctionBegin; 1148 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1149 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1150 PetscFunctionReturn(0); 1151 } 1152 1153 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1154 { 1155 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1156 1157 PetscFunctionBegin; 1158 if (!factors->sptrsv_symbolic_completed) { 1159 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1160 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1161 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1162 } 1163 PetscFunctionReturn(0); 1164 } 1165 1166 /* Check if we need to update factors etc for transpose solve */ 1167 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1168 { 1169 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1170 MatColumnIndexType n = A->rmap->n; 1171 1172 PetscFunctionBegin; 1173 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1174 /* Update L^T and do sptrsv symbolic */ 1175 factors->iLt_d = MatRowOffsetKokkosView("factors->iLt_d",n+1); 1176 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1177 factors->jLt_d = MatColumnIndexKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1178 factors->aLt_d = MatValueKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1179 1180 KokkosKernels::Impl::transpose_matrix< 1181 ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView, 1182 MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView, 1183 MatRowOffsetKokkosView,DefaultExecutionSpace>( 1184 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1185 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1186 1187 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1188 We have to sort the indices, until KK provides finer control options. 1189 */ 1190 KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace, 1191 MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>( 1192 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1193 1194 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1195 1196 /* Update U^T and do sptrsv symbolic */ 1197 factors->iUt_d = MatRowOffsetKokkosView("factors->iUt_d",n+1); 1198 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1199 factors->jUt_d = MatColumnIndexKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1200 factors->aUt_d = MatValueKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1201 1202 KokkosKernels::Impl::transpose_matrix< 1203 ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView, 1204 MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView, 1205 MatRowOffsetKokkosView,DefaultExecutionSpace>( 1206 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1207 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1208 1209 /* Sort indices. See comments above */ 1210 KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace, 1211 MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>( 1212 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1213 1214 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1215 factors->transpose_updated = PETSC_TRUE; 1216 } 1217 PetscFunctionReturn(0); 1218 } 1219 1220 /* Solve Ax = b, with A = LU */ 1221 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1222 { 1223 PetscErrorCode ierr; 1224 ConstPetscScalarKokkosView bv; 1225 PetscScalarKokkosView xv; 1226 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1227 1228 PetscFunctionBegin; 1229 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1230 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1231 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1232 /* Solve L tmpv = b */ 1233 KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector); 1234 /* Solve Ux = tmpv */ 1235 KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv); 1236 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1237 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1238 PetscFunctionReturn(0); 1239 } 1240 1241 /* Solve A^T x = b, with A^T = U^T L^T */ 1242 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1243 { 1244 PetscErrorCode ierr; 1245 ConstPetscScalarKokkosView bv; 1246 PetscScalarKokkosView xv; 1247 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1248 1249 PetscFunctionBegin; 1250 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1251 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1252 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1253 /* Solve U^T tmpv = b */ 1254 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1255 1256 /* Solve L^T x = tmpv */ 1257 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1258 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1259 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1260 PetscFunctionReturn(0); 1261 } 1262 1263 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1264 { 1265 PetscErrorCode ierr; 1266 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1267 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1268 PetscInt fill_lev = info->levels; 1269 1270 PetscFunctionBegin; 1271 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1272 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,aijkok->a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1273 1274 B->assembled = PETSC_TRUE; 1275 B->preallocated = PETSC_TRUE; 1276 B->ops->solve = MatSolve_SeqAIJKokkos; 1277 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1278 B->ops->matsolve = NULL; 1279 B->ops->matsolvetranspose = NULL; 1280 B->offloadmask = PETSC_OFFLOAD_GPU; 1281 1282 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1283 factors->transpose_updated = PETSC_FALSE; 1284 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1285 PetscFunctionReturn(0); 1286 } 1287 1288 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1289 { 1290 PetscErrorCode ierr; 1291 Mat_SeqAIJKokkos *aijkok; 1292 Mat_SeqAIJ *b; 1293 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1294 PetscInt fill_lev = info->levels; 1295 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1296 PetscInt n = A->rmap->n; 1297 1298 PetscFunctionBegin; 1299 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1300 /* Rebuild factors */ 1301 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1302 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1303 1304 /* Create a spiluk handle and then do symbolic factorization */ 1305 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1306 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1307 1308 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1309 1310 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1311 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1312 Kokkos::realloc(factors->iU_d,n+1); 1313 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1314 1315 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1316 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1317 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1318 1319 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1320 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1321 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1322 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1323 1324 /* TODO: add options to select sptrsv algorithms */ 1325 /* Create sptrsv handles for L, U and their transpose */ 1326 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1327 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1328 #else 1329 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1330 #endif 1331 1332 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1333 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1334 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1335 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1336 1337 /* Fill fields of the factor matrix B */ 1338 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1339 b = (Mat_SeqAIJ*)B->data; 1340 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1341 B->info.fill_ratio_given = info->fill; 1342 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1343 1344 B->offloadmask = PETSC_OFFLOAD_GPU; 1345 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1346 PetscFunctionReturn(0); 1347 } 1348 1349 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1350 { 1351 PetscErrorCode ierr; 1352 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1353 const PetscInt nrows = A->rmap->n; 1354 1355 PetscFunctionBegin; 1356 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1357 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1358 // move B data into Kokkos 1359 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1360 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1361 { 1362 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1363 if (!baijkok->diag_d) { 1364 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1365 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1366 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1367 } 1368 } 1369 PetscFunctionReturn(0); 1370 } 1371 1372 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1373 { 1374 PetscFunctionBegin; 1375 *type = MATSOLVERKOKKOS; 1376 PetscFunctionReturn(0); 1377 } 1378 1379 // use -pc_factor_mat_solver_type kokkosdevice 1380 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1381 { 1382 PetscFunctionBegin; 1383 *type = MATSOLVERKOKKOSDEVICE; 1384 PetscFunctionReturn(0); 1385 } 1386 1387 /*MC 1388 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1389 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1390 1391 Level: beginner 1392 1393 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1394 M*/ 1395 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1396 { 1397 PetscErrorCode ierr; 1398 PetscInt n = A->rmap->n; 1399 1400 PetscFunctionBegin; 1401 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1402 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1403 (*B)->factortype = ftype; 1404 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1405 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1406 1407 if (ftype == MAT_FACTOR_LU) { 1408 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1409 (*B)->canuseordering = PETSC_TRUE; 1410 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1411 } else if (ftype == MAT_FACTOR_ILU) { 1412 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1413 (*B)->canuseordering = PETSC_FALSE; 1414 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1415 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1416 1417 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1418 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1419 PetscFunctionReturn(0); 1420 } 1421 1422 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1423 { 1424 PetscErrorCode ierr; 1425 PetscInt n = A->rmap->n; 1426 1427 PetscFunctionBegin; 1428 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1429 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1430 (*B)->factortype = ftype; 1431 (*B)->canuseordering = PETSC_TRUE; 1432 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1433 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1434 1435 if (ftype == MAT_FACTOR_LU) { 1436 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1437 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1438 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1439 1440 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1441 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1442 PetscFunctionReturn(0); 1443 } 1444 1445 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1446 { 1447 PetscErrorCode ierr; 1448 1449 PetscFunctionBegin; 1450 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1451 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1452 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1453 PetscFunctionReturn(0); 1454 } 1455 1456