1 #include <petscvec_kokkos.hpp> 2 #include <petsc/private/petscimpl.h> 3 #include <petscsystypes.h> 4 #include <petscerror.h> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosBlas.hpp> 8 #include <KokkosSparse_CrsMatrix.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <KokkosSparse_spiluk.hpp> 11 #include <KokkosSparse_sptrsv.hpp> 12 13 #include <../src/mat/impls/aij/seq/aij.h> 14 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 15 16 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 17 18 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 19 { 20 PetscErrorCode ierr; 21 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 22 23 PetscFunctionBegin; 24 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 25 A->offloadmask = PETSC_OFFLOAD_CPU; 26 if (aijkok && aijkok->device_mat_d.data()) { 27 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 28 } 29 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 30 PetscFunctionReturn(0); 31 } 32 33 /* Sync CSR data to device if not yet */ 34 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 35 { 36 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 39 40 PetscFunctionBegin; 41 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 42 /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 46 aijkok->nonzerostate = A->nonzerostate; 47 A->spptr = aijkok; 48 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 49 aijkok->a_dual.clear_sync_state(); 50 aijkok->a_dual.modify_host(); /* Mark host is modified */ 51 aijkok->a_dual.sync_device(); /* Sync the device */ 52 aijkok->transpose_updated = PETSC_FALSE; 53 aijkok->hermitian_updated = PETSC_FALSE; 54 } 55 A->offloadmask = PETSC_OFFLOAD_BOTH; 56 PetscFunctionReturn(0); 57 } 58 59 /* Mark the CSR data on device is modified */ 60 static PetscErrorCode MatSeqAIJKokkosSetDeviceModified(Mat A) 61 { 62 PetscErrorCode ierr; 63 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 64 65 PetscFunctionBegin; 66 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 67 aijkok->a_dual.clear_sync_state(); 68 aijkok->a_dual.modify_device(); 69 aijkok->transpose_updated = PETSC_FALSE; 70 aijkok->hermitian_updated = PETSC_FALSE; 71 A->offloadmask = PETSC_OFFLOAD_GPU; 72 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 77 { 78 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 79 80 PetscFunctionBegin; 81 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 82 /* We do not expect one needs factors on host */ 83 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 84 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 85 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 86 aijkok->a_dual.clear_sync_state(); 87 aijkok->a_dual.modify_device(); /* Mark device is modified */ 88 aijkok->a_dual.sync_host(); /* Sync the host */ 89 A->offloadmask = PETSC_OFFLOAD_BOTH; 90 } 91 PetscFunctionReturn(0); 92 } 93 94 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 95 { 96 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 97 PetscErrorCode ierr; 98 99 PetscFunctionBegin; 100 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 101 *array = a->a; 102 A->offloadmask = PETSC_OFFLOAD_CPU; 103 PetscFunctionReturn(0); 104 } 105 106 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 107 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 108 { 109 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 110 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 111 112 PetscFunctionBegin; 113 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 114 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 115 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 116 PetscFunctionReturn(0); 117 } 118 119 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 120 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 121 { 122 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 123 124 PetscFunctionBegin; 125 if (aijkok && aijkok->device_mat_d.data()) { 126 *d_mat = aijkok->device_mat_d.data(); 127 } else { 128 PetscErrorCode ierr; 129 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 130 *d_mat = NULL; 131 } 132 PetscFunctionReturn(0); 133 } 134 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A) 135 { 136 PetscErrorCode ierr; 137 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 138 139 PetscFunctionBegin; 140 if (!aijkok->At) { /* Generate At for the first time */ 141 ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr); 142 ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 143 } else if (!aijkok->transpose_updated) { /* Only update At values */ 144 ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr); 145 ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 146 } 147 aijkok->transpose_updated = PETSC_TRUE; 148 PetscFunctionReturn(0); 149 } 150 151 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian(Mat A) 152 { 153 PetscErrorCode ierr; 154 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 155 156 PetscFunctionBegin; 157 if (!aijkok->Ah) { /* Generate Ah for the first time */ 158 ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 159 ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 160 ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 161 } else if (!aijkok->hermitian_updated) { /* Only update Ah values */ 162 ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 163 ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 164 ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 165 } 166 aijkok->hermitian_updated = PETSC_TRUE; 167 PetscFunctionReturn(0); 168 } 169 170 /* y = A x */ 171 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 172 { 173 PetscErrorCode ierr; 174 Mat_SeqAIJKokkos *aijkok; 175 ConstPetscScalarKokkosView xv; 176 PetscScalarKokkosView yv; 177 178 PetscFunctionBegin; 179 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 180 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 181 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 182 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 183 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 184 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 185 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 186 ierr = WaitForKokkos();CHKERRQ(ierr); 187 /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 188 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 189 PetscFunctionReturn(0); 190 } 191 192 /* y = A^T x */ 193 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 194 { 195 PetscErrorCode ierr; 196 Mat_SeqAIJKokkos *aijkok; 197 Mat B; 198 const char *mode; 199 ConstPetscScalarKokkosView xv; 200 PetscScalarKokkosView yv; 201 202 PetscFunctionBegin; 203 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 204 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 205 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 206 if (A->form_explicit_transpose) { 207 ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 208 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 209 mode = "N"; 210 } else { 211 B = A; 212 mode = "T"; 213 } 214 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 215 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 216 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 217 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 218 ierr = WaitForKokkos();CHKERRQ(ierr); 219 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 220 PetscFunctionReturn(0); 221 } 222 223 /* y = A^H x */ 224 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 225 { 226 PetscErrorCode ierr; 227 Mat_SeqAIJKokkos *aijkok; 228 Mat B; 229 const char *mode; 230 ConstPetscScalarKokkosView xv; 231 PetscScalarKokkosView yv; 232 233 PetscFunctionBegin; 234 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 235 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 236 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 237 if (A->form_explicit_transpose) { 238 ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr); 239 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 240 mode = "N"; 241 } else { 242 B = A; 243 mode = "C"; 244 } 245 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 246 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 247 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 248 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 249 ierr = WaitForKokkos();CHKERRQ(ierr); 250 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 251 PetscFunctionReturn(0); 252 } 253 254 /* z = A x + y */ 255 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 256 { 257 PetscErrorCode ierr; 258 Mat_SeqAIJKokkos *aijkok; 259 ConstPetscScalarKokkosView xv,yv; 260 PetscScalarKokkosView zv; 261 262 PetscFunctionBegin; 263 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 264 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 265 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 266 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 267 if (zz != yy) Kokkos::deep_copy(zv,yv); 268 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 269 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 270 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 271 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 272 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 273 ierr = WaitForKokkos();CHKERRQ(ierr); 274 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 275 PetscFunctionReturn(0); 276 } 277 278 /* z = A^T x + y */ 279 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 280 { 281 PetscErrorCode ierr; 282 Mat_SeqAIJKokkos *aijkok; 283 Mat B; 284 const char *mode; 285 ConstPetscScalarKokkosView xv,yv; 286 PetscScalarKokkosView zv; 287 288 PetscFunctionBegin; 289 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 290 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 291 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 292 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 293 if (zz != yy) Kokkos::deep_copy(zv,yv); 294 if (A->form_explicit_transpose) { 295 ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 296 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 297 mode = "N"; 298 } else { 299 B = A; 300 mode = "T"; 301 } 302 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 303 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 304 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 305 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 306 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 307 ierr = WaitForKokkos();CHKERRQ(ierr); 308 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 309 PetscFunctionReturn(0); 310 } 311 312 /* z = A^H x + y */ 313 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 314 { 315 PetscErrorCode ierr; 316 Mat_SeqAIJKokkos *aijkok; 317 Mat B; 318 const char *mode; 319 ConstPetscScalarKokkosView xv,yv; 320 PetscScalarKokkosView zv; 321 322 PetscFunctionBegin; 323 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 324 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 325 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 326 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 327 if (zz != yy) Kokkos::deep_copy(zv,yv); 328 if (A->form_explicit_transpose) { 329 ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr); 330 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 331 mode = "N"; 332 } else { 333 B = A; 334 mode = "C"; 335 } 336 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 337 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 338 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 339 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 340 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 341 ierr = WaitForKokkos();CHKERRQ(ierr); 342 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 343 PetscFunctionReturn(0); 344 } 345 346 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 347 { 348 PetscErrorCode ierr; 349 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 350 351 PetscFunctionBegin; 352 switch (op) { 353 case MAT_FORM_EXPLICIT_TRANSPOSE: 354 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 355 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 356 A->form_explicit_transpose = flg; 357 break; 358 default: 359 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 360 break; 361 } 362 PetscFunctionReturn(0); 363 } 364 365 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 366 { 367 PetscErrorCode ierr; 368 Mat B; 369 Mat_SeqAIJ *aij; 370 371 PetscFunctionBegin; 372 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 373 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 374 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 375 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 376 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 377 } 378 379 B = *newmat; 380 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 381 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 382 383 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 384 ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 385 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 386 /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 387 aij = (Mat_SeqAIJ*)B->data; 388 aij->inode.use = PETSC_FALSE; 389 390 PetscFunctionReturn(0); 391 } 392 393 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 394 { 395 PetscErrorCode ierr; 396 397 PetscFunctionBegin; 398 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 399 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 400 PetscFunctionReturn(0); 401 } 402 403 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 404 { 405 PetscErrorCode ierr; 406 Mat_SeqAIJKokkos *aijkok; 407 408 PetscFunctionBegin; 409 if (A->factortype == MAT_FACTOR_NONE) { 410 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 411 if (aijkok) { 412 if (aijkok->device_mat_d.data()) { 413 delete aijkok->colmap_d; 414 delete aijkok->i_uncompressed_d; 415 } 416 if (aijkok->diag_d) delete aijkok->diag_d; 417 } 418 delete aijkok; 419 } else { 420 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 421 } 422 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 423 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 424 A->spptr = NULL; 425 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 426 PetscFunctionReturn(0); 427 } 428 429 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 430 { 431 PetscErrorCode ierr; 432 433 PetscFunctionBegin; 434 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 435 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 436 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 437 PetscFunctionReturn(0); 438 } 439 440 #if 0 441 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 442 { 443 MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 444 445 PetscFunctionBegin; 446 delete kh; 447 PetscFunctionReturn(0); 448 } 449 450 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 451 { 452 Mat_Product *product = C->product; 453 Mat A,B; 454 MatProductType ptype; 455 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 456 bool tA,tB; 457 PetscErrorCode ierr; 458 MatMatKernelHandle_t *kh; 459 Mat_SeqAIJ *c; 460 461 PetscFunctionBegin; 462 MatCheckProduct(C,1); 463 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 464 A = product->A; 465 B = product->B; 466 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 467 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 468 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 469 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 470 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 471 kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 472 ptype = product->type; 473 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 474 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 475 switch (ptype) { 476 case MATPRODUCT_AB: 477 tA = false; 478 tB = false; 479 break; 480 case MATPRODUCT_AtB: 481 tA = true; 482 tB = false; 483 break; 484 case MATPRODUCT_ABt: 485 tA = false; 486 tB = true; 487 break; 488 default: 489 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 490 } 491 492 KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 493 C->offloadmask = PETSC_OFFLOAD_GPU; 494 /* shorter version of MatAssemblyEnd_SeqAIJ */ 495 c = (Mat_SeqAIJ*)C->data; 496 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 497 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 498 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 499 c->reallocs = 0; 500 C->info.mallocs += 0; 501 C->info.nz_unneeded = 0; 502 C->assembled = C->was_assembled = PETSC_TRUE; 503 C->num_ass++; 504 /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 505 // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 506 C->offloadmask = PETSC_OFFLOAD_BOTH; 507 // Also, we should add support to copy back from device to host 508 PetscFunctionReturn(0); 509 } 510 511 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 512 { 513 Mat_Product *product = C->product; 514 Mat A,B; 515 MatProductType ptype; 516 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 517 PetscInt m,n,k; 518 bool tA,tB; 519 PetscErrorCode ierr; 520 Mat_SeqAIJ *c; 521 MatMatKernelHandle_t *kh; 522 523 PetscFunctionBegin; 524 MatCheckProduct(C,1); 525 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 526 A = product->A; 527 B = product->B; 528 // TODO only copy the i,j data, not the values 529 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 530 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 531 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 532 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 533 ptype = product->type; 534 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 535 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 536 switch (ptype) { 537 case MATPRODUCT_AB: 538 tA = false; 539 tB = false; 540 m = A->rmap->n; 541 n = B->cmap->n; 542 break; 543 case MATPRODUCT_AtB: 544 tA = true; 545 tB = false; 546 m = A->cmap->n; 547 n = B->cmap->n; 548 break; 549 case MATPRODUCT_ABt: 550 tA = false; 551 tB = true; 552 m = A->rmap->n; 553 n = B->rmap->n; 554 break; 555 default: 556 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 557 } 558 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 559 ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 560 c = (Mat_SeqAIJ*)C->data; 561 562 kh = new MatMatKernelHandle_t; 563 // TODO SZ: ADD RUNTIME SELECTION OF THESE 564 kh->set_team_work_size(16); 565 kh->set_dynamic_scheduling(true); 566 // Select an spgemm algorithm, limited by configuration at compile-time and 567 // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 568 // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 569 std::string myalg("SPGEMM_KK_MEMORY"); 570 kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 571 572 // TODO JZ 573 ckok = NULL; //new Mat_SeqAIJKokkos(); 574 C->spptr = ckok; 575 KokkosCsrMatrix_t ccsr; // here only to have the code compile 576 KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 577 578 c->singlemalloc = PETSC_FALSE; 579 c->free_a = PETSC_TRUE; 580 c->free_ij = PETSC_TRUE; 581 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 582 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 583 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 584 585 // TODO JZ copy from device to c->i and c->j 586 587 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 588 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 589 c->maxnz = c->nz; 590 c->nonzerorowcnt = 0; 591 c->rmax = 0; 592 for (k = 0; k < m; k++) { 593 const PetscInt nn = c->i[k+1] - c->i[k]; 594 c->ilen[k] = c->imax[k] = nn; 595 c->nonzerorowcnt += (PetscInt)!!nn; 596 c->rmax = PetscMax(c->rmax,nn); 597 } 598 599 C->nonzerostate++; 600 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 601 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 602 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 603 ckok->nonzerostate = C->nonzerostate; 604 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 605 C->preallocated = PETSC_TRUE; 606 C->assembled = PETSC_FALSE; 607 C->was_assembled = PETSC_FALSE; 608 609 C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 610 C->product->data = kh; 611 C->product->destroy = MatMatKernelHandleDestroy_Private; 612 PetscFunctionReturn(0); 613 } 614 615 /* handles sparse matrix matrix ops */ 616 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 617 { 618 Mat_Product *product = mat->product; 619 PetscErrorCode ierr; 620 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 621 622 PetscFunctionBegin; 623 MatCheckProduct(mat,1); 624 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 625 if (product->type == MATPRODUCT_ABC) { 626 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 627 } 628 if (Biskok && Ciskok) { 629 switch (product->type) { 630 case MATPRODUCT_AB: 631 case MATPRODUCT_AtB: 632 case MATPRODUCT_ABt: 633 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 634 break; 635 case MATPRODUCT_PtAP: 636 case MATPRODUCT_RARt: 637 case MATPRODUCT_ABC: 638 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 639 break; 640 default: 641 break; 642 } 643 } else { /* fallback for AIJ */ 644 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 645 } 646 PetscFunctionReturn(0); 647 } 648 #endif 649 650 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 651 { 652 PetscErrorCode ierr; 653 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 654 PetscFunctionBegin; 655 if (aijkok && aijkok->device_mat_d.data()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 656 ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 657 PetscFunctionReturn(0); 658 } 659 660 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 661 { 662 PetscErrorCode ierr; 663 Mat_SeqAIJKokkos *aijkok; 664 665 PetscFunctionBegin; 666 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 667 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 668 KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 669 A->offloadmask = PETSC_OFFLOAD_GPU; 670 ierr = WaitForKokkos();CHKERRQ(ierr); 671 ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 672 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 673 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 674 PetscFunctionReturn(0); 675 } 676 677 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 678 { 679 PetscErrorCode ierr; 680 PetscBool both = PETSC_FALSE; 681 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 682 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 683 684 PetscFunctionBegin; 685 if (aijkok && aijkok->a_d.data()) { 686 KokkosBlas::fill(aijkok->a_d,0.0); 687 both = PETSC_TRUE; 688 } 689 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 690 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 691 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 692 else A->offloadmask = PETSC_OFFLOAD_CPU; 693 PetscFunctionReturn(0); 694 } 695 696 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 697 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 698 { 699 PetscErrorCode ierr; 700 Mat_SeqAIJKokkos *aijkok; 701 702 PetscFunctionBegin; 703 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 704 PetscValidPointer(kv,2); 705 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 706 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 707 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 708 *kv = aijkok->a_d; 709 PetscFunctionReturn(0); 710 } 711 712 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 713 { 714 PetscFunctionBegin; 715 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 716 PetscValidPointer(kv,2); 717 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 718 PetscFunctionReturn(0); 719 } 720 721 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 722 { 723 PetscErrorCode ierr; 724 Mat_SeqAIJKokkos *aijkok; 725 726 PetscFunctionBegin; 727 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 728 PetscValidPointer(kv,2); 729 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 730 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 731 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 732 *kv = aijkok->a_d; 733 PetscFunctionReturn(0); 734 } 735 736 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 737 { 738 PetscErrorCode ierr; 739 740 PetscFunctionBegin; 741 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 742 PetscValidPointer(kv,2); 743 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 744 ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr); 745 PetscFunctionReturn(0); 746 } 747 748 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 749 { 750 Mat_SeqAIJKokkos *aijkok; 751 752 PetscFunctionBegin; 753 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 754 PetscValidPointer(kv,2); 755 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 756 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 757 *kv = aijkok->a_d; 758 PetscFunctionReturn(0); 759 } 760 761 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 762 { 763 PetscErrorCode ierr; 764 765 PetscFunctionBegin; 766 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 767 PetscValidPointer(kv,2); 768 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 769 ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr); 770 PetscFunctionReturn(0); 771 } 772 773 /* Computes Y += a*X */ 774 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 775 { 776 PetscErrorCode ierr; 777 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 778 779 PetscFunctionBegin; 780 if (X->ops->axpy != Y->ops->axpy) { 781 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 782 PetscFunctionReturn(0); 783 } 784 785 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 786 PetscBool e; 787 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 788 if (e) { 789 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 790 if (e) str = SAME_NONZERO_PATTERN; 791 } 792 } 793 794 if (str != SAME_NONZERO_PATTERN) { 795 /* TODO: do MatAXPY on device */ 796 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 797 PetscFunctionReturn(0); 798 } else { 799 ConstPetscScalarKokkosView xv; 800 PetscScalarKokkosView yv; 801 802 ierr = MatSeqAIJGetKokkosView(X,&xv);CHKERRQ(ierr); 803 ierr = MatSeqAIJGetKokkosView(Y,&yv);CHKERRQ(ierr); 804 KokkosBlas::axpy(a,xv,yv); 805 ierr = MatSeqAIJRestoreKokkosView(X,&xv);CHKERRQ(ierr); 806 ierr = MatSeqAIJRestoreKokkosView(Y,&yv);CHKERRQ(ierr); 807 } 808 PetscFunctionReturn(0); 809 } 810 811 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 812 { 813 PetscErrorCode ierr; 814 815 PetscFunctionBegin; 816 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 817 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 818 B->offloadmask = PETSC_OFFLOAD_CPU; 819 PetscFunctionReturn(0); 820 } 821 822 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 823 { 824 PetscFunctionBegin; 825 A->boundtocpu = PETSC_FALSE; 826 827 A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 828 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 829 A->ops->destroy = MatDestroy_SeqAIJKokkos; 830 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 831 A->ops->axpy = MatAXPY_SeqAIJKokkos; 832 A->ops->scale = MatScale_SeqAIJKokkos; 833 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 834 //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 835 A->ops->mult = MatMult_SeqAIJKokkos; 836 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 837 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 838 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 839 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 840 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 841 A->ops->setoption = MatSetOption_SeqAIJKokkos; 842 PetscFunctionReturn(0); 843 } 844 845 /* --------------------------------------------------------------------------------*/ 846 /*@C 847 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 848 (the default parallel PETSc format). This matrix will ultimately be handled by 849 Kokkos for calculations. For good matrix 850 assembly performance the user should preallocate the matrix storage by setting 851 the parameter nz (or the array nnz). By setting these parameters accurately, 852 performance during matrix assembly can be increased by more than a factor of 50. 853 854 Collective 855 856 Input Parameters: 857 + comm - MPI communicator, set to PETSC_COMM_SELF 858 . m - number of rows 859 . n - number of columns 860 . nz - number of nonzeros per row (same for all rows) 861 - nnz - array containing the number of nonzeros in the various rows 862 (possibly different for each row) or NULL 863 864 Output Parameter: 865 . A - the matrix 866 867 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 868 MatXXXXSetPreallocation() paradgm instead of this routine directly. 869 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 870 871 Notes: 872 If nnz is given then nz is ignored 873 874 The AIJ format (also called the Yale sparse matrix format or 875 compressed row storage), is fully compatible with standard Fortran 77 876 storage. That is, the stored row and column indices can begin at 877 either one (as in Fortran) or zero. See the users' manual for details. 878 879 Specify the preallocated storage with either nz or nnz (not both). 880 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 881 allocation. For large problems you MUST preallocate memory or you 882 will get TERRIBLE performance, see the users' manual chapter on matrices. 883 884 By default, this format uses inodes (identical nodes) when possible, to 885 improve numerical efficiency of matrix-vector products and solves. We 886 search for consecutive rows with the same nonzero structure, thereby 887 reusing matrix information to achieve increased efficiency. 888 889 Level: intermediate 890 891 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 892 @*/ 893 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 894 { 895 PetscErrorCode ierr; 896 897 PetscFunctionBegin; 898 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 899 ierr = MatCreate(comm,A);CHKERRQ(ierr); 900 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 901 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 902 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 903 PetscFunctionReturn(0); 904 } 905 906 typedef Kokkos::TeamPolicy<>::member_type team_member; 907 // 908 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 909 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 910 // 911 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 912 { 913 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 914 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 915 IS isrow = b->row,isicol = b->icol; 916 PetscErrorCode ierr; 917 const PetscInt *r_h,*ic_h; 918 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data(); 919 const PetscScalar *aa_d = aijkok->a_d.data(); 920 PetscScalar *ba_d = baijkok->a_d.data(); 921 PetscBool row_identity,col_identity; 922 PetscInt nc, Nf, nVec=32; // should be a parameter 923 PetscContainer container; 924 925 PetscFunctionBegin; 926 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 927 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 928 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 929 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 930 if (container) { 931 PetscInt *pNf=NULL, nv; 932 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 933 Nf = (*pNf)%1000; 934 nv = (*pNf)/1000; 935 if (nv>0) nVec = nv; 936 } else Nf = 1; 937 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 938 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 939 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 940 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 941 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 942 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 943 #endif 944 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 945 { 946 #define KOKKOS_SHARED_LEVEL 1 947 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 948 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 949 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 950 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 951 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 952 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 953 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 954 size_t flops_h = 0.0; 955 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 956 Kokkos::View<size_t> d_flops_k ("flops"); 957 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 958 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 959 Kokkos::deep_copy (d_flops_k, h_flops_k); 960 Kokkos::deep_copy (d_r_k, h_r_k); 961 Kokkos::deep_copy (d_ic_k, h_ic_k); 962 // Fill A --> fact 963 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 964 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 965 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 966 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 967 // zero rows of B 968 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 969 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 970 PetscScalar *baL = ba_d + bi_d[rowb]; 971 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 972 /* zero (unfactored row) */ 973 for (int j=0;j<nzbL;j++) baL[j] = 0; 974 for (int j=0;j<nzbU;j++) baU[j] = 0; 975 }); 976 // copy A into B 977 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 978 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 979 const PetscScalar *av = aa_d + ai_d[rowa]; 980 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 981 /* load in initial (unfactored row) */ 982 for (int j=0;j<nza;j++) { 983 PetscInt colb = ic[ajtmp[j]]; 984 PetscScalar vala = av[j]; 985 if (colb == rowb) { 986 *(ba_d + bdiag_d[rowb]) = vala; 987 } else { 988 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 989 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 990 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 991 for (int j=0; j<nz ; j++) { 992 if (pbj[j] == colb) { 993 pba[j] = vala; 994 set++; 995 break; 996 } 997 } 998 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 999 } 1000 } 1001 }); 1002 }); 1003 Kokkos::fence(); 1004 1005 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1006 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1007 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1008 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1009 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1010 const PetscInt start = field*nloc, end = start + nloc; 1011 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1012 // A22 panel update for each row A(1,:) and col A(:,1) 1013 for (int ii=start; ii<end-1; ii++) { 1014 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1015 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1016 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1017 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1018 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1019 PetscInt kIdx = j*Ni + field_block_idx; 1020 if (kIdx >= nzUi) /* void */ ; 1021 else { 1022 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1023 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1024 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1025 size_t st_idx; 1026 // find and do L(k,i) = A(:k,i) / A(i,i) 1027 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1028 // get column, there has got to be a better way 1029 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1030 //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 1031 if (pjL[j] == ii) { 1032 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1033 idx = j; // output 1034 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1035 } 1036 }, st_idx); 1037 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1038 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 1039 else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1040 // U(i+1,:end) 1041 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1042 PetscScalar Uij = baUi[uiIdx]; 1043 PetscInt col = bjUi[uiIdx]; 1044 if (col==myk) { 1045 // A_kk = A_kk - L_ki * U_ij(k) 1046 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1047 *Akkv = *Akkv - L_ki() * Uij; // UiK 1048 } else { 1049 PetscScalar *start, *end, *pAkjv=NULL; 1050 PetscInt high, low; 1051 const PetscInt *startj; 1052 if (col<myk) { // L 1053 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1054 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1055 start = pLki+1; // start at pLki+1, A22(myk,1) 1056 startj= bj_d + bi_d[myk] + idx; 1057 end = ba_d + bi_d[myk+1]; 1058 } else { 1059 PetscInt idx = bdiag_d[myk+1]+1; 1060 start = ba_d + idx; 1061 startj= bj_d + idx; 1062 end = ba_d + bdiag_d[myk]; 1063 } 1064 // search for 'col', use bisection search - TODO 1065 low = 0; 1066 high = (PetscInt)(end-start); 1067 while (high-low > 5) { 1068 int t = (low+high)/2; 1069 if (startj[t] > col) high = t; 1070 else low = t; 1071 } 1072 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1073 if (startj[pAkjv-start] == col) break; 1074 } 1075 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 1076 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1077 } 1078 }); 1079 } 1080 } 1081 }); 1082 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1083 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1084 } /* endof for (i=0; i<n; i++) { */ 1085 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1086 }); 1087 Kokkos::fence(); 1088 Kokkos::deep_copy (h_flops_k, d_flops_k); 1089 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1090 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1091 #elif defined(PETSC_USE_LOG) 1092 ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1093 #endif 1094 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1095 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1096 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1097 /* Invert diagonal for simpler triangular solves */ 1098 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1099 int i = start + outer_index*Ni + lg_rank%Ni; 1100 if (i < end) { 1101 PetscScalar *pv = ba_d + bdiag_d[i]; 1102 *pv = 1.0/(*pv); 1103 } 1104 }); 1105 }); 1106 } 1107 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1108 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1109 #endif 1110 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1111 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1112 1113 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1114 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1115 if (b->inode.size) { 1116 B->ops->solve = MatSolve_SeqAIJ_Inode; 1117 } else if (row_identity && col_identity) { 1118 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1119 } else { 1120 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1121 } 1122 B->offloadmask = PETSC_OFFLOAD_GPU; 1123 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1124 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1125 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1126 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1127 B->ops->matsolve = MatMatSolve_SeqAIJ; 1128 B->assembled = PETSC_TRUE; 1129 B->preallocated = PETSC_TRUE; 1130 1131 PetscFunctionReturn(0); 1132 } 1133 1134 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1135 { 1136 PetscErrorCode ierr; 1137 1138 PetscFunctionBegin; 1139 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1140 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1141 PetscFunctionReturn(0); 1142 } 1143 1144 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1145 { 1146 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1147 1148 PetscFunctionBegin; 1149 if (!factors->sptrsv_symbolic_completed) { 1150 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1151 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1152 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1153 } 1154 PetscFunctionReturn(0); 1155 } 1156 1157 /* Check if we need to update factors etc for transpose solve */ 1158 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1159 { 1160 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1161 MatColumnIndexType n = A->rmap->n; 1162 1163 PetscFunctionBegin; 1164 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1165 /* Update L^T and do sptrsv symbolic */ 1166 factors->iLt_d = MatRowOffsetKokkosView("factors->iLt_d",n+1); 1167 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1168 factors->jLt_d = MatColumnIndexKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1169 factors->aLt_d = MatValueKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1170 1171 KokkosKernels::Impl::transpose_matrix< 1172 ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView, 1173 MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView, 1174 MatRowOffsetKokkosView,DefaultExecutionSpace>( 1175 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1176 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1177 1178 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1179 We have to sort the indices, until KK provides finer control options. 1180 */ 1181 KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace, 1182 MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>( 1183 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1184 1185 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1186 1187 /* Update U^T and do sptrsv symbolic */ 1188 factors->iUt_d = MatRowOffsetKokkosView("factors->iUt_d",n+1); 1189 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1190 factors->jUt_d = MatColumnIndexKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1191 factors->aUt_d = MatValueKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1192 1193 KokkosKernels::Impl::transpose_matrix< 1194 ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView, 1195 MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView, 1196 MatRowOffsetKokkosView,DefaultExecutionSpace>( 1197 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1198 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1199 1200 /* Sort indices. See comments above */ 1201 KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace, 1202 MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>( 1203 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1204 1205 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1206 factors->transpose_updated = PETSC_TRUE; 1207 } 1208 PetscFunctionReturn(0); 1209 } 1210 1211 /* Solve Ax = b, with A = LU */ 1212 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1213 { 1214 PetscErrorCode ierr; 1215 ConstPetscScalarKokkosView bv; 1216 PetscScalarKokkosView xv; 1217 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1218 1219 PetscFunctionBegin; 1220 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1221 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1222 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1223 /* Solve L tmpv = b */ 1224 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1225 /* Solve Ux = tmpv */ 1226 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1227 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1228 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1229 PetscFunctionReturn(0); 1230 } 1231 1232 /* Solve A^T x = b, with A^T = U^T L^T */ 1233 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1234 { 1235 PetscErrorCode ierr; 1236 ConstPetscScalarKokkosView bv; 1237 PetscScalarKokkosView xv; 1238 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1239 1240 PetscFunctionBegin; 1241 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1242 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1243 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1244 /* Solve U^T tmpv = b */ 1245 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1246 1247 /* Solve L^T x = tmpv */ 1248 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1249 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1250 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1251 PetscFunctionReturn(0); 1252 } 1253 1254 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1255 { 1256 PetscErrorCode ierr; 1257 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1258 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1259 PetscInt fill_lev = info->levels; 1260 1261 PetscFunctionBegin; 1262 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1263 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,aijkok->a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1264 1265 B->assembled = PETSC_TRUE; 1266 B->preallocated = PETSC_TRUE; 1267 B->ops->solve = MatSolve_SeqAIJKokkos; 1268 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1269 B->ops->matsolve = NULL; 1270 B->ops->matsolvetranspose = NULL; 1271 B->offloadmask = PETSC_OFFLOAD_GPU; 1272 1273 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1274 factors->transpose_updated = PETSC_FALSE; 1275 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1276 PetscFunctionReturn(0); 1277 } 1278 1279 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1280 { 1281 PetscErrorCode ierr; 1282 Mat_SeqAIJKokkos *aijkok; 1283 Mat_SeqAIJ *b; 1284 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1285 PetscInt fill_lev = info->levels; 1286 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1287 PetscInt n = A->rmap->n; 1288 1289 PetscFunctionBegin; 1290 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1291 /* Rebuild factors */ 1292 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1293 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1294 1295 /* Create a spiluk handle and then do symbolic factorization */ 1296 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1297 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1298 1299 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1300 1301 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1302 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1303 Kokkos::realloc(factors->iU_d,n+1); 1304 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1305 1306 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1307 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1308 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1309 1310 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1311 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1312 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1313 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1314 1315 /* TODO: add options to select sptrsv algorithms */ 1316 /* Create sptrsv handles for L, U and their transpose */ 1317 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1318 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1319 #else 1320 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1321 #endif 1322 1323 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1324 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1325 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1326 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1327 1328 /* Fill fields of the factor matrix B */ 1329 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1330 b = (Mat_SeqAIJ*)B->data; 1331 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1332 B->info.fill_ratio_given = info->fill; 1333 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1334 1335 B->offloadmask = PETSC_OFFLOAD_GPU; 1336 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1337 PetscFunctionReturn(0); 1338 } 1339 1340 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1341 { 1342 PetscErrorCode ierr; 1343 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1344 const PetscInt nrows = A->rmap->n; 1345 1346 PetscFunctionBegin; 1347 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1348 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1349 // move B data into Kokkos 1350 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1351 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1352 { 1353 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1354 if (!baijkok->diag_d) { 1355 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1356 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1357 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1358 } 1359 } 1360 PetscFunctionReturn(0); 1361 } 1362 1363 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1364 { 1365 PetscFunctionBegin; 1366 *type = MATSOLVERKOKKOS; 1367 PetscFunctionReturn(0); 1368 } 1369 1370 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1371 { 1372 PetscFunctionBegin; 1373 *type = MATSOLVERKOKKOSDEVICE; 1374 PetscFunctionReturn(0); 1375 } 1376 1377 /*MC 1378 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1379 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1380 1381 Level: beginner 1382 1383 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1384 M*/ 1385 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1386 { 1387 PetscErrorCode ierr; 1388 PetscInt n = A->rmap->n; 1389 1390 PetscFunctionBegin; 1391 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1392 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1393 (*B)->factortype = ftype; 1394 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1395 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1396 1397 if (ftype == MAT_FACTOR_LU) { 1398 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1399 (*B)->canuseordering = PETSC_TRUE; 1400 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1401 } else if (ftype == MAT_FACTOR_ILU) { 1402 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1403 (*B)->canuseordering = PETSC_FALSE; 1404 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1405 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1406 1407 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1408 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1409 PetscFunctionReturn(0); 1410 } 1411 1412 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1413 { 1414 PetscErrorCode ierr; 1415 PetscInt n = A->rmap->n; 1416 1417 PetscFunctionBegin; 1418 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1419 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1420 (*B)->factortype = ftype; 1421 (*B)->canuseordering = PETSC_TRUE; 1422 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1423 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1424 1425 if (ftype == MAT_FACTOR_LU) { 1426 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1427 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1428 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1429 1430 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1431 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1432 PetscFunctionReturn(0); 1433 } 1434 1435 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1436 { 1437 PetscErrorCode ierr; 1438 1439 PetscFunctionBegin; 1440 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1441 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1442 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1443 PetscFunctionReturn(0); 1444 } 1445 1446