1 #include <petscvec_kokkos.hpp> 2 #include <petsc/private/petscimpl.h> 3 #include <petscsystypes.h> 4 #include <petscerror.h> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosBlas.hpp> 8 #include <KokkosSparse_CrsMatrix.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <KokkosSparse_spiluk.hpp> 11 #include <KokkosSparse_sptrsv.hpp> 12 13 #include <../src/mat/impls/aij/seq/aij.h> 14 15 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 16 #include <petscmat.h> 17 18 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 19 20 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 21 { 22 PetscErrorCode ierr; 23 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 24 25 PetscFunctionBegin; 26 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 27 A->offloadmask = PETSC_OFFLOAD_CPU; 28 if (aijkok && aijkok->device_mat_d.data()) { 29 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 30 } 31 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 32 PetscFunctionReturn(0); 33 } 34 35 /* Sync CSR data to device if not yet */ 36 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 37 { 38 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ*>(A->data); 39 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 40 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 41 42 PetscFunctionBegin; 43 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 44 /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */ 45 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 46 delete aijkok; 47 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 48 aijkok->nonzerostate = A->nonzerostate; 49 A->spptr = aijkok; 50 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 51 aijkok->a_dual.clear_sync_state(); 52 aijkok->a_dual.modify_host(); /* Mark host is modified */ 53 aijkok->a_dual.sync_device(); /* Sync the device */ 54 aijkok->transpose_updated = PETSC_FALSE; 55 aijkok->hermitian_updated = PETSC_FALSE; 56 } 57 A->offloadmask = PETSC_OFFLOAD_BOTH; 58 PetscFunctionReturn(0); 59 } 60 61 /* Mark the CSR data on device is modified */ 62 static PetscErrorCode MatSeqAIJKokkosSetDeviceModified(Mat A) 63 { 64 PetscErrorCode ierr; 65 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 66 67 PetscFunctionBegin; 68 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 69 aijkok->a_dual.clear_sync_state(); 70 aijkok->a_dual.modify_device(); 71 aijkok->transpose_updated = PETSC_FALSE; 72 aijkok->hermitian_updated = PETSC_FALSE; 73 A->offloadmask = PETSC_OFFLOAD_GPU; 74 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 75 PetscFunctionReturn(0); 76 } 77 78 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 79 { 80 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 81 82 PetscFunctionBegin; 83 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 84 /* We do not expect one needs factors on host */ 85 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 86 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 87 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 88 aijkok->a_dual.clear_sync_state(); 89 aijkok->a_dual.modify_device(); /* Mark device is modified */ 90 aijkok->a_dual.sync_host(); /* Sync the host */ 91 A->offloadmask = PETSC_OFFLOAD_BOTH; 92 } 93 PetscFunctionReturn(0); 94 } 95 96 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 97 { 98 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 99 PetscErrorCode ierr; 100 101 PetscFunctionBegin; 102 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 103 *array = a->a; 104 A->offloadmask = PETSC_OFFLOAD_CPU; 105 PetscFunctionReturn(0); 106 } 107 108 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 109 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat) 110 { 111 Mat_SeqAIJKokkos *aijkok; 112 Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat); 113 114 PetscFunctionBegin; 115 // ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 116 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 117 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 118 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 119 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 120 PetscFunctionReturn(0); 121 } 122 123 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 124 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat) 125 { 126 Mat_SeqAIJKokkos *aijkok; 127 128 PetscFunctionBegin; 129 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 130 if (aijkok && aijkok->device_mat_d.data()) { 131 *d_mat = aijkok->device_mat_d.data(); 132 } else { 133 PetscErrorCode ierr; 134 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 135 *d_mat = NULL; 136 } 137 PetscFunctionReturn(0); 138 } 139 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A) 140 { 141 PetscErrorCode ierr; 142 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 143 144 PetscFunctionBegin; 145 if (!aijkok->At) { /* Generate At for the first time */ 146 ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr); 147 ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 148 } else if (!aijkok->transpose_updated) { /* Only update At values */ 149 ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr); 150 ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 151 } 152 aijkok->transpose_updated = PETSC_TRUE; 153 PetscFunctionReturn(0); 154 } 155 156 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian(Mat A) 157 { 158 PetscErrorCode ierr; 159 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 160 161 PetscFunctionBegin; 162 if (!aijkok->Ah) { /* Generate Ah for the first time */ 163 ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 164 ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 165 ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 166 } else if (!aijkok->hermitian_updated) { /* Only update Ah values */ 167 ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 168 ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 169 ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 170 } 171 aijkok->hermitian_updated = PETSC_TRUE; 172 PetscFunctionReturn(0); 173 } 174 175 /* y = A x */ 176 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 177 { 178 PetscErrorCode ierr; 179 Mat_SeqAIJKokkos *aijkok; 180 ConstPetscScalarKokkosView xv; 181 PetscScalarKokkosView yv; 182 183 PetscFunctionBegin; 184 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 185 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 186 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 187 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 189 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 190 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 191 ierr = WaitForKokkos();CHKERRQ(ierr); 192 /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 193 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 194 PetscFunctionReturn(0); 195 } 196 197 /* y = A^T x */ 198 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 199 { 200 PetscErrorCode ierr; 201 Mat_SeqAIJKokkos *aijkok; 202 Mat B; 203 const char *mode; 204 ConstPetscScalarKokkosView xv; 205 PetscScalarKokkosView yv; 206 207 PetscFunctionBegin; 208 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 209 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 210 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 211 if (A->form_explicit_transpose) { 212 ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 213 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 214 mode = "N"; 215 } else { 216 B = A; 217 mode = "T"; 218 } 219 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 220 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 221 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 222 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 223 ierr = WaitForKokkos();CHKERRQ(ierr); 224 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 /* y = A^H x */ 229 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 230 { 231 PetscErrorCode ierr; 232 Mat_SeqAIJKokkos *aijkok; 233 Mat B; 234 const char *mode; 235 ConstPetscScalarKokkosView xv; 236 PetscScalarKokkosView yv; 237 238 PetscFunctionBegin; 239 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 240 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 241 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 242 if (A->form_explicit_transpose) { 243 ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr); 244 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 245 mode = "N"; 246 } else { 247 B = A; 248 mode = "C"; 249 } 250 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 251 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 252 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 253 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 254 ierr = WaitForKokkos();CHKERRQ(ierr); 255 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 256 PetscFunctionReturn(0); 257 } 258 259 /* z = A x + y */ 260 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 261 { 262 PetscErrorCode ierr; 263 Mat_SeqAIJKokkos *aijkok; 264 ConstPetscScalarKokkosView xv,yv; 265 PetscScalarKokkosView zv; 266 267 PetscFunctionBegin; 268 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 269 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 270 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 271 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 272 if (zz != yy) Kokkos::deep_copy(zv,yv); 273 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 274 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 275 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 276 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 277 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 278 ierr = WaitForKokkos();CHKERRQ(ierr); 279 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 280 PetscFunctionReturn(0); 281 } 282 283 /* z = A^T x + y */ 284 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 285 { 286 PetscErrorCode ierr; 287 Mat_SeqAIJKokkos *aijkok; 288 Mat B; 289 const char *mode; 290 ConstPetscScalarKokkosView xv,yv; 291 PetscScalarKokkosView zv; 292 293 PetscFunctionBegin; 294 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 295 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 296 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 297 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 298 if (zz != yy) Kokkos::deep_copy(zv,yv); 299 if (A->form_explicit_transpose) { 300 ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 301 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 302 mode = "N"; 303 } else { 304 B = A; 305 mode = "T"; 306 } 307 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 308 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 309 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 310 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 311 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 312 ierr = WaitForKokkos();CHKERRQ(ierr); 313 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 314 PetscFunctionReturn(0); 315 } 316 317 /* z = A^H x + y */ 318 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 319 { 320 PetscErrorCode ierr; 321 Mat_SeqAIJKokkos *aijkok; 322 Mat B; 323 const char *mode; 324 ConstPetscScalarKokkosView xv,yv; 325 PetscScalarKokkosView zv; 326 327 PetscFunctionBegin; 328 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 329 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 330 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 331 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 332 if (zz != yy) Kokkos::deep_copy(zv,yv); 333 if (A->form_explicit_transpose) { 334 ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr); 335 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 336 mode = "N"; 337 } else { 338 B = A; 339 mode = "C"; 340 } 341 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 342 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 343 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 344 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 345 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 346 ierr = WaitForKokkos();CHKERRQ(ierr); 347 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348 PetscFunctionReturn(0); 349 } 350 351 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 352 { 353 PetscErrorCode ierr; 354 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 355 356 PetscFunctionBegin; 357 switch (op) { 358 case MAT_FORM_EXPLICIT_TRANSPOSE: 359 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 360 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 361 A->form_explicit_transpose = flg; 362 break; 363 default: 364 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 365 break; 366 } 367 PetscFunctionReturn(0); 368 } 369 370 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 371 { 372 PetscErrorCode ierr; 373 Mat B; 374 Mat_SeqAIJ *aij; 375 376 PetscFunctionBegin; 377 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 378 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 379 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 380 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 381 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 382 } 383 384 B = *newmat; 385 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 386 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 387 388 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 389 ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 390 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 391 /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 392 aij = (Mat_SeqAIJ*)B->data; 393 aij->inode.use = PETSC_FALSE; 394 395 PetscFunctionReturn(0); 396 } 397 398 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 399 { 400 PetscErrorCode ierr; 401 402 PetscFunctionBegin; 403 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 404 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 405 PetscFunctionReturn(0); 406 } 407 408 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 409 { 410 PetscErrorCode ierr; 411 Mat_SeqAIJKokkos *aijkok; 412 413 PetscFunctionBegin; 414 if (A->factortype == MAT_FACTOR_NONE) { 415 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 416 if (aijkok) { 417 if (aijkok->device_mat_d.data()) { 418 delete aijkok->colmap_d; 419 delete aijkok->i_uncompressed_d; 420 } 421 if (aijkok->diag_d) delete aijkok->diag_d; 422 } 423 delete aijkok; 424 } else { 425 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 426 } 427 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 428 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 429 A->spptr = NULL; 430 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 431 PetscFunctionReturn(0); 432 } 433 434 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 435 { 436 PetscErrorCode ierr; 437 438 PetscFunctionBegin; 439 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 440 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 441 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 442 PetscFunctionReturn(0); 443 } 444 445 #if 0 446 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 447 { 448 MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 449 450 PetscFunctionBegin; 451 delete kh; 452 PetscFunctionReturn(0); 453 } 454 455 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 456 { 457 Mat_Product *product = C->product; 458 Mat A,B; 459 MatProductType ptype; 460 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 461 bool tA,tB; 462 PetscErrorCode ierr; 463 MatMatKernelHandle_t *kh; 464 Mat_SeqAIJ *c; 465 466 PetscFunctionBegin; 467 MatCheckProduct(C,1); 468 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 469 A = product->A; 470 B = product->B; 471 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 472 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 473 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 474 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 475 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 476 kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 477 ptype = product->type; 478 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 479 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 480 switch (ptype) { 481 case MATPRODUCT_AB: 482 tA = false; 483 tB = false; 484 break; 485 case MATPRODUCT_AtB: 486 tA = true; 487 tB = false; 488 break; 489 case MATPRODUCT_ABt: 490 tA = false; 491 tB = true; 492 break; 493 default: 494 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 495 } 496 497 KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 498 C->offloadmask = PETSC_OFFLOAD_GPU; 499 /* shorter version of MatAssemblyEnd_SeqAIJ */ 500 c = (Mat_SeqAIJ*)C->data; 501 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 502 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 503 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 504 c->reallocs = 0; 505 C->info.mallocs += 0; 506 C->info.nz_unneeded = 0; 507 C->assembled = C->was_assembled = PETSC_TRUE; 508 C->num_ass++; 509 /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 510 // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 511 C->offloadmask = PETSC_OFFLOAD_BOTH; 512 // Also, we should add support to copy back from device to host 513 PetscFunctionReturn(0); 514 } 515 516 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 517 { 518 Mat_Product *product = C->product; 519 Mat A,B; 520 MatProductType ptype; 521 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 522 PetscInt m,n,k; 523 bool tA,tB; 524 PetscErrorCode ierr; 525 Mat_SeqAIJ *c; 526 MatMatKernelHandle_t *kh; 527 528 PetscFunctionBegin; 529 MatCheckProduct(C,1); 530 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 531 A = product->A; 532 B = product->B; 533 // TODO only copy the i,j data, not the values 534 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 535 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 536 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 537 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 538 ptype = product->type; 539 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 540 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 541 switch (ptype) { 542 case MATPRODUCT_AB: 543 tA = false; 544 tB = false; 545 m = A->rmap->n; 546 n = B->cmap->n; 547 break; 548 case MATPRODUCT_AtB: 549 tA = true; 550 tB = false; 551 m = A->cmap->n; 552 n = B->cmap->n; 553 break; 554 case MATPRODUCT_ABt: 555 tA = false; 556 tB = true; 557 m = A->rmap->n; 558 n = B->rmap->n; 559 break; 560 default: 561 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 562 } 563 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 564 ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 565 c = (Mat_SeqAIJ*)C->data; 566 567 kh = new MatMatKernelHandle_t; 568 // TODO SZ: ADD RUNTIME SELECTION OF THESE 569 kh->set_team_work_size(16); 570 kh->set_dynamic_scheduling(true); 571 // Select an spgemm algorithm, limited by configuration at compile-time and 572 // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 573 // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 574 std::string myalg("SPGEMM_KK_MEMORY"); 575 kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 576 577 ///////////////////////////////////// 578 // TODO JZ 579 ckok = NULL; //new Mat_SeqAIJKokkos(); 580 C->spptr = ckok; 581 KokkosCsrMatrix_t ccsr; // here only to have the code compile 582 KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 583 //cerr = WaitForKOKKOS();CHKERRCUDA(cerr); 584 //c->nz = get_nnz_from_ccsr 585 ////////////////////////////////////// 586 c->singlemalloc = PETSC_FALSE; 587 c->free_a = PETSC_TRUE; 588 c->free_ij = PETSC_TRUE; 589 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 590 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 591 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 592 //////////////////////////////////// 593 // TODO JZ copy from device to c->i and c->j 594 //////////////////////////////////// 595 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 596 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 597 c->maxnz = c->nz; 598 c->nonzerorowcnt = 0; 599 c->rmax = 0; 600 for (k = 0; k < m; k++) { 601 const PetscInt nn = c->i[k+1] - c->i[k]; 602 c->ilen[k] = c->imax[k] = nn; 603 c->nonzerorowcnt += (PetscInt)!!nn; 604 c->rmax = PetscMax(c->rmax,nn); 605 } 606 607 C->nonzerostate++; 608 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 609 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 610 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 611 ckok->nonzerostate = C->nonzerostate; 612 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 613 C->preallocated = PETSC_TRUE; 614 C->assembled = PETSC_FALSE; 615 C->was_assembled = PETSC_FALSE; 616 617 C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 618 C->product->data = kh; 619 C->product->destroy = MatMatKernelHandleDestroy_Private; 620 PetscFunctionReturn(0); 621 } 622 623 /* handles sparse matrix matrix ops */ 624 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 625 { 626 Mat_Product *product = mat->product; 627 PetscErrorCode ierr; 628 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 629 630 PetscFunctionBegin; 631 MatCheckProduct(mat,1); 632 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 633 if (product->type == MATPRODUCT_ABC) { 634 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 635 } 636 if (Biskok && Ciskok) { 637 switch (product->type) { 638 case MATPRODUCT_AB: 639 case MATPRODUCT_AtB: 640 case MATPRODUCT_ABt: 641 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 642 break; 643 case MATPRODUCT_PtAP: 644 case MATPRODUCT_RARt: 645 case MATPRODUCT_ABC: 646 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 647 break; 648 default: 649 break; 650 } 651 } else { /* fallback for AIJ */ 652 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 653 } 654 PetscFunctionReturn(0); 655 } 656 #endif 657 658 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 659 { 660 PetscErrorCode ierr; 661 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 662 PetscFunctionBegin; 663 if (aijkok && aijkok->device_mat_d.data()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 664 ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 665 PetscFunctionReturn(0); 666 } 667 668 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 669 { 670 PetscErrorCode ierr; 671 Mat_SeqAIJKokkos *aijkok; 672 673 PetscFunctionBegin; 674 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 675 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 676 KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 677 A->offloadmask = PETSC_OFFLOAD_GPU; 678 ierr = WaitForKokkos();CHKERRQ(ierr); 679 ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 680 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 681 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 682 PetscFunctionReturn(0); 683 } 684 685 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 686 { 687 PetscErrorCode ierr; 688 PetscBool both = PETSC_FALSE; 689 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 690 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 691 692 PetscFunctionBegin; 693 if (aijkok && aijkok->a_d.data()) { 694 KokkosBlas::fill(aijkok->a_d,0.0); 695 both = PETSC_TRUE; 696 } 697 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 698 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 699 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 700 else A->offloadmask = PETSC_OFFLOAD_CPU; 701 PetscFunctionReturn(0); 702 } 703 704 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 705 { 706 PetscErrorCode ierr; 707 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 708 709 PetscFunctionBegin; 710 if (X->ops->axpy != Y->ops->axpy) { 711 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 712 PetscFunctionReturn(0); 713 } 714 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 715 PetscBool e; 716 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 717 if (e) { 718 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 719 if (e) str = SAME_NONZERO_PATTERN; 720 } 721 } 722 if (str != SAME_NONZERO_PATTERN) { 723 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 724 PetscFunctionReturn(0); 725 } else { 726 if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) { 727 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 728 PetscFunctionReturn(0); 729 } else { 730 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 731 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 732 } 733 Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 734 Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 735 if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) { 736 KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d); 737 Y->offloadmask = PETSC_OFFLOAD_GPU; 738 ierr = WaitForKokkos();CHKERRQ(ierr); 739 ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr); 740 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 741 ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr); 742 } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???"); 743 } 744 PetscFunctionReturn(0); 745 } 746 747 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 748 { 749 PetscErrorCode ierr; 750 751 PetscFunctionBegin; 752 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 753 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 754 B->offloadmask = PETSC_OFFLOAD_CPU; 755 PetscFunctionReturn(0); 756 } 757 758 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 759 { 760 PetscFunctionBegin; 761 A->boundtocpu = PETSC_FALSE; 762 763 A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 764 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 765 A->ops->destroy = MatDestroy_SeqAIJKokkos; 766 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 767 A->ops->axpy = MatAXPY_SeqAIJKokkos; 768 A->ops->scale = MatScale_SeqAIJKokkos; 769 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 770 //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 771 A->ops->mult = MatMult_SeqAIJKokkos; 772 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 773 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 774 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 775 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 776 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 777 A->ops->setoption = MatSetOption_SeqAIJKokkos; 778 PetscFunctionReturn(0); 779 } 780 781 /* --------------------------------------------------------------------------------*/ 782 /*@C 783 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 784 (the default parallel PETSc format). This matrix will ultimately be handled by 785 Kokkos for calculations. For good matrix 786 assembly performance the user should preallocate the matrix storage by setting 787 the parameter nz (or the array nnz). By setting these parameters accurately, 788 performance during matrix assembly can be increased by more than a factor of 50. 789 790 Collective 791 792 Input Parameters: 793 + comm - MPI communicator, set to PETSC_COMM_SELF 794 . m - number of rows 795 . n - number of columns 796 . nz - number of nonzeros per row (same for all rows) 797 - nnz - array containing the number of nonzeros in the various rows 798 (possibly different for each row) or NULL 799 800 Output Parameter: 801 . A - the matrix 802 803 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 804 MatXXXXSetPreallocation() paradgm instead of this routine directly. 805 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 806 807 Notes: 808 If nnz is given then nz is ignored 809 810 The AIJ format (also called the Yale sparse matrix format or 811 compressed row storage), is fully compatible with standard Fortran 77 812 storage. That is, the stored row and column indices can begin at 813 either one (as in Fortran) or zero. See the users' manual for details. 814 815 Specify the preallocated storage with either nz or nnz (not both). 816 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 817 allocation. For large problems you MUST preallocate memory or you 818 will get TERRIBLE performance, see the users' manual chapter on matrices. 819 820 By default, this format uses inodes (identical nodes) when possible, to 821 improve numerical efficiency of matrix-vector products and solves. We 822 search for consecutive rows with the same nonzero structure, thereby 823 reusing matrix information to achieve increased efficiency. 824 825 Level: intermediate 826 827 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 828 @*/ 829 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 830 { 831 PetscErrorCode ierr; 832 833 PetscFunctionBegin; 834 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 835 ierr = MatCreate(comm,A);CHKERRQ(ierr); 836 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 837 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 838 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 839 PetscFunctionReturn(0); 840 } 841 842 // factorizations 843 typedef Kokkos::TeamPolicy<>::member_type team_member; 844 // 845 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 846 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 847 // 848 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 849 { 850 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 851 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 852 IS isrow = b->row,isicol = b->icol; 853 PetscErrorCode ierr; 854 const PetscInt *r_h,*ic_h; 855 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data(); 856 const PetscScalar *aa_d = aijkok->a_d.data(); 857 PetscScalar *ba_d = baijkok->a_d.data(); 858 PetscBool row_identity,col_identity; 859 PetscInt nc, Nf, nVec=32; // should be a parameter 860 PetscContainer container; 861 862 PetscFunctionBegin; 863 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 864 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 865 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 866 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 867 if (container) { 868 PetscInt *pNf=NULL, nv; 869 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 870 Nf = (*pNf)%1000; 871 nv = (*pNf)/1000; 872 if (nv>0) nVec = nv; 873 } else Nf = 1; 874 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 875 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 876 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 877 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 878 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 879 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 880 #endif 881 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 882 { 883 #define KOKKOS_SHARED_LEVEL 1 884 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 885 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 886 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 887 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 888 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 889 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 890 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 891 size_t flops_h = 0.0; 892 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 893 Kokkos::View<size_t> d_flops_k ("flops"); 894 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 895 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 896 Kokkos::deep_copy (d_flops_k, h_flops_k); 897 Kokkos::deep_copy (d_r_k, h_r_k); 898 Kokkos::deep_copy (d_ic_k, h_ic_k); 899 // Fill A --> fact 900 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 901 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda 902 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 903 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 904 // zero rows of B 905 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 906 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 907 PetscScalar *baL = ba_d + bi_d[rowb]; 908 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 909 /* zero (unfactored row) */ 910 for (int j=0;j<nzbL;j++) baL[j] = 0; 911 for (int j=0;j<nzbU;j++) baU[j] = 0; 912 }); 913 // copy A into B 914 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 915 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 916 const PetscScalar *av = aa_d + ai_d[rowa]; 917 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 918 /* load in initial (unfactored row) */ 919 for (int j=0;j<nza;j++) { 920 PetscInt colb = ic[ajtmp[j]]; 921 PetscScalar vala = av[j]; 922 if (colb == rowb) { 923 *(ba_d + bdiag_d[rowb]) = vala; 924 } else { 925 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 926 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 927 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 928 for (int j=0; j<nz ; j++) { 929 if (pbj[j] == colb) { 930 pba[j] = vala; 931 set++; 932 break; 933 } 934 } 935 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 936 } 937 } 938 }); 939 }); 940 Kokkos::fence(); 941 942 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 943 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 944 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 945 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 946 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda 947 const PetscInt start = field*nloc, end = start + nloc; 948 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 949 // A22 panel update for each row A(1,:) and col A(:,1) 950 for (int ii=start; ii<end-1; ii++) { 951 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 952 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 953 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 954 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 955 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 956 PetscInt kIdx = j*Ni + field_block_idx; 957 if (kIdx >= nzUi) /* void */ ; 958 else { 959 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 960 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 961 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 962 size_t st_idx; 963 // find and do L(k,i) = A(:k,i) / A(i,i) 964 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 965 // get column, there has got to be a better way 966 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 967 //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 968 if (pjL[j] == ii) { 969 PetscScalar *pLki = ba_d + bi_d[myk] + j; 970 idx = j; // output 971 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 972 } 973 }, st_idx); 974 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 975 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 976 else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 977 // U(i+1,:end) 978 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 979 PetscScalar Uij = baUi[uiIdx]; 980 PetscInt col = bjUi[uiIdx]; 981 if (col==myk) { 982 // A_kk = A_kk - L_ki * U_ij(k) 983 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 984 *Akkv = *Akkv - L_ki() * Uij; // UiK 985 } else { 986 PetscScalar *start, *end, *pAkjv=NULL; 987 PetscInt high, low; 988 const PetscInt *startj; 989 if (col<myk) { // L 990 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 991 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 992 start = pLki+1; // start at pLki+1, A22(myk,1) 993 startj= bj_d + bi_d[myk] + idx; 994 end = ba_d + bi_d[myk+1]; 995 } else { 996 PetscInt idx = bdiag_d[myk+1]+1; 997 start = ba_d + idx; 998 startj= bj_d + idx; 999 end = ba_d + bdiag_d[myk]; 1000 } 1001 // search for 'col', use bisection search - TODO 1002 low = 0; 1003 high = (PetscInt)(end-start); 1004 while (high-low > 5) { 1005 int t = (low+high)/2; 1006 if (startj[t] > col) high = t; 1007 else low = t; 1008 } 1009 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1010 if (startj[pAkjv-start] == col) break; 1011 } 1012 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 1013 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1014 } 1015 }); 1016 } 1017 } 1018 }); 1019 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1020 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1021 } /* endof for (i=0; i<n; i++) { */ 1022 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1023 }); 1024 Kokkos::fence(); 1025 Kokkos::deep_copy (h_flops_k, d_flops_k); 1026 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1027 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1028 #elif defined(PETSC_USE_LOG) 1029 ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1030 #endif 1031 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1032 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1033 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1034 /* Invert diagonal for simpler triangular solves */ 1035 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1036 int i = start + outer_index*Ni + lg_rank%Ni; 1037 if (i < end) { 1038 PetscScalar *pv = ba_d + bdiag_d[i]; 1039 *pv = 1.0/(*pv); 1040 } 1041 }); 1042 }); 1043 } 1044 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1045 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1046 #endif 1047 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1048 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1049 1050 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1051 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1052 if (b->inode.size) { 1053 B->ops->solve = MatSolve_SeqAIJ_Inode; 1054 } else if (row_identity && col_identity) { 1055 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1056 } else { 1057 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1058 } 1059 B->offloadmask = PETSC_OFFLOAD_GPU; 1060 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1061 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1062 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1063 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1064 B->ops->matsolve = MatMatSolve_SeqAIJ; 1065 B->assembled = PETSC_TRUE; 1066 B->preallocated = PETSC_TRUE; 1067 1068 PetscFunctionReturn(0); 1069 } 1070 1071 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1072 { 1073 PetscErrorCode ierr; 1074 1075 PetscFunctionBegin; 1076 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1077 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1078 PetscFunctionReturn(0); 1079 } 1080 1081 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1082 { 1083 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1084 1085 PetscFunctionBegin; 1086 if (!factors->sptrsv_symbolic_completed) { 1087 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1088 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1089 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1090 } 1091 PetscFunctionReturn(0); 1092 } 1093 1094 /* Check if we need to update factors etc for transpose solve */ 1095 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1096 { 1097 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1098 MatColumnIndexType n = A->rmap->n; 1099 1100 PetscFunctionBegin; 1101 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1102 /* Update L^T and do sptrsv symbolic */ 1103 factors->iLt_d = MatRowOffsetKokkosView("factors->iLt_d",n+1); 1104 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1105 factors->jLt_d = MatColumnIndexKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1106 factors->aLt_d = MatValueKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1107 1108 KokkosKernels::Impl::transpose_matrix< 1109 ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView, 1110 MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView, 1111 MatRowOffsetKokkosView,DefaultExecutionSpace>( 1112 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1113 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1114 1115 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1116 We have to sort the indices, until KK provides finer control options. 1117 */ 1118 KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace, 1119 MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>( 1120 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1121 1122 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1123 1124 /* Update U^T and do sptrsv symbolic */ 1125 factors->iUt_d = MatRowOffsetKokkosView("factors->iUt_d",n+1); 1126 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1127 factors->jUt_d = MatColumnIndexKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1128 factors->aUt_d = MatValueKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1129 1130 KokkosKernels::Impl::transpose_matrix< 1131 ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView, 1132 MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView, 1133 MatRowOffsetKokkosView,DefaultExecutionSpace>( 1134 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1135 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1136 1137 /* Sort indices. See comments above */ 1138 KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace, 1139 MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>( 1140 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1141 1142 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1143 factors->transpose_updated = PETSC_TRUE; 1144 } 1145 PetscFunctionReturn(0); 1146 } 1147 1148 /* Solve Ax = b, with A = LU */ 1149 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1150 { 1151 PetscErrorCode ierr; 1152 ConstPetscScalarKokkosView bv; 1153 PetscScalarKokkosView xv; 1154 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1155 1156 PetscFunctionBegin; 1157 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1158 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1159 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1160 /* Solve L tmpv = b */ 1161 KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector); 1162 /* Solve Ux = tmpv */ 1163 KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv); 1164 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1165 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1166 PetscFunctionReturn(0); 1167 } 1168 1169 /* Solve A^T x = b, with A^T = U^T L^T */ 1170 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1171 { 1172 PetscErrorCode ierr; 1173 ConstPetscScalarKokkosView bv; 1174 PetscScalarKokkosView xv; 1175 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1176 1177 PetscFunctionBegin; 1178 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1179 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1180 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1181 /* Solve U^T tmpv = b */ 1182 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1183 1184 /* Solve L^T x = tmpv */ 1185 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1186 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1187 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1188 PetscFunctionReturn(0); 1189 } 1190 1191 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1192 { 1193 PetscErrorCode ierr; 1194 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1195 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1196 PetscInt fill_lev = info->levels; 1197 1198 PetscFunctionBegin; 1199 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1200 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,aijkok->a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1201 1202 B->assembled = PETSC_TRUE; 1203 B->preallocated = PETSC_TRUE; 1204 B->ops->solve = MatSolve_SeqAIJKokkos; 1205 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1206 B->ops->matsolve = NULL; 1207 B->ops->matsolvetranspose = NULL; 1208 B->offloadmask = PETSC_OFFLOAD_GPU; 1209 1210 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1211 factors->transpose_updated = PETSC_FALSE; 1212 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1213 PetscFunctionReturn(0); 1214 } 1215 1216 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1217 { 1218 PetscErrorCode ierr; 1219 Mat_SeqAIJKokkos *aijkok; 1220 Mat_SeqAIJ *b; 1221 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1222 PetscInt fill_lev = info->levels; 1223 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1224 PetscInt n = A->rmap->n; 1225 1226 PetscFunctionBegin; 1227 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1228 /* Rebuild factors */ 1229 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1230 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1231 1232 /* Create a spiluk handle and then do symbolic factorization */ 1233 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1234 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1235 1236 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1237 1238 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1239 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1240 Kokkos::realloc(factors->iU_d,n+1); 1241 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1242 1243 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1244 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1245 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1246 1247 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1248 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1249 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1250 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1251 1252 /* TODO: add options to select sptrsv algorithms */ 1253 /* Create sptrsv handles for L, U and their transpose */ 1254 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1255 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1256 #else 1257 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1258 #endif 1259 1260 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1261 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1262 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1263 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1264 1265 /* Fill fields of the factor matrix B */ 1266 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1267 b = (Mat_SeqAIJ*)B->data; 1268 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1269 B->info.fill_ratio_given = info->fill; 1270 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1271 1272 B->offloadmask = PETSC_OFFLOAD_GPU; 1273 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1274 PetscFunctionReturn(0); 1275 } 1276 1277 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1278 { 1279 PetscErrorCode ierr; 1280 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1281 const PetscInt nrows = A->rmap->n; 1282 1283 PetscFunctionBegin; 1284 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1285 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1286 // move B data into Kokkos 1287 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1288 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1289 { 1290 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1291 if (!baijkok->diag_d) { 1292 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1293 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1294 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1295 } 1296 } 1297 PetscFunctionReturn(0); 1298 } 1299 1300 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1301 { 1302 PetscFunctionBegin; 1303 *type = MATSOLVERKOKKOS; 1304 PetscFunctionReturn(0); 1305 } 1306 1307 // use -pc_factor_mat_solver_type kokkosdevice 1308 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1309 { 1310 PetscFunctionBegin; 1311 *type = MATSOLVERKOKKOSDEVICE; 1312 PetscFunctionReturn(0); 1313 } 1314 1315 /*MC 1316 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1317 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1318 1319 Level: beginner 1320 1321 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1322 M*/ 1323 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1324 { 1325 PetscErrorCode ierr; 1326 PetscInt n = A->rmap->n; 1327 1328 PetscFunctionBegin; 1329 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1330 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1331 (*B)->factortype = ftype; 1332 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1333 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1334 1335 if (ftype == MAT_FACTOR_LU) { 1336 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1337 (*B)->canuseordering = PETSC_TRUE; 1338 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1339 } else if (ftype == MAT_FACTOR_ILU) { 1340 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1341 (*B)->canuseordering = PETSC_FALSE; 1342 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1343 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1344 1345 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1346 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1347 PetscFunctionReturn(0); 1348 } 1349 1350 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1351 { 1352 PetscErrorCode ierr; 1353 PetscInt n = A->rmap->n; 1354 1355 PetscFunctionBegin; 1356 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1357 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1358 (*B)->factortype = ftype; 1359 (*B)->canuseordering = PETSC_TRUE; 1360 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1361 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1362 1363 if (ftype == MAT_FACTOR_LU) { 1364 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1365 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1366 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1367 1368 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1369 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1370 PetscFunctionReturn(0); 1371 } 1372 1373 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1374 { 1375 PetscErrorCode ierr; 1376 1377 PetscFunctionBegin; 1378 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1379 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1380 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1381 PetscFunctionReturn(0); 1382 } 1383 1384