1 #include <petsc/private/petscimpl.h> 2 #include <petscsystypes.h> 3 #include <petscerror.h> 4 #include <petscvec_kokkos.hpp> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosBlas.hpp> 8 #include <KokkosSparse_CrsMatrix.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <../src/mat/impls/aij/seq/aij.h> 11 12 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 13 #include <petscmat.h> 14 15 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 16 17 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 18 { 19 PetscErrorCode ierr; 20 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 21 22 PetscFunctionBegin; 23 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 24 if (aijkok && aijkok->device_mat_d.data()) { 25 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 26 } 27 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 28 PetscFunctionReturn(0); 29 } 30 31 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 32 { 33 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ*>(A->data); 34 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 35 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 36 37 PetscFunctionBegin; 38 /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */ 39 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 40 delete aijkok; 41 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 42 aijkok->nonzerostate = A->nonzerostate; 43 A->spptr = aijkok; 44 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 45 // Kokkos::deep_copy(aijkok->a_d,aijkok->a_h); 46 aijkok->a_dual.modify_host(); /* Mark host is modified */ 47 aijkok->a_dual.sync_device(); /* Sync the device */ 48 aijkok->need_sync_At_values = PETSC_TRUE; /* Mark matT and matH need to sync, but don't sync them now */ 49 aijkok->need_sync_Ah_values = PETSC_TRUE; 50 } 51 A->offloadmask = PETSC_OFFLOAD_BOTH; 52 PetscFunctionReturn(0); 53 } 54 55 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 56 { 57 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 58 59 PetscFunctionBegin; 60 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 61 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 62 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 63 Kokkos::deep_copy(aijkok->a_h,aijkok->a_d); 64 A->offloadmask = PETSC_OFFLOAD_BOTH; 65 } 66 PetscFunctionReturn(0); 67 } 68 69 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 70 { 71 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 72 PetscErrorCode ierr; 73 74 PetscFunctionBegin; 75 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 76 *array = a->a; 77 A->offloadmask = PETSC_OFFLOAD_CPU; 78 PetscFunctionReturn(0); 79 } 80 81 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 82 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat) 83 { 84 Mat_SeqAIJKokkos *aijkok; 85 Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat); 86 87 PetscFunctionBegin; 88 // ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 89 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 90 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 91 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 92 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 93 PetscFunctionReturn(0); 94 } 95 96 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 97 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat) 98 { 99 Mat_SeqAIJKokkos *aijkok; 100 101 PetscFunctionBegin; 102 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 103 if (aijkok && aijkok->device_mat_d.data()) { 104 *d_mat = aijkok->device_mat_d.data(); 105 } else { 106 PetscErrorCode ierr; 107 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 108 *d_mat = NULL; 109 } 110 PetscFunctionReturn(0); 111 } 112 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A) 113 { 114 PetscErrorCode ierr; 115 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 116 117 PetscFunctionBegin; 118 if (!aijkok->At) { /* Generate At for the first time */ 119 ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr); 120 ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 121 } else if (aijkok->need_sync_At_values) { /* Only update At values */ 122 ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr); 123 ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 124 } 125 aijkok->need_sync_At_values = PETSC_FALSE; 126 PetscFunctionReturn(0); 127 } 128 129 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeHermitian(Mat A) 130 { 131 PetscErrorCode ierr; 132 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 133 134 PetscFunctionBegin; 135 if (!aijkok->Ah) { /* Generate Ah for the first time */ 136 ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 137 ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 138 ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 139 } else if (aijkok->need_sync_Ah_values) { /* Only update Ah values */ 140 ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 141 ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 142 ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 143 } 144 aijkok->need_sync_Ah_values = PETSC_FALSE; 145 PetscFunctionReturn(0); 146 } 147 148 /* y = A x */ 149 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 150 { 151 PetscErrorCode ierr; 152 Mat_SeqAIJKokkos *aijkok; 153 ConstPetscScalarKokkosView xv; 154 PetscScalarKokkosView yv; 155 156 PetscFunctionBegin; 157 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 158 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 159 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 160 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 161 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 162 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 163 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 164 ierr = WaitForKokkos();CHKERRQ(ierr); 165 /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 166 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 167 PetscFunctionReturn(0); 168 } 169 170 /* y = A^T x */ 171 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 172 { 173 PetscErrorCode ierr; 174 Mat_SeqAIJKokkos *aijkok; 175 Mat B; 176 const char *mode; 177 ConstPetscScalarKokkosView xv; 178 PetscScalarKokkosView yv; 179 180 PetscFunctionBegin; 181 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 182 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 183 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 184 if (A->form_explicit_transpose) { 185 ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 186 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 187 mode = "N"; 188 } else { 189 B = A; 190 mode = "T"; 191 } 192 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 193 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 194 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 195 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 196 ierr = WaitForKokkos();CHKERRQ(ierr); 197 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 198 PetscFunctionReturn(0); 199 } 200 201 /* y = A^H x */ 202 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 203 { 204 PetscErrorCode ierr; 205 Mat_SeqAIJKokkos *aijkok; 206 Mat B; 207 const char *mode; 208 ConstPetscScalarKokkosView xv; 209 PetscScalarKokkosView yv; 210 211 PetscFunctionBegin; 212 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 213 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 214 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 215 if (A->form_explicit_transpose) { 216 ierr = MatSeqAIJKokkosGenerateTransposeHermitian(A);CHKERRQ(ierr); 217 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 218 mode = "N"; 219 } else { 220 B = A; 221 mode = "C"; 222 } 223 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 224 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 225 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 226 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 227 ierr = WaitForKokkos();CHKERRQ(ierr); 228 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 /* z = A x + y */ 233 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 234 { 235 PetscErrorCode ierr; 236 Mat_SeqAIJKokkos *aijkok; 237 ConstPetscScalarKokkosView xv,yv; 238 PetscScalarKokkosView zv; 239 240 PetscFunctionBegin; 241 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 242 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 243 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 244 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 245 if (zz != yy) Kokkos::deep_copy(zv,yv); 246 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 247 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 248 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 249 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 250 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 251 ierr = WaitForKokkos();CHKERRQ(ierr); 252 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 253 PetscFunctionReturn(0); 254 } 255 256 /* z = A^T x + y */ 257 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 258 { 259 PetscErrorCode ierr; 260 Mat_SeqAIJKokkos *aijkok; 261 Mat B; 262 const char *mode; 263 ConstPetscScalarKokkosView xv,yv; 264 PetscScalarKokkosView zv; 265 266 PetscFunctionBegin; 267 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 268 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 269 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 270 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 271 if (zz != yy) Kokkos::deep_copy(zv,yv); 272 if (A->form_explicit_transpose) { 273 ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 274 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 275 mode = "N"; 276 } else { 277 B = A; 278 mode = "T"; 279 } 280 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 281 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 282 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 283 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 284 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 285 ierr = WaitForKokkos();CHKERRQ(ierr); 286 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 287 PetscFunctionReturn(0); 288 } 289 290 /* z = A^H x + y */ 291 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 292 { 293 PetscErrorCode ierr; 294 Mat_SeqAIJKokkos *aijkok; 295 Mat B; 296 const char *mode; 297 ConstPetscScalarKokkosView xv,yv; 298 PetscScalarKokkosView zv; 299 300 PetscFunctionBegin; 301 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 302 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 303 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 304 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 305 if (zz != yy) Kokkos::deep_copy(zv,yv); 306 if (A->form_explicit_transpose) { 307 ierr = MatSeqAIJKokkosGenerateTransposeHermitian(A);CHKERRQ(ierr); 308 B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 309 mode = "N"; 310 } else { 311 B = A; 312 mode = "C"; 313 } 314 aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 315 KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 316 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 317 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 318 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 319 ierr = WaitForKokkos();CHKERRQ(ierr); 320 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 321 PetscFunctionReturn(0); 322 } 323 324 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 325 { 326 PetscErrorCode ierr; 327 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 328 329 PetscFunctionBegin; 330 switch (op) { 331 case MAT_FORM_EXPLICIT_TRANSPOSE: 332 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 333 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 334 A->form_explicit_transpose = flg; 335 break; 336 default: 337 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 338 break; 339 } 340 PetscFunctionReturn(0); 341 } 342 343 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 344 { 345 PetscErrorCode ierr; 346 Mat B; 347 Mat_SeqAIJ *aij; 348 349 PetscFunctionBegin; 350 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 351 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 352 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 353 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 354 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 355 } 356 357 B = *newmat; 358 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 359 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 360 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 361 ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 362 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 363 /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 364 aij = (Mat_SeqAIJ*)B->data; 365 aij->inode.use = PETSC_FALSE; 366 367 B->offloadmask = PETSC_OFFLOAD_CPU; 368 PetscFunctionReturn(0); 369 } 370 371 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 372 { 373 PetscErrorCode ierr; 374 375 PetscFunctionBegin; 376 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 377 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 378 PetscFunctionReturn(0); 379 } 380 381 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 382 { 383 PetscErrorCode ierr; 384 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 385 386 PetscFunctionBegin; 387 if (aijkok) { 388 if (aijkok->device_mat_d.data()) { 389 delete aijkok->colmap_d; 390 delete aijkok->i_uncompressed_d; 391 } 392 if (aijkok->diag_d) delete aijkok->diag_d; 393 } 394 delete aijkok; 395 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 396 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 397 A->spptr = NULL; 398 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 399 PetscFunctionReturn(0); 400 } 401 402 #if 0 403 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 404 { 405 MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 406 407 PetscFunctionBegin; 408 delete kh; 409 PetscFunctionReturn(0); 410 } 411 412 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 413 { 414 Mat_Product *product = C->product; 415 Mat A,B; 416 MatProductType ptype; 417 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 418 bool tA,tB; 419 PetscErrorCode ierr; 420 MatMatKernelHandle_t *kh; 421 Mat_SeqAIJ *c; 422 423 PetscFunctionBegin; 424 MatCheckProduct(C,1); 425 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 426 A = product->A; 427 B = product->B; 428 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 429 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 430 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 431 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 432 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 433 kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 434 ptype = product->type; 435 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 436 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 437 switch (ptype) { 438 case MATPRODUCT_AB: 439 tA = false; 440 tB = false; 441 break; 442 case MATPRODUCT_AtB: 443 tA = true; 444 tB = false; 445 break; 446 case MATPRODUCT_ABt: 447 tA = false; 448 tB = true; 449 break; 450 default: 451 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 452 } 453 454 KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 455 C->offloadmask = PETSC_OFFLOAD_GPU; 456 /* shorter version of MatAssemblyEnd_SeqAIJ */ 457 c = (Mat_SeqAIJ*)C->data; 458 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 459 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 460 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 461 c->reallocs = 0; 462 C->info.mallocs += 0; 463 C->info.nz_unneeded = 0; 464 C->assembled = C->was_assembled = PETSC_TRUE; 465 C->num_ass++; 466 /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 467 // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 468 C->offloadmask = PETSC_OFFLOAD_BOTH; 469 // Also, we should add support to copy back from device to host 470 PetscFunctionReturn(0); 471 } 472 473 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 474 { 475 Mat_Product *product = C->product; 476 Mat A,B; 477 MatProductType ptype; 478 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 479 PetscInt m,n,k; 480 bool tA,tB; 481 PetscErrorCode ierr; 482 Mat_SeqAIJ *c; 483 MatMatKernelHandle_t *kh; 484 485 PetscFunctionBegin; 486 MatCheckProduct(C,1); 487 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 488 A = product->A; 489 B = product->B; 490 // TODO only copy the i,j data, not the values 491 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 492 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 493 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 494 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 495 ptype = product->type; 496 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 497 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 498 switch (ptype) { 499 case MATPRODUCT_AB: 500 tA = false; 501 tB = false; 502 m = A->rmap->n; 503 n = B->cmap->n; 504 break; 505 case MATPRODUCT_AtB: 506 tA = true; 507 tB = false; 508 m = A->cmap->n; 509 n = B->cmap->n; 510 break; 511 case MATPRODUCT_ABt: 512 tA = false; 513 tB = true; 514 m = A->rmap->n; 515 n = B->rmap->n; 516 break; 517 default: 518 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 519 } 520 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 521 ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 522 c = (Mat_SeqAIJ*)C->data; 523 524 kh = new MatMatKernelHandle_t; 525 // TODO SZ: ADD RUNTIME SELECTION OF THESE 526 kh->set_team_work_size(16); 527 kh->set_dynamic_scheduling(true); 528 // Select an spgemm algorithm, limited by configuration at compile-time and 529 // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 530 // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 531 std::string myalg("SPGEMM_KK_MEMORY"); 532 kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 533 534 ///////////////////////////////////// 535 // TODO JZ 536 ckok = NULL; //new Mat_SeqAIJKokkos(); 537 C->spptr = ckok; 538 KokkosCsrMatrix_t ccsr; // here only to have the code compile 539 KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 540 //cerr = WaitForKOKKOS();CHKERRCUDA(cerr); 541 //c->nz = get_nnz_from_ccsr 542 ////////////////////////////////////// 543 c->singlemalloc = PETSC_FALSE; 544 c->free_a = PETSC_TRUE; 545 c->free_ij = PETSC_TRUE; 546 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 547 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 548 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 549 //////////////////////////////////// 550 // TODO JZ copy from device to c->i and c->j 551 //////////////////////////////////// 552 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 553 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 554 c->maxnz = c->nz; 555 c->nonzerorowcnt = 0; 556 c->rmax = 0; 557 for (k = 0; k < m; k++) { 558 const PetscInt nn = c->i[k+1] - c->i[k]; 559 c->ilen[k] = c->imax[k] = nn; 560 c->nonzerorowcnt += (PetscInt)!!nn; 561 c->rmax = PetscMax(c->rmax,nn); 562 } 563 564 C->nonzerostate++; 565 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 566 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 567 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 568 ckok->nonzerostate = C->nonzerostate; 569 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 570 C->preallocated = PETSC_TRUE; 571 C->assembled = PETSC_FALSE; 572 C->was_assembled = PETSC_FALSE; 573 574 C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 575 C->product->data = kh; 576 C->product->destroy = MatMatKernelHandleDestroy_Private; 577 PetscFunctionReturn(0); 578 } 579 580 /* handles sparse matrix matrix ops */ 581 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 582 { 583 Mat_Product *product = mat->product; 584 PetscErrorCode ierr; 585 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 586 587 PetscFunctionBegin; 588 MatCheckProduct(mat,1); 589 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 590 if (product->type == MATPRODUCT_ABC) { 591 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 592 } 593 if (Biskok && Ciskok) { 594 switch (product->type) { 595 case MATPRODUCT_AB: 596 case MATPRODUCT_AtB: 597 case MATPRODUCT_ABt: 598 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 599 break; 600 case MATPRODUCT_PtAP: 601 case MATPRODUCT_RARt: 602 case MATPRODUCT_ABC: 603 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 604 break; 605 default: 606 break; 607 } 608 } else { /* fallback for AIJ */ 609 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 610 } 611 PetscFunctionReturn(0); 612 } 613 #endif 614 615 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 616 { 617 PetscErrorCode ierr; 618 619 PetscFunctionBegin; 620 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 621 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 622 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 623 PetscFunctionReturn(0); 624 } 625 626 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 627 { 628 PetscErrorCode ierr; 629 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 630 PetscFunctionBegin; 631 if (aijkok && aijkok->device_mat_d.data()) { 632 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 633 } 634 ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 635 PetscFunctionReturn(0); 636 } 637 638 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 639 { 640 PetscErrorCode ierr; 641 Mat_SeqAIJKokkos *aijkok; 642 643 PetscFunctionBegin; 644 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 645 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 646 KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 647 A->offloadmask = PETSC_OFFLOAD_GPU; 648 ierr = WaitForKokkos();CHKERRQ(ierr); 649 ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 650 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 651 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 652 PetscFunctionReturn(0); 653 } 654 655 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 656 { 657 PetscErrorCode ierr; 658 PetscBool both = PETSC_FALSE; 659 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 660 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 661 662 PetscFunctionBegin; 663 if (aijkok && aijkok->a_d.data()) { 664 KokkosBlas::fill(aijkok->a_d,0.0); 665 both = PETSC_TRUE; 666 } 667 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 668 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 669 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 670 else A->offloadmask = PETSC_OFFLOAD_CPU; 671 PetscFunctionReturn(0); 672 } 673 674 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 675 { 676 PetscErrorCode ierr; 677 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 678 679 PetscFunctionBegin; 680 if (X->ops->axpy != Y->ops->axpy) { 681 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 682 PetscFunctionReturn(0); 683 } 684 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 685 PetscBool e; 686 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 687 if (e) { 688 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 689 if (e) str = SAME_NONZERO_PATTERN; 690 } 691 } 692 if (str != SAME_NONZERO_PATTERN) { 693 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 694 PetscFunctionReturn(0); 695 } else { 696 if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) { 697 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 698 PetscFunctionReturn(0); 699 } else { 700 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 701 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 702 } 703 Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 704 Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 705 if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) { 706 KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d); 707 Y->offloadmask = PETSC_OFFLOAD_GPU; 708 ierr = WaitForKokkos();CHKERRQ(ierr); 709 ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr); 710 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 711 ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr); 712 } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???"); 713 } 714 PetscFunctionReturn(0); 715 } 716 717 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info) 718 { 719 PetscErrorCode ierr; 720 721 PetscFunctionBegin; 722 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 723 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 724 B->offloadmask = PETSC_OFFLOAD_CPU; 725 PetscFunctionReturn(0); 726 } 727 728 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 729 { 730 PetscFunctionBegin; 731 A->boundtocpu = PETSC_FALSE; 732 733 A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 734 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 735 A->ops->destroy = MatDestroy_SeqAIJKokkos; 736 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 737 A->ops->axpy = MatAXPY_SeqAIJKokkos; 738 A->ops->scale = MatScale_SeqAIJKokkos; 739 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 740 //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 741 A->ops->mult = MatMult_SeqAIJKokkos; 742 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 743 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 744 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 745 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 746 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 747 A->ops->setoption = MatSetOption_SeqAIJKokkos; 748 PetscFunctionReturn(0); 749 } 750 751 /* --------------------------------------------------------------------------------*/ 752 /*@C 753 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 754 (the default parallel PETSc format). This matrix will ultimately be handled by 755 Kokkos for calculations. For good matrix 756 assembly performance the user should preallocate the matrix storage by setting 757 the parameter nz (or the array nnz). By setting these parameters accurately, 758 performance during matrix assembly can be increased by more than a factor of 50. 759 760 Collective 761 762 Input Parameters: 763 + comm - MPI communicator, set to PETSC_COMM_SELF 764 . m - number of rows 765 . n - number of columns 766 . nz - number of nonzeros per row (same for all rows) 767 - nnz - array containing the number of nonzeros in the various rows 768 (possibly different for each row) or NULL 769 770 Output Parameter: 771 . A - the matrix 772 773 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 774 MatXXXXSetPreallocation() paradgm instead of this routine directly. 775 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 776 777 Notes: 778 If nnz is given then nz is ignored 779 780 The AIJ format (also called the Yale sparse matrix format or 781 compressed row storage), is fully compatible with standard Fortran 77 782 storage. That is, the stored row and column indices can begin at 783 either one (as in Fortran) or zero. See the users' manual for details. 784 785 Specify the preallocated storage with either nz or nnz (not both). 786 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 787 allocation. For large problems you MUST preallocate memory or you 788 will get TERRIBLE performance, see the users' manual chapter on matrices. 789 790 By default, this format uses inodes (identical nodes) when possible, to 791 improve numerical efficiency of matrix-vector products and solves. We 792 search for consecutive rows with the same nonzero structure, thereby 793 reusing matrix information to achieve increased efficiency. 794 795 Level: intermediate 796 797 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 798 @*/ 799 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 800 { 801 PetscErrorCode ierr; 802 803 PetscFunctionBegin; 804 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 805 ierr = MatCreate(comm,A);CHKERRQ(ierr); 806 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 807 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 808 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 809 PetscFunctionReturn(0); 810 } 811 812 // factorizations 813 typedef Kokkos::TeamPolicy<>::member_type team_member; 814 // 815 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 816 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 817 // 818 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 819 { 820 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 821 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 822 IS isrow = b->row,isicol = b->icol; 823 PetscErrorCode ierr; 824 const PetscInt *r_h,*ic_h; 825 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data(); 826 const PetscScalar *aa_d = aijkok->a_d.data(); 827 PetscScalar *ba_d = baijkok->a_d.data(); 828 PetscBool row_identity,col_identity; 829 PetscInt nc, Nf, nVec=32; // should be a parameter 830 PetscContainer container; 831 832 PetscFunctionBegin; 833 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 834 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 835 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 836 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 837 if (container) { 838 PetscInt *pNf=NULL, nv; 839 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 840 Nf = (*pNf)%1000; 841 nv = (*pNf)/1000; 842 if (nv>0) nVec = nv; 843 } else Nf = 1; 844 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 845 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 846 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 847 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 848 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 849 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 850 #endif 851 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 852 { 853 #define KOKKOS_SHARED_LEVEL 1 854 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 855 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 856 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 857 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 858 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 859 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 860 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 861 size_t flops_h = 0.0; 862 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 863 Kokkos::View<size_t> d_flops_k ("flops"); 864 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 865 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 866 Kokkos::deep_copy (d_flops_k, h_flops_k); 867 Kokkos::deep_copy (d_r_k, h_r_k); 868 Kokkos::deep_copy (d_ic_k, h_ic_k); 869 // Fill A --> fact 870 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 871 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda 872 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 873 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 874 // zero rows of B 875 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 876 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 877 PetscScalar *baL = ba_d + bi_d[rowb]; 878 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 879 /* zero (unfactored row) */ 880 for (int j=0;j<nzbL;j++) baL[j] = 0; 881 for (int j=0;j<nzbU;j++) baU[j] = 0; 882 }); 883 // copy A into B 884 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 885 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 886 const PetscScalar *av = aa_d + ai_d[rowa]; 887 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 888 /* load in initial (unfactored row) */ 889 for (int j=0;j<nza;j++) { 890 PetscInt colb = ic[ajtmp[j]]; 891 PetscScalar vala = av[j]; 892 if (colb == rowb) { 893 *(ba_d + bdiag_d[rowb]) = vala; 894 } else { 895 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 896 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 897 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 898 for (int j=0; j<nz ; j++) { 899 if (pbj[j] == colb) { 900 pba[j] = vala; 901 set++; 902 break; 903 } 904 } 905 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 906 } 907 } 908 }); 909 }); 910 Kokkos::fence(); 911 912 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 913 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 914 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 915 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 916 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda 917 const PetscInt start = field*nloc, end = start + nloc; 918 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 919 // A22 panel update for each row A(1,:) and col A(:,1) 920 for (int ii=start; ii<end-1; ii++) { 921 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 922 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 923 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 924 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 925 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 926 PetscInt kIdx = j*Ni + field_block_idx; 927 if (kIdx >= nzUi) /* void */ ; 928 else { 929 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 930 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 931 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 932 size_t st_idx; 933 // find and do L(k,i) = A(:k,i) / A(i,i) 934 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 935 // get column, there has got to be a better way 936 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 937 //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 938 if (pjL[j] == ii) { 939 PetscScalar *pLki = ba_d + bi_d[myk] + j; 940 idx = j; // output 941 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 942 } 943 }, st_idx); 944 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 945 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 946 else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 947 // U(i+1,:end) 948 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 949 PetscScalar Uij = baUi[uiIdx]; 950 PetscInt col = bjUi[uiIdx]; 951 if (col==myk) { 952 // A_kk = A_kk - L_ki * U_ij(k) 953 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 954 *Akkv = *Akkv - L_ki() * Uij; // UiK 955 } else { 956 PetscScalar *start, *end, *pAkjv=NULL; 957 PetscInt high, low; 958 const PetscInt *startj; 959 if (col<myk) { // L 960 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 961 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 962 start = pLki+1; // start at pLki+1, A22(myk,1) 963 startj= bj_d + bi_d[myk] + idx; 964 end = ba_d + bi_d[myk+1]; 965 } else { 966 PetscInt idx = bdiag_d[myk+1]+1; 967 start = ba_d + idx; 968 startj= bj_d + idx; 969 end = ba_d + bdiag_d[myk]; 970 } 971 // search for 'col', use bisection search - TODO 972 low = 0; 973 high = (PetscInt)(end-start); 974 while (high-low > 5) { 975 int t = (low+high)/2; 976 if (startj[t] > col) high = t; 977 else low = t; 978 } 979 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 980 if (startj[pAkjv-start] == col) break; 981 } 982 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 983 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 984 } 985 }); 986 } 987 } 988 }); 989 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 990 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 991 } /* endof for (i=0; i<n; i++) { */ 992 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 993 }); 994 Kokkos::fence(); 995 Kokkos::deep_copy (h_flops_k, d_flops_k); 996 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 997 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 998 #elif defined(PETSC_USE_LOG) 999 ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1000 #endif 1001 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1002 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1003 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1004 /* Invert diagonal for simpler triangular solves */ 1005 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1006 int i = start + outer_index*Ni + lg_rank%Ni; 1007 if (i < end) { 1008 PetscScalar *pv = ba_d + bdiag_d[i]; 1009 *pv = 1.0/(*pv); 1010 } 1011 }); 1012 }); 1013 } 1014 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1015 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1016 #endif 1017 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1018 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1019 1020 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1021 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1022 if (b->inode.size) { 1023 B->ops->solve = MatSolve_SeqAIJ_Inode; 1024 } else if (row_identity && col_identity) { 1025 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1026 } else { 1027 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1028 } 1029 B->offloadmask = PETSC_OFFLOAD_GPU; 1030 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1031 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1032 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1033 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1034 B->ops->matsolve = MatMatSolve_SeqAIJ; 1035 B->assembled = PETSC_TRUE; 1036 B->preallocated = PETSC_TRUE; 1037 1038 PetscFunctionReturn(0); 1039 } 1040 1041 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1042 { 1043 PetscErrorCode ierr; 1044 1045 PetscFunctionBegin; 1046 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1047 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS; 1048 PetscFunctionReturn(0); 1049 } 1050 1051 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1052 { 1053 PetscErrorCode ierr; 1054 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1055 const PetscInt nrows = A->rmap->n; 1056 1057 PetscFunctionBegin; 1058 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1059 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1060 // move B data into Kokkos 1061 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1062 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1063 { 1064 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1065 if (!baijkok->diag_d) { 1066 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1067 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1068 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1069 } 1070 } 1071 PetscFunctionReturn(0); 1072 } 1073 1074 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*); 1075 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat,MatFactorType,Mat*); 1076 1077 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1078 { 1079 PetscErrorCode ierr; 1080 1081 PetscFunctionBegin; 1082 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr); 1083 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1084 PetscFunctionReturn(0); 1085 } 1086 1087 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type) 1088 { 1089 PetscFunctionBegin; 1090 *type = MATSOLVERKOKKOS; 1091 PetscFunctionReturn(0); 1092 } 1093 1094 // use -pc_factor_mat_solver_type kokkosdevice 1095 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1096 { 1097 PetscFunctionBegin; 1098 *type = MATSOLVERKOKKOSDEVICE; 1099 PetscFunctionReturn(0); 1100 } 1101 1102 /*MC 1103 MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1104 on a single GPU of type, seqaijkokkos, aijkokkos. 1105 1106 Level: beginner 1107 1108 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation 1109 M*/ 1110 1111 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B) 1112 { 1113 PetscErrorCode ierr; 1114 PetscInt n = A->rmap->n; 1115 1116 PetscFunctionBegin; 1117 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1118 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1119 (*B)->factortype = ftype; 1120 (*B)->useordering = PETSC_TRUE; 1121 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1122 1123 if (ftype == MAT_FACTOR_LU) { 1124 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1125 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOS; 1126 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1127 1128 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1129 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr); 1130 PetscFunctionReturn(0); 1131 } 1132 1133 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1134 { 1135 PetscErrorCode ierr; 1136 PetscInt n = A->rmap->n; 1137 1138 PetscFunctionBegin; 1139 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1140 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1141 (*B)->factortype = ftype; 1142 (*B)->useordering = PETSC_TRUE; 1143 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1144 1145 if (ftype == MAT_FACTOR_LU) { 1146 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1147 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1148 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1149 1150 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1151 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1152 PetscFunctionReturn(0); 1153 } 1154