1 #include "petsc/private/petscimpl.h" 2 #include <petscsystypes.h> 3 #include <petscerror.h> 4 #include <petscveckokkos.hpp> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosBlas.hpp> 8 #include <KokkosSparse_CrsMatrix.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <KokkosSparse_spmv.hpp> 11 #include <../src/mat/impls/aij/seq/aij.h> 12 13 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 14 #include <petscmat.h> 15 16 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 17 18 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 19 { 20 PetscErrorCode ierr; 21 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 22 23 PetscFunctionBegin; 24 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 25 if (aijkok && aijkok->device_mat_d.data()) { 26 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 27 } 28 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 29 PetscFunctionReturn(0); 30 } 31 32 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 33 { 34 Mat_SeqAIJ *aijseq = (Mat_SeqAIJ*)A->data; 35 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 36 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 37 38 PetscFunctionBegin; 39 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 40 /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */ 41 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 42 delete aijkok; 43 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 44 aijkok->nonzerostate = A->nonzerostate; 45 A->spptr = aijkok; 46 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 47 Kokkos::deep_copy(aijkok->a_d,aijkok->a_h); 48 } 49 A->offloadmask = PETSC_OFFLOAD_BOTH; 50 PetscFunctionReturn(0); 51 } 52 53 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 54 { 55 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 56 57 PetscFunctionBegin; 58 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 59 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 60 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 61 Kokkos::deep_copy(aijkok->a_h,aijkok->a_d); 62 A->offloadmask = PETSC_OFFLOAD_BOTH; 63 } 64 PetscFunctionReturn(0); 65 } 66 67 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 68 { 69 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 70 PetscErrorCode ierr; 71 72 PetscFunctionBegin; 73 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 74 *array = a->a; 75 A->offloadmask = PETSC_OFFLOAD_CPU; 76 PetscFunctionReturn(0); 77 } 78 79 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 80 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat) 81 { 82 Mat_SeqAIJKokkos *aijkok; 83 Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat); 84 85 PetscFunctionBegin; 86 // ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 87 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 88 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 89 aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k); 90 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 91 PetscFunctionReturn(0); 92 } 93 94 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 95 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat) 96 { 97 Mat_SeqAIJKokkos *aijkok; 98 99 PetscFunctionBegin; 100 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 101 if (aijkok && aijkok->device_mat_d.data()) { 102 *d_mat = aijkok->device_mat_d.data(); 103 } else { 104 PetscErrorCode ierr; 105 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 106 *d_mat = NULL; 107 } 108 PetscFunctionReturn(0); 109 } 110 111 /* y = A x */ 112 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 113 { 114 PetscErrorCode ierr; 115 Mat_SeqAIJKokkos *aijkok; 116 ConstPetscScalarViewDevice_t xv; 117 PetscScalarViewDevice_t yv; 118 119 PetscFunctionBegin; 120 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 121 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 122 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 123 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 124 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 125 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 126 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 127 /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 128 ierr = WaitForKokkos();CHKERRQ(ierr); 129 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 130 PetscFunctionReturn(0); 131 } 132 133 /* y = A^T x */ 134 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 135 { 136 PetscErrorCode ierr; 137 Mat_SeqAIJKokkos *aijkok; 138 ConstPetscScalarViewDevice_t xv; 139 PetscScalarViewDevice_t yv; 140 141 PetscFunctionBegin; 142 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 143 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 144 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 145 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 146 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 147 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 148 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 149 ierr = WaitForKokkos();CHKERRQ(ierr); 150 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 151 PetscFunctionReturn(0); 152 } 153 154 /* y = A^H x */ 155 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 156 { 157 PetscErrorCode ierr; 158 Mat_SeqAIJKokkos *aijkok; 159 ConstPetscScalarViewDevice_t xv; 160 PetscScalarViewDevice_t yv; 161 162 PetscFunctionBegin; 163 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 164 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 165 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 166 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 167 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 168 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 169 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 170 ierr = WaitForKokkos();CHKERRQ(ierr); 171 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 172 PetscFunctionReturn(0); 173 } 174 175 /* z = A x + y */ 176 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 177 { 178 PetscErrorCode ierr; 179 Mat_SeqAIJKokkos *aijkok; 180 ConstPetscScalarViewDevice_t xv,yv; 181 PetscScalarViewDevice_t zv; 182 183 PetscFunctionBegin; 184 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 185 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 186 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 187 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 188 if (zz != yy) Kokkos::deep_copy(zv,yv); 189 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 190 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 191 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 192 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 193 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 194 ierr = WaitForKokkos();CHKERRQ(ierr); 195 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 196 PetscFunctionReturn(0); 197 } 198 199 /* z = A^T x + y */ 200 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 201 { 202 PetscErrorCode ierr; 203 Mat_SeqAIJKokkos *aijkok; 204 ConstPetscScalarViewDevice_t xv,yv; 205 PetscScalarViewDevice_t zv; 206 207 PetscFunctionBegin; 208 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 209 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 210 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 211 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 212 if (zz != yy) Kokkos::deep_copy(zv,yv); 213 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 214 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 215 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 216 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 217 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 218 ierr = WaitForKokkos();CHKERRQ(ierr); 219 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 220 PetscFunctionReturn(0); 221 } 222 223 /* z = A^H x + y */ 224 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 225 { 226 PetscErrorCode ierr; 227 Mat_SeqAIJKokkos *aijkok; 228 ConstPetscScalarViewDevice_t xv,yv; 229 PetscScalarViewDevice_t zv; 230 231 PetscFunctionBegin; 232 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 233 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 234 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 235 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 236 if (zz != yy) Kokkos::deep_copy(zv,yv); 237 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 238 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 239 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 240 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 241 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 242 ierr = WaitForKokkos();CHKERRQ(ierr); 243 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 244 PetscFunctionReturn(0); 245 } 246 247 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 248 { 249 PetscErrorCode ierr; 250 Mat B; 251 Mat_SeqAIJ *aij; 252 253 PetscFunctionBegin; 254 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 255 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 256 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 257 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 258 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 259 } 260 261 B = *newmat; 262 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 263 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 264 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 265 ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 266 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 267 /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 268 aij = (Mat_SeqAIJ*)B->data; 269 aij->inode.use = PETSC_FALSE; 270 271 B->offloadmask = PETSC_OFFLOAD_CPU; 272 PetscFunctionReturn(0); 273 } 274 275 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 276 { 277 PetscErrorCode ierr; 278 279 PetscFunctionBegin; 280 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 281 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 282 PetscFunctionReturn(0); 283 } 284 285 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 286 { 287 PetscErrorCode ierr; 288 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 289 290 PetscFunctionBegin; 291 if (aijkok && aijkok->device_mat_d.data()) { 292 delete aijkok->colmap_d; 293 delete aijkok->i_uncompressed_d; 294 } 295 delete aijkok; 296 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 297 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 298 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 299 PetscFunctionReturn(0); 300 } 301 302 #if 0 303 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 304 { 305 MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 306 307 PetscFunctionBegin; 308 delete kh; 309 PetscFunctionReturn(0); 310 } 311 312 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 313 { 314 Mat_Product *product = C->product; 315 Mat A,B; 316 MatProductType ptype; 317 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 318 bool tA,tB; 319 PetscErrorCode ierr; 320 MatMatKernelHandle_t *kh; 321 Mat_SeqAIJ *c; 322 323 PetscFunctionBegin; 324 MatCheckProduct(C,1); 325 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 326 A = product->A; 327 B = product->B; 328 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 329 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 330 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 331 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 332 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 333 kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 334 ptype = product->type; 335 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 336 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 337 switch (ptype) { 338 case MATPRODUCT_AB: 339 tA = false; 340 tB = false; 341 break; 342 case MATPRODUCT_AtB: 343 tA = true; 344 tB = false; 345 break; 346 case MATPRODUCT_ABt: 347 tA = false; 348 tB = true; 349 break; 350 default: 351 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 352 } 353 354 KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 355 C->offloadmask = PETSC_OFFLOAD_GPU; 356 /* shorter version of MatAssemblyEnd_SeqAIJ */ 357 c = (Mat_SeqAIJ*)C->data; 358 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 359 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 360 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 361 c->reallocs = 0; 362 C->info.mallocs += 0; 363 C->info.nz_unneeded = 0; 364 C->assembled = C->was_assembled = PETSC_TRUE; 365 C->num_ass++; 366 /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 367 // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 368 C->offloadmask = PETSC_OFFLOAD_BOTH; 369 // Also, we should add support to copy back from device to host 370 PetscFunctionReturn(0); 371 } 372 373 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 374 { 375 Mat_Product *product = C->product; 376 Mat A,B; 377 MatProductType ptype; 378 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 379 PetscInt m,n,k; 380 bool tA,tB; 381 PetscErrorCode ierr; 382 Mat_SeqAIJ *c; 383 MatMatKernelHandle_t *kh; 384 385 PetscFunctionBegin; 386 MatCheckProduct(C,1); 387 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 388 A = product->A; 389 B = product->B; 390 // TODO only copy the i,j data, not the values 391 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 392 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 393 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 394 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 395 ptype = product->type; 396 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 397 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 398 switch (ptype) { 399 case MATPRODUCT_AB: 400 tA = false; 401 tB = false; 402 m = A->rmap->n; 403 n = B->cmap->n; 404 break; 405 case MATPRODUCT_AtB: 406 tA = true; 407 tB = false; 408 m = A->cmap->n; 409 n = B->cmap->n; 410 break; 411 case MATPRODUCT_ABt: 412 tA = false; 413 tB = true; 414 m = A->rmap->n; 415 n = B->rmap->n; 416 break; 417 default: 418 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 419 } 420 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 421 ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 422 c = (Mat_SeqAIJ*)C->data; 423 424 kh = new MatMatKernelHandle_t; 425 // TODO SZ: ADD RUNTIME SELECTION OF THESE 426 kh->set_team_work_size(16); 427 kh->set_dynamic_scheduling(true); 428 // Select an spgemm algorithm, limited by configuration at compile-time and 429 // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 430 // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 431 std::string myalg("SPGEMM_KK_MEMORY"); 432 kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 433 434 ///////////////////////////////////// 435 // TODO JZ 436 ckok = NULL; //new Mat_SeqAIJKokkos(); 437 C->spptr = ckok; 438 KokkosCsrMatrix_t ccsr; // here only to have the code compile 439 KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 440 //cerr = WaitForKOKKOS();CHKERRCUDA(cerr); 441 //c->nz = get_nnz_from_ccsr 442 ////////////////////////////////////// 443 c->singlemalloc = PETSC_FALSE; 444 c->free_a = PETSC_TRUE; 445 c->free_ij = PETSC_TRUE; 446 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 447 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 448 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 449 //////////////////////////////////// 450 // TODO JZ copy from device to c->i and c->j 451 //////////////////////////////////// 452 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 453 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 454 c->maxnz = c->nz; 455 c->nonzerorowcnt = 0; 456 c->rmax = 0; 457 for (k = 0; k < m; k++) { 458 const PetscInt nn = c->i[k+1] - c->i[k]; 459 c->ilen[k] = c->imax[k] = nn; 460 c->nonzerorowcnt += (PetscInt)!!nn; 461 c->rmax = PetscMax(c->rmax,nn); 462 } 463 464 C->nonzerostate++; 465 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 466 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 467 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 468 ckok->nonzerostate = C->nonzerostate; 469 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 470 C->preallocated = PETSC_TRUE; 471 C->assembled = PETSC_FALSE; 472 C->was_assembled = PETSC_FALSE; 473 474 C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 475 C->product->data = kh; 476 C->product->destroy = MatMatKernelHandleDestroy_Private; 477 PetscFunctionReturn(0); 478 } 479 480 /* handles sparse matrix matrix ops */ 481 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 482 { 483 Mat_Product *product = mat->product; 484 PetscErrorCode ierr; 485 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 486 487 PetscFunctionBegin; 488 MatCheckProduct(mat,1); 489 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 490 if (product->type == MATPRODUCT_ABC) { 491 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 492 } 493 if (Biskok && Ciskok) { 494 switch (product->type) { 495 case MATPRODUCT_AB: 496 case MATPRODUCT_AtB: 497 case MATPRODUCT_ABt: 498 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 499 break; 500 case MATPRODUCT_PtAP: 501 case MATPRODUCT_RARt: 502 case MATPRODUCT_ABC: 503 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 504 break; 505 default: 506 break; 507 } 508 } else { /* fallback for AIJ */ 509 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 510 } 511 PetscFunctionReturn(0); 512 } 513 #endif 514 515 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 516 { 517 PetscErrorCode ierr; 518 519 PetscFunctionBegin; 520 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 521 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 522 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 523 PetscFunctionReturn(0); 524 } 525 526 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 527 { 528 PetscErrorCode ierr; 529 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 530 PetscFunctionBegin; 531 if (aijkok && aijkok->device_mat_d.data()) { 532 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 533 } 534 ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 535 PetscFunctionReturn(0); 536 } 537 538 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 539 { 540 PetscErrorCode ierr; 541 Mat_SeqAIJKokkos *aijkok; 542 543 PetscFunctionBegin; 544 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 545 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 546 KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 547 A->offloadmask = PETSC_OFFLOAD_GPU; 548 ierr = WaitForKokkos();CHKERRQ(ierr); 549 ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 550 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 551 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 552 PetscFunctionReturn(0); 553 } 554 555 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 556 { 557 PetscErrorCode ierr; 558 PetscBool both = PETSC_FALSE; 559 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 560 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 561 562 PetscFunctionBegin; 563 if (aijkok && aijkok->a_d.data()) { 564 KokkosBlas::fill(aijkok->a_d,0.0); 565 both = PETSC_TRUE; 566 } 567 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 568 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 569 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 570 else A->offloadmask = PETSC_OFFLOAD_CPU; 571 PetscFunctionReturn(0); 572 } 573 574 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 575 { 576 PetscErrorCode ierr; 577 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 578 579 PetscFunctionBegin; 580 if (X->ops->axpy != Y->ops->axpy) { 581 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 585 PetscBool e; 586 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 587 if (e) { 588 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 589 if (e) str = SAME_NONZERO_PATTERN; 590 } 591 } 592 if (str != SAME_NONZERO_PATTERN) { 593 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 594 PetscFunctionReturn(0); 595 } else { 596 if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) { 597 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 598 PetscFunctionReturn(0); 599 } else { 600 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 601 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 602 } 603 Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 604 Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 605 if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) { 606 KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d); 607 Y->offloadmask = PETSC_OFFLOAD_GPU; 608 ierr = WaitForKokkos();CHKERRQ(ierr); 609 ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr); 610 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 611 ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr); 612 } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???"); 613 } 614 PetscFunctionReturn(0); 615 } 616 617 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 618 { 619 PetscFunctionBegin; 620 A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 621 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 622 A->ops->destroy = MatDestroy_SeqAIJKokkos; 623 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 624 A->ops->axpy = MatAXPY_SeqAIJKokkos; 625 A->ops->scale = MatScale_SeqAIJKokkos; 626 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 627 //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 628 A->ops->mult = MatMult_SeqAIJKokkos; 629 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 630 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 631 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 632 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 633 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 634 PetscFunctionReturn(0); 635 } 636 637 /* --------------------------------------------------------------------------------*/ 638 /*C 639 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 640 (the default parallel PETSc format). This matrix will ultimately be handled by 641 Kokkos for calculations. For good matrix 642 assembly performance the user should preallocate the matrix storage by setting 643 the parameter nz (or the array nnz). By setting these parameters accurately, 644 performance during matrix assembly can be increased by more than a factor of 50. 645 646 Collective 647 648 Input Parameters: 649 + comm - MPI communicator, set to PETSC_COMM_SELF 650 . m - number of rows 651 . n - number of columns 652 . nz - number of nonzeros per row (same for all rows) 653 - nnz - array containing the number of nonzeros in the various rows 654 (possibly different for each row) or NULL 655 656 Output Parameter: 657 . A - the matrix 658 659 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 660 MatXXXXSetPreallocation() paradgm instead of this routine directly. 661 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 662 663 Notes: 664 If nnz is given then nz is ignored 665 666 The AIJ format (also called the Yale sparse matrix format or 667 compressed row storage), is fully compatible with standard Fortran 77 668 storage. That is, the stored row and column indices can begin at 669 either one (as in Fortran) or zero. See the users' manual for details. 670 671 Specify the preallocated storage with either nz or nnz (not both). 672 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 673 allocation. For large problems you MUST preallocate memory or you 674 will get TERRIBLE performance, see the users' manual chapter on matrices. 675 676 By default, this format uses inodes (identical nodes) when possible, to 677 improve numerical efficiency of matrix-vector products and solves. We 678 search for consecutive rows with the same nonzero structure, thereby 679 reusing matrix information to achieve increased efficiency. 680 681 Level: intermediate 682 683 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 684 @*/ 685 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 686 { 687 PetscErrorCode ierr; 688 689 PetscFunctionBegin; 690 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 691 ierr = MatCreate(comm,A);CHKERRQ(ierr); 692 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 693 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 694 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 695 PetscFunctionReturn(0); 696 } 697 698 // factorizations 699 700 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info) 701 { 702 // Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 703 // IS isrow = b->row,iscol = b->col; 704 // PetscBool row_identity,col_identity; 705 PetscErrorCode ierr; 706 707 PetscFunctionBegin; 708 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 709 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 710 B->offloadmask = PETSC_OFFLOAD_CPU; 711 // solves stay on CPU now 712 /* determine which version of MatSolve needs to be used. */ 713 // ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 714 // ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 715 // if (row_identity && col_identity) { 716 // B->ops->solve = MatSolve_SeqAIJKOKKOS_NaturalOrdering; 717 // B->ops->solvetranspose = MatSolveTranspose_SeqAIJKOKKOS_NaturalOrdering; 718 // B->ops->matsolve = NULL; 719 // B->ops->matsolvetranspose = NULL; 720 // } else { 721 // B->ops->solve = MatSolve_SeqAIJKOKKOS; 722 // B->ops->solvetranspose = MatSolveTranspose_SeqAIJKOKKOS; 723 // B->ops->matsolve = NULL; 724 // B->ops->matsolvetranspose = NULL; 725 // } 726 727 /* get the triangular factors */ 728 // ierr = MatSeqAIJKOKKOSILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 729 PetscFunctionReturn(0); 730 } 731 732 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 733 { 734 PetscErrorCode ierr; 735 736 PetscFunctionBegin; 737 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 738 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS; 739 PetscFunctionReturn(0); 740 } 741 742 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*); 743 744 745 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 746 { 747 PetscErrorCode ierr; 748 749 PetscFunctionBegin; 750 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr); 751 PetscFunctionReturn(0); 752 } 753 754 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type) 755 { 756 PetscFunctionBegin; 757 *type = MATSOLVERKOKKOS; 758 PetscFunctionReturn(0); 759 } 760 761 /*MC 762 MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices 763 on a single GPU of type, seqaijkokkos, aijkokkos. 764 765 Level: beginner 766 767 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation 768 M*/ 769 770 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B) 771 { 772 PetscErrorCode ierr; 773 PetscInt n = A->rmap->n; 774 775 PetscFunctionBegin; 776 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 777 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 778 (*B)->factortype = ftype; 779 (*B)->useordering = PETSC_TRUE; 780 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 781 782 if (ftype == MAT_FACTOR_LU /* || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT*/) { 783 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 784 // (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKOKKOS; 785 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOS; 786 // } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 787 // (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKOKKOS; 788 // (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKOKKOS; 789 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 790 791 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 792 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr); 793 PetscFunctionReturn(0); 794 } 795