1 #include "petsc/private/petscimpl.h" 2 #include <petscsystypes.h> 3 #include <petscerror.h> 4 #include <petscveckokkos.hpp> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosSparse_CrsMatrix.hpp> 8 #include <KokkosSparse_spmv.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <../src/mat/impls/aij/seq/aij.h> 11 12 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 13 #include <petscmat.h> 14 15 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 16 17 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 18 { 19 PetscErrorCode ierr; 20 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 21 22 PetscFunctionBegin; 23 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 24 if (aijkok && aijkok->device_mat_d.data()) { 25 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 26 } 27 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 28 PetscFunctionReturn(0); 29 } 30 31 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 32 { 33 Mat_SeqAIJ *aijseq = (Mat_SeqAIJ*)A->data; 34 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 35 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 36 37 PetscFunctionBegin; 38 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 39 /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */ 40 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 41 delete aijkok; 42 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 43 aijkok->nonzerostate = A->nonzerostate; 44 A->spptr = aijkok; 45 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 46 Kokkos::deep_copy(aijkok->a_d,aijkok->a_h); 47 } 48 A->offloadmask = PETSC_OFFLOAD_BOTH; 49 PetscFunctionReturn(0); 50 } 51 52 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 53 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat) 54 { 55 Mat_SeqAIJKokkos *aijkok; 56 Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat); 57 58 PetscFunctionBegin; 59 // ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 60 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 61 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 62 aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k); 63 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 64 PetscFunctionReturn(0); 65 } 66 67 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 68 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat) 69 { 70 Mat_SeqAIJKokkos *aijkok; 71 72 PetscFunctionBegin; 73 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 74 if (aijkok && aijkok->device_mat_d.data()) { 75 *d_mat = aijkok->device_mat_d.data(); 76 } else { 77 PetscErrorCode ierr; 78 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 79 *d_mat = NULL; 80 } 81 PetscFunctionReturn(0); 82 } 83 84 /* y = A x */ 85 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 86 { 87 PetscErrorCode ierr; 88 Mat_SeqAIJKokkos *aijkok; 89 ConstPetscScalarViewDevice_t xv; 90 PetscScalarViewDevice_t yv; 91 92 PetscFunctionBegin; 93 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 94 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 95 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 96 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 97 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 98 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 99 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 100 /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 101 ierr = WaitForKokkos();CHKERRQ(ierr); 102 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 103 PetscFunctionReturn(0); 104 } 105 106 /* y = A^T x */ 107 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 108 { 109 PetscErrorCode ierr; 110 Mat_SeqAIJKokkos *aijkok; 111 ConstPetscScalarViewDevice_t xv; 112 PetscScalarViewDevice_t yv; 113 114 PetscFunctionBegin; 115 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 116 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 117 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 118 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 120 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 121 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 122 ierr = WaitForKokkos();CHKERRQ(ierr); 123 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 124 PetscFunctionReturn(0); 125 } 126 127 /* y = A^H x */ 128 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 129 { 130 PetscErrorCode ierr; 131 Mat_SeqAIJKokkos *aijkok; 132 ConstPetscScalarViewDevice_t xv; 133 PetscScalarViewDevice_t yv; 134 135 PetscFunctionBegin; 136 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 137 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 138 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 139 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 140 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 141 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 142 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 143 ierr = WaitForKokkos();CHKERRQ(ierr); 144 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 145 PetscFunctionReturn(0); 146 } 147 148 /* z = A x + y */ 149 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 150 { 151 PetscErrorCode ierr; 152 Mat_SeqAIJKokkos *aijkok; 153 ConstPetscScalarViewDevice_t xv,yv; 154 PetscScalarViewDevice_t zv; 155 156 PetscFunctionBegin; 157 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 158 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 159 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 160 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 161 if (zz != yy) Kokkos::deep_copy(zv,yv); 162 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 163 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 164 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 165 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 166 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 167 ierr = WaitForKokkos();CHKERRQ(ierr); 168 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 169 PetscFunctionReturn(0); 170 } 171 172 /* z = A^T x + y */ 173 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 174 { 175 PetscErrorCode ierr; 176 Mat_SeqAIJKokkos *aijkok; 177 ConstPetscScalarViewDevice_t xv,yv; 178 PetscScalarViewDevice_t zv; 179 180 PetscFunctionBegin; 181 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 182 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 183 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 184 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 185 if (zz != yy) Kokkos::deep_copy(zv,yv); 186 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 187 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 188 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 189 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 190 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 191 ierr = WaitForKokkos();CHKERRQ(ierr); 192 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 193 PetscFunctionReturn(0); 194 } 195 196 /* z = A^H x + y */ 197 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 198 { 199 PetscErrorCode ierr; 200 Mat_SeqAIJKokkos *aijkok; 201 ConstPetscScalarViewDevice_t xv,yv; 202 PetscScalarViewDevice_t zv; 203 204 PetscFunctionBegin; 205 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 206 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 207 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 208 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 209 if (zz != yy) Kokkos::deep_copy(zv,yv); 210 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 211 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 212 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 213 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 214 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 215 ierr = WaitForKokkos();CHKERRQ(ierr); 216 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 217 PetscFunctionReturn(0); 218 } 219 220 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 221 { 222 PetscErrorCode ierr; 223 Mat B; 224 Mat_SeqAIJ *aij; 225 226 PetscFunctionBegin; 227 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 228 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 229 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 230 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 231 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 232 } 233 234 B = *newmat; 235 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 236 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 237 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 238 ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 239 /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 240 aij = (Mat_SeqAIJ*)B->data; 241 aij->inode.use = PETSC_FALSE; 242 243 B->offloadmask = PETSC_OFFLOAD_CPU; 244 PetscFunctionReturn(0); 245 } 246 247 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 248 { 249 PetscErrorCode ierr; 250 251 PetscFunctionBegin; 252 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 253 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 254 PetscFunctionReturn(0); 255 } 256 257 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 258 { 259 PetscErrorCode ierr; 260 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 261 262 PetscFunctionBegin; 263 if (aijkok && aijkok->device_mat_d.data()) { 264 delete aijkok->colmap_d; 265 delete aijkok->i_uncompressed_d; 266 } 267 delete aijkok; 268 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 269 PetscFunctionReturn(0); 270 } 271 272 #if 0 273 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 274 { 275 MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 276 277 PetscFunctionBegin; 278 delete kh; 279 PetscFunctionReturn(0); 280 } 281 282 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 283 { 284 Mat_Product *product = C->product; 285 Mat A,B; 286 MatProductType ptype; 287 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 288 bool tA,tB; 289 PetscErrorCode ierr; 290 MatMatKernelHandle_t *kh; 291 Mat_SeqAIJ *c; 292 293 PetscFunctionBegin; 294 MatCheckProduct(C,1); 295 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 296 A = product->A; 297 B = product->B; 298 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 299 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 300 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 301 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 302 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 303 kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 304 ptype = product->type; 305 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 306 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 307 switch (ptype) { 308 case MATPRODUCT_AB: 309 tA = false; 310 tB = false; 311 break; 312 case MATPRODUCT_AtB: 313 tA = true; 314 tB = false; 315 break; 316 case MATPRODUCT_ABt: 317 tA = false; 318 tB = true; 319 break; 320 default: 321 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 322 } 323 324 KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 325 C->offloadmask = PETSC_OFFLOAD_GPU; 326 /* shorter version of MatAssemblyEnd_SeqAIJ */ 327 c = (Mat_SeqAIJ*)C->data; 328 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 329 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 330 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 331 c->reallocs = 0; 332 C->info.mallocs += 0; 333 C->info.nz_unneeded = 0; 334 C->assembled = C->was_assembled = PETSC_TRUE; 335 C->num_ass++; 336 /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 337 // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 338 C->offloadmask = PETSC_OFFLOAD_BOTH; 339 // Also, we should add support to copy back from device to host 340 PetscFunctionReturn(0); 341 } 342 343 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 344 { 345 Mat_Product *product = C->product; 346 Mat A,B; 347 MatProductType ptype; 348 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 349 PetscInt m,n,k; 350 bool tA,tB; 351 PetscErrorCode ierr; 352 Mat_SeqAIJ *c; 353 MatMatKernelHandle_t *kh; 354 355 PetscFunctionBegin; 356 MatCheckProduct(C,1); 357 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 358 A = product->A; 359 B = product->B; 360 // TODO only copy the i,j data, not the values 361 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 362 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 363 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 364 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 365 ptype = product->type; 366 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 367 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 368 switch (ptype) { 369 case MATPRODUCT_AB: 370 tA = false; 371 tB = false; 372 m = A->rmap->n; 373 n = B->cmap->n; 374 break; 375 case MATPRODUCT_AtB: 376 tA = true; 377 tB = false; 378 m = A->cmap->n; 379 n = B->cmap->n; 380 break; 381 case MATPRODUCT_ABt: 382 tA = false; 383 tB = true; 384 m = A->rmap->n; 385 n = B->rmap->n; 386 break; 387 default: 388 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 389 } 390 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 391 ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 392 c = (Mat_SeqAIJ*)C->data; 393 394 kh = new MatMatKernelHandle_t; 395 // TODO SZ: ADD RUNTIME SELECTION OF THESE 396 kh->set_team_work_size(16); 397 kh->set_dynamic_scheduling(true); 398 // Select an spgemm algorithm, limited by configuration at compile-time and 399 // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 400 // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 401 std::string myalg("SPGEMM_KK_MEMORY"); 402 kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 403 404 ///////////////////////////////////// 405 // TODO JZ 406 ckok = NULL; //new Mat_SeqAIJKokkos(); 407 C->spptr = ckok; 408 KokkosCsrMatrix_t ccsr; // here only to have the code compile 409 KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 410 //cerr = WaitForKOKKOS();CHKERRCUDA(cerr); 411 //c->nz = get_nnz_from_ccsr 412 ////////////////////////////////////// 413 c->singlemalloc = PETSC_FALSE; 414 c->free_a = PETSC_TRUE; 415 c->free_ij = PETSC_TRUE; 416 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 417 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 418 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 419 //////////////////////////////////// 420 // TODO JZ copy from device to c->i and c->j 421 //////////////////////////////////// 422 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 423 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 424 c->maxnz = c->nz; 425 c->nonzerorowcnt = 0; 426 c->rmax = 0; 427 for (k = 0; k < m; k++) { 428 const PetscInt nn = c->i[k+1] - c->i[k]; 429 c->ilen[k] = c->imax[k] = nn; 430 c->nonzerorowcnt += (PetscInt)!!nn; 431 c->rmax = PetscMax(c->rmax,nn); 432 } 433 434 C->nonzerostate++; 435 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 436 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 437 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 438 ckok->nonzerostate = C->nonzerostate; 439 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 440 C->preallocated = PETSC_TRUE; 441 C->assembled = PETSC_FALSE; 442 C->was_assembled = PETSC_FALSE; 443 444 C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 445 C->product->data = kh; 446 C->product->destroy = MatMatKernelHandleDestroy_Private; 447 PetscFunctionReturn(0); 448 } 449 450 /* handles sparse matrix matrix ops */ 451 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 452 { 453 Mat_Product *product = mat->product; 454 PetscErrorCode ierr; 455 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 456 457 PetscFunctionBegin; 458 MatCheckProduct(mat,1); 459 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 460 if (product->type == MATPRODUCT_ABC) { 461 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 462 } 463 if (Biskok && Ciskok) { 464 switch (product->type) { 465 case MATPRODUCT_AB: 466 case MATPRODUCT_AtB: 467 case MATPRODUCT_ABt: 468 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 469 break; 470 case MATPRODUCT_PtAP: 471 case MATPRODUCT_RARt: 472 case MATPRODUCT_ABC: 473 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 474 break; 475 default: 476 break; 477 } 478 } else { /* fallback for AIJ */ 479 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 480 } 481 PetscFunctionReturn(0); 482 } 483 #endif 484 485 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 486 { 487 PetscErrorCode ierr; 488 489 PetscFunctionBegin; 490 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 491 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 492 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 493 PetscFunctionReturn(0); 494 } 495 496 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 497 { 498 PetscErrorCode ierr; 499 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 500 PetscFunctionBegin; 501 if (aijkok && aijkok->device_mat_d.data()) { 502 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 503 } 504 ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 505 PetscFunctionReturn(0); 506 } 507 508 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 509 { 510 PetscErrorCode ierr; 511 PetscBool both = PETSC_FALSE; 512 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 513 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 514 515 PetscFunctionBegin; 516 if (aijkok && aijkok->a_d.data()) { 517 Kokkos::parallel_for (aijkok->a_d.size(), KOKKOS_LAMBDA (int i) { aijkok->a_d(i) = 0; }); 518 both = PETSC_TRUE; 519 } 520 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 521 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 522 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 523 else A->offloadmask = PETSC_OFFLOAD_CPU; 524 525 PetscFunctionReturn(0); 526 } 527 528 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc. 529 { 530 PetscErrorCode ierr; 531 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 532 PetscBool flgx,flgy; 533 534 PetscFunctionBegin; 535 if (a == (PetscScalar)0.0) PetscFunctionReturn(0); 536 PetscValidHeaderSpecific(Y,MAT_CLASSID,1); 537 PetscValidHeaderSpecific(X,MAT_CLASSID,3); 538 ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJKOKKOS,&flgy);CHKERRQ(ierr); 539 ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJKOKKOS,&flgx);CHKERRQ(ierr); 540 if (!flgx || !flgy) { 541 ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 542 PetscFunctionReturn(0); 543 } 544 if (str == DIFFERENT_NONZERO_PATTERN) { 545 if (x->nz == y->nz) { 546 PetscBool e; 547 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 548 if (e) { 549 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 550 if (e) { 551 str = SAME_NONZERO_PATTERN; 552 } 553 } 554 } 555 } 556 if (str != SAME_NONZERO_PATTERN) { 557 ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 558 PetscFunctionReturn(0); 559 } else { 560 if (Y->offloadmask==PETSC_OFFLOAD_CPU && X->offloadmask==PETSC_OFFLOAD_CPU) { 561 ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 562 PetscFunctionReturn(0); 563 } else if ((Y->offloadmask==PETSC_OFFLOAD_GPU || Y->offloadmask==PETSC_OFFLOAD_BOTH) && X->offloadmask == PETSC_OFFLOAD_CPU) { 564 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 565 } else if ((X->offloadmask==PETSC_OFFLOAD_GPU || X->offloadmask==PETSC_OFFLOAD_BOTH) && Y->offloadmask == PETSC_OFFLOAD_CPU) { 566 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); /* promote Y */ 567 } 568 { 569 Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 570 Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 571 if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) { 572 Kokkos::parallel_for (aijkokY->a_d.size(), KOKKOS_LAMBDA (int i) { aijkokY->a_d(i) += a*aijkokX->a_d(i); }); 573 Kokkos::deep_copy(aijkokY->a_h,aijkokY->a_d); // we could not copy and keep GPU 574 Y->offloadmask = PETSC_OFFLOAD_BOTH; 575 ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr); 576 } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???"); 577 } 578 } 579 PetscFunctionReturn(0); 580 } 581 582 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 583 { 584 PetscFunctionBegin; 585 A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 586 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 587 A->ops->destroy = MatDestroy_SeqAIJKokkos; 588 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 589 A->ops->axpy = MatAXPY_SeqAIJKokkos; 590 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 591 //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 592 A->ops->mult = MatMult_SeqAIJKokkos; 593 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 594 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 595 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 596 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 597 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 598 PetscFunctionReturn(0); 599 } 600 601 /* --------------------------------------------------------------------------------*/ 602 /*C 603 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 604 (the default parallel PETSc format). This matrix will ultimately be handled by 605 Kokkos for calculations. For good matrix 606 assembly performance the user should preallocate the matrix storage by setting 607 the parameter nz (or the array nnz). By setting these parameters accurately, 608 performance during matrix assembly can be increased by more than a factor of 50. 609 610 Collective 611 612 Input Parameters: 613 + comm - MPI communicator, set to PETSC_COMM_SELF 614 . m - number of rows 615 . n - number of columns 616 . nz - number of nonzeros per row (same for all rows) 617 - nnz - array containing the number of nonzeros in the various rows 618 (possibly different for each row) or NULL 619 620 Output Parameter: 621 . A - the matrix 622 623 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 624 MatXXXXSetPreallocation() paradgm instead of this routine directly. 625 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 626 627 Notes: 628 If nnz is given then nz is ignored 629 630 The AIJ format (also called the Yale sparse matrix format or 631 compressed row storage), is fully compatible with standard Fortran 77 632 storage. That is, the stored row and column indices can begin at 633 either one (as in Fortran) or zero. See the users' manual for details. 634 635 Specify the preallocated storage with either nz or nnz (not both). 636 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 637 allocation. For large problems you MUST preallocate memory or you 638 will get TERRIBLE performance, see the users' manual chapter on matrices. 639 640 By default, this format uses inodes (identical nodes) when possible, to 641 improve numerical efficiency of matrix-vector products and solves. We 642 search for consecutive rows with the same nonzero structure, thereby 643 reusing matrix information to achieve increased efficiency. 644 645 Level: intermediate 646 647 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 648 @*/ 649 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 650 { 651 PetscErrorCode ierr; 652 653 PetscFunctionBegin; 654 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 655 ierr = MatCreate(comm,A);CHKERRQ(ierr); 656 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 657 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 658 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 659 PetscFunctionReturn(0); 660 } 661