1 #include "petsc/private/petscimpl.h" 2 #include <petscsystypes.h> 3 #include <petscerror.h> 4 #include <petscveckokkos.hpp> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosBlas.hpp> 8 #include <KokkosSparse_CrsMatrix.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <KokkosSparse_spmv.hpp> 11 #include <../src/mat/impls/aij/seq/aij.h> 12 13 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 14 #include <petscmat.h> 15 16 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 17 18 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 19 { 20 PetscErrorCode ierr; 21 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 22 23 PetscFunctionBegin; 24 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 25 if (aijkok && aijkok->device_mat_d.data()) { 26 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 27 } 28 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 29 PetscFunctionReturn(0); 30 } 31 32 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 33 { 34 Mat_SeqAIJ *aijseq = (Mat_SeqAIJ*)A->data; 35 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 36 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 37 38 PetscFunctionBegin; 39 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 40 /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */ 41 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 42 delete aijkok; 43 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 44 aijkok->nonzerostate = A->nonzerostate; 45 A->spptr = aijkok; 46 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 47 Kokkos::deep_copy(aijkok->a_d,aijkok->a_h); 48 } 49 A->offloadmask = PETSC_OFFLOAD_BOTH; 50 PetscFunctionReturn(0); 51 } 52 53 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 54 { 55 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 56 57 PetscFunctionBegin; 58 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 59 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 60 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 61 Kokkos::deep_copy(aijkok->a_h,aijkok->a_d); 62 A->offloadmask = PETSC_OFFLOAD_BOTH; 63 } 64 PetscFunctionReturn(0); 65 } 66 67 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 68 { 69 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 70 PetscErrorCode ierr; 71 72 PetscFunctionBegin; 73 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 74 *array = a->a; 75 A->offloadmask = PETSC_OFFLOAD_CPU; 76 PetscFunctionReturn(0); 77 } 78 79 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 80 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat) 81 { 82 Mat_SeqAIJKokkos *aijkok; 83 Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat); 84 85 PetscFunctionBegin; 86 // ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 87 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 88 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 89 aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k); 90 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 91 PetscFunctionReturn(0); 92 } 93 94 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 95 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat) 96 { 97 Mat_SeqAIJKokkos *aijkok; 98 99 PetscFunctionBegin; 100 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 101 if (aijkok && aijkok->device_mat_d.data()) { 102 *d_mat = aijkok->device_mat_d.data(); 103 } else { 104 PetscErrorCode ierr; 105 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 106 *d_mat = NULL; 107 } 108 PetscFunctionReturn(0); 109 } 110 111 /* y = A x */ 112 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 113 { 114 PetscErrorCode ierr; 115 Mat_SeqAIJKokkos *aijkok; 116 ConstPetscScalarViewDevice_t xv; 117 PetscScalarViewDevice_t yv; 118 119 PetscFunctionBegin; 120 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 121 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 122 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 123 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 124 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 125 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 126 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 127 /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 128 ierr = WaitForKokkos();CHKERRQ(ierr); 129 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 130 PetscFunctionReturn(0); 131 } 132 133 /* y = A^T x */ 134 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 135 { 136 PetscErrorCode ierr; 137 Mat_SeqAIJKokkos *aijkok; 138 ConstPetscScalarViewDevice_t xv; 139 PetscScalarViewDevice_t yv; 140 141 PetscFunctionBegin; 142 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 143 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 144 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 145 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 146 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 147 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 148 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 149 ierr = WaitForKokkos();CHKERRQ(ierr); 150 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 151 PetscFunctionReturn(0); 152 } 153 154 /* y = A^H x */ 155 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 156 { 157 PetscErrorCode ierr; 158 Mat_SeqAIJKokkos *aijkok; 159 ConstPetscScalarViewDevice_t xv; 160 PetscScalarViewDevice_t yv; 161 162 PetscFunctionBegin; 163 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 164 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 165 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 166 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 167 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 168 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 169 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 170 ierr = WaitForKokkos();CHKERRQ(ierr); 171 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 172 PetscFunctionReturn(0); 173 } 174 175 /* z = A x + y */ 176 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 177 { 178 PetscErrorCode ierr; 179 Mat_SeqAIJKokkos *aijkok; 180 ConstPetscScalarViewDevice_t xv,yv; 181 PetscScalarViewDevice_t zv; 182 183 PetscFunctionBegin; 184 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 185 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 186 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 187 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 188 if (zz != yy) Kokkos::deep_copy(zv,yv); 189 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 190 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 191 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 192 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 193 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 194 ierr = WaitForKokkos();CHKERRQ(ierr); 195 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 196 PetscFunctionReturn(0); 197 } 198 199 /* z = A^T x + y */ 200 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 201 { 202 PetscErrorCode ierr; 203 Mat_SeqAIJKokkos *aijkok; 204 ConstPetscScalarViewDevice_t xv,yv; 205 PetscScalarViewDevice_t zv; 206 207 PetscFunctionBegin; 208 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 209 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 210 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 211 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 212 if (zz != yy) Kokkos::deep_copy(zv,yv); 213 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 214 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 215 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 216 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 217 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 218 ierr = WaitForKokkos();CHKERRQ(ierr); 219 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 220 PetscFunctionReturn(0); 221 } 222 223 /* z = A^H x + y */ 224 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 225 { 226 PetscErrorCode ierr; 227 Mat_SeqAIJKokkos *aijkok; 228 ConstPetscScalarViewDevice_t xv,yv; 229 PetscScalarViewDevice_t zv; 230 231 PetscFunctionBegin; 232 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 233 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 234 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 235 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 236 if (zz != yy) Kokkos::deep_copy(zv,yv); 237 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 238 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 239 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 240 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 241 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 242 ierr = WaitForKokkos();CHKERRQ(ierr); 243 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 244 PetscFunctionReturn(0); 245 } 246 247 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 248 { 249 PetscErrorCode ierr; 250 Mat B; 251 Mat_SeqAIJ *aij; 252 253 PetscFunctionBegin; 254 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 255 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 256 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 257 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 258 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 259 } 260 261 B = *newmat; 262 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 263 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 264 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 265 ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 266 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 267 /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 268 aij = (Mat_SeqAIJ*)B->data; 269 aij->inode.use = PETSC_FALSE; 270 271 B->offloadmask = PETSC_OFFLOAD_CPU; 272 PetscFunctionReturn(0); 273 } 274 275 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 276 { 277 PetscErrorCode ierr; 278 279 PetscFunctionBegin; 280 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 281 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 282 PetscFunctionReturn(0); 283 } 284 285 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 286 { 287 PetscErrorCode ierr; 288 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 289 290 PetscFunctionBegin; 291 if (aijkok && aijkok->device_mat_d.data()) { 292 delete aijkok->colmap_d; 293 delete aijkok->i_uncompressed_d; 294 } 295 delete aijkok; 296 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 297 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 298 PetscFunctionReturn(0); 299 } 300 301 #if 0 302 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 303 { 304 MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 305 306 PetscFunctionBegin; 307 delete kh; 308 PetscFunctionReturn(0); 309 } 310 311 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 312 { 313 Mat_Product *product = C->product; 314 Mat A,B; 315 MatProductType ptype; 316 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 317 bool tA,tB; 318 PetscErrorCode ierr; 319 MatMatKernelHandle_t *kh; 320 Mat_SeqAIJ *c; 321 322 PetscFunctionBegin; 323 MatCheckProduct(C,1); 324 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 325 A = product->A; 326 B = product->B; 327 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 328 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 329 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 330 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 331 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 332 kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 333 ptype = product->type; 334 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 335 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 336 switch (ptype) { 337 case MATPRODUCT_AB: 338 tA = false; 339 tB = false; 340 break; 341 case MATPRODUCT_AtB: 342 tA = true; 343 tB = false; 344 break; 345 case MATPRODUCT_ABt: 346 tA = false; 347 tB = true; 348 break; 349 default: 350 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 351 } 352 353 KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 354 C->offloadmask = PETSC_OFFLOAD_GPU; 355 /* shorter version of MatAssemblyEnd_SeqAIJ */ 356 c = (Mat_SeqAIJ*)C->data; 357 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 358 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 359 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 360 c->reallocs = 0; 361 C->info.mallocs += 0; 362 C->info.nz_unneeded = 0; 363 C->assembled = C->was_assembled = PETSC_TRUE; 364 C->num_ass++; 365 /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 366 // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 367 C->offloadmask = PETSC_OFFLOAD_BOTH; 368 // Also, we should add support to copy back from device to host 369 PetscFunctionReturn(0); 370 } 371 372 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 373 { 374 Mat_Product *product = C->product; 375 Mat A,B; 376 MatProductType ptype; 377 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 378 PetscInt m,n,k; 379 bool tA,tB; 380 PetscErrorCode ierr; 381 Mat_SeqAIJ *c; 382 MatMatKernelHandle_t *kh; 383 384 PetscFunctionBegin; 385 MatCheckProduct(C,1); 386 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 387 A = product->A; 388 B = product->B; 389 // TODO only copy the i,j data, not the values 390 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 391 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 392 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 393 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 394 ptype = product->type; 395 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 396 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 397 switch (ptype) { 398 case MATPRODUCT_AB: 399 tA = false; 400 tB = false; 401 m = A->rmap->n; 402 n = B->cmap->n; 403 break; 404 case MATPRODUCT_AtB: 405 tA = true; 406 tB = false; 407 m = A->cmap->n; 408 n = B->cmap->n; 409 break; 410 case MATPRODUCT_ABt: 411 tA = false; 412 tB = true; 413 m = A->rmap->n; 414 n = B->rmap->n; 415 break; 416 default: 417 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 418 } 419 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 420 ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 421 c = (Mat_SeqAIJ*)C->data; 422 423 kh = new MatMatKernelHandle_t; 424 // TODO SZ: ADD RUNTIME SELECTION OF THESE 425 kh->set_team_work_size(16); 426 kh->set_dynamic_scheduling(true); 427 // Select an spgemm algorithm, limited by configuration at compile-time and 428 // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 429 // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 430 std::string myalg("SPGEMM_KK_MEMORY"); 431 kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 432 433 ///////////////////////////////////// 434 // TODO JZ 435 ckok = NULL; //new Mat_SeqAIJKokkos(); 436 C->spptr = ckok; 437 KokkosCsrMatrix_t ccsr; // here only to have the code compile 438 KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 439 //cerr = WaitForKOKKOS();CHKERRCUDA(cerr); 440 //c->nz = get_nnz_from_ccsr 441 ////////////////////////////////////// 442 c->singlemalloc = PETSC_FALSE; 443 c->free_a = PETSC_TRUE; 444 c->free_ij = PETSC_TRUE; 445 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 446 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 447 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 448 //////////////////////////////////// 449 // TODO JZ copy from device to c->i and c->j 450 //////////////////////////////////// 451 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 452 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 453 c->maxnz = c->nz; 454 c->nonzerorowcnt = 0; 455 c->rmax = 0; 456 for (k = 0; k < m; k++) { 457 const PetscInt nn = c->i[k+1] - c->i[k]; 458 c->ilen[k] = c->imax[k] = nn; 459 c->nonzerorowcnt += (PetscInt)!!nn; 460 c->rmax = PetscMax(c->rmax,nn); 461 } 462 463 C->nonzerostate++; 464 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 465 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 466 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 467 ckok->nonzerostate = C->nonzerostate; 468 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 469 C->preallocated = PETSC_TRUE; 470 C->assembled = PETSC_FALSE; 471 C->was_assembled = PETSC_FALSE; 472 473 C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 474 C->product->data = kh; 475 C->product->destroy = MatMatKernelHandleDestroy_Private; 476 PetscFunctionReturn(0); 477 } 478 479 /* handles sparse matrix matrix ops */ 480 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 481 { 482 Mat_Product *product = mat->product; 483 PetscErrorCode ierr; 484 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 485 486 PetscFunctionBegin; 487 MatCheckProduct(mat,1); 488 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 489 if (product->type == MATPRODUCT_ABC) { 490 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 491 } 492 if (Biskok && Ciskok) { 493 switch (product->type) { 494 case MATPRODUCT_AB: 495 case MATPRODUCT_AtB: 496 case MATPRODUCT_ABt: 497 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 498 break; 499 case MATPRODUCT_PtAP: 500 case MATPRODUCT_RARt: 501 case MATPRODUCT_ABC: 502 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 503 break; 504 default: 505 break; 506 } 507 } else { /* fallback for AIJ */ 508 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 509 } 510 PetscFunctionReturn(0); 511 } 512 #endif 513 514 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 515 { 516 PetscErrorCode ierr; 517 518 PetscFunctionBegin; 519 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 520 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 521 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 522 PetscFunctionReturn(0); 523 } 524 525 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 526 { 527 PetscErrorCode ierr; 528 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 529 PetscFunctionBegin; 530 if (aijkok && aijkok->device_mat_d.data()) { 531 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 532 } 533 ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 534 PetscFunctionReturn(0); 535 } 536 537 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 538 { 539 PetscErrorCode ierr; 540 Mat_SeqAIJKokkos *aijkok; 541 542 PetscFunctionBegin; 543 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 544 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 545 KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 546 A->offloadmask = PETSC_OFFLOAD_GPU; 547 ierr = WaitForKokkos();CHKERRQ(ierr); 548 ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 549 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 550 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 551 PetscFunctionReturn(0); 552 } 553 554 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 555 { 556 PetscErrorCode ierr; 557 PetscBool both = PETSC_FALSE; 558 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 559 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 560 561 PetscFunctionBegin; 562 if (aijkok && aijkok->a_d.data()) { 563 KokkosBlas::fill(aijkok->a_d,0.0); 564 both = PETSC_TRUE; 565 } 566 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 567 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 568 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 569 else A->offloadmask = PETSC_OFFLOAD_CPU; 570 PetscFunctionReturn(0); 571 } 572 573 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 574 { 575 PetscErrorCode ierr; 576 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 577 578 PetscFunctionBegin; 579 if (X->ops->axpy != Y->ops->axpy) { 580 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 581 PetscFunctionReturn(0); 582 } 583 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 584 PetscBool e; 585 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 586 if (e) { 587 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 588 if (e) str = SAME_NONZERO_PATTERN; 589 } 590 } 591 if (str != SAME_NONZERO_PATTERN) { 592 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 593 PetscFunctionReturn(0); 594 } else { 595 if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) { 596 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 597 PetscFunctionReturn(0); 598 } else { 599 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 600 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 601 } 602 Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 603 Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 604 if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) { 605 KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d); 606 Y->offloadmask = PETSC_OFFLOAD_GPU; 607 ierr = WaitForKokkos();CHKERRQ(ierr); 608 ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr); 609 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 610 ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr); 611 } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???"); 612 } 613 PetscFunctionReturn(0); 614 } 615 616 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 617 { 618 PetscFunctionBegin; 619 A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 620 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 621 A->ops->destroy = MatDestroy_SeqAIJKokkos; 622 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 623 A->ops->axpy = MatAXPY_SeqAIJKokkos; 624 A->ops->scale = MatScale_SeqAIJKokkos; 625 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 626 //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 627 A->ops->mult = MatMult_SeqAIJKokkos; 628 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 629 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 630 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 631 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 632 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 633 PetscFunctionReturn(0); 634 } 635 636 /* --------------------------------------------------------------------------------*/ 637 /*C 638 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 639 (the default parallel PETSc format). This matrix will ultimately be handled by 640 Kokkos for calculations. For good matrix 641 assembly performance the user should preallocate the matrix storage by setting 642 the parameter nz (or the array nnz). By setting these parameters accurately, 643 performance during matrix assembly can be increased by more than a factor of 50. 644 645 Collective 646 647 Input Parameters: 648 + comm - MPI communicator, set to PETSC_COMM_SELF 649 . m - number of rows 650 . n - number of columns 651 . nz - number of nonzeros per row (same for all rows) 652 - nnz - array containing the number of nonzeros in the various rows 653 (possibly different for each row) or NULL 654 655 Output Parameter: 656 . A - the matrix 657 658 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 659 MatXXXXSetPreallocation() paradgm instead of this routine directly. 660 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 661 662 Notes: 663 If nnz is given then nz is ignored 664 665 The AIJ format (also called the Yale sparse matrix format or 666 compressed row storage), is fully compatible with standard Fortran 77 667 storage. That is, the stored row and column indices can begin at 668 either one (as in Fortran) or zero. See the users' manual for details. 669 670 Specify the preallocated storage with either nz or nnz (not both). 671 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 672 allocation. For large problems you MUST preallocate memory or you 673 will get TERRIBLE performance, see the users' manual chapter on matrices. 674 675 By default, this format uses inodes (identical nodes) when possible, to 676 improve numerical efficiency of matrix-vector products and solves. We 677 search for consecutive rows with the same nonzero structure, thereby 678 reusing matrix information to achieve increased efficiency. 679 680 Level: intermediate 681 682 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 683 @*/ 684 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 685 { 686 PetscErrorCode ierr; 687 688 PetscFunctionBegin; 689 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 690 ierr = MatCreate(comm,A);CHKERRQ(ierr); 691 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 692 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 693 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 694 PetscFunctionReturn(0); 695 } 696