1 #include "petsc/private/petscimpl.h" 2 #include <petscsystypes.h> 3 #include <petscerror.h> 4 #include <petscveckokkos.hpp> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosBlas.hpp> 8 #include <KokkosSparse_CrsMatrix.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <KokkosSparse_spmv.hpp> 11 #include <../src/mat/impls/aij/seq/aij.h> 12 13 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 14 #include <petscmat.h> 15 16 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 17 18 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 19 { 20 PetscErrorCode ierr; 21 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 22 23 PetscFunctionBegin; 24 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 25 if (aijkok && aijkok->device_mat_d.data()) { 26 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 27 } 28 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 29 PetscFunctionReturn(0); 30 } 31 32 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 33 { 34 Mat_SeqAIJ *aijseq = (Mat_SeqAIJ*)A->data; 35 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 36 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 37 38 PetscFunctionBegin; 39 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 40 /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */ 41 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 42 delete aijkok; 43 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 44 aijkok->nonzerostate = A->nonzerostate; 45 A->spptr = aijkok; 46 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 47 Kokkos::deep_copy(aijkok->a_d,aijkok->a_h); 48 } 49 A->offloadmask = PETSC_OFFLOAD_BOTH; 50 PetscFunctionReturn(0); 51 } 52 53 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 54 { 55 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 56 57 PetscFunctionBegin; 58 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 59 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 60 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 61 Kokkos::deep_copy(aijkok->a_h,aijkok->a_d); 62 A->offloadmask = PETSC_OFFLOAD_BOTH; 63 } 64 PetscFunctionReturn(0); 65 } 66 67 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 68 { 69 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 70 PetscErrorCode ierr; 71 72 PetscFunctionBegin; 73 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 74 *array = a->a; 75 A->offloadmask = PETSC_OFFLOAD_CPU; 76 PetscFunctionReturn(0); 77 } 78 79 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 80 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat) 81 { 82 Mat_SeqAIJKokkos *aijkok; 83 Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat); 84 85 PetscFunctionBegin; 86 // ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 87 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 88 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 89 aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k); 90 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 91 PetscFunctionReturn(0); 92 } 93 94 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 95 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat) 96 { 97 Mat_SeqAIJKokkos *aijkok; 98 99 PetscFunctionBegin; 100 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 101 if (aijkok && aijkok->device_mat_d.data()) { 102 *d_mat = aijkok->device_mat_d.data(); 103 } else { 104 PetscErrorCode ierr; 105 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 106 *d_mat = NULL; 107 } 108 PetscFunctionReturn(0); 109 } 110 111 /* y = A x */ 112 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 113 { 114 PetscErrorCode ierr; 115 Mat_SeqAIJKokkos *aijkok; 116 ConstPetscScalarViewDevice_t xv; 117 PetscScalarViewDevice_t yv; 118 119 PetscFunctionBegin; 120 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 121 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 122 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 123 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 124 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 125 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 126 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 127 /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 128 ierr = WaitForKokkos();CHKERRQ(ierr); 129 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 130 PetscFunctionReturn(0); 131 } 132 133 /* y = A^T x */ 134 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 135 { 136 PetscErrorCode ierr; 137 Mat_SeqAIJKokkos *aijkok; 138 ConstPetscScalarViewDevice_t xv; 139 PetscScalarViewDevice_t yv; 140 141 PetscFunctionBegin; 142 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 143 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 144 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 145 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 146 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 147 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 148 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 149 ierr = WaitForKokkos();CHKERRQ(ierr); 150 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 151 PetscFunctionReturn(0); 152 } 153 154 /* y = A^H x */ 155 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 156 { 157 PetscErrorCode ierr; 158 Mat_SeqAIJKokkos *aijkok; 159 ConstPetscScalarViewDevice_t xv; 160 PetscScalarViewDevice_t yv; 161 162 PetscFunctionBegin; 163 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 164 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 165 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 166 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 167 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 168 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 169 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 170 ierr = WaitForKokkos();CHKERRQ(ierr); 171 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 172 PetscFunctionReturn(0); 173 } 174 175 /* z = A x + y */ 176 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 177 { 178 PetscErrorCode ierr; 179 Mat_SeqAIJKokkos *aijkok; 180 ConstPetscScalarViewDevice_t xv,yv; 181 PetscScalarViewDevice_t zv; 182 183 PetscFunctionBegin; 184 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 185 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 186 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 187 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 188 if (zz != yy) Kokkos::deep_copy(zv,yv); 189 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 190 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 191 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 192 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 193 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 194 ierr = WaitForKokkos();CHKERRQ(ierr); 195 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 196 PetscFunctionReturn(0); 197 } 198 199 /* z = A^T x + y */ 200 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 201 { 202 PetscErrorCode ierr; 203 Mat_SeqAIJKokkos *aijkok; 204 ConstPetscScalarViewDevice_t xv,yv; 205 PetscScalarViewDevice_t zv; 206 207 PetscFunctionBegin; 208 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 209 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 210 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 211 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 212 if (zz != yy) Kokkos::deep_copy(zv,yv); 213 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 214 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 215 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 216 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 217 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 218 ierr = WaitForKokkos();CHKERRQ(ierr); 219 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 220 PetscFunctionReturn(0); 221 } 222 223 /* z = A^H x + y */ 224 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 225 { 226 PetscErrorCode ierr; 227 Mat_SeqAIJKokkos *aijkok; 228 ConstPetscScalarViewDevice_t xv,yv; 229 PetscScalarViewDevice_t zv; 230 231 PetscFunctionBegin; 232 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 233 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 234 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 235 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 236 if (zz != yy) Kokkos::deep_copy(zv,yv); 237 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 238 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 239 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 240 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 241 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 242 ierr = WaitForKokkos();CHKERRQ(ierr); 243 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 244 PetscFunctionReturn(0); 245 } 246 247 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 248 { 249 PetscErrorCode ierr; 250 Mat B; 251 Mat_SeqAIJ *aij; 252 253 PetscFunctionBegin; 254 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 255 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 256 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 257 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 258 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 259 } 260 261 B = *newmat; 262 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 263 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 264 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 265 ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 266 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 267 /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 268 aij = (Mat_SeqAIJ*)B->data; 269 aij->inode.use = PETSC_FALSE; 270 271 B->offloadmask = PETSC_OFFLOAD_CPU; 272 PetscFunctionReturn(0); 273 } 274 275 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 276 { 277 PetscErrorCode ierr; 278 279 PetscFunctionBegin; 280 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 281 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 282 PetscFunctionReturn(0); 283 } 284 285 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 286 { 287 PetscErrorCode ierr; 288 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 289 290 PetscFunctionBegin; 291 if (aijkok && aijkok->device_mat_d.data()) { 292 delete aijkok->colmap_d; 293 delete aijkok->i_uncompressed_d; 294 } 295 delete aijkok; 296 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 297 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 298 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 299 PetscFunctionReturn(0); 300 } 301 302 #if 0 303 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 304 { 305 MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 306 307 PetscFunctionBegin; 308 delete kh; 309 PetscFunctionReturn(0); 310 } 311 312 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 313 { 314 Mat_Product *product = C->product; 315 Mat A,B; 316 MatProductType ptype; 317 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 318 bool tA,tB; 319 PetscErrorCode ierr; 320 MatMatKernelHandle_t *kh; 321 Mat_SeqAIJ *c; 322 323 PetscFunctionBegin; 324 MatCheckProduct(C,1); 325 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 326 A = product->A; 327 B = product->B; 328 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 329 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 330 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 331 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 332 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 333 kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 334 ptype = product->type; 335 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 336 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 337 switch (ptype) { 338 case MATPRODUCT_AB: 339 tA = false; 340 tB = false; 341 break; 342 case MATPRODUCT_AtB: 343 tA = true; 344 tB = false; 345 break; 346 case MATPRODUCT_ABt: 347 tA = false; 348 tB = true; 349 break; 350 default: 351 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 352 } 353 354 KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 355 C->offloadmask = PETSC_OFFLOAD_GPU; 356 /* shorter version of MatAssemblyEnd_SeqAIJ */ 357 c = (Mat_SeqAIJ*)C->data; 358 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 359 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 360 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 361 c->reallocs = 0; 362 C->info.mallocs += 0; 363 C->info.nz_unneeded = 0; 364 C->assembled = C->was_assembled = PETSC_TRUE; 365 C->num_ass++; 366 /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 367 // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 368 C->offloadmask = PETSC_OFFLOAD_BOTH; 369 // Also, we should add support to copy back from device to host 370 PetscFunctionReturn(0); 371 } 372 373 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 374 { 375 Mat_Product *product = C->product; 376 Mat A,B; 377 MatProductType ptype; 378 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 379 PetscInt m,n,k; 380 bool tA,tB; 381 PetscErrorCode ierr; 382 Mat_SeqAIJ *c; 383 MatMatKernelHandle_t *kh; 384 385 PetscFunctionBegin; 386 MatCheckProduct(C,1); 387 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 388 A = product->A; 389 B = product->B; 390 // TODO only copy the i,j data, not the values 391 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 392 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 393 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 394 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 395 ptype = product->type; 396 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 397 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 398 switch (ptype) { 399 case MATPRODUCT_AB: 400 tA = false; 401 tB = false; 402 m = A->rmap->n; 403 n = B->cmap->n; 404 break; 405 case MATPRODUCT_AtB: 406 tA = true; 407 tB = false; 408 m = A->cmap->n; 409 n = B->cmap->n; 410 break; 411 case MATPRODUCT_ABt: 412 tA = false; 413 tB = true; 414 m = A->rmap->n; 415 n = B->rmap->n; 416 break; 417 default: 418 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 419 } 420 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 421 ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 422 c = (Mat_SeqAIJ*)C->data; 423 424 kh = new MatMatKernelHandle_t; 425 // TODO SZ: ADD RUNTIME SELECTION OF THESE 426 kh->set_team_work_size(16); 427 kh->set_dynamic_scheduling(true); 428 // Select an spgemm algorithm, limited by configuration at compile-time and 429 // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 430 // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 431 std::string myalg("SPGEMM_KK_MEMORY"); 432 kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 433 434 ///////////////////////////////////// 435 // TODO JZ 436 ckok = NULL; //new Mat_SeqAIJKokkos(); 437 C->spptr = ckok; 438 KokkosCsrMatrix_t ccsr; // here only to have the code compile 439 KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 440 //cerr = WaitForKOKKOS();CHKERRCUDA(cerr); 441 //c->nz = get_nnz_from_ccsr 442 ////////////////////////////////////// 443 c->singlemalloc = PETSC_FALSE; 444 c->free_a = PETSC_TRUE; 445 c->free_ij = PETSC_TRUE; 446 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 447 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 448 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 449 //////////////////////////////////// 450 // TODO JZ copy from device to c->i and c->j 451 //////////////////////////////////// 452 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 453 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 454 c->maxnz = c->nz; 455 c->nonzerorowcnt = 0; 456 c->rmax = 0; 457 for (k = 0; k < m; k++) { 458 const PetscInt nn = c->i[k+1] - c->i[k]; 459 c->ilen[k] = c->imax[k] = nn; 460 c->nonzerorowcnt += (PetscInt)!!nn; 461 c->rmax = PetscMax(c->rmax,nn); 462 } 463 464 C->nonzerostate++; 465 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 466 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 467 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 468 ckok->nonzerostate = C->nonzerostate; 469 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 470 C->preallocated = PETSC_TRUE; 471 C->assembled = PETSC_FALSE; 472 C->was_assembled = PETSC_FALSE; 473 474 C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 475 C->product->data = kh; 476 C->product->destroy = MatMatKernelHandleDestroy_Private; 477 PetscFunctionReturn(0); 478 } 479 480 /* handles sparse matrix matrix ops */ 481 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 482 { 483 Mat_Product *product = mat->product; 484 PetscErrorCode ierr; 485 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 486 487 PetscFunctionBegin; 488 MatCheckProduct(mat,1); 489 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 490 if (product->type == MATPRODUCT_ABC) { 491 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 492 } 493 if (Biskok && Ciskok) { 494 switch (product->type) { 495 case MATPRODUCT_AB: 496 case MATPRODUCT_AtB: 497 case MATPRODUCT_ABt: 498 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 499 break; 500 case MATPRODUCT_PtAP: 501 case MATPRODUCT_RARt: 502 case MATPRODUCT_ABC: 503 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 504 break; 505 default: 506 break; 507 } 508 } else { /* fallback for AIJ */ 509 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 510 } 511 PetscFunctionReturn(0); 512 } 513 #endif 514 515 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 516 { 517 PetscErrorCode ierr; 518 519 PetscFunctionBegin; 520 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 521 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 522 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 523 PetscFunctionReturn(0); 524 } 525 526 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 527 { 528 PetscErrorCode ierr; 529 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 530 PetscFunctionBegin; 531 if (aijkok && aijkok->device_mat_d.data()) { 532 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 533 } 534 ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 535 PetscFunctionReturn(0); 536 } 537 538 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 539 { 540 PetscErrorCode ierr; 541 Mat_SeqAIJKokkos *aijkok; 542 543 PetscFunctionBegin; 544 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 545 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 546 KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 547 A->offloadmask = PETSC_OFFLOAD_GPU; 548 ierr = WaitForKokkos();CHKERRQ(ierr); 549 ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 550 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 551 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 552 PetscFunctionReturn(0); 553 } 554 555 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 556 { 557 PetscErrorCode ierr; 558 PetscBool both = PETSC_FALSE; 559 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 560 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 561 562 PetscFunctionBegin; 563 if (aijkok && aijkok->a_d.data()) { 564 KokkosBlas::fill(aijkok->a_d,0.0); 565 both = PETSC_TRUE; 566 } 567 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 568 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 569 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 570 else A->offloadmask = PETSC_OFFLOAD_CPU; 571 PetscFunctionReturn(0); 572 } 573 574 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 575 { 576 PetscErrorCode ierr; 577 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 578 579 PetscFunctionBegin; 580 if (X->ops->axpy != Y->ops->axpy) { 581 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 585 PetscBool e; 586 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 587 if (e) { 588 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 589 if (e) str = SAME_NONZERO_PATTERN; 590 } 591 } 592 if (str != SAME_NONZERO_PATTERN) { 593 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 594 PetscFunctionReturn(0); 595 } else { 596 if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) { 597 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 598 PetscFunctionReturn(0); 599 } else { 600 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 601 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 602 } 603 Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 604 Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 605 if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) { 606 KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d); 607 Y->offloadmask = PETSC_OFFLOAD_GPU; 608 ierr = WaitForKokkos();CHKERRQ(ierr); 609 ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr); 610 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 611 ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr); 612 } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???"); 613 } 614 PetscFunctionReturn(0); 615 } 616 617 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 618 { 619 PetscFunctionBegin; 620 A->boundtocpu = PETSC_FALSE; 621 622 A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 623 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 624 A->ops->destroy = MatDestroy_SeqAIJKokkos; 625 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 626 A->ops->axpy = MatAXPY_SeqAIJKokkos; 627 A->ops->scale = MatScale_SeqAIJKokkos; 628 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 629 //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 630 A->ops->mult = MatMult_SeqAIJKokkos; 631 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 632 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 633 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 634 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 635 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 636 PetscFunctionReturn(0); 637 } 638 639 /* --------------------------------------------------------------------------------*/ 640 /*C 641 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 642 (the default parallel PETSc format). This matrix will ultimately be handled by 643 Kokkos for calculations. For good matrix 644 assembly performance the user should preallocate the matrix storage by setting 645 the parameter nz (or the array nnz). By setting these parameters accurately, 646 performance during matrix assembly can be increased by more than a factor of 50. 647 648 Collective 649 650 Input Parameters: 651 + comm - MPI communicator, set to PETSC_COMM_SELF 652 . m - number of rows 653 . n - number of columns 654 . nz - number of nonzeros per row (same for all rows) 655 - nnz - array containing the number of nonzeros in the various rows 656 (possibly different for each row) or NULL 657 658 Output Parameter: 659 . A - the matrix 660 661 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 662 MatXXXXSetPreallocation() paradgm instead of this routine directly. 663 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 664 665 Notes: 666 If nnz is given then nz is ignored 667 668 The AIJ format (also called the Yale sparse matrix format or 669 compressed row storage), is fully compatible with standard Fortran 77 670 storage. That is, the stored row and column indices can begin at 671 either one (as in Fortran) or zero. See the users' manual for details. 672 673 Specify the preallocated storage with either nz or nnz (not both). 674 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 675 allocation. For large problems you MUST preallocate memory or you 676 will get TERRIBLE performance, see the users' manual chapter on matrices. 677 678 By default, this format uses inodes (identical nodes) when possible, to 679 improve numerical efficiency of matrix-vector products and solves. We 680 search for consecutive rows with the same nonzero structure, thereby 681 reusing matrix information to achieve increased efficiency. 682 683 Level: intermediate 684 685 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 686 @*/ 687 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 688 { 689 PetscErrorCode ierr; 690 691 PetscFunctionBegin; 692 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 693 ierr = MatCreate(comm,A);CHKERRQ(ierr); 694 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 695 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 696 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 697 PetscFunctionReturn(0); 698 } 699 700 // factorizations 701 702 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info) 703 { 704 // Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 705 // IS isrow = b->row,iscol = b->col; 706 // PetscBool row_identity,col_identity; 707 PetscErrorCode ierr; 708 709 PetscFunctionBegin; 710 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 711 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 712 B->offloadmask = PETSC_OFFLOAD_CPU; 713 // solves stay on CPU now 714 /* determine which version of MatSolve needs to be used. */ 715 // ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 716 // ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 717 // if (row_identity && col_identity) { 718 // B->ops->solve = MatSolve_SeqAIJKOKKOS_NaturalOrdering; 719 // B->ops->solvetranspose = MatSolveTranspose_SeqAIJKOKKOS_NaturalOrdering; 720 // B->ops->matsolve = NULL; 721 // B->ops->matsolvetranspose = NULL; 722 // } else { 723 // B->ops->solve = MatSolve_SeqAIJKOKKOS; 724 // B->ops->solvetranspose = MatSolveTranspose_SeqAIJKOKKOS; 725 // B->ops->matsolve = NULL; 726 // B->ops->matsolvetranspose = NULL; 727 // } 728 729 /* get the triangular factors */ 730 // ierr = MatSeqAIJKOKKOSILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 731 PetscFunctionReturn(0); 732 } 733 734 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 735 { 736 PetscErrorCode ierr; 737 738 PetscFunctionBegin; 739 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 740 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS; 741 PetscFunctionReturn(0); 742 } 743 744 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*); 745 746 747 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 748 { 749 PetscErrorCode ierr; 750 751 PetscFunctionBegin; 752 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr); 753 PetscFunctionReturn(0); 754 } 755 756 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type) 757 { 758 PetscFunctionBegin; 759 *type = MATSOLVERKOKKOS; 760 PetscFunctionReturn(0); 761 } 762 763 /*MC 764 MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices 765 on a single GPU of type, seqaijkokkos, aijkokkos. 766 767 Level: beginner 768 769 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation 770 M*/ 771 772 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B) 773 { 774 PetscErrorCode ierr; 775 PetscInt n = A->rmap->n; 776 777 PetscFunctionBegin; 778 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 779 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 780 (*B)->factortype = ftype; 781 (*B)->useordering = PETSC_TRUE; 782 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 783 784 if (ftype == MAT_FACTOR_LU /* || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT*/) { 785 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 786 // (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKOKKOS; 787 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOS; 788 // } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 789 // (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKOKKOS; 790 // (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKOKKOS; 791 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 792 793 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 794 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr); 795 PetscFunctionReturn(0); 796 } 797