1 #include "petsc/private/petscimpl.h" 2 #include <petscsystypes.h> 3 #include <petscerror.h> 4 #include <petscveckokkos.hpp> 5 6 #include <Kokkos_Core.hpp> 7 #include <KokkosBlas.hpp> 8 #include <KokkosSparse_CrsMatrix.hpp> 9 #include <KokkosSparse_spmv.hpp> 10 #include <KokkosSparse_spmv.hpp> 11 #include <../src/mat/impls/aij/seq/aij.h> 12 13 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 14 #include <petscmat.h> 15 16 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 17 18 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 19 { 20 PetscErrorCode ierr; 21 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 22 23 PetscFunctionBegin; 24 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 25 if (aijkok && aijkok->device_mat_d.data()) { 26 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 27 } 28 /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 29 PetscFunctionReturn(0); 30 } 31 32 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 33 { 34 Mat_SeqAIJ *aijseq = (Mat_SeqAIJ*)A->data; 35 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 36 PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 37 38 PetscFunctionBegin; 39 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 40 /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */ 41 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 42 delete aijkok; 43 aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 44 aijkok->nonzerostate = A->nonzerostate; 45 A->spptr = aijkok; 46 } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 47 Kokkos::deep_copy(aijkok->a_d,aijkok->a_h); 48 } 49 A->offloadmask = PETSC_OFFLOAD_BOTH; 50 PetscFunctionReturn(0); 51 } 52 53 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 54 { 55 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 56 57 PetscFunctionBegin; 58 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 59 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 60 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 61 Kokkos::deep_copy(aijkok->a_h,aijkok->a_d); 62 A->offloadmask = PETSC_OFFLOAD_BOTH; 63 } 64 PetscFunctionReturn(0); 65 } 66 67 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 68 { 69 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 70 PetscErrorCode ierr; 71 72 PetscFunctionBegin; 73 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 74 *array = a->a; 75 A->offloadmask = PETSC_OFFLOAD_CPU; 76 PetscFunctionReturn(0); 77 } 78 79 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 80 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat) 81 { 82 Mat_SeqAIJKokkos *aijkok; 83 Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat); 84 85 PetscFunctionBegin; 86 // ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 87 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 88 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 89 aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k); 90 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 91 PetscFunctionReturn(0); 92 } 93 94 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 95 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat) 96 { 97 Mat_SeqAIJKokkos *aijkok; 98 99 PetscFunctionBegin; 100 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 101 if (aijkok && aijkok->device_mat_d.data()) { 102 *d_mat = aijkok->device_mat_d.data(); 103 } else { 104 PetscErrorCode ierr; 105 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 106 *d_mat = NULL; 107 } 108 PetscFunctionReturn(0); 109 } 110 111 /* y = A x */ 112 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 113 { 114 PetscErrorCode ierr; 115 Mat_SeqAIJKokkos *aijkok; 116 ConstPetscScalarViewDevice_t xv; 117 PetscScalarViewDevice_t yv; 118 119 PetscFunctionBegin; 120 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 121 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 122 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 123 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 124 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 125 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 126 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 127 /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 128 ierr = WaitForKokkos();CHKERRQ(ierr); 129 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 130 PetscFunctionReturn(0); 131 } 132 133 /* y = A^T x */ 134 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 135 { 136 PetscErrorCode ierr; 137 Mat_SeqAIJKokkos *aijkok; 138 ConstPetscScalarViewDevice_t xv; 139 PetscScalarViewDevice_t yv; 140 141 PetscFunctionBegin; 142 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 143 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 144 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 145 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 146 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 147 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 148 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 149 ierr = WaitForKokkos();CHKERRQ(ierr); 150 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 151 PetscFunctionReturn(0); 152 } 153 154 /* y = A^H x */ 155 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 156 { 157 PetscErrorCode ierr; 158 Mat_SeqAIJKokkos *aijkok; 159 ConstPetscScalarViewDevice_t xv; 160 PetscScalarViewDevice_t yv; 161 162 PetscFunctionBegin; 163 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 164 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 165 ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 166 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 167 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 168 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 169 ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 170 ierr = WaitForKokkos();CHKERRQ(ierr); 171 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 172 PetscFunctionReturn(0); 173 } 174 175 /* z = A x + y */ 176 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 177 { 178 PetscErrorCode ierr; 179 Mat_SeqAIJKokkos *aijkok; 180 ConstPetscScalarViewDevice_t xv,yv; 181 PetscScalarViewDevice_t zv; 182 183 PetscFunctionBegin; 184 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 185 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 186 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 187 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 188 if (zz != yy) Kokkos::deep_copy(zv,yv); 189 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 190 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 191 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 192 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 193 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 194 ierr = WaitForKokkos();CHKERRQ(ierr); 195 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 196 PetscFunctionReturn(0); 197 } 198 199 /* z = A^T x + y */ 200 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 201 { 202 PetscErrorCode ierr; 203 Mat_SeqAIJKokkos *aijkok; 204 ConstPetscScalarViewDevice_t xv,yv; 205 PetscScalarViewDevice_t zv; 206 207 PetscFunctionBegin; 208 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 209 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 210 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 211 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 212 if (zz != yy) Kokkos::deep_copy(zv,yv); 213 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 214 KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 215 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 216 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 217 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 218 ierr = WaitForKokkos();CHKERRQ(ierr); 219 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 220 PetscFunctionReturn(0); 221 } 222 223 /* z = A^H x + y */ 224 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 225 { 226 PetscErrorCode ierr; 227 Mat_SeqAIJKokkos *aijkok; 228 ConstPetscScalarViewDevice_t xv,yv; 229 PetscScalarViewDevice_t zv; 230 231 PetscFunctionBegin; 232 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 233 ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 234 ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 235 ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 236 if (zz != yy) Kokkos::deep_copy(zv,yv); 237 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 238 KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 239 ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 240 ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 241 ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 242 ierr = WaitForKokkos();CHKERRQ(ierr); 243 ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 244 PetscFunctionReturn(0); 245 } 246 247 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 248 { 249 PetscErrorCode ierr; 250 Mat B; 251 Mat_SeqAIJ *aij; 252 253 PetscFunctionBegin; 254 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 255 if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 256 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 257 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 258 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 259 } 260 261 B = *newmat; 262 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 263 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 264 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 265 ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 266 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 267 /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 268 aij = (Mat_SeqAIJ*)B->data; 269 aij->inode.use = PETSC_FALSE; 270 271 B->offloadmask = PETSC_OFFLOAD_CPU; 272 PetscFunctionReturn(0); 273 } 274 275 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 276 { 277 PetscErrorCode ierr; 278 279 PetscFunctionBegin; 280 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 281 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 282 PetscFunctionReturn(0); 283 } 284 285 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 286 { 287 PetscErrorCode ierr; 288 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 289 290 PetscFunctionBegin; 291 if (aijkok) { 292 if (aijkok->device_mat_d.data()) { 293 delete aijkok->colmap_d; 294 delete aijkok->i_uncompressed_d; 295 } 296 if (aijkok->diag_d) delete aijkok->diag_d; 297 } 298 delete aijkok; 299 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 300 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 301 PetscFunctionReturn(0); 302 } 303 304 #if 0 305 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 306 { 307 MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 308 309 PetscFunctionBegin; 310 delete kh; 311 PetscFunctionReturn(0); 312 } 313 314 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 315 { 316 Mat_Product *product = C->product; 317 Mat A,B; 318 MatProductType ptype; 319 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 320 bool tA,tB; 321 PetscErrorCode ierr; 322 MatMatKernelHandle_t *kh; 323 Mat_SeqAIJ *c; 324 325 PetscFunctionBegin; 326 MatCheckProduct(C,1); 327 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 328 A = product->A; 329 B = product->B; 330 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 331 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 332 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 333 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 334 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 335 kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 336 ptype = product->type; 337 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 338 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 339 switch (ptype) { 340 case MATPRODUCT_AB: 341 tA = false; 342 tB = false; 343 break; 344 case MATPRODUCT_AtB: 345 tA = true; 346 tB = false; 347 break; 348 case MATPRODUCT_ABt: 349 tA = false; 350 tB = true; 351 break; 352 default: 353 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 354 } 355 356 KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 357 C->offloadmask = PETSC_OFFLOAD_GPU; 358 /* shorter version of MatAssemblyEnd_SeqAIJ */ 359 c = (Mat_SeqAIJ*)C->data; 360 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 361 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 362 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 363 c->reallocs = 0; 364 C->info.mallocs += 0; 365 C->info.nz_unneeded = 0; 366 C->assembled = C->was_assembled = PETSC_TRUE; 367 C->num_ass++; 368 /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 369 // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 370 C->offloadmask = PETSC_OFFLOAD_BOTH; 371 // Also, we should add support to copy back from device to host 372 PetscFunctionReturn(0); 373 } 374 375 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 376 { 377 Mat_Product *product = C->product; 378 Mat A,B; 379 MatProductType ptype; 380 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 381 PetscInt m,n,k; 382 bool tA,tB; 383 PetscErrorCode ierr; 384 Mat_SeqAIJ *c; 385 MatMatKernelHandle_t *kh; 386 387 PetscFunctionBegin; 388 MatCheckProduct(C,1); 389 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 390 A = product->A; 391 B = product->B; 392 // TODO only copy the i,j data, not the values 393 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 394 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 395 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 396 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 397 ptype = product->type; 398 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 399 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 400 switch (ptype) { 401 case MATPRODUCT_AB: 402 tA = false; 403 tB = false; 404 m = A->rmap->n; 405 n = B->cmap->n; 406 break; 407 case MATPRODUCT_AtB: 408 tA = true; 409 tB = false; 410 m = A->cmap->n; 411 n = B->cmap->n; 412 break; 413 case MATPRODUCT_ABt: 414 tA = false; 415 tB = true; 416 m = A->rmap->n; 417 n = B->rmap->n; 418 break; 419 default: 420 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 421 } 422 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 423 ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 424 c = (Mat_SeqAIJ*)C->data; 425 426 kh = new MatMatKernelHandle_t; 427 // TODO SZ: ADD RUNTIME SELECTION OF THESE 428 kh->set_team_work_size(16); 429 kh->set_dynamic_scheduling(true); 430 // Select an spgemm algorithm, limited by configuration at compile-time and 431 // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 432 // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 433 std::string myalg("SPGEMM_KK_MEMORY"); 434 kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 435 436 ///////////////////////////////////// 437 // TODO JZ 438 ckok = NULL; //new Mat_SeqAIJKokkos(); 439 C->spptr = ckok; 440 KokkosCsrMatrix_t ccsr; // here only to have the code compile 441 KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 442 //cerr = WaitForKOKKOS();CHKERRCUDA(cerr); 443 //c->nz = get_nnz_from_ccsr 444 ////////////////////////////////////// 445 c->singlemalloc = PETSC_FALSE; 446 c->free_a = PETSC_TRUE; 447 c->free_ij = PETSC_TRUE; 448 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 449 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 450 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 451 //////////////////////////////////// 452 // TODO JZ copy from device to c->i and c->j 453 //////////////////////////////////// 454 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 455 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 456 c->maxnz = c->nz; 457 c->nonzerorowcnt = 0; 458 c->rmax = 0; 459 for (k = 0; k < m; k++) { 460 const PetscInt nn = c->i[k+1] - c->i[k]; 461 c->ilen[k] = c->imax[k] = nn; 462 c->nonzerorowcnt += (PetscInt)!!nn; 463 c->rmax = PetscMax(c->rmax,nn); 464 } 465 466 C->nonzerostate++; 467 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 468 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 469 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 470 ckok->nonzerostate = C->nonzerostate; 471 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 472 C->preallocated = PETSC_TRUE; 473 C->assembled = PETSC_FALSE; 474 C->was_assembled = PETSC_FALSE; 475 476 C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 477 C->product->data = kh; 478 C->product->destroy = MatMatKernelHandleDestroy_Private; 479 PetscFunctionReturn(0); 480 } 481 482 /* handles sparse matrix matrix ops */ 483 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 484 { 485 Mat_Product *product = mat->product; 486 PetscErrorCode ierr; 487 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 488 489 PetscFunctionBegin; 490 MatCheckProduct(mat,1); 491 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 492 if (product->type == MATPRODUCT_ABC) { 493 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 494 } 495 if (Biskok && Ciskok) { 496 switch (product->type) { 497 case MATPRODUCT_AB: 498 case MATPRODUCT_AtB: 499 case MATPRODUCT_ABt: 500 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 501 break; 502 case MATPRODUCT_PtAP: 503 case MATPRODUCT_RARt: 504 case MATPRODUCT_ABC: 505 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 506 break; 507 default: 508 break; 509 } 510 } else { /* fallback for AIJ */ 511 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 512 } 513 PetscFunctionReturn(0); 514 } 515 #endif 516 517 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 518 { 519 PetscErrorCode ierr; 520 521 PetscFunctionBegin; 522 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 523 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 524 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 525 PetscFunctionReturn(0); 526 } 527 528 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 529 { 530 PetscErrorCode ierr; 531 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 532 PetscFunctionBegin; 533 if (aijkok && aijkok->device_mat_d.data()) { 534 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 535 } 536 ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 537 PetscFunctionReturn(0); 538 } 539 540 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 541 { 542 PetscErrorCode ierr; 543 Mat_SeqAIJKokkos *aijkok; 544 545 PetscFunctionBegin; 546 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 547 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 548 KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 549 A->offloadmask = PETSC_OFFLOAD_GPU; 550 ierr = WaitForKokkos();CHKERRQ(ierr); 551 ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 552 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 553 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 554 PetscFunctionReturn(0); 555 } 556 557 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 558 { 559 PetscErrorCode ierr; 560 PetscBool both = PETSC_FALSE; 561 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 562 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 563 564 PetscFunctionBegin; 565 if (aijkok && aijkok->a_d.data()) { 566 KokkosBlas::fill(aijkok->a_d,0.0); 567 both = PETSC_TRUE; 568 } 569 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 570 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 571 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 572 else A->offloadmask = PETSC_OFFLOAD_CPU; 573 PetscFunctionReturn(0); 574 } 575 576 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 577 { 578 PetscErrorCode ierr; 579 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 580 581 PetscFunctionBegin; 582 if (X->ops->axpy != Y->ops->axpy) { 583 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 584 PetscFunctionReturn(0); 585 } 586 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 587 PetscBool e; 588 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 589 if (e) { 590 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 591 if (e) str = SAME_NONZERO_PATTERN; 592 } 593 } 594 if (str != SAME_NONZERO_PATTERN) { 595 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 596 PetscFunctionReturn(0); 597 } else { 598 if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) { 599 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 600 PetscFunctionReturn(0); 601 } else { 602 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 603 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 604 } 605 Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 606 Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 607 if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) { 608 KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d); 609 Y->offloadmask = PETSC_OFFLOAD_GPU; 610 ierr = WaitForKokkos();CHKERRQ(ierr); 611 ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr); 612 // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 613 ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr); 614 } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???"); 615 } 616 PetscFunctionReturn(0); 617 } 618 619 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info) 620 { 621 PetscErrorCode ierr; 622 623 PetscFunctionBegin; 624 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 625 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 626 B->offloadmask = PETSC_OFFLOAD_CPU; 627 PetscFunctionReturn(0); 628 } 629 630 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 631 { 632 PetscFunctionBegin; 633 A->boundtocpu = PETSC_FALSE; 634 635 A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 636 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 637 A->ops->destroy = MatDestroy_SeqAIJKokkos; 638 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 639 A->ops->axpy = MatAXPY_SeqAIJKokkos; 640 A->ops->scale = MatScale_SeqAIJKokkos; 641 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 642 //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 643 A->ops->mult = MatMult_SeqAIJKokkos; 644 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 645 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 646 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 647 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 648 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 649 PetscFunctionReturn(0); 650 } 651 652 /* --------------------------------------------------------------------------------*/ 653 /*C 654 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 655 (the default parallel PETSc format). This matrix will ultimately be handled by 656 Kokkos for calculations. For good matrix 657 assembly performance the user should preallocate the matrix storage by setting 658 the parameter nz (or the array nnz). By setting these parameters accurately, 659 performance during matrix assembly can be increased by more than a factor of 50. 660 661 Collective 662 663 Input Parameters: 664 + comm - MPI communicator, set to PETSC_COMM_SELF 665 . m - number of rows 666 . n - number of columns 667 . nz - number of nonzeros per row (same for all rows) 668 - nnz - array containing the number of nonzeros in the various rows 669 (possibly different for each row) or NULL 670 671 Output Parameter: 672 . A - the matrix 673 674 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 675 MatXXXXSetPreallocation() paradgm instead of this routine directly. 676 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 677 678 Notes: 679 If nnz is given then nz is ignored 680 681 The AIJ format (also called the Yale sparse matrix format or 682 compressed row storage), is fully compatible with standard Fortran 77 683 storage. That is, the stored row and column indices can begin at 684 either one (as in Fortran) or zero. See the users' manual for details. 685 686 Specify the preallocated storage with either nz or nnz (not both). 687 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 688 allocation. For large problems you MUST preallocate memory or you 689 will get TERRIBLE performance, see the users' manual chapter on matrices. 690 691 By default, this format uses inodes (identical nodes) when possible, to 692 improve numerical efficiency of matrix-vector products and solves. We 693 search for consecutive rows with the same nonzero structure, thereby 694 reusing matrix information to achieve increased efficiency. 695 696 Level: intermediate 697 698 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 699 @*/ 700 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 701 { 702 PetscErrorCode ierr; 703 704 PetscFunctionBegin; 705 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 706 ierr = MatCreate(comm,A);CHKERRQ(ierr); 707 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 708 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 709 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 710 PetscFunctionReturn(0); 711 } 712 713 // factorizations 714 typedef Kokkos::TeamPolicy<>::member_type team_member; 715 // 716 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 717 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 718 // 719 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 720 { 721 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 722 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 723 IS isrow = b->row,isicol = b->icol; 724 PetscErrorCode ierr; 725 const PetscInt *r_h,*ic_h; 726 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data(); 727 const PetscScalar *aa_d = aijkok->a_d.data(); 728 PetscScalar *ba_d = baijkok->a_d.data(); 729 PetscBool row_identity,col_identity; 730 PetscInt nc, Nf, nVec=32; // should be a parameter 731 PetscContainer container; 732 733 PetscFunctionBegin; 734 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 735 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 736 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 737 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 738 if (container) { 739 PetscInt *pNf=NULL, nv; 740 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 741 Nf = (*pNf)%1000; 742 nv = (*pNf)/1000; 743 if (nv>0) nVec = nv; 744 } else Nf = 1; 745 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 746 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 747 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 748 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 749 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 750 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 751 #endif 752 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 753 { 754 #define KOKKOS_SHARED_LEVEL 1 755 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 756 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 757 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 758 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 759 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 760 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 761 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 762 size_t flops_h = 0.0; 763 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 764 Kokkos::View<size_t> d_flops_k ("flops"); 765 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 766 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 767 Kokkos::deep_copy (d_flops_k, h_flops_k); 768 Kokkos::deep_copy (d_r_k, h_r_k); 769 Kokkos::deep_copy (d_ic_k, h_ic_k); 770 // Fill A --> fact 771 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 772 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda 773 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 774 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 775 // zero rows of B 776 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 777 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 778 PetscScalar *baL = ba_d + bi_d[rowb]; 779 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 780 /* zero (unfactored row) */ 781 for (int j=0;j<nzbL;j++) baL[j] = 0; 782 for (int j=0;j<nzbU;j++) baU[j] = 0; 783 }); 784 // copy A into B 785 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 786 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 787 const PetscScalar *av = aa_d + ai_d[rowa]; 788 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 789 /* load in initial (unfactored row) */ 790 for (int j=0;j<nza;j++) { 791 PetscInt colb = ic[ajtmp[j]]; 792 PetscScalar vala = av[j]; 793 if (colb == rowb) { 794 *(ba_d + bdiag_d[rowb]) = vala; 795 } else { 796 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 797 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 798 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 799 for (int j=0; j<nz ; j++) { 800 if (pbj[j] == colb) { 801 pba[j] = vala; 802 set++; 803 break; 804 } 805 } 806 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 807 } 808 } 809 }); 810 }); 811 Kokkos::fence(); 812 813 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 814 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 815 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 816 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 817 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda 818 const PetscInt start = field*nloc, end = start + nloc; 819 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 820 // A22 panel update for each row A(1,:) and col A(:,1) 821 for (int ii=start; ii<end-1; ii++) { 822 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 823 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 824 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 825 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 826 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 827 PetscInt kIdx = j*Ni + field_block_idx; 828 if (kIdx >= nzUi) /* void */ ; 829 else { 830 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 831 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 832 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 833 size_t st_idx; 834 // find and do L(k,i) = A(:k,i) / A(i,i) 835 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 836 // get column, there has got to be a better way 837 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 838 //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 839 if (pjL[j] == ii) { 840 PetscScalar *pLki = ba_d + bi_d[myk] + j; 841 idx = j; // output 842 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 843 } 844 }, st_idx); 845 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 846 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 847 else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 848 // U(i+1,:end) 849 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 850 PetscScalar Uij = baUi[uiIdx]; 851 PetscInt col = bjUi[uiIdx]; 852 if (col==myk) { 853 // A_kk = A_kk - L_ki * U_ij(k) 854 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 855 *Akkv = *Akkv - L_ki() * Uij; // UiK 856 } else { 857 PetscScalar *start, *end, *pAkjv=NULL; 858 PetscInt high, low; 859 const PetscInt *startj; 860 if (col<myk) { // L 861 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 862 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 863 start = pLki+1; // start at pLki+1, A22(myk,1) 864 startj= bj_d + bi_d[myk] + idx; 865 end = ba_d + bi_d[myk+1]; 866 } else { 867 PetscInt idx = bdiag_d[myk+1]+1; 868 start = ba_d + idx; 869 startj= bj_d + idx; 870 end = ba_d + bdiag_d[myk]; 871 } 872 // search for 'col', use bisection search - TODO 873 low = 0; 874 high = (PetscInt)(end-start); 875 while (high-low > 5) { 876 int t = (low+high)/2; 877 if (startj[t] > col) high = t; 878 else low = t; 879 } 880 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 881 if (startj[pAkjv-start] == col) break; 882 } 883 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 884 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 885 } 886 }); 887 } 888 } 889 }); 890 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 891 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 892 } /* endof for (i=0; i<n; i++) { */ 893 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 894 }); 895 Kokkos::fence(); 896 Kokkos::deep_copy (h_flops_k, d_flops_k); 897 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 898 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 899 #elif defined(PETSC_USE_LOG) 900 ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 901 #endif 902 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 903 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 904 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 905 /* Invert diagonal for simpler triangular solves */ 906 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 907 int i = start + outer_index*Ni + lg_rank%Ni; 908 if (i < end) { 909 PetscScalar *pv = ba_d + bdiag_d[i]; 910 *pv = 1.0/(*pv); 911 } 912 }); 913 }); 914 } 915 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 916 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 917 #endif 918 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 919 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 920 921 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 922 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 923 if (b->inode.size) { 924 B->ops->solve = MatSolve_SeqAIJ_Inode; 925 } else if (row_identity && col_identity) { 926 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 927 } else { 928 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 929 } 930 B->offloadmask = PETSC_OFFLOAD_GPU; 931 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 932 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 933 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 934 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 935 B->ops->matsolve = MatMatSolve_SeqAIJ; 936 B->assembled = PETSC_TRUE; 937 B->preallocated = PETSC_TRUE; 938 939 PetscFunctionReturn(0); 940 } 941 942 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 943 { 944 PetscErrorCode ierr; 945 946 PetscFunctionBegin; 947 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 948 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS; 949 PetscFunctionReturn(0); 950 } 951 952 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 953 { 954 PetscErrorCode ierr; 955 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 956 const PetscInt nrows = A->rmap->n; 957 958 PetscFunctionBegin; 959 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 960 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 961 // move B data into Kokkos 962 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 963 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 964 { 965 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 966 if (!baijkok->diag_d) { 967 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 968 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_diag)); 969 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 970 } 971 } 972 PetscFunctionReturn(0); 973 } 974 975 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*); 976 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat,MatFactorType,Mat*); 977 978 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 979 { 980 PetscErrorCode ierr; 981 982 PetscFunctionBegin; 983 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr); 984 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 985 PetscFunctionReturn(0); 986 } 987 988 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type) 989 { 990 PetscFunctionBegin; 991 *type = MATSOLVERKOKKOS; 992 PetscFunctionReturn(0); 993 } 994 995 // use -pc_factor_mat_solver_type kokkosdevice 996 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 997 { 998 PetscFunctionBegin; 999 *type = MATSOLVERKOKKOSDEVICE; 1000 PetscFunctionReturn(0); 1001 } 1002 1003 /*MC 1004 MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1005 on a single GPU of type, seqaijkokkos, aijkokkos. 1006 1007 Level: beginner 1008 1009 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation 1010 M*/ 1011 1012 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B) 1013 { 1014 PetscErrorCode ierr; 1015 PetscInt n = A->rmap->n; 1016 1017 PetscFunctionBegin; 1018 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1019 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1020 (*B)->factortype = ftype; 1021 (*B)->useordering = PETSC_TRUE; 1022 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1023 1024 if (ftype == MAT_FACTOR_LU) { 1025 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1026 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOS; 1027 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1028 1029 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1030 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr); 1031 PetscFunctionReturn(0); 1032 } 1033 1034 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1035 { 1036 PetscErrorCode ierr; 1037 PetscInt n = A->rmap->n; 1038 1039 PetscFunctionBegin; 1040 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1041 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1042 (*B)->factortype = ftype; 1043 (*B)->useordering = PETSC_TRUE; 1044 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1045 1046 if (ftype == MAT_FACTOR_LU) { 1047 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1048 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1049 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1050 1051 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1052 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1053 PetscFunctionReturn(0); 1054 } 1055