1 2 3 /* 4 Defines the basic matrix operations for the AIJ (compressed row) 5 matrix storage format. 6 */ 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <petscbt.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 14 #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h> 15 16 17 #include <algorithm> 18 #include <vector> 19 #include <string> 20 21 #include "viennacl/linalg/prod.hpp" 22 23 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat); 24 25 #undef __FUNCT__ 26 #define __FUNCT__ "MatViennaCLCopyToGPU" 27 PetscErrorCode MatViennaCLCopyToGPU(Mat A) 28 { 29 30 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 31 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 32 PetscErrorCode ierr; 33 34 35 PetscFunctionBegin; 36 if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0 37 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) { 38 ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 39 40 try { 41 if (a->compressedrow.use) { 42 if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix(); 43 44 // Since PetscInt is different from cl_uint, we have to convert: 45 viennacl::backend::mem_handle dummy; 46 47 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1); 48 for (PetscInt i=0; i<=a->compressedrow.nrows; ++i) 49 row_buffer.set(i, (a->compressedrow.i)[i]); 50 51 viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows); 52 for (PetscInt i=0; i<a->compressedrow.nrows; ++i) 53 row_indices.set(i, (a->compressedrow.rindex)[i]); 54 55 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 56 for (PetscInt i=0; i<a->nz; ++i) 57 col_buffer.set(i, (a->j)[i]); 58 59 viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz); 60 } else { 61 if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix(); 62 63 // Since PetscInt is in general different from cl_uint, we have to convert: 64 viennacl::backend::mem_handle dummy; 65 66 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 67 for (PetscInt i=0; i<=A->rmap->n; ++i) 68 row_buffer.set(i, (a->i)[i]); 69 70 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 71 for (PetscInt i=0; i<a->nz; ++i) 72 col_buffer.set(i, (a->j)[i]); 73 74 viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 75 } 76 ViennaCLWaitForGPU(); 77 } catch(std::exception const & ex) { 78 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 79 } 80 81 // Create temporary vector for v += A*x: 82 if (viennaclstruct->tempvec) { 83 if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) { 84 delete (ViennaCLVector*)viennaclstruct->tempvec; 85 viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n); 86 } else { 87 viennaclstruct->tempvec->clear(); 88 } 89 } else { 90 viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n); 91 } 92 93 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 94 95 ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 96 } 97 } 98 PetscFunctionReturn(0); 99 } 100 101 #undef __FUNCT__ 102 #define __FUNCT__ "MatViennaCLCopyFromGPU" 103 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu) 104 { 105 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 106 PetscInt m = A->rmap->n; 107 PetscErrorCode ierr; 108 109 110 PetscFunctionBegin; 111 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) { 112 try { 113 if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 114 else { 115 116 if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 117 a->nz = Agpu->nnz(); 118 a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 119 A->preallocated = PETSC_TRUE; 120 if (a->singlemalloc) { 121 if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 122 } else { 123 if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 124 if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 125 if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 126 } 127 ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr); 128 ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 129 130 a->singlemalloc = PETSC_TRUE; 131 132 /* Setup row lengths */ 133 if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 134 ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr); 135 ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 136 137 /* Copy data back from GPU */ 138 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 139 140 // copy row array 141 viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 142 (a->i)[0] = row_buffer[0]; 143 for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 144 (a->i)[i+1] = row_buffer[i+1]; 145 a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 146 } 147 148 // copy column indices 149 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 150 viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 151 for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 152 (a->j)[i] = col_buffer[i]; 153 154 // copy nonzero entries directly to destination (no conversion required) 155 viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 156 157 ViennaCLWaitForGPU(); 158 /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */ 159 } 160 } catch(std::exception const & ex) { 161 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 162 } 163 164 /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */ 165 ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 166 ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 167 168 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 169 } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices"); 170 PetscFunctionReturn(0); 171 } 172 173 #undef __FUNCT__ 174 #define __FUNCT__ "MatCreateVecs_SeqAIJViennaCL" 175 PetscErrorCode MatCreateVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left) 176 { 177 PetscErrorCode ierr; 178 PetscInt rbs,cbs; 179 180 PetscFunctionBegin; 181 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 182 if (right) { 183 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 184 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 185 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 186 ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr); 187 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 188 } 189 if (left) { 190 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 191 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 192 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 193 ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr); 194 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 195 } 196 PetscFunctionReturn(0); 197 } 198 199 #undef __FUNCT__ 200 #define __FUNCT__ "MatMult_SeqAIJViennaCL" 201 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 202 { 203 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 204 PetscErrorCode ierr; 205 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 206 const ViennaCLVector *xgpu=NULL; 207 ViennaCLVector *ygpu=NULL; 208 209 PetscFunctionBegin; 210 if (A->rmap->n > 0 && A->cmap->n > 0) { 211 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 212 ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 213 try { 214 *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 215 ViennaCLWaitForGPU(); 216 } catch (std::exception const & ex) { 217 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 218 } 219 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 220 ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 221 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 222 } 223 PetscFunctionReturn(0); 224 } 225 226 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL" 230 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 231 { 232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 233 PetscErrorCode ierr; 234 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 235 const ViennaCLVector *xgpu=NULL,*ygpu=NULL; 236 ViennaCLVector *zgpu=NULL; 237 238 PetscFunctionBegin; 239 if (A->rmap->n > 0 && A->cmap->n > 0) { 240 try { 241 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 242 ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 243 ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 244 245 if (a->compressedrow.use) { 246 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu); 247 *zgpu = *ygpu + temp; 248 ViennaCLWaitForGPU(); 249 } else { 250 if (zz == xx || zz == yy) { //temporary required 251 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 252 *zgpu = *ygpu; 253 *zgpu += temp; 254 ViennaCLWaitForGPU(); 255 } else { 256 *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 257 *zgpu = *ygpu + *viennaclstruct->tempvec; 258 ViennaCLWaitForGPU(); 259 } 260 } 261 262 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 263 ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 264 ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 265 266 } catch(std::exception const & ex) { 267 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 268 } 269 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 270 } 271 PetscFunctionReturn(0); 272 } 273 274 #undef __FUNCT__ 275 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL" 276 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 277 { 278 PetscErrorCode ierr; 279 280 PetscFunctionBegin; 281 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 282 ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 283 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 284 A->ops->mult = MatMult_SeqAIJViennaCL; 285 A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 286 PetscFunctionReturn(0); 287 } 288 289 /* --------------------------------------------------------------------------------*/ 290 #undef __FUNCT__ 291 #define __FUNCT__ "MatCreateSeqAIJViennaCL" 292 /*@ 293 MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 294 (the default parallel PETSc format). This matrix will ultimately be pushed down 295 to GPUs and use the ViennaCL library for calculations. For good matrix 296 assembly performance the user should preallocate the matrix storage by setting 297 the parameter nz (or the array nnz). By setting these parameters accurately, 298 performance during matrix assembly can be increased substantially. 299 300 301 Collective on MPI_Comm 302 303 Input Parameters: 304 + comm - MPI communicator, set to PETSC_COMM_SELF 305 . m - number of rows 306 . n - number of columns 307 . nz - number of nonzeros per row (same for all rows) 308 - nnz - array containing the number of nonzeros in the various rows 309 (possibly different for each row) or NULL 310 311 Output Parameter: 312 . A - the matrix 313 314 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 315 MatXXXXSetPreallocation() paradigm instead of this routine directly. 316 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 317 318 Notes: 319 If nnz is given then nz is ignored 320 321 The AIJ format (also called the Yale sparse matrix format or 322 compressed row storage), is fully compatible with standard Fortran 77 323 storage. That is, the stored row and column indices can begin at 324 either one (as in Fortran) or zero. See the users' manual for details. 325 326 Specify the preallocated storage with either nz or nnz (not both). 327 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 328 allocation. For large problems you MUST preallocate memory or you 329 will get TERRIBLE performance, see the users' manual chapter on matrices. 330 331 Level: intermediate 332 333 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 334 335 @*/ 336 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 337 { 338 PetscErrorCode ierr; 339 340 PetscFunctionBegin; 341 ierr = MatCreate(comm,A);CHKERRQ(ierr); 342 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 343 ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 344 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 348 349 #undef __FUNCT__ 350 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL" 351 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 352 { 353 PetscErrorCode ierr; 354 Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 355 356 PetscFunctionBegin; 357 try { 358 if (viennaclcontainer) { 359 delete viennaclcontainer->tempvec; 360 delete viennaclcontainer->mat; 361 delete viennaclcontainer->compressed_mat; 362 delete viennaclcontainer; 363 } 364 A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 365 } catch(std::exception const & ex) { 366 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 367 } 368 369 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL);CHKERRQ(ierr); 370 371 /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 372 A->spptr = 0; 373 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 374 PetscFunctionReturn(0); 375 } 376 377 378 #undef __FUNCT__ 379 #define __FUNCT__ "MatCreate_SeqAIJViennaCL" 380 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 381 { 382 PetscErrorCode ierr; 383 384 PetscFunctionBegin; 385 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 386 ierr = MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B); 387 PetscFunctionReturn(0); 388 } 389 390 #undef __FUNCT__ 391 #define __FUNCT__ "MatConvert_SeqAIJ_SeqAIJViennaCL" 392 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat) 393 { 394 PetscErrorCode ierr; 395 Mat B; 396 Mat_SeqAIJ *aij; 397 398 PetscFunctionBegin; 399 400 if (reuse == MAT_REUSE_MATRIX) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead"); 401 402 if (reuse == MAT_INITIAL_MATRIX) { 403 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 404 } 405 406 B = *newmat; 407 408 aij = (Mat_SeqAIJ*)B->data; 409 aij->inode.use = PETSC_FALSE; 410 411 B->ops->mult = MatMult_SeqAIJViennaCL; 412 B->ops->multadd = MatMultAdd_SeqAIJViennaCL; 413 B->spptr = new Mat_SeqAIJViennaCL(); 414 415 ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec = NULL; 416 ((Mat_SeqAIJViennaCL*)B->spptr)->mat = NULL; 417 ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL; 418 419 B->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 420 B->ops->destroy = MatDestroy_SeqAIJViennaCL; 421 B->ops->getvecs = MatCreateVecs_SeqAIJViennaCL; 422 423 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 424 425 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL);CHKERRQ(ierr); 426 427 B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 428 429 /* If the source matrix is already assembled, copy the destination matrix to the GPU */ 430 if (B->assembled) { 431 ierr = MatViennaCLCopyToGPU(B);CHKERRQ(ierr); 432 } 433 434 PetscFunctionReturn(0); 435 } 436 437 438 /*M 439 MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 440 441 A matrix type type whose data resides on GPUs. These matrices are in CSR format by 442 default. All matrix calculations are performed using the ViennaCL library. 443 444 Options Database Keys: 445 + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 446 . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 447 - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 448 449 Level: beginner 450 451 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 452 M*/ 453 454