1 2 3 /* 4 Defines the basic matrix operations for the AIJ (compressed row) 5 matrix storage format. 6 */ 7 8 #include "petscconf.h" 9 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 10 #include "petscbt.h" 11 #include "../src/vec/vec/impls/dvecimpl.h" 12 #include "petsc-private/vecimpl.h" 13 14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h" 15 16 17 #include <algorithm> 18 #include <vector> 19 #include <string> 20 21 #include "viennacl/linalg/prod.hpp" 22 23 #undef __FUNCT__ 24 #define __FUNCT__ "MatViennaCLCopyToGPU" 25 PetscErrorCode MatViennaCLCopyToGPU(Mat A) 26 { 27 28 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 29 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30 PetscInt *ii; 31 PetscErrorCode ierr; 32 33 34 PetscFunctionBegin; 35 if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0 36 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) { 37 ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 38 39 try { 40 viennaclstruct->mat = new ViennaCLAIJMatrix(); 41 if (a->compressedrow.use) { 42 ii = a->compressedrow.i; 43 44 viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz); 45 46 // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications) 47 // Cannot be reasonably supported in ViennaCL 1.4.x (custom kernels required), hence postponing until there is support for compressed CSR 48 49 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 50 } else { 51 52 // Since PetscInt is in general different from cl_uint, we have to convert: 53 viennacl::backend::mem_handle dummy; 54 55 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 56 for (PetscInt i=0; i<=A->rmap->n; ++i) 57 row_buffer.set(i, (a->i)[i]); 58 59 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 60 for (PetscInt i=0; i<a->nz; ++i) 61 col_buffer.set(i, (a->j)[i]); 62 63 viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 64 } 65 } catch(std::exception const & ex) { 66 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 67 } 68 69 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 70 71 ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 72 } 73 } 74 PetscFunctionReturn(0); 75 } 76 77 #undef __FUNCT__ 78 #define __FUNCT__ "MatViennaCLCopyFromGPU" 79 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu) 80 { 81 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 82 PetscInt m = A->rmap->n; 83 PetscErrorCode ierr; 84 85 86 PetscFunctionBegin; 87 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) { 88 try { 89 if (a->compressedrow.use) { 90 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 91 } else { 92 93 if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 94 a->nz = Agpu->nnz(); 95 a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 96 A->preallocated = PETSC_TRUE; 97 if (a->singlemalloc) { 98 if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 99 } else { 100 if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 101 if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 102 if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 103 } 104 ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 105 ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 106 107 a->singlemalloc = PETSC_TRUE; 108 109 /* Setup row lengths */ 110 if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 111 ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 112 ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 113 114 /* Copy data back from GPU */ 115 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 116 117 // copy row array 118 viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 119 (a->i)[0] = row_buffer[0]; 120 for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 121 (a->i)[i+1] = row_buffer[i+1]; 122 a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 123 } 124 125 // copy column indices 126 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 127 viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 128 for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 129 (a->j)[i] = col_buffer[i]; 130 131 // copy nonzero entries directly to destination (no conversion required) 132 viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 133 134 /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */ 135 } 136 } catch(std::exception const & ex) { 137 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 138 } 139 140 /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */ 141 ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 142 ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 143 144 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 145 } else { 146 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices"); 147 } 148 PetscFunctionReturn(0); 149 } 150 151 #undef __FUNCT__ 152 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL" 153 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left) 154 { 155 PetscErrorCode ierr; 156 157 PetscFunctionBegin; 158 if (right) { 159 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 160 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 161 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 162 ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr); 163 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 164 } 165 if (left) { 166 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 167 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 168 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 169 ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr); 170 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 171 } 172 PetscFunctionReturn(0); 173 } 174 175 #undef __FUNCT__ 176 #define __FUNCT__ "MatMult_SeqAIJViennaCL" 177 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 178 { 179 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 180 PetscErrorCode ierr; 181 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 182 const ViennaCLVector *xgpu=NULL; 183 ViennaCLVector *ygpu=NULL; 184 185 PetscFunctionBegin; 186 if (A->rmap->n > 0 && A->cmap->n > 0) { 187 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 188 ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 189 try { 190 *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 191 } catch (std::exception const & ex) { 192 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 193 } 194 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 195 ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 196 ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr); 197 } 198 PetscFunctionReturn(0); 199 } 200 201 202 203 #undef __FUNCT__ 204 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL" 205 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 206 { 207 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 208 PetscErrorCode ierr; 209 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 210 const ViennaCLVector *xgpu=NULL,*ygpu=NULL; 211 ViennaCLVector *zgpu=NULL; 212 213 PetscFunctionBegin; 214 if (A->rmap->n > 0 && A->cmap->n > 0) { 215 try { 216 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 217 ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 218 ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 219 220 if (a->compressedrow.use) { 221 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 222 } else { 223 if (zz == xx || zz == yy) { //temporary required 224 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 225 *zgpu = *ygpu + temp; 226 } else { 227 *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 228 *zgpu += *ygpu; 229 } 230 } 231 232 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 233 ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 234 ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 235 236 } catch(std::exception const & ex) { 237 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 238 } 239 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 240 } 241 PetscFunctionReturn(0); 242 } 243 244 #undef __FUNCT__ 245 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL" 246 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 247 { 248 PetscErrorCode ierr; 249 250 PetscFunctionBegin; 251 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 252 ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 253 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 254 A->ops->mult = MatMult_SeqAIJViennaCL; 255 A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 256 PetscFunctionReturn(0); 257 } 258 259 /* --------------------------------------------------------------------------------*/ 260 #undef __FUNCT__ 261 #define __FUNCT__ "MatCreateSeqAIJViennaCL" 262 /*@ 263 MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 264 (the default parallel PETSc format). This matrix will ultimately be pushed down 265 to GPUs and use the ViennaCL library for calculations. For good matrix 266 assembly performance the user should preallocate the matrix storage by setting 267 the parameter nz (or the array nnz). By setting these parameters accurately, 268 performance during matrix assembly can be increased substantially. 269 270 271 Collective on MPI_Comm 272 273 Input Parameters: 274 + comm - MPI communicator, set to PETSC_COMM_SELF 275 . m - number of rows 276 . n - number of columns 277 . nz - number of nonzeros per row (same for all rows) 278 - nnz - array containing the number of nonzeros in the various rows 279 (possibly different for each row) or NULL 280 281 Output Parameter: 282 . A - the matrix 283 284 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 285 MatXXXXSetPreallocation() paradigm instead of this routine directly. 286 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 287 288 Notes: 289 If nnz is given then nz is ignored 290 291 The AIJ format (also called the Yale sparse matrix format or 292 compressed row storage), is fully compatible with standard Fortran 77 293 storage. That is, the stored row and column indices can begin at 294 either one (as in Fortran) or zero. See the users' manual for details. 295 296 Specify the preallocated storage with either nz or nnz (not both). 297 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 298 allocation. For large problems you MUST preallocate memory or you 299 will get TERRIBLE performance, see the users' manual chapter on matrices. 300 301 Level: intermediate 302 303 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 304 305 @*/ 306 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 307 { 308 PetscErrorCode ierr; 309 310 PetscFunctionBegin; 311 ierr = MatCreate(comm,A);CHKERRQ(ierr); 312 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 313 ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 314 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 315 PetscFunctionReturn(0); 316 } 317 318 319 #undef __FUNCT__ 320 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL" 321 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 322 { 323 PetscErrorCode ierr; 324 Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 325 326 PetscFunctionBegin; 327 try { 328 if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat); 329 delete viennaclcontainer; 330 A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 331 } catch(std::exception const & ex) { 332 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 333 } 334 /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 335 A->spptr = 0; 336 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 337 PetscFunctionReturn(0); 338 } 339 340 341 #undef __FUNCT__ 342 #define __FUNCT__ "MatCreate_SeqAIJViennaCL" 343 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 344 { 345 PetscErrorCode ierr; 346 Mat_SeqAIJ *aij; 347 348 PetscFunctionBegin; 349 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 350 aij = (Mat_SeqAIJ*)B->data; 351 aij->inode.use = PETSC_FALSE; 352 B->ops->mult = MatMult_SeqAIJViennaCL; 353 B->ops->multadd = MatMultAdd_SeqAIJViennaCL; 354 B->spptr = new Mat_SeqAIJViennaCL(); 355 356 ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0; 357 358 B->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 359 B->ops->destroy = MatDestroy_SeqAIJViennaCL; 360 B->ops->getvecs = MatGetVecs_SeqAIJViennaCL; 361 362 ierr = MatSetFromOptions_SeqViennaCL(B);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */ 363 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 364 365 B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 366 PetscFunctionReturn(0); 367 } 368 369 370 /*M 371 MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 372 373 A matrix type type whose data resides on GPUs. These matrices are in CSR format by 374 default. All matrix calculations are performed using the ViennaCL library. 375 376 Options Database Keys: 377 + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 378 . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 379 - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 380 381 Level: beginner 382 383 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 384 M*/ 385 386