1 2 3 /* 4 Defines the basic matrix operations for the AIJ (compressed row) 5 matrix storage format. 6 */ 7 8 #include "petscconf.h" 9 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 10 #include "petscbt.h" 11 #include "../src/vec/vec/impls/dvecimpl.h" 12 #include "petsc-private/vecimpl.h" 13 14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h" 15 16 17 #include <algorithm> 18 #include <vector> 19 #include <string> 20 21 #include "viennacl/linalg/prod.hpp" 22 23 #undef __FUNCT__ 24 #define __FUNCT__ "MatViennaCLCopyToGPU" 25 PetscErrorCode MatViennaCLCopyToGPU(Mat A) 26 { 27 28 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 29 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30 PetscInt *ii; 31 PetscErrorCode ierr; 32 33 34 PetscFunctionBegin; 35 if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0 36 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) { 37 ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 38 39 try { 40 viennaclstruct->mat = new ViennaCLAIJMatrix(); 41 if (a->compressedrow.use) { 42 ii = a->compressedrow.i; 43 44 viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz); 45 46 // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications) 47 48 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 49 } else { 50 51 // Since PetscInt is in general different from cl_uint, we have to convert: 52 viennacl::backend::mem_handle dummy; 53 54 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 55 for (PetscInt i=0; i<=A->rmap->n; ++i) 56 row_buffer.set(i, (a->i)[i]); 57 58 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 59 for (PetscInt i=0; i<a->nz; ++i) 60 col_buffer.set(i, (a->j)[i]); 61 62 viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 63 } 64 } catch(std::exception const & ex) { 65 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 66 } 67 68 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 69 70 ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 71 } 72 } 73 PetscFunctionReturn(0); 74 } 75 76 #undef __FUNCT__ 77 #define __FUNCT__ "MatViennaCLCopyFromGPU" 78 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu) 79 { 80 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 81 PetscInt m = A->rmap->n; 82 PetscErrorCode ierr; 83 84 85 PetscFunctionBegin; 86 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) { 87 try { 88 if (a->compressedrow.use) { 89 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 90 } else { 91 92 if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 93 a->nz = Agpu->nnz(); 94 a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 95 A->preallocated = PETSC_TRUE; 96 if (a->singlemalloc) { 97 if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 98 } else { 99 if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 100 if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 101 if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 102 } 103 ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 104 ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 105 106 a->singlemalloc = PETSC_TRUE; 107 108 /* Setup row lengths */ 109 if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 110 ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 111 ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 112 113 /* Copy data back from GPU */ 114 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 115 116 // copy row array 117 viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 118 (a->i)[0] = row_buffer[0]; 119 for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 120 (a->i)[i+1] = row_buffer[i+1]; 121 a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 122 } 123 124 // copy column indices 125 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 126 viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 127 for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 128 (a->j)[i] = col_buffer[i]; 129 130 // copy nonzero entries directly to destination (no conversion required) 131 viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 132 133 /* TODO: What about a->diag? */ 134 } 135 } catch(std::exception const & ex) { 136 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 137 } 138 139 /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */ 140 ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 141 ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 142 143 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 144 } else { 145 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices"); 146 } 147 PetscFunctionReturn(0); 148 } 149 150 #undef __FUNCT__ 151 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL" 152 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left) 153 { 154 PetscErrorCode ierr; 155 156 PetscFunctionBegin; 157 if (right) { 158 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 159 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 160 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 161 ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr); 162 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 163 } 164 if (left) { 165 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 166 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 167 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 168 ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr); 169 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 170 } 171 PetscFunctionReturn(0); 172 } 173 174 #undef __FUNCT__ 175 #define __FUNCT__ "MatMult_SeqAIJViennaCL" 176 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 177 { 178 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 179 PetscErrorCode ierr; 180 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 181 const ViennaCLVector *xgpu=NULL; 182 ViennaCLVector *ygpu=NULL; 183 184 PetscFunctionBegin; 185 if (A->rmap->n > 0 && A->cmap->n > 0) { 186 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 187 ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 188 try { 189 *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 190 } catch (std::exception const & ex) { 191 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 192 } 193 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 194 ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 195 ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr); 196 } 197 PetscFunctionReturn(0); 198 } 199 200 201 202 #undef __FUNCT__ 203 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL" 204 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 205 { 206 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 207 PetscErrorCode ierr; 208 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 209 const ViennaCLVector *xgpu=NULL,*ygpu=NULL; 210 ViennaCLVector *zgpu=NULL; 211 212 PetscFunctionBegin; 213 if (A->rmap->n > 0 && A->cmap->n > 0) { 214 try { 215 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 216 ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 217 ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 218 219 if (a->compressedrow.use) { 220 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 221 } else { 222 if (zz == xx || zz == yy) { //temporary required 223 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 224 *zgpu = *ygpu + temp; 225 } else { 226 *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 227 *zgpu += *ygpu; 228 } 229 } 230 231 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 232 ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 233 ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 234 235 } catch(std::exception const & ex) { 236 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 237 } 238 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 239 } 240 PetscFunctionReturn(0); 241 } 242 243 #undef __FUNCT__ 244 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL" 245 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 246 { 247 PetscErrorCode ierr; 248 249 PetscFunctionBegin; 250 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 251 ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 252 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 253 A->ops->mult = MatMult_SeqAIJViennaCL; 254 A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 255 PetscFunctionReturn(0); 256 } 257 258 /* --------------------------------------------------------------------------------*/ 259 #undef __FUNCT__ 260 #define __FUNCT__ "MatCreateSeqAIJViennaCL" 261 /*@ 262 MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 263 (the default parallel PETSc format). This matrix will ultimately be pushed down 264 to GPUs and use the ViennaCL library for calculations. For good matrix 265 assembly performance the user should preallocate the matrix storage by setting 266 the parameter nz (or the array nnz). By setting these parameters accurately, 267 performance during matrix assembly can be increased substantially. 268 269 270 Collective on MPI_Comm 271 272 Input Parameters: 273 + comm - MPI communicator, set to PETSC_COMM_SELF 274 . m - number of rows 275 . n - number of columns 276 . nz - number of nonzeros per row (same for all rows) 277 - nnz - array containing the number of nonzeros in the various rows 278 (possibly different for each row) or NULL 279 280 Output Parameter: 281 . A - the matrix 282 283 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 284 MatXXXXSetPreallocation() paradigm instead of this routine directly. 285 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 286 287 Notes: 288 If nnz is given then nz is ignored 289 290 The AIJ format (also called the Yale sparse matrix format or 291 compressed row storage), is fully compatible with standard Fortran 77 292 storage. That is, the stored row and column indices can begin at 293 either one (as in Fortran) or zero. See the users' manual for details. 294 295 Specify the preallocated storage with either nz or nnz (not both). 296 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 297 allocation. For large problems you MUST preallocate memory or you 298 will get TERRIBLE performance, see the users' manual chapter on matrices. 299 300 Level: intermediate 301 302 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 303 304 @*/ 305 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 306 { 307 PetscErrorCode ierr; 308 309 PetscFunctionBegin; 310 ierr = MatCreate(comm,A);CHKERRQ(ierr); 311 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 312 ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 313 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 314 PetscFunctionReturn(0); 315 } 316 317 318 #undef __FUNCT__ 319 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL" 320 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 321 { 322 PetscErrorCode ierr; 323 Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 324 325 PetscFunctionBegin; 326 try { 327 if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat); 328 delete viennaclcontainer; 329 A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 330 } catch(std::exception const & ex) { 331 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 332 } 333 /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 334 A->spptr = 0; 335 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 336 PetscFunctionReturn(0); 337 } 338 339 340 #undef __FUNCT__ 341 #define __FUNCT__ "MatCreate_SeqAIJViennaCL" 342 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 343 { 344 PetscErrorCode ierr; 345 Mat_SeqAIJ *aij; 346 347 PetscFunctionBegin; 348 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 349 aij = (Mat_SeqAIJ*)B->data; 350 aij->inode.use = PETSC_FALSE; 351 B->ops->mult = MatMult_SeqAIJViennaCL; 352 B->ops->multadd = MatMultAdd_SeqAIJViennaCL; 353 B->spptr = new Mat_SeqAIJViennaCL(); 354 355 ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0; 356 357 B->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 358 B->ops->destroy = MatDestroy_SeqAIJViennaCL; 359 B->ops->getvecs = MatGetVecs_SeqAIJViennaCL; 360 361 ierr = MatSetFromOptions_SeqViennaCL(B);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */ 362 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 363 364 B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 365 PetscFunctionReturn(0); 366 } 367 368 369 /*M 370 MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 371 372 A matrix type type whose data resides on GPUs. These matrices are in CSR format by 373 default. All matrix calculations are performed using the ViennaCL library. 374 375 Options Database Keys: 376 + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 377 . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 378 - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 379 380 Level: beginner 381 382 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 383 M*/ 384 385