1 2 3 /* 4 Defines the basic matrix operations for the AIJ (compressed row) 5 matrix storage format. 6 */ 7 8 #include "petscconf.h" 9 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 10 #include "petscbt.h" 11 #include "../src/vec/vec/impls/dvecimpl.h" 12 #include "petsc-private/vecimpl.h" 13 14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h" 15 16 17 #include <algorithm> 18 #include <vector> 19 #include <string> 20 21 #include "viennacl/linalg/prod.hpp" 22 23 #undef __FUNCT__ 24 #define __FUNCT__ "MatViennaCLCopyToGPU" 25 PetscErrorCode MatViennaCLCopyToGPU(Mat A) 26 { 27 28 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 29 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30 PetscInt *ii; 31 PetscErrorCode ierr; 32 33 34 PetscFunctionBegin; 35 if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0 36 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) { 37 ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 38 39 try { 40 viennaclstruct->mat = new ViennaCLAIJMatrix(); 41 if (a->compressedrow.use) { 42 ii = a->compressedrow.i; 43 44 viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz); 45 46 // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications) 47 48 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 49 } else { 50 51 // Since PetscInt is in general different from cl_uint, we have to convert: 52 viennacl::backend::mem_handle dummy; 53 54 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 55 for (PetscInt i=0; i<=A->rmap->n; ++i) 56 row_buffer.set(i, (a->i)[i]); 57 58 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 59 for (PetscInt i=0; i<a->nz; ++i) 60 col_buffer.set(i, (a->j)[i]); 61 62 viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 63 } 64 } catch(std::exception const & ex) { 65 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 66 } 67 68 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 69 70 ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 71 } 72 } 73 PetscFunctionReturn(0); 74 } 75 76 #undef __FUNCT__ 77 #define __FUNCT__ "MatViennaCLCopyFromGPU" 78 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, ViennaCLAIJMatrix *Agpu) 79 { 80 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 81 PetscInt m = A->rmap->n; 82 PetscErrorCode ierr; 83 84 85 PetscFunctionBegin; 86 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) { 87 try { 88 if (a->compressedrow.use) { 89 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 90 } else { 91 92 if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 93 a->nz = Agpu->nnz(); 94 a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 95 A->preallocated = PETSC_TRUE; 96 if (a->singlemalloc) { 97 if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 98 } else { 99 if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 100 if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 101 if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 102 } 103 ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 104 ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 105 106 a->singlemalloc = PETSC_TRUE; 107 108 /* Setup row lengths */ 109 if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 110 ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 111 ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 112 113 /* Copy data back from GPU */ 114 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 115 116 // copy row array 117 viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 118 (a->i)[0] = row_buffer[0]; 119 for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 120 (a->i)[i+1] = row_buffer[i+1]; 121 a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 122 } 123 124 // copy column indices 125 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 126 viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 127 for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 128 (a->j)[i] = col_buffer[i]; 129 130 // copy nonzero entries directly to destination (no conversion required) 131 viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 132 133 /* TODO: What about a->diag? */ 134 } 135 } catch(std::exception const & ex) { 136 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 137 } 138 139 /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */ 140 ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 141 ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 142 143 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 144 } else { 145 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices"); 146 } 147 PetscFunctionReturn(0); 148 } 149 150 #undef __FUNCT__ 151 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL" 152 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left) 153 { 154 PetscErrorCode ierr; 155 156 PetscFunctionBegin; 157 if (right) { 158 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 159 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 160 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 161 ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr); 162 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 163 } 164 if (left) { 165 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 166 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 167 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 168 ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr); 169 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 170 } 171 PetscFunctionReturn(0); 172 } 173 174 #undef __FUNCT__ 175 #define __FUNCT__ "MatMult_SeqAIJViennaCL" 176 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 177 { 178 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 179 PetscErrorCode ierr; 180 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 181 ViennaCLVector *xgpu=NULL,*ygpu=NULL; 182 183 PetscFunctionBegin; 184 if (A->rmap->n > 0 && A->cmap->n > 0) { 185 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 186 ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 187 try { 188 *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 189 } catch (std::exception const & ex) { 190 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 191 } 192 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 193 ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 194 ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr); 195 } 196 PetscFunctionReturn(0); 197 } 198 199 200 201 #undef __FUNCT__ 202 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL" 203 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 204 { 205 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 206 PetscErrorCode ierr; 207 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 208 ViennaCLVector *xgpu=NULL,*ygpu=NULL,*zgpu=NULL; 209 210 PetscFunctionBegin; 211 if (A->rmap->n > 0 && A->cmap->n > 0) { 212 try { 213 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 214 ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 215 ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 216 217 if (a->compressedrow.use) { 218 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 219 } else { 220 if (zz == xx || zz == yy) { //temporary required 221 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 222 *zgpu = *ygpu + temp; 223 } else { 224 *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 225 *zgpu += *ygpu; 226 } 227 } 228 229 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 230 ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 231 ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 232 233 } catch(std::exception const & ex) { 234 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 235 } 236 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 237 } 238 PetscFunctionReturn(0); 239 } 240 241 #undef __FUNCT__ 242 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL" 243 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 244 { 245 PetscErrorCode ierr; 246 247 PetscFunctionBegin; 248 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 249 ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 250 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 251 A->ops->mult = MatMult_SeqAIJViennaCL; 252 A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 253 PetscFunctionReturn(0); 254 } 255 256 /* --------------------------------------------------------------------------------*/ 257 #undef __FUNCT__ 258 #define __FUNCT__ "MatCreateSeqAIJViennaCL" 259 /*@ 260 MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 261 (the default parallel PETSc format). This matrix will ultimately pushed down 262 to GPUs and use the ViennaCL library for calculations. For good matrix 263 assembly performance the user should preallocate the matrix storage by setting 264 the parameter nz (or the array nnz). By setting these parameters accurately, 265 performance during matrix assembly can be increased substantially. 266 267 268 Collective on MPI_Comm 269 270 Input Parameters: 271 + comm - MPI communicator, set to PETSC_COMM_SELF 272 . m - number of rows 273 . n - number of columns 274 . nz - number of nonzeros per row (same for all rows) 275 - nnz - array containing the number of nonzeros in the various rows 276 (possibly different for each row) or NULL 277 278 Output Parameter: 279 . A - the matrix 280 281 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 282 MatXXXXSetPreallocation() paradigm instead of this routine directly. 283 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 284 285 Notes: 286 If nnz is given then nz is ignored 287 288 The AIJ format (also called the Yale sparse matrix format or 289 compressed row storage), is fully compatible with standard Fortran 77 290 storage. That is, the stored row and column indices can begin at 291 either one (as in Fortran) or zero. See the users' manual for details. 292 293 Specify the preallocated storage with either nz or nnz (not both). 294 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 295 allocation. For large problems you MUST preallocate memory or you 296 will get TERRIBLE performance, see the users' manual chapter on matrices. 297 298 By default, this format uses inodes (identical nodes) when possible, to 299 improve numerical efficiency of matrix-vector products and solves. We 300 search for consecutive rows with the same nonzero structure, thereby 301 reusing matrix information to achieve increased efficiency. 302 303 Level: intermediate 304 305 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 306 307 @*/ 308 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 309 { 310 PetscErrorCode ierr; 311 312 PetscFunctionBegin; 313 ierr = MatCreate(comm,A);CHKERRQ(ierr); 314 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 315 ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 316 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 317 PetscFunctionReturn(0); 318 } 319 320 321 #undef __FUNCT__ 322 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL" 323 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 324 { 325 PetscErrorCode ierr; 326 Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 327 328 PetscFunctionBegin; 329 try { 330 if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat); 331 delete viennaclcontainer; 332 A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 333 } catch(std::exception const & ex) { 334 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 335 } 336 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 337 A->spptr = 0; 338 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 339 PetscFunctionReturn(0); 340 } 341 342 343 #undef __FUNCT__ 344 #define __FUNCT__ "MatCreate_SeqAIJViennaCL" 345 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 346 { 347 PetscErrorCode ierr; 348 Mat_SeqAIJ *aij; 349 350 PetscFunctionBegin; 351 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 352 aij = (Mat_SeqAIJ*)B->data; 353 aij->inode.use = PETSC_FALSE; 354 B->ops->mult = MatMult_SeqAIJViennaCL; 355 B->ops->multadd = MatMultAdd_SeqAIJViennaCL; 356 B->spptr = new Mat_SeqAIJViennaCL(); 357 358 ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0; 359 360 B->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 361 B->ops->destroy = MatDestroy_SeqAIJViennaCL; 362 B->ops->getvecs = MatGetVecs_SeqAIJViennaCL; 363 364 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 365 366 B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 367 PetscFunctionReturn(0); 368 } 369 370 371 /*M 372 MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 373 374 A matrix type type whose data resides on GPUs. These matrices are in CSR format by 375 default. All matrix calculations are performed using the ViennaCL library. 376 377 Options Database Keys: 378 + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 379 . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 380 - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 381 382 Level: beginner 383 384 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 385 M*/ 386 387