1 2 3 /* 4 Defines the basic matrix operations for the AIJ (compressed row) 5 matrix storage format. 6 */ 7 8 #include "petscconf.h" 9 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 10 #include "petscbt.h" 11 #include "../src/vec/vec/impls/dvecimpl.h" 12 #include "petsc-private/vecimpl.h" 13 14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h" 15 16 17 #include <algorithm> 18 #include <vector> 19 #include <string> 20 21 #include "viennacl/linalg/prod.hpp" 22 23 #undef __FUNCT__ 24 #define __FUNCT__ "MatViennaCLCopyToGPU" 25 PetscErrorCode MatViennaCLCopyToGPU(Mat A) 26 { 27 28 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 29 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30 PetscInt *ii; 31 PetscErrorCode ierr; 32 33 34 PetscFunctionBegin; 35 if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0 36 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) { 37 ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 38 39 try { 40 ierr = PetscObjectSetFromOptions_ViennaCL((PetscObject)A);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */ 41 viennaclstruct->mat = new ViennaCLAIJMatrix(); 42 if (a->compressedrow.use) { 43 ii = a->compressedrow.i; 44 45 viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz); 46 47 // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications) 48 // Cannot be reasonably supported in ViennaCL 1.4.x (custom kernels required), hence postponing until there is support for compressed CSR 49 50 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 51 } else { 52 53 // Since PetscInt is in general different from cl_uint, we have to convert: 54 viennacl::backend::mem_handle dummy; 55 56 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 57 for (PetscInt i=0; i<=A->rmap->n; ++i) 58 row_buffer.set(i, (a->i)[i]); 59 60 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 61 for (PetscInt i=0; i<a->nz; ++i) 62 col_buffer.set(i, (a->j)[i]); 63 64 viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 65 } 66 ViennaCLWaitForGPU(); 67 } catch(std::exception const & ex) { 68 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 69 } 70 71 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 72 73 ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 74 } 75 } 76 PetscFunctionReturn(0); 77 } 78 79 #undef __FUNCT__ 80 #define __FUNCT__ "MatViennaCLCopyFromGPU" 81 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu) 82 { 83 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 84 PetscInt m = A->rmap->n; 85 PetscErrorCode ierr; 86 87 88 PetscFunctionBegin; 89 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) { 90 try { 91 if (a->compressedrow.use) { 92 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 93 } else { 94 95 if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 96 a->nz = Agpu->nnz(); 97 a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 98 A->preallocated = PETSC_TRUE; 99 if (a->singlemalloc) { 100 if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 101 } else { 102 if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 103 if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 104 if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 105 } 106 ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 107 ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 108 109 a->singlemalloc = PETSC_TRUE; 110 111 /* Setup row lengths */ 112 if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 113 ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 114 ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 115 116 /* Copy data back from GPU */ 117 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 118 119 // copy row array 120 viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 121 (a->i)[0] = row_buffer[0]; 122 for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 123 (a->i)[i+1] = row_buffer[i+1]; 124 a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 125 } 126 127 // copy column indices 128 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 129 viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 130 for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 131 (a->j)[i] = col_buffer[i]; 132 133 // copy nonzero entries directly to destination (no conversion required) 134 viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 135 136 ViennaCLWaitForGPU(); 137 /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */ 138 } 139 } catch(std::exception const & ex) { 140 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 141 } 142 143 /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */ 144 ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 145 ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 146 147 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 148 } else { 149 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices"); 150 } 151 PetscFunctionReturn(0); 152 } 153 154 #undef __FUNCT__ 155 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL" 156 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left) 157 { 158 PetscErrorCode ierr; 159 160 PetscFunctionBegin; 161 if (right) { 162 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 163 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 164 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 165 ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr); 166 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 167 } 168 if (left) { 169 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 170 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 171 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 172 ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr); 173 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 174 } 175 PetscFunctionReturn(0); 176 } 177 178 #undef __FUNCT__ 179 #define __FUNCT__ "MatMult_SeqAIJViennaCL" 180 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 181 { 182 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 183 PetscErrorCode ierr; 184 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 185 const ViennaCLVector *xgpu=NULL; 186 ViennaCLVector *ygpu=NULL; 187 188 PetscFunctionBegin; 189 if (A->rmap->n > 0 && A->cmap->n > 0) { 190 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 191 ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 192 try { 193 *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 194 ViennaCLWaitForGPU(); 195 } catch (std::exception const & ex) { 196 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 197 } 198 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 199 ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 200 ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr); 201 } 202 PetscFunctionReturn(0); 203 } 204 205 206 207 #undef __FUNCT__ 208 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL" 209 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 210 { 211 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 212 PetscErrorCode ierr; 213 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 214 const ViennaCLVector *xgpu=NULL,*ygpu=NULL; 215 ViennaCLVector *zgpu=NULL; 216 217 PetscFunctionBegin; 218 if (A->rmap->n > 0 && A->cmap->n > 0) { 219 try { 220 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 221 ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 222 ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 223 224 if (a->compressedrow.use) { 225 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 226 } else { 227 if (zz == xx || zz == yy) { //temporary required 228 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 229 *zgpu = *ygpu + temp; 230 ViennaCLWaitForGPU(); 231 } else { 232 *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 233 *zgpu += *ygpu; 234 ViennaCLWaitForGPU(); 235 } 236 } 237 238 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 239 ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 240 ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 241 242 } catch(std::exception const & ex) { 243 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 244 } 245 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 246 } 247 PetscFunctionReturn(0); 248 } 249 250 #undef __FUNCT__ 251 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL" 252 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 253 { 254 PetscErrorCode ierr; 255 256 PetscFunctionBegin; 257 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 258 ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 259 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 260 A->ops->mult = MatMult_SeqAIJViennaCL; 261 A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 262 PetscFunctionReturn(0); 263 } 264 265 /* --------------------------------------------------------------------------------*/ 266 #undef __FUNCT__ 267 #define __FUNCT__ "MatCreateSeqAIJViennaCL" 268 /*@ 269 MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 270 (the default parallel PETSc format). This matrix will ultimately be pushed down 271 to GPUs and use the ViennaCL library for calculations. For good matrix 272 assembly performance the user should preallocate the matrix storage by setting 273 the parameter nz (or the array nnz). By setting these parameters accurately, 274 performance during matrix assembly can be increased substantially. 275 276 277 Collective on MPI_Comm 278 279 Input Parameters: 280 + comm - MPI communicator, set to PETSC_COMM_SELF 281 . m - number of rows 282 . n - number of columns 283 . nz - number of nonzeros per row (same for all rows) 284 - nnz - array containing the number of nonzeros in the various rows 285 (possibly different for each row) or NULL 286 287 Output Parameter: 288 . A - the matrix 289 290 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 291 MatXXXXSetPreallocation() paradigm instead of this routine directly. 292 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 293 294 Notes: 295 If nnz is given then nz is ignored 296 297 The AIJ format (also called the Yale sparse matrix format or 298 compressed row storage), is fully compatible with standard Fortran 77 299 storage. That is, the stored row and column indices can begin at 300 either one (as in Fortran) or zero. See the users' manual for details. 301 302 Specify the preallocated storage with either nz or nnz (not both). 303 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 304 allocation. For large problems you MUST preallocate memory or you 305 will get TERRIBLE performance, see the users' manual chapter on matrices. 306 307 Level: intermediate 308 309 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 310 311 @*/ 312 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 313 { 314 PetscErrorCode ierr; 315 316 PetscFunctionBegin; 317 ierr = MatCreate(comm,A);CHKERRQ(ierr); 318 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 319 ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 320 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 321 PetscFunctionReturn(0); 322 } 323 324 325 #undef __FUNCT__ 326 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL" 327 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 328 { 329 PetscErrorCode ierr; 330 Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 331 332 PetscFunctionBegin; 333 try { 334 if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat); 335 delete viennaclcontainer; 336 A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 337 } catch(std::exception const & ex) { 338 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 339 } 340 /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 341 A->spptr = 0; 342 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 343 PetscFunctionReturn(0); 344 } 345 346 347 #undef __FUNCT__ 348 #define __FUNCT__ "MatCreate_SeqAIJViennaCL" 349 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 350 { 351 PetscErrorCode ierr; 352 Mat_SeqAIJ *aij; 353 354 PetscFunctionBegin; 355 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 356 aij = (Mat_SeqAIJ*)B->data; 357 aij->inode.use = PETSC_FALSE; 358 B->ops->mult = MatMult_SeqAIJViennaCL; 359 B->ops->multadd = MatMultAdd_SeqAIJViennaCL; 360 B->spptr = new Mat_SeqAIJViennaCL(); 361 362 ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0; 363 364 B->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 365 B->ops->destroy = MatDestroy_SeqAIJViennaCL; 366 B->ops->getvecs = MatGetVecs_SeqAIJViennaCL; 367 368 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 369 370 B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 371 PetscFunctionReturn(0); 372 } 373 374 375 /*M 376 MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 377 378 A matrix type type whose data resides on GPUs. These matrices are in CSR format by 379 default. All matrix calculations are performed using the ViennaCL library. 380 381 Options Database Keys: 382 + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 383 . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 384 - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 385 386 Level: beginner 387 388 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 389 M*/ 390 391