1 2 3 /* 4 Defines the basic matrix operations for the AIJ (compressed row) 5 matrix storage format. 6 */ 7 8 #include "petscconf.h" 9 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 10 #include "petscbt.h" 11 #include "../src/vec/vec/impls/dvecimpl.h" 12 #include "petsc-private/vecimpl.h" 13 14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h" 15 16 17 #include <algorithm> 18 #include <vector> 19 #include <string> 20 21 #include "viennacl/linalg/prod.hpp" 22 23 #undef __FUNCT__ 24 #define __FUNCT__ "MatViennaCLCopyToGPU" 25 PetscErrorCode MatViennaCLCopyToGPU(Mat A) 26 { 27 28 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 29 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30 PetscInt *ii; 31 PetscErrorCode ierr; 32 33 34 PetscFunctionBegin; 35 if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0 36 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) { 37 ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 38 39 try { 40 viennaclstruct->mat = new ViennaCLAIJMatrix(); 41 if (a->compressedrow.use) { 42 ii = a->compressedrow.i; 43 44 viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz); 45 46 // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications) 47 // Cannot be reasonably supported in ViennaCL 1.4.x (custom kernels required), hence postponing until there is support for compressed CSR 48 49 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 50 } else { 51 52 // Since PetscInt is in general different from cl_uint, we have to convert: 53 viennacl::backend::mem_handle dummy; 54 55 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 56 for (PetscInt i=0; i<=A->rmap->n; ++i) 57 row_buffer.set(i, (a->i)[i]); 58 59 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 60 for (PetscInt i=0; i<a->nz; ++i) 61 col_buffer.set(i, (a->j)[i]); 62 63 viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 64 } 65 ViennaCLWaitForGPU(); 66 } catch(std::exception const & ex) { 67 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 68 } 69 70 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 71 72 ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 73 } 74 } 75 PetscFunctionReturn(0); 76 } 77 78 #undef __FUNCT__ 79 #define __FUNCT__ "MatViennaCLCopyFromGPU" 80 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu) 81 { 82 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 83 PetscInt m = A->rmap->n; 84 PetscErrorCode ierr; 85 86 87 PetscFunctionBegin; 88 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) { 89 try { 90 if (a->compressedrow.use) { 91 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 92 } else { 93 94 if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 95 a->nz = Agpu->nnz(); 96 a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 97 A->preallocated = PETSC_TRUE; 98 if (a->singlemalloc) { 99 if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 100 } else { 101 if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 102 if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 103 if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 104 } 105 ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 106 ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 107 108 a->singlemalloc = PETSC_TRUE; 109 110 /* Setup row lengths */ 111 if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 112 ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 113 ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 114 115 /* Copy data back from GPU */ 116 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 117 118 // copy row array 119 viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 120 (a->i)[0] = row_buffer[0]; 121 for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 122 (a->i)[i+1] = row_buffer[i+1]; 123 a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 124 } 125 126 // copy column indices 127 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 128 viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 129 for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 130 (a->j)[i] = col_buffer[i]; 131 132 // copy nonzero entries directly to destination (no conversion required) 133 viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 134 135 ViennaCLWaitForGPU(); 136 /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */ 137 } 138 } catch(std::exception const & ex) { 139 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 140 } 141 142 /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */ 143 ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 144 ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 145 146 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 147 } else { 148 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices"); 149 } 150 PetscFunctionReturn(0); 151 } 152 153 #undef __FUNCT__ 154 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL" 155 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left) 156 { 157 PetscErrorCode ierr; 158 159 PetscFunctionBegin; 160 if (right) { 161 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 162 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 163 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 164 ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr); 165 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 166 } 167 if (left) { 168 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 169 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 170 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 171 ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr); 172 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 173 } 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatMult_SeqAIJViennaCL" 179 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 180 { 181 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 182 PetscErrorCode ierr; 183 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 184 const ViennaCLVector *xgpu=NULL; 185 ViennaCLVector *ygpu=NULL; 186 187 PetscFunctionBegin; 188 if (A->rmap->n > 0 && A->cmap->n > 0) { 189 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 190 ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 191 try { 192 *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 193 ViennaCLWaitForGPU(); 194 } catch (std::exception const & ex) { 195 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 196 } 197 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 198 ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 199 ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr); 200 } 201 PetscFunctionReturn(0); 202 } 203 204 205 206 #undef __FUNCT__ 207 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL" 208 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 209 { 210 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 211 PetscErrorCode ierr; 212 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 213 const ViennaCLVector *xgpu=NULL,*ygpu=NULL; 214 ViennaCLVector *zgpu=NULL; 215 216 PetscFunctionBegin; 217 if (A->rmap->n > 0 && A->cmap->n > 0) { 218 try { 219 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 220 ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 221 ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 222 223 if (a->compressedrow.use) { 224 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 225 } else { 226 if (zz == xx || zz == yy) { //temporary required 227 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 228 *zgpu = *ygpu + temp; 229 ViennaCLWaitForGPU(); 230 } else { 231 *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 232 *zgpu += *ygpu; 233 ViennaCLWaitForGPU(); 234 } 235 } 236 237 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 238 ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 239 ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 240 241 } catch(std::exception const & ex) { 242 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 243 } 244 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 245 } 246 PetscFunctionReturn(0); 247 } 248 249 #undef __FUNCT__ 250 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL" 251 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 252 { 253 PetscErrorCode ierr; 254 255 PetscFunctionBegin; 256 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 257 ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 258 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 259 A->ops->mult = MatMult_SeqAIJViennaCL; 260 A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 261 PetscFunctionReturn(0); 262 } 263 264 /* --------------------------------------------------------------------------------*/ 265 #undef __FUNCT__ 266 #define __FUNCT__ "MatCreateSeqAIJViennaCL" 267 /*@ 268 MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 269 (the default parallel PETSc format). This matrix will ultimately be pushed down 270 to GPUs and use the ViennaCL library for calculations. For good matrix 271 assembly performance the user should preallocate the matrix storage by setting 272 the parameter nz (or the array nnz). By setting these parameters accurately, 273 performance during matrix assembly can be increased substantially. 274 275 276 Collective on MPI_Comm 277 278 Input Parameters: 279 + comm - MPI communicator, set to PETSC_COMM_SELF 280 . m - number of rows 281 . n - number of columns 282 . nz - number of nonzeros per row (same for all rows) 283 - nnz - array containing the number of nonzeros in the various rows 284 (possibly different for each row) or NULL 285 286 Output Parameter: 287 . A - the matrix 288 289 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 290 MatXXXXSetPreallocation() paradigm instead of this routine directly. 291 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 292 293 Notes: 294 If nnz is given then nz is ignored 295 296 The AIJ format (also called the Yale sparse matrix format or 297 compressed row storage), is fully compatible with standard Fortran 77 298 storage. That is, the stored row and column indices can begin at 299 either one (as in Fortran) or zero. See the users' manual for details. 300 301 Specify the preallocated storage with either nz or nnz (not both). 302 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 303 allocation. For large problems you MUST preallocate memory or you 304 will get TERRIBLE performance, see the users' manual chapter on matrices. 305 306 Level: intermediate 307 308 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 309 310 @*/ 311 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 312 { 313 PetscErrorCode ierr; 314 315 PetscFunctionBegin; 316 ierr = MatCreate(comm,A);CHKERRQ(ierr); 317 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 318 ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 319 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 320 PetscFunctionReturn(0); 321 } 322 323 324 #undef __FUNCT__ 325 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL" 326 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 327 { 328 PetscErrorCode ierr; 329 Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 330 331 PetscFunctionBegin; 332 try { 333 if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat); 334 delete viennaclcontainer; 335 A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 336 } catch(std::exception const & ex) { 337 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 338 } 339 /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 340 A->spptr = 0; 341 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 342 PetscFunctionReturn(0); 343 } 344 345 346 #undef __FUNCT__ 347 #define __FUNCT__ "MatCreate_SeqAIJViennaCL" 348 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 349 { 350 PetscErrorCode ierr; 351 Mat_SeqAIJ *aij; 352 353 PetscFunctionBegin; 354 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 355 aij = (Mat_SeqAIJ*)B->data; 356 aij->inode.use = PETSC_FALSE; 357 B->ops->mult = MatMult_SeqAIJViennaCL; 358 B->ops->multadd = MatMultAdd_SeqAIJViennaCL; 359 B->spptr = new Mat_SeqAIJViennaCL(); 360 361 ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0; 362 363 B->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 364 B->ops->destroy = MatDestroy_SeqAIJViennaCL; 365 B->ops->getvecs = MatGetVecs_SeqAIJViennaCL; 366 367 ierr = MatSetFromOptions_SeqViennaCL(B);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */ 368 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 369 370 B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 371 PetscFunctionReturn(0); 372 } 373 374 375 /*M 376 MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 377 378 A matrix type type whose data resides on GPUs. These matrices are in CSR format by 379 default. All matrix calculations are performed using the ViennaCL library. 380 381 Options Database Keys: 382 + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 383 . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 384 - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 385 386 Level: beginner 387 388 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 389 M*/ 390 391