1 2 3 /* 4 Defines the basic matrix operations for the AIJ (compressed row) 5 matrix storage format. 6 */ 7 8 #include "petscconf.h" 9 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 10 #include "petscbt.h" 11 #include "../src/vec/vec/impls/dvecimpl.h" 12 #include "petsc-private/vecimpl.h" 13 14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h" 15 16 17 #include <algorithm> 18 #include <vector> 19 #include <string> 20 21 #include "viennacl/linalg/prod.hpp" 22 23 #undef __FUNCT__ 24 #define __FUNCT__ "MatViennaCLCopyToGPU" 25 PetscErrorCode MatViennaCLCopyToGPU(Mat A) 26 { 27 28 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 29 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30 //PetscInt m = A->rmap->n,*ii,*ridx; 31 PetscInt *ii; 32 PetscErrorCode ierr; 33 34 35 PetscFunctionBegin; 36 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) { 37 ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 38 39 try { 40 //viennaclstruct->nonzerorow=0; 41 //for (PetscInt j = 0; j<m; j++) viennaclstruct->nonzerorow += (a->i[j+1] > a->i[j]); 42 43 viennaclstruct->mat = new ViennaCLAIJMatrix(); 44 if (a->compressedrow.use) { 45 //m = a->compressedrow.nrows; 46 ii = a->compressedrow.i; 47 //ridx = a->compressedrow.rindex; 48 49 viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz); 50 51 // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications) 52 53 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 54 } else { 55 56 // Since PetscInt is in general different from cl_uint, we have to convert: 57 viennacl::backend::mem_handle dummy; 58 59 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 60 for (PetscInt i=0; i<=A->rmap->n; ++i) 61 row_buffer.set(i, (a->i)[i]); 62 63 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 64 for (PetscInt i=0; i<a->nz; ++i) 65 col_buffer.set(i, (a->j)[i]); 66 67 viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 68 } 69 } catch(char *ex) { 70 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex); 71 } 72 73 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 74 75 ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 76 } 77 PetscFunctionReturn(0); 78 } 79 80 #undef __FUNCT__ 81 #define __FUNCT__ "MatViennaCLCopyFromGPU" 82 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, ViennaCLAIJMatrix *Agpu) 83 { 84 //Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 85 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 86 PetscInt m = A->rmap->n; 87 PetscErrorCode ierr; 88 89 90 PetscFunctionBegin; 91 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) { 92 try { 93 if (a->compressedrow.use) { 94 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 95 } else { 96 97 if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 98 a->nz = Agpu->nnz(); 99 a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 100 A->preallocated = PETSC_TRUE; 101 if (a->singlemalloc) { 102 if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 103 } else { 104 if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 105 if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 106 if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 107 } 108 ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 109 ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 110 111 a->singlemalloc = PETSC_TRUE; 112 113 /* Setup row lengths */ 114 if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 115 ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 116 ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 117 118 /* Copy data back from GPU */ 119 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 120 121 // copy row array 122 viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 123 (a->i)[0] = row_buffer[0]; 124 for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 125 (a->i)[i+1] = row_buffer[i+1]; 126 a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 127 } 128 129 // copy column indices 130 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 131 viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 132 for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 133 (a->j)[i] = col_buffer[i]; 134 135 // copy nonzero entries directly to destination (no conversion required) 136 viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 137 138 /* TODO: What about a->diag? */ 139 } 140 } catch(char *ex) { 141 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex); 142 } 143 144 /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */ 145 ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 146 ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 147 148 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 149 } else { 150 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices"); 151 } 152 PetscFunctionReturn(0); 153 } 154 155 #undef __FUNCT__ 156 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL" 157 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left) 158 { 159 PetscErrorCode ierr; 160 161 PetscFunctionBegin; 162 if (right) { 163 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 164 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 165 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 166 ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr); 167 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 168 } 169 if (left) { 170 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 171 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 172 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 173 ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr); 174 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 175 } 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatMult_SeqAIJViennaCL" 181 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 182 { 183 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 184 PetscErrorCode ierr; 185 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 186 ViennaCLVector *xgpu=NULL,*ygpu=NULL; 187 188 PetscFunctionBegin; 189 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 190 ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 191 try { 192 *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 193 } catch (char *ex) { 194 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex); 195 } 196 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 197 ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 198 ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr); 199 PetscFunctionReturn(0); 200 } 201 202 203 204 #undef __FUNCT__ 205 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL" 206 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 207 { 208 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 209 PetscErrorCode ierr; 210 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 211 ViennaCLVector *xgpu=NULL,*ygpu=NULL,*zgpu=NULL; 212 213 PetscFunctionBegin; 214 try { 215 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 216 ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 217 ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 218 219 if (a->compressedrow.use) { 220 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 221 } else { 222 if (zz == xx || zz == yy) { //temporary required 223 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 224 *zgpu = *ygpu + temp; 225 } else { 226 *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 227 *zgpu += *ygpu; 228 } 229 } 230 231 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 232 ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 233 ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 234 235 } catch(char *ex) { 236 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex); 237 } 238 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 #undef __FUNCT__ 243 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL" 244 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 245 { 246 PetscErrorCode ierr; 247 248 PetscFunctionBegin; 249 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 250 ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 251 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 252 A->ops->mult = MatMult_SeqAIJViennaCL; 253 A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 254 PetscFunctionReturn(0); 255 } 256 257 /* --------------------------------------------------------------------------------*/ 258 #undef __FUNCT__ 259 #define __FUNCT__ "MatCreateSeqAIJViennaCL" 260 /*@ 261 MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 262 (the default parallel PETSc format). This matrix will ultimately pushed down 263 to GPUs and use the ViennaCL library for calculations. For good matrix 264 assembly performance the user should preallocate the matrix storage by setting 265 the parameter nz (or the array nnz). By setting these parameters accurately, 266 performance during matrix assembly can be increased substantially. 267 268 269 Collective on MPI_Comm 270 271 Input Parameters: 272 + comm - MPI communicator, set to PETSC_COMM_SELF 273 . m - number of rows 274 . n - number of columns 275 . nz - number of nonzeros per row (same for all rows) 276 - nnz - array containing the number of nonzeros in the various rows 277 (possibly different for each row) or NULL 278 279 Output Parameter: 280 . A - the matrix 281 282 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 283 MatXXXXSetPreallocation() paradigm instead of this routine directly. 284 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 285 286 Notes: 287 If nnz is given then nz is ignored 288 289 The AIJ format (also called the Yale sparse matrix format or 290 compressed row storage), is fully compatible with standard Fortran 77 291 storage. That is, the stored row and column indices can begin at 292 either one (as in Fortran) or zero. See the users' manual for details. 293 294 Specify the preallocated storage with either nz or nnz (not both). 295 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 296 allocation. For large problems you MUST preallocate memory or you 297 will get TERRIBLE performance, see the users' manual chapter on matrices. 298 299 By default, this format uses inodes (identical nodes) when possible, to 300 improve numerical efficiency of matrix-vector products and solves. We 301 search for consecutive rows with the same nonzero structure, thereby 302 reusing matrix information to achieve increased efficiency. 303 304 Level: intermediate 305 306 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 307 308 @*/ 309 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 310 { 311 PetscErrorCode ierr; 312 313 PetscFunctionBegin; 314 ierr = MatCreate(comm,A);CHKERRQ(ierr); 315 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 316 ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 317 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 318 PetscFunctionReturn(0); 319 } 320 321 322 #undef __FUNCT__ 323 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL" 324 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 325 { 326 PetscErrorCode ierr; 327 Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 328 329 PetscFunctionBegin; 330 try { 331 if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat); 332 delete viennaclcontainer; 333 A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 334 } catch(char *ex) { 335 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex); 336 } 337 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 338 A->spptr = 0; 339 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 340 PetscFunctionReturn(0); 341 } 342 343 344 #undef __FUNCT__ 345 #define __FUNCT__ "MatCreate_SeqAIJViennaCL" 346 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 347 { 348 PetscErrorCode ierr; 349 Mat_SeqAIJ *aij; 350 351 PetscFunctionBegin; 352 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 353 aij = (Mat_SeqAIJ*)B->data; 354 aij->inode.use = PETSC_FALSE; 355 B->ops->mult = MatMult_SeqAIJViennaCL; 356 B->ops->multadd = MatMultAdd_SeqAIJViennaCL; 357 B->spptr = new Mat_SeqAIJViennaCL(); 358 359 ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0; 360 361 B->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 362 B->ops->destroy = MatDestroy_SeqAIJViennaCL; 363 B->ops->getvecs = MatGetVecs_SeqAIJViennaCL; 364 365 //ierr = PetscObjectComposeFunction((PetscObject)B,"MatViennaCLSetFormat_C", "MatViennaCLSetFormat_SeqAIJViennaCL", MatViennaCLSetFormat_SeqAIJViennaCL);CHKERRQ(ierr); 366 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 367 368 B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 369 PetscFunctionReturn(0); 370 } 371 372 373 /*M 374 MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 375 376 A matrix type type whose data resides on GPUs. These matrices are in CSR format by 377 default. All matrix calculations are performed using the ViennaCL library. 378 379 Options Database Keys: 380 + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 381 . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 382 - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 383 384 Level: beginner 385 386 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 387 M*/ 388 389