1 2 3 /* 4 Defines the basic matrix operations for the AIJ (compressed row) 5 matrix storage format. 6 */ 7 8 #include "petscconf.h" 9 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 10 #include "petscbt.h" 11 #include "../src/vec/vec/impls/dvecimpl.h" 12 #include "petsc-private/vecimpl.h" 13 14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h" 15 16 17 #include <algorithm> 18 #include <vector> 19 #include <string> 20 21 #include "viennacl/linalg/prod.hpp" 22 23 #undef __FUNCT__ 24 #define __FUNCT__ "MatViennaCLCopyToGPU" 25 PetscErrorCode MatViennaCLCopyToGPU(Mat A) 26 { 27 28 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 29 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30 //PetscInt m = A->rmap->n,*ii,*ridx; 31 PetscInt *ii; 32 PetscErrorCode ierr; 33 34 35 PetscFunctionBegin; 36 if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0 37 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) { 38 ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 39 40 try { 41 //viennaclstruct->nonzerorow=0; 42 //for (PetscInt j = 0; j<m; j++) viennaclstruct->nonzerorow += (a->i[j+1] > a->i[j]); 43 44 viennaclstruct->mat = new ViennaCLAIJMatrix(); 45 if (a->compressedrow.use) { 46 //m = a->compressedrow.nrows; 47 ii = a->compressedrow.i; 48 //ridx = a->compressedrow.rindex; 49 50 viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz); 51 52 // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications) 53 54 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 55 } else { 56 57 // Since PetscInt is in general different from cl_uint, we have to convert: 58 viennacl::backend::mem_handle dummy; 59 60 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 61 for (PetscInt i=0; i<=A->rmap->n; ++i) 62 row_buffer.set(i, (a->i)[i]); 63 64 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 65 for (PetscInt i=0; i<a->nz; ++i) 66 col_buffer.set(i, (a->j)[i]); 67 68 viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 69 } 70 } catch(char *ex) { 71 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex); 72 } 73 74 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 75 76 ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 77 } 78 } 79 PetscFunctionReturn(0); 80 } 81 82 #undef __FUNCT__ 83 #define __FUNCT__ "MatViennaCLCopyFromGPU" 84 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, ViennaCLAIJMatrix *Agpu) 85 { 86 //Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 87 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 88 PetscInt m = A->rmap->n; 89 PetscErrorCode ierr; 90 91 92 PetscFunctionBegin; 93 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) { 94 try { 95 if (a->compressedrow.use) { 96 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 97 } else { 98 99 if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 100 a->nz = Agpu->nnz(); 101 a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 102 A->preallocated = PETSC_TRUE; 103 if (a->singlemalloc) { 104 if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 105 } else { 106 if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 107 if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 108 if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 109 } 110 ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 111 ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 112 113 a->singlemalloc = PETSC_TRUE; 114 115 /* Setup row lengths */ 116 if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 117 ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 118 ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 119 120 /* Copy data back from GPU */ 121 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 122 123 // copy row array 124 viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 125 (a->i)[0] = row_buffer[0]; 126 for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 127 (a->i)[i+1] = row_buffer[i+1]; 128 a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 129 } 130 131 // copy column indices 132 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 133 viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 134 for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 135 (a->j)[i] = col_buffer[i]; 136 137 // copy nonzero entries directly to destination (no conversion required) 138 viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 139 140 /* TODO: What about a->diag? */ 141 } 142 } catch(char *ex) { 143 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex); 144 } 145 146 /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */ 147 ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 148 ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 149 150 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 151 } else { 152 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices"); 153 } 154 PetscFunctionReturn(0); 155 } 156 157 #undef __FUNCT__ 158 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL" 159 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left) 160 { 161 PetscErrorCode ierr; 162 163 PetscFunctionBegin; 164 if (right) { 165 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 166 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 167 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 168 ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr); 169 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 170 } 171 if (left) { 172 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 173 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 174 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 175 ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr); 176 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 177 } 178 PetscFunctionReturn(0); 179 } 180 181 #undef __FUNCT__ 182 #define __FUNCT__ "MatMult_SeqAIJViennaCL" 183 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 184 { 185 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 186 PetscErrorCode ierr; 187 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 188 ViennaCLVector *xgpu=NULL,*ygpu=NULL; 189 190 PetscFunctionBegin; 191 if (A->rmap->n > 0 && A->cmap->n > 0) { 192 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 193 ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 194 try { 195 *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 196 } catch (char *ex) { 197 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex); 198 } 199 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 200 ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 201 ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr); 202 } 203 PetscFunctionReturn(0); 204 } 205 206 207 208 #undef __FUNCT__ 209 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL" 210 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 211 { 212 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 213 PetscErrorCode ierr; 214 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 215 ViennaCLVector *xgpu=NULL,*ygpu=NULL,*zgpu=NULL; 216 217 PetscFunctionBegin; 218 if (A->rmap->n > 0 && A->cmap->n > 0) { 219 try { 220 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 221 ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 222 ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 223 224 if (a->compressedrow.use) { 225 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 226 } else { 227 if (zz == xx || zz == yy) { //temporary required 228 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 229 *zgpu = *ygpu + temp; 230 } else { 231 *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 232 *zgpu += *ygpu; 233 } 234 } 235 236 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 237 ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 238 ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 239 240 } catch(char *ex) { 241 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex); 242 } 243 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 244 } 245 PetscFunctionReturn(0); 246 } 247 248 #undef __FUNCT__ 249 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL" 250 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 251 { 252 PetscErrorCode ierr; 253 254 PetscFunctionBegin; 255 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 256 ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 257 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 258 A->ops->mult = MatMult_SeqAIJViennaCL; 259 A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 260 PetscFunctionReturn(0); 261 } 262 263 /* --------------------------------------------------------------------------------*/ 264 #undef __FUNCT__ 265 #define __FUNCT__ "MatCreateSeqAIJViennaCL" 266 /*@ 267 MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 268 (the default parallel PETSc format). This matrix will ultimately pushed down 269 to GPUs and use the ViennaCL library for calculations. For good matrix 270 assembly performance the user should preallocate the matrix storage by setting 271 the parameter nz (or the array nnz). By setting these parameters accurately, 272 performance during matrix assembly can be increased substantially. 273 274 275 Collective on MPI_Comm 276 277 Input Parameters: 278 + comm - MPI communicator, set to PETSC_COMM_SELF 279 . m - number of rows 280 . n - number of columns 281 . nz - number of nonzeros per row (same for all rows) 282 - nnz - array containing the number of nonzeros in the various rows 283 (possibly different for each row) or NULL 284 285 Output Parameter: 286 . A - the matrix 287 288 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 289 MatXXXXSetPreallocation() paradigm instead of this routine directly. 290 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 291 292 Notes: 293 If nnz is given then nz is ignored 294 295 The AIJ format (also called the Yale sparse matrix format or 296 compressed row storage), is fully compatible with standard Fortran 77 297 storage. That is, the stored row and column indices can begin at 298 either one (as in Fortran) or zero. See the users' manual for details. 299 300 Specify the preallocated storage with either nz or nnz (not both). 301 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 302 allocation. For large problems you MUST preallocate memory or you 303 will get TERRIBLE performance, see the users' manual chapter on matrices. 304 305 By default, this format uses inodes (identical nodes) when possible, to 306 improve numerical efficiency of matrix-vector products and solves. We 307 search for consecutive rows with the same nonzero structure, thereby 308 reusing matrix information to achieve increased efficiency. 309 310 Level: intermediate 311 312 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 313 314 @*/ 315 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 316 { 317 PetscErrorCode ierr; 318 319 PetscFunctionBegin; 320 ierr = MatCreate(comm,A);CHKERRQ(ierr); 321 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 322 ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 323 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 324 PetscFunctionReturn(0); 325 } 326 327 328 #undef __FUNCT__ 329 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL" 330 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 331 { 332 PetscErrorCode ierr; 333 Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 334 335 PetscFunctionBegin; 336 try { 337 if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat); 338 delete viennaclcontainer; 339 A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 340 } catch(char *ex) { 341 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex); 342 } 343 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 344 A->spptr = 0; 345 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 346 PetscFunctionReturn(0); 347 } 348 349 350 #undef __FUNCT__ 351 #define __FUNCT__ "MatCreate_SeqAIJViennaCL" 352 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 353 { 354 PetscErrorCode ierr; 355 Mat_SeqAIJ *aij; 356 357 PetscFunctionBegin; 358 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 359 aij = (Mat_SeqAIJ*)B->data; 360 aij->inode.use = PETSC_FALSE; 361 B->ops->mult = MatMult_SeqAIJViennaCL; 362 B->ops->multadd = MatMultAdd_SeqAIJViennaCL; 363 B->spptr = new Mat_SeqAIJViennaCL(); 364 365 ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0; 366 367 B->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 368 B->ops->destroy = MatDestroy_SeqAIJViennaCL; 369 B->ops->getvecs = MatGetVecs_SeqAIJViennaCL; 370 371 //ierr = PetscObjectComposeFunction((PetscObject)B,"MatViennaCLSetFormat_C", "MatViennaCLSetFormat_SeqAIJViennaCL", MatViennaCLSetFormat_SeqAIJViennaCL);CHKERRQ(ierr); 372 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 373 374 B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 375 PetscFunctionReturn(0); 376 } 377 378 379 /*M 380 MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 381 382 A matrix type type whose data resides on GPUs. These matrices are in CSR format by 383 default. All matrix calculations are performed using the ViennaCL library. 384 385 Options Database Keys: 386 + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 387 . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 388 - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 389 390 Level: beginner 391 392 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 393 M*/ 394 395