1 2 3 /* 4 Defines the basic matrix operations for the AIJ (compressed row) 5 matrix storage format. 6 */ 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <petscbt.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 14 #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h> 15 16 17 #include <algorithm> 18 #include <vector> 19 #include <string> 20 21 #include "viennacl/linalg/prod.hpp" 22 23 #undef __FUNCT__ 24 #define __FUNCT__ "MatViennaCLCopyToGPU" 25 PetscErrorCode MatViennaCLCopyToGPU(Mat A) 26 { 27 28 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 29 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30 PetscErrorCode ierr; 31 32 33 PetscFunctionBegin; 34 if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0 35 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) { 36 ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 37 38 try { 39 if (a->compressedrow.use) { 40 if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix(); 41 42 // Since PetscInt is different from cl_uint, we have to convert: 43 viennacl::backend::mem_handle dummy; 44 45 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1); 46 for (PetscInt i=0; i<=a->compressedrow.nrows; ++i) 47 row_buffer.set(i, (a->compressedrow.i)[i]); 48 49 viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows); 50 for (PetscInt i=0; i<a->compressedrow.nrows; ++i) 51 row_indices.set(i, (a->compressedrow.rindex)[i]); 52 53 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 54 for (PetscInt i=0; i<a->nz; ++i) 55 col_buffer.set(i, (a->j)[i]); 56 57 viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz); 58 } else { 59 if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix(); 60 61 // Since PetscInt is in general different from cl_uint, we have to convert: 62 viennacl::backend::mem_handle dummy; 63 64 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 65 for (PetscInt i=0; i<=A->rmap->n; ++i) 66 row_buffer.set(i, (a->i)[i]); 67 68 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 69 for (PetscInt i=0; i<a->nz; ++i) 70 col_buffer.set(i, (a->j)[i]); 71 72 viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 73 } 74 ViennaCLWaitForGPU(); 75 } catch(std::exception const & ex) { 76 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 77 } 78 79 // Create temporary vector for v += A*x: 80 if (viennaclstruct->tempvec) { 81 if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) { 82 delete (ViennaCLVector*)viennaclstruct->tempvec; 83 viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n); 84 } else { 85 viennaclstruct->tempvec->clear(); 86 } 87 } else { 88 viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n); 89 } 90 91 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 92 93 ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 94 } 95 } 96 PetscFunctionReturn(0); 97 } 98 99 #undef __FUNCT__ 100 #define __FUNCT__ "MatViennaCLCopyFromGPU" 101 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu) 102 { 103 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 104 PetscInt m = A->rmap->n; 105 PetscErrorCode ierr; 106 107 108 PetscFunctionBegin; 109 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) { 110 try { 111 if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 112 else { 113 114 if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 115 a->nz = Agpu->nnz(); 116 a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 117 A->preallocated = PETSC_TRUE; 118 if (a->singlemalloc) { 119 if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 120 } else { 121 if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 122 if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 123 if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 124 } 125 ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr); 126 ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 127 128 a->singlemalloc = PETSC_TRUE; 129 130 /* Setup row lengths */ 131 if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 132 ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr); 133 ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 134 135 /* Copy data back from GPU */ 136 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 137 138 // copy row array 139 viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 140 (a->i)[0] = row_buffer[0]; 141 for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 142 (a->i)[i+1] = row_buffer[i+1]; 143 a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 144 } 145 146 // copy column indices 147 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 148 viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 149 for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 150 (a->j)[i] = col_buffer[i]; 151 152 // copy nonzero entries directly to destination (no conversion required) 153 viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 154 155 ViennaCLWaitForGPU(); 156 /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */ 157 } 158 } catch(std::exception const & ex) { 159 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 160 } 161 162 /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */ 163 ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 164 ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 165 166 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 167 } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices"); 168 PetscFunctionReturn(0); 169 } 170 171 #undef __FUNCT__ 172 #define __FUNCT__ "MatCreateVecs_SeqAIJViennaCL" 173 PetscErrorCode MatCreateVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left) 174 { 175 PetscErrorCode ierr; 176 PetscInt rbs,cbs; 177 178 PetscFunctionBegin; 179 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 180 if (right) { 181 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 182 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 183 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 184 ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr); 185 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 186 } 187 if (left) { 188 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 189 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 190 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 191 ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr); 192 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 193 } 194 PetscFunctionReturn(0); 195 } 196 197 #undef __FUNCT__ 198 #define __FUNCT__ "MatMult_SeqAIJViennaCL" 199 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 200 { 201 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 202 PetscErrorCode ierr; 203 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 204 const ViennaCLVector *xgpu=NULL; 205 ViennaCLVector *ygpu=NULL; 206 207 PetscFunctionBegin; 208 if (A->rmap->n > 0 && A->cmap->n > 0) { 209 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 210 ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 211 try { 212 *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 213 ViennaCLWaitForGPU(); 214 } catch (std::exception const & ex) { 215 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 216 } 217 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 218 ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 219 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 220 } 221 PetscFunctionReturn(0); 222 } 223 224 225 226 #undef __FUNCT__ 227 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL" 228 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 229 { 230 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 231 PetscErrorCode ierr; 232 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 233 const ViennaCLVector *xgpu=NULL,*ygpu=NULL; 234 ViennaCLVector *zgpu=NULL; 235 236 PetscFunctionBegin; 237 if (A->rmap->n > 0 && A->cmap->n > 0) { 238 try { 239 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 240 ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 241 ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 242 243 if (a->compressedrow.use) { 244 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu); 245 *zgpu = *ygpu + temp; 246 ViennaCLWaitForGPU(); 247 } else { 248 if (zz == xx || zz == yy) { //temporary required 249 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 250 *zgpu = *ygpu; 251 *zgpu += temp; 252 ViennaCLWaitForGPU(); 253 } else { 254 *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 255 *zgpu = *ygpu + *viennaclstruct->tempvec; 256 ViennaCLWaitForGPU(); 257 } 258 } 259 260 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 261 ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 262 ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 263 264 } catch(std::exception const & ex) { 265 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 266 } 267 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 268 } 269 PetscFunctionReturn(0); 270 } 271 272 #undef __FUNCT__ 273 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL" 274 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 275 { 276 PetscErrorCode ierr; 277 278 PetscFunctionBegin; 279 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 280 ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 281 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 282 A->ops->mult = MatMult_SeqAIJViennaCL; 283 A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 284 PetscFunctionReturn(0); 285 } 286 287 /* --------------------------------------------------------------------------------*/ 288 #undef __FUNCT__ 289 #define __FUNCT__ "MatCreateSeqAIJViennaCL" 290 /*@ 291 MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 292 (the default parallel PETSc format). This matrix will ultimately be pushed down 293 to GPUs and use the ViennaCL library for calculations. For good matrix 294 assembly performance the user should preallocate the matrix storage by setting 295 the parameter nz (or the array nnz). By setting these parameters accurately, 296 performance during matrix assembly can be increased substantially. 297 298 299 Collective on MPI_Comm 300 301 Input Parameters: 302 + comm - MPI communicator, set to PETSC_COMM_SELF 303 . m - number of rows 304 . n - number of columns 305 . nz - number of nonzeros per row (same for all rows) 306 - nnz - array containing the number of nonzeros in the various rows 307 (possibly different for each row) or NULL 308 309 Output Parameter: 310 . A - the matrix 311 312 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 313 MatXXXXSetPreallocation() paradigm instead of this routine directly. 314 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 315 316 Notes: 317 If nnz is given then nz is ignored 318 319 The AIJ format (also called the Yale sparse matrix format or 320 compressed row storage), is fully compatible with standard Fortran 77 321 storage. That is, the stored row and column indices can begin at 322 either one (as in Fortran) or zero. See the users' manual for details. 323 324 Specify the preallocated storage with either nz or nnz (not both). 325 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 326 allocation. For large problems you MUST preallocate memory or you 327 will get TERRIBLE performance, see the users' manual chapter on matrices. 328 329 Level: intermediate 330 331 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 332 333 @*/ 334 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 335 { 336 PetscErrorCode ierr; 337 338 PetscFunctionBegin; 339 ierr = MatCreate(comm,A);CHKERRQ(ierr); 340 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 341 ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 342 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 343 PetscFunctionReturn(0); 344 } 345 346 347 #undef __FUNCT__ 348 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL" 349 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 350 { 351 PetscErrorCode ierr; 352 Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 353 354 PetscFunctionBegin; 355 try { 356 if (viennaclcontainer) { 357 delete viennaclcontainer->tempvec; 358 delete viennaclcontainer->mat; 359 delete viennaclcontainer->compressed_mat; 360 delete viennaclcontainer; 361 } 362 A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 363 } catch(std::exception const & ex) { 364 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 365 } 366 /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 367 A->spptr = 0; 368 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 369 PetscFunctionReturn(0); 370 } 371 372 373 #undef __FUNCT__ 374 #define __FUNCT__ "MatCreate_SeqAIJViennaCL" 375 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 376 { 377 PetscErrorCode ierr; 378 Mat_SeqAIJ *aij; 379 380 PetscFunctionBegin; 381 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 382 aij = (Mat_SeqAIJ*)B->data; 383 aij->inode.use = PETSC_FALSE; 384 B->ops->mult = MatMult_SeqAIJViennaCL; 385 B->ops->multadd = MatMultAdd_SeqAIJViennaCL; 386 B->spptr = new Mat_SeqAIJViennaCL(); 387 388 ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec = NULL; 389 ((Mat_SeqAIJViennaCL*)B->spptr)->mat = NULL; 390 ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL; 391 392 B->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 393 B->ops->destroy = MatDestroy_SeqAIJViennaCL; 394 B->ops->getvecs = MatCreateVecs_SeqAIJViennaCL; 395 396 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 397 398 B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 399 PetscFunctionReturn(0); 400 } 401 402 403 /*M 404 MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 405 406 A matrix type type whose data resides on GPUs. These matrices are in CSR format by 407 default. All matrix calculations are performed using the ViennaCL library. 408 409 Options Database Keys: 410 + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 411 . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 412 - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 413 414 Level: beginner 415 416 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 417 M*/ 418 419