1 2 3 /* 4 Defines the basic matrix operations for the AIJ (compressed row) 5 matrix storage format. 6 */ 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <petscbt.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 14 #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h> 15 16 17 #include <algorithm> 18 #include <vector> 19 #include <string> 20 21 #include "viennacl/linalg/prod.hpp" 22 23 #undef __FUNCT__ 24 #define __FUNCT__ "MatViennaCLCopyToGPU" 25 PetscErrorCode MatViennaCLCopyToGPU(Mat A) 26 { 27 28 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 29 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30 PetscErrorCode ierr; 31 32 33 PetscFunctionBegin; 34 if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0 35 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) { 36 ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 37 38 try { 39 ierr = PetscObjectViennaCLSetFromOptions((PetscObject)A);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */ 40 if (a->compressedrow.use) { 41 if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix(); 42 43 // Since PetscInt is different from cl_uint, we have to convert: 44 viennacl::backend::mem_handle dummy; 45 46 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1); 47 for (PetscInt i=0; i<=a->compressedrow.nrows; ++i) 48 row_buffer.set(i, (a->compressedrow.i)[i]); 49 50 viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows); 51 for (PetscInt i=0; i<a->compressedrow.nrows; ++i) 52 row_indices.set(i, (a->compressedrow.rindex)[i]); 53 54 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 55 for (PetscInt i=0; i<a->nz; ++i) 56 col_buffer.set(i, (a->j)[i]); 57 58 viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz); 59 } else { 60 if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix(); 61 62 // Since PetscInt is in general different from cl_uint, we have to convert: 63 viennacl::backend::mem_handle dummy; 64 65 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 66 for (PetscInt i=0; i<=A->rmap->n; ++i) 67 row_buffer.set(i, (a->i)[i]); 68 69 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 70 for (PetscInt i=0; i<a->nz; ++i) 71 col_buffer.set(i, (a->j)[i]); 72 73 viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 74 } 75 ViennaCLWaitForGPU(); 76 } catch(std::exception const & ex) { 77 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 78 } 79 80 // Create temporary vector for v += A*x: 81 if (viennaclstruct->tempvec) { 82 if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(a->nz)) { 83 delete (ViennaCLVector*)viennaclstruct->tempvec; 84 viennaclstruct->tempvec = new ViennaCLVector(a->nz); 85 } else { 86 viennaclstruct->tempvec->clear(); 87 } 88 } else { 89 viennaclstruct->tempvec = new ViennaCLVector(a->nz); 90 } 91 92 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 93 94 ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 95 } 96 } 97 PetscFunctionReturn(0); 98 } 99 100 #undef __FUNCT__ 101 #define __FUNCT__ "MatViennaCLCopyFromGPU" 102 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu) 103 { 104 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 105 PetscInt m = A->rmap->n; 106 PetscErrorCode ierr; 107 108 109 PetscFunctionBegin; 110 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) { 111 try { 112 if (a->compressedrow.use) { 113 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 114 } else { 115 116 if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 117 a->nz = Agpu->nnz(); 118 a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 119 A->preallocated = PETSC_TRUE; 120 if (a->singlemalloc) { 121 if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 122 } else { 123 if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 124 if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 125 if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 126 } 127 ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr); 128 ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 129 130 a->singlemalloc = PETSC_TRUE; 131 132 /* Setup row lengths */ 133 if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 134 ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr); 135 ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 136 137 /* Copy data back from GPU */ 138 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 139 140 // copy row array 141 viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 142 (a->i)[0] = row_buffer[0]; 143 for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 144 (a->i)[i+1] = row_buffer[i+1]; 145 a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 146 } 147 148 // copy column indices 149 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 150 viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 151 for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 152 (a->j)[i] = col_buffer[i]; 153 154 // copy nonzero entries directly to destination (no conversion required) 155 viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 156 157 ViennaCLWaitForGPU(); 158 /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */ 159 } 160 } catch(std::exception const & ex) { 161 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 162 } 163 164 /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */ 165 ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 166 ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 167 168 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 169 } else { 170 SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices"); 171 } 172 PetscFunctionReturn(0); 173 } 174 175 #undef __FUNCT__ 176 #define __FUNCT__ "MatCreateVecs_SeqAIJViennaCL" 177 PetscErrorCode MatCreateVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left) 178 { 179 PetscErrorCode ierr; 180 PetscInt rbs,cbs; 181 182 PetscFunctionBegin; 183 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 184 if (right) { 185 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 186 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 187 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 188 ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr); 189 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 190 } 191 if (left) { 192 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 193 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 194 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 195 ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr); 196 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 197 } 198 PetscFunctionReturn(0); 199 } 200 201 #undef __FUNCT__ 202 #define __FUNCT__ "MatMult_SeqAIJViennaCL" 203 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 204 { 205 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 206 PetscErrorCode ierr; 207 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 208 const ViennaCLVector *xgpu=NULL; 209 ViennaCLVector *ygpu=NULL; 210 211 PetscFunctionBegin; 212 if (A->rmap->n > 0 && A->cmap->n > 0) { 213 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 214 ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 215 try { 216 *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 217 ViennaCLWaitForGPU(); 218 } catch (std::exception const & ex) { 219 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 220 } 221 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 222 ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 223 ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr); 224 } 225 PetscFunctionReturn(0); 226 } 227 228 229 230 #undef __FUNCT__ 231 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL" 232 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 233 { 234 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 235 PetscErrorCode ierr; 236 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 237 const ViennaCLVector *xgpu=NULL,*ygpu=NULL; 238 ViennaCLVector *zgpu=NULL; 239 240 PetscFunctionBegin; 241 if (A->rmap->n > 0 && A->cmap->n > 0) { 242 try { 243 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 244 ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 245 ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 246 247 if (a->compressedrow.use) { 248 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu); 249 *zgpu = *ygpu + temp; 250 ViennaCLWaitForGPU(); 251 } else { 252 if (zz == xx || zz == yy) { //temporary required 253 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 254 *zgpu = *ygpu; 255 *zgpu += temp; 256 ViennaCLWaitForGPU(); 257 } else { 258 *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 259 *zgpu = *ygpu + *viennaclstruct->tempvec; 260 ViennaCLWaitForGPU(); 261 } 262 } 263 264 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 265 ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 266 ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 267 268 } catch(std::exception const & ex) { 269 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 270 } 271 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 272 } 273 PetscFunctionReturn(0); 274 } 275 276 #undef __FUNCT__ 277 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL" 278 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 279 { 280 PetscErrorCode ierr; 281 282 PetscFunctionBegin; 283 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 284 ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 285 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 286 A->ops->mult = MatMult_SeqAIJViennaCL; 287 A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 288 PetscFunctionReturn(0); 289 } 290 291 /* --------------------------------------------------------------------------------*/ 292 #undef __FUNCT__ 293 #define __FUNCT__ "MatCreateSeqAIJViennaCL" 294 /*@ 295 MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 296 (the default parallel PETSc format). This matrix will ultimately be pushed down 297 to GPUs and use the ViennaCL library for calculations. For good matrix 298 assembly performance the user should preallocate the matrix storage by setting 299 the parameter nz (or the array nnz). By setting these parameters accurately, 300 performance during matrix assembly can be increased substantially. 301 302 303 Collective on MPI_Comm 304 305 Input Parameters: 306 + comm - MPI communicator, set to PETSC_COMM_SELF 307 . m - number of rows 308 . n - number of columns 309 . nz - number of nonzeros per row (same for all rows) 310 - nnz - array containing the number of nonzeros in the various rows 311 (possibly different for each row) or NULL 312 313 Output Parameter: 314 . A - the matrix 315 316 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 317 MatXXXXSetPreallocation() paradigm instead of this routine directly. 318 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 319 320 Notes: 321 If nnz is given then nz is ignored 322 323 The AIJ format (also called the Yale sparse matrix format or 324 compressed row storage), is fully compatible with standard Fortran 77 325 storage. That is, the stored row and column indices can begin at 326 either one (as in Fortran) or zero. See the users' manual for details. 327 328 Specify the preallocated storage with either nz or nnz (not both). 329 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 330 allocation. For large problems you MUST preallocate memory or you 331 will get TERRIBLE performance, see the users' manual chapter on matrices. 332 333 Level: intermediate 334 335 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 336 337 @*/ 338 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 339 { 340 PetscErrorCode ierr; 341 342 PetscFunctionBegin; 343 ierr = MatCreate(comm,A);CHKERRQ(ierr); 344 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 345 ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 346 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 347 PetscFunctionReturn(0); 348 } 349 350 351 #undef __FUNCT__ 352 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL" 353 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 354 { 355 PetscErrorCode ierr; 356 Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 357 358 PetscFunctionBegin; 359 try { 360 if (viennaclcontainer) { 361 delete viennaclcontainer->tempvec; 362 delete viennaclcontainer->mat; 363 delete viennaclcontainer->compressed_mat; 364 delete viennaclcontainer; 365 } 366 A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 367 } catch(std::exception const & ex) { 368 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 369 } 370 /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 371 A->spptr = 0; 372 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 373 PetscFunctionReturn(0); 374 } 375 376 377 #undef __FUNCT__ 378 #define __FUNCT__ "MatCreate_SeqAIJViennaCL" 379 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 380 { 381 PetscErrorCode ierr; 382 Mat_SeqAIJ *aij; 383 384 PetscFunctionBegin; 385 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 386 aij = (Mat_SeqAIJ*)B->data; 387 aij->inode.use = PETSC_FALSE; 388 B->ops->mult = MatMult_SeqAIJViennaCL; 389 B->ops->multadd = MatMultAdd_SeqAIJViennaCL; 390 B->spptr = new Mat_SeqAIJViennaCL(); 391 392 ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec = NULL; 393 ((Mat_SeqAIJViennaCL*)B->spptr)->mat = NULL; 394 ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL; 395 396 B->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 397 B->ops->destroy = MatDestroy_SeqAIJViennaCL; 398 B->ops->getvecs = MatCreateVecs_SeqAIJViennaCL; 399 400 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 401 402 B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 403 PetscFunctionReturn(0); 404 } 405 406 407 /*M 408 MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 409 410 A matrix type type whose data resides on GPUs. These matrices are in CSR format by 411 default. All matrix calculations are performed using the ViennaCL library. 412 413 Options Database Keys: 414 + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 415 . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 416 - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 417 418 Level: beginner 419 420 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 421 M*/ 422 423