1 2 3 /* 4 Defines the basic matrix operations for the AIJ (compressed row) 5 matrix storage format. 6 */ 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <petscbt.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 14 #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h> 15 16 17 #include <algorithm> 18 #include <vector> 19 #include <string> 20 21 #include "viennacl/linalg/prod.hpp" 22 23 #undef __FUNCT__ 24 #define __FUNCT__ "MatViennaCLCopyToGPU" 25 PetscErrorCode MatViennaCLCopyToGPU(Mat A) 26 { 27 28 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 29 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30 PetscErrorCode ierr; 31 32 33 PetscFunctionBegin; 34 if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0 35 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) { 36 ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 37 38 try { 39 ierr = PetscObjectViennaCLSetFromOptions((PetscObject)A);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */ 40 if (a->compressedrow.use) { 41 if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix(); 42 43 // Since PetscInt is different from cl_uint, we have to convert: 44 viennacl::backend::mem_handle dummy; 45 46 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1); 47 for (PetscInt i=0; i<=a->compressedrow.nrows; ++i) 48 row_buffer.set(i, (a->compressedrow.i)[i]); 49 50 viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows); 51 for (PetscInt i=0; i<a->compressedrow.nrows; ++i) 52 row_indices.set(i, (a->compressedrow.rindex)[i]); 53 54 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 55 for (PetscInt i=0; i<a->nz; ++i) 56 col_buffer.set(i, (a->j)[i]); 57 58 viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz); 59 } else { 60 if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix(); 61 62 // Since PetscInt is in general different from cl_uint, we have to convert: 63 viennacl::backend::mem_handle dummy; 64 65 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 66 for (PetscInt i=0; i<=A->rmap->n; ++i) 67 row_buffer.set(i, (a->i)[i]); 68 69 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 70 for (PetscInt i=0; i<a->nz; ++i) 71 col_buffer.set(i, (a->j)[i]); 72 73 viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 74 } 75 ViennaCLWaitForGPU(); 76 } catch(std::exception const & ex) { 77 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 78 } 79 80 // Create temporary vector for v += A*x: 81 if (viennaclstruct->tempvec) { 82 if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(a->nz)) { 83 delete (ViennaCLVector*)viennaclstruct->tempvec; 84 viennaclstruct->tempvec = new ViennaCLVector(a->nz); 85 } else { 86 viennaclstruct->tempvec->clear(); 87 } 88 } else { 89 viennaclstruct->tempvec = new ViennaCLVector(a->nz); 90 } 91 92 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 93 94 ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 95 } 96 } 97 PetscFunctionReturn(0); 98 } 99 100 #undef __FUNCT__ 101 #define __FUNCT__ "MatViennaCLCopyFromGPU" 102 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu) 103 { 104 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 105 PetscInt m = A->rmap->n; 106 PetscErrorCode ierr; 107 108 109 PetscFunctionBegin; 110 if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) { 111 try { 112 if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 113 else { 114 115 if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 116 a->nz = Agpu->nnz(); 117 a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 118 A->preallocated = PETSC_TRUE; 119 if (a->singlemalloc) { 120 if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 121 } else { 122 if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 123 if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 124 if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 125 } 126 ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr); 127 ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 128 129 a->singlemalloc = PETSC_TRUE; 130 131 /* Setup row lengths */ 132 if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 133 ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr); 134 ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 135 136 /* Copy data back from GPU */ 137 viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 138 139 // copy row array 140 viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 141 (a->i)[0] = row_buffer[0]; 142 for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 143 (a->i)[i+1] = row_buffer[i+1]; 144 a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 145 } 146 147 // copy column indices 148 viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 149 viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 150 for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 151 (a->j)[i] = col_buffer[i]; 152 153 // copy nonzero entries directly to destination (no conversion required) 154 viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 155 156 ViennaCLWaitForGPU(); 157 /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */ 158 } 159 } catch(std::exception const & ex) { 160 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 161 } 162 163 /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */ 164 ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 165 ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 166 167 A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 168 } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices"); 169 PetscFunctionReturn(0); 170 } 171 172 #undef __FUNCT__ 173 #define __FUNCT__ "MatCreateVecs_SeqAIJViennaCL" 174 PetscErrorCode MatCreateVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left) 175 { 176 PetscErrorCode ierr; 177 PetscInt rbs,cbs; 178 179 PetscFunctionBegin; 180 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 181 if (right) { 182 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 183 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 184 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 185 ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr); 186 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 187 } 188 if (left) { 189 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 190 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 191 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 192 ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr); 193 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 194 } 195 PetscFunctionReturn(0); 196 } 197 198 #undef __FUNCT__ 199 #define __FUNCT__ "MatMult_SeqAIJViennaCL" 200 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 201 { 202 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 203 PetscErrorCode ierr; 204 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 205 const ViennaCLVector *xgpu=NULL; 206 ViennaCLVector *ygpu=NULL; 207 208 PetscFunctionBegin; 209 if (A->rmap->n > 0 && A->cmap->n > 0) { 210 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 211 ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 212 try { 213 *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 214 ViennaCLWaitForGPU(); 215 } catch (std::exception const & ex) { 216 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 217 } 218 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 219 ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 220 ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr); 221 } 222 PetscFunctionReturn(0); 223 } 224 225 226 227 #undef __FUNCT__ 228 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL" 229 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 230 { 231 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 232 PetscErrorCode ierr; 233 Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 234 const ViennaCLVector *xgpu=NULL,*ygpu=NULL; 235 ViennaCLVector *zgpu=NULL; 236 237 PetscFunctionBegin; 238 if (A->rmap->n > 0 && A->cmap->n > 0) { 239 try { 240 ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 241 ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 242 ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 243 244 if (a->compressedrow.use) { 245 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu); 246 *zgpu = *ygpu + temp; 247 ViennaCLWaitForGPU(); 248 } else { 249 if (zz == xx || zz == yy) { //temporary required 250 ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 251 *zgpu = *ygpu; 252 *zgpu += temp; 253 ViennaCLWaitForGPU(); 254 } else { 255 *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 256 *zgpu = *ygpu + *viennaclstruct->tempvec; 257 ViennaCLWaitForGPU(); 258 } 259 } 260 261 ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 262 ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 263 ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 264 265 } catch(std::exception const & ex) { 266 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 267 } 268 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 269 } 270 PetscFunctionReturn(0); 271 } 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL" 275 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 276 { 277 PetscErrorCode ierr; 278 279 PetscFunctionBegin; 280 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 281 ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 282 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 283 A->ops->mult = MatMult_SeqAIJViennaCL; 284 A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 285 PetscFunctionReturn(0); 286 } 287 288 /* --------------------------------------------------------------------------------*/ 289 #undef __FUNCT__ 290 #define __FUNCT__ "MatCreateSeqAIJViennaCL" 291 /*@ 292 MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 293 (the default parallel PETSc format). This matrix will ultimately be pushed down 294 to GPUs and use the ViennaCL library for calculations. For good matrix 295 assembly performance the user should preallocate the matrix storage by setting 296 the parameter nz (or the array nnz). By setting these parameters accurately, 297 performance during matrix assembly can be increased substantially. 298 299 300 Collective on MPI_Comm 301 302 Input Parameters: 303 + comm - MPI communicator, set to PETSC_COMM_SELF 304 . m - number of rows 305 . n - number of columns 306 . nz - number of nonzeros per row (same for all rows) 307 - nnz - array containing the number of nonzeros in the various rows 308 (possibly different for each row) or NULL 309 310 Output Parameter: 311 . A - the matrix 312 313 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 314 MatXXXXSetPreallocation() paradigm instead of this routine directly. 315 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 316 317 Notes: 318 If nnz is given then nz is ignored 319 320 The AIJ format (also called the Yale sparse matrix format or 321 compressed row storage), is fully compatible with standard Fortran 77 322 storage. That is, the stored row and column indices can begin at 323 either one (as in Fortran) or zero. See the users' manual for details. 324 325 Specify the preallocated storage with either nz or nnz (not both). 326 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 327 allocation. For large problems you MUST preallocate memory or you 328 will get TERRIBLE performance, see the users' manual chapter on matrices. 329 330 Level: intermediate 331 332 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 333 334 @*/ 335 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 336 { 337 PetscErrorCode ierr; 338 339 PetscFunctionBegin; 340 ierr = MatCreate(comm,A);CHKERRQ(ierr); 341 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 342 ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 343 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 344 PetscFunctionReturn(0); 345 } 346 347 348 #undef __FUNCT__ 349 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL" 350 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 351 { 352 PetscErrorCode ierr; 353 Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 354 355 PetscFunctionBegin; 356 try { 357 if (viennaclcontainer) { 358 delete viennaclcontainer->tempvec; 359 delete viennaclcontainer->mat; 360 delete viennaclcontainer->compressed_mat; 361 delete viennaclcontainer; 362 } 363 A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 364 } catch(std::exception const & ex) { 365 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 366 } 367 /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 368 A->spptr = 0; 369 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 370 PetscFunctionReturn(0); 371 } 372 373 374 #undef __FUNCT__ 375 #define __FUNCT__ "MatCreate_SeqAIJViennaCL" 376 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 377 { 378 PetscErrorCode ierr; 379 Mat_SeqAIJ *aij; 380 381 PetscFunctionBegin; 382 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 383 aij = (Mat_SeqAIJ*)B->data; 384 aij->inode.use = PETSC_FALSE; 385 B->ops->mult = MatMult_SeqAIJViennaCL; 386 B->ops->multadd = MatMultAdd_SeqAIJViennaCL; 387 B->spptr = new Mat_SeqAIJViennaCL(); 388 389 ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec = NULL; 390 ((Mat_SeqAIJViennaCL*)B->spptr)->mat = NULL; 391 ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL; 392 393 B->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 394 B->ops->destroy = MatDestroy_SeqAIJViennaCL; 395 B->ops->getvecs = MatCreateVecs_SeqAIJViennaCL; 396 397 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 398 399 B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 400 PetscFunctionReturn(0); 401 } 402 403 404 /*M 405 MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 406 407 A matrix type type whose data resides on GPUs. These matrices are in CSR format by 408 default. All matrix calculations are performed using the ViennaCL library. 409 410 Options Database Keys: 411 + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 412 . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 413 - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 414 415 Level: beginner 416 417 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 418 M*/ 419 420