1e4a0ef16SKarl Rupp 2e4a0ef16SKarl Rupp /* 3e4a0ef16SKarl Rupp Defines the basic matrix operations for the AIJ (compressed row) 4e4a0ef16SKarl Rupp matrix storage format. 5e4a0ef16SKarl Rupp */ 6e4a0ef16SKarl Rupp 7aaa7dc30SBarry Smith #include <petscconf.h> 899acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10aaa7dc30SBarry Smith #include <petscbt.h> 11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 13e4a0ef16SKarl Rupp 14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h> 15e4a0ef16SKarl Rupp 16e4a0ef16SKarl Rupp #include <algorithm> 17e4a0ef16SKarl Rupp #include <vector> 18e4a0ef16SKarl Rupp #include <string> 19e4a0ef16SKarl Rupp 20e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp" 21e4a0ef16SKarl Rupp 228713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat); 2372367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*); 244222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 258713a8baSPatrick Sanan 26e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A) 27e4a0ef16SKarl Rupp { 28e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 29e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30e4a0ef16SKarl Rupp 31e4a0ef16SKarl Rupp PetscFunctionBegin; 32bf1781e8SStefano Zampini if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0 33c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 34*9566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0)); 35e4a0ef16SKarl Rupp 36e4a0ef16SKarl Rupp try { 37e4a0ef16SKarl Rupp if (a->compressedrow.use) { 38a3430c56SKarl Rupp if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix(); 39e4a0ef16SKarl Rupp 40a3430c56SKarl Rupp // Since PetscInt is different from cl_uint, we have to convert: 41a3430c56SKarl Rupp viennacl::backend::mem_handle dummy; 42e4a0ef16SKarl Rupp 43a3430c56SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1); 44a3430c56SKarl Rupp for (PetscInt i=0; i<=a->compressedrow.nrows; ++i) 45a3430c56SKarl Rupp row_buffer.set(i, (a->compressedrow.i)[i]); 46e4a0ef16SKarl Rupp 47a3430c56SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows); 48a3430c56SKarl Rupp for (PetscInt i=0; i<a->compressedrow.nrows; ++i) 49a3430c56SKarl Rupp row_indices.set(i, (a->compressedrow.rindex)[i]); 50a3430c56SKarl Rupp 51a3430c56SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 52a3430c56SKarl Rupp for (PetscInt i=0; i<a->nz; ++i) 53a3430c56SKarl Rupp col_buffer.set(i, (a->j)[i]); 54a3430c56SKarl Rupp 55a3430c56SKarl Rupp viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz); 56*9566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((2*a->compressedrow.nrows)+1+a->nz)*sizeof(PetscInt) + (a->nz)*sizeof(PetscScalar))); 57e4a0ef16SKarl Rupp } else { 58a3430c56SKarl Rupp if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix(); 59e4a0ef16SKarl Rupp 60e4a0ef16SKarl Rupp // Since PetscInt is in general different from cl_uint, we have to convert: 61e4a0ef16SKarl Rupp viennacl::backend::mem_handle dummy; 62e4a0ef16SKarl Rupp 63e4a0ef16SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 64e4a0ef16SKarl Rupp for (PetscInt i=0; i<=A->rmap->n; ++i) 65e4a0ef16SKarl Rupp row_buffer.set(i, (a->i)[i]); 66e4a0ef16SKarl Rupp 67e4a0ef16SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 68e4a0ef16SKarl Rupp for (PetscInt i=0; i<a->nz; ++i) 69e4a0ef16SKarl Rupp col_buffer.set(i, (a->j)[i]); 70e4a0ef16SKarl Rupp 71e4a0ef16SKarl Rupp viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 72*9566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((A->rmap->n+1)+a->nz)*sizeof(PetscInt)+(a->nz)*sizeof(PetscScalar))); 73e4a0ef16SKarl Rupp } 744cf1874eSKarl Rupp ViennaCLWaitForGPU(); 754076e183SKarl Rupp } catch(std::exception const & ex) { 7698921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 77e4a0ef16SKarl Rupp } 78e4a0ef16SKarl Rupp 79a3430c56SKarl Rupp // Create temporary vector for v += A*x: 80a3430c56SKarl Rupp if (viennaclstruct->tempvec) { 819b66742cSDave May if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) { 82a3430c56SKarl Rupp delete (ViennaCLVector*)viennaclstruct->tempvec; 839b66742cSDave May viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n); 84a3430c56SKarl Rupp } else { 85a3430c56SKarl Rupp viennaclstruct->tempvec->clear(); 86a3430c56SKarl Rupp } 87a3430c56SKarl Rupp } else { 889b66742cSDave May viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n); 89a3430c56SKarl Rupp } 90a3430c56SKarl Rupp 91c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 92e4a0ef16SKarl Rupp 93*9566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0)); 94e4a0ef16SKarl Rupp } 9567c87b7fSKarl Rupp } 96e4a0ef16SKarl Rupp PetscFunctionReturn(0); 97e4a0ef16SKarl Rupp } 98e4a0ef16SKarl Rupp 990d73d530SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu) 100e4a0ef16SKarl Rupp { 101f38c1e66SStefano Zampini Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 102e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 103e4a0ef16SKarl Rupp PetscInt m = A->rmap->n; 104e4a0ef16SKarl Rupp 105e4a0ef16SKarl Rupp PetscFunctionBegin; 106c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(0); 107c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) { 108e4a0ef16SKarl Rupp try { 1092c71b3e2SJacob Faibussowitsch PetscCheck(!a->compressedrow.use,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 1106c4ed002SBarry Smith else { 111e4a0ef16SKarl Rupp 1122c71b3e2SJacob Faibussowitsch PetscCheck((PetscInt)Agpu->size1() == m,PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "GPU matrix has %lu rows, should be %" PetscInt_FMT, Agpu->size1(), m); 113e4a0ef16SKarl Rupp a->nz = Agpu->nnz(); 114e4a0ef16SKarl Rupp a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 115e4a0ef16SKarl Rupp A->preallocated = PETSC_TRUE; 116e4a0ef16SKarl Rupp if (a->singlemalloc) { 117*9566063dSJacob Faibussowitsch if (a->a) PetscCall(PetscFree3(a->a,a->j,a->i)); 118e4a0ef16SKarl Rupp } else { 119*9566063dSJacob Faibussowitsch if (a->i) PetscCall(PetscFree(a->i)); 120*9566063dSJacob Faibussowitsch if (a->j) PetscCall(PetscFree(a->j)); 121*9566063dSJacob Faibussowitsch if (a->a) PetscCall(PetscFree(a->a)); 122e4a0ef16SKarl Rupp } 123*9566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i)); 124*9566063dSJacob Faibussowitsch PetscCall(PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt))); 125e4a0ef16SKarl Rupp 126e4a0ef16SKarl Rupp a->singlemalloc = PETSC_TRUE; 127e4a0ef16SKarl Rupp 128e4a0ef16SKarl Rupp /* Setup row lengths */ 129*9566063dSJacob Faibussowitsch PetscCall(PetscFree(a->imax)); 130*9566063dSJacob Faibussowitsch PetscCall(PetscFree(a->ilen)); 131*9566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&a->imax)); 132*9566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&a->ilen)); 133*9566063dSJacob Faibussowitsch PetscCall(PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt))); 134e4a0ef16SKarl Rupp 135e4a0ef16SKarl Rupp /* Copy data back from GPU */ 136e4a0ef16SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 137e4a0ef16SKarl Rupp 138e4a0ef16SKarl Rupp // copy row array 139e4a0ef16SKarl Rupp viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 140e4a0ef16SKarl Rupp (a->i)[0] = row_buffer[0]; 141e4a0ef16SKarl Rupp for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 142e4a0ef16SKarl Rupp (a->i)[i+1] = row_buffer[i+1]; 143e4a0ef16SKarl Rupp a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 144e4a0ef16SKarl Rupp } 145e4a0ef16SKarl Rupp 146e4a0ef16SKarl Rupp // copy column indices 147e4a0ef16SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 148e4a0ef16SKarl Rupp viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 149e4a0ef16SKarl Rupp for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 150e4a0ef16SKarl Rupp (a->j)[i] = col_buffer[i]; 151e4a0ef16SKarl Rupp 152e4a0ef16SKarl Rupp // copy nonzero entries directly to destination (no conversion required) 153e4a0ef16SKarl Rupp viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 154e4a0ef16SKarl Rupp 155*9566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(row_buffer.raw_size()+col_buffer.raw_size()+(Agpu->nnz()*sizeof(PetscScalar)))); 1564cf1874eSKarl Rupp ViennaCLWaitForGPU(); 157023073b3SKarl Rupp /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */ 158e4a0ef16SKarl Rupp } 1594076e183SKarl Rupp } catch(std::exception const & ex) { 16098921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 161e4a0ef16SKarl Rupp } 162c70f7ee4SJunchao Zhang } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) { 163f38c1e66SStefano Zampini PetscFunctionReturn(0); 164f38c1e66SStefano Zampini } else { 165c70f7ee4SJunchao Zhang if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0); 166e4a0ef16SKarl Rupp 1672c71b3e2SJacob Faibussowitsch PetscCheck(!a->compressedrow.use,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 168f38c1e66SStefano Zampini if (!Agpu) { 169f38c1e66SStefano Zampini viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar)*viennaclstruct->mat->nnz(), a->a); 170f38c1e66SStefano Zampini } else { 171f38c1e66SStefano Zampini viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 172f38c1e66SStefano Zampini } 173f38c1e66SStefano Zampini } 174c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 175b8ced49eSKarl Rupp /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */ 176*9566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 177*9566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 178e4a0ef16SKarl Rupp PetscFunctionReturn(0); 179e4a0ef16SKarl Rupp } 180e4a0ef16SKarl Rupp 181e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 182e4a0ef16SKarl Rupp { 183e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 184e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 1850d73d530SKarl Rupp const ViennaCLVector *xgpu=NULL; 1860d73d530SKarl Rupp ViennaCLVector *ygpu=NULL; 187e4a0ef16SKarl Rupp 188e4a0ef16SKarl Rupp PetscFunctionBegin; 189837a59e1SRichard Tran Mills /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 190*9566063dSJacob Faibussowitsch PetscCall(MatViennaCLCopyToGPU(A)); 191bf1781e8SStefano Zampini if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { 192*9566063dSJacob Faibussowitsch PetscCall(VecViennaCLGetArrayRead(xx,&xgpu)); 193*9566063dSJacob Faibussowitsch PetscCall(VecViennaCLGetArrayWrite(yy,&ygpu)); 194*9566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 195e4a0ef16SKarl Rupp try { 196b7832b47SStefano Zampini if (a->compressedrow.use) { 197b7832b47SStefano Zampini *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu); 198b7832b47SStefano Zampini } else { 199e4a0ef16SKarl Rupp *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 200b7832b47SStefano Zampini } 2014cf1874eSKarl Rupp ViennaCLWaitForGPU(); 2024076e183SKarl Rupp } catch (std::exception const & ex) { 20398921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 204e4a0ef16SKarl Rupp } 205*9566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 206*9566063dSJacob Faibussowitsch PetscCall(VecViennaCLRestoreArrayRead(xx,&xgpu)); 207*9566063dSJacob Faibussowitsch PetscCall(VecViennaCLRestoreArrayWrite(yy,&ygpu)); 208*9566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*a->nz - a->nonzerorowcnt)); 209bf1781e8SStefano Zampini } else { 210*9566063dSJacob Faibussowitsch PetscCall(VecSet_SeqViennaCL(yy,0)); 21167c87b7fSKarl Rupp } 212e4a0ef16SKarl Rupp PetscFunctionReturn(0); 213e4a0ef16SKarl Rupp } 214e4a0ef16SKarl Rupp 215e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 216e4a0ef16SKarl Rupp { 217e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 218e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 2190d73d530SKarl Rupp const ViennaCLVector *xgpu=NULL,*ygpu=NULL; 2200d73d530SKarl Rupp ViennaCLVector *zgpu=NULL; 221e4a0ef16SKarl Rupp 222e4a0ef16SKarl Rupp PetscFunctionBegin; 223837a59e1SRichard Tran Mills /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 224*9566063dSJacob Faibussowitsch PetscCall(MatViennaCLCopyToGPU(A)); 225bf1781e8SStefano Zampini if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { 226e4a0ef16SKarl Rupp try { 227*9566063dSJacob Faibussowitsch PetscCall(VecViennaCLGetArrayRead(xx,&xgpu)); 228*9566063dSJacob Faibussowitsch PetscCall(VecViennaCLGetArrayRead(yy,&ygpu)); 229*9566063dSJacob Faibussowitsch PetscCall(VecViennaCLGetArrayWrite(zz,&zgpu)); 230*9566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 231f38c1e66SStefano Zampini if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu); 232f38c1e66SStefano Zampini else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 233f38c1e66SStefano Zampini if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec; 234f38c1e66SStefano Zampini else *zgpu += *viennaclstruct->tempvec; 2354cf1874eSKarl Rupp ViennaCLWaitForGPU(); 236*9566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 237*9566063dSJacob Faibussowitsch PetscCall(VecViennaCLRestoreArrayRead(xx,&xgpu)); 238*9566063dSJacob Faibussowitsch PetscCall(VecViennaCLRestoreArrayRead(yy,&ygpu)); 239*9566063dSJacob Faibussowitsch PetscCall(VecViennaCLRestoreArrayWrite(zz,&zgpu)); 240e4a0ef16SKarl Rupp 2414076e183SKarl Rupp } catch(std::exception const & ex) { 24298921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 243e4a0ef16SKarl Rupp } 244*9566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*a->nz)); 245bf1781e8SStefano Zampini } else { 246*9566063dSJacob Faibussowitsch PetscCall(VecCopy_SeqViennaCL(yy,zz)); 24767c87b7fSKarl Rupp } 248e4a0ef16SKarl Rupp PetscFunctionReturn(0); 249e4a0ef16SKarl Rupp } 250e4a0ef16SKarl Rupp 251e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 252e4a0ef16SKarl Rupp { 253e4a0ef16SKarl Rupp PetscFunctionBegin; 254*9566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 255489de41dSStefano Zampini if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 256*9566063dSJacob Faibussowitsch if (!A->boundtocpu) PetscCall(MatViennaCLCopyToGPU(A)); 257e4a0ef16SKarl Rupp PetscFunctionReturn(0); 258e4a0ef16SKarl Rupp } 259e4a0ef16SKarl Rupp 260e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/ 261cab5ea25SPierre Jolivet /*@C 262e4a0ef16SKarl Rupp MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 26319fddfadSKarl Rupp (the default parallel PETSc format). This matrix will ultimately be pushed down 264e4a0ef16SKarl Rupp to GPUs and use the ViennaCL library for calculations. For good matrix 265e4a0ef16SKarl Rupp assembly performance the user should preallocate the matrix storage by setting 266e4a0ef16SKarl Rupp the parameter nz (or the array nnz). By setting these parameters accurately, 267e4a0ef16SKarl Rupp performance during matrix assembly can be increased substantially. 268e4a0ef16SKarl Rupp 269d083f849SBarry Smith Collective 270e4a0ef16SKarl Rupp 271e4a0ef16SKarl Rupp Input Parameters: 272e4a0ef16SKarl Rupp + comm - MPI communicator, set to PETSC_COMM_SELF 273e4a0ef16SKarl Rupp . m - number of rows 274e4a0ef16SKarl Rupp . n - number of columns 275e4a0ef16SKarl Rupp . nz - number of nonzeros per row (same for all rows) 276e4a0ef16SKarl Rupp - nnz - array containing the number of nonzeros in the various rows 277e4a0ef16SKarl Rupp (possibly different for each row) or NULL 278e4a0ef16SKarl Rupp 279e4a0ef16SKarl Rupp Output Parameter: 280e4a0ef16SKarl Rupp . A - the matrix 281e4a0ef16SKarl Rupp 282e4a0ef16SKarl Rupp It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 283e4a0ef16SKarl Rupp MatXXXXSetPreallocation() paradigm instead of this routine directly. 284e4a0ef16SKarl Rupp [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 285e4a0ef16SKarl Rupp 286e4a0ef16SKarl Rupp Notes: 287e4a0ef16SKarl Rupp If nnz is given then nz is ignored 288e4a0ef16SKarl Rupp 289e4a0ef16SKarl Rupp The AIJ format (also called the Yale sparse matrix format or 290e4a0ef16SKarl Rupp compressed row storage), is fully compatible with standard Fortran 77 291e4a0ef16SKarl Rupp storage. That is, the stored row and column indices can begin at 292e4a0ef16SKarl Rupp either one (as in Fortran) or zero. See the users' manual for details. 293e4a0ef16SKarl Rupp 294e4a0ef16SKarl Rupp Specify the preallocated storage with either nz or nnz (not both). 295e4a0ef16SKarl Rupp Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 296e4a0ef16SKarl Rupp allocation. For large problems you MUST preallocate memory or you 297e4a0ef16SKarl Rupp will get TERRIBLE performance, see the users' manual chapter on matrices. 298e4a0ef16SKarl Rupp 299e4a0ef16SKarl Rupp Level: intermediate 300e4a0ef16SKarl Rupp 301e9e886b6SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSPARSE(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 302e4a0ef16SKarl Rupp 303e4a0ef16SKarl Rupp @*/ 304e4a0ef16SKarl Rupp PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 305e4a0ef16SKarl Rupp { 306e4a0ef16SKarl Rupp PetscFunctionBegin; 307*9566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 308*9566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A,m,n,m,n)); 309*9566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATSEQAIJVIENNACL)); 310*9566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 311e4a0ef16SKarl Rupp PetscFunctionReturn(0); 312e4a0ef16SKarl Rupp } 313e4a0ef16SKarl Rupp 314e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 315e4a0ef16SKarl Rupp { 316e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 317e4a0ef16SKarl Rupp 318e4a0ef16SKarl Rupp PetscFunctionBegin; 319e4a0ef16SKarl Rupp try { 3206447cd05SKarl Rupp if (viennaclcontainer) { 3216447cd05SKarl Rupp delete viennaclcontainer->tempvec; 3226447cd05SKarl Rupp delete viennaclcontainer->mat; 3236447cd05SKarl Rupp delete viennaclcontainer->compressed_mat; 324e4a0ef16SKarl Rupp delete viennaclcontainer; 3256447cd05SKarl Rupp } 326c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3274076e183SKarl Rupp } catch(std::exception const & ex) { 32898921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 329e4a0ef16SKarl Rupp } 3308713a8baSPatrick Sanan 331*9566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL)); 332*9566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqdense_C",NULL)); 333*9566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqaij_C",NULL)); 3348713a8baSPatrick Sanan 335e4a0ef16SKarl Rupp /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 336e4a0ef16SKarl Rupp A->spptr = 0; 337*9566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 338e4a0ef16SKarl Rupp PetscFunctionReturn(0); 339e4a0ef16SKarl Rupp } 340e4a0ef16SKarl Rupp 341e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 342e4a0ef16SKarl Rupp { 343e4a0ef16SKarl Rupp PetscFunctionBegin; 344*9566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(B)); 345*9566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B)); 3468713a8baSPatrick Sanan PetscFunctionReturn(0); 3478713a8baSPatrick Sanan } 3488713a8baSPatrick Sanan 349b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat,PetscBool); 350c3cca76eSKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A,MatDuplicateOption cpvalues,Mat *B) 351c3cca76eSKarl Rupp { 352c3cca76eSKarl Rupp Mat C; 353c3cca76eSKarl Rupp 354c3cca76eSKarl Rupp PetscFunctionBegin; 355*9566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B)); 356c3cca76eSKarl Rupp C = *B; 357c3cca76eSKarl Rupp 358*9566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJViennaCL(A,PETSC_FALSE)); 35992f9df4aSRichard Tran Mills C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL; 360c3cca76eSKarl Rupp 361c3cca76eSKarl Rupp C->spptr = new Mat_SeqAIJViennaCL(); 362c3cca76eSKarl Rupp ((Mat_SeqAIJViennaCL*)C->spptr)->tempvec = NULL; 363c3cca76eSKarl Rupp ((Mat_SeqAIJViennaCL*)C->spptr)->mat = NULL; 364c3cca76eSKarl Rupp ((Mat_SeqAIJViennaCL*)C->spptr)->compressed_mat = NULL; 365c3cca76eSKarl Rupp 366*9566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJVIENNACL)); 367c3cca76eSKarl Rupp 368c70f7ee4SJunchao Zhang C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 369c3cca76eSKarl Rupp 370c3cca76eSKarl Rupp /* If the source matrix is already assembled, copy the destination matrix to the GPU */ 371*9566063dSJacob Faibussowitsch if (C->assembled) PetscCall(MatViennaCLCopyToGPU(C)); 372c3cca76eSKarl Rupp PetscFunctionReturn(0); 373c3cca76eSKarl Rupp } 374c3cca76eSKarl Rupp 375f38c1e66SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A,PetscScalar *array[]) 376f38c1e66SStefano Zampini { 377f38c1e66SStefano Zampini PetscFunctionBegin; 378*9566063dSJacob Faibussowitsch PetscCall(MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL)); 379c1a4f3baSJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 380f38c1e66SStefano Zampini PetscFunctionReturn(0); 381f38c1e66SStefano Zampini } 382f38c1e66SStefano Zampini 383f38c1e66SStefano Zampini static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A,PetscScalar *array[]) 384f38c1e66SStefano Zampini { 385f38c1e66SStefano Zampini PetscFunctionBegin; 386c1a4f3baSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 387c1a4f3baSJunchao Zhang *array = NULL; 388c1a4f3baSJunchao Zhang PetscFunctionReturn(0); 389c1a4f3baSJunchao Zhang } 390c1a4f3baSJunchao Zhang 391c1a4f3baSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJViennaCL(Mat A,const PetscScalar *array[]) 392c1a4f3baSJunchao Zhang { 393c1a4f3baSJunchao Zhang PetscFunctionBegin; 394*9566063dSJacob Faibussowitsch PetscCall(MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL)); 395c1a4f3baSJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 396c1a4f3baSJunchao Zhang PetscFunctionReturn(0); 397c1a4f3baSJunchao Zhang } 398c1a4f3baSJunchao Zhang 399c1a4f3baSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJViennaCL(Mat A,const PetscScalar *array[]) 400c1a4f3baSJunchao Zhang { 401c1a4f3baSJunchao Zhang PetscFunctionBegin; 402c1a4f3baSJunchao Zhang *array = NULL; 403c1a4f3baSJunchao Zhang /* No A->offloadmask = PETSC_OFFLOAD_CPU since if A->offloadmask was PETSC_OFFLOAD_BOTH, it is still BOTH */ 404c1a4f3baSJunchao Zhang PetscFunctionReturn(0); 405c1a4f3baSJunchao Zhang } 406c1a4f3baSJunchao Zhang 407c1a4f3baSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJViennaCL(Mat A,PetscScalar *array[]) 408c1a4f3baSJunchao Zhang { 409c1a4f3baSJunchao Zhang PetscFunctionBegin; 410c1a4f3baSJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 411c1a4f3baSJunchao Zhang PetscFunctionReturn(0); 412c1a4f3baSJunchao Zhang } 413c1a4f3baSJunchao Zhang 414c1a4f3baSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL(Mat A,PetscScalar *array[]) 415c1a4f3baSJunchao Zhang { 416c1a4f3baSJunchao Zhang PetscFunctionBegin; 417c1a4f3baSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 418f38c1e66SStefano Zampini *array = NULL; 419f38c1e66SStefano Zampini PetscFunctionReturn(0); 420f38c1e66SStefano Zampini } 421f38c1e66SStefano Zampini 422b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A,PetscBool flg) 423e7e92044SBarry Smith { 424c1a4f3baSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 425f38c1e66SStefano Zampini 426e7e92044SBarry Smith PetscFunctionBegin; 427b470e4b4SRichard Tran Mills A->boundtocpu = flg; 428c1a4f3baSJunchao Zhang if (flg && a->inode.size) { 429c1a4f3baSJunchao Zhang a->inode.use = PETSC_TRUE; 430ea500dcfSRichard Tran Mills } else { 431c1a4f3baSJunchao Zhang a->inode.use = PETSC_FALSE; 432ea500dcfSRichard Tran Mills } 433e7e92044SBarry Smith if (flg) { 434f38c1e66SStefano Zampini /* make sure we have an up-to-date copy on the CPU */ 435*9566063dSJacob Faibussowitsch PetscCall(MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL)); 436e7e92044SBarry Smith A->ops->mult = MatMult_SeqAIJ; 437e7e92044SBarry Smith A->ops->multadd = MatMultAdd_SeqAIJ; 438e7e92044SBarry Smith A->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 439e7e92044SBarry Smith A->ops->duplicate = MatDuplicate_SeqAIJ; 440*9566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps))); 441e7e92044SBarry Smith } else { 442e7e92044SBarry Smith A->ops->mult = MatMult_SeqAIJViennaCL; 443e7e92044SBarry Smith A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 444e7e92044SBarry Smith A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 445e7e92044SBarry Smith A->ops->destroy = MatDestroy_SeqAIJViennaCL; 446e7e92044SBarry Smith A->ops->duplicate = MatDuplicate_SeqAIJViennaCL; 447c1a4f3baSJunchao Zhang 448c1a4f3baSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJViennaCL; 449c1a4f3baSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJViennaCL; 450c1a4f3baSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJViennaCL; 451c1a4f3baSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJViennaCL; 452c1a4f3baSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJViennaCL; 453c1a4f3baSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL; 454e7e92044SBarry Smith } 455e7e92044SBarry Smith PetscFunctionReturn(0); 456e7e92044SBarry Smith } 457e7e92044SBarry Smith 4588713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat) 4598713a8baSPatrick Sanan { 4608713a8baSPatrick Sanan Mat B; 4618713a8baSPatrick Sanan 4628713a8baSPatrick Sanan PetscFunctionBegin; 4635f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_REUSE_MATRIX,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead"); 4648713a8baSPatrick Sanan 465*9566063dSJacob Faibussowitsch if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); 4668713a8baSPatrick Sanan 4678713a8baSPatrick Sanan B = *newmat; 4688713a8baSPatrick Sanan 469e4a0ef16SKarl Rupp B->spptr = new Mat_SeqAIJViennaCL(); 470e4a0ef16SKarl Rupp 471a3430c56SKarl Rupp ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec = NULL; 472a3430c56SKarl Rupp ((Mat_SeqAIJViennaCL*)B->spptr)->mat = NULL; 473a3430c56SKarl Rupp ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL; 474e4a0ef16SKarl Rupp 475*9566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJViennaCL(A,PETSC_FALSE)); 47692f9df4aSRichard Tran Mills A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL; 477e4a0ef16SKarl Rupp 478*9566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL)); 479*9566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 480*9566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECVIENNACL,&B->defaultvectype)); 481e4a0ef16SKarl Rupp 482*9566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL)); 483*9566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqdense_C",MatProductSetFromOptions_SeqAIJ_SeqDense)); 484*9566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqaij_C",MatProductSetFromOptions_SeqAIJ)); 4858713a8baSPatrick Sanan 486c70f7ee4SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 4878713a8baSPatrick Sanan 4888713a8baSPatrick Sanan /* If the source matrix is already assembled, copy the destination matrix to the GPU */ 489*9566063dSJacob Faibussowitsch if (B->assembled) PetscCall(MatViennaCLCopyToGPU(B)); 490e4a0ef16SKarl Rupp PetscFunctionReturn(0); 491e4a0ef16SKarl Rupp } 492e4a0ef16SKarl Rupp 4933ca39a21SBarry Smith /*MC 494e4a0ef16SKarl Rupp MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 495e4a0ef16SKarl Rupp 496e4a0ef16SKarl Rupp A matrix type type whose data resides on GPUs. These matrices are in CSR format by 497e4a0ef16SKarl Rupp default. All matrix calculations are performed using the ViennaCL library. 498e4a0ef16SKarl Rupp 499e4a0ef16SKarl Rupp Options Database Keys: 500e4a0ef16SKarl Rupp + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 501e4a0ef16SKarl Rupp . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 502e4a0ef16SKarl Rupp - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 503e4a0ef16SKarl Rupp 504e4a0ef16SKarl Rupp Level: beginner 505e4a0ef16SKarl Rupp 506e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 507e4a0ef16SKarl Rupp M*/ 508e4a0ef16SKarl Rupp 5093ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void) 51072367587SKarl Rupp { 51172367587SKarl Rupp PetscFunctionBegin; 512*9566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU,MatGetFactor_seqaij_petsc)); 513*9566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY,MatGetFactor_seqaij_petsc)); 514*9566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU,MatGetFactor_seqaij_petsc)); 515*9566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC,MatGetFactor_seqaij_petsc)); 51672367587SKarl Rupp PetscFunctionReturn(0); 51772367587SKarl Rupp } 518