1e4a0ef16SKarl Rupp 2e4a0ef16SKarl Rupp 3e4a0ef16SKarl Rupp /* 4e4a0ef16SKarl Rupp Defines the basic matrix operations for the AIJ (compressed row) 5e4a0ef16SKarl Rupp matrix storage format. 6e4a0ef16SKarl Rupp */ 7e4a0ef16SKarl Rupp 8aaa7dc30SBarry Smith #include <petscconf.h> 999acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 10aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11aaa7dc30SBarry Smith #include <petscbt.h> 12aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h> 13af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 14e4a0ef16SKarl Rupp 15aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h> 16e4a0ef16SKarl Rupp 17e4a0ef16SKarl Rupp 18e4a0ef16SKarl Rupp #include <algorithm> 19e4a0ef16SKarl Rupp #include <vector> 20e4a0ef16SKarl Rupp #include <string> 21e4a0ef16SKarl Rupp 22e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp" 23e4a0ef16SKarl Rupp 248713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat); 2572367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*); 2672367587SKarl Rupp 278713a8baSPatrick Sanan 28e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A) 29e4a0ef16SKarl Rupp { 30e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 31e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 32e4a0ef16SKarl Rupp PetscErrorCode ierr; 33e4a0ef16SKarl Rupp 34e4a0ef16SKarl Rupp PetscFunctionBegin; 35bf1781e8SStefano Zampini if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0 36*c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 37e4a0ef16SKarl Rupp ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 38e4a0ef16SKarl Rupp 39e4a0ef16SKarl Rupp try { 40e4a0ef16SKarl Rupp if (a->compressedrow.use) { 41a3430c56SKarl Rupp if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix(); 42e4a0ef16SKarl Rupp 43a3430c56SKarl Rupp // Since PetscInt is different from cl_uint, we have to convert: 44a3430c56SKarl Rupp viennacl::backend::mem_handle dummy; 45e4a0ef16SKarl Rupp 46a3430c56SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1); 47a3430c56SKarl Rupp for (PetscInt i=0; i<=a->compressedrow.nrows; ++i) 48a3430c56SKarl Rupp row_buffer.set(i, (a->compressedrow.i)[i]); 49e4a0ef16SKarl Rupp 50a3430c56SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows); 51a3430c56SKarl Rupp for (PetscInt i=0; i<a->compressedrow.nrows; ++i) 52a3430c56SKarl Rupp row_indices.set(i, (a->compressedrow.rindex)[i]); 53a3430c56SKarl Rupp 54a3430c56SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 55a3430c56SKarl Rupp for (PetscInt i=0; i<a->nz; ++i) 56a3430c56SKarl Rupp col_buffer.set(i, (a->j)[i]); 57a3430c56SKarl Rupp 58a3430c56SKarl Rupp viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz); 594863603aSSatish Balay ierr = PetscLogCpuToGpu(((2*a->compressedrow.nrows)+1+a->nz)*sizeof(PetscInt) + (a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 60e4a0ef16SKarl Rupp } else { 61a3430c56SKarl Rupp if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix(); 62e4a0ef16SKarl Rupp 63e4a0ef16SKarl Rupp // Since PetscInt is in general different from cl_uint, we have to convert: 64e4a0ef16SKarl Rupp viennacl::backend::mem_handle dummy; 65e4a0ef16SKarl Rupp 66e4a0ef16SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 67e4a0ef16SKarl Rupp for (PetscInt i=0; i<=A->rmap->n; ++i) 68e4a0ef16SKarl Rupp row_buffer.set(i, (a->i)[i]); 69e4a0ef16SKarl Rupp 70e4a0ef16SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 71e4a0ef16SKarl Rupp for (PetscInt i=0; i<a->nz; ++i) 72e4a0ef16SKarl Rupp col_buffer.set(i, (a->j)[i]); 73e4a0ef16SKarl Rupp 74e4a0ef16SKarl Rupp viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 754863603aSSatish Balay ierr = PetscLogCpuToGpu(((A->rmap->n+1)+a->nz)*sizeof(PetscInt)+(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 76e4a0ef16SKarl Rupp } 774cf1874eSKarl Rupp ViennaCLWaitForGPU(); 784076e183SKarl Rupp } catch(std::exception const & ex) { 794076e183SKarl Rupp SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 80e4a0ef16SKarl Rupp } 81e4a0ef16SKarl Rupp 82a3430c56SKarl Rupp // Create temporary vector for v += A*x: 83a3430c56SKarl Rupp if (viennaclstruct->tempvec) { 849b66742cSDave May if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) { 85a3430c56SKarl Rupp delete (ViennaCLVector*)viennaclstruct->tempvec; 869b66742cSDave May viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n); 87a3430c56SKarl Rupp } else { 88a3430c56SKarl Rupp viennaclstruct->tempvec->clear(); 89a3430c56SKarl Rupp } 90a3430c56SKarl Rupp } else { 919b66742cSDave May viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n); 92a3430c56SKarl Rupp } 93a3430c56SKarl Rupp 94*c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 95e4a0ef16SKarl Rupp 96e4a0ef16SKarl Rupp ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 97e4a0ef16SKarl Rupp } 9867c87b7fSKarl Rupp } 99e4a0ef16SKarl Rupp PetscFunctionReturn(0); 100e4a0ef16SKarl Rupp } 101e4a0ef16SKarl Rupp 1020d73d530SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu) 103e4a0ef16SKarl Rupp { 104f38c1e66SStefano Zampini Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 105e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 106e4a0ef16SKarl Rupp PetscInt m = A->rmap->n; 107e4a0ef16SKarl Rupp PetscErrorCode ierr; 108e4a0ef16SKarl Rupp 109e4a0ef16SKarl Rupp PetscFunctionBegin; 110*c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(0); 111*c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) { 112e4a0ef16SKarl Rupp try { 1136c4ed002SBarry Smith if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 1146c4ed002SBarry Smith else { 115e4a0ef16SKarl Rupp 116e4a0ef16SKarl Rupp if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 117e4a0ef16SKarl Rupp a->nz = Agpu->nnz(); 118e4a0ef16SKarl Rupp a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 119e4a0ef16SKarl Rupp A->preallocated = PETSC_TRUE; 120e4a0ef16SKarl Rupp if (a->singlemalloc) { 121e4a0ef16SKarl Rupp if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 122e4a0ef16SKarl Rupp } else { 123e4a0ef16SKarl Rupp if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 124e4a0ef16SKarl Rupp if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 125e4a0ef16SKarl Rupp if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 126e4a0ef16SKarl Rupp } 127dcca6d9dSJed Brown ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr); 128f7daeb2aSKarl Rupp ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 129e4a0ef16SKarl Rupp 130e4a0ef16SKarl Rupp a->singlemalloc = PETSC_TRUE; 131e4a0ef16SKarl Rupp 132e4a0ef16SKarl Rupp /* Setup row lengths */ 133071fcb05SBarry Smith ierr = PetscFree(a->imax);CHKERRQ(ierr); 134071fcb05SBarry Smith ierr = PetscFree(a->ilen);CHKERRQ(ierr); 135071fcb05SBarry Smith ierr = PetscMalloc1(m,&a->imax);CHKERRQ(ierr); 136071fcb05SBarry Smith ierr = PetscMalloc1(m,&a->ilen);CHKERRQ(ierr); 137f7daeb2aSKarl Rupp ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 138e4a0ef16SKarl Rupp 139e4a0ef16SKarl Rupp /* Copy data back from GPU */ 140e4a0ef16SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 141e4a0ef16SKarl Rupp 142e4a0ef16SKarl Rupp // copy row array 143e4a0ef16SKarl Rupp viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 144e4a0ef16SKarl Rupp (a->i)[0] = row_buffer[0]; 145e4a0ef16SKarl Rupp for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 146e4a0ef16SKarl Rupp (a->i)[i+1] = row_buffer[i+1]; 147e4a0ef16SKarl Rupp a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 148e4a0ef16SKarl Rupp } 149e4a0ef16SKarl Rupp 150e4a0ef16SKarl Rupp // copy column indices 151e4a0ef16SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 152e4a0ef16SKarl Rupp viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 153e4a0ef16SKarl Rupp for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 154e4a0ef16SKarl Rupp (a->j)[i] = col_buffer[i]; 155e4a0ef16SKarl Rupp 156e4a0ef16SKarl Rupp // copy nonzero entries directly to destination (no conversion required) 157e4a0ef16SKarl Rupp viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 158e4a0ef16SKarl Rupp 1594863603aSSatish Balay ierr = PetscLogGpuToCpu(row_buffer.raw_size()+col_buffer.raw_size()+(Agpu->nnz()*sizeof(PetscScalar)));CHKERRQ(ierr); 1604cf1874eSKarl Rupp ViennaCLWaitForGPU(); 161023073b3SKarl Rupp /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */ 162e4a0ef16SKarl Rupp } 1634076e183SKarl Rupp } catch(std::exception const & ex) { 1644076e183SKarl Rupp SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 165e4a0ef16SKarl Rupp } 166*c70f7ee4SJunchao Zhang } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) { 167f38c1e66SStefano Zampini PetscFunctionReturn(0); 168f38c1e66SStefano Zampini } else { 169*c70f7ee4SJunchao Zhang if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0); 170e4a0ef16SKarl Rupp 171f38c1e66SStefano Zampini if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 172f38c1e66SStefano Zampini if (!Agpu) { 173f38c1e66SStefano Zampini viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar)*viennaclstruct->mat->nnz(), a->a); 174f38c1e66SStefano Zampini } else { 175f38c1e66SStefano Zampini viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 176f38c1e66SStefano Zampini } 177f38c1e66SStefano Zampini } 178*c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 179b8ced49eSKarl Rupp /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */ 180e4a0ef16SKarl Rupp ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 181e4a0ef16SKarl Rupp ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 182e4a0ef16SKarl Rupp PetscFunctionReturn(0); 183e4a0ef16SKarl Rupp } 184e4a0ef16SKarl Rupp 185e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 186e4a0ef16SKarl Rupp { 187e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 188e4a0ef16SKarl Rupp PetscErrorCode ierr; 189e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 1900d73d530SKarl Rupp const ViennaCLVector *xgpu=NULL; 1910d73d530SKarl Rupp ViennaCLVector *ygpu=NULL; 192e4a0ef16SKarl Rupp 193e4a0ef16SKarl Rupp PetscFunctionBegin; 194bf1781e8SStefano Zampini if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { 195e4a0ef16SKarl Rupp ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 196e4a0ef16SKarl Rupp ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 1977a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 198e4a0ef16SKarl Rupp try { 199b7832b47SStefano Zampini if (a->compressedrow.use) { 200b7832b47SStefano Zampini *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu); 201b7832b47SStefano Zampini } else { 202e4a0ef16SKarl Rupp *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 203b7832b47SStefano Zampini } 2044cf1874eSKarl Rupp ViennaCLWaitForGPU(); 2054076e183SKarl Rupp } catch (std::exception const & ex) { 2064076e183SKarl Rupp SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 207e4a0ef16SKarl Rupp } 208958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 209e4a0ef16SKarl Rupp ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 210e4a0ef16SKarl Rupp ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 211958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 212bf1781e8SStefano Zampini } else { 213bf1781e8SStefano Zampini ierr = VecSet(yy,0);CHKERRQ(ierr); 21467c87b7fSKarl Rupp } 215e4a0ef16SKarl Rupp PetscFunctionReturn(0); 216e4a0ef16SKarl Rupp } 217e4a0ef16SKarl Rupp 218e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 219e4a0ef16SKarl Rupp { 220e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 221e4a0ef16SKarl Rupp PetscErrorCode ierr; 222e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 2230d73d530SKarl Rupp const ViennaCLVector *xgpu=NULL,*ygpu=NULL; 2240d73d530SKarl Rupp ViennaCLVector *zgpu=NULL; 225e4a0ef16SKarl Rupp 226e4a0ef16SKarl Rupp PetscFunctionBegin; 227bf1781e8SStefano Zampini if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { 228e4a0ef16SKarl Rupp try { 229e4a0ef16SKarl Rupp ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 230e4a0ef16SKarl Rupp ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 231e4a0ef16SKarl Rupp ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 2327a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 233f38c1e66SStefano Zampini if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu); 234f38c1e66SStefano Zampini else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 235f38c1e66SStefano Zampini if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec; 236f38c1e66SStefano Zampini else *zgpu += *viennaclstruct->tempvec; 2374cf1874eSKarl Rupp ViennaCLWaitForGPU(); 238958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 239e4a0ef16SKarl Rupp ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 240e4a0ef16SKarl Rupp ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 241e4a0ef16SKarl Rupp ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 242e4a0ef16SKarl Rupp 2434076e183SKarl Rupp } catch(std::exception const & ex) { 2444076e183SKarl Rupp SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 245e4a0ef16SKarl Rupp } 246958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 247bf1781e8SStefano Zampini } else { 248bf1781e8SStefano Zampini ierr = VecCopy(yy,zz);CHKERRQ(ierr); 24967c87b7fSKarl Rupp } 250e4a0ef16SKarl Rupp PetscFunctionReturn(0); 251e4a0ef16SKarl Rupp } 252e4a0ef16SKarl Rupp 253e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 254e4a0ef16SKarl Rupp { 255e4a0ef16SKarl Rupp PetscErrorCode ierr; 256e4a0ef16SKarl Rupp 257e4a0ef16SKarl Rupp PetscFunctionBegin; 258e4a0ef16SKarl Rupp ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 259489de41dSStefano Zampini if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 260e7e92044SBarry Smith if (!A->pinnedtocpu) { 261e4a0ef16SKarl Rupp ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 262e7e92044SBarry Smith } 263e4a0ef16SKarl Rupp PetscFunctionReturn(0); 264e4a0ef16SKarl Rupp } 265e4a0ef16SKarl Rupp 266e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/ 267e4a0ef16SKarl Rupp /*@ 268e4a0ef16SKarl Rupp MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 26919fddfadSKarl Rupp (the default parallel PETSc format). This matrix will ultimately be pushed down 270e4a0ef16SKarl Rupp to GPUs and use the ViennaCL library for calculations. For good matrix 271e4a0ef16SKarl Rupp assembly performance the user should preallocate the matrix storage by setting 272e4a0ef16SKarl Rupp the parameter nz (or the array nnz). By setting these parameters accurately, 273e4a0ef16SKarl Rupp performance during matrix assembly can be increased substantially. 274e4a0ef16SKarl Rupp 275e4a0ef16SKarl Rupp 276d083f849SBarry Smith Collective 277e4a0ef16SKarl Rupp 278e4a0ef16SKarl Rupp Input Parameters: 279e4a0ef16SKarl Rupp + comm - MPI communicator, set to PETSC_COMM_SELF 280e4a0ef16SKarl Rupp . m - number of rows 281e4a0ef16SKarl Rupp . n - number of columns 282e4a0ef16SKarl Rupp . nz - number of nonzeros per row (same for all rows) 283e4a0ef16SKarl Rupp - nnz - array containing the number of nonzeros in the various rows 284e4a0ef16SKarl Rupp (possibly different for each row) or NULL 285e4a0ef16SKarl Rupp 286e4a0ef16SKarl Rupp Output Parameter: 287e4a0ef16SKarl Rupp . A - the matrix 288e4a0ef16SKarl Rupp 289e4a0ef16SKarl Rupp It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 290e4a0ef16SKarl Rupp MatXXXXSetPreallocation() paradigm instead of this routine directly. 291e4a0ef16SKarl Rupp [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 292e4a0ef16SKarl Rupp 293e4a0ef16SKarl Rupp Notes: 294e4a0ef16SKarl Rupp If nnz is given then nz is ignored 295e4a0ef16SKarl Rupp 296e4a0ef16SKarl Rupp The AIJ format (also called the Yale sparse matrix format or 297e4a0ef16SKarl Rupp compressed row storage), is fully compatible with standard Fortran 77 298e4a0ef16SKarl Rupp storage. That is, the stored row and column indices can begin at 299e4a0ef16SKarl Rupp either one (as in Fortran) or zero. See the users' manual for details. 300e4a0ef16SKarl Rupp 301e4a0ef16SKarl Rupp Specify the preallocated storage with either nz or nnz (not both). 302e4a0ef16SKarl Rupp Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 303e4a0ef16SKarl Rupp allocation. For large problems you MUST preallocate memory or you 304e4a0ef16SKarl Rupp will get TERRIBLE performance, see the users' manual chapter on matrices. 305e4a0ef16SKarl Rupp 306e4a0ef16SKarl Rupp Level: intermediate 307e4a0ef16SKarl Rupp 308e9e886b6SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSPARSE(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 309e4a0ef16SKarl Rupp 310e4a0ef16SKarl Rupp @*/ 311e4a0ef16SKarl Rupp PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 312e4a0ef16SKarl Rupp { 313e4a0ef16SKarl Rupp PetscErrorCode ierr; 314e4a0ef16SKarl Rupp 315e4a0ef16SKarl Rupp PetscFunctionBegin; 316e4a0ef16SKarl Rupp ierr = MatCreate(comm,A);CHKERRQ(ierr); 317e4a0ef16SKarl Rupp ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 318e4a0ef16SKarl Rupp ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 319e4a0ef16SKarl Rupp ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 320e4a0ef16SKarl Rupp PetscFunctionReturn(0); 321e4a0ef16SKarl Rupp } 322e4a0ef16SKarl Rupp 323e4a0ef16SKarl Rupp 324e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 325e4a0ef16SKarl Rupp { 326e4a0ef16SKarl Rupp PetscErrorCode ierr; 327e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 328e4a0ef16SKarl Rupp 329e4a0ef16SKarl Rupp PetscFunctionBegin; 330e4a0ef16SKarl Rupp try { 3316447cd05SKarl Rupp if (viennaclcontainer) { 3326447cd05SKarl Rupp delete viennaclcontainer->tempvec; 3336447cd05SKarl Rupp delete viennaclcontainer->mat; 3346447cd05SKarl Rupp delete viennaclcontainer->compressed_mat; 335e4a0ef16SKarl Rupp delete viennaclcontainer; 3366447cd05SKarl Rupp } 337*c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3384076e183SKarl Rupp } catch(std::exception const & ex) { 3394076e183SKarl Rupp SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what()); 340e4a0ef16SKarl Rupp } 3418713a8baSPatrick Sanan 3428713a8baSPatrick Sanan ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL);CHKERRQ(ierr); 3438713a8baSPatrick Sanan 344e4a0ef16SKarl Rupp /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 345e4a0ef16SKarl Rupp A->spptr = 0; 346e4a0ef16SKarl Rupp ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 347e4a0ef16SKarl Rupp PetscFunctionReturn(0); 348e4a0ef16SKarl Rupp } 349e4a0ef16SKarl Rupp 350e4a0ef16SKarl Rupp 351e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 352e4a0ef16SKarl Rupp { 353e4a0ef16SKarl Rupp PetscErrorCode ierr; 354e4a0ef16SKarl Rupp 355e4a0ef16SKarl Rupp PetscFunctionBegin; 356e4a0ef16SKarl Rupp ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3578713a8baSPatrick Sanan ierr = MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B); 3588713a8baSPatrick Sanan PetscFunctionReturn(0); 3598713a8baSPatrick Sanan } 3608713a8baSPatrick Sanan 361e7e92044SBarry Smith static PetscErrorCode MatPinToCPU_SeqAIJViennaCL(Mat,PetscBool); 362c3cca76eSKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A,MatDuplicateOption cpvalues,Mat *B) 363c3cca76eSKarl Rupp { 364c3cca76eSKarl Rupp PetscErrorCode ierr; 365c3cca76eSKarl Rupp Mat C; 366c3cca76eSKarl Rupp 367c3cca76eSKarl Rupp PetscFunctionBegin; 368c3cca76eSKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 369c3cca76eSKarl Rupp C = *B; 370c3cca76eSKarl Rupp 371e7e92044SBarry Smith ierr = MatPinToCPU_SeqAIJViennaCL(A,PETSC_FALSE);CHKERRQ(ierr); 372e7e92044SBarry Smith C->ops->pintocpu = MatPinToCPU_SeqAIJViennaCL; 373c3cca76eSKarl Rupp 374c3cca76eSKarl Rupp C->spptr = new Mat_SeqAIJViennaCL(); 375c3cca76eSKarl Rupp ((Mat_SeqAIJViennaCL*)C->spptr)->tempvec = NULL; 376c3cca76eSKarl Rupp ((Mat_SeqAIJViennaCL*)C->spptr)->mat = NULL; 377c3cca76eSKarl Rupp ((Mat_SeqAIJViennaCL*)C->spptr)->compressed_mat = NULL; 378c3cca76eSKarl Rupp 379c3cca76eSKarl Rupp ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJVIENNACL);CHKERRQ(ierr); 380c3cca76eSKarl Rupp 381*c70f7ee4SJunchao Zhang C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 382c3cca76eSKarl Rupp 383c3cca76eSKarl Rupp /* If the source matrix is already assembled, copy the destination matrix to the GPU */ 384c3cca76eSKarl Rupp if (C->assembled) { 385c3cca76eSKarl Rupp ierr = MatViennaCLCopyToGPU(C);CHKERRQ(ierr); 386c3cca76eSKarl Rupp } 387c3cca76eSKarl Rupp 388c3cca76eSKarl Rupp PetscFunctionReturn(0); 389c3cca76eSKarl Rupp } 390c3cca76eSKarl Rupp 391f38c1e66SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A,PetscScalar *array[]) 392f38c1e66SStefano Zampini { 393f38c1e66SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 394f38c1e66SStefano Zampini PetscErrorCode ierr; 395f38c1e66SStefano Zampini 396f38c1e66SStefano Zampini PetscFunctionBegin; 397f38c1e66SStefano Zampini ierr = MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL);CHKERRQ(ierr); 398f38c1e66SStefano Zampini *array = a->a; 399*c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 400f38c1e66SStefano Zampini PetscFunctionReturn(0); 401f38c1e66SStefano Zampini } 402f38c1e66SStefano Zampini 403f38c1e66SStefano Zampini 404f38c1e66SStefano Zampini static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A,PetscScalar *array[]) 405f38c1e66SStefano Zampini { 406f38c1e66SStefano Zampini PetscFunctionBegin; 407f38c1e66SStefano Zampini *array = NULL; 408f38c1e66SStefano Zampini PetscFunctionReturn(0); 409f38c1e66SStefano Zampini } 410f38c1e66SStefano Zampini 411e7e92044SBarry Smith static PetscErrorCode MatPinToCPU_SeqAIJViennaCL(Mat A,PetscBool flg) 412e7e92044SBarry Smith { 413f38c1e66SStefano Zampini PetscErrorCode ierr; 414f38c1e66SStefano Zampini 415e7e92044SBarry Smith PetscFunctionBegin; 416e7e92044SBarry Smith A->pinnedtocpu = flg; 417e7e92044SBarry Smith if (flg) { 418f38c1e66SStefano Zampini /* make sure we have an up-to-date copy on the CPU */ 419f38c1e66SStefano Zampini ierr = MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL);CHKERRQ(ierr); 420f38c1e66SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 421f38c1e66SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJRestoreArray_C",MatSeqAIJRestoreArray_SeqAIJ);CHKERRQ(ierr); 422f38c1e66SStefano Zampini 423e7e92044SBarry Smith A->ops->mult = MatMult_SeqAIJ; 424e7e92044SBarry Smith A->ops->multadd = MatMultAdd_SeqAIJ; 425e7e92044SBarry Smith A->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 426e7e92044SBarry Smith A->ops->duplicate = MatDuplicate_SeqAIJ; 427e7e92044SBarry Smith } else { 428f38c1e66SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJViennaCL);CHKERRQ(ierr); 429f38c1e66SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJRestoreArray_C",MatSeqAIJRestoreArray_SeqAIJViennaCL);CHKERRQ(ierr); 430f38c1e66SStefano Zampini 431e7e92044SBarry Smith A->ops->mult = MatMult_SeqAIJViennaCL; 432e7e92044SBarry Smith A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 433e7e92044SBarry Smith A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 434e7e92044SBarry Smith A->ops->destroy = MatDestroy_SeqAIJViennaCL; 435e7e92044SBarry Smith A->ops->duplicate = MatDuplicate_SeqAIJViennaCL; 436e7e92044SBarry Smith } 437e7e92044SBarry Smith PetscFunctionReturn(0); 438e7e92044SBarry Smith } 439e7e92044SBarry Smith 4408713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat) 4418713a8baSPatrick Sanan { 4428713a8baSPatrick Sanan PetscErrorCode ierr; 4438713a8baSPatrick Sanan Mat B; 4448713a8baSPatrick Sanan Mat_SeqAIJ *aij; 4458713a8baSPatrick Sanan 4468713a8baSPatrick Sanan PetscFunctionBegin; 4478713a8baSPatrick Sanan 4488713a8baSPatrick Sanan if (reuse == MAT_REUSE_MATRIX) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead"); 4498713a8baSPatrick Sanan 4508713a8baSPatrick Sanan if (reuse == MAT_INITIAL_MATRIX) { 4518713a8baSPatrick Sanan ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 4528713a8baSPatrick Sanan } 4538713a8baSPatrick Sanan 4548713a8baSPatrick Sanan B = *newmat; 4558713a8baSPatrick Sanan 456e4a0ef16SKarl Rupp aij = (Mat_SeqAIJ*)B->data; 457e4a0ef16SKarl Rupp aij->inode.use = PETSC_FALSE; 4588713a8baSPatrick Sanan 459e4a0ef16SKarl Rupp B->spptr = new Mat_SeqAIJViennaCL(); 460e4a0ef16SKarl Rupp 461a3430c56SKarl Rupp ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec = NULL; 462a3430c56SKarl Rupp ((Mat_SeqAIJViennaCL*)B->spptr)->mat = NULL; 463a3430c56SKarl Rupp ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL; 464e4a0ef16SKarl Rupp 465e7e92044SBarry Smith ierr = MatPinToCPU_SeqAIJViennaCL(A,PETSC_FALSE);CHKERRQ(ierr); 466e7e92044SBarry Smith A->ops->pintocpu = MatPinToCPU_SeqAIJViennaCL; 467e4a0ef16SKarl Rupp 468e4a0ef16SKarl Rupp ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 46934136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 47034136279SStefano Zampini ierr = PetscStrallocpy(VECVIENNACL,&B->defaultvectype);CHKERRQ(ierr); 471e4a0ef16SKarl Rupp 4728713a8baSPatrick Sanan ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL);CHKERRQ(ierr); 4738713a8baSPatrick Sanan 474*c70f7ee4SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 4758713a8baSPatrick Sanan 4768713a8baSPatrick Sanan /* If the source matrix is already assembled, copy the destination matrix to the GPU */ 4778713a8baSPatrick Sanan if (B->assembled) { 4788713a8baSPatrick Sanan ierr = MatViennaCLCopyToGPU(B);CHKERRQ(ierr); 4798713a8baSPatrick Sanan } 4808713a8baSPatrick Sanan 481e4a0ef16SKarl Rupp PetscFunctionReturn(0); 482e4a0ef16SKarl Rupp } 483e4a0ef16SKarl Rupp 484e4a0ef16SKarl Rupp 4853ca39a21SBarry Smith /*MC 486e4a0ef16SKarl Rupp MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 487e4a0ef16SKarl Rupp 488e4a0ef16SKarl Rupp A matrix type type whose data resides on GPUs. These matrices are in CSR format by 489e4a0ef16SKarl Rupp default. All matrix calculations are performed using the ViennaCL library. 490e4a0ef16SKarl Rupp 491e4a0ef16SKarl Rupp Options Database Keys: 492e4a0ef16SKarl Rupp + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 493e4a0ef16SKarl Rupp . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 494e4a0ef16SKarl Rupp - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 495e4a0ef16SKarl Rupp 496e4a0ef16SKarl Rupp Level: beginner 497e4a0ef16SKarl Rupp 498e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 499e4a0ef16SKarl Rupp M*/ 500e4a0ef16SKarl Rupp 50172367587SKarl Rupp 5023ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void) 50372367587SKarl Rupp { 50472367587SKarl Rupp PetscErrorCode ierr; 50572367587SKarl Rupp 50672367587SKarl Rupp PetscFunctionBegin; 5073ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr); 5083ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY,MatGetFactor_seqaij_petsc);CHKERRQ(ierr); 5093ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr); 5103ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC,MatGetFactor_seqaij_petsc);CHKERRQ(ierr); 51172367587SKarl Rupp PetscFunctionReturn(0); 51272367587SKarl Rupp } 513