1*e4a0ef16SKarl Rupp 2*e4a0ef16SKarl Rupp 3*e4a0ef16SKarl Rupp /* 4*e4a0ef16SKarl Rupp Defines the basic matrix operations for the AIJ (compressed row) 5*e4a0ef16SKarl Rupp matrix storage format. 6*e4a0ef16SKarl Rupp */ 7*e4a0ef16SKarl Rupp 8*e4a0ef16SKarl Rupp #include "petscconf.h" 9*e4a0ef16SKarl Rupp #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 10*e4a0ef16SKarl Rupp #include "petscbt.h" 11*e4a0ef16SKarl Rupp #include "../src/vec/vec/impls/dvecimpl.h" 12*e4a0ef16SKarl Rupp #include "petsc-private/vecimpl.h" 13*e4a0ef16SKarl Rupp 14*e4a0ef16SKarl Rupp #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h" 15*e4a0ef16SKarl Rupp 16*e4a0ef16SKarl Rupp 17*e4a0ef16SKarl Rupp #include <algorithm> 18*e4a0ef16SKarl Rupp #include <vector> 19*e4a0ef16SKarl Rupp #include <string> 20*e4a0ef16SKarl Rupp 21*e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp" 22*e4a0ef16SKarl Rupp 23*e4a0ef16SKarl Rupp #undef __FUNCT__ 24*e4a0ef16SKarl Rupp #define __FUNCT__ "MatViennaCLCopyToGPU" 25*e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A) 26*e4a0ef16SKarl Rupp { 27*e4a0ef16SKarl Rupp 28*e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 29*e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30*e4a0ef16SKarl Rupp //PetscInt m = A->rmap->n,*ii,*ridx; 31*e4a0ef16SKarl Rupp PetscInt *ii; 32*e4a0ef16SKarl Rupp PetscErrorCode ierr; 33*e4a0ef16SKarl Rupp 34*e4a0ef16SKarl Rupp 35*e4a0ef16SKarl Rupp PetscFunctionBegin; 36*e4a0ef16SKarl Rupp if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) { 37*e4a0ef16SKarl Rupp ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 38*e4a0ef16SKarl Rupp 39*e4a0ef16SKarl Rupp try { 40*e4a0ef16SKarl Rupp //viennaclstruct->nonzerorow=0; 41*e4a0ef16SKarl Rupp //for (PetscInt j = 0; j<m; j++) viennaclstruct->nonzerorow += (a->i[j+1] > a->i[j]); 42*e4a0ef16SKarl Rupp 43*e4a0ef16SKarl Rupp viennaclstruct->mat = new ViennaCLAIJMatrix(); 44*e4a0ef16SKarl Rupp if (a->compressedrow.use) { 45*e4a0ef16SKarl Rupp //m = a->compressedrow.nrows; 46*e4a0ef16SKarl Rupp ii = a->compressedrow.i; 47*e4a0ef16SKarl Rupp //ridx = a->compressedrow.rindex; 48*e4a0ef16SKarl Rupp 49*e4a0ef16SKarl Rupp viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz); 50*e4a0ef16SKarl Rupp 51*e4a0ef16SKarl Rupp // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications) 52*e4a0ef16SKarl Rupp 53*e4a0ef16SKarl Rupp SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 54*e4a0ef16SKarl Rupp } else { 55*e4a0ef16SKarl Rupp 56*e4a0ef16SKarl Rupp // Since PetscInt is in general different from cl_uint, we have to convert: 57*e4a0ef16SKarl Rupp viennacl::backend::mem_handle dummy; 58*e4a0ef16SKarl Rupp 59*e4a0ef16SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1); 60*e4a0ef16SKarl Rupp for (PetscInt i=0; i<=A->rmap->n; ++i) 61*e4a0ef16SKarl Rupp row_buffer.set(i, (a->i)[i]); 62*e4a0ef16SKarl Rupp 63*e4a0ef16SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz); 64*e4a0ef16SKarl Rupp for (PetscInt i=0; i<a->nz; ++i) 65*e4a0ef16SKarl Rupp col_buffer.set(i, (a->j)[i]); 66*e4a0ef16SKarl Rupp 67*e4a0ef16SKarl Rupp viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 68*e4a0ef16SKarl Rupp } 69*e4a0ef16SKarl Rupp } catch(char *ex) { 70*e4a0ef16SKarl Rupp SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex); 71*e4a0ef16SKarl Rupp } 72*e4a0ef16SKarl Rupp 73*e4a0ef16SKarl Rupp A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 74*e4a0ef16SKarl Rupp 75*e4a0ef16SKarl Rupp ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr); 76*e4a0ef16SKarl Rupp } 77*e4a0ef16SKarl Rupp PetscFunctionReturn(0); 78*e4a0ef16SKarl Rupp } 79*e4a0ef16SKarl Rupp 80*e4a0ef16SKarl Rupp #undef __FUNCT__ 81*e4a0ef16SKarl Rupp #define __FUNCT__ "MatViennaCLCopyFromGPU" 82*e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, ViennaCLAIJMatrix *Agpu) 83*e4a0ef16SKarl Rupp { 84*e4a0ef16SKarl Rupp //Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 85*e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 86*e4a0ef16SKarl Rupp PetscInt m = A->rmap->n; 87*e4a0ef16SKarl Rupp PetscErrorCode ierr; 88*e4a0ef16SKarl Rupp 89*e4a0ef16SKarl Rupp 90*e4a0ef16SKarl Rupp PetscFunctionBegin; 91*e4a0ef16SKarl Rupp if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) { 92*e4a0ef16SKarl Rupp try { 93*e4a0ef16SKarl Rupp if (a->compressedrow.use) { 94*e4a0ef16SKarl Rupp SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 95*e4a0ef16SKarl Rupp } else { 96*e4a0ef16SKarl Rupp 97*e4a0ef16SKarl Rupp if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m); 98*e4a0ef16SKarl Rupp a->nz = Agpu->nnz(); 99*e4a0ef16SKarl Rupp a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 100*e4a0ef16SKarl Rupp A->preallocated = PETSC_TRUE; 101*e4a0ef16SKarl Rupp if (a->singlemalloc) { 102*e4a0ef16SKarl Rupp if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 103*e4a0ef16SKarl Rupp } else { 104*e4a0ef16SKarl Rupp if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 105*e4a0ef16SKarl Rupp if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 106*e4a0ef16SKarl Rupp if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 107*e4a0ef16SKarl Rupp } 108*e4a0ef16SKarl Rupp ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 109*e4a0ef16SKarl Rupp ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 110*e4a0ef16SKarl Rupp 111*e4a0ef16SKarl Rupp a->singlemalloc = PETSC_TRUE; 112*e4a0ef16SKarl Rupp 113*e4a0ef16SKarl Rupp /* Setup row lengths */ 114*e4a0ef16SKarl Rupp if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 115*e4a0ef16SKarl Rupp ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 116*e4a0ef16SKarl Rupp ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 117*e4a0ef16SKarl Rupp 118*e4a0ef16SKarl Rupp /* Copy data back from GPU */ 119*e4a0ef16SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 120*e4a0ef16SKarl Rupp 121*e4a0ef16SKarl Rupp // copy row array 122*e4a0ef16SKarl Rupp viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 123*e4a0ef16SKarl Rupp (a->i)[0] = row_buffer[0]; 124*e4a0ef16SKarl Rupp for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 125*e4a0ef16SKarl Rupp (a->i)[i+1] = row_buffer[i+1]; 126*e4a0ef16SKarl Rupp a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 127*e4a0ef16SKarl Rupp } 128*e4a0ef16SKarl Rupp 129*e4a0ef16SKarl Rupp // copy column indices 130*e4a0ef16SKarl Rupp viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 131*e4a0ef16SKarl Rupp viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 132*e4a0ef16SKarl Rupp for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i) 133*e4a0ef16SKarl Rupp (a->j)[i] = col_buffer[i]; 134*e4a0ef16SKarl Rupp 135*e4a0ef16SKarl Rupp // copy nonzero entries directly to destination (no conversion required) 136*e4a0ef16SKarl Rupp viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a); 137*e4a0ef16SKarl Rupp 138*e4a0ef16SKarl Rupp /* TODO: What about a->diag? */ 139*e4a0ef16SKarl Rupp } 140*e4a0ef16SKarl Rupp } catch(char *ex) { 141*e4a0ef16SKarl Rupp SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex); 142*e4a0ef16SKarl Rupp } 143*e4a0ef16SKarl Rupp 144*e4a0ef16SKarl Rupp /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */ 145*e4a0ef16SKarl Rupp ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 146*e4a0ef16SKarl Rupp ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 147*e4a0ef16SKarl Rupp 148*e4a0ef16SKarl Rupp A->valid_GPU_matrix = PETSC_VIENNACL_BOTH; 149*e4a0ef16SKarl Rupp } else { 150*e4a0ef16SKarl Rupp SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices"); 151*e4a0ef16SKarl Rupp } 152*e4a0ef16SKarl Rupp PetscFunctionReturn(0); 153*e4a0ef16SKarl Rupp } 154*e4a0ef16SKarl Rupp 155*e4a0ef16SKarl Rupp #undef __FUNCT__ 156*e4a0ef16SKarl Rupp #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL" 157*e4a0ef16SKarl Rupp PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left) 158*e4a0ef16SKarl Rupp { 159*e4a0ef16SKarl Rupp PetscErrorCode ierr; 160*e4a0ef16SKarl Rupp 161*e4a0ef16SKarl Rupp PetscFunctionBegin; 162*e4a0ef16SKarl Rupp if (right) { 163*e4a0ef16SKarl Rupp ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 164*e4a0ef16SKarl Rupp ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 165*e4a0ef16SKarl Rupp ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 166*e4a0ef16SKarl Rupp ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr); 167*e4a0ef16SKarl Rupp ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 168*e4a0ef16SKarl Rupp } 169*e4a0ef16SKarl Rupp if (left) { 170*e4a0ef16SKarl Rupp ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 171*e4a0ef16SKarl Rupp ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 172*e4a0ef16SKarl Rupp ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 173*e4a0ef16SKarl Rupp ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr); 174*e4a0ef16SKarl Rupp ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 175*e4a0ef16SKarl Rupp } 176*e4a0ef16SKarl Rupp PetscFunctionReturn(0); 177*e4a0ef16SKarl Rupp } 178*e4a0ef16SKarl Rupp 179*e4a0ef16SKarl Rupp #undef __FUNCT__ 180*e4a0ef16SKarl Rupp #define __FUNCT__ "MatMult_SeqAIJViennaCL" 181*e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy) 182*e4a0ef16SKarl Rupp { 183*e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 184*e4a0ef16SKarl Rupp PetscErrorCode ierr; 185*e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 186*e4a0ef16SKarl Rupp ViennaCLVector *xgpu=NULL,*ygpu=NULL; 187*e4a0ef16SKarl Rupp 188*e4a0ef16SKarl Rupp PetscFunctionBegin; 189*e4a0ef16SKarl Rupp ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 190*e4a0ef16SKarl Rupp ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr); 191*e4a0ef16SKarl Rupp try { 192*e4a0ef16SKarl Rupp *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu); 193*e4a0ef16SKarl Rupp } catch (char *ex) { 194*e4a0ef16SKarl Rupp SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex); 195*e4a0ef16SKarl Rupp } 196*e4a0ef16SKarl Rupp ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 197*e4a0ef16SKarl Rupp ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr); 198*e4a0ef16SKarl Rupp ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr); 199*e4a0ef16SKarl Rupp PetscFunctionReturn(0); 200*e4a0ef16SKarl Rupp } 201*e4a0ef16SKarl Rupp 202*e4a0ef16SKarl Rupp 203*e4a0ef16SKarl Rupp 204*e4a0ef16SKarl Rupp #undef __FUNCT__ 205*e4a0ef16SKarl Rupp #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL" 206*e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz) 207*e4a0ef16SKarl Rupp { 208*e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 209*e4a0ef16SKarl Rupp PetscErrorCode ierr; 210*e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr; 211*e4a0ef16SKarl Rupp ViennaCLVector *xgpu=NULL,*ygpu=NULL,*zgpu=NULL; 212*e4a0ef16SKarl Rupp 213*e4a0ef16SKarl Rupp PetscFunctionBegin; 214*e4a0ef16SKarl Rupp try { 215*e4a0ef16SKarl Rupp ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr); 216*e4a0ef16SKarl Rupp ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr); 217*e4a0ef16SKarl Rupp ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr); 218*e4a0ef16SKarl Rupp 219*e4a0ef16SKarl Rupp if (a->compressedrow.use) { 220*e4a0ef16SKarl Rupp SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported"); 221*e4a0ef16SKarl Rupp } else { 222*e4a0ef16SKarl Rupp if (zz == xx || zz == yy) { //temporary required 223*e4a0ef16SKarl Rupp ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 224*e4a0ef16SKarl Rupp *zgpu = *ygpu + temp; 225*e4a0ef16SKarl Rupp } else { 226*e4a0ef16SKarl Rupp *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 227*e4a0ef16SKarl Rupp *zgpu += *ygpu; 228*e4a0ef16SKarl Rupp } 229*e4a0ef16SKarl Rupp } 230*e4a0ef16SKarl Rupp 231*e4a0ef16SKarl Rupp ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr); 232*e4a0ef16SKarl Rupp ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr); 233*e4a0ef16SKarl Rupp ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr); 234*e4a0ef16SKarl Rupp 235*e4a0ef16SKarl Rupp } catch(char *ex) { 236*e4a0ef16SKarl Rupp SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex); 237*e4a0ef16SKarl Rupp } 238*e4a0ef16SKarl Rupp ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 239*e4a0ef16SKarl Rupp PetscFunctionReturn(0); 240*e4a0ef16SKarl Rupp } 241*e4a0ef16SKarl Rupp 242*e4a0ef16SKarl Rupp #undef __FUNCT__ 243*e4a0ef16SKarl Rupp #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL" 244*e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode) 245*e4a0ef16SKarl Rupp { 246*e4a0ef16SKarl Rupp PetscErrorCode ierr; 247*e4a0ef16SKarl Rupp 248*e4a0ef16SKarl Rupp PetscFunctionBegin; 249*e4a0ef16SKarl Rupp ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 250*e4a0ef16SKarl Rupp ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr); 251*e4a0ef16SKarl Rupp if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 252*e4a0ef16SKarl Rupp A->ops->mult = MatMult_SeqAIJViennaCL; 253*e4a0ef16SKarl Rupp A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 254*e4a0ef16SKarl Rupp PetscFunctionReturn(0); 255*e4a0ef16SKarl Rupp } 256*e4a0ef16SKarl Rupp 257*e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/ 258*e4a0ef16SKarl Rupp #undef __FUNCT__ 259*e4a0ef16SKarl Rupp #define __FUNCT__ "MatCreateSeqAIJViennaCL" 260*e4a0ef16SKarl Rupp /*@ 261*e4a0ef16SKarl Rupp MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format 262*e4a0ef16SKarl Rupp (the default parallel PETSc format). This matrix will ultimately pushed down 263*e4a0ef16SKarl Rupp to GPUs and use the ViennaCL library for calculations. For good matrix 264*e4a0ef16SKarl Rupp assembly performance the user should preallocate the matrix storage by setting 265*e4a0ef16SKarl Rupp the parameter nz (or the array nnz). By setting these parameters accurately, 266*e4a0ef16SKarl Rupp performance during matrix assembly can be increased substantially. 267*e4a0ef16SKarl Rupp 268*e4a0ef16SKarl Rupp 269*e4a0ef16SKarl Rupp Collective on MPI_Comm 270*e4a0ef16SKarl Rupp 271*e4a0ef16SKarl Rupp Input Parameters: 272*e4a0ef16SKarl Rupp + comm - MPI communicator, set to PETSC_COMM_SELF 273*e4a0ef16SKarl Rupp . m - number of rows 274*e4a0ef16SKarl Rupp . n - number of columns 275*e4a0ef16SKarl Rupp . nz - number of nonzeros per row (same for all rows) 276*e4a0ef16SKarl Rupp - nnz - array containing the number of nonzeros in the various rows 277*e4a0ef16SKarl Rupp (possibly different for each row) or NULL 278*e4a0ef16SKarl Rupp 279*e4a0ef16SKarl Rupp Output Parameter: 280*e4a0ef16SKarl Rupp . A - the matrix 281*e4a0ef16SKarl Rupp 282*e4a0ef16SKarl Rupp It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 283*e4a0ef16SKarl Rupp MatXXXXSetPreallocation() paradigm instead of this routine directly. 284*e4a0ef16SKarl Rupp [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 285*e4a0ef16SKarl Rupp 286*e4a0ef16SKarl Rupp Notes: 287*e4a0ef16SKarl Rupp If nnz is given then nz is ignored 288*e4a0ef16SKarl Rupp 289*e4a0ef16SKarl Rupp The AIJ format (also called the Yale sparse matrix format or 290*e4a0ef16SKarl Rupp compressed row storage), is fully compatible with standard Fortran 77 291*e4a0ef16SKarl Rupp storage. That is, the stored row and column indices can begin at 292*e4a0ef16SKarl Rupp either one (as in Fortran) or zero. See the users' manual for details. 293*e4a0ef16SKarl Rupp 294*e4a0ef16SKarl Rupp Specify the preallocated storage with either nz or nnz (not both). 295*e4a0ef16SKarl Rupp Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 296*e4a0ef16SKarl Rupp allocation. For large problems you MUST preallocate memory or you 297*e4a0ef16SKarl Rupp will get TERRIBLE performance, see the users' manual chapter on matrices. 298*e4a0ef16SKarl Rupp 299*e4a0ef16SKarl Rupp By default, this format uses inodes (identical nodes) when possible, to 300*e4a0ef16SKarl Rupp improve numerical efficiency of matrix-vector products and solves. We 301*e4a0ef16SKarl Rupp search for consecutive rows with the same nonzero structure, thereby 302*e4a0ef16SKarl Rupp reusing matrix information to achieve increased efficiency. 303*e4a0ef16SKarl Rupp 304*e4a0ef16SKarl Rupp Level: intermediate 305*e4a0ef16SKarl Rupp 306*e4a0ef16SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 307*e4a0ef16SKarl Rupp 308*e4a0ef16SKarl Rupp @*/ 309*e4a0ef16SKarl Rupp PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 310*e4a0ef16SKarl Rupp { 311*e4a0ef16SKarl Rupp PetscErrorCode ierr; 312*e4a0ef16SKarl Rupp 313*e4a0ef16SKarl Rupp PetscFunctionBegin; 314*e4a0ef16SKarl Rupp ierr = MatCreate(comm,A);CHKERRQ(ierr); 315*e4a0ef16SKarl Rupp ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 316*e4a0ef16SKarl Rupp ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr); 317*e4a0ef16SKarl Rupp ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 318*e4a0ef16SKarl Rupp PetscFunctionReturn(0); 319*e4a0ef16SKarl Rupp } 320*e4a0ef16SKarl Rupp 321*e4a0ef16SKarl Rupp 322*e4a0ef16SKarl Rupp #undef __FUNCT__ 323*e4a0ef16SKarl Rupp #define __FUNCT__ "MatDestroy_SeqAIJViennaCL" 324*e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 325*e4a0ef16SKarl Rupp { 326*e4a0ef16SKarl Rupp PetscErrorCode ierr; 327*e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr; 328*e4a0ef16SKarl Rupp 329*e4a0ef16SKarl Rupp PetscFunctionBegin; 330*e4a0ef16SKarl Rupp try { 331*e4a0ef16SKarl Rupp if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat); 332*e4a0ef16SKarl Rupp delete viennaclcontainer; 333*e4a0ef16SKarl Rupp A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 334*e4a0ef16SKarl Rupp } catch(char *ex) { 335*e4a0ef16SKarl Rupp SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex); 336*e4a0ef16SKarl Rupp } 337*e4a0ef16SKarl Rupp /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 338*e4a0ef16SKarl Rupp A->spptr = 0; 339*e4a0ef16SKarl Rupp ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 340*e4a0ef16SKarl Rupp PetscFunctionReturn(0); 341*e4a0ef16SKarl Rupp } 342*e4a0ef16SKarl Rupp 343*e4a0ef16SKarl Rupp 344*e4a0ef16SKarl Rupp extern PetscErrorCode MatSetValuesBatch_SeqAIJViennaCL(Mat, PetscInt, PetscInt, PetscInt*,const PetscScalar*); 345*e4a0ef16SKarl Rupp 346*e4a0ef16SKarl Rupp 347*e4a0ef16SKarl Rupp #undef __FUNCT__ 348*e4a0ef16SKarl Rupp #define __FUNCT__ "MatCreate_SeqAIJViennaCL" 349*e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 350*e4a0ef16SKarl Rupp { 351*e4a0ef16SKarl Rupp PetscErrorCode ierr; 352*e4a0ef16SKarl Rupp Mat_SeqAIJ *aij; 353*e4a0ef16SKarl Rupp 354*e4a0ef16SKarl Rupp PetscFunctionBegin; 355*e4a0ef16SKarl Rupp ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 356*e4a0ef16SKarl Rupp aij = (Mat_SeqAIJ*)B->data; 357*e4a0ef16SKarl Rupp aij->inode.use = PETSC_FALSE; 358*e4a0ef16SKarl Rupp B->ops->mult = MatMult_SeqAIJViennaCL; 359*e4a0ef16SKarl Rupp B->ops->multadd = MatMultAdd_SeqAIJViennaCL; 360*e4a0ef16SKarl Rupp B->spptr = new Mat_SeqAIJViennaCL(); 361*e4a0ef16SKarl Rupp 362*e4a0ef16SKarl Rupp ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0; 363*e4a0ef16SKarl Rupp 364*e4a0ef16SKarl Rupp B->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 365*e4a0ef16SKarl Rupp B->ops->destroy = MatDestroy_SeqAIJViennaCL; 366*e4a0ef16SKarl Rupp B->ops->getvecs = MatGetVecs_SeqAIJViennaCL; 367*e4a0ef16SKarl Rupp B->ops->setvaluesbatch = MatSetValuesBatch_SeqAIJViennaCL; 368*e4a0ef16SKarl Rupp 369*e4a0ef16SKarl Rupp //ierr = PetscObjectComposeFunction((PetscObject)B,"MatViennaCLSetFormat_C", "MatViennaCLSetFormat_SeqAIJViennaCL", MatViennaCLSetFormat_SeqAIJViennaCL);CHKERRQ(ierr); 370*e4a0ef16SKarl Rupp ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr); 371*e4a0ef16SKarl Rupp 372*e4a0ef16SKarl Rupp B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED; 373*e4a0ef16SKarl Rupp PetscFunctionReturn(0); 374*e4a0ef16SKarl Rupp } 375*e4a0ef16SKarl Rupp 376*e4a0ef16SKarl Rupp 377*e4a0ef16SKarl Rupp /*M 378*e4a0ef16SKarl Rupp MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 379*e4a0ef16SKarl Rupp 380*e4a0ef16SKarl Rupp A matrix type type whose data resides on GPUs. These matrices are in CSR format by 381*e4a0ef16SKarl Rupp default. All matrix calculations are performed using the ViennaCL library. 382*e4a0ef16SKarl Rupp 383*e4a0ef16SKarl Rupp Options Database Keys: 384*e4a0ef16SKarl Rupp + -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions() 385*e4a0ef16SKarl Rupp . -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 386*e4a0ef16SKarl Rupp - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions(). 387*e4a0ef16SKarl Rupp 388*e4a0ef16SKarl Rupp Level: beginner 389*e4a0ef16SKarl Rupp 390*e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL() 391*e4a0ef16SKarl Rupp M*/ 392*e4a0ef16SKarl Rupp 393