xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision e4a0ef16445bf169c9a8b17f5f5ac28daecc54e0)
1*e4a0ef16SKarl Rupp 
2*e4a0ef16SKarl Rupp 
3*e4a0ef16SKarl Rupp /*
4*e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
5*e4a0ef16SKarl Rupp   matrix storage format.
6*e4a0ef16SKarl Rupp */
7*e4a0ef16SKarl Rupp 
8*e4a0ef16SKarl Rupp #include "petscconf.h"
9*e4a0ef16SKarl Rupp #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
10*e4a0ef16SKarl Rupp #include "petscbt.h"
11*e4a0ef16SKarl Rupp #include "../src/vec/vec/impls/dvecimpl.h"
12*e4a0ef16SKarl Rupp #include "petsc-private/vecimpl.h"
13*e4a0ef16SKarl Rupp 
14*e4a0ef16SKarl Rupp #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h"
15*e4a0ef16SKarl Rupp 
16*e4a0ef16SKarl Rupp 
17*e4a0ef16SKarl Rupp #include <algorithm>
18*e4a0ef16SKarl Rupp #include <vector>
19*e4a0ef16SKarl Rupp #include <string>
20*e4a0ef16SKarl Rupp 
21*e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
22*e4a0ef16SKarl Rupp 
23*e4a0ef16SKarl Rupp #undef __FUNCT__
24*e4a0ef16SKarl Rupp #define __FUNCT__ "MatViennaCLCopyToGPU"
25*e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26*e4a0ef16SKarl Rupp {
27*e4a0ef16SKarl Rupp 
28*e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29*e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30*e4a0ef16SKarl Rupp   //PetscInt           m               = A->rmap->n,*ii,*ridx;
31*e4a0ef16SKarl Rupp   PetscInt           *ii;
32*e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
33*e4a0ef16SKarl Rupp 
34*e4a0ef16SKarl Rupp 
35*e4a0ef16SKarl Rupp   PetscFunctionBegin;
36*e4a0ef16SKarl Rupp   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
37*e4a0ef16SKarl Rupp     ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
38*e4a0ef16SKarl Rupp 
39*e4a0ef16SKarl Rupp     try {
40*e4a0ef16SKarl Rupp       //viennaclstruct->nonzerorow=0;
41*e4a0ef16SKarl Rupp       //for (PetscInt j = 0; j<m; j++) viennaclstruct->nonzerorow += (a->i[j+1] > a->i[j]);
42*e4a0ef16SKarl Rupp 
43*e4a0ef16SKarl Rupp       viennaclstruct->mat = new ViennaCLAIJMatrix();
44*e4a0ef16SKarl Rupp       if (a->compressedrow.use) {
45*e4a0ef16SKarl Rupp         //m    = a->compressedrow.nrows;
46*e4a0ef16SKarl Rupp         ii   = a->compressedrow.i;
47*e4a0ef16SKarl Rupp         //ridx = a->compressedrow.rindex;
48*e4a0ef16SKarl Rupp 
49*e4a0ef16SKarl Rupp         viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz);
50*e4a0ef16SKarl Rupp 
51*e4a0ef16SKarl Rupp         // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications)
52*e4a0ef16SKarl Rupp 
53*e4a0ef16SKarl Rupp         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
54*e4a0ef16SKarl Rupp       } else {
55*e4a0ef16SKarl Rupp 
56*e4a0ef16SKarl Rupp         // Since PetscInt is in general different from cl_uint, we have to convert:
57*e4a0ef16SKarl Rupp         viennacl::backend::mem_handle dummy;
58*e4a0ef16SKarl Rupp 
59*e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
60*e4a0ef16SKarl Rupp         for (PetscInt i=0; i<=A->rmap->n; ++i)
61*e4a0ef16SKarl Rupp           row_buffer.set(i, (a->i)[i]);
62*e4a0ef16SKarl Rupp 
63*e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
64*e4a0ef16SKarl Rupp         for (PetscInt i=0; i<a->nz; ++i)
65*e4a0ef16SKarl Rupp           col_buffer.set(i, (a->j)[i]);
66*e4a0ef16SKarl Rupp 
67*e4a0ef16SKarl Rupp         viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
68*e4a0ef16SKarl Rupp       }
69*e4a0ef16SKarl Rupp     } catch(char *ex) {
70*e4a0ef16SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
71*e4a0ef16SKarl Rupp     }
72*e4a0ef16SKarl Rupp 
73*e4a0ef16SKarl Rupp     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
74*e4a0ef16SKarl Rupp 
75*e4a0ef16SKarl Rupp     ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
76*e4a0ef16SKarl Rupp   }
77*e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
78*e4a0ef16SKarl Rupp }
79*e4a0ef16SKarl Rupp 
80*e4a0ef16SKarl Rupp #undef __FUNCT__
81*e4a0ef16SKarl Rupp #define __FUNCT__ "MatViennaCLCopyFromGPU"
82*e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, ViennaCLAIJMatrix *Agpu)
83*e4a0ef16SKarl Rupp {
84*e4a0ef16SKarl Rupp   //Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
85*e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
86*e4a0ef16SKarl Rupp   PetscInt           m               = A->rmap->n;
87*e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
88*e4a0ef16SKarl Rupp 
89*e4a0ef16SKarl Rupp 
90*e4a0ef16SKarl Rupp   PetscFunctionBegin;
91*e4a0ef16SKarl Rupp   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
92*e4a0ef16SKarl Rupp     try {
93*e4a0ef16SKarl Rupp       if (a->compressedrow.use) {
94*e4a0ef16SKarl Rupp         SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
95*e4a0ef16SKarl Rupp       } else {
96*e4a0ef16SKarl Rupp 
97*e4a0ef16SKarl Rupp         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
98*e4a0ef16SKarl Rupp         a->nz           = Agpu->nnz();
99*e4a0ef16SKarl Rupp         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
100*e4a0ef16SKarl Rupp         A->preallocated = PETSC_TRUE;
101*e4a0ef16SKarl Rupp         if (a->singlemalloc) {
102*e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
103*e4a0ef16SKarl Rupp         } else {
104*e4a0ef16SKarl Rupp           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
105*e4a0ef16SKarl Rupp           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
106*e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
107*e4a0ef16SKarl Rupp         }
108*e4a0ef16SKarl Rupp         ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
109*e4a0ef16SKarl Rupp         ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
110*e4a0ef16SKarl Rupp 
111*e4a0ef16SKarl Rupp         a->singlemalloc = PETSC_TRUE;
112*e4a0ef16SKarl Rupp 
113*e4a0ef16SKarl Rupp         /* Setup row lengths */
114*e4a0ef16SKarl Rupp         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
115*e4a0ef16SKarl Rupp         ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
116*e4a0ef16SKarl Rupp         ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
117*e4a0ef16SKarl Rupp 
118*e4a0ef16SKarl Rupp         /* Copy data back from GPU */
119*e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
120*e4a0ef16SKarl Rupp 
121*e4a0ef16SKarl Rupp         // copy row array
122*e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
123*e4a0ef16SKarl Rupp         (a->i)[0] = row_buffer[0];
124*e4a0ef16SKarl Rupp         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
125*e4a0ef16SKarl Rupp           (a->i)[i+1] = row_buffer[i+1];
126*e4a0ef16SKarl Rupp           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
127*e4a0ef16SKarl Rupp         }
128*e4a0ef16SKarl Rupp 
129*e4a0ef16SKarl Rupp         // copy column indices
130*e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
131*e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
132*e4a0ef16SKarl Rupp         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
133*e4a0ef16SKarl Rupp           (a->j)[i] = col_buffer[i];
134*e4a0ef16SKarl Rupp 
135*e4a0ef16SKarl Rupp         // copy nonzero entries directly to destination (no conversion required)
136*e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
137*e4a0ef16SKarl Rupp 
138*e4a0ef16SKarl Rupp         /* TODO: What about a->diag? */
139*e4a0ef16SKarl Rupp       }
140*e4a0ef16SKarl Rupp     } catch(char *ex) {
141*e4a0ef16SKarl Rupp       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex);
142*e4a0ef16SKarl Rupp     }
143*e4a0ef16SKarl Rupp 
144*e4a0ef16SKarl Rupp     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
145*e4a0ef16SKarl Rupp     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
146*e4a0ef16SKarl Rupp     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
147*e4a0ef16SKarl Rupp 
148*e4a0ef16SKarl Rupp     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
149*e4a0ef16SKarl Rupp   } else {
150*e4a0ef16SKarl Rupp     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
151*e4a0ef16SKarl Rupp   }
152*e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
153*e4a0ef16SKarl Rupp }
154*e4a0ef16SKarl Rupp 
155*e4a0ef16SKarl Rupp #undef __FUNCT__
156*e4a0ef16SKarl Rupp #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL"
157*e4a0ef16SKarl Rupp PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
158*e4a0ef16SKarl Rupp {
159*e4a0ef16SKarl Rupp   PetscErrorCode ierr;
160*e4a0ef16SKarl Rupp 
161*e4a0ef16SKarl Rupp   PetscFunctionBegin;
162*e4a0ef16SKarl Rupp   if (right) {
163*e4a0ef16SKarl Rupp     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
164*e4a0ef16SKarl Rupp     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
165*e4a0ef16SKarl Rupp     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
166*e4a0ef16SKarl Rupp     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
167*e4a0ef16SKarl Rupp     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
168*e4a0ef16SKarl Rupp   }
169*e4a0ef16SKarl Rupp   if (left) {
170*e4a0ef16SKarl Rupp     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
171*e4a0ef16SKarl Rupp     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
172*e4a0ef16SKarl Rupp     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
173*e4a0ef16SKarl Rupp     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
174*e4a0ef16SKarl Rupp     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
175*e4a0ef16SKarl Rupp   }
176*e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
177*e4a0ef16SKarl Rupp }
178*e4a0ef16SKarl Rupp 
179*e4a0ef16SKarl Rupp #undef __FUNCT__
180*e4a0ef16SKarl Rupp #define __FUNCT__ "MatMult_SeqAIJViennaCL"
181*e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
182*e4a0ef16SKarl Rupp {
183*e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
184*e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
185*e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
186*e4a0ef16SKarl Rupp   ViennaCLVector     *xgpu=NULL,*ygpu=NULL;
187*e4a0ef16SKarl Rupp 
188*e4a0ef16SKarl Rupp   PetscFunctionBegin;
189*e4a0ef16SKarl Rupp   ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
190*e4a0ef16SKarl Rupp   ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
191*e4a0ef16SKarl Rupp   try {
192*e4a0ef16SKarl Rupp     *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
193*e4a0ef16SKarl Rupp   } catch (char *ex) {
194*e4a0ef16SKarl Rupp     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
195*e4a0ef16SKarl Rupp   }
196*e4a0ef16SKarl Rupp   ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
197*e4a0ef16SKarl Rupp   ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
198*e4a0ef16SKarl Rupp   ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
199*e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
200*e4a0ef16SKarl Rupp }
201*e4a0ef16SKarl Rupp 
202*e4a0ef16SKarl Rupp 
203*e4a0ef16SKarl Rupp 
204*e4a0ef16SKarl Rupp #undef __FUNCT__
205*e4a0ef16SKarl Rupp #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
206*e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
207*e4a0ef16SKarl Rupp {
208*e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
209*e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
210*e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
211*e4a0ef16SKarl Rupp   ViennaCLVector     *xgpu=NULL,*ygpu=NULL,*zgpu=NULL;
212*e4a0ef16SKarl Rupp 
213*e4a0ef16SKarl Rupp   PetscFunctionBegin;
214*e4a0ef16SKarl Rupp   try {
215*e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
216*e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
217*e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
218*e4a0ef16SKarl Rupp 
219*e4a0ef16SKarl Rupp     if (a->compressedrow.use) {
220*e4a0ef16SKarl Rupp         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
221*e4a0ef16SKarl Rupp     } else {
222*e4a0ef16SKarl Rupp       if (zz == xx || zz == yy) { //temporary required
223*e4a0ef16SKarl Rupp         ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
224*e4a0ef16SKarl Rupp         *zgpu = *ygpu + temp;
225*e4a0ef16SKarl Rupp       } else {
226*e4a0ef16SKarl Rupp         *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
227*e4a0ef16SKarl Rupp         *zgpu += *ygpu;
228*e4a0ef16SKarl Rupp       }
229*e4a0ef16SKarl Rupp     }
230*e4a0ef16SKarl Rupp 
231*e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
232*e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
233*e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
234*e4a0ef16SKarl Rupp 
235*e4a0ef16SKarl Rupp   } catch(char *ex) {
236*e4a0ef16SKarl Rupp     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
237*e4a0ef16SKarl Rupp   }
238*e4a0ef16SKarl Rupp   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
239*e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
240*e4a0ef16SKarl Rupp }
241*e4a0ef16SKarl Rupp 
242*e4a0ef16SKarl Rupp #undef __FUNCT__
243*e4a0ef16SKarl Rupp #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
244*e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
245*e4a0ef16SKarl Rupp {
246*e4a0ef16SKarl Rupp   PetscErrorCode ierr;
247*e4a0ef16SKarl Rupp 
248*e4a0ef16SKarl Rupp   PetscFunctionBegin;
249*e4a0ef16SKarl Rupp   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
250*e4a0ef16SKarl Rupp   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
251*e4a0ef16SKarl Rupp   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
252*e4a0ef16SKarl Rupp   A->ops->mult    = MatMult_SeqAIJViennaCL;
253*e4a0ef16SKarl Rupp   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
254*e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
255*e4a0ef16SKarl Rupp }
256*e4a0ef16SKarl Rupp 
257*e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
258*e4a0ef16SKarl Rupp #undef __FUNCT__
259*e4a0ef16SKarl Rupp #define __FUNCT__ "MatCreateSeqAIJViennaCL"
260*e4a0ef16SKarl Rupp /*@
261*e4a0ef16SKarl Rupp    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
262*e4a0ef16SKarl Rupp    (the default parallel PETSc format).  This matrix will ultimately pushed down
263*e4a0ef16SKarl Rupp    to GPUs and use the ViennaCL library for calculations. For good matrix
264*e4a0ef16SKarl Rupp    assembly performance the user should preallocate the matrix storage by setting
265*e4a0ef16SKarl Rupp    the parameter nz (or the array nnz).  By setting these parameters accurately,
266*e4a0ef16SKarl Rupp    performance during matrix assembly can be increased substantially.
267*e4a0ef16SKarl Rupp 
268*e4a0ef16SKarl Rupp 
269*e4a0ef16SKarl Rupp    Collective on MPI_Comm
270*e4a0ef16SKarl Rupp 
271*e4a0ef16SKarl Rupp    Input Parameters:
272*e4a0ef16SKarl Rupp +  comm - MPI communicator, set to PETSC_COMM_SELF
273*e4a0ef16SKarl Rupp .  m - number of rows
274*e4a0ef16SKarl Rupp .  n - number of columns
275*e4a0ef16SKarl Rupp .  nz - number of nonzeros per row (same for all rows)
276*e4a0ef16SKarl Rupp -  nnz - array containing the number of nonzeros in the various rows
277*e4a0ef16SKarl Rupp          (possibly different for each row) or NULL
278*e4a0ef16SKarl Rupp 
279*e4a0ef16SKarl Rupp    Output Parameter:
280*e4a0ef16SKarl Rupp .  A - the matrix
281*e4a0ef16SKarl Rupp 
282*e4a0ef16SKarl Rupp    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
283*e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
284*e4a0ef16SKarl Rupp    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
285*e4a0ef16SKarl Rupp 
286*e4a0ef16SKarl Rupp    Notes:
287*e4a0ef16SKarl Rupp    If nnz is given then nz is ignored
288*e4a0ef16SKarl Rupp 
289*e4a0ef16SKarl Rupp    The AIJ format (also called the Yale sparse matrix format or
290*e4a0ef16SKarl Rupp    compressed row storage), is fully compatible with standard Fortran 77
291*e4a0ef16SKarl Rupp    storage.  That is, the stored row and column indices can begin at
292*e4a0ef16SKarl Rupp    either one (as in Fortran) or zero.  See the users' manual for details.
293*e4a0ef16SKarl Rupp 
294*e4a0ef16SKarl Rupp    Specify the preallocated storage with either nz or nnz (not both).
295*e4a0ef16SKarl Rupp    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
296*e4a0ef16SKarl Rupp    allocation.  For large problems you MUST preallocate memory or you
297*e4a0ef16SKarl Rupp    will get TERRIBLE performance, see the users' manual chapter on matrices.
298*e4a0ef16SKarl Rupp 
299*e4a0ef16SKarl Rupp    By default, this format uses inodes (identical nodes) when possible, to
300*e4a0ef16SKarl Rupp    improve numerical efficiency of matrix-vector products and solves. We
301*e4a0ef16SKarl Rupp    search for consecutive rows with the same nonzero structure, thereby
302*e4a0ef16SKarl Rupp    reusing matrix information to achieve increased efficiency.
303*e4a0ef16SKarl Rupp 
304*e4a0ef16SKarl Rupp    Level: intermediate
305*e4a0ef16SKarl Rupp 
306*e4a0ef16SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
307*e4a0ef16SKarl Rupp 
308*e4a0ef16SKarl Rupp @*/
309*e4a0ef16SKarl Rupp PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
310*e4a0ef16SKarl Rupp {
311*e4a0ef16SKarl Rupp   PetscErrorCode ierr;
312*e4a0ef16SKarl Rupp 
313*e4a0ef16SKarl Rupp   PetscFunctionBegin;
314*e4a0ef16SKarl Rupp   ierr = MatCreate(comm,A);CHKERRQ(ierr);
315*e4a0ef16SKarl Rupp   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
316*e4a0ef16SKarl Rupp   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
317*e4a0ef16SKarl Rupp   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
318*e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
319*e4a0ef16SKarl Rupp }
320*e4a0ef16SKarl Rupp 
321*e4a0ef16SKarl Rupp 
322*e4a0ef16SKarl Rupp #undef __FUNCT__
323*e4a0ef16SKarl Rupp #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
324*e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
325*e4a0ef16SKarl Rupp {
326*e4a0ef16SKarl Rupp   PetscErrorCode ierr;
327*e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
328*e4a0ef16SKarl Rupp 
329*e4a0ef16SKarl Rupp   PetscFunctionBegin;
330*e4a0ef16SKarl Rupp   try {
331*e4a0ef16SKarl Rupp     if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat);
332*e4a0ef16SKarl Rupp     delete viennaclcontainer;
333*e4a0ef16SKarl Rupp     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
334*e4a0ef16SKarl Rupp   } catch(char *ex) {
335*e4a0ef16SKarl Rupp     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
336*e4a0ef16SKarl Rupp   }
337*e4a0ef16SKarl Rupp   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
338*e4a0ef16SKarl Rupp   A->spptr = 0;
339*e4a0ef16SKarl Rupp   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
340*e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
341*e4a0ef16SKarl Rupp }
342*e4a0ef16SKarl Rupp 
343*e4a0ef16SKarl Rupp 
344*e4a0ef16SKarl Rupp extern PetscErrorCode MatSetValuesBatch_SeqAIJViennaCL(Mat, PetscInt, PetscInt, PetscInt*,const PetscScalar*);
345*e4a0ef16SKarl Rupp 
346*e4a0ef16SKarl Rupp 
347*e4a0ef16SKarl Rupp #undef __FUNCT__
348*e4a0ef16SKarl Rupp #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
349*e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
350*e4a0ef16SKarl Rupp {
351*e4a0ef16SKarl Rupp   PetscErrorCode ierr;
352*e4a0ef16SKarl Rupp   Mat_SeqAIJ     *aij;
353*e4a0ef16SKarl Rupp 
354*e4a0ef16SKarl Rupp   PetscFunctionBegin;
355*e4a0ef16SKarl Rupp   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
356*e4a0ef16SKarl Rupp   aij             = (Mat_SeqAIJ*)B->data;
357*e4a0ef16SKarl Rupp   aij->inode.use  = PETSC_FALSE;
358*e4a0ef16SKarl Rupp   B->ops->mult    = MatMult_SeqAIJViennaCL;
359*e4a0ef16SKarl Rupp   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
360*e4a0ef16SKarl Rupp   B->spptr        = new Mat_SeqAIJViennaCL();
361*e4a0ef16SKarl Rupp 
362*e4a0ef16SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->mat     = 0;
363*e4a0ef16SKarl Rupp 
364*e4a0ef16SKarl Rupp   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
365*e4a0ef16SKarl Rupp   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
366*e4a0ef16SKarl Rupp   B->ops->getvecs        = MatGetVecs_SeqAIJViennaCL;
367*e4a0ef16SKarl Rupp   B->ops->setvaluesbatch = MatSetValuesBatch_SeqAIJViennaCL;
368*e4a0ef16SKarl Rupp 
369*e4a0ef16SKarl Rupp   //ierr = PetscObjectComposeFunction((PetscObject)B,"MatViennaCLSetFormat_C", "MatViennaCLSetFormat_SeqAIJViennaCL", MatViennaCLSetFormat_SeqAIJViennaCL);CHKERRQ(ierr);
370*e4a0ef16SKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
371*e4a0ef16SKarl Rupp 
372*e4a0ef16SKarl Rupp   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
373*e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
374*e4a0ef16SKarl Rupp }
375*e4a0ef16SKarl Rupp 
376*e4a0ef16SKarl Rupp 
377*e4a0ef16SKarl Rupp /*M
378*e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
379*e4a0ef16SKarl Rupp 
380*e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
381*e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
382*e4a0ef16SKarl Rupp 
383*e4a0ef16SKarl Rupp    Options Database Keys:
384*e4a0ef16SKarl Rupp +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
385*e4a0ef16SKarl Rupp .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
386*e4a0ef16SKarl Rupp -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
387*e4a0ef16SKarl Rupp 
388*e4a0ef16SKarl Rupp   Level: beginner
389*e4a0ef16SKarl Rupp 
390*e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
391*e4a0ef16SKarl Rupp M*/
392*e4a0ef16SKarl Rupp 
393