xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 72367587056dc268aaef436b9c0c57b86e2ee348)
1e4a0ef16SKarl Rupp 
2e4a0ef16SKarl Rupp 
3e4a0ef16SKarl Rupp /*
4e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
5e4a0ef16SKarl Rupp   matrix storage format.
6e4a0ef16SKarl Rupp */
7e4a0ef16SKarl Rupp 
8aaa7dc30SBarry Smith #include <petscconf.h>
9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10aaa7dc30SBarry Smith #include <petscbt.h>
11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
13e4a0ef16SKarl Rupp 
14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15e4a0ef16SKarl Rupp 
16e4a0ef16SKarl Rupp 
17e4a0ef16SKarl Rupp #include <algorithm>
18e4a0ef16SKarl Rupp #include <vector>
19e4a0ef16SKarl Rupp #include <string>
20e4a0ef16SKarl Rupp 
21e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
22e4a0ef16SKarl Rupp 
238713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
24*72367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
25*72367587SKarl Rupp 
268713a8baSPatrick Sanan 
27e4a0ef16SKarl Rupp #undef __FUNCT__
28e4a0ef16SKarl Rupp #define __FUNCT__ "MatViennaCLCopyToGPU"
29e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A)
30e4a0ef16SKarl Rupp {
31e4a0ef16SKarl Rupp 
32e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
33e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
34e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
35e4a0ef16SKarl Rupp 
36e4a0ef16SKarl Rupp 
37e4a0ef16SKarl Rupp   PetscFunctionBegin;
3867c87b7fSKarl Rupp   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
39e4a0ef16SKarl Rupp     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
40e4a0ef16SKarl Rupp       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
41e4a0ef16SKarl Rupp 
42e4a0ef16SKarl Rupp       try {
43e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
44a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
45e4a0ef16SKarl Rupp 
46a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
47a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
48e4a0ef16SKarl Rupp 
49a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
50a3430c56SKarl Rupp           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
51a3430c56SKarl Rupp             row_buffer.set(i, (a->compressedrow.i)[i]);
52e4a0ef16SKarl Rupp 
53a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
54a3430c56SKarl Rupp           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
55a3430c56SKarl Rupp             row_indices.set(i, (a->compressedrow.rindex)[i]);
56a3430c56SKarl Rupp 
57a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
58a3430c56SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
59a3430c56SKarl Rupp             col_buffer.set(i, (a->j)[i]);
60a3430c56SKarl Rupp 
61a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
62e4a0ef16SKarl Rupp         } else {
63a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
64e4a0ef16SKarl Rupp 
65e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
66e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
67e4a0ef16SKarl Rupp 
68e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
69e4a0ef16SKarl Rupp           for (PetscInt i=0; i<=A->rmap->n; ++i)
70e4a0ef16SKarl Rupp             row_buffer.set(i, (a->i)[i]);
71e4a0ef16SKarl Rupp 
72e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
73e4a0ef16SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
74e4a0ef16SKarl Rupp             col_buffer.set(i, (a->j)[i]);
75e4a0ef16SKarl Rupp 
76e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
77e4a0ef16SKarl Rupp         }
784cf1874eSKarl Rupp         ViennaCLWaitForGPU();
794076e183SKarl Rupp       } catch(std::exception const & ex) {
804076e183SKarl Rupp         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
81e4a0ef16SKarl Rupp       }
82e4a0ef16SKarl Rupp 
83a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
84a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
859b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
86a3430c56SKarl Rupp           delete (ViennaCLVector*)viennaclstruct->tempvec;
879b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
88a3430c56SKarl Rupp         } else {
89a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
90a3430c56SKarl Rupp         }
91a3430c56SKarl Rupp       } else {
929b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
93a3430c56SKarl Rupp       }
94a3430c56SKarl Rupp 
95e4a0ef16SKarl Rupp       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
96e4a0ef16SKarl Rupp 
97e4a0ef16SKarl Rupp       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
98e4a0ef16SKarl Rupp     }
9967c87b7fSKarl Rupp   }
100e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
101e4a0ef16SKarl Rupp }
102e4a0ef16SKarl Rupp 
103e4a0ef16SKarl Rupp #undef __FUNCT__
104e4a0ef16SKarl Rupp #define __FUNCT__ "MatViennaCLCopyFromGPU"
1050d73d530SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
106e4a0ef16SKarl Rupp {
107e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
108e4a0ef16SKarl Rupp   PetscInt           m               = A->rmap->n;
109e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
110e4a0ef16SKarl Rupp 
111e4a0ef16SKarl Rupp 
112e4a0ef16SKarl Rupp   PetscFunctionBegin;
113e4a0ef16SKarl Rupp   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
114e4a0ef16SKarl Rupp     try {
1156c4ed002SBarry Smith       if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1166c4ed002SBarry Smith       else {
117e4a0ef16SKarl Rupp 
118e4a0ef16SKarl Rupp         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
119e4a0ef16SKarl Rupp         a->nz           = Agpu->nnz();
120e4a0ef16SKarl Rupp         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
121e4a0ef16SKarl Rupp         A->preallocated = PETSC_TRUE;
122e4a0ef16SKarl Rupp         if (a->singlemalloc) {
123e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
124e4a0ef16SKarl Rupp         } else {
125e4a0ef16SKarl Rupp           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
126e4a0ef16SKarl Rupp           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
127e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
128e4a0ef16SKarl Rupp         }
129dcca6d9dSJed Brown         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
130f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
131e4a0ef16SKarl Rupp 
132e4a0ef16SKarl Rupp         a->singlemalloc = PETSC_TRUE;
133e4a0ef16SKarl Rupp 
134e4a0ef16SKarl Rupp         /* Setup row lengths */
135e4a0ef16SKarl Rupp         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
136dcca6d9dSJed Brown         ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr);
137f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
138e4a0ef16SKarl Rupp 
139e4a0ef16SKarl Rupp         /* Copy data back from GPU */
140e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
141e4a0ef16SKarl Rupp 
142e4a0ef16SKarl Rupp         // copy row array
143e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
144e4a0ef16SKarl Rupp         (a->i)[0] = row_buffer[0];
145e4a0ef16SKarl Rupp         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
146e4a0ef16SKarl Rupp           (a->i)[i+1] = row_buffer[i+1];
147e4a0ef16SKarl Rupp           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
148e4a0ef16SKarl Rupp         }
149e4a0ef16SKarl Rupp 
150e4a0ef16SKarl Rupp         // copy column indices
151e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
152e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
153e4a0ef16SKarl Rupp         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
154e4a0ef16SKarl Rupp           (a->j)[i] = col_buffer[i];
155e4a0ef16SKarl Rupp 
156e4a0ef16SKarl Rupp         // copy nonzero entries directly to destination (no conversion required)
157e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
158e4a0ef16SKarl Rupp 
1594cf1874eSKarl Rupp         ViennaCLWaitForGPU();
160023073b3SKarl Rupp         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
161e4a0ef16SKarl Rupp       }
1624076e183SKarl Rupp     } catch(std::exception const & ex) {
1634076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
164e4a0ef16SKarl Rupp     }
165e4a0ef16SKarl Rupp 
166e4a0ef16SKarl Rupp     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
167e4a0ef16SKarl Rupp     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
168e4a0ef16SKarl Rupp     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
169e4a0ef16SKarl Rupp 
170e4a0ef16SKarl Rupp     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
1716c4ed002SBarry Smith   } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
172e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
173e4a0ef16SKarl Rupp }
174e4a0ef16SKarl Rupp 
175e4a0ef16SKarl Rupp #undef __FUNCT__
1762a7a6963SBarry Smith #define __FUNCT__ "MatCreateVecs_SeqAIJViennaCL"
1772a7a6963SBarry Smith PetscErrorCode MatCreateVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
178e4a0ef16SKarl Rupp {
179e4a0ef16SKarl Rupp   PetscErrorCode ierr;
18033d57670SJed Brown   PetscInt rbs,cbs;
181e4a0ef16SKarl Rupp 
182e4a0ef16SKarl Rupp   PetscFunctionBegin;
18333d57670SJed Brown   ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
184e4a0ef16SKarl Rupp   if (right) {
185e4a0ef16SKarl Rupp     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
186e4a0ef16SKarl Rupp     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
18733d57670SJed Brown     ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
188e4a0ef16SKarl Rupp     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
189e4a0ef16SKarl Rupp     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
190e4a0ef16SKarl Rupp   }
191e4a0ef16SKarl Rupp   if (left) {
192e4a0ef16SKarl Rupp     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
193e4a0ef16SKarl Rupp     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
19433d57670SJed Brown     ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
195e4a0ef16SKarl Rupp     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
196e4a0ef16SKarl Rupp     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
197e4a0ef16SKarl Rupp   }
198e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
199e4a0ef16SKarl Rupp }
200e4a0ef16SKarl Rupp 
201e4a0ef16SKarl Rupp #undef __FUNCT__
202e4a0ef16SKarl Rupp #define __FUNCT__ "MatMult_SeqAIJViennaCL"
203e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
204e4a0ef16SKarl Rupp {
205e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
206e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
207e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
2080d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL;
2090d73d530SKarl Rupp   ViennaCLVector       *ygpu=NULL;
210e4a0ef16SKarl Rupp 
211e4a0ef16SKarl Rupp   PetscFunctionBegin;
21267c87b7fSKarl Rupp   if (A->rmap->n > 0 && A->cmap->n > 0) {
213e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
214e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
215e4a0ef16SKarl Rupp     try {
216e4a0ef16SKarl Rupp       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
2174cf1874eSKarl Rupp       ViennaCLWaitForGPU();
2184076e183SKarl Rupp     } catch (std::exception const & ex) {
2194076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
220e4a0ef16SKarl Rupp     }
221e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
222e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
2239b66742cSDave May     ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
22467c87b7fSKarl Rupp   }
225e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
226e4a0ef16SKarl Rupp }
227e4a0ef16SKarl Rupp 
228e4a0ef16SKarl Rupp 
229e4a0ef16SKarl Rupp 
230e4a0ef16SKarl Rupp #undef __FUNCT__
231e4a0ef16SKarl Rupp #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
232e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
233e4a0ef16SKarl Rupp {
234e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
235e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
236e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
2370d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
2380d73d530SKarl Rupp   ViennaCLVector       *zgpu=NULL;
239e4a0ef16SKarl Rupp 
240e4a0ef16SKarl Rupp   PetscFunctionBegin;
24167c87b7fSKarl Rupp   if (A->rmap->n > 0 && A->cmap->n > 0) {
242e4a0ef16SKarl Rupp     try {
243e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
244e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
245e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
246e4a0ef16SKarl Rupp 
247e4a0ef16SKarl Rupp       if (a->compressedrow.use) {
248a3430c56SKarl Rupp         ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
249e4a0ef16SKarl Rupp         *zgpu = *ygpu + temp;
2504cf1874eSKarl Rupp         ViennaCLWaitForGPU();
251e4a0ef16SKarl Rupp       } else {
252a3430c56SKarl Rupp         if (zz == xx || zz == yy) { //temporary required
253a3430c56SKarl Rupp           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
254a3430c56SKarl Rupp           *zgpu = *ygpu;
255a3430c56SKarl Rupp           *zgpu += temp;
256a3430c56SKarl Rupp           ViennaCLWaitForGPU();
257a3430c56SKarl Rupp         } else {
258a3430c56SKarl Rupp           *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
259a3430c56SKarl Rupp           *zgpu = *ygpu + *viennaclstruct->tempvec;
2604cf1874eSKarl Rupp           ViennaCLWaitForGPU();
261e4a0ef16SKarl Rupp         }
262e4a0ef16SKarl Rupp       }
263e4a0ef16SKarl Rupp 
264e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
265e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
266e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
267e4a0ef16SKarl Rupp 
2684076e183SKarl Rupp     } catch(std::exception const & ex) {
2694076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
270e4a0ef16SKarl Rupp     }
271e4a0ef16SKarl Rupp     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
27267c87b7fSKarl Rupp   }
273e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
274e4a0ef16SKarl Rupp }
275e4a0ef16SKarl Rupp 
276e4a0ef16SKarl Rupp #undef __FUNCT__
277e4a0ef16SKarl Rupp #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
278e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
279e4a0ef16SKarl Rupp {
280e4a0ef16SKarl Rupp   PetscErrorCode ierr;
281e4a0ef16SKarl Rupp 
282e4a0ef16SKarl Rupp   PetscFunctionBegin;
283e4a0ef16SKarl Rupp   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
284e4a0ef16SKarl Rupp   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
285e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
286e4a0ef16SKarl Rupp }
287e4a0ef16SKarl Rupp 
288e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
289e4a0ef16SKarl Rupp #undef __FUNCT__
290e4a0ef16SKarl Rupp #define __FUNCT__ "MatCreateSeqAIJViennaCL"
291e4a0ef16SKarl Rupp /*@
292e4a0ef16SKarl Rupp    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
29319fddfadSKarl Rupp    (the default parallel PETSc format).  This matrix will ultimately be pushed down
294e4a0ef16SKarl Rupp    to GPUs and use the ViennaCL library for calculations. For good matrix
295e4a0ef16SKarl Rupp    assembly performance the user should preallocate the matrix storage by setting
296e4a0ef16SKarl Rupp    the parameter nz (or the array nnz).  By setting these parameters accurately,
297e4a0ef16SKarl Rupp    performance during matrix assembly can be increased substantially.
298e4a0ef16SKarl Rupp 
299e4a0ef16SKarl Rupp 
300e4a0ef16SKarl Rupp    Collective on MPI_Comm
301e4a0ef16SKarl Rupp 
302e4a0ef16SKarl Rupp    Input Parameters:
303e4a0ef16SKarl Rupp +  comm - MPI communicator, set to PETSC_COMM_SELF
304e4a0ef16SKarl Rupp .  m - number of rows
305e4a0ef16SKarl Rupp .  n - number of columns
306e4a0ef16SKarl Rupp .  nz - number of nonzeros per row (same for all rows)
307e4a0ef16SKarl Rupp -  nnz - array containing the number of nonzeros in the various rows
308e4a0ef16SKarl Rupp          (possibly different for each row) or NULL
309e4a0ef16SKarl Rupp 
310e4a0ef16SKarl Rupp    Output Parameter:
311e4a0ef16SKarl Rupp .  A - the matrix
312e4a0ef16SKarl Rupp 
313e4a0ef16SKarl Rupp    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
314e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
315e4a0ef16SKarl Rupp    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
316e4a0ef16SKarl Rupp 
317e4a0ef16SKarl Rupp    Notes:
318e4a0ef16SKarl Rupp    If nnz is given then nz is ignored
319e4a0ef16SKarl Rupp 
320e4a0ef16SKarl Rupp    The AIJ format (also called the Yale sparse matrix format or
321e4a0ef16SKarl Rupp    compressed row storage), is fully compatible with standard Fortran 77
322e4a0ef16SKarl Rupp    storage.  That is, the stored row and column indices can begin at
323e4a0ef16SKarl Rupp    either one (as in Fortran) or zero.  See the users' manual for details.
324e4a0ef16SKarl Rupp 
325e4a0ef16SKarl Rupp    Specify the preallocated storage with either nz or nnz (not both).
326e4a0ef16SKarl Rupp    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
327e4a0ef16SKarl Rupp    allocation.  For large problems you MUST preallocate memory or you
328e4a0ef16SKarl Rupp    will get TERRIBLE performance, see the users' manual chapter on matrices.
329e4a0ef16SKarl Rupp 
330e4a0ef16SKarl Rupp    Level: intermediate
331e4a0ef16SKarl Rupp 
332e4a0ef16SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
333e4a0ef16SKarl Rupp 
334e4a0ef16SKarl Rupp @*/
335e4a0ef16SKarl Rupp PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
336e4a0ef16SKarl Rupp {
337e4a0ef16SKarl Rupp   PetscErrorCode ierr;
338e4a0ef16SKarl Rupp 
339e4a0ef16SKarl Rupp   PetscFunctionBegin;
340e4a0ef16SKarl Rupp   ierr = MatCreate(comm,A);CHKERRQ(ierr);
341e4a0ef16SKarl Rupp   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
342e4a0ef16SKarl Rupp   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
343e4a0ef16SKarl Rupp   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
344e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
345e4a0ef16SKarl Rupp }
346e4a0ef16SKarl Rupp 
347e4a0ef16SKarl Rupp 
348e4a0ef16SKarl Rupp #undef __FUNCT__
349e4a0ef16SKarl Rupp #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
350e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
351e4a0ef16SKarl Rupp {
352e4a0ef16SKarl Rupp   PetscErrorCode ierr;
353e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
354e4a0ef16SKarl Rupp 
355e4a0ef16SKarl Rupp   PetscFunctionBegin;
356e4a0ef16SKarl Rupp   try {
3576447cd05SKarl Rupp     if (viennaclcontainer) {
3586447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3596447cd05SKarl Rupp       delete viennaclcontainer->mat;
3606447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
361e4a0ef16SKarl Rupp       delete viennaclcontainer;
3626447cd05SKarl Rupp     }
363e4a0ef16SKarl Rupp     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
3644076e183SKarl Rupp   } catch(std::exception const & ex) {
3654076e183SKarl Rupp     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
366e4a0ef16SKarl Rupp   }
3678713a8baSPatrick Sanan 
3688713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL);CHKERRQ(ierr);
3698713a8baSPatrick Sanan 
370e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
371e4a0ef16SKarl Rupp   A->spptr = 0;
372e4a0ef16SKarl Rupp   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
373e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
374e4a0ef16SKarl Rupp }
375e4a0ef16SKarl Rupp 
376e4a0ef16SKarl Rupp 
377e4a0ef16SKarl Rupp #undef __FUNCT__
378e4a0ef16SKarl Rupp #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
379e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
380e4a0ef16SKarl Rupp {
381e4a0ef16SKarl Rupp   PetscErrorCode ierr;
382e4a0ef16SKarl Rupp 
383e4a0ef16SKarl Rupp   PetscFunctionBegin;
384e4a0ef16SKarl Rupp   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
3858713a8baSPatrick Sanan   ierr = MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B);
3868713a8baSPatrick Sanan   PetscFunctionReturn(0);
3878713a8baSPatrick Sanan }
3888713a8baSPatrick Sanan 
3898713a8baSPatrick Sanan #undef __FUNCT__
3908713a8baSPatrick Sanan #define __FUNCT__ "MatConvert_SeqAIJ_SeqAIJViennaCL"
3918713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
3928713a8baSPatrick Sanan {
3938713a8baSPatrick Sanan   PetscErrorCode ierr;
3948713a8baSPatrick Sanan   Mat            B;
3958713a8baSPatrick Sanan   Mat_SeqAIJ     *aij;
3968713a8baSPatrick Sanan 
3978713a8baSPatrick Sanan   PetscFunctionBegin;
3988713a8baSPatrick Sanan 
3998713a8baSPatrick Sanan   if (reuse == MAT_REUSE_MATRIX) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
4008713a8baSPatrick Sanan 
4018713a8baSPatrick Sanan   if (reuse == MAT_INITIAL_MATRIX) {
4028713a8baSPatrick Sanan     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
4038713a8baSPatrick Sanan   }
4048713a8baSPatrick Sanan 
4058713a8baSPatrick Sanan   B = *newmat;
4068713a8baSPatrick Sanan 
407e4a0ef16SKarl Rupp   aij             = (Mat_SeqAIJ*)B->data;
408e4a0ef16SKarl Rupp   aij->inode.use  = PETSC_FALSE;
4098713a8baSPatrick Sanan 
410e4a0ef16SKarl Rupp   B->ops->mult    = MatMult_SeqAIJViennaCL;
411e4a0ef16SKarl Rupp   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
412e4a0ef16SKarl Rupp   B->spptr        = new Mat_SeqAIJViennaCL();
413e4a0ef16SKarl Rupp 
414a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
415a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
416a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
417e4a0ef16SKarl Rupp 
418e4a0ef16SKarl Rupp   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
419e4a0ef16SKarl Rupp   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
4202a7a6963SBarry Smith   B->ops->getvecs        = MatCreateVecs_SeqAIJViennaCL;
421e4a0ef16SKarl Rupp 
422e4a0ef16SKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
423e4a0ef16SKarl Rupp 
4248713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL);CHKERRQ(ierr);
4258713a8baSPatrick Sanan 
426e4a0ef16SKarl Rupp   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
4278713a8baSPatrick Sanan 
4288713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
4298713a8baSPatrick Sanan   if (B->assembled) {
4308713a8baSPatrick Sanan     ierr = MatViennaCLCopyToGPU(B);CHKERRQ(ierr);
4318713a8baSPatrick Sanan   }
4328713a8baSPatrick Sanan 
433e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
434e4a0ef16SKarl Rupp }
435e4a0ef16SKarl Rupp 
436e4a0ef16SKarl Rupp 
437e4a0ef16SKarl Rupp /*M
438e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
439e4a0ef16SKarl Rupp 
440e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
441e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
442e4a0ef16SKarl Rupp 
443e4a0ef16SKarl Rupp    Options Database Keys:
444e4a0ef16SKarl Rupp +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
445e4a0ef16SKarl Rupp .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
446e4a0ef16SKarl Rupp -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
447e4a0ef16SKarl Rupp 
448e4a0ef16SKarl Rupp   Level: beginner
449e4a0ef16SKarl Rupp 
450e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
451e4a0ef16SKarl Rupp M*/
452e4a0ef16SKarl Rupp 
453*72367587SKarl Rupp 
454*72367587SKarl Rupp #undef __FUNCT__
455*72367587SKarl Rupp #define __FUNCT__ "MatSolverPackageRegister_ViennaCL"
456*72367587SKarl Rupp PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_ViennaCL(void)
457*72367587SKarl Rupp {
458*72367587SKarl Rupp   PetscErrorCode ierr;
459*72367587SKarl Rupp 
460*72367587SKarl Rupp   PetscFunctionBegin;
461*72367587SKarl Rupp   ierr = MatSolverPackageRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL,    MAT_FACTOR_LU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
462*72367587SKarl Rupp   ierr = MatSolverPackageRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL,    MAT_FACTOR_CHOLESKY,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
463*72367587SKarl Rupp   ierr = MatSolverPackageRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL,    MAT_FACTOR_ILU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
464*72367587SKarl Rupp   ierr = MatSolverPackageRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL,    MAT_FACTOR_ICC,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
465*72367587SKarl Rupp   PetscFunctionReturn(0);
466*72367587SKarl Rupp }
467*72367587SKarl Rupp 
468