xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 8713a8ba9e2db6c442ef9554cd1fc160eeefc5e9)
1e4a0ef16SKarl Rupp 
2e4a0ef16SKarl Rupp 
3e4a0ef16SKarl Rupp /*
4e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
5e4a0ef16SKarl Rupp   matrix storage format.
6e4a0ef16SKarl Rupp */
7e4a0ef16SKarl Rupp 
8aaa7dc30SBarry Smith #include <petscconf.h>
9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10aaa7dc30SBarry Smith #include <petscbt.h>
11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
13e4a0ef16SKarl Rupp 
14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15e4a0ef16SKarl Rupp 
16e4a0ef16SKarl Rupp 
17e4a0ef16SKarl Rupp #include <algorithm>
18e4a0ef16SKarl Rupp #include <vector>
19e4a0ef16SKarl Rupp #include <string>
20e4a0ef16SKarl Rupp 
21e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
22e4a0ef16SKarl Rupp 
23*8713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
24*8713a8baSPatrick Sanan 
25e4a0ef16SKarl Rupp #undef __FUNCT__
26e4a0ef16SKarl Rupp #define __FUNCT__ "MatViennaCLCopyToGPU"
27e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A)
28e4a0ef16SKarl Rupp {
29e4a0ef16SKarl Rupp 
30e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
31e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
32e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
33e4a0ef16SKarl Rupp 
34e4a0ef16SKarl Rupp 
35e4a0ef16SKarl Rupp   PetscFunctionBegin;
3667c87b7fSKarl Rupp   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
37e4a0ef16SKarl Rupp     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
38e4a0ef16SKarl Rupp       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
39e4a0ef16SKarl Rupp 
40e4a0ef16SKarl Rupp       try {
41e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
42a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
43e4a0ef16SKarl Rupp 
44a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
45a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
46e4a0ef16SKarl Rupp 
47a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
48a3430c56SKarl Rupp           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
49a3430c56SKarl Rupp             row_buffer.set(i, (a->compressedrow.i)[i]);
50e4a0ef16SKarl Rupp 
51a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
52a3430c56SKarl Rupp           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
53a3430c56SKarl Rupp             row_indices.set(i, (a->compressedrow.rindex)[i]);
54a3430c56SKarl Rupp 
55a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
56a3430c56SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
57a3430c56SKarl Rupp             col_buffer.set(i, (a->j)[i]);
58a3430c56SKarl Rupp 
59a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
60e4a0ef16SKarl Rupp         } else {
61a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
62e4a0ef16SKarl Rupp 
63e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
64e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
65e4a0ef16SKarl Rupp 
66e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
67e4a0ef16SKarl Rupp           for (PetscInt i=0; i<=A->rmap->n; ++i)
68e4a0ef16SKarl Rupp             row_buffer.set(i, (a->i)[i]);
69e4a0ef16SKarl Rupp 
70e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
71e4a0ef16SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
72e4a0ef16SKarl Rupp             col_buffer.set(i, (a->j)[i]);
73e4a0ef16SKarl Rupp 
74e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
75e4a0ef16SKarl Rupp         }
764cf1874eSKarl Rupp         ViennaCLWaitForGPU();
774076e183SKarl Rupp       } catch(std::exception const & ex) {
784076e183SKarl Rupp         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
79e4a0ef16SKarl Rupp       }
80e4a0ef16SKarl Rupp 
81a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
82a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
839b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
84a3430c56SKarl Rupp           delete (ViennaCLVector*)viennaclstruct->tempvec;
859b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
86a3430c56SKarl Rupp         } else {
87a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
88a3430c56SKarl Rupp         }
89a3430c56SKarl Rupp       } else {
909b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
91a3430c56SKarl Rupp       }
92a3430c56SKarl Rupp 
93e4a0ef16SKarl Rupp       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
94e4a0ef16SKarl Rupp 
95e4a0ef16SKarl Rupp       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
96e4a0ef16SKarl Rupp     }
9767c87b7fSKarl Rupp   }
98e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
99e4a0ef16SKarl Rupp }
100e4a0ef16SKarl Rupp 
101e4a0ef16SKarl Rupp #undef __FUNCT__
102e4a0ef16SKarl Rupp #define __FUNCT__ "MatViennaCLCopyFromGPU"
1030d73d530SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
104e4a0ef16SKarl Rupp {
105e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
106e4a0ef16SKarl Rupp   PetscInt           m               = A->rmap->n;
107e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
108e4a0ef16SKarl Rupp 
109e4a0ef16SKarl Rupp 
110e4a0ef16SKarl Rupp   PetscFunctionBegin;
111e4a0ef16SKarl Rupp   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
112e4a0ef16SKarl Rupp     try {
1136c4ed002SBarry Smith       if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1146c4ed002SBarry Smith       else {
115e4a0ef16SKarl Rupp 
116e4a0ef16SKarl Rupp         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
117e4a0ef16SKarl Rupp         a->nz           = Agpu->nnz();
118e4a0ef16SKarl Rupp         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
119e4a0ef16SKarl Rupp         A->preallocated = PETSC_TRUE;
120e4a0ef16SKarl Rupp         if (a->singlemalloc) {
121e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
122e4a0ef16SKarl Rupp         } else {
123e4a0ef16SKarl Rupp           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
124e4a0ef16SKarl Rupp           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
125e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
126e4a0ef16SKarl Rupp         }
127dcca6d9dSJed Brown         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
128f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
129e4a0ef16SKarl Rupp 
130e4a0ef16SKarl Rupp         a->singlemalloc = PETSC_TRUE;
131e4a0ef16SKarl Rupp 
132e4a0ef16SKarl Rupp         /* Setup row lengths */
133e4a0ef16SKarl Rupp         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
134dcca6d9dSJed Brown         ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr);
135f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
136e4a0ef16SKarl Rupp 
137e4a0ef16SKarl Rupp         /* Copy data back from GPU */
138e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
139e4a0ef16SKarl Rupp 
140e4a0ef16SKarl Rupp         // copy row array
141e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
142e4a0ef16SKarl Rupp         (a->i)[0] = row_buffer[0];
143e4a0ef16SKarl Rupp         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
144e4a0ef16SKarl Rupp           (a->i)[i+1] = row_buffer[i+1];
145e4a0ef16SKarl Rupp           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
146e4a0ef16SKarl Rupp         }
147e4a0ef16SKarl Rupp 
148e4a0ef16SKarl Rupp         // copy column indices
149e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
150e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
151e4a0ef16SKarl Rupp         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
152e4a0ef16SKarl Rupp           (a->j)[i] = col_buffer[i];
153e4a0ef16SKarl Rupp 
154e4a0ef16SKarl Rupp         // copy nonzero entries directly to destination (no conversion required)
155e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
156e4a0ef16SKarl Rupp 
1574cf1874eSKarl Rupp         ViennaCLWaitForGPU();
158023073b3SKarl Rupp         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
159e4a0ef16SKarl Rupp       }
1604076e183SKarl Rupp     } catch(std::exception const & ex) {
1614076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
162e4a0ef16SKarl Rupp     }
163e4a0ef16SKarl Rupp 
164e4a0ef16SKarl Rupp     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
165e4a0ef16SKarl Rupp     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
166e4a0ef16SKarl Rupp     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
167e4a0ef16SKarl Rupp 
168e4a0ef16SKarl Rupp     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
1696c4ed002SBarry Smith   } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
170e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
171e4a0ef16SKarl Rupp }
172e4a0ef16SKarl Rupp 
173e4a0ef16SKarl Rupp #undef __FUNCT__
1742a7a6963SBarry Smith #define __FUNCT__ "MatCreateVecs_SeqAIJViennaCL"
1752a7a6963SBarry Smith PetscErrorCode MatCreateVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
176e4a0ef16SKarl Rupp {
177e4a0ef16SKarl Rupp   PetscErrorCode ierr;
17833d57670SJed Brown   PetscInt rbs,cbs;
179e4a0ef16SKarl Rupp 
180e4a0ef16SKarl Rupp   PetscFunctionBegin;
18133d57670SJed Brown   ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
182e4a0ef16SKarl Rupp   if (right) {
183e4a0ef16SKarl Rupp     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
184e4a0ef16SKarl Rupp     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
18533d57670SJed Brown     ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
186e4a0ef16SKarl Rupp     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
187e4a0ef16SKarl Rupp     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
188e4a0ef16SKarl Rupp   }
189e4a0ef16SKarl Rupp   if (left) {
190e4a0ef16SKarl Rupp     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
191e4a0ef16SKarl Rupp     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
19233d57670SJed Brown     ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
193e4a0ef16SKarl Rupp     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
194e4a0ef16SKarl Rupp     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
195e4a0ef16SKarl Rupp   }
196e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
197e4a0ef16SKarl Rupp }
198e4a0ef16SKarl Rupp 
199e4a0ef16SKarl Rupp #undef __FUNCT__
200e4a0ef16SKarl Rupp #define __FUNCT__ "MatMult_SeqAIJViennaCL"
201e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
202e4a0ef16SKarl Rupp {
203e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
204e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
205e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
2060d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL;
2070d73d530SKarl Rupp   ViennaCLVector       *ygpu=NULL;
208e4a0ef16SKarl Rupp 
209e4a0ef16SKarl Rupp   PetscFunctionBegin;
21067c87b7fSKarl Rupp   if (A->rmap->n > 0 && A->cmap->n > 0) {
211e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
212e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
213e4a0ef16SKarl Rupp     try {
214e4a0ef16SKarl Rupp       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
2154cf1874eSKarl Rupp       ViennaCLWaitForGPU();
2164076e183SKarl Rupp     } catch (std::exception const & ex) {
2174076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
218e4a0ef16SKarl Rupp     }
219e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
220e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
2219b66742cSDave May     ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
22267c87b7fSKarl Rupp   }
223e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
224e4a0ef16SKarl Rupp }
225e4a0ef16SKarl Rupp 
226e4a0ef16SKarl Rupp 
227e4a0ef16SKarl Rupp 
228e4a0ef16SKarl Rupp #undef __FUNCT__
229e4a0ef16SKarl Rupp #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
230e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
231e4a0ef16SKarl Rupp {
232e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
233e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
234e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
2350d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
2360d73d530SKarl Rupp   ViennaCLVector       *zgpu=NULL;
237e4a0ef16SKarl Rupp 
238e4a0ef16SKarl Rupp   PetscFunctionBegin;
23967c87b7fSKarl Rupp   if (A->rmap->n > 0 && A->cmap->n > 0) {
240e4a0ef16SKarl Rupp     try {
241e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
242e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
243e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
244e4a0ef16SKarl Rupp 
245e4a0ef16SKarl Rupp       if (a->compressedrow.use) {
246a3430c56SKarl Rupp         ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
247e4a0ef16SKarl Rupp         *zgpu = *ygpu + temp;
2484cf1874eSKarl Rupp         ViennaCLWaitForGPU();
249e4a0ef16SKarl Rupp       } else {
250a3430c56SKarl Rupp         if (zz == xx || zz == yy) { //temporary required
251a3430c56SKarl Rupp           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
252a3430c56SKarl Rupp           *zgpu = *ygpu;
253a3430c56SKarl Rupp           *zgpu += temp;
254a3430c56SKarl Rupp           ViennaCLWaitForGPU();
255a3430c56SKarl Rupp         } else {
256a3430c56SKarl Rupp           *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
257a3430c56SKarl Rupp           *zgpu = *ygpu + *viennaclstruct->tempvec;
2584cf1874eSKarl Rupp           ViennaCLWaitForGPU();
259e4a0ef16SKarl Rupp         }
260e4a0ef16SKarl Rupp       }
261e4a0ef16SKarl Rupp 
262e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
263e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
264e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
265e4a0ef16SKarl Rupp 
2664076e183SKarl Rupp     } catch(std::exception const & ex) {
2674076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
268e4a0ef16SKarl Rupp     }
269e4a0ef16SKarl Rupp     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
27067c87b7fSKarl Rupp   }
271e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
272e4a0ef16SKarl Rupp }
273e4a0ef16SKarl Rupp 
274e4a0ef16SKarl Rupp #undef __FUNCT__
275e4a0ef16SKarl Rupp #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
276e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
277e4a0ef16SKarl Rupp {
278e4a0ef16SKarl Rupp   PetscErrorCode ierr;
279e4a0ef16SKarl Rupp 
280e4a0ef16SKarl Rupp   PetscFunctionBegin;
281e4a0ef16SKarl Rupp   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
282e4a0ef16SKarl Rupp   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
283e4a0ef16SKarl Rupp   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
284e4a0ef16SKarl Rupp   A->ops->mult    = MatMult_SeqAIJViennaCL;
285e4a0ef16SKarl Rupp   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
286e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
287e4a0ef16SKarl Rupp }
288e4a0ef16SKarl Rupp 
289e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
290e4a0ef16SKarl Rupp #undef __FUNCT__
291e4a0ef16SKarl Rupp #define __FUNCT__ "MatCreateSeqAIJViennaCL"
292e4a0ef16SKarl Rupp /*@
293e4a0ef16SKarl Rupp    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
29419fddfadSKarl Rupp    (the default parallel PETSc format).  This matrix will ultimately be pushed down
295e4a0ef16SKarl Rupp    to GPUs and use the ViennaCL library for calculations. For good matrix
296e4a0ef16SKarl Rupp    assembly performance the user should preallocate the matrix storage by setting
297e4a0ef16SKarl Rupp    the parameter nz (or the array nnz).  By setting these parameters accurately,
298e4a0ef16SKarl Rupp    performance during matrix assembly can be increased substantially.
299e4a0ef16SKarl Rupp 
300e4a0ef16SKarl Rupp 
301e4a0ef16SKarl Rupp    Collective on MPI_Comm
302e4a0ef16SKarl Rupp 
303e4a0ef16SKarl Rupp    Input Parameters:
304e4a0ef16SKarl Rupp +  comm - MPI communicator, set to PETSC_COMM_SELF
305e4a0ef16SKarl Rupp .  m - number of rows
306e4a0ef16SKarl Rupp .  n - number of columns
307e4a0ef16SKarl Rupp .  nz - number of nonzeros per row (same for all rows)
308e4a0ef16SKarl Rupp -  nnz - array containing the number of nonzeros in the various rows
309e4a0ef16SKarl Rupp          (possibly different for each row) or NULL
310e4a0ef16SKarl Rupp 
311e4a0ef16SKarl Rupp    Output Parameter:
312e4a0ef16SKarl Rupp .  A - the matrix
313e4a0ef16SKarl Rupp 
314e4a0ef16SKarl Rupp    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
315e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
316e4a0ef16SKarl Rupp    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
317e4a0ef16SKarl Rupp 
318e4a0ef16SKarl Rupp    Notes:
319e4a0ef16SKarl Rupp    If nnz is given then nz is ignored
320e4a0ef16SKarl Rupp 
321e4a0ef16SKarl Rupp    The AIJ format (also called the Yale sparse matrix format or
322e4a0ef16SKarl Rupp    compressed row storage), is fully compatible with standard Fortran 77
323e4a0ef16SKarl Rupp    storage.  That is, the stored row and column indices can begin at
324e4a0ef16SKarl Rupp    either one (as in Fortran) or zero.  See the users' manual for details.
325e4a0ef16SKarl Rupp 
326e4a0ef16SKarl Rupp    Specify the preallocated storage with either nz or nnz (not both).
327e4a0ef16SKarl Rupp    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
328e4a0ef16SKarl Rupp    allocation.  For large problems you MUST preallocate memory or you
329e4a0ef16SKarl Rupp    will get TERRIBLE performance, see the users' manual chapter on matrices.
330e4a0ef16SKarl Rupp 
331e4a0ef16SKarl Rupp    Level: intermediate
332e4a0ef16SKarl Rupp 
333e4a0ef16SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
334e4a0ef16SKarl Rupp 
335e4a0ef16SKarl Rupp @*/
336e4a0ef16SKarl Rupp PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
337e4a0ef16SKarl Rupp {
338e4a0ef16SKarl Rupp   PetscErrorCode ierr;
339e4a0ef16SKarl Rupp 
340e4a0ef16SKarl Rupp   PetscFunctionBegin;
341e4a0ef16SKarl Rupp   ierr = MatCreate(comm,A);CHKERRQ(ierr);
342e4a0ef16SKarl Rupp   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
343e4a0ef16SKarl Rupp   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
344e4a0ef16SKarl Rupp   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
345e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
346e4a0ef16SKarl Rupp }
347e4a0ef16SKarl Rupp 
348e4a0ef16SKarl Rupp 
349e4a0ef16SKarl Rupp #undef __FUNCT__
350e4a0ef16SKarl Rupp #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
351e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
352e4a0ef16SKarl Rupp {
353e4a0ef16SKarl Rupp   PetscErrorCode ierr;
354e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
355e4a0ef16SKarl Rupp 
356e4a0ef16SKarl Rupp   PetscFunctionBegin;
357e4a0ef16SKarl Rupp   try {
3586447cd05SKarl Rupp     if (viennaclcontainer) {
3596447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3606447cd05SKarl Rupp       delete viennaclcontainer->mat;
3616447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
362e4a0ef16SKarl Rupp       delete viennaclcontainer;
3636447cd05SKarl Rupp     }
364e4a0ef16SKarl Rupp     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
3654076e183SKarl Rupp   } catch(std::exception const & ex) {
3664076e183SKarl Rupp     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
367e4a0ef16SKarl Rupp   }
368*8713a8baSPatrick Sanan 
369*8713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL);CHKERRQ(ierr);
370*8713a8baSPatrick Sanan 
371e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
372e4a0ef16SKarl Rupp   A->spptr = 0;
373e4a0ef16SKarl Rupp   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
374e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
375e4a0ef16SKarl Rupp }
376e4a0ef16SKarl Rupp 
377e4a0ef16SKarl Rupp 
378e4a0ef16SKarl Rupp #undef __FUNCT__
379e4a0ef16SKarl Rupp #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
380e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
381e4a0ef16SKarl Rupp {
382e4a0ef16SKarl Rupp   PetscErrorCode ierr;
383e4a0ef16SKarl Rupp 
384e4a0ef16SKarl Rupp   PetscFunctionBegin;
385e4a0ef16SKarl Rupp   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
386*8713a8baSPatrick Sanan   ierr = MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B);
387*8713a8baSPatrick Sanan   PetscFunctionReturn(0);
388*8713a8baSPatrick Sanan }
389*8713a8baSPatrick Sanan 
390*8713a8baSPatrick Sanan #undef __FUNCT__
391*8713a8baSPatrick Sanan #define __FUNCT__ "MatConvert_SeqAIJ_SeqAIJViennaCL"
392*8713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
393*8713a8baSPatrick Sanan {
394*8713a8baSPatrick Sanan   PetscErrorCode ierr;
395*8713a8baSPatrick Sanan   Mat            B;
396*8713a8baSPatrick Sanan   Mat_SeqAIJ     *aij;
397*8713a8baSPatrick Sanan 
398*8713a8baSPatrick Sanan   PetscFunctionBegin;
399*8713a8baSPatrick Sanan 
400*8713a8baSPatrick Sanan   if (reuse == MAT_REUSE_MATRIX) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
401*8713a8baSPatrick Sanan 
402*8713a8baSPatrick Sanan   if (reuse == MAT_INITIAL_MATRIX) {
403*8713a8baSPatrick Sanan     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
404*8713a8baSPatrick Sanan   }
405*8713a8baSPatrick Sanan 
406*8713a8baSPatrick Sanan   B = *newmat;
407*8713a8baSPatrick Sanan 
408e4a0ef16SKarl Rupp   aij             = (Mat_SeqAIJ*)B->data;
409e4a0ef16SKarl Rupp   aij->inode.use  = PETSC_FALSE;
410*8713a8baSPatrick Sanan 
411e4a0ef16SKarl Rupp   B->ops->mult    = MatMult_SeqAIJViennaCL;
412e4a0ef16SKarl Rupp   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
413e4a0ef16SKarl Rupp   B->spptr        = new Mat_SeqAIJViennaCL();
414e4a0ef16SKarl Rupp 
415a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
416a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
417a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
418e4a0ef16SKarl Rupp 
419e4a0ef16SKarl Rupp   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
420e4a0ef16SKarl Rupp   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
4212a7a6963SBarry Smith   B->ops->getvecs        = MatCreateVecs_SeqAIJViennaCL;
422e4a0ef16SKarl Rupp 
423e4a0ef16SKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
424e4a0ef16SKarl Rupp 
425*8713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL);CHKERRQ(ierr);
426*8713a8baSPatrick Sanan 
427e4a0ef16SKarl Rupp   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
428*8713a8baSPatrick Sanan 
429*8713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
430*8713a8baSPatrick Sanan   if (B->assembled) {
431*8713a8baSPatrick Sanan     ierr = MatViennaCLCopyToGPU(B);CHKERRQ(ierr);
432*8713a8baSPatrick Sanan   }
433*8713a8baSPatrick Sanan 
434e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
435e4a0ef16SKarl Rupp }
436e4a0ef16SKarl Rupp 
437e4a0ef16SKarl Rupp 
438e4a0ef16SKarl Rupp /*M
439e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
440e4a0ef16SKarl Rupp 
441e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
442e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
443e4a0ef16SKarl Rupp 
444e4a0ef16SKarl Rupp    Options Database Keys:
445e4a0ef16SKarl Rupp +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
446e4a0ef16SKarl Rupp .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
447e4a0ef16SKarl Rupp -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
448e4a0ef16SKarl Rupp 
449e4a0ef16SKarl Rupp   Level: beginner
450e4a0ef16SKarl Rupp 
451e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
452e4a0ef16SKarl Rupp M*/
453e4a0ef16SKarl Rupp 
454