xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 9b66742c314cdd2458a88c3541cde11281b8f067)
1e4a0ef16SKarl Rupp 
2e4a0ef16SKarl Rupp 
3e4a0ef16SKarl Rupp /*
4e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
5e4a0ef16SKarl Rupp   matrix storage format.
6e4a0ef16SKarl Rupp */
7e4a0ef16SKarl Rupp 
8aaa7dc30SBarry Smith #include <petscconf.h>
9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10aaa7dc30SBarry Smith #include <petscbt.h>
11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
13e4a0ef16SKarl Rupp 
14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15e4a0ef16SKarl Rupp 
16e4a0ef16SKarl Rupp 
17e4a0ef16SKarl Rupp #include <algorithm>
18e4a0ef16SKarl Rupp #include <vector>
19e4a0ef16SKarl Rupp #include <string>
20e4a0ef16SKarl Rupp 
21e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
22e4a0ef16SKarl Rupp 
23e4a0ef16SKarl Rupp #undef __FUNCT__
24e4a0ef16SKarl Rupp #define __FUNCT__ "MatViennaCLCopyToGPU"
25e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26e4a0ef16SKarl Rupp {
27e4a0ef16SKarl Rupp 
28e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
31e4a0ef16SKarl Rupp 
32e4a0ef16SKarl Rupp 
33e4a0ef16SKarl Rupp   PetscFunctionBegin;
3467c87b7fSKarl Rupp   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
35e4a0ef16SKarl Rupp     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
36e4a0ef16SKarl Rupp       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
37e4a0ef16SKarl Rupp 
38e4a0ef16SKarl Rupp       try {
3949cfb1d6SBarry Smith         ierr = PetscObjectViennaCLSetFromOptions((PetscObject)A);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */
40e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
41a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
42e4a0ef16SKarl Rupp 
43a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
44a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
45e4a0ef16SKarl Rupp 
46a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
47a3430c56SKarl Rupp           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
48a3430c56SKarl Rupp             row_buffer.set(i, (a->compressedrow.i)[i]);
49e4a0ef16SKarl Rupp 
50a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
51a3430c56SKarl Rupp           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
52a3430c56SKarl Rupp             row_indices.set(i, (a->compressedrow.rindex)[i]);
53a3430c56SKarl Rupp 
54a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
55a3430c56SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
56a3430c56SKarl Rupp             col_buffer.set(i, (a->j)[i]);
57a3430c56SKarl Rupp 
58a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
59e4a0ef16SKarl Rupp         } else {
60a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
61e4a0ef16SKarl Rupp 
62e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
63e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
64e4a0ef16SKarl Rupp 
65e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
66e4a0ef16SKarl Rupp           for (PetscInt i=0; i<=A->rmap->n; ++i)
67e4a0ef16SKarl Rupp             row_buffer.set(i, (a->i)[i]);
68e4a0ef16SKarl Rupp 
69e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
70e4a0ef16SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
71e4a0ef16SKarl Rupp             col_buffer.set(i, (a->j)[i]);
72e4a0ef16SKarl Rupp 
73e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
74e4a0ef16SKarl Rupp         }
754cf1874eSKarl Rupp         ViennaCLWaitForGPU();
764076e183SKarl Rupp       } catch(std::exception const & ex) {
774076e183SKarl Rupp         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
78e4a0ef16SKarl Rupp       }
79e4a0ef16SKarl Rupp 
80a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
81a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
82*9b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
83a3430c56SKarl Rupp           delete (ViennaCLVector*)viennaclstruct->tempvec;
84*9b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
85a3430c56SKarl Rupp         } else {
86a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
87a3430c56SKarl Rupp         }
88a3430c56SKarl Rupp       } else {
89*9b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
90a3430c56SKarl Rupp       }
91a3430c56SKarl Rupp 
92e4a0ef16SKarl Rupp       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
93e4a0ef16SKarl Rupp 
94e4a0ef16SKarl Rupp       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
95e4a0ef16SKarl Rupp     }
9667c87b7fSKarl Rupp   }
97e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
98e4a0ef16SKarl Rupp }
99e4a0ef16SKarl Rupp 
100e4a0ef16SKarl Rupp #undef __FUNCT__
101e4a0ef16SKarl Rupp #define __FUNCT__ "MatViennaCLCopyFromGPU"
1020d73d530SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
103e4a0ef16SKarl Rupp {
104e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
105e4a0ef16SKarl Rupp   PetscInt           m               = A->rmap->n;
106e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
107e4a0ef16SKarl Rupp 
108e4a0ef16SKarl Rupp 
109e4a0ef16SKarl Rupp   PetscFunctionBegin;
110e4a0ef16SKarl Rupp   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
111e4a0ef16SKarl Rupp     try {
1126c4ed002SBarry Smith       if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1136c4ed002SBarry Smith       else {
114e4a0ef16SKarl Rupp 
115e4a0ef16SKarl Rupp         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
116e4a0ef16SKarl Rupp         a->nz           = Agpu->nnz();
117e4a0ef16SKarl Rupp         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
118e4a0ef16SKarl Rupp         A->preallocated = PETSC_TRUE;
119e4a0ef16SKarl Rupp         if (a->singlemalloc) {
120e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
121e4a0ef16SKarl Rupp         } else {
122e4a0ef16SKarl Rupp           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
123e4a0ef16SKarl Rupp           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
124e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
125e4a0ef16SKarl Rupp         }
126dcca6d9dSJed Brown         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
127f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
128e4a0ef16SKarl Rupp 
129e4a0ef16SKarl Rupp         a->singlemalloc = PETSC_TRUE;
130e4a0ef16SKarl Rupp 
131e4a0ef16SKarl Rupp         /* Setup row lengths */
132e4a0ef16SKarl Rupp         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
133dcca6d9dSJed Brown         ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr);
134f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
135e4a0ef16SKarl Rupp 
136e4a0ef16SKarl Rupp         /* Copy data back from GPU */
137e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
138e4a0ef16SKarl Rupp 
139e4a0ef16SKarl Rupp         // copy row array
140e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
141e4a0ef16SKarl Rupp         (a->i)[0] = row_buffer[0];
142e4a0ef16SKarl Rupp         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
143e4a0ef16SKarl Rupp           (a->i)[i+1] = row_buffer[i+1];
144e4a0ef16SKarl Rupp           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
145e4a0ef16SKarl Rupp         }
146e4a0ef16SKarl Rupp 
147e4a0ef16SKarl Rupp         // copy column indices
148e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
149e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
150e4a0ef16SKarl Rupp         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
151e4a0ef16SKarl Rupp           (a->j)[i] = col_buffer[i];
152e4a0ef16SKarl Rupp 
153e4a0ef16SKarl Rupp         // copy nonzero entries directly to destination (no conversion required)
154e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
155e4a0ef16SKarl Rupp 
1564cf1874eSKarl Rupp         ViennaCLWaitForGPU();
157023073b3SKarl Rupp         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
158e4a0ef16SKarl Rupp       }
1594076e183SKarl Rupp     } catch(std::exception const & ex) {
1604076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
161e4a0ef16SKarl Rupp     }
162e4a0ef16SKarl Rupp 
163e4a0ef16SKarl Rupp     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
164e4a0ef16SKarl Rupp     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
165e4a0ef16SKarl Rupp     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
166e4a0ef16SKarl Rupp 
167e4a0ef16SKarl Rupp     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
1686c4ed002SBarry Smith   } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
169e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
170e4a0ef16SKarl Rupp }
171e4a0ef16SKarl Rupp 
172e4a0ef16SKarl Rupp #undef __FUNCT__
1732a7a6963SBarry Smith #define __FUNCT__ "MatCreateVecs_SeqAIJViennaCL"
1742a7a6963SBarry Smith PetscErrorCode MatCreateVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
175e4a0ef16SKarl Rupp {
176e4a0ef16SKarl Rupp   PetscErrorCode ierr;
17733d57670SJed Brown   PetscInt rbs,cbs;
178e4a0ef16SKarl Rupp 
179e4a0ef16SKarl Rupp   PetscFunctionBegin;
18033d57670SJed Brown   ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
181e4a0ef16SKarl Rupp   if (right) {
182e4a0ef16SKarl Rupp     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
183e4a0ef16SKarl Rupp     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
18433d57670SJed Brown     ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
185e4a0ef16SKarl Rupp     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
186e4a0ef16SKarl Rupp     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
187e4a0ef16SKarl Rupp   }
188e4a0ef16SKarl Rupp   if (left) {
189e4a0ef16SKarl Rupp     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
190e4a0ef16SKarl Rupp     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
19133d57670SJed Brown     ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
192e4a0ef16SKarl Rupp     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
193e4a0ef16SKarl Rupp     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
194e4a0ef16SKarl Rupp   }
195e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
196e4a0ef16SKarl Rupp }
197e4a0ef16SKarl Rupp 
198e4a0ef16SKarl Rupp #undef __FUNCT__
199e4a0ef16SKarl Rupp #define __FUNCT__ "MatMult_SeqAIJViennaCL"
200e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
201e4a0ef16SKarl Rupp {
202e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
203e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
204e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
2050d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL;
2060d73d530SKarl Rupp   ViennaCLVector       *ygpu=NULL;
207e4a0ef16SKarl Rupp 
208e4a0ef16SKarl Rupp   PetscFunctionBegin;
20967c87b7fSKarl Rupp   if (A->rmap->n > 0 && A->cmap->n > 0) {
210e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
211e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
212e4a0ef16SKarl Rupp     try {
213e4a0ef16SKarl Rupp       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
2144cf1874eSKarl Rupp       ViennaCLWaitForGPU();
2154076e183SKarl Rupp     } catch (std::exception const & ex) {
2164076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
217e4a0ef16SKarl Rupp     }
218e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
219e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
220*9b66742cSDave May     ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
22167c87b7fSKarl Rupp   }
222e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
223e4a0ef16SKarl Rupp }
224e4a0ef16SKarl Rupp 
225e4a0ef16SKarl Rupp 
226e4a0ef16SKarl Rupp 
227e4a0ef16SKarl Rupp #undef __FUNCT__
228e4a0ef16SKarl Rupp #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
229e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
230e4a0ef16SKarl Rupp {
231e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
232e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
233e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
2340d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
2350d73d530SKarl Rupp   ViennaCLVector       *zgpu=NULL;
236e4a0ef16SKarl Rupp 
237e4a0ef16SKarl Rupp   PetscFunctionBegin;
23867c87b7fSKarl Rupp   if (A->rmap->n > 0 && A->cmap->n > 0) {
239e4a0ef16SKarl Rupp     try {
240e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
241e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
242e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
243e4a0ef16SKarl Rupp 
244e4a0ef16SKarl Rupp       if (a->compressedrow.use) {
245a3430c56SKarl Rupp         ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
246e4a0ef16SKarl Rupp         *zgpu = *ygpu + temp;
2474cf1874eSKarl Rupp         ViennaCLWaitForGPU();
248e4a0ef16SKarl Rupp       } else {
249a3430c56SKarl Rupp         if (zz == xx || zz == yy) { //temporary required
250a3430c56SKarl Rupp           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
251a3430c56SKarl Rupp           *zgpu = *ygpu;
252a3430c56SKarl Rupp           *zgpu += temp;
253a3430c56SKarl Rupp           ViennaCLWaitForGPU();
254a3430c56SKarl Rupp         } else {
255a3430c56SKarl Rupp           *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
256a3430c56SKarl Rupp           *zgpu = *ygpu + *viennaclstruct->tempvec;
2574cf1874eSKarl Rupp           ViennaCLWaitForGPU();
258e4a0ef16SKarl Rupp         }
259e4a0ef16SKarl Rupp       }
260e4a0ef16SKarl Rupp 
261e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
262e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
263e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
264e4a0ef16SKarl Rupp 
2654076e183SKarl Rupp     } catch(std::exception const & ex) {
2664076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
267e4a0ef16SKarl Rupp     }
268e4a0ef16SKarl Rupp     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
26967c87b7fSKarl Rupp   }
270e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
271e4a0ef16SKarl Rupp }
272e4a0ef16SKarl Rupp 
273e4a0ef16SKarl Rupp #undef __FUNCT__
274e4a0ef16SKarl Rupp #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
275e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
276e4a0ef16SKarl Rupp {
277e4a0ef16SKarl Rupp   PetscErrorCode ierr;
278e4a0ef16SKarl Rupp 
279e4a0ef16SKarl Rupp   PetscFunctionBegin;
280e4a0ef16SKarl Rupp   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
281e4a0ef16SKarl Rupp   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
282e4a0ef16SKarl Rupp   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
283e4a0ef16SKarl Rupp   A->ops->mult    = MatMult_SeqAIJViennaCL;
284e4a0ef16SKarl Rupp   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
285e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
286e4a0ef16SKarl Rupp }
287e4a0ef16SKarl Rupp 
288e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
289e4a0ef16SKarl Rupp #undef __FUNCT__
290e4a0ef16SKarl Rupp #define __FUNCT__ "MatCreateSeqAIJViennaCL"
291e4a0ef16SKarl Rupp /*@
292e4a0ef16SKarl Rupp    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
29319fddfadSKarl Rupp    (the default parallel PETSc format).  This matrix will ultimately be pushed down
294e4a0ef16SKarl Rupp    to GPUs and use the ViennaCL library for calculations. For good matrix
295e4a0ef16SKarl Rupp    assembly performance the user should preallocate the matrix storage by setting
296e4a0ef16SKarl Rupp    the parameter nz (or the array nnz).  By setting these parameters accurately,
297e4a0ef16SKarl Rupp    performance during matrix assembly can be increased substantially.
298e4a0ef16SKarl Rupp 
299e4a0ef16SKarl Rupp 
300e4a0ef16SKarl Rupp    Collective on MPI_Comm
301e4a0ef16SKarl Rupp 
302e4a0ef16SKarl Rupp    Input Parameters:
303e4a0ef16SKarl Rupp +  comm - MPI communicator, set to PETSC_COMM_SELF
304e4a0ef16SKarl Rupp .  m - number of rows
305e4a0ef16SKarl Rupp .  n - number of columns
306e4a0ef16SKarl Rupp .  nz - number of nonzeros per row (same for all rows)
307e4a0ef16SKarl Rupp -  nnz - array containing the number of nonzeros in the various rows
308e4a0ef16SKarl Rupp          (possibly different for each row) or NULL
309e4a0ef16SKarl Rupp 
310e4a0ef16SKarl Rupp    Output Parameter:
311e4a0ef16SKarl Rupp .  A - the matrix
312e4a0ef16SKarl Rupp 
313e4a0ef16SKarl Rupp    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
314e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
315e4a0ef16SKarl Rupp    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
316e4a0ef16SKarl Rupp 
317e4a0ef16SKarl Rupp    Notes:
318e4a0ef16SKarl Rupp    If nnz is given then nz is ignored
319e4a0ef16SKarl Rupp 
320e4a0ef16SKarl Rupp    The AIJ format (also called the Yale sparse matrix format or
321e4a0ef16SKarl Rupp    compressed row storage), is fully compatible with standard Fortran 77
322e4a0ef16SKarl Rupp    storage.  That is, the stored row and column indices can begin at
323e4a0ef16SKarl Rupp    either one (as in Fortran) or zero.  See the users' manual for details.
324e4a0ef16SKarl Rupp 
325e4a0ef16SKarl Rupp    Specify the preallocated storage with either nz or nnz (not both).
326e4a0ef16SKarl Rupp    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
327e4a0ef16SKarl Rupp    allocation.  For large problems you MUST preallocate memory or you
328e4a0ef16SKarl Rupp    will get TERRIBLE performance, see the users' manual chapter on matrices.
329e4a0ef16SKarl Rupp 
330e4a0ef16SKarl Rupp    Level: intermediate
331e4a0ef16SKarl Rupp 
332e4a0ef16SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
333e4a0ef16SKarl Rupp 
334e4a0ef16SKarl Rupp @*/
335e4a0ef16SKarl Rupp PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
336e4a0ef16SKarl Rupp {
337e4a0ef16SKarl Rupp   PetscErrorCode ierr;
338e4a0ef16SKarl Rupp 
339e4a0ef16SKarl Rupp   PetscFunctionBegin;
340e4a0ef16SKarl Rupp   ierr = MatCreate(comm,A);CHKERRQ(ierr);
341e4a0ef16SKarl Rupp   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
342e4a0ef16SKarl Rupp   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
343e4a0ef16SKarl Rupp   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
344e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
345e4a0ef16SKarl Rupp }
346e4a0ef16SKarl Rupp 
347e4a0ef16SKarl Rupp 
348e4a0ef16SKarl Rupp #undef __FUNCT__
349e4a0ef16SKarl Rupp #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
350e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
351e4a0ef16SKarl Rupp {
352e4a0ef16SKarl Rupp   PetscErrorCode ierr;
353e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
354e4a0ef16SKarl Rupp 
355e4a0ef16SKarl Rupp   PetscFunctionBegin;
356e4a0ef16SKarl Rupp   try {
3576447cd05SKarl Rupp     if (viennaclcontainer) {
3586447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3596447cd05SKarl Rupp       delete viennaclcontainer->mat;
3606447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
361e4a0ef16SKarl Rupp       delete viennaclcontainer;
3626447cd05SKarl Rupp     }
363e4a0ef16SKarl Rupp     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
3644076e183SKarl Rupp   } catch(std::exception const & ex) {
3654076e183SKarl Rupp     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
366e4a0ef16SKarl Rupp   }
367e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
368e4a0ef16SKarl Rupp   A->spptr = 0;
369e4a0ef16SKarl Rupp   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
370e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
371e4a0ef16SKarl Rupp }
372e4a0ef16SKarl Rupp 
373e4a0ef16SKarl Rupp 
374e4a0ef16SKarl Rupp #undef __FUNCT__
375e4a0ef16SKarl Rupp #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
376e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
377e4a0ef16SKarl Rupp {
378e4a0ef16SKarl Rupp   PetscErrorCode ierr;
379e4a0ef16SKarl Rupp   Mat_SeqAIJ     *aij;
380e4a0ef16SKarl Rupp 
381e4a0ef16SKarl Rupp   PetscFunctionBegin;
382e4a0ef16SKarl Rupp   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
383e4a0ef16SKarl Rupp   aij             = (Mat_SeqAIJ*)B->data;
384e4a0ef16SKarl Rupp   aij->inode.use  = PETSC_FALSE;
385e4a0ef16SKarl Rupp   B->ops->mult    = MatMult_SeqAIJViennaCL;
386e4a0ef16SKarl Rupp   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
387e4a0ef16SKarl Rupp   B->spptr        = new Mat_SeqAIJViennaCL();
388e4a0ef16SKarl Rupp 
389a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
390a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
391a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
392e4a0ef16SKarl Rupp 
393e4a0ef16SKarl Rupp   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
394e4a0ef16SKarl Rupp   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
3952a7a6963SBarry Smith   B->ops->getvecs        = MatCreateVecs_SeqAIJViennaCL;
396e4a0ef16SKarl Rupp 
397e4a0ef16SKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
398e4a0ef16SKarl Rupp 
399e4a0ef16SKarl Rupp   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
400e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
401e4a0ef16SKarl Rupp }
402e4a0ef16SKarl Rupp 
403e4a0ef16SKarl Rupp 
404e4a0ef16SKarl Rupp /*M
405e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
406e4a0ef16SKarl Rupp 
407e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
408e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
409e4a0ef16SKarl Rupp 
410e4a0ef16SKarl Rupp    Options Database Keys:
411e4a0ef16SKarl Rupp +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
412e4a0ef16SKarl Rupp .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
413e4a0ef16SKarl Rupp -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
414e4a0ef16SKarl Rupp 
415e4a0ef16SKarl Rupp   Level: beginner
416e4a0ef16SKarl Rupp 
417e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
418e4a0ef16SKarl Rupp M*/
419e4a0ef16SKarl Rupp 
420