xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 92f9df4af59fc9c2c05ab7f766ed9d946e985d8a)
1e4a0ef16SKarl Rupp 
2e4a0ef16SKarl Rupp 
3e4a0ef16SKarl Rupp /*
4e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
5e4a0ef16SKarl Rupp   matrix storage format.
6e4a0ef16SKarl Rupp */
7e4a0ef16SKarl Rupp 
8aaa7dc30SBarry Smith #include <petscconf.h>
999acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
10aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11aaa7dc30SBarry Smith #include <petscbt.h>
12aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
14e4a0ef16SKarl Rupp 
15aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
16e4a0ef16SKarl Rupp 
17e4a0ef16SKarl Rupp 
18e4a0ef16SKarl Rupp #include <algorithm>
19e4a0ef16SKarl Rupp #include <vector>
20e4a0ef16SKarl Rupp #include <string>
21e4a0ef16SKarl Rupp 
22e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
23e4a0ef16SKarl Rupp 
248713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
2572367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
2672367587SKarl Rupp 
278713a8baSPatrick Sanan 
28e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A)
29e4a0ef16SKarl Rupp {
30e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
31e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
32e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
33e4a0ef16SKarl Rupp 
34e4a0ef16SKarl Rupp   PetscFunctionBegin;
35bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
36c70f7ee4SJunchao Zhang     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
37e4a0ef16SKarl Rupp       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
38e4a0ef16SKarl Rupp 
39e4a0ef16SKarl Rupp       try {
40e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
41a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
42e4a0ef16SKarl Rupp 
43a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
44a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
45e4a0ef16SKarl Rupp 
46a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
47a3430c56SKarl Rupp           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
48a3430c56SKarl Rupp             row_buffer.set(i, (a->compressedrow.i)[i]);
49e4a0ef16SKarl Rupp 
50a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
51a3430c56SKarl Rupp           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
52a3430c56SKarl Rupp             row_indices.set(i, (a->compressedrow.rindex)[i]);
53a3430c56SKarl Rupp 
54a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
55a3430c56SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
56a3430c56SKarl Rupp             col_buffer.set(i, (a->j)[i]);
57a3430c56SKarl Rupp 
58a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
594863603aSSatish Balay           ierr = PetscLogCpuToGpu(((2*a->compressedrow.nrows)+1+a->nz)*sizeof(PetscInt) + (a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
60e4a0ef16SKarl Rupp         } else {
61a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
62e4a0ef16SKarl Rupp 
63e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
64e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
65e4a0ef16SKarl Rupp 
66e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
67e4a0ef16SKarl Rupp           for (PetscInt i=0; i<=A->rmap->n; ++i)
68e4a0ef16SKarl Rupp             row_buffer.set(i, (a->i)[i]);
69e4a0ef16SKarl Rupp 
70e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
71e4a0ef16SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
72e4a0ef16SKarl Rupp             col_buffer.set(i, (a->j)[i]);
73e4a0ef16SKarl Rupp 
74e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
754863603aSSatish Balay           ierr = PetscLogCpuToGpu(((A->rmap->n+1)+a->nz)*sizeof(PetscInt)+(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
76e4a0ef16SKarl Rupp         }
774cf1874eSKarl Rupp         ViennaCLWaitForGPU();
784076e183SKarl Rupp       } catch(std::exception const & ex) {
794076e183SKarl Rupp         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
80e4a0ef16SKarl Rupp       }
81e4a0ef16SKarl Rupp 
82a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
83a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
849b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
85a3430c56SKarl Rupp           delete (ViennaCLVector*)viennaclstruct->tempvec;
869b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
87a3430c56SKarl Rupp         } else {
88a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
89a3430c56SKarl Rupp         }
90a3430c56SKarl Rupp       } else {
919b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
92a3430c56SKarl Rupp       }
93a3430c56SKarl Rupp 
94c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
95e4a0ef16SKarl Rupp 
96e4a0ef16SKarl Rupp       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
97e4a0ef16SKarl Rupp     }
9867c87b7fSKarl Rupp   }
99e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
100e4a0ef16SKarl Rupp }
101e4a0ef16SKarl Rupp 
1020d73d530SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
103e4a0ef16SKarl Rupp {
104f38c1e66SStefano Zampini   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
105e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
106e4a0ef16SKarl Rupp   PetscInt           m  = A->rmap->n;
107e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
108e4a0ef16SKarl Rupp 
109e4a0ef16SKarl Rupp   PetscFunctionBegin;
110c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(0);
111c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) {
112e4a0ef16SKarl Rupp     try {
1136c4ed002SBarry Smith       if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1146c4ed002SBarry Smith       else {
115e4a0ef16SKarl Rupp 
116e4a0ef16SKarl Rupp         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
117e4a0ef16SKarl Rupp         a->nz           = Agpu->nnz();
118e4a0ef16SKarl Rupp         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
119e4a0ef16SKarl Rupp         A->preallocated = PETSC_TRUE;
120e4a0ef16SKarl Rupp         if (a->singlemalloc) {
121e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
122e4a0ef16SKarl Rupp         } else {
123e4a0ef16SKarl Rupp           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
124e4a0ef16SKarl Rupp           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
125e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
126e4a0ef16SKarl Rupp         }
127dcca6d9dSJed Brown         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
128f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
129e4a0ef16SKarl Rupp 
130e4a0ef16SKarl Rupp         a->singlemalloc = PETSC_TRUE;
131e4a0ef16SKarl Rupp 
132e4a0ef16SKarl Rupp         /* Setup row lengths */
133071fcb05SBarry Smith         ierr = PetscFree(a->imax);CHKERRQ(ierr);
134071fcb05SBarry Smith         ierr = PetscFree(a->ilen);CHKERRQ(ierr);
135071fcb05SBarry Smith         ierr = PetscMalloc1(m,&a->imax);CHKERRQ(ierr);
136071fcb05SBarry Smith         ierr = PetscMalloc1(m,&a->ilen);CHKERRQ(ierr);
137f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
138e4a0ef16SKarl Rupp 
139e4a0ef16SKarl Rupp         /* Copy data back from GPU */
140e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
141e4a0ef16SKarl Rupp 
142e4a0ef16SKarl Rupp         // copy row array
143e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
144e4a0ef16SKarl Rupp         (a->i)[0] = row_buffer[0];
145e4a0ef16SKarl Rupp         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
146e4a0ef16SKarl Rupp           (a->i)[i+1] = row_buffer[i+1];
147e4a0ef16SKarl Rupp           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
148e4a0ef16SKarl Rupp         }
149e4a0ef16SKarl Rupp 
150e4a0ef16SKarl Rupp         // copy column indices
151e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
152e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
153e4a0ef16SKarl Rupp         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
154e4a0ef16SKarl Rupp           (a->j)[i] = col_buffer[i];
155e4a0ef16SKarl Rupp 
156e4a0ef16SKarl Rupp         // copy nonzero entries directly to destination (no conversion required)
157e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
158e4a0ef16SKarl Rupp 
1594863603aSSatish Balay         ierr = PetscLogGpuToCpu(row_buffer.raw_size()+col_buffer.raw_size()+(Agpu->nnz()*sizeof(PetscScalar)));CHKERRQ(ierr);
1604cf1874eSKarl Rupp         ViennaCLWaitForGPU();
161023073b3SKarl Rupp         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
162e4a0ef16SKarl Rupp       }
1634076e183SKarl Rupp     } catch(std::exception const & ex) {
1644076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
165e4a0ef16SKarl Rupp     }
166c70f7ee4SJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
167f38c1e66SStefano Zampini     PetscFunctionReturn(0);
168f38c1e66SStefano Zampini   } else {
169c70f7ee4SJunchao Zhang     if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
170e4a0ef16SKarl Rupp 
171f38c1e66SStefano Zampini     if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
172f38c1e66SStefano Zampini     if (!Agpu) {
173f38c1e66SStefano Zampini       viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar)*viennaclstruct->mat->nnz(), a->a);
174f38c1e66SStefano Zampini     } else {
175f38c1e66SStefano Zampini       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
176f38c1e66SStefano Zampini     }
177f38c1e66SStefano Zampini   }
178c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
179b8ced49eSKarl Rupp   /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
180e4a0ef16SKarl Rupp   ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
181e4a0ef16SKarl Rupp   ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
182e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
183e4a0ef16SKarl Rupp }
184e4a0ef16SKarl Rupp 
185e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
186e4a0ef16SKarl Rupp {
187e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
188e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
189e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
1900d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL;
1910d73d530SKarl Rupp   ViennaCLVector       *ygpu=NULL;
192e4a0ef16SKarl Rupp 
193e4a0ef16SKarl Rupp   PetscFunctionBegin;
194837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
195837a59e1SRichard Tran Mills   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
196bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
197e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
198e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
1997a052e47Shannah_mairs     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
200e4a0ef16SKarl Rupp     try {
201b7832b47SStefano Zampini       if (a->compressedrow.use) {
202b7832b47SStefano Zampini         *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
203b7832b47SStefano Zampini       } else {
204e4a0ef16SKarl Rupp         *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
205b7832b47SStefano Zampini       }
2064cf1874eSKarl Rupp       ViennaCLWaitForGPU();
2074076e183SKarl Rupp     } catch (std::exception const & ex) {
2084076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
209e4a0ef16SKarl Rupp     }
210958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
211e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
212e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
213958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
214bf1781e8SStefano Zampini   } else {
215bf1781e8SStefano Zampini     ierr = VecSet(yy,0);CHKERRQ(ierr);
21667c87b7fSKarl Rupp   }
217e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
218e4a0ef16SKarl Rupp }
219e4a0ef16SKarl Rupp 
220e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
221e4a0ef16SKarl Rupp {
222e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
223e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
224e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
2250d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
2260d73d530SKarl Rupp   ViennaCLVector       *zgpu=NULL;
227e4a0ef16SKarl Rupp 
228e4a0ef16SKarl Rupp   PetscFunctionBegin;
229837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
230837a59e1SRichard Tran Mills   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
231bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
232e4a0ef16SKarl Rupp     try {
233e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
234e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
235e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
2367a052e47Shannah_mairs       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
237f38c1e66SStefano Zampini       if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
238f38c1e66SStefano Zampini       else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
239f38c1e66SStefano Zampini       if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec;
240f38c1e66SStefano Zampini       else *zgpu += *viennaclstruct->tempvec;
2414cf1874eSKarl Rupp       ViennaCLWaitForGPU();
242958c4211Shannah_mairs       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
243e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
244e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
245e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
246e4a0ef16SKarl Rupp 
2474076e183SKarl Rupp     } catch(std::exception const & ex) {
2484076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
249e4a0ef16SKarl Rupp     }
250958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
251bf1781e8SStefano Zampini   } else {
252bf1781e8SStefano Zampini     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
25367c87b7fSKarl Rupp   }
254e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
255e4a0ef16SKarl Rupp }
256e4a0ef16SKarl Rupp 
257e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
258e4a0ef16SKarl Rupp {
259e4a0ef16SKarl Rupp   PetscErrorCode ierr;
260e4a0ef16SKarl Rupp 
261e4a0ef16SKarl Rupp   PetscFunctionBegin;
262e4a0ef16SKarl Rupp   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
263489de41dSStefano Zampini   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
264b470e4b4SRichard Tran Mills   if (!A->boundtocpu) {
265e4a0ef16SKarl Rupp     ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
266e7e92044SBarry Smith   }
267e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
268e4a0ef16SKarl Rupp }
269e4a0ef16SKarl Rupp 
270e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
271cab5ea25SPierre Jolivet /*@C
272e4a0ef16SKarl Rupp    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
27319fddfadSKarl Rupp    (the default parallel PETSc format).  This matrix will ultimately be pushed down
274e4a0ef16SKarl Rupp    to GPUs and use the ViennaCL library for calculations. For good matrix
275e4a0ef16SKarl Rupp    assembly performance the user should preallocate the matrix storage by setting
276e4a0ef16SKarl Rupp    the parameter nz (or the array nnz).  By setting these parameters accurately,
277e4a0ef16SKarl Rupp    performance during matrix assembly can be increased substantially.
278e4a0ef16SKarl Rupp 
279e4a0ef16SKarl Rupp 
280d083f849SBarry Smith    Collective
281e4a0ef16SKarl Rupp 
282e4a0ef16SKarl Rupp    Input Parameters:
283e4a0ef16SKarl Rupp +  comm - MPI communicator, set to PETSC_COMM_SELF
284e4a0ef16SKarl Rupp .  m - number of rows
285e4a0ef16SKarl Rupp .  n - number of columns
286e4a0ef16SKarl Rupp .  nz - number of nonzeros per row (same for all rows)
287e4a0ef16SKarl Rupp -  nnz - array containing the number of nonzeros in the various rows
288e4a0ef16SKarl Rupp          (possibly different for each row) or NULL
289e4a0ef16SKarl Rupp 
290e4a0ef16SKarl Rupp    Output Parameter:
291e4a0ef16SKarl Rupp .  A - the matrix
292e4a0ef16SKarl Rupp 
293e4a0ef16SKarl Rupp    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
294e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
295e4a0ef16SKarl Rupp    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
296e4a0ef16SKarl Rupp 
297e4a0ef16SKarl Rupp    Notes:
298e4a0ef16SKarl Rupp    If nnz is given then nz is ignored
299e4a0ef16SKarl Rupp 
300e4a0ef16SKarl Rupp    The AIJ format (also called the Yale sparse matrix format or
301e4a0ef16SKarl Rupp    compressed row storage), is fully compatible with standard Fortran 77
302e4a0ef16SKarl Rupp    storage.  That is, the stored row and column indices can begin at
303e4a0ef16SKarl Rupp    either one (as in Fortran) or zero.  See the users' manual for details.
304e4a0ef16SKarl Rupp 
305e4a0ef16SKarl Rupp    Specify the preallocated storage with either nz or nnz (not both).
306e4a0ef16SKarl Rupp    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
307e4a0ef16SKarl Rupp    allocation.  For large problems you MUST preallocate memory or you
308e4a0ef16SKarl Rupp    will get TERRIBLE performance, see the users' manual chapter on matrices.
309e4a0ef16SKarl Rupp 
310e4a0ef16SKarl Rupp    Level: intermediate
311e4a0ef16SKarl Rupp 
312e9e886b6SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSPARSE(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
313e4a0ef16SKarl Rupp 
314e4a0ef16SKarl Rupp @*/
315e4a0ef16SKarl Rupp PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
316e4a0ef16SKarl Rupp {
317e4a0ef16SKarl Rupp   PetscErrorCode ierr;
318e4a0ef16SKarl Rupp 
319e4a0ef16SKarl Rupp   PetscFunctionBegin;
320e4a0ef16SKarl Rupp   ierr = MatCreate(comm,A);CHKERRQ(ierr);
321e4a0ef16SKarl Rupp   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
322e4a0ef16SKarl Rupp   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
323e4a0ef16SKarl Rupp   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
324e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
325e4a0ef16SKarl Rupp }
326e4a0ef16SKarl Rupp 
327e4a0ef16SKarl Rupp 
328e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
329e4a0ef16SKarl Rupp {
330e4a0ef16SKarl Rupp   PetscErrorCode ierr;
331e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
332e4a0ef16SKarl Rupp 
333e4a0ef16SKarl Rupp   PetscFunctionBegin;
334e4a0ef16SKarl Rupp   try {
3356447cd05SKarl Rupp     if (viennaclcontainer) {
3366447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3376447cd05SKarl Rupp       delete viennaclcontainer->mat;
3386447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
339e4a0ef16SKarl Rupp       delete viennaclcontainer;
3406447cd05SKarl Rupp     }
341c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3424076e183SKarl Rupp   } catch(std::exception const & ex) {
3434076e183SKarl Rupp     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
344e4a0ef16SKarl Rupp   }
3458713a8baSPatrick Sanan 
3468713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL);CHKERRQ(ierr);
3478713a8baSPatrick Sanan 
348e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
349e4a0ef16SKarl Rupp   A->spptr = 0;
350e4a0ef16SKarl Rupp   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
351e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
352e4a0ef16SKarl Rupp }
353e4a0ef16SKarl Rupp 
354e4a0ef16SKarl Rupp 
355e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
356e4a0ef16SKarl Rupp {
357e4a0ef16SKarl Rupp   PetscErrorCode ierr;
358e4a0ef16SKarl Rupp 
359e4a0ef16SKarl Rupp   PetscFunctionBegin;
360e4a0ef16SKarl Rupp   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
3618713a8baSPatrick Sanan   ierr = MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B);
3628713a8baSPatrick Sanan   PetscFunctionReturn(0);
3638713a8baSPatrick Sanan }
3648713a8baSPatrick Sanan 
365b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat,PetscBool);
366c3cca76eSKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A,MatDuplicateOption cpvalues,Mat *B)
367c3cca76eSKarl Rupp {
368c3cca76eSKarl Rupp   PetscErrorCode ierr;
369c3cca76eSKarl Rupp   Mat            C;
370c3cca76eSKarl Rupp 
371c3cca76eSKarl Rupp   PetscFunctionBegin;
372c3cca76eSKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
373c3cca76eSKarl Rupp   C = *B;
374c3cca76eSKarl Rupp 
375b470e4b4SRichard Tran Mills   ierr = MatBindToCPU_SeqAIJViennaCL(A,PETSC_FALSE);CHKERRQ(ierr);
376*92f9df4aSRichard Tran Mills   C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
377c3cca76eSKarl Rupp 
378c3cca76eSKarl Rupp   C->spptr = new Mat_SeqAIJViennaCL();
379c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->tempvec        = NULL;
380c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->mat            = NULL;
381c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->compressed_mat = NULL;
382c3cca76eSKarl Rupp 
383c3cca76eSKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJVIENNACL);CHKERRQ(ierr);
384c3cca76eSKarl Rupp 
385c70f7ee4SJunchao Zhang   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
386c3cca76eSKarl Rupp 
387c3cca76eSKarl Rupp   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
388c3cca76eSKarl Rupp   if (C->assembled) {
389c3cca76eSKarl Rupp     ierr = MatViennaCLCopyToGPU(C);CHKERRQ(ierr);
390c3cca76eSKarl Rupp   }
391c3cca76eSKarl Rupp 
392c3cca76eSKarl Rupp   PetscFunctionReturn(0);
393c3cca76eSKarl Rupp }
394c3cca76eSKarl Rupp 
395f38c1e66SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A,PetscScalar *array[])
396f38c1e66SStefano Zampini {
397f38c1e66SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
398f38c1e66SStefano Zampini   PetscErrorCode ierr;
399f38c1e66SStefano Zampini 
400f38c1e66SStefano Zampini   PetscFunctionBegin;
401f38c1e66SStefano Zampini   ierr = MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL);CHKERRQ(ierr);
402f38c1e66SStefano Zampini   *array = a->a;
403c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
404f38c1e66SStefano Zampini   PetscFunctionReturn(0);
405f38c1e66SStefano Zampini }
406f38c1e66SStefano Zampini 
407f38c1e66SStefano Zampini 
408f38c1e66SStefano Zampini static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A,PetscScalar *array[])
409f38c1e66SStefano Zampini {
410f38c1e66SStefano Zampini   PetscFunctionBegin;
411f38c1e66SStefano Zampini   *array = NULL;
412f38c1e66SStefano Zampini   PetscFunctionReturn(0);
413f38c1e66SStefano Zampini }
414f38c1e66SStefano Zampini 
415b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A,PetscBool flg)
416e7e92044SBarry Smith {
417f38c1e66SStefano Zampini   PetscErrorCode ierr;
418f38c1e66SStefano Zampini 
419e7e92044SBarry Smith   PetscFunctionBegin;
420b470e4b4SRichard Tran Mills   A->boundtocpu = flg;
421e7e92044SBarry Smith   if (flg) {
422f38c1e66SStefano Zampini     /* make sure we have an up-to-date copy on the CPU */
423f38c1e66SStefano Zampini     ierr = MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL);CHKERRQ(ierr);
424f38c1e66SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
425f38c1e66SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJRestoreArray_C",MatSeqAIJRestoreArray_SeqAIJ);CHKERRQ(ierr);
426f38c1e66SStefano Zampini 
427e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJ;
428e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJ;
429e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
430e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJ;
431e7e92044SBarry Smith   } else {
432f38c1e66SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJViennaCL);CHKERRQ(ierr);
433f38c1e66SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJRestoreArray_C",MatSeqAIJRestoreArray_SeqAIJViennaCL);CHKERRQ(ierr);
434f38c1e66SStefano Zampini 
435e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJViennaCL;
436e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
437e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
438e7e92044SBarry Smith     A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
439e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
440e7e92044SBarry Smith   }
441e7e92044SBarry Smith   PetscFunctionReturn(0);
442e7e92044SBarry Smith }
443e7e92044SBarry Smith 
4448713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
4458713a8baSPatrick Sanan {
4468713a8baSPatrick Sanan   PetscErrorCode ierr;
4478713a8baSPatrick Sanan   Mat            B;
4488713a8baSPatrick Sanan   Mat_SeqAIJ     *aij;
4498713a8baSPatrick Sanan 
4508713a8baSPatrick Sanan   PetscFunctionBegin;
4518713a8baSPatrick Sanan 
4528713a8baSPatrick Sanan   if (reuse == MAT_REUSE_MATRIX) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
4538713a8baSPatrick Sanan 
4548713a8baSPatrick Sanan   if (reuse == MAT_INITIAL_MATRIX) {
4558713a8baSPatrick Sanan     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
4568713a8baSPatrick Sanan   }
4578713a8baSPatrick Sanan 
4588713a8baSPatrick Sanan   B = *newmat;
4598713a8baSPatrick Sanan 
460e4a0ef16SKarl Rupp   aij             = (Mat_SeqAIJ*)B->data;
461e4a0ef16SKarl Rupp   aij->inode.use  = PETSC_FALSE;
4628713a8baSPatrick Sanan 
463e4a0ef16SKarl Rupp   B->spptr        = new Mat_SeqAIJViennaCL();
464e4a0ef16SKarl Rupp 
465a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
466a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
467a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
468e4a0ef16SKarl Rupp 
469b470e4b4SRichard Tran Mills   ierr = MatBindToCPU_SeqAIJViennaCL(A,PETSC_FALSE);CHKERRQ(ierr);
470*92f9df4aSRichard Tran Mills   A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
471e4a0ef16SKarl Rupp 
472e4a0ef16SKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
47334136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
47434136279SStefano Zampini   ierr = PetscStrallocpy(VECVIENNACL,&B->defaultvectype);CHKERRQ(ierr);
475e4a0ef16SKarl Rupp 
4768713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL);CHKERRQ(ierr);
4778713a8baSPatrick Sanan 
478c70f7ee4SJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4798713a8baSPatrick Sanan 
4808713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
4818713a8baSPatrick Sanan   if (B->assembled) {
4828713a8baSPatrick Sanan     ierr = MatViennaCLCopyToGPU(B);CHKERRQ(ierr);
4838713a8baSPatrick Sanan   }
4848713a8baSPatrick Sanan 
485e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
486e4a0ef16SKarl Rupp }
487e4a0ef16SKarl Rupp 
488e4a0ef16SKarl Rupp 
4893ca39a21SBarry Smith /*MC
490e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
491e4a0ef16SKarl Rupp 
492e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
493e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
494e4a0ef16SKarl Rupp 
495e4a0ef16SKarl Rupp    Options Database Keys:
496e4a0ef16SKarl Rupp +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
497e4a0ef16SKarl Rupp .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
498e4a0ef16SKarl Rupp -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
499e4a0ef16SKarl Rupp 
500e4a0ef16SKarl Rupp   Level: beginner
501e4a0ef16SKarl Rupp 
502e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
503e4a0ef16SKarl Rupp M*/
504e4a0ef16SKarl Rupp 
50572367587SKarl Rupp 
5063ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
50772367587SKarl Rupp {
50872367587SKarl Rupp   PetscErrorCode ierr;
50972367587SKarl Rupp 
51072367587SKarl Rupp   PetscFunctionBegin;
5113ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
5123ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
5133ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
5143ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
51572367587SKarl Rupp   PetscFunctionReturn(0);
51672367587SKarl Rupp }
517