xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 9566063d113dddea24716c546802770db7481bc0)
1e4a0ef16SKarl Rupp 
2e4a0ef16SKarl Rupp /*
3e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
4e4a0ef16SKarl Rupp   matrix storage format.
5e4a0ef16SKarl Rupp */
6e4a0ef16SKarl Rupp 
7aaa7dc30SBarry Smith #include <petscconf.h>
899acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10aaa7dc30SBarry Smith #include <petscbt.h>
11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
13e4a0ef16SKarl Rupp 
14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15e4a0ef16SKarl Rupp 
16e4a0ef16SKarl Rupp #include <algorithm>
17e4a0ef16SKarl Rupp #include <vector>
18e4a0ef16SKarl Rupp #include <string>
19e4a0ef16SKarl Rupp 
20e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
21e4a0ef16SKarl Rupp 
228713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
2372367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
244222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
258713a8baSPatrick Sanan 
26e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A)
27e4a0ef16SKarl Rupp {
28e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30e4a0ef16SKarl Rupp 
31e4a0ef16SKarl Rupp   PetscFunctionBegin;
32bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
33c70f7ee4SJunchao Zhang     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
34*9566063dSJacob Faibussowitsch       PetscCall(PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0));
35e4a0ef16SKarl Rupp 
36e4a0ef16SKarl Rupp       try {
37e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
38a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
39e4a0ef16SKarl Rupp 
40a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
41a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
42e4a0ef16SKarl Rupp 
43a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
44a3430c56SKarl Rupp           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
45a3430c56SKarl Rupp             row_buffer.set(i, (a->compressedrow.i)[i]);
46e4a0ef16SKarl Rupp 
47a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
48a3430c56SKarl Rupp           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
49a3430c56SKarl Rupp             row_indices.set(i, (a->compressedrow.rindex)[i]);
50a3430c56SKarl Rupp 
51a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
52a3430c56SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
53a3430c56SKarl Rupp             col_buffer.set(i, (a->j)[i]);
54a3430c56SKarl Rupp 
55a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
56*9566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu(((2*a->compressedrow.nrows)+1+a->nz)*sizeof(PetscInt) + (a->nz)*sizeof(PetscScalar)));
57e4a0ef16SKarl Rupp         } else {
58a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
59e4a0ef16SKarl Rupp 
60e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
61e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
62e4a0ef16SKarl Rupp 
63e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
64e4a0ef16SKarl Rupp           for (PetscInt i=0; i<=A->rmap->n; ++i)
65e4a0ef16SKarl Rupp             row_buffer.set(i, (a->i)[i]);
66e4a0ef16SKarl Rupp 
67e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
68e4a0ef16SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
69e4a0ef16SKarl Rupp             col_buffer.set(i, (a->j)[i]);
70e4a0ef16SKarl Rupp 
71e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
72*9566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu(((A->rmap->n+1)+a->nz)*sizeof(PetscInt)+(a->nz)*sizeof(PetscScalar)));
73e4a0ef16SKarl Rupp         }
744cf1874eSKarl Rupp         ViennaCLWaitForGPU();
754076e183SKarl Rupp       } catch(std::exception const & ex) {
7698921bdaSJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
77e4a0ef16SKarl Rupp       }
78e4a0ef16SKarl Rupp 
79a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
80a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
819b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
82a3430c56SKarl Rupp           delete (ViennaCLVector*)viennaclstruct->tempvec;
839b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
84a3430c56SKarl Rupp         } else {
85a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
86a3430c56SKarl Rupp         }
87a3430c56SKarl Rupp       } else {
889b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
89a3430c56SKarl Rupp       }
90a3430c56SKarl Rupp 
91c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
92e4a0ef16SKarl Rupp 
93*9566063dSJacob Faibussowitsch       PetscCall(PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0));
94e4a0ef16SKarl Rupp     }
9567c87b7fSKarl Rupp   }
96e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
97e4a0ef16SKarl Rupp }
98e4a0ef16SKarl Rupp 
990d73d530SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
100e4a0ef16SKarl Rupp {
101f38c1e66SStefano Zampini   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
102e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
103e4a0ef16SKarl Rupp   PetscInt           m  = A->rmap->n;
104e4a0ef16SKarl Rupp 
105e4a0ef16SKarl Rupp   PetscFunctionBegin;
106c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(0);
107c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) {
108e4a0ef16SKarl Rupp     try {
1092c71b3e2SJacob Faibussowitsch       PetscCheck(!a->compressedrow.use,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1106c4ed002SBarry Smith       else {
111e4a0ef16SKarl Rupp 
1122c71b3e2SJacob Faibussowitsch         PetscCheck((PetscInt)Agpu->size1() == m,PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "GPU matrix has %lu rows, should be %" PetscInt_FMT, Agpu->size1(), m);
113e4a0ef16SKarl Rupp         a->nz           = Agpu->nnz();
114e4a0ef16SKarl Rupp         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
115e4a0ef16SKarl Rupp         A->preallocated = PETSC_TRUE;
116e4a0ef16SKarl Rupp         if (a->singlemalloc) {
117*9566063dSJacob Faibussowitsch           if (a->a) PetscCall(PetscFree3(a->a,a->j,a->i));
118e4a0ef16SKarl Rupp         } else {
119*9566063dSJacob Faibussowitsch           if (a->i) PetscCall(PetscFree(a->i));
120*9566063dSJacob Faibussowitsch           if (a->j) PetscCall(PetscFree(a->j));
121*9566063dSJacob Faibussowitsch           if (a->a) PetscCall(PetscFree(a->a));
122e4a0ef16SKarl Rupp         }
123*9566063dSJacob Faibussowitsch         PetscCall(PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i));
124*9566063dSJacob Faibussowitsch         PetscCall(PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt)));
125e4a0ef16SKarl Rupp 
126e4a0ef16SKarl Rupp         a->singlemalloc = PETSC_TRUE;
127e4a0ef16SKarl Rupp 
128e4a0ef16SKarl Rupp         /* Setup row lengths */
129*9566063dSJacob Faibussowitsch         PetscCall(PetscFree(a->imax));
130*9566063dSJacob Faibussowitsch         PetscCall(PetscFree(a->ilen));
131*9566063dSJacob Faibussowitsch         PetscCall(PetscMalloc1(m,&a->imax));
132*9566063dSJacob Faibussowitsch         PetscCall(PetscMalloc1(m,&a->ilen));
133*9566063dSJacob Faibussowitsch         PetscCall(PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt)));
134e4a0ef16SKarl Rupp 
135e4a0ef16SKarl Rupp         /* Copy data back from GPU */
136e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
137e4a0ef16SKarl Rupp 
138e4a0ef16SKarl Rupp         // copy row array
139e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
140e4a0ef16SKarl Rupp         (a->i)[0] = row_buffer[0];
141e4a0ef16SKarl Rupp         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
142e4a0ef16SKarl Rupp           (a->i)[i+1] = row_buffer[i+1];
143e4a0ef16SKarl Rupp           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
144e4a0ef16SKarl Rupp         }
145e4a0ef16SKarl Rupp 
146e4a0ef16SKarl Rupp         // copy column indices
147e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
148e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
149e4a0ef16SKarl Rupp         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
150e4a0ef16SKarl Rupp           (a->j)[i] = col_buffer[i];
151e4a0ef16SKarl Rupp 
152e4a0ef16SKarl Rupp         // copy nonzero entries directly to destination (no conversion required)
153e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
154e4a0ef16SKarl Rupp 
155*9566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuToCpu(row_buffer.raw_size()+col_buffer.raw_size()+(Agpu->nnz()*sizeof(PetscScalar))));
1564cf1874eSKarl Rupp         ViennaCLWaitForGPU();
157023073b3SKarl Rupp         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
158e4a0ef16SKarl Rupp       }
1594076e183SKarl Rupp     } catch(std::exception const & ex) {
16098921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
161e4a0ef16SKarl Rupp     }
162c70f7ee4SJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
163f38c1e66SStefano Zampini     PetscFunctionReturn(0);
164f38c1e66SStefano Zampini   } else {
165c70f7ee4SJunchao Zhang     if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
166e4a0ef16SKarl Rupp 
1672c71b3e2SJacob Faibussowitsch     PetscCheck(!a->compressedrow.use,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
168f38c1e66SStefano Zampini     if (!Agpu) {
169f38c1e66SStefano Zampini       viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar)*viennaclstruct->mat->nnz(), a->a);
170f38c1e66SStefano Zampini     } else {
171f38c1e66SStefano Zampini       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
172f38c1e66SStefano Zampini     }
173f38c1e66SStefano Zampini   }
174c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
175b8ced49eSKarl Rupp   /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
176*9566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
177*9566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
178e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
179e4a0ef16SKarl Rupp }
180e4a0ef16SKarl Rupp 
181e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
182e4a0ef16SKarl Rupp {
183e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
184e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
1850d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL;
1860d73d530SKarl Rupp   ViennaCLVector       *ygpu=NULL;
187e4a0ef16SKarl Rupp 
188e4a0ef16SKarl Rupp   PetscFunctionBegin;
189837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
190*9566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyToGPU(A));
191bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
192*9566063dSJacob Faibussowitsch     PetscCall(VecViennaCLGetArrayRead(xx,&xgpu));
193*9566063dSJacob Faibussowitsch     PetscCall(VecViennaCLGetArrayWrite(yy,&ygpu));
194*9566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
195e4a0ef16SKarl Rupp     try {
196b7832b47SStefano Zampini       if (a->compressedrow.use) {
197b7832b47SStefano Zampini         *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
198b7832b47SStefano Zampini       } else {
199e4a0ef16SKarl Rupp         *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
200b7832b47SStefano Zampini       }
2014cf1874eSKarl Rupp       ViennaCLWaitForGPU();
2024076e183SKarl Rupp     } catch (std::exception const & ex) {
20398921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
204e4a0ef16SKarl Rupp     }
205*9566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
206*9566063dSJacob Faibussowitsch     PetscCall(VecViennaCLRestoreArrayRead(xx,&xgpu));
207*9566063dSJacob Faibussowitsch     PetscCall(VecViennaCLRestoreArrayWrite(yy,&ygpu));
208*9566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0*a->nz - a->nonzerorowcnt));
209bf1781e8SStefano Zampini   } else {
210*9566063dSJacob Faibussowitsch     PetscCall(VecSet_SeqViennaCL(yy,0));
21167c87b7fSKarl Rupp   }
212e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
213e4a0ef16SKarl Rupp }
214e4a0ef16SKarl Rupp 
215e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
216e4a0ef16SKarl Rupp {
217e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
218e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
2190d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
2200d73d530SKarl Rupp   ViennaCLVector       *zgpu=NULL;
221e4a0ef16SKarl Rupp 
222e4a0ef16SKarl Rupp   PetscFunctionBegin;
223837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
224*9566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyToGPU(A));
225bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
226e4a0ef16SKarl Rupp     try {
227*9566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayRead(xx,&xgpu));
228*9566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayRead(yy,&ygpu));
229*9566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayWrite(zz,&zgpu));
230*9566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
231f38c1e66SStefano Zampini       if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
232f38c1e66SStefano Zampini       else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
233f38c1e66SStefano Zampini       if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec;
234f38c1e66SStefano Zampini       else *zgpu += *viennaclstruct->tempvec;
2354cf1874eSKarl Rupp       ViennaCLWaitForGPU();
236*9566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
237*9566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayRead(xx,&xgpu));
238*9566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayRead(yy,&ygpu));
239*9566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayWrite(zz,&zgpu));
240e4a0ef16SKarl Rupp 
2414076e183SKarl Rupp     } catch(std::exception const & ex) {
24298921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
243e4a0ef16SKarl Rupp     }
244*9566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0*a->nz));
245bf1781e8SStefano Zampini   } else {
246*9566063dSJacob Faibussowitsch     PetscCall(VecCopy_SeqViennaCL(yy,zz));
24767c87b7fSKarl Rupp   }
248e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
249e4a0ef16SKarl Rupp }
250e4a0ef16SKarl Rupp 
251e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
252e4a0ef16SKarl Rupp {
253e4a0ef16SKarl Rupp   PetscFunctionBegin;
254*9566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
255489de41dSStefano Zampini   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
256*9566063dSJacob Faibussowitsch   if (!A->boundtocpu) PetscCall(MatViennaCLCopyToGPU(A));
257e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
258e4a0ef16SKarl Rupp }
259e4a0ef16SKarl Rupp 
260e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
261cab5ea25SPierre Jolivet /*@C
262e4a0ef16SKarl Rupp    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
26319fddfadSKarl Rupp    (the default parallel PETSc format).  This matrix will ultimately be pushed down
264e4a0ef16SKarl Rupp    to GPUs and use the ViennaCL library for calculations. For good matrix
265e4a0ef16SKarl Rupp    assembly performance the user should preallocate the matrix storage by setting
266e4a0ef16SKarl Rupp    the parameter nz (or the array nnz).  By setting these parameters accurately,
267e4a0ef16SKarl Rupp    performance during matrix assembly can be increased substantially.
268e4a0ef16SKarl Rupp 
269d083f849SBarry Smith    Collective
270e4a0ef16SKarl Rupp 
271e4a0ef16SKarl Rupp    Input Parameters:
272e4a0ef16SKarl Rupp +  comm - MPI communicator, set to PETSC_COMM_SELF
273e4a0ef16SKarl Rupp .  m - number of rows
274e4a0ef16SKarl Rupp .  n - number of columns
275e4a0ef16SKarl Rupp .  nz - number of nonzeros per row (same for all rows)
276e4a0ef16SKarl Rupp -  nnz - array containing the number of nonzeros in the various rows
277e4a0ef16SKarl Rupp          (possibly different for each row) or NULL
278e4a0ef16SKarl Rupp 
279e4a0ef16SKarl Rupp    Output Parameter:
280e4a0ef16SKarl Rupp .  A - the matrix
281e4a0ef16SKarl Rupp 
282e4a0ef16SKarl Rupp    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
283e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
284e4a0ef16SKarl Rupp    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
285e4a0ef16SKarl Rupp 
286e4a0ef16SKarl Rupp    Notes:
287e4a0ef16SKarl Rupp    If nnz is given then nz is ignored
288e4a0ef16SKarl Rupp 
289e4a0ef16SKarl Rupp    The AIJ format (also called the Yale sparse matrix format or
290e4a0ef16SKarl Rupp    compressed row storage), is fully compatible with standard Fortran 77
291e4a0ef16SKarl Rupp    storage.  That is, the stored row and column indices can begin at
292e4a0ef16SKarl Rupp    either one (as in Fortran) or zero.  See the users' manual for details.
293e4a0ef16SKarl Rupp 
294e4a0ef16SKarl Rupp    Specify the preallocated storage with either nz or nnz (not both).
295e4a0ef16SKarl Rupp    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
296e4a0ef16SKarl Rupp    allocation.  For large problems you MUST preallocate memory or you
297e4a0ef16SKarl Rupp    will get TERRIBLE performance, see the users' manual chapter on matrices.
298e4a0ef16SKarl Rupp 
299e4a0ef16SKarl Rupp    Level: intermediate
300e4a0ef16SKarl Rupp 
301e9e886b6SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSPARSE(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
302e4a0ef16SKarl Rupp 
303e4a0ef16SKarl Rupp @*/
304e4a0ef16SKarl Rupp PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
305e4a0ef16SKarl Rupp {
306e4a0ef16SKarl Rupp   PetscFunctionBegin;
307*9566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm,A));
308*9566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A,m,n,m,n));
309*9566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A,MATSEQAIJVIENNACL));
310*9566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
311e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
312e4a0ef16SKarl Rupp }
313e4a0ef16SKarl Rupp 
314e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
315e4a0ef16SKarl Rupp {
316e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
317e4a0ef16SKarl Rupp 
318e4a0ef16SKarl Rupp   PetscFunctionBegin;
319e4a0ef16SKarl Rupp   try {
3206447cd05SKarl Rupp     if (viennaclcontainer) {
3216447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3226447cd05SKarl Rupp       delete viennaclcontainer->mat;
3236447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
324e4a0ef16SKarl Rupp       delete viennaclcontainer;
3256447cd05SKarl Rupp     }
326c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3274076e183SKarl Rupp   } catch(std::exception const & ex) {
32898921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
329e4a0ef16SKarl Rupp   }
3308713a8baSPatrick Sanan 
331*9566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL));
332*9566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqdense_C",NULL));
333*9566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqaij_C",NULL));
3348713a8baSPatrick Sanan 
335e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
336e4a0ef16SKarl Rupp   A->spptr = 0;
337*9566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
338e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
339e4a0ef16SKarl Rupp }
340e4a0ef16SKarl Rupp 
341e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
342e4a0ef16SKarl Rupp {
343e4a0ef16SKarl Rupp   PetscFunctionBegin;
344*9566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
345*9566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B));
3468713a8baSPatrick Sanan   PetscFunctionReturn(0);
3478713a8baSPatrick Sanan }
3488713a8baSPatrick Sanan 
349b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat,PetscBool);
350c3cca76eSKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A,MatDuplicateOption cpvalues,Mat *B)
351c3cca76eSKarl Rupp {
352c3cca76eSKarl Rupp   Mat C;
353c3cca76eSKarl Rupp 
354c3cca76eSKarl Rupp   PetscFunctionBegin;
355*9566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
356c3cca76eSKarl Rupp   C = *B;
357c3cca76eSKarl Rupp 
358*9566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJViennaCL(A,PETSC_FALSE));
35992f9df4aSRichard Tran Mills   C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
360c3cca76eSKarl Rupp 
361c3cca76eSKarl Rupp   C->spptr = new Mat_SeqAIJViennaCL();
362c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->tempvec        = NULL;
363c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->mat            = NULL;
364c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->compressed_mat = NULL;
365c3cca76eSKarl Rupp 
366*9566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJVIENNACL));
367c3cca76eSKarl Rupp 
368c70f7ee4SJunchao Zhang   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
369c3cca76eSKarl Rupp 
370c3cca76eSKarl Rupp   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
371*9566063dSJacob Faibussowitsch   if (C->assembled) PetscCall(MatViennaCLCopyToGPU(C));
372c3cca76eSKarl Rupp   PetscFunctionReturn(0);
373c3cca76eSKarl Rupp }
374c3cca76eSKarl Rupp 
375f38c1e66SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A,PetscScalar *array[])
376f38c1e66SStefano Zampini {
377f38c1e66SStefano Zampini   PetscFunctionBegin;
378*9566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL));
379c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ*)A->data)->a;
380f38c1e66SStefano Zampini   PetscFunctionReturn(0);
381f38c1e66SStefano Zampini }
382f38c1e66SStefano Zampini 
383f38c1e66SStefano Zampini static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A,PetscScalar *array[])
384f38c1e66SStefano Zampini {
385f38c1e66SStefano Zampini   PetscFunctionBegin;
386c1a4f3baSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
387c1a4f3baSJunchao Zhang   *array         = NULL;
388c1a4f3baSJunchao Zhang   PetscFunctionReturn(0);
389c1a4f3baSJunchao Zhang }
390c1a4f3baSJunchao Zhang 
391c1a4f3baSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJViennaCL(Mat A,const PetscScalar *array[])
392c1a4f3baSJunchao Zhang {
393c1a4f3baSJunchao Zhang   PetscFunctionBegin;
394*9566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL));
395c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ*)A->data)->a;
396c1a4f3baSJunchao Zhang   PetscFunctionReturn(0);
397c1a4f3baSJunchao Zhang }
398c1a4f3baSJunchao Zhang 
399c1a4f3baSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJViennaCL(Mat A,const PetscScalar *array[])
400c1a4f3baSJunchao Zhang {
401c1a4f3baSJunchao Zhang   PetscFunctionBegin;
402c1a4f3baSJunchao Zhang   *array = NULL;
403c1a4f3baSJunchao Zhang   /* No A->offloadmask = PETSC_OFFLOAD_CPU since if A->offloadmask was PETSC_OFFLOAD_BOTH, it is still BOTH */
404c1a4f3baSJunchao Zhang   PetscFunctionReturn(0);
405c1a4f3baSJunchao Zhang }
406c1a4f3baSJunchao Zhang 
407c1a4f3baSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJViennaCL(Mat A,PetscScalar *array[])
408c1a4f3baSJunchao Zhang {
409c1a4f3baSJunchao Zhang   PetscFunctionBegin;
410c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ*)A->data)->a;
411c1a4f3baSJunchao Zhang   PetscFunctionReturn(0);
412c1a4f3baSJunchao Zhang }
413c1a4f3baSJunchao Zhang 
414c1a4f3baSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL(Mat A,PetscScalar *array[])
415c1a4f3baSJunchao Zhang {
416c1a4f3baSJunchao Zhang   PetscFunctionBegin;
417c1a4f3baSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
418f38c1e66SStefano Zampini   *array         = NULL;
419f38c1e66SStefano Zampini   PetscFunctionReturn(0);
420f38c1e66SStefano Zampini }
421f38c1e66SStefano Zampini 
422b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A,PetscBool flg)
423e7e92044SBarry Smith {
424c1a4f3baSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
425f38c1e66SStefano Zampini 
426e7e92044SBarry Smith   PetscFunctionBegin;
427b470e4b4SRichard Tran Mills   A->boundtocpu  = flg;
428c1a4f3baSJunchao Zhang   if (flg && a->inode.size) {
429c1a4f3baSJunchao Zhang     a->inode.use = PETSC_TRUE;
430ea500dcfSRichard Tran Mills   } else {
431c1a4f3baSJunchao Zhang     a->inode.use = PETSC_FALSE;
432ea500dcfSRichard Tran Mills   }
433e7e92044SBarry Smith   if (flg) {
434f38c1e66SStefano Zampini     /* make sure we have an up-to-date copy on the CPU */
435*9566063dSJacob Faibussowitsch     PetscCall(MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL));
436e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJ;
437e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJ;
438e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
439e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJ;
440*9566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
441e7e92044SBarry Smith   } else {
442e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJViennaCL;
443e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
444e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
445e7e92044SBarry Smith     A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
446e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
447c1a4f3baSJunchao Zhang 
448c1a4f3baSJunchao Zhang     a->ops->getarray           = MatSeqAIJGetArray_SeqAIJViennaCL;
449c1a4f3baSJunchao Zhang     a->ops->restorearray       = MatSeqAIJRestoreArray_SeqAIJViennaCL;
450c1a4f3baSJunchao Zhang     a->ops->getarrayread       = MatSeqAIJGetArrayRead_SeqAIJViennaCL;
451c1a4f3baSJunchao Zhang     a->ops->restorearrayread   = MatSeqAIJRestoreArrayRead_SeqAIJViennaCL;
452c1a4f3baSJunchao Zhang     a->ops->getarraywrite      = MatSeqAIJGetArrayWrite_SeqAIJViennaCL;
453c1a4f3baSJunchao Zhang     a->ops->restorearraywrite  = MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL;
454e7e92044SBarry Smith   }
455e7e92044SBarry Smith   PetscFunctionReturn(0);
456e7e92044SBarry Smith }
457e7e92044SBarry Smith 
4588713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
4598713a8baSPatrick Sanan {
4608713a8baSPatrick Sanan   Mat B;
4618713a8baSPatrick Sanan 
4628713a8baSPatrick Sanan   PetscFunctionBegin;
4635f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_REUSE_MATRIX,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
4648713a8baSPatrick Sanan 
465*9566063dSJacob Faibussowitsch   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
4668713a8baSPatrick Sanan 
4678713a8baSPatrick Sanan   B = *newmat;
4688713a8baSPatrick Sanan 
469e4a0ef16SKarl Rupp   B->spptr = new Mat_SeqAIJViennaCL();
470e4a0ef16SKarl Rupp 
471a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
472a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
473a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
474e4a0ef16SKarl Rupp 
475*9566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJViennaCL(A,PETSC_FALSE));
47692f9df4aSRichard Tran Mills   A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
477e4a0ef16SKarl Rupp 
478*9566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL));
479*9566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
480*9566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECVIENNACL,&B->defaultvectype));
481e4a0ef16SKarl Rupp 
482*9566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL));
483*9566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqdense_C",MatProductSetFromOptions_SeqAIJ_SeqDense));
484*9566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqaij_C",MatProductSetFromOptions_SeqAIJ));
4858713a8baSPatrick Sanan 
486c70f7ee4SJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4878713a8baSPatrick Sanan 
4888713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
489*9566063dSJacob Faibussowitsch   if (B->assembled) PetscCall(MatViennaCLCopyToGPU(B));
490e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
491e4a0ef16SKarl Rupp }
492e4a0ef16SKarl Rupp 
4933ca39a21SBarry Smith /*MC
494e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
495e4a0ef16SKarl Rupp 
496e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
497e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
498e4a0ef16SKarl Rupp 
499e4a0ef16SKarl Rupp    Options Database Keys:
500e4a0ef16SKarl Rupp +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
501e4a0ef16SKarl Rupp .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
502e4a0ef16SKarl Rupp -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
503e4a0ef16SKarl Rupp 
504e4a0ef16SKarl Rupp   Level: beginner
505e4a0ef16SKarl Rupp 
506e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
507e4a0ef16SKarl Rupp M*/
508e4a0ef16SKarl Rupp 
5093ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
51072367587SKarl Rupp {
51172367587SKarl Rupp   PetscFunctionBegin;
512*9566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU,MatGetFactor_seqaij_petsc));
513*9566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY,MatGetFactor_seqaij_petsc));
514*9566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU,MatGetFactor_seqaij_petsc));
515*9566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC,MatGetFactor_seqaij_petsc));
51672367587SKarl Rupp   PetscFunctionReturn(0);
51772367587SKarl Rupp }
518