xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision ea500dcf30ffef220839d8a7e6a1237a802d17f4)
1e4a0ef16SKarl Rupp 
2e4a0ef16SKarl Rupp /*
3e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
4e4a0ef16SKarl Rupp   matrix storage format.
5e4a0ef16SKarl Rupp */
6e4a0ef16SKarl Rupp 
7aaa7dc30SBarry Smith #include <petscconf.h>
899acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10aaa7dc30SBarry Smith #include <petscbt.h>
11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
13e4a0ef16SKarl Rupp 
14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15e4a0ef16SKarl Rupp 
16e4a0ef16SKarl Rupp #include <algorithm>
17e4a0ef16SKarl Rupp #include <vector>
18e4a0ef16SKarl Rupp #include <string>
19e4a0ef16SKarl Rupp 
20e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
21e4a0ef16SKarl Rupp 
228713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
2372367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
244222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
258713a8baSPatrick Sanan 
26e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A)
27e4a0ef16SKarl Rupp {
28e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
31e4a0ef16SKarl Rupp 
32e4a0ef16SKarl Rupp   PetscFunctionBegin;
33bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
34c70f7ee4SJunchao Zhang     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
35e4a0ef16SKarl Rupp       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
36e4a0ef16SKarl Rupp 
37e4a0ef16SKarl Rupp       try {
38e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
39a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
40e4a0ef16SKarl Rupp 
41a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
42a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
43e4a0ef16SKarl Rupp 
44a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
45a3430c56SKarl Rupp           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
46a3430c56SKarl Rupp             row_buffer.set(i, (a->compressedrow.i)[i]);
47e4a0ef16SKarl Rupp 
48a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
49a3430c56SKarl Rupp           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
50a3430c56SKarl Rupp             row_indices.set(i, (a->compressedrow.rindex)[i]);
51a3430c56SKarl Rupp 
52a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
53a3430c56SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
54a3430c56SKarl Rupp             col_buffer.set(i, (a->j)[i]);
55a3430c56SKarl Rupp 
56a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
574863603aSSatish Balay           ierr = PetscLogCpuToGpu(((2*a->compressedrow.nrows)+1+a->nz)*sizeof(PetscInt) + (a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
58e4a0ef16SKarl Rupp         } else {
59a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
60e4a0ef16SKarl Rupp 
61e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
62e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
63e4a0ef16SKarl Rupp 
64e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
65e4a0ef16SKarl Rupp           for (PetscInt i=0; i<=A->rmap->n; ++i)
66e4a0ef16SKarl Rupp             row_buffer.set(i, (a->i)[i]);
67e4a0ef16SKarl Rupp 
68e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
69e4a0ef16SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
70e4a0ef16SKarl Rupp             col_buffer.set(i, (a->j)[i]);
71e4a0ef16SKarl Rupp 
72e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
734863603aSSatish Balay           ierr = PetscLogCpuToGpu(((A->rmap->n+1)+a->nz)*sizeof(PetscInt)+(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
74e4a0ef16SKarl Rupp         }
754cf1874eSKarl Rupp         ViennaCLWaitForGPU();
764076e183SKarl Rupp       } catch(std::exception const & ex) {
774076e183SKarl Rupp         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
78e4a0ef16SKarl Rupp       }
79e4a0ef16SKarl Rupp 
80a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
81a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
829b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
83a3430c56SKarl Rupp           delete (ViennaCLVector*)viennaclstruct->tempvec;
849b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
85a3430c56SKarl Rupp         } else {
86a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
87a3430c56SKarl Rupp         }
88a3430c56SKarl Rupp       } else {
899b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
90a3430c56SKarl Rupp       }
91a3430c56SKarl Rupp 
92c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
93e4a0ef16SKarl Rupp 
94e4a0ef16SKarl Rupp       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
95e4a0ef16SKarl Rupp     }
9667c87b7fSKarl Rupp   }
97e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
98e4a0ef16SKarl Rupp }
99e4a0ef16SKarl Rupp 
1000d73d530SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
101e4a0ef16SKarl Rupp {
102f38c1e66SStefano Zampini   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
103e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
104e4a0ef16SKarl Rupp   PetscInt           m  = A->rmap->n;
105e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
106e4a0ef16SKarl Rupp 
107e4a0ef16SKarl Rupp   PetscFunctionBegin;
108c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(0);
109c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) {
110e4a0ef16SKarl Rupp     try {
111ea13f565SStefano Zampini       if (a->compressedrow.use) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1126c4ed002SBarry Smith       else {
113e4a0ef16SKarl Rupp 
114ea13f565SStefano Zampini         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
115e4a0ef16SKarl Rupp         a->nz           = Agpu->nnz();
116e4a0ef16SKarl Rupp         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
117e4a0ef16SKarl Rupp         A->preallocated = PETSC_TRUE;
118e4a0ef16SKarl Rupp         if (a->singlemalloc) {
119e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
120e4a0ef16SKarl Rupp         } else {
121e4a0ef16SKarl Rupp           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
122e4a0ef16SKarl Rupp           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
123e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
124e4a0ef16SKarl Rupp         }
125dcca6d9dSJed Brown         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
126f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
127e4a0ef16SKarl Rupp 
128e4a0ef16SKarl Rupp         a->singlemalloc = PETSC_TRUE;
129e4a0ef16SKarl Rupp 
130e4a0ef16SKarl Rupp         /* Setup row lengths */
131071fcb05SBarry Smith         ierr = PetscFree(a->imax);CHKERRQ(ierr);
132071fcb05SBarry Smith         ierr = PetscFree(a->ilen);CHKERRQ(ierr);
133071fcb05SBarry Smith         ierr = PetscMalloc1(m,&a->imax);CHKERRQ(ierr);
134071fcb05SBarry Smith         ierr = PetscMalloc1(m,&a->ilen);CHKERRQ(ierr);
135f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
136e4a0ef16SKarl Rupp 
137e4a0ef16SKarl Rupp         /* Copy data back from GPU */
138e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
139e4a0ef16SKarl Rupp 
140e4a0ef16SKarl Rupp         // copy row array
141e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
142e4a0ef16SKarl Rupp         (a->i)[0] = row_buffer[0];
143e4a0ef16SKarl Rupp         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
144e4a0ef16SKarl Rupp           (a->i)[i+1] = row_buffer[i+1];
145e4a0ef16SKarl Rupp           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
146e4a0ef16SKarl Rupp         }
147e4a0ef16SKarl Rupp 
148e4a0ef16SKarl Rupp         // copy column indices
149e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
150e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
151e4a0ef16SKarl Rupp         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
152e4a0ef16SKarl Rupp           (a->j)[i] = col_buffer[i];
153e4a0ef16SKarl Rupp 
154e4a0ef16SKarl Rupp         // copy nonzero entries directly to destination (no conversion required)
155e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
156e4a0ef16SKarl Rupp 
1574863603aSSatish Balay         ierr = PetscLogGpuToCpu(row_buffer.raw_size()+col_buffer.raw_size()+(Agpu->nnz()*sizeof(PetscScalar)));CHKERRQ(ierr);
1584cf1874eSKarl Rupp         ViennaCLWaitForGPU();
159023073b3SKarl Rupp         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
160e4a0ef16SKarl Rupp       }
1614076e183SKarl Rupp     } catch(std::exception const & ex) {
1624076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
163e4a0ef16SKarl Rupp     }
164c70f7ee4SJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
165f38c1e66SStefano Zampini     PetscFunctionReturn(0);
166f38c1e66SStefano Zampini   } else {
167c70f7ee4SJunchao Zhang     if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
168e4a0ef16SKarl Rupp 
169ea13f565SStefano Zampini     if (a->compressedrow.use) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
170f38c1e66SStefano Zampini     if (!Agpu) {
171f38c1e66SStefano Zampini       viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar)*viennaclstruct->mat->nnz(), a->a);
172f38c1e66SStefano Zampini     } else {
173f38c1e66SStefano Zampini       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
174f38c1e66SStefano Zampini     }
175f38c1e66SStefano Zampini   }
176c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
177b8ced49eSKarl Rupp   /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
178e4a0ef16SKarl Rupp   ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
179e4a0ef16SKarl Rupp   ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
180e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
181e4a0ef16SKarl Rupp }
182e4a0ef16SKarl Rupp 
183e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
184e4a0ef16SKarl Rupp {
185e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
186e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
187e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
1880d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL;
1890d73d530SKarl Rupp   ViennaCLVector       *ygpu=NULL;
190e4a0ef16SKarl Rupp 
191e4a0ef16SKarl Rupp   PetscFunctionBegin;
192837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
193837a59e1SRichard Tran Mills   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
194bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
195e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
196e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
1977a052e47Shannah_mairs     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
198e4a0ef16SKarl Rupp     try {
199b7832b47SStefano Zampini       if (a->compressedrow.use) {
200b7832b47SStefano Zampini         *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
201b7832b47SStefano Zampini       } else {
202e4a0ef16SKarl Rupp         *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
203b7832b47SStefano Zampini       }
2044cf1874eSKarl Rupp       ViennaCLWaitForGPU();
2054076e183SKarl Rupp     } catch (std::exception const & ex) {
2064076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
207e4a0ef16SKarl Rupp     }
208958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
209e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
210e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
211958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
212bf1781e8SStefano Zampini   } else {
213383ef339SStefano Zampini     ierr = VecSet_SeqViennaCL(yy,0);CHKERRQ(ierr);
21467c87b7fSKarl Rupp   }
215e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
216e4a0ef16SKarl Rupp }
217e4a0ef16SKarl Rupp 
218e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
219e4a0ef16SKarl Rupp {
220e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
221e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
222e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
2230d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
2240d73d530SKarl Rupp   ViennaCLVector       *zgpu=NULL;
225e4a0ef16SKarl Rupp 
226e4a0ef16SKarl Rupp   PetscFunctionBegin;
227837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
228837a59e1SRichard Tran Mills   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
229bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
230e4a0ef16SKarl Rupp     try {
231e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
232e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
233e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
2347a052e47Shannah_mairs       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
235f38c1e66SStefano Zampini       if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
236f38c1e66SStefano Zampini       else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
237f38c1e66SStefano Zampini       if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec;
238f38c1e66SStefano Zampini       else *zgpu += *viennaclstruct->tempvec;
2394cf1874eSKarl Rupp       ViennaCLWaitForGPU();
240958c4211Shannah_mairs       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
241e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
242e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
243e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
244e4a0ef16SKarl Rupp 
2454076e183SKarl Rupp     } catch(std::exception const & ex) {
2464076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
247e4a0ef16SKarl Rupp     }
248958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
249bf1781e8SStefano Zampini   } else {
250383ef339SStefano Zampini     ierr = VecCopy_SeqViennaCL(yy,zz);CHKERRQ(ierr);
25167c87b7fSKarl Rupp   }
252e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
253e4a0ef16SKarl Rupp }
254e4a0ef16SKarl Rupp 
255e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
256e4a0ef16SKarl Rupp {
257e4a0ef16SKarl Rupp   PetscErrorCode ierr;
258e4a0ef16SKarl Rupp 
259e4a0ef16SKarl Rupp   PetscFunctionBegin;
260e4a0ef16SKarl Rupp   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
261489de41dSStefano Zampini   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
262b470e4b4SRichard Tran Mills   if (!A->boundtocpu) {
263e4a0ef16SKarl Rupp     ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
264e7e92044SBarry Smith   }
265e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
266e4a0ef16SKarl Rupp }
267e4a0ef16SKarl Rupp 
268e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
269cab5ea25SPierre Jolivet /*@C
270e4a0ef16SKarl Rupp    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
27119fddfadSKarl Rupp    (the default parallel PETSc format).  This matrix will ultimately be pushed down
272e4a0ef16SKarl Rupp    to GPUs and use the ViennaCL library for calculations. For good matrix
273e4a0ef16SKarl Rupp    assembly performance the user should preallocate the matrix storage by setting
274e4a0ef16SKarl Rupp    the parameter nz (or the array nnz).  By setting these parameters accurately,
275e4a0ef16SKarl Rupp    performance during matrix assembly can be increased substantially.
276e4a0ef16SKarl Rupp 
277d083f849SBarry Smith    Collective
278e4a0ef16SKarl Rupp 
279e4a0ef16SKarl Rupp    Input Parameters:
280e4a0ef16SKarl Rupp +  comm - MPI communicator, set to PETSC_COMM_SELF
281e4a0ef16SKarl Rupp .  m - number of rows
282e4a0ef16SKarl Rupp .  n - number of columns
283e4a0ef16SKarl Rupp .  nz - number of nonzeros per row (same for all rows)
284e4a0ef16SKarl Rupp -  nnz - array containing the number of nonzeros in the various rows
285e4a0ef16SKarl Rupp          (possibly different for each row) or NULL
286e4a0ef16SKarl Rupp 
287e4a0ef16SKarl Rupp    Output Parameter:
288e4a0ef16SKarl Rupp .  A - the matrix
289e4a0ef16SKarl Rupp 
290e4a0ef16SKarl Rupp    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
291e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
292e4a0ef16SKarl Rupp    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
293e4a0ef16SKarl Rupp 
294e4a0ef16SKarl Rupp    Notes:
295e4a0ef16SKarl Rupp    If nnz is given then nz is ignored
296e4a0ef16SKarl Rupp 
297e4a0ef16SKarl Rupp    The AIJ format (also called the Yale sparse matrix format or
298e4a0ef16SKarl Rupp    compressed row storage), is fully compatible with standard Fortran 77
299e4a0ef16SKarl Rupp    storage.  That is, the stored row and column indices can begin at
300e4a0ef16SKarl Rupp    either one (as in Fortran) or zero.  See the users' manual for details.
301e4a0ef16SKarl Rupp 
302e4a0ef16SKarl Rupp    Specify the preallocated storage with either nz or nnz (not both).
303e4a0ef16SKarl Rupp    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
304e4a0ef16SKarl Rupp    allocation.  For large problems you MUST preallocate memory or you
305e4a0ef16SKarl Rupp    will get TERRIBLE performance, see the users' manual chapter on matrices.
306e4a0ef16SKarl Rupp 
307e4a0ef16SKarl Rupp    Level: intermediate
308e4a0ef16SKarl Rupp 
309e9e886b6SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSPARSE(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
310e4a0ef16SKarl Rupp 
311e4a0ef16SKarl Rupp @*/
312e4a0ef16SKarl Rupp PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
313e4a0ef16SKarl Rupp {
314e4a0ef16SKarl Rupp   PetscErrorCode ierr;
315e4a0ef16SKarl Rupp 
316e4a0ef16SKarl Rupp   PetscFunctionBegin;
317e4a0ef16SKarl Rupp   ierr = MatCreate(comm,A);CHKERRQ(ierr);
318e4a0ef16SKarl Rupp   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
319e4a0ef16SKarl Rupp   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
320e4a0ef16SKarl Rupp   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
321e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
322e4a0ef16SKarl Rupp }
323e4a0ef16SKarl Rupp 
324e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
325e4a0ef16SKarl Rupp {
326e4a0ef16SKarl Rupp   PetscErrorCode ierr;
327e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
328e4a0ef16SKarl Rupp 
329e4a0ef16SKarl Rupp   PetscFunctionBegin;
330e4a0ef16SKarl Rupp   try {
3316447cd05SKarl Rupp     if (viennaclcontainer) {
3326447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3336447cd05SKarl Rupp       delete viennaclcontainer->mat;
3346447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
335e4a0ef16SKarl Rupp       delete viennaclcontainer;
3366447cd05SKarl Rupp     }
337c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3384076e183SKarl Rupp   } catch(std::exception const & ex) {
3394076e183SKarl Rupp     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
340e4a0ef16SKarl Rupp   }
3418713a8baSPatrick Sanan 
3428713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL);CHKERRQ(ierr);
3434222ddf1SHong Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqdense_C",NULL);CHKERRQ(ierr);
3444222ddf1SHong Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqaij_C",NULL);CHKERRQ(ierr);
3458713a8baSPatrick Sanan 
346e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
347e4a0ef16SKarl Rupp   A->spptr = 0;
348e4a0ef16SKarl Rupp   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
349e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
350e4a0ef16SKarl Rupp }
351e4a0ef16SKarl Rupp 
352e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
353e4a0ef16SKarl Rupp {
354e4a0ef16SKarl Rupp   PetscErrorCode ierr;
355e4a0ef16SKarl Rupp 
356e4a0ef16SKarl Rupp   PetscFunctionBegin;
357e4a0ef16SKarl Rupp   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
3581e1ea65dSPierre Jolivet   ierr = MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
3598713a8baSPatrick Sanan   PetscFunctionReturn(0);
3608713a8baSPatrick Sanan }
3618713a8baSPatrick Sanan 
362b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat,PetscBool);
363c3cca76eSKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A,MatDuplicateOption cpvalues,Mat *B)
364c3cca76eSKarl Rupp {
365c3cca76eSKarl Rupp   PetscErrorCode ierr;
366c3cca76eSKarl Rupp   Mat            C;
367c3cca76eSKarl Rupp 
368c3cca76eSKarl Rupp   PetscFunctionBegin;
369c3cca76eSKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
370c3cca76eSKarl Rupp   C = *B;
371c3cca76eSKarl Rupp 
372b470e4b4SRichard Tran Mills   ierr = MatBindToCPU_SeqAIJViennaCL(A,PETSC_FALSE);CHKERRQ(ierr);
37392f9df4aSRichard Tran Mills   C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
374c3cca76eSKarl Rupp 
375c3cca76eSKarl Rupp   C->spptr = new Mat_SeqAIJViennaCL();
376c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->tempvec        = NULL;
377c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->mat            = NULL;
378c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->compressed_mat = NULL;
379c3cca76eSKarl Rupp 
380c3cca76eSKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJVIENNACL);CHKERRQ(ierr);
381c3cca76eSKarl Rupp 
382c70f7ee4SJunchao Zhang   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
383c3cca76eSKarl Rupp 
384c3cca76eSKarl Rupp   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
385c3cca76eSKarl Rupp   if (C->assembled) {
386c3cca76eSKarl Rupp     ierr = MatViennaCLCopyToGPU(C);CHKERRQ(ierr);
387c3cca76eSKarl Rupp   }
388c3cca76eSKarl Rupp 
389c3cca76eSKarl Rupp   PetscFunctionReturn(0);
390c3cca76eSKarl Rupp }
391c3cca76eSKarl Rupp 
392f38c1e66SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A,PetscScalar *array[])
393f38c1e66SStefano Zampini {
394f38c1e66SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
395f38c1e66SStefano Zampini   PetscErrorCode ierr;
396f38c1e66SStefano Zampini 
397f38c1e66SStefano Zampini   PetscFunctionBegin;
398f38c1e66SStefano Zampini   ierr = MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL);CHKERRQ(ierr);
399f38c1e66SStefano Zampini   *array = a->a;
400c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
401f38c1e66SStefano Zampini   PetscFunctionReturn(0);
402f38c1e66SStefano Zampini }
403f38c1e66SStefano Zampini 
404f38c1e66SStefano Zampini static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A,PetscScalar *array[])
405f38c1e66SStefano Zampini {
406f38c1e66SStefano Zampini   PetscFunctionBegin;
407f38c1e66SStefano Zampini   *array = NULL;
408f38c1e66SStefano Zampini   PetscFunctionReturn(0);
409f38c1e66SStefano Zampini }
410f38c1e66SStefano Zampini 
411b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A,PetscBool flg)
412e7e92044SBarry Smith {
413c58ef05eSStefano Zampini   Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
414f38c1e66SStefano Zampini   PetscErrorCode ierr;
415f38c1e66SStefano Zampini 
416e7e92044SBarry Smith   PetscFunctionBegin;
417b470e4b4SRichard Tran Mills   A->boundtocpu  = flg;
418*ea500dcfSRichard Tran Mills   if (flg && aij->inode.size) {
419*ea500dcfSRichard Tran Mills     aij->inode.use = PETSC_TRUE;
420*ea500dcfSRichard Tran Mills   } else {
421*ea500dcfSRichard Tran Mills     aij->inode.use = PETSC_FALSE;
422*ea500dcfSRichard Tran Mills   }
423e7e92044SBarry Smith   if (flg) {
424f38c1e66SStefano Zampini     /* make sure we have an up-to-date copy on the CPU */
425f38c1e66SStefano Zampini     ierr = MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL);CHKERRQ(ierr);
426f38c1e66SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
427f38c1e66SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJRestoreArray_C",MatSeqAIJRestoreArray_SeqAIJ);CHKERRQ(ierr);
428f38c1e66SStefano Zampini 
429e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJ;
430e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJ;
431e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
432e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJ;
433e7e92044SBarry Smith   } else {
434f38c1e66SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJViennaCL);CHKERRQ(ierr);
435f38c1e66SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJRestoreArray_C",MatSeqAIJRestoreArray_SeqAIJViennaCL);CHKERRQ(ierr);
436f38c1e66SStefano Zampini 
437e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJViennaCL;
438e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
439e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
440e7e92044SBarry Smith     A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
441e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
442e7e92044SBarry Smith   }
443e7e92044SBarry Smith   PetscFunctionReturn(0);
444e7e92044SBarry Smith }
445e7e92044SBarry Smith 
4468713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
4478713a8baSPatrick Sanan {
4488713a8baSPatrick Sanan   PetscErrorCode ierr;
4498713a8baSPatrick Sanan   Mat            B;
4508713a8baSPatrick Sanan 
4518713a8baSPatrick Sanan   PetscFunctionBegin;
4528713a8baSPatrick Sanan 
453ea13f565SStefano Zampini   if (reuse == MAT_REUSE_MATRIX) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
4548713a8baSPatrick Sanan 
4558713a8baSPatrick Sanan   if (reuse == MAT_INITIAL_MATRIX) {
4568713a8baSPatrick Sanan     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
4578713a8baSPatrick Sanan   }
4588713a8baSPatrick Sanan 
4598713a8baSPatrick Sanan   B = *newmat;
4608713a8baSPatrick Sanan 
461e4a0ef16SKarl Rupp   B->spptr = new Mat_SeqAIJViennaCL();
462e4a0ef16SKarl Rupp 
463a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
464a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
465a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
466e4a0ef16SKarl Rupp 
467b470e4b4SRichard Tran Mills   ierr = MatBindToCPU_SeqAIJViennaCL(A,PETSC_FALSE);CHKERRQ(ierr);
46892f9df4aSRichard Tran Mills   A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
469e4a0ef16SKarl Rupp 
470e4a0ef16SKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
47134136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
47234136279SStefano Zampini   ierr = PetscStrallocpy(VECVIENNACL,&B->defaultvectype);CHKERRQ(ierr);
473e4a0ef16SKarl Rupp 
4748713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL);CHKERRQ(ierr);
4754222ddf1SHong Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqdense_C",MatProductSetFromOptions_SeqAIJ_SeqDense);CHKERRQ(ierr);
4764222ddf1SHong Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqaij_C",MatProductSetFromOptions_SeqAIJ);CHKERRQ(ierr);
4778713a8baSPatrick Sanan 
478c70f7ee4SJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4798713a8baSPatrick Sanan 
4808713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
4818713a8baSPatrick Sanan   if (B->assembled) {
4828713a8baSPatrick Sanan     ierr = MatViennaCLCopyToGPU(B);CHKERRQ(ierr);
4838713a8baSPatrick Sanan   }
4848713a8baSPatrick Sanan 
485e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
486e4a0ef16SKarl Rupp }
487e4a0ef16SKarl Rupp 
4883ca39a21SBarry Smith /*MC
489e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
490e4a0ef16SKarl Rupp 
491e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
492e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
493e4a0ef16SKarl Rupp 
494e4a0ef16SKarl Rupp    Options Database Keys:
495e4a0ef16SKarl Rupp +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
496e4a0ef16SKarl Rupp .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
497e4a0ef16SKarl Rupp -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
498e4a0ef16SKarl Rupp 
499e4a0ef16SKarl Rupp   Level: beginner
500e4a0ef16SKarl Rupp 
501e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
502e4a0ef16SKarl Rupp M*/
503e4a0ef16SKarl Rupp 
5043ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
50572367587SKarl Rupp {
50672367587SKarl Rupp   PetscErrorCode ierr;
50772367587SKarl Rupp 
50872367587SKarl Rupp   PetscFunctionBegin;
5093ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
5103ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
5113ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
5123ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
51372367587SKarl Rupp   PetscFunctionReturn(0);
51472367587SKarl Rupp }
515