xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision f38c1e66a6e89a4b79f8a594d3e34117f8d83b31)
1e4a0ef16SKarl Rupp 
2e4a0ef16SKarl Rupp 
3e4a0ef16SKarl Rupp /*
4e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
5e4a0ef16SKarl Rupp   matrix storage format.
6e4a0ef16SKarl Rupp */
7e4a0ef16SKarl Rupp 
8aaa7dc30SBarry Smith #include <petscconf.h>
9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10aaa7dc30SBarry Smith #include <petscbt.h>
11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
13e4a0ef16SKarl Rupp 
14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15e4a0ef16SKarl Rupp 
16e4a0ef16SKarl Rupp 
17e4a0ef16SKarl Rupp #include <algorithm>
18e4a0ef16SKarl Rupp #include <vector>
19e4a0ef16SKarl Rupp #include <string>
20e4a0ef16SKarl Rupp 
21e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
22e4a0ef16SKarl Rupp 
238713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
2472367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
2572367587SKarl Rupp 
268713a8baSPatrick Sanan 
27e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A)
28e4a0ef16SKarl Rupp {
29e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
30e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
31e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
32e4a0ef16SKarl Rupp 
33e4a0ef16SKarl Rupp   PetscFunctionBegin;
34bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
35b8ced49eSKarl Rupp     if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
36e4a0ef16SKarl Rupp       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
37e4a0ef16SKarl Rupp 
38e4a0ef16SKarl Rupp       try {
39e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
40a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
41e4a0ef16SKarl Rupp 
42a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
43a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
44e4a0ef16SKarl Rupp 
45a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
46a3430c56SKarl Rupp           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
47a3430c56SKarl Rupp             row_buffer.set(i, (a->compressedrow.i)[i]);
48e4a0ef16SKarl Rupp 
49a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
50a3430c56SKarl Rupp           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
51a3430c56SKarl Rupp             row_indices.set(i, (a->compressedrow.rindex)[i]);
52a3430c56SKarl Rupp 
53a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
54a3430c56SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
55a3430c56SKarl Rupp             col_buffer.set(i, (a->j)[i]);
56a3430c56SKarl Rupp 
57a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
584863603aSSatish Balay           ierr = PetscLogCpuToGpu(((2*a->compressedrow.nrows)+1+a->nz)*sizeof(PetscInt) + (a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
59e4a0ef16SKarl Rupp         } else {
60a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
61e4a0ef16SKarl Rupp 
62e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
63e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
64e4a0ef16SKarl Rupp 
65e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
66e4a0ef16SKarl Rupp           for (PetscInt i=0; i<=A->rmap->n; ++i)
67e4a0ef16SKarl Rupp             row_buffer.set(i, (a->i)[i]);
68e4a0ef16SKarl Rupp 
69e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
70e4a0ef16SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
71e4a0ef16SKarl Rupp             col_buffer.set(i, (a->j)[i]);
72e4a0ef16SKarl Rupp 
73e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
744863603aSSatish Balay           ierr = PetscLogCpuToGpu(((A->rmap->n+1)+a->nz)*sizeof(PetscInt)+(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
75e4a0ef16SKarl Rupp         }
764cf1874eSKarl Rupp         ViennaCLWaitForGPU();
774076e183SKarl Rupp       } catch(std::exception const & ex) {
784076e183SKarl Rupp         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
79e4a0ef16SKarl Rupp       }
80e4a0ef16SKarl Rupp 
81a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
82a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
839b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
84a3430c56SKarl Rupp           delete (ViennaCLVector*)viennaclstruct->tempvec;
859b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
86a3430c56SKarl Rupp         } else {
87a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
88a3430c56SKarl Rupp         }
89a3430c56SKarl Rupp       } else {
909b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
91a3430c56SKarl Rupp       }
92a3430c56SKarl Rupp 
93b8ced49eSKarl Rupp       A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
94e4a0ef16SKarl Rupp 
95e4a0ef16SKarl Rupp       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
96e4a0ef16SKarl Rupp     }
9767c87b7fSKarl Rupp   }
98e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
99e4a0ef16SKarl Rupp }
100e4a0ef16SKarl Rupp 
1010d73d530SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
102e4a0ef16SKarl Rupp {
103*f38c1e66SStefano Zampini   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
104e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
105e4a0ef16SKarl Rupp   PetscInt           m  = A->rmap->n;
106e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
107e4a0ef16SKarl Rupp 
108e4a0ef16SKarl Rupp   PetscFunctionBegin;
109*f38c1e66SStefano Zampini   if (A->valid_GPU_matrix == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(0);
110*f38c1e66SStefano Zampini   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED && Agpu) {
111e4a0ef16SKarl Rupp     try {
1126c4ed002SBarry Smith       if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1136c4ed002SBarry Smith       else {
114e4a0ef16SKarl Rupp 
115e4a0ef16SKarl Rupp         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
116e4a0ef16SKarl Rupp         a->nz           = Agpu->nnz();
117e4a0ef16SKarl Rupp         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
118e4a0ef16SKarl Rupp         A->preallocated = PETSC_TRUE;
119e4a0ef16SKarl Rupp         if (a->singlemalloc) {
120e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
121e4a0ef16SKarl Rupp         } else {
122e4a0ef16SKarl Rupp           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
123e4a0ef16SKarl Rupp           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
124e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
125e4a0ef16SKarl Rupp         }
126dcca6d9dSJed Brown         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
127f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
128e4a0ef16SKarl Rupp 
129e4a0ef16SKarl Rupp         a->singlemalloc = PETSC_TRUE;
130e4a0ef16SKarl Rupp 
131e4a0ef16SKarl Rupp         /* Setup row lengths */
132071fcb05SBarry Smith         ierr = PetscFree(a->imax);CHKERRQ(ierr);
133071fcb05SBarry Smith         ierr = PetscFree(a->ilen);CHKERRQ(ierr);
134071fcb05SBarry Smith         ierr = PetscMalloc1(m,&a->imax);CHKERRQ(ierr);
135071fcb05SBarry Smith         ierr = PetscMalloc1(m,&a->ilen);CHKERRQ(ierr);
136f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
137e4a0ef16SKarl Rupp 
138e4a0ef16SKarl Rupp         /* Copy data back from GPU */
139e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
140e4a0ef16SKarl Rupp 
141e4a0ef16SKarl Rupp         // copy row array
142e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
143e4a0ef16SKarl Rupp         (a->i)[0] = row_buffer[0];
144e4a0ef16SKarl Rupp         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
145e4a0ef16SKarl Rupp           (a->i)[i+1] = row_buffer[i+1];
146e4a0ef16SKarl Rupp           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
147e4a0ef16SKarl Rupp         }
148e4a0ef16SKarl Rupp 
149e4a0ef16SKarl Rupp         // copy column indices
150e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
151e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
152e4a0ef16SKarl Rupp         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
153e4a0ef16SKarl Rupp           (a->j)[i] = col_buffer[i];
154e4a0ef16SKarl Rupp 
155e4a0ef16SKarl Rupp         // copy nonzero entries directly to destination (no conversion required)
156e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
157e4a0ef16SKarl Rupp 
1584863603aSSatish Balay         ierr = PetscLogGpuToCpu(row_buffer.raw_size()+col_buffer.raw_size()+(Agpu->nnz()*sizeof(PetscScalar)));CHKERRQ(ierr);
1594cf1874eSKarl Rupp         ViennaCLWaitForGPU();
160023073b3SKarl Rupp         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
161e4a0ef16SKarl Rupp       }
1624076e183SKarl Rupp     } catch(std::exception const & ex) {
1634076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
164e4a0ef16SKarl Rupp     }
165*f38c1e66SStefano Zampini   } else if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED) {
166*f38c1e66SStefano Zampini     PetscFunctionReturn(0);
167*f38c1e66SStefano Zampini   } else {
168*f38c1e66SStefano Zampini     if (!Agpu && A->valid_GPU_matrix != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
169e4a0ef16SKarl Rupp 
170*f38c1e66SStefano Zampini     if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
171*f38c1e66SStefano Zampini     if (!Agpu) {
172*f38c1e66SStefano Zampini       viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar)*viennaclstruct->mat->nnz(), a->a);
173*f38c1e66SStefano Zampini     } else {
174*f38c1e66SStefano Zampini       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
175*f38c1e66SStefano Zampini     }
176*f38c1e66SStefano Zampini   }
177*f38c1e66SStefano Zampini   A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
178b8ced49eSKarl Rupp   /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
179e4a0ef16SKarl Rupp   ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
180e4a0ef16SKarl Rupp   ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
181e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
182e4a0ef16SKarl Rupp }
183e4a0ef16SKarl Rupp 
184e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
185e4a0ef16SKarl Rupp {
186e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
187e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
188e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
1890d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL;
1900d73d530SKarl Rupp   ViennaCLVector       *ygpu=NULL;
191e4a0ef16SKarl Rupp 
192e4a0ef16SKarl Rupp   PetscFunctionBegin;
193bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
194e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
195e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
1967a052e47Shannah_mairs     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
197e4a0ef16SKarl Rupp     try {
198b7832b47SStefano Zampini       if (a->compressedrow.use) {
199b7832b47SStefano Zampini         *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
200b7832b47SStefano Zampini       } else {
201e4a0ef16SKarl Rupp         *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
202b7832b47SStefano Zampini       }
2034cf1874eSKarl Rupp       ViennaCLWaitForGPU();
2044076e183SKarl Rupp     } catch (std::exception const & ex) {
2054076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
206e4a0ef16SKarl Rupp     }
207958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
208e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
209e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
210958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
211bf1781e8SStefano Zampini   } else {
212bf1781e8SStefano Zampini     ierr = VecSet(yy,0);CHKERRQ(ierr);
21367c87b7fSKarl Rupp   }
214e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
215e4a0ef16SKarl Rupp }
216e4a0ef16SKarl Rupp 
217e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
218e4a0ef16SKarl Rupp {
219e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
220e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
221e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
2220d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
2230d73d530SKarl Rupp   ViennaCLVector       *zgpu=NULL;
224e4a0ef16SKarl Rupp 
225e4a0ef16SKarl Rupp   PetscFunctionBegin;
226bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
227e4a0ef16SKarl Rupp     try {
228e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
229e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
230e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
2317a052e47Shannah_mairs       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
232*f38c1e66SStefano Zampini       if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
233*f38c1e66SStefano Zampini       else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
234*f38c1e66SStefano Zampini       if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec;
235*f38c1e66SStefano Zampini       else *zgpu += *viennaclstruct->tempvec;
2364cf1874eSKarl Rupp       ViennaCLWaitForGPU();
237958c4211Shannah_mairs       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
238e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
239e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
240e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
241e4a0ef16SKarl Rupp 
2424076e183SKarl Rupp     } catch(std::exception const & ex) {
2434076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
244e4a0ef16SKarl Rupp     }
245958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
246bf1781e8SStefano Zampini   } else {
247bf1781e8SStefano Zampini     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
24867c87b7fSKarl Rupp   }
249e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
250e4a0ef16SKarl Rupp }
251e4a0ef16SKarl Rupp 
252e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
253e4a0ef16SKarl Rupp {
254e4a0ef16SKarl Rupp   PetscErrorCode ierr;
255e4a0ef16SKarl Rupp 
256e4a0ef16SKarl Rupp   PetscFunctionBegin;
257e4a0ef16SKarl Rupp   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
258489de41dSStefano Zampini   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
259e7e92044SBarry Smith   if (!A->pinnedtocpu) {
260e4a0ef16SKarl Rupp     ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
261e7e92044SBarry Smith   }
262e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
263e4a0ef16SKarl Rupp }
264e4a0ef16SKarl Rupp 
265e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
266e4a0ef16SKarl Rupp /*@
267e4a0ef16SKarl Rupp    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
26819fddfadSKarl Rupp    (the default parallel PETSc format).  This matrix will ultimately be pushed down
269e4a0ef16SKarl Rupp    to GPUs and use the ViennaCL library for calculations. For good matrix
270e4a0ef16SKarl Rupp    assembly performance the user should preallocate the matrix storage by setting
271e4a0ef16SKarl Rupp    the parameter nz (or the array nnz).  By setting these parameters accurately,
272e4a0ef16SKarl Rupp    performance during matrix assembly can be increased substantially.
273e4a0ef16SKarl Rupp 
274e4a0ef16SKarl Rupp 
275d083f849SBarry Smith    Collective
276e4a0ef16SKarl Rupp 
277e4a0ef16SKarl Rupp    Input Parameters:
278e4a0ef16SKarl Rupp +  comm - MPI communicator, set to PETSC_COMM_SELF
279e4a0ef16SKarl Rupp .  m - number of rows
280e4a0ef16SKarl Rupp .  n - number of columns
281e4a0ef16SKarl Rupp .  nz - number of nonzeros per row (same for all rows)
282e4a0ef16SKarl Rupp -  nnz - array containing the number of nonzeros in the various rows
283e4a0ef16SKarl Rupp          (possibly different for each row) or NULL
284e4a0ef16SKarl Rupp 
285e4a0ef16SKarl Rupp    Output Parameter:
286e4a0ef16SKarl Rupp .  A - the matrix
287e4a0ef16SKarl Rupp 
288e4a0ef16SKarl Rupp    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
289e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
290e4a0ef16SKarl Rupp    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
291e4a0ef16SKarl Rupp 
292e4a0ef16SKarl Rupp    Notes:
293e4a0ef16SKarl Rupp    If nnz is given then nz is ignored
294e4a0ef16SKarl Rupp 
295e4a0ef16SKarl Rupp    The AIJ format (also called the Yale sparse matrix format or
296e4a0ef16SKarl Rupp    compressed row storage), is fully compatible with standard Fortran 77
297e4a0ef16SKarl Rupp    storage.  That is, the stored row and column indices can begin at
298e4a0ef16SKarl Rupp    either one (as in Fortran) or zero.  See the users' manual for details.
299e4a0ef16SKarl Rupp 
300e4a0ef16SKarl Rupp    Specify the preallocated storage with either nz or nnz (not both).
301e4a0ef16SKarl Rupp    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
302e4a0ef16SKarl Rupp    allocation.  For large problems you MUST preallocate memory or you
303e4a0ef16SKarl Rupp    will get TERRIBLE performance, see the users' manual chapter on matrices.
304e4a0ef16SKarl Rupp 
305e4a0ef16SKarl Rupp    Level: intermediate
306e4a0ef16SKarl Rupp 
307e9e886b6SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSPARSE(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
308e4a0ef16SKarl Rupp 
309e4a0ef16SKarl Rupp @*/
310e4a0ef16SKarl Rupp PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
311e4a0ef16SKarl Rupp {
312e4a0ef16SKarl Rupp   PetscErrorCode ierr;
313e4a0ef16SKarl Rupp 
314e4a0ef16SKarl Rupp   PetscFunctionBegin;
315e4a0ef16SKarl Rupp   ierr = MatCreate(comm,A);CHKERRQ(ierr);
316e4a0ef16SKarl Rupp   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
317e4a0ef16SKarl Rupp   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
318e4a0ef16SKarl Rupp   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
319e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
320e4a0ef16SKarl Rupp }
321e4a0ef16SKarl Rupp 
322e4a0ef16SKarl Rupp 
323e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
324e4a0ef16SKarl Rupp {
325e4a0ef16SKarl Rupp   PetscErrorCode ierr;
326e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
327e4a0ef16SKarl Rupp 
328e4a0ef16SKarl Rupp   PetscFunctionBegin;
329e4a0ef16SKarl Rupp   try {
3306447cd05SKarl Rupp     if (viennaclcontainer) {
3316447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3326447cd05SKarl Rupp       delete viennaclcontainer->mat;
3336447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
334e4a0ef16SKarl Rupp       delete viennaclcontainer;
3356447cd05SKarl Rupp     }
336b8ced49eSKarl Rupp     A->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
3374076e183SKarl Rupp   } catch(std::exception const & ex) {
3384076e183SKarl Rupp     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
339e4a0ef16SKarl Rupp   }
3408713a8baSPatrick Sanan 
3418713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL);CHKERRQ(ierr);
3428713a8baSPatrick Sanan 
343e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
344e4a0ef16SKarl Rupp   A->spptr = 0;
345e4a0ef16SKarl Rupp   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
346e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
347e4a0ef16SKarl Rupp }
348e4a0ef16SKarl Rupp 
349e4a0ef16SKarl Rupp 
350e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
351e4a0ef16SKarl Rupp {
352e4a0ef16SKarl Rupp   PetscErrorCode ierr;
353e4a0ef16SKarl Rupp 
354e4a0ef16SKarl Rupp   PetscFunctionBegin;
355e4a0ef16SKarl Rupp   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
3568713a8baSPatrick Sanan   ierr = MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B);
3578713a8baSPatrick Sanan   PetscFunctionReturn(0);
3588713a8baSPatrick Sanan }
3598713a8baSPatrick Sanan 
360e7e92044SBarry Smith static PetscErrorCode MatPinToCPU_SeqAIJViennaCL(Mat,PetscBool);
361c3cca76eSKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A,MatDuplicateOption cpvalues,Mat *B)
362c3cca76eSKarl Rupp {
363c3cca76eSKarl Rupp   PetscErrorCode ierr;
364c3cca76eSKarl Rupp   Mat            C;
365c3cca76eSKarl Rupp 
366c3cca76eSKarl Rupp   PetscFunctionBegin;
367c3cca76eSKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
368c3cca76eSKarl Rupp   C = *B;
369c3cca76eSKarl Rupp 
370e7e92044SBarry Smith   ierr = MatPinToCPU_SeqAIJViennaCL(A,PETSC_FALSE);CHKERRQ(ierr);
371e7e92044SBarry Smith   C->ops->pintocpu = MatPinToCPU_SeqAIJViennaCL;
372c3cca76eSKarl Rupp 
373c3cca76eSKarl Rupp   C->spptr = new Mat_SeqAIJViennaCL();
374c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->tempvec        = NULL;
375c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->mat            = NULL;
376c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->compressed_mat = NULL;
377c3cca76eSKarl Rupp 
378c3cca76eSKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJVIENNACL);CHKERRQ(ierr);
379c3cca76eSKarl Rupp 
380b8ced49eSKarl Rupp   C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
381c3cca76eSKarl Rupp 
382c3cca76eSKarl Rupp   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
383c3cca76eSKarl Rupp   if (C->assembled) {
384c3cca76eSKarl Rupp     ierr = MatViennaCLCopyToGPU(C);CHKERRQ(ierr);
385c3cca76eSKarl Rupp   }
386c3cca76eSKarl Rupp 
387c3cca76eSKarl Rupp   PetscFunctionReturn(0);
388c3cca76eSKarl Rupp }
389c3cca76eSKarl Rupp 
390*f38c1e66SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A,PetscScalar *array[])
391*f38c1e66SStefano Zampini {
392*f38c1e66SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
393*f38c1e66SStefano Zampini   PetscErrorCode ierr;
394*f38c1e66SStefano Zampini 
395*f38c1e66SStefano Zampini   PetscFunctionBegin;
396*f38c1e66SStefano Zampini   ierr = MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL);CHKERRQ(ierr);
397*f38c1e66SStefano Zampini   *array = a->a;
398*f38c1e66SStefano Zampini   A->valid_GPU_matrix = PETSC_OFFLOAD_CPU;
399*f38c1e66SStefano Zampini   PetscFunctionReturn(0);
400*f38c1e66SStefano Zampini }
401*f38c1e66SStefano Zampini 
402*f38c1e66SStefano Zampini 
403*f38c1e66SStefano Zampini static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A,PetscScalar *array[])
404*f38c1e66SStefano Zampini {
405*f38c1e66SStefano Zampini   PetscFunctionBegin;
406*f38c1e66SStefano Zampini   *array = NULL;
407*f38c1e66SStefano Zampini   PetscFunctionReturn(0);
408*f38c1e66SStefano Zampini }
409*f38c1e66SStefano Zampini 
410e7e92044SBarry Smith static PetscErrorCode MatPinToCPU_SeqAIJViennaCL(Mat A,PetscBool flg)
411e7e92044SBarry Smith {
412*f38c1e66SStefano Zampini   PetscErrorCode ierr;
413*f38c1e66SStefano Zampini 
414e7e92044SBarry Smith   PetscFunctionBegin;
415e7e92044SBarry Smith   A->pinnedtocpu = flg;
416e7e92044SBarry Smith   if (flg) {
417*f38c1e66SStefano Zampini     /* make sure we have an up-to-date copy on the CPU */
418*f38c1e66SStefano Zampini     ierr = MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL);CHKERRQ(ierr);
419*f38c1e66SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
420*f38c1e66SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJRestoreArray_C",MatSeqAIJRestoreArray_SeqAIJ);CHKERRQ(ierr);
421*f38c1e66SStefano Zampini 
422e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJ;
423e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJ;
424e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
425e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJ;
426e7e92044SBarry Smith   } else {
427*f38c1e66SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJViennaCL);CHKERRQ(ierr);
428*f38c1e66SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJRestoreArray_C",MatSeqAIJRestoreArray_SeqAIJViennaCL);CHKERRQ(ierr);
429*f38c1e66SStefano Zampini 
430e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJViennaCL;
431e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
432e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
433e7e92044SBarry Smith     A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
434e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
435e7e92044SBarry Smith   }
436e7e92044SBarry Smith   PetscFunctionReturn(0);
437e7e92044SBarry Smith }
438e7e92044SBarry Smith 
4398713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
4408713a8baSPatrick Sanan {
4418713a8baSPatrick Sanan   PetscErrorCode ierr;
4428713a8baSPatrick Sanan   Mat            B;
4438713a8baSPatrick Sanan   Mat_SeqAIJ     *aij;
4448713a8baSPatrick Sanan 
4458713a8baSPatrick Sanan   PetscFunctionBegin;
4468713a8baSPatrick Sanan 
4478713a8baSPatrick Sanan   if (reuse == MAT_REUSE_MATRIX) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
4488713a8baSPatrick Sanan 
4498713a8baSPatrick Sanan   if (reuse == MAT_INITIAL_MATRIX) {
4508713a8baSPatrick Sanan     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
4518713a8baSPatrick Sanan   }
4528713a8baSPatrick Sanan 
4538713a8baSPatrick Sanan   B = *newmat;
4548713a8baSPatrick Sanan 
455e4a0ef16SKarl Rupp   aij             = (Mat_SeqAIJ*)B->data;
456e4a0ef16SKarl Rupp   aij->inode.use  = PETSC_FALSE;
4578713a8baSPatrick Sanan 
458e4a0ef16SKarl Rupp   B->spptr        = new Mat_SeqAIJViennaCL();
459e4a0ef16SKarl Rupp 
460a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
461a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
462a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
463e4a0ef16SKarl Rupp 
464e7e92044SBarry Smith   ierr = MatPinToCPU_SeqAIJViennaCL(A,PETSC_FALSE);CHKERRQ(ierr);
465e7e92044SBarry Smith   A->ops->pintocpu = MatPinToCPU_SeqAIJViennaCL;
466e4a0ef16SKarl Rupp 
467e4a0ef16SKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
46834136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
46934136279SStefano Zampini   ierr = PetscStrallocpy(VECVIENNACL,&B->defaultvectype);CHKERRQ(ierr);
470e4a0ef16SKarl Rupp 
4718713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL);CHKERRQ(ierr);
4728713a8baSPatrick Sanan 
473b8ced49eSKarl Rupp   B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
4748713a8baSPatrick Sanan 
4758713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
4768713a8baSPatrick Sanan   if (B->assembled) {
4778713a8baSPatrick Sanan     ierr = MatViennaCLCopyToGPU(B);CHKERRQ(ierr);
4788713a8baSPatrick Sanan   }
4798713a8baSPatrick Sanan 
480e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
481e4a0ef16SKarl Rupp }
482e4a0ef16SKarl Rupp 
483e4a0ef16SKarl Rupp 
4843ca39a21SBarry Smith /*MC
485e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
486e4a0ef16SKarl Rupp 
487e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
488e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
489e4a0ef16SKarl Rupp 
490e4a0ef16SKarl Rupp    Options Database Keys:
491e4a0ef16SKarl Rupp +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
492e4a0ef16SKarl Rupp .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
493e4a0ef16SKarl Rupp -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
494e4a0ef16SKarl Rupp 
495e4a0ef16SKarl Rupp   Level: beginner
496e4a0ef16SKarl Rupp 
497e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
498e4a0ef16SKarl Rupp M*/
499e4a0ef16SKarl Rupp 
50072367587SKarl Rupp 
5013ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
50272367587SKarl Rupp {
50372367587SKarl Rupp   PetscErrorCode ierr;
50472367587SKarl Rupp 
50572367587SKarl Rupp   PetscFunctionBegin;
5063ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
5073ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
5083ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
5093ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
51072367587SKarl Rupp   PetscFunctionReturn(0);
51172367587SKarl Rupp }
512