xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 489de41d07dee5e052e0dd527a81a1379b674967)
1e4a0ef16SKarl Rupp 
2e4a0ef16SKarl Rupp 
3e4a0ef16SKarl Rupp /*
4e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
5e4a0ef16SKarl Rupp   matrix storage format.
6e4a0ef16SKarl Rupp */
7e4a0ef16SKarl Rupp 
8aaa7dc30SBarry Smith #include <petscconf.h>
9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10aaa7dc30SBarry Smith #include <petscbt.h>
11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
13e4a0ef16SKarl Rupp 
14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15e4a0ef16SKarl Rupp 
16e4a0ef16SKarl Rupp 
17e4a0ef16SKarl Rupp #include <algorithm>
18e4a0ef16SKarl Rupp #include <vector>
19e4a0ef16SKarl Rupp #include <string>
20e4a0ef16SKarl Rupp 
21e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
22e4a0ef16SKarl Rupp 
238713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
2472367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
2572367587SKarl Rupp 
268713a8baSPatrick Sanan 
27e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A)
28e4a0ef16SKarl Rupp {
29e4a0ef16SKarl Rupp 
30e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
31e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
32e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
33e4a0ef16SKarl Rupp 
34e4a0ef16SKarl Rupp   PetscFunctionBegin;
35bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
36b8ced49eSKarl Rupp     if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
37e4a0ef16SKarl Rupp       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
38e4a0ef16SKarl Rupp 
39e4a0ef16SKarl Rupp       try {
40e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
41a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
42e4a0ef16SKarl Rupp 
43a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
44a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
45e4a0ef16SKarl Rupp 
46a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
47a3430c56SKarl Rupp           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
48a3430c56SKarl Rupp             row_buffer.set(i, (a->compressedrow.i)[i]);
49e4a0ef16SKarl Rupp 
50a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
51a3430c56SKarl Rupp           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
52a3430c56SKarl Rupp             row_indices.set(i, (a->compressedrow.rindex)[i]);
53a3430c56SKarl Rupp 
54a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
55a3430c56SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
56a3430c56SKarl Rupp             col_buffer.set(i, (a->j)[i]);
57a3430c56SKarl Rupp 
58a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
594863603aSSatish Balay           ierr = PetscLogCpuToGpu(((2*a->compressedrow.nrows)+1+a->nz)*sizeof(PetscInt) + (a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
60e4a0ef16SKarl Rupp         } else {
61a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
62e4a0ef16SKarl Rupp 
63e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
64e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
65e4a0ef16SKarl Rupp 
66e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
67e4a0ef16SKarl Rupp           for (PetscInt i=0; i<=A->rmap->n; ++i)
68e4a0ef16SKarl Rupp             row_buffer.set(i, (a->i)[i]);
69e4a0ef16SKarl Rupp 
70e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
71e4a0ef16SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
72e4a0ef16SKarl Rupp             col_buffer.set(i, (a->j)[i]);
73e4a0ef16SKarl Rupp 
74e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
754863603aSSatish Balay           ierr = PetscLogCpuToGpu(((A->rmap->n+1)+a->nz)*sizeof(PetscInt)+(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
76e4a0ef16SKarl Rupp         }
774cf1874eSKarl Rupp         ViennaCLWaitForGPU();
784076e183SKarl Rupp       } catch(std::exception const & ex) {
794076e183SKarl Rupp         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
80e4a0ef16SKarl Rupp       }
81e4a0ef16SKarl Rupp 
82a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
83a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
849b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
85a3430c56SKarl Rupp           delete (ViennaCLVector*)viennaclstruct->tempvec;
869b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
87a3430c56SKarl Rupp         } else {
88a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
89a3430c56SKarl Rupp         }
90a3430c56SKarl Rupp       } else {
919b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
92a3430c56SKarl Rupp       }
93a3430c56SKarl Rupp 
94b8ced49eSKarl Rupp       A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
95e4a0ef16SKarl Rupp 
96e4a0ef16SKarl Rupp       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
97e4a0ef16SKarl Rupp     }
9867c87b7fSKarl Rupp   }
99e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
100e4a0ef16SKarl Rupp }
101e4a0ef16SKarl Rupp 
1020d73d530SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
103e4a0ef16SKarl Rupp {
104e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
105e4a0ef16SKarl Rupp   PetscInt           m               = A->rmap->n;
106e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
107e4a0ef16SKarl Rupp 
108e4a0ef16SKarl Rupp 
109e4a0ef16SKarl Rupp   PetscFunctionBegin;
110b8ced49eSKarl Rupp   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED) {
111e4a0ef16SKarl Rupp     try {
1126c4ed002SBarry Smith       if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1136c4ed002SBarry Smith       else {
114e4a0ef16SKarl Rupp 
115e4a0ef16SKarl Rupp         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
116e4a0ef16SKarl Rupp         a->nz           = Agpu->nnz();
117e4a0ef16SKarl Rupp         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
118e4a0ef16SKarl Rupp         A->preallocated = PETSC_TRUE;
119e4a0ef16SKarl Rupp         if (a->singlemalloc) {
120e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
121e4a0ef16SKarl Rupp         } else {
122e4a0ef16SKarl Rupp           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
123e4a0ef16SKarl Rupp           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
124e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
125e4a0ef16SKarl Rupp         }
126dcca6d9dSJed Brown         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
127f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
128e4a0ef16SKarl Rupp 
129e4a0ef16SKarl Rupp         a->singlemalloc = PETSC_TRUE;
130e4a0ef16SKarl Rupp 
131e4a0ef16SKarl Rupp         /* Setup row lengths */
132071fcb05SBarry Smith         ierr = PetscFree(a->imax);CHKERRQ(ierr);
133071fcb05SBarry Smith         ierr = PetscFree(a->ilen);CHKERRQ(ierr);
134071fcb05SBarry Smith         ierr = PetscMalloc1(m,&a->imax);CHKERRQ(ierr);
135071fcb05SBarry Smith         ierr = PetscMalloc1(m,&a->ilen);CHKERRQ(ierr);
136f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
137e4a0ef16SKarl Rupp 
138e4a0ef16SKarl Rupp         /* Copy data back from GPU */
139e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
140e4a0ef16SKarl Rupp 
141e4a0ef16SKarl Rupp         // copy row array
142e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
143e4a0ef16SKarl Rupp         (a->i)[0] = row_buffer[0];
144e4a0ef16SKarl Rupp         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
145e4a0ef16SKarl Rupp           (a->i)[i+1] = row_buffer[i+1];
146e4a0ef16SKarl Rupp           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
147e4a0ef16SKarl Rupp         }
148e4a0ef16SKarl Rupp 
149e4a0ef16SKarl Rupp         // copy column indices
150e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
151e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
152e4a0ef16SKarl Rupp         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
153e4a0ef16SKarl Rupp           (a->j)[i] = col_buffer[i];
154e4a0ef16SKarl Rupp 
155e4a0ef16SKarl Rupp         // copy nonzero entries directly to destination (no conversion required)
156e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
157e4a0ef16SKarl Rupp 
1584863603aSSatish Balay         ierr = PetscLogGpuToCpu(row_buffer.raw_size()+col_buffer.raw_size()+(Agpu->nnz()*sizeof(PetscScalar)));CHKERRQ(ierr);
1594cf1874eSKarl Rupp         ViennaCLWaitForGPU();
160023073b3SKarl Rupp         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
161e4a0ef16SKarl Rupp       }
1624076e183SKarl Rupp     } catch(std::exception const & ex) {
1634076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
164e4a0ef16SKarl Rupp     }
165e4a0ef16SKarl Rupp 
166b8ced49eSKarl Rupp     /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
167e4a0ef16SKarl Rupp     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
168e4a0ef16SKarl Rupp     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
169e4a0ef16SKarl Rupp 
170b8ced49eSKarl Rupp     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1716c4ed002SBarry Smith   } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
172e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
173e4a0ef16SKarl Rupp }
174e4a0ef16SKarl Rupp 
175e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
176e4a0ef16SKarl Rupp {
177e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
178e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
179e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
1800d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL;
1810d73d530SKarl Rupp   ViennaCLVector       *ygpu=NULL;
182e4a0ef16SKarl Rupp 
183e4a0ef16SKarl Rupp   PetscFunctionBegin;
184bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
185e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
186e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
1877a052e47Shannah_mairs     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
188e4a0ef16SKarl Rupp     try {
189b7832b47SStefano Zampini       if (a->compressedrow.use) {
190b7832b47SStefano Zampini         *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
191b7832b47SStefano Zampini       } else {
192e4a0ef16SKarl Rupp         *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
193b7832b47SStefano Zampini       }
1944cf1874eSKarl Rupp       ViennaCLWaitForGPU();
1954076e183SKarl Rupp     } catch (std::exception const & ex) {
1964076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
197e4a0ef16SKarl Rupp     }
198958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
199e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
200e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
201958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
202bf1781e8SStefano Zampini   } else {
203bf1781e8SStefano Zampini     ierr = VecSet(yy,0);CHKERRQ(ierr);
20467c87b7fSKarl Rupp   }
205e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
206e4a0ef16SKarl Rupp }
207e4a0ef16SKarl Rupp 
208e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
209e4a0ef16SKarl Rupp {
210e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
211e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
212e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
2130d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
2140d73d530SKarl Rupp   ViennaCLVector       *zgpu=NULL;
215e4a0ef16SKarl Rupp 
216e4a0ef16SKarl Rupp   PetscFunctionBegin;
217bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
218e4a0ef16SKarl Rupp     try {
219e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
220e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
221e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
2227a052e47Shannah_mairs       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
223e4a0ef16SKarl Rupp       if (a->compressedrow.use) {
224a3430c56SKarl Rupp         ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
225e4a0ef16SKarl Rupp         *zgpu = *ygpu + temp;
2264cf1874eSKarl Rupp         ViennaCLWaitForGPU();
227e4a0ef16SKarl Rupp       } else {
228a3430c56SKarl Rupp         if (zz == xx || zz == yy) { //temporary required
229a3430c56SKarl Rupp           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
230a3430c56SKarl Rupp           *zgpu = *ygpu;
231a3430c56SKarl Rupp           *zgpu += temp;
232a3430c56SKarl Rupp           ViennaCLWaitForGPU();
233a3430c56SKarl Rupp         } else {
234a3430c56SKarl Rupp           *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
235a3430c56SKarl Rupp           *zgpu = *ygpu + *viennaclstruct->tempvec;
2364cf1874eSKarl Rupp           ViennaCLWaitForGPU();
237e4a0ef16SKarl Rupp         }
238e4a0ef16SKarl Rupp       }
239958c4211Shannah_mairs       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
240e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
241e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
242e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
243e4a0ef16SKarl Rupp 
2444076e183SKarl Rupp     } catch(std::exception const & ex) {
2454076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
246e4a0ef16SKarl Rupp     }
247958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
248bf1781e8SStefano Zampini   } else {
249bf1781e8SStefano Zampini     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
25067c87b7fSKarl Rupp   }
251e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
252e4a0ef16SKarl Rupp }
253e4a0ef16SKarl Rupp 
254e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
255e4a0ef16SKarl Rupp {
256e4a0ef16SKarl Rupp   PetscErrorCode ierr;
257e4a0ef16SKarl Rupp 
258e4a0ef16SKarl Rupp   PetscFunctionBegin;
259e4a0ef16SKarl Rupp   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
260*489de41dSStefano Zampini   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
261e7e92044SBarry Smith   if (!A->pinnedtocpu) {
262e4a0ef16SKarl Rupp     ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
263e7e92044SBarry Smith   }
264e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
265e4a0ef16SKarl Rupp }
266e4a0ef16SKarl Rupp 
267e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
268e4a0ef16SKarl Rupp /*@
269e4a0ef16SKarl Rupp    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
27019fddfadSKarl Rupp    (the default parallel PETSc format).  This matrix will ultimately be pushed down
271e4a0ef16SKarl Rupp    to GPUs and use the ViennaCL library for calculations. For good matrix
272e4a0ef16SKarl Rupp    assembly performance the user should preallocate the matrix storage by setting
273e4a0ef16SKarl Rupp    the parameter nz (or the array nnz).  By setting these parameters accurately,
274e4a0ef16SKarl Rupp    performance during matrix assembly can be increased substantially.
275e4a0ef16SKarl Rupp 
276e4a0ef16SKarl Rupp 
277d083f849SBarry Smith    Collective
278e4a0ef16SKarl Rupp 
279e4a0ef16SKarl Rupp    Input Parameters:
280e4a0ef16SKarl Rupp +  comm - MPI communicator, set to PETSC_COMM_SELF
281e4a0ef16SKarl Rupp .  m - number of rows
282e4a0ef16SKarl Rupp .  n - number of columns
283e4a0ef16SKarl Rupp .  nz - number of nonzeros per row (same for all rows)
284e4a0ef16SKarl Rupp -  nnz - array containing the number of nonzeros in the various rows
285e4a0ef16SKarl Rupp          (possibly different for each row) or NULL
286e4a0ef16SKarl Rupp 
287e4a0ef16SKarl Rupp    Output Parameter:
288e4a0ef16SKarl Rupp .  A - the matrix
289e4a0ef16SKarl Rupp 
290e4a0ef16SKarl Rupp    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
291e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
292e4a0ef16SKarl Rupp    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
293e4a0ef16SKarl Rupp 
294e4a0ef16SKarl Rupp    Notes:
295e4a0ef16SKarl Rupp    If nnz is given then nz is ignored
296e4a0ef16SKarl Rupp 
297e4a0ef16SKarl Rupp    The AIJ format (also called the Yale sparse matrix format or
298e4a0ef16SKarl Rupp    compressed row storage), is fully compatible with standard Fortran 77
299e4a0ef16SKarl Rupp    storage.  That is, the stored row and column indices can begin at
300e4a0ef16SKarl Rupp    either one (as in Fortran) or zero.  See the users' manual for details.
301e4a0ef16SKarl Rupp 
302e4a0ef16SKarl Rupp    Specify the preallocated storage with either nz or nnz (not both).
303e4a0ef16SKarl Rupp    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
304e4a0ef16SKarl Rupp    allocation.  For large problems you MUST preallocate memory or you
305e4a0ef16SKarl Rupp    will get TERRIBLE performance, see the users' manual chapter on matrices.
306e4a0ef16SKarl Rupp 
307e4a0ef16SKarl Rupp    Level: intermediate
308e4a0ef16SKarl Rupp 
309e9e886b6SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSPARSE(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
310e4a0ef16SKarl Rupp 
311e4a0ef16SKarl Rupp @*/
312e4a0ef16SKarl Rupp PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
313e4a0ef16SKarl Rupp {
314e4a0ef16SKarl Rupp   PetscErrorCode ierr;
315e4a0ef16SKarl Rupp 
316e4a0ef16SKarl Rupp   PetscFunctionBegin;
317e4a0ef16SKarl Rupp   ierr = MatCreate(comm,A);CHKERRQ(ierr);
318e4a0ef16SKarl Rupp   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
319e4a0ef16SKarl Rupp   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
320e4a0ef16SKarl Rupp   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
321e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
322e4a0ef16SKarl Rupp }
323e4a0ef16SKarl Rupp 
324e4a0ef16SKarl Rupp 
325e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
326e4a0ef16SKarl Rupp {
327e4a0ef16SKarl Rupp   PetscErrorCode ierr;
328e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
329e4a0ef16SKarl Rupp 
330e4a0ef16SKarl Rupp   PetscFunctionBegin;
331e4a0ef16SKarl Rupp   try {
3326447cd05SKarl Rupp     if (viennaclcontainer) {
3336447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3346447cd05SKarl Rupp       delete viennaclcontainer->mat;
3356447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
336e4a0ef16SKarl Rupp       delete viennaclcontainer;
3376447cd05SKarl Rupp     }
338b8ced49eSKarl Rupp     A->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
3394076e183SKarl Rupp   } catch(std::exception const & ex) {
3404076e183SKarl Rupp     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
341e4a0ef16SKarl Rupp   }
3428713a8baSPatrick Sanan 
3438713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL);CHKERRQ(ierr);
3448713a8baSPatrick Sanan 
345e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
346e4a0ef16SKarl Rupp   A->spptr = 0;
347e4a0ef16SKarl Rupp   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
348e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
349e4a0ef16SKarl Rupp }
350e4a0ef16SKarl Rupp 
351e4a0ef16SKarl Rupp 
352e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
353e4a0ef16SKarl Rupp {
354e4a0ef16SKarl Rupp   PetscErrorCode ierr;
355e4a0ef16SKarl Rupp 
356e4a0ef16SKarl Rupp   PetscFunctionBegin;
357e4a0ef16SKarl Rupp   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
3588713a8baSPatrick Sanan   ierr = MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B);
3598713a8baSPatrick Sanan   PetscFunctionReturn(0);
3608713a8baSPatrick Sanan }
3618713a8baSPatrick Sanan 
362e7e92044SBarry Smith static PetscErrorCode MatPinToCPU_SeqAIJViennaCL(Mat,PetscBool);
363c3cca76eSKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A,MatDuplicateOption cpvalues,Mat *B)
364c3cca76eSKarl Rupp {
365c3cca76eSKarl Rupp   PetscErrorCode ierr;
366c3cca76eSKarl Rupp   Mat            C;
367c3cca76eSKarl Rupp 
368c3cca76eSKarl Rupp   PetscFunctionBegin;
369c3cca76eSKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
370c3cca76eSKarl Rupp   C = *B;
371c3cca76eSKarl Rupp 
372e7e92044SBarry Smith   ierr = MatPinToCPU_SeqAIJViennaCL(A,PETSC_FALSE);CHKERRQ(ierr);
373e7e92044SBarry Smith   C->ops->pintocpu = MatPinToCPU_SeqAIJViennaCL;
374c3cca76eSKarl Rupp 
375c3cca76eSKarl Rupp   C->spptr        = new Mat_SeqAIJViennaCL();
376c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->tempvec        = NULL;
377c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->mat            = NULL;
378c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->compressed_mat = NULL;
379c3cca76eSKarl Rupp 
380c3cca76eSKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJVIENNACL);CHKERRQ(ierr);
381c3cca76eSKarl Rupp 
382b8ced49eSKarl Rupp   C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
383c3cca76eSKarl Rupp 
384c3cca76eSKarl Rupp   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
385c3cca76eSKarl Rupp   if (C->assembled) {
386c3cca76eSKarl Rupp     ierr = MatViennaCLCopyToGPU(C);CHKERRQ(ierr);
387c3cca76eSKarl Rupp   }
388c3cca76eSKarl Rupp 
389c3cca76eSKarl Rupp   PetscFunctionReturn(0);
390c3cca76eSKarl Rupp }
391c3cca76eSKarl Rupp 
392e7e92044SBarry Smith static PetscErrorCode MatPinToCPU_SeqAIJViennaCL(Mat A,PetscBool flg)
393e7e92044SBarry Smith {
394e7e92044SBarry Smith   PetscFunctionBegin;
395e7e92044SBarry Smith   A->pinnedtocpu = flg;
396e7e92044SBarry Smith   if (flg) {
397e7e92044SBarry Smith     A->ops->mult           = MatMult_SeqAIJ;
398e7e92044SBarry Smith     A->ops->multadd        = MatMultAdd_SeqAIJ;
399e7e92044SBarry Smith     A->ops->assemblyend    = MatAssemblyEnd_SeqAIJ;
400e7e92044SBarry Smith     A->ops->duplicate      = MatDuplicate_SeqAIJ;
401e7e92044SBarry Smith   } else {
402e7e92044SBarry Smith     A->ops->mult           = MatMult_SeqAIJViennaCL;
403e7e92044SBarry Smith     A->ops->multadd        = MatMultAdd_SeqAIJViennaCL;
404e7e92044SBarry Smith     A->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
405e7e92044SBarry Smith     A->ops->destroy        = MatDestroy_SeqAIJViennaCL;
406e7e92044SBarry Smith     A->ops->duplicate      = MatDuplicate_SeqAIJViennaCL;
407e7e92044SBarry Smith   }
408e7e92044SBarry Smith   PetscFunctionReturn(0);
409e7e92044SBarry Smith }
410e7e92044SBarry Smith 
4118713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
4128713a8baSPatrick Sanan {
4138713a8baSPatrick Sanan   PetscErrorCode ierr;
4148713a8baSPatrick Sanan   Mat            B;
4158713a8baSPatrick Sanan   Mat_SeqAIJ     *aij;
4168713a8baSPatrick Sanan 
4178713a8baSPatrick Sanan   PetscFunctionBegin;
4188713a8baSPatrick Sanan 
4198713a8baSPatrick Sanan   if (reuse == MAT_REUSE_MATRIX) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
4208713a8baSPatrick Sanan 
4218713a8baSPatrick Sanan   if (reuse == MAT_INITIAL_MATRIX) {
4228713a8baSPatrick Sanan     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
4238713a8baSPatrick Sanan   }
4248713a8baSPatrick Sanan 
4258713a8baSPatrick Sanan   B = *newmat;
4268713a8baSPatrick Sanan 
427e4a0ef16SKarl Rupp   aij             = (Mat_SeqAIJ*)B->data;
428e4a0ef16SKarl Rupp   aij->inode.use  = PETSC_FALSE;
4298713a8baSPatrick Sanan 
430e4a0ef16SKarl Rupp   B->spptr        = new Mat_SeqAIJViennaCL();
431e4a0ef16SKarl Rupp 
432a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
433a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
434a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
435e4a0ef16SKarl Rupp 
436e7e92044SBarry Smith   ierr = MatPinToCPU_SeqAIJViennaCL(A,PETSC_FALSE);CHKERRQ(ierr);
437e7e92044SBarry Smith   A->ops->pintocpu = MatPinToCPU_SeqAIJViennaCL;
438e4a0ef16SKarl Rupp 
439e4a0ef16SKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
44034136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
44134136279SStefano Zampini   ierr = PetscStrallocpy(VECVIENNACL,&B->defaultvectype);CHKERRQ(ierr);
442e4a0ef16SKarl Rupp 
4438713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL);CHKERRQ(ierr);
4448713a8baSPatrick Sanan 
445b8ced49eSKarl Rupp   B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
4468713a8baSPatrick Sanan 
4478713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
4488713a8baSPatrick Sanan   if (B->assembled) {
4498713a8baSPatrick Sanan     ierr = MatViennaCLCopyToGPU(B);CHKERRQ(ierr);
4508713a8baSPatrick Sanan   }
4518713a8baSPatrick Sanan 
452e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
453e4a0ef16SKarl Rupp }
454e4a0ef16SKarl Rupp 
455e4a0ef16SKarl Rupp 
4563ca39a21SBarry Smith /*MC
457e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
458e4a0ef16SKarl Rupp 
459e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
460e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
461e4a0ef16SKarl Rupp 
462e4a0ef16SKarl Rupp    Options Database Keys:
463e4a0ef16SKarl Rupp +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
464e4a0ef16SKarl Rupp .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
465e4a0ef16SKarl Rupp -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
466e4a0ef16SKarl Rupp 
467e4a0ef16SKarl Rupp   Level: beginner
468e4a0ef16SKarl Rupp 
469e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
470e4a0ef16SKarl Rupp M*/
471e4a0ef16SKarl Rupp 
47272367587SKarl Rupp 
4733ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
47472367587SKarl Rupp {
47572367587SKarl Rupp   PetscErrorCode ierr;
47672367587SKarl Rupp 
47772367587SKarl Rupp   PetscFunctionBegin;
4783ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL,    MAT_FACTOR_LU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
4793ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL,    MAT_FACTOR_CHOLESKY,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
4803ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL,    MAT_FACTOR_ILU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
4813ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL,    MAT_FACTOR_ICC,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
48272367587SKarl Rupp   PetscFunctionReturn(0);
48372367587SKarl Rupp }
484