xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision bf1781e84c1367df3075bc0cee46209ee12ec3dd)
1e4a0ef16SKarl Rupp 
2e4a0ef16SKarl Rupp 
3e4a0ef16SKarl Rupp /*
4e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
5e4a0ef16SKarl Rupp   matrix storage format.
6e4a0ef16SKarl Rupp */
7e4a0ef16SKarl Rupp 
8aaa7dc30SBarry Smith #include <petscconf.h>
9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10aaa7dc30SBarry Smith #include <petscbt.h>
11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
13e4a0ef16SKarl Rupp 
14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15e4a0ef16SKarl Rupp 
16e4a0ef16SKarl Rupp 
17e4a0ef16SKarl Rupp #include <algorithm>
18e4a0ef16SKarl Rupp #include <vector>
19e4a0ef16SKarl Rupp #include <string>
20e4a0ef16SKarl Rupp 
21e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
22e4a0ef16SKarl Rupp 
238713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
2472367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
2572367587SKarl Rupp 
268713a8baSPatrick Sanan 
27e4a0ef16SKarl Rupp PetscErrorCode MatViennaCLCopyToGPU(Mat A)
28e4a0ef16SKarl Rupp {
29e4a0ef16SKarl Rupp 
30e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
31e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
32e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
33e4a0ef16SKarl Rupp 
34e4a0ef16SKarl Rupp   PetscFunctionBegin;
35*bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
36b8ced49eSKarl Rupp     if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
37e4a0ef16SKarl Rupp       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
38e4a0ef16SKarl Rupp 
39e4a0ef16SKarl Rupp       try {
40e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
41a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
42e4a0ef16SKarl Rupp 
43a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
44a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
45e4a0ef16SKarl Rupp 
46a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
47a3430c56SKarl Rupp           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
48a3430c56SKarl Rupp             row_buffer.set(i, (a->compressedrow.i)[i]);
49e4a0ef16SKarl Rupp 
50a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
51a3430c56SKarl Rupp           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
52a3430c56SKarl Rupp             row_indices.set(i, (a->compressedrow.rindex)[i]);
53a3430c56SKarl Rupp 
54a3430c56SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
55a3430c56SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
56a3430c56SKarl Rupp             col_buffer.set(i, (a->j)[i]);
57a3430c56SKarl Rupp 
58a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
59e4a0ef16SKarl Rupp         } else {
60a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
61e4a0ef16SKarl Rupp 
62e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
63e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
64e4a0ef16SKarl Rupp 
65e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
66e4a0ef16SKarl Rupp           for (PetscInt i=0; i<=A->rmap->n; ++i)
67e4a0ef16SKarl Rupp             row_buffer.set(i, (a->i)[i]);
68e4a0ef16SKarl Rupp 
69e4a0ef16SKarl Rupp           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
70e4a0ef16SKarl Rupp           for (PetscInt i=0; i<a->nz; ++i)
71e4a0ef16SKarl Rupp             col_buffer.set(i, (a->j)[i]);
72e4a0ef16SKarl Rupp 
73e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
74e4a0ef16SKarl Rupp         }
754cf1874eSKarl Rupp         ViennaCLWaitForGPU();
764076e183SKarl Rupp       } catch(std::exception const & ex) {
774076e183SKarl Rupp         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
78e4a0ef16SKarl Rupp       }
79e4a0ef16SKarl Rupp 
80a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
81a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
829b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
83a3430c56SKarl Rupp           delete (ViennaCLVector*)viennaclstruct->tempvec;
849b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
85a3430c56SKarl Rupp         } else {
86a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
87a3430c56SKarl Rupp         }
88a3430c56SKarl Rupp       } else {
899b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
90a3430c56SKarl Rupp       }
91a3430c56SKarl Rupp 
92b8ced49eSKarl Rupp       A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
93e4a0ef16SKarl Rupp 
94e4a0ef16SKarl Rupp       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
95e4a0ef16SKarl Rupp     }
9667c87b7fSKarl Rupp   }
97e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
98e4a0ef16SKarl Rupp }
99e4a0ef16SKarl Rupp 
1000d73d530SKarl Rupp PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
101e4a0ef16SKarl Rupp {
102e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
103e4a0ef16SKarl Rupp   PetscInt           m               = A->rmap->n;
104e4a0ef16SKarl Rupp   PetscErrorCode     ierr;
105e4a0ef16SKarl Rupp 
106e4a0ef16SKarl Rupp 
107e4a0ef16SKarl Rupp   PetscFunctionBegin;
108b8ced49eSKarl Rupp   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED) {
109e4a0ef16SKarl Rupp     try {
1106c4ed002SBarry Smith       if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1116c4ed002SBarry Smith       else {
112e4a0ef16SKarl Rupp 
113e4a0ef16SKarl Rupp         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
114e4a0ef16SKarl Rupp         a->nz           = Agpu->nnz();
115e4a0ef16SKarl Rupp         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
116e4a0ef16SKarl Rupp         A->preallocated = PETSC_TRUE;
117e4a0ef16SKarl Rupp         if (a->singlemalloc) {
118e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
119e4a0ef16SKarl Rupp         } else {
120e4a0ef16SKarl Rupp           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
121e4a0ef16SKarl Rupp           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
122e4a0ef16SKarl Rupp           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
123e4a0ef16SKarl Rupp         }
124dcca6d9dSJed Brown         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
125f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
126e4a0ef16SKarl Rupp 
127e4a0ef16SKarl Rupp         a->singlemalloc = PETSC_TRUE;
128e4a0ef16SKarl Rupp 
129e4a0ef16SKarl Rupp         /* Setup row lengths */
130e4a0ef16SKarl Rupp         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
131dcca6d9dSJed Brown         ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr);
132f7daeb2aSKarl Rupp         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
133e4a0ef16SKarl Rupp 
134e4a0ef16SKarl Rupp         /* Copy data back from GPU */
135e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
136e4a0ef16SKarl Rupp 
137e4a0ef16SKarl Rupp         // copy row array
138e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
139e4a0ef16SKarl Rupp         (a->i)[0] = row_buffer[0];
140e4a0ef16SKarl Rupp         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
141e4a0ef16SKarl Rupp           (a->i)[i+1] = row_buffer[i+1];
142e4a0ef16SKarl Rupp           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
143e4a0ef16SKarl Rupp         }
144e4a0ef16SKarl Rupp 
145e4a0ef16SKarl Rupp         // copy column indices
146e4a0ef16SKarl Rupp         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
147e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
148e4a0ef16SKarl Rupp         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
149e4a0ef16SKarl Rupp           (a->j)[i] = col_buffer[i];
150e4a0ef16SKarl Rupp 
151e4a0ef16SKarl Rupp         // copy nonzero entries directly to destination (no conversion required)
152e4a0ef16SKarl Rupp         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
153e4a0ef16SKarl Rupp 
1544cf1874eSKarl Rupp         ViennaCLWaitForGPU();
155023073b3SKarl Rupp         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
156e4a0ef16SKarl Rupp       }
1574076e183SKarl Rupp     } catch(std::exception const & ex) {
1584076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
159e4a0ef16SKarl Rupp     }
160e4a0ef16SKarl Rupp 
161b8ced49eSKarl Rupp     /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
162e4a0ef16SKarl Rupp     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
163e4a0ef16SKarl Rupp     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
164e4a0ef16SKarl Rupp 
165b8ced49eSKarl Rupp     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1666c4ed002SBarry Smith   } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
167e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
168e4a0ef16SKarl Rupp }
169e4a0ef16SKarl Rupp 
170e4a0ef16SKarl Rupp PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
171e4a0ef16SKarl Rupp {
172e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
173e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
174e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
1750d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL;
1760d73d530SKarl Rupp   ViennaCLVector       *ygpu=NULL;
177e4a0ef16SKarl Rupp 
178e4a0ef16SKarl Rupp   PetscFunctionBegin;
179*bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
180e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
181e4a0ef16SKarl Rupp     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
182e4a0ef16SKarl Rupp     try {
183e4a0ef16SKarl Rupp       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
1844cf1874eSKarl Rupp       ViennaCLWaitForGPU();
1854076e183SKarl Rupp     } catch (std::exception const & ex) {
1864076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
187e4a0ef16SKarl Rupp     }
188e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
189e4a0ef16SKarl Rupp     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
1909b66742cSDave May     ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
191*bf1781e8SStefano Zampini   } else {
192*bf1781e8SStefano Zampini     ierr = VecSet(yy,0);CHKERRQ(ierr);
19367c87b7fSKarl Rupp   }
194e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
195e4a0ef16SKarl Rupp }
196e4a0ef16SKarl Rupp 
197e4a0ef16SKarl Rupp PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
198e4a0ef16SKarl Rupp {
199e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
200e4a0ef16SKarl Rupp   PetscErrorCode       ierr;
201e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
2020d73d530SKarl Rupp   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
2030d73d530SKarl Rupp   ViennaCLVector       *zgpu=NULL;
204e4a0ef16SKarl Rupp 
205e4a0ef16SKarl Rupp   PetscFunctionBegin;
206*bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
207e4a0ef16SKarl Rupp     try {
208e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
209e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
210e4a0ef16SKarl Rupp       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
211e4a0ef16SKarl Rupp 
212e4a0ef16SKarl Rupp       if (a->compressedrow.use) {
213a3430c56SKarl Rupp         ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
214e4a0ef16SKarl Rupp         *zgpu = *ygpu + temp;
2154cf1874eSKarl Rupp         ViennaCLWaitForGPU();
216e4a0ef16SKarl Rupp       } else {
217a3430c56SKarl Rupp         if (zz == xx || zz == yy) { //temporary required
218a3430c56SKarl Rupp           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
219a3430c56SKarl Rupp           *zgpu = *ygpu;
220a3430c56SKarl Rupp           *zgpu += temp;
221a3430c56SKarl Rupp           ViennaCLWaitForGPU();
222a3430c56SKarl Rupp         } else {
223a3430c56SKarl Rupp           *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
224a3430c56SKarl Rupp           *zgpu = *ygpu + *viennaclstruct->tempvec;
2254cf1874eSKarl Rupp           ViennaCLWaitForGPU();
226e4a0ef16SKarl Rupp         }
227e4a0ef16SKarl Rupp       }
228e4a0ef16SKarl Rupp 
229e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
230e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
231e4a0ef16SKarl Rupp       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
232e4a0ef16SKarl Rupp 
2334076e183SKarl Rupp     } catch(std::exception const & ex) {
2344076e183SKarl Rupp       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
235e4a0ef16SKarl Rupp     }
236e4a0ef16SKarl Rupp     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
237*bf1781e8SStefano Zampini   } else {
238*bf1781e8SStefano Zampini     ierr = VecCopy(yy,zz);CHKERRQ(ierr);
23967c87b7fSKarl Rupp   }
240e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
241e4a0ef16SKarl Rupp }
242e4a0ef16SKarl Rupp 
243e4a0ef16SKarl Rupp PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
244e4a0ef16SKarl Rupp {
245e4a0ef16SKarl Rupp   PetscErrorCode ierr;
246e4a0ef16SKarl Rupp 
247e4a0ef16SKarl Rupp   PetscFunctionBegin;
248e4a0ef16SKarl Rupp   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
249e4a0ef16SKarl Rupp   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
250e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
251e4a0ef16SKarl Rupp }
252e4a0ef16SKarl Rupp 
253e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
254e4a0ef16SKarl Rupp /*@
255e4a0ef16SKarl Rupp    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
25619fddfadSKarl Rupp    (the default parallel PETSc format).  This matrix will ultimately be pushed down
257e4a0ef16SKarl Rupp    to GPUs and use the ViennaCL library for calculations. For good matrix
258e4a0ef16SKarl Rupp    assembly performance the user should preallocate the matrix storage by setting
259e4a0ef16SKarl Rupp    the parameter nz (or the array nnz).  By setting these parameters accurately,
260e4a0ef16SKarl Rupp    performance during matrix assembly can be increased substantially.
261e4a0ef16SKarl Rupp 
262e4a0ef16SKarl Rupp 
263e4a0ef16SKarl Rupp    Collective on MPI_Comm
264e4a0ef16SKarl Rupp 
265e4a0ef16SKarl Rupp    Input Parameters:
266e4a0ef16SKarl Rupp +  comm - MPI communicator, set to PETSC_COMM_SELF
267e4a0ef16SKarl Rupp .  m - number of rows
268e4a0ef16SKarl Rupp .  n - number of columns
269e4a0ef16SKarl Rupp .  nz - number of nonzeros per row (same for all rows)
270e4a0ef16SKarl Rupp -  nnz - array containing the number of nonzeros in the various rows
271e4a0ef16SKarl Rupp          (possibly different for each row) or NULL
272e4a0ef16SKarl Rupp 
273e4a0ef16SKarl Rupp    Output Parameter:
274e4a0ef16SKarl Rupp .  A - the matrix
275e4a0ef16SKarl Rupp 
276e4a0ef16SKarl Rupp    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
277e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
278e4a0ef16SKarl Rupp    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
279e4a0ef16SKarl Rupp 
280e4a0ef16SKarl Rupp    Notes:
281e4a0ef16SKarl Rupp    If nnz is given then nz is ignored
282e4a0ef16SKarl Rupp 
283e4a0ef16SKarl Rupp    The AIJ format (also called the Yale sparse matrix format or
284e4a0ef16SKarl Rupp    compressed row storage), is fully compatible with standard Fortran 77
285e4a0ef16SKarl Rupp    storage.  That is, the stored row and column indices can begin at
286e4a0ef16SKarl Rupp    either one (as in Fortran) or zero.  See the users' manual for details.
287e4a0ef16SKarl Rupp 
288e4a0ef16SKarl Rupp    Specify the preallocated storage with either nz or nnz (not both).
289e4a0ef16SKarl Rupp    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
290e4a0ef16SKarl Rupp    allocation.  For large problems you MUST preallocate memory or you
291e4a0ef16SKarl Rupp    will get TERRIBLE performance, see the users' manual chapter on matrices.
292e4a0ef16SKarl Rupp 
293e4a0ef16SKarl Rupp    Level: intermediate
294e4a0ef16SKarl Rupp 
295e9e886b6SKarl Rupp .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSPARSE(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
296e4a0ef16SKarl Rupp 
297e4a0ef16SKarl Rupp @*/
298e4a0ef16SKarl Rupp PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
299e4a0ef16SKarl Rupp {
300e4a0ef16SKarl Rupp   PetscErrorCode ierr;
301e4a0ef16SKarl Rupp 
302e4a0ef16SKarl Rupp   PetscFunctionBegin;
303e4a0ef16SKarl Rupp   ierr = MatCreate(comm,A);CHKERRQ(ierr);
304e4a0ef16SKarl Rupp   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
305e4a0ef16SKarl Rupp   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
306e4a0ef16SKarl Rupp   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
307e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
308e4a0ef16SKarl Rupp }
309e4a0ef16SKarl Rupp 
310e4a0ef16SKarl Rupp 
311e4a0ef16SKarl Rupp PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
312e4a0ef16SKarl Rupp {
313e4a0ef16SKarl Rupp   PetscErrorCode ierr;
314e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
315e4a0ef16SKarl Rupp 
316e4a0ef16SKarl Rupp   PetscFunctionBegin;
317e4a0ef16SKarl Rupp   try {
3186447cd05SKarl Rupp     if (viennaclcontainer) {
3196447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3206447cd05SKarl Rupp       delete viennaclcontainer->mat;
3216447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
322e4a0ef16SKarl Rupp       delete viennaclcontainer;
3236447cd05SKarl Rupp     }
324b8ced49eSKarl Rupp     A->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
3254076e183SKarl Rupp   } catch(std::exception const & ex) {
3264076e183SKarl Rupp     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
327e4a0ef16SKarl Rupp   }
3288713a8baSPatrick Sanan 
3298713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL);CHKERRQ(ierr);
3308713a8baSPatrick Sanan 
331e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
332e4a0ef16SKarl Rupp   A->spptr = 0;
333e4a0ef16SKarl Rupp   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
334e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
335e4a0ef16SKarl Rupp }
336e4a0ef16SKarl Rupp 
337e4a0ef16SKarl Rupp 
338e4a0ef16SKarl Rupp PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
339e4a0ef16SKarl Rupp {
340e4a0ef16SKarl Rupp   PetscErrorCode ierr;
341e4a0ef16SKarl Rupp 
342e4a0ef16SKarl Rupp   PetscFunctionBegin;
343e4a0ef16SKarl Rupp   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
3448713a8baSPatrick Sanan   ierr = MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B);
3458713a8baSPatrick Sanan   PetscFunctionReturn(0);
3468713a8baSPatrick Sanan }
3478713a8baSPatrick Sanan 
348c3cca76eSKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A,MatDuplicateOption cpvalues,Mat *B)
349c3cca76eSKarl Rupp {
350c3cca76eSKarl Rupp   PetscErrorCode ierr;
351c3cca76eSKarl Rupp   Mat C;
352c3cca76eSKarl Rupp 
353c3cca76eSKarl Rupp   PetscFunctionBegin;
354c3cca76eSKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
355c3cca76eSKarl Rupp   C = *B;
356c3cca76eSKarl Rupp 
357c3cca76eSKarl Rupp   C->ops->mult        = MatMult_SeqAIJViennaCL;
358c3cca76eSKarl Rupp   C->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
359c3cca76eSKarl Rupp   C->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
360c3cca76eSKarl Rupp   C->ops->destroy     = MatDestroy_SeqAIJViennaCL;
361c3cca76eSKarl Rupp   C->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
362c3cca76eSKarl Rupp 
363c3cca76eSKarl Rupp   C->spptr        = new Mat_SeqAIJViennaCL();
364c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->tempvec        = NULL;
365c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->mat            = NULL;
366c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL*)C->spptr)->compressed_mat = NULL;
367c3cca76eSKarl Rupp 
368c3cca76eSKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJVIENNACL);CHKERRQ(ierr);
369c3cca76eSKarl Rupp 
370b8ced49eSKarl Rupp   C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
371c3cca76eSKarl Rupp 
372c3cca76eSKarl Rupp   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
373c3cca76eSKarl Rupp   if (C->assembled) {
374c3cca76eSKarl Rupp     ierr = MatViennaCLCopyToGPU(C);CHKERRQ(ierr);
375c3cca76eSKarl Rupp   }
376c3cca76eSKarl Rupp 
377c3cca76eSKarl Rupp   PetscFunctionReturn(0);
378c3cca76eSKarl Rupp }
379c3cca76eSKarl Rupp 
3808713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
3818713a8baSPatrick Sanan {
3828713a8baSPatrick Sanan   PetscErrorCode ierr;
3838713a8baSPatrick Sanan   Mat            B;
3848713a8baSPatrick Sanan   Mat_SeqAIJ     *aij;
3858713a8baSPatrick Sanan 
3868713a8baSPatrick Sanan   PetscFunctionBegin;
3878713a8baSPatrick Sanan 
3888713a8baSPatrick Sanan   if (reuse == MAT_REUSE_MATRIX) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
3898713a8baSPatrick Sanan 
3908713a8baSPatrick Sanan   if (reuse == MAT_INITIAL_MATRIX) {
3918713a8baSPatrick Sanan     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
3928713a8baSPatrick Sanan   }
3938713a8baSPatrick Sanan 
3948713a8baSPatrick Sanan   B = *newmat;
3958713a8baSPatrick Sanan 
396e4a0ef16SKarl Rupp   aij             = (Mat_SeqAIJ*)B->data;
397e4a0ef16SKarl Rupp   aij->inode.use  = PETSC_FALSE;
3988713a8baSPatrick Sanan 
399e4a0ef16SKarl Rupp   B->ops->mult    = MatMult_SeqAIJViennaCL;
400e4a0ef16SKarl Rupp   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
401e4a0ef16SKarl Rupp   B->spptr        = new Mat_SeqAIJViennaCL();
402e4a0ef16SKarl Rupp 
403a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
404a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
405a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
406e4a0ef16SKarl Rupp 
407e4a0ef16SKarl Rupp   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
408e4a0ef16SKarl Rupp   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
409c3cca76eSKarl Rupp   B->ops->duplicate      = MatDuplicate_SeqAIJViennaCL;
410e4a0ef16SKarl Rupp 
411e4a0ef16SKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
41234136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
41334136279SStefano Zampini   ierr = PetscStrallocpy(VECVIENNACL,&B->defaultvectype);CHKERRQ(ierr);
414e4a0ef16SKarl Rupp 
4158713a8baSPatrick Sanan   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL);CHKERRQ(ierr);
4168713a8baSPatrick Sanan 
417b8ced49eSKarl Rupp   B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
4188713a8baSPatrick Sanan 
4198713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
4208713a8baSPatrick Sanan   if (B->assembled) {
4218713a8baSPatrick Sanan     ierr = MatViennaCLCopyToGPU(B);CHKERRQ(ierr);
4228713a8baSPatrick Sanan   }
4238713a8baSPatrick Sanan 
424e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
425e4a0ef16SKarl Rupp }
426e4a0ef16SKarl Rupp 
427e4a0ef16SKarl Rupp 
4283ca39a21SBarry Smith /*MC
429e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
430e4a0ef16SKarl Rupp 
431e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
432e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
433e4a0ef16SKarl Rupp 
434e4a0ef16SKarl Rupp    Options Database Keys:
435e4a0ef16SKarl Rupp +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
436e4a0ef16SKarl Rupp .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
437e4a0ef16SKarl Rupp -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
438e4a0ef16SKarl Rupp 
439e4a0ef16SKarl Rupp   Level: beginner
440e4a0ef16SKarl Rupp 
441e4a0ef16SKarl Rupp .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
442e4a0ef16SKarl Rupp M*/
443e4a0ef16SKarl Rupp 
44472367587SKarl Rupp 
4453ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
44672367587SKarl Rupp {
44772367587SKarl Rupp   PetscErrorCode ierr;
44872367587SKarl Rupp 
44972367587SKarl Rupp   PetscFunctionBegin;
4503ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL,    MAT_FACTOR_LU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
4513ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL,    MAT_FACTOR_CHOLESKY,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
4523ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL,    MAT_FACTOR_ILU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
4533ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL,    MAT_FACTOR_ICC,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
45472367587SKarl Rupp   PetscFunctionReturn(0);
45572367587SKarl Rupp }
456