xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 11a5261e40035b7c793f2783a2ba6c7cd4f3b077)
1e4a0ef16SKarl Rupp 
2e4a0ef16SKarl Rupp /*
3e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
4e4a0ef16SKarl Rupp   matrix storage format.
5e4a0ef16SKarl Rupp */
6e4a0ef16SKarl Rupp 
7aaa7dc30SBarry Smith #include <petscconf.h>
899acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10aaa7dc30SBarry Smith #include <petscbt.h>
11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
13e4a0ef16SKarl Rupp 
14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15e4a0ef16SKarl Rupp 
16e4a0ef16SKarl Rupp #include <algorithm>
17e4a0ef16SKarl Rupp #include <vector>
18e4a0ef16SKarl Rupp #include <string>
19e4a0ef16SKarl Rupp 
20e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
21e4a0ef16SKarl Rupp 
228713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
2372367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat, MatFactorType, Mat *);
244222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
258713a8baSPatrick Sanan 
269371c9d4SSatish Balay PetscErrorCode MatViennaCLCopyToGPU(Mat A) {
27e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
28e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
29e4a0ef16SKarl Rupp 
30e4a0ef16SKarl Rupp   PetscFunctionBegin;
31bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
32c70f7ee4SJunchao Zhang     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
339566063dSJacob Faibussowitsch       PetscCall(PetscLogEventBegin(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
34e4a0ef16SKarl Rupp 
35e4a0ef16SKarl Rupp       try {
36e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
37a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
38e4a0ef16SKarl Rupp 
39a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
40a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
41e4a0ef16SKarl Rupp 
429371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
439371c9d4SSatish Balay           row_buffer.raw_resize(dummy, a->compressedrow.nrows + 1);
449371c9d4SSatish Balay           for (PetscInt i = 0; i <= a->compressedrow.nrows; ++i) row_buffer.set(i, (a->compressedrow.i)[i]);
45e4a0ef16SKarl Rupp 
469371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_indices;
479371c9d4SSatish Balay           row_indices.raw_resize(dummy, a->compressedrow.nrows);
489371c9d4SSatish Balay           for (PetscInt i = 0; i < a->compressedrow.nrows; ++i) row_indices.set(i, (a->compressedrow.rindex)[i]);
49a3430c56SKarl Rupp 
509371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
519371c9d4SSatish Balay           col_buffer.raw_resize(dummy, a->nz);
529371c9d4SSatish Balay           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
53a3430c56SKarl Rupp 
54a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
559566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu(((2 * a->compressedrow.nrows) + 1 + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
56e4a0ef16SKarl Rupp         } else {
57a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
58e4a0ef16SKarl Rupp 
59e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
60e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
61e4a0ef16SKarl Rupp 
629371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
639371c9d4SSatish Balay           row_buffer.raw_resize(dummy, A->rmap->n + 1);
649371c9d4SSatish Balay           for (PetscInt i = 0; i <= A->rmap->n; ++i) row_buffer.set(i, (a->i)[i]);
65e4a0ef16SKarl Rupp 
669371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
679371c9d4SSatish Balay           col_buffer.raw_resize(dummy, a->nz);
689371c9d4SSatish Balay           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
69e4a0ef16SKarl Rupp 
70e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
719566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu(((A->rmap->n + 1) + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
72e4a0ef16SKarl Rupp         }
734cf1874eSKarl Rupp         ViennaCLWaitForGPU();
749371c9d4SSatish Balay       } catch (std::exception const &ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); }
75e4a0ef16SKarl Rupp 
76a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
77a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
789b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
79a3430c56SKarl Rupp           delete (ViennaCLVector *)viennaclstruct->tempvec;
809b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
81a3430c56SKarl Rupp         } else {
82a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
83a3430c56SKarl Rupp         }
84a3430c56SKarl Rupp       } else {
859b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
86a3430c56SKarl Rupp       }
87a3430c56SKarl Rupp 
88c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
89e4a0ef16SKarl Rupp 
909566063dSJacob Faibussowitsch       PetscCall(PetscLogEventEnd(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
91e4a0ef16SKarl Rupp     }
9267c87b7fSKarl Rupp   }
93e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
94e4a0ef16SKarl Rupp }
95e4a0ef16SKarl Rupp 
969371c9d4SSatish Balay PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu) {
97f38c1e66SStefano Zampini   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
98e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
99e4a0ef16SKarl Rupp   PetscInt            m              = A->rmap->n;
100e4a0ef16SKarl Rupp 
101e4a0ef16SKarl Rupp   PetscFunctionBegin;
102c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(0);
103c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) {
104e4a0ef16SKarl Rupp     try {
1052c71b3e2SJacob Faibussowitsch       PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1062c71b3e2SJacob Faibussowitsch       PetscCheck((PetscInt)Agpu->size1() == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "GPU matrix has %lu rows, should be %" PetscInt_FMT, Agpu->size1(), m);
107e4a0ef16SKarl Rupp       a->nz           = Agpu->nnz();
108e4a0ef16SKarl Rupp       a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
109e4a0ef16SKarl Rupp       A->preallocated = PETSC_TRUE;
110e4a0ef16SKarl Rupp       if (a->singlemalloc) {
1119566063dSJacob Faibussowitsch         if (a->a) PetscCall(PetscFree3(a->a, a->j, a->i));
112e4a0ef16SKarl Rupp       } else {
1139566063dSJacob Faibussowitsch         if (a->i) PetscCall(PetscFree(a->i));
1149566063dSJacob Faibussowitsch         if (a->j) PetscCall(PetscFree(a->j));
1159566063dSJacob Faibussowitsch         if (a->a) PetscCall(PetscFree(a->a));
116e4a0ef16SKarl Rupp       }
1179566063dSJacob Faibussowitsch       PetscCall(PetscMalloc3(a->nz, &a->a, a->nz, &a->j, m + 1, &a->i));
1189566063dSJacob Faibussowitsch       PetscCall(PetscLogObjectMemory((PetscObject)A, a->nz * (sizeof(PetscScalar) + sizeof(PetscInt)) + (m + 1) * sizeof(PetscInt)));
119e4a0ef16SKarl Rupp 
120e4a0ef16SKarl Rupp       a->singlemalloc = PETSC_TRUE;
121e4a0ef16SKarl Rupp 
122e4a0ef16SKarl Rupp       /* Setup row lengths */
1239566063dSJacob Faibussowitsch       PetscCall(PetscFree(a->imax));
1249566063dSJacob Faibussowitsch       PetscCall(PetscFree(a->ilen));
1259566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m, &a->imax));
1269566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m, &a->ilen));
1279566063dSJacob Faibussowitsch       PetscCall(PetscLogObjectMemory((PetscObject)A, 2 * m * sizeof(PetscInt)));
128e4a0ef16SKarl Rupp 
129e4a0ef16SKarl Rupp       /* Copy data back from GPU */
1309371c9d4SSatish Balay       viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
1319371c9d4SSatish Balay       row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
132e4a0ef16SKarl Rupp 
133e4a0ef16SKarl Rupp       // copy row array
134e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
135e4a0ef16SKarl Rupp       (a->i)[0] = row_buffer[0];
136e4a0ef16SKarl Rupp       for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
137e4a0ef16SKarl Rupp         (a->i)[i + 1] = row_buffer[i + 1];
138e4a0ef16SKarl Rupp         a->imax[i] = a->ilen[i] = a->i[i + 1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
139e4a0ef16SKarl Rupp       }
140e4a0ef16SKarl Rupp 
141e4a0ef16SKarl Rupp       // copy column indices
1429371c9d4SSatish Balay       viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
1439371c9d4SSatish Balay       col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
144e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
1459371c9d4SSatish Balay       for (PetscInt i = 0; i < (PetscInt)Agpu->nnz(); ++i) (a->j)[i] = col_buffer[i];
146e4a0ef16SKarl Rupp 
147e4a0ef16SKarl Rupp       // copy nonzero entries directly to destination (no conversion required)
148e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
149e4a0ef16SKarl Rupp 
1509566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuToCpu(row_buffer.raw_size() + col_buffer.raw_size() + (Agpu->nnz() * sizeof(PetscScalar))));
1514cf1874eSKarl Rupp       ViennaCLWaitForGPU();
152023073b3SKarl Rupp       /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
1539371c9d4SSatish Balay     } catch (std::exception const &ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); }
154c70f7ee4SJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
155f38c1e66SStefano Zampini     PetscFunctionReturn(0);
156f38c1e66SStefano Zampini   } else {
157c70f7ee4SJunchao Zhang     if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
158e4a0ef16SKarl Rupp 
1592c71b3e2SJacob Faibussowitsch     PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
160f38c1e66SStefano Zampini     if (!Agpu) {
161f38c1e66SStefano Zampini       viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar) * viennaclstruct->mat->nnz(), a->a);
162f38c1e66SStefano Zampini     } else {
163f38c1e66SStefano Zampini       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
164f38c1e66SStefano Zampini     }
165f38c1e66SStefano Zampini   }
166c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
167b8ced49eSKarl Rupp   /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
1689566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1699566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
170e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
171e4a0ef16SKarl Rupp }
172e4a0ef16SKarl Rupp 
1739371c9d4SSatish Balay PetscErrorCode MatMult_SeqAIJViennaCL(Mat A, Vec xx, Vec yy) {
174e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
175e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
1760d73d530SKarl Rupp   const ViennaCLVector *xgpu           = NULL;
1770d73d530SKarl Rupp   ViennaCLVector       *ygpu           = NULL;
178e4a0ef16SKarl Rupp 
179e4a0ef16SKarl Rupp   PetscFunctionBegin;
180837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1819566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyToGPU(A));
182bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
1839566063dSJacob Faibussowitsch     PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
1849566063dSJacob Faibussowitsch     PetscCall(VecViennaCLGetArrayWrite(yy, &ygpu));
1859566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
186e4a0ef16SKarl Rupp     try {
187b7832b47SStefano Zampini       if (a->compressedrow.use) {
188b7832b47SStefano Zampini         *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
189b7832b47SStefano Zampini       } else {
190e4a0ef16SKarl Rupp         *ygpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
191b7832b47SStefano Zampini       }
1924cf1874eSKarl Rupp       ViennaCLWaitForGPU();
1939371c9d4SSatish Balay     } catch (std::exception const &ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); }
1949566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
1959566063dSJacob Faibussowitsch     PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
1969566063dSJacob Faibussowitsch     PetscCall(VecViennaCLRestoreArrayWrite(yy, &ygpu));
1979566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
198bf1781e8SStefano Zampini   } else {
1999566063dSJacob Faibussowitsch     PetscCall(VecSet_SeqViennaCL(yy, 0));
20067c87b7fSKarl Rupp   }
201e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
202e4a0ef16SKarl Rupp }
203e4a0ef16SKarl Rupp 
2049371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A, Vec xx, Vec yy, Vec zz) {
205e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
206e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
2070d73d530SKarl Rupp   const ViennaCLVector *xgpu = NULL, *ygpu = NULL;
2080d73d530SKarl Rupp   ViennaCLVector       *zgpu = NULL;
209e4a0ef16SKarl Rupp 
210e4a0ef16SKarl Rupp   PetscFunctionBegin;
211837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2129566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyToGPU(A));
213bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
214e4a0ef16SKarl Rupp     try {
2159566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
2169566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayRead(yy, &ygpu));
2179566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayWrite(zz, &zgpu));
2189566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
219f38c1e66SStefano Zampini       if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
220f38c1e66SStefano Zampini       else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
221f38c1e66SStefano Zampini       if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec;
222f38c1e66SStefano Zampini       else *zgpu += *viennaclstruct->tempvec;
2234cf1874eSKarl Rupp       ViennaCLWaitForGPU();
2249566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
2259566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
2269566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayRead(yy, &ygpu));
2279566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayWrite(zz, &zgpu));
228e4a0ef16SKarl Rupp 
2299371c9d4SSatish Balay     } catch (std::exception const &ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); }
2309566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
231bf1781e8SStefano Zampini   } else {
2329566063dSJacob Faibussowitsch     PetscCall(VecCopy_SeqViennaCL(yy, zz));
23367c87b7fSKarl Rupp   }
234e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
235e4a0ef16SKarl Rupp }
236e4a0ef16SKarl Rupp 
2379371c9d4SSatish Balay PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A, MatAssemblyType mode) {
238e4a0ef16SKarl Rupp   PetscFunctionBegin;
2399566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
240489de41dSStefano Zampini   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
2419566063dSJacob Faibussowitsch   if (!A->boundtocpu) PetscCall(MatViennaCLCopyToGPU(A));
242e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
243e4a0ef16SKarl Rupp }
244e4a0ef16SKarl Rupp 
245e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
246cab5ea25SPierre Jolivet /*@C
247*11a5261eSBarry Smith    MatCreateSeqAIJViennaCL - Creates a sparse matrix in `MATSEQAIJVIENNACL` (compressed row) format
24819fddfadSKarl Rupp    (the default parallel PETSc format).  This matrix will ultimately be pushed down
249e4a0ef16SKarl Rupp    to GPUs and use the ViennaCL library for calculations. For good matrix
250e4a0ef16SKarl Rupp    assembly performance the user should preallocate the matrix storage by setting
251e4a0ef16SKarl Rupp    the parameter nz (or the array nnz).  By setting these parameters accurately,
252e4a0ef16SKarl Rupp    performance during matrix assembly can be increased substantially.
253e4a0ef16SKarl Rupp 
254d083f849SBarry Smith    Collective
255e4a0ef16SKarl Rupp 
256e4a0ef16SKarl Rupp    Input Parameters:
257*11a5261eSBarry Smith +  comm - MPI communicator, set to `PETSC_COMM_SELF`
258e4a0ef16SKarl Rupp .  m - number of rows
259e4a0ef16SKarl Rupp .  n - number of columns
260e4a0ef16SKarl Rupp .  nz - number of nonzeros per row (same for all rows)
261e4a0ef16SKarl Rupp -  nnz - array containing the number of nonzeros in the various rows
262e4a0ef16SKarl Rupp          (possibly different for each row) or NULL
263e4a0ef16SKarl Rupp 
264e4a0ef16SKarl Rupp    Output Parameter:
265e4a0ef16SKarl Rupp .  A - the matrix
266e4a0ef16SKarl Rupp 
267*11a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
268e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
269*11a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
270e4a0ef16SKarl Rupp 
271e4a0ef16SKarl Rupp    Notes:
272e4a0ef16SKarl Rupp    If nnz is given then nz is ignored
273e4a0ef16SKarl Rupp 
274*11a5261eSBarry Smith    The AIJ format, also called
275*11a5261eSBarry Smith    compressed row storage, is fully compatible with standard Fortran 77
276e4a0ef16SKarl Rupp    storage.  That is, the stored row and column indices can begin at
277e4a0ef16SKarl Rupp    either one (as in Fortran) or zero.  See the users' manual for details.
278e4a0ef16SKarl Rupp 
279e4a0ef16SKarl Rupp    Specify the preallocated storage with either nz or nnz (not both).
280*11a5261eSBarry Smith    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
281e4a0ef16SKarl Rupp    allocation.  For large problems you MUST preallocate memory or you
282e4a0ef16SKarl Rupp    will get TERRIBLE performance, see the users' manual chapter on matrices.
283e4a0ef16SKarl Rupp 
284e4a0ef16SKarl Rupp    Level: intermediate
285e4a0ef16SKarl Rupp 
286*11a5261eSBarry Smith .seealso: `MATSEQAIJVIENNACL`, `MatCreate()`, `MatCreateAIJ()`, `MatCreateAIJCUSPARSE()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
287e4a0ef16SKarl Rupp @*/
2889371c9d4SSatish Balay PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
289e4a0ef16SKarl Rupp   PetscFunctionBegin;
2909566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
2919566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
2929566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJVIENNACL));
2939566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
294e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
295e4a0ef16SKarl Rupp }
296e4a0ef16SKarl Rupp 
2979371c9d4SSatish Balay PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) {
298e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL *)A->spptr;
299e4a0ef16SKarl Rupp 
300e4a0ef16SKarl Rupp   PetscFunctionBegin;
301e4a0ef16SKarl Rupp   try {
3026447cd05SKarl Rupp     if (viennaclcontainer) {
3036447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3046447cd05SKarl Rupp       delete viennaclcontainer->mat;
3056447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
306e4a0ef16SKarl Rupp       delete viennaclcontainer;
3076447cd05SKarl Rupp     }
308c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3099371c9d4SSatish Balay   } catch (std::exception const &ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); }
3108713a8baSPatrick Sanan 
3119566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", NULL));
3129566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", NULL));
3139566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", NULL));
3148713a8baSPatrick Sanan 
315e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
316e4a0ef16SKarl Rupp   A->spptr = 0;
3179566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
318e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
319e4a0ef16SKarl Rupp }
320e4a0ef16SKarl Rupp 
3219371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) {
322e4a0ef16SKarl Rupp   PetscFunctionBegin;
3239566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
3249566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJViennaCL(B, MATSEQAIJVIENNACL, MAT_INPLACE_MATRIX, &B));
3258713a8baSPatrick Sanan   PetscFunctionReturn(0);
3268713a8baSPatrick Sanan }
3278713a8baSPatrick Sanan 
328b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat, PetscBool);
3299371c9d4SSatish Balay static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A, MatDuplicateOption cpvalues, Mat *B) {
330c3cca76eSKarl Rupp   Mat C;
331c3cca76eSKarl Rupp 
332c3cca76eSKarl Rupp   PetscFunctionBegin;
3339566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
334c3cca76eSKarl Rupp   C = *B;
335c3cca76eSKarl Rupp 
3369566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
33792f9df4aSRichard Tran Mills   C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
338c3cca76eSKarl Rupp 
339c3cca76eSKarl Rupp   C->spptr                                         = new Mat_SeqAIJViennaCL();
340c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->tempvec        = NULL;
341c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->mat            = NULL;
342c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->compressed_mat = NULL;
343c3cca76eSKarl Rupp 
3449566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)C, MATSEQAIJVIENNACL));
345c3cca76eSKarl Rupp 
346c70f7ee4SJunchao Zhang   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
347c3cca76eSKarl Rupp 
348c3cca76eSKarl Rupp   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
3499566063dSJacob Faibussowitsch   if (C->assembled) PetscCall(MatViennaCLCopyToGPU(C));
350c3cca76eSKarl Rupp   PetscFunctionReturn(0);
351c3cca76eSKarl Rupp }
352c3cca76eSKarl Rupp 
3539371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A, PetscScalar *array[]) {
354f38c1e66SStefano Zampini   PetscFunctionBegin;
3559566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
356c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
357f38c1e66SStefano Zampini   PetscFunctionReturn(0);
358f38c1e66SStefano Zampini }
359f38c1e66SStefano Zampini 
3609371c9d4SSatish Balay static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A, PetscScalar *array[]) {
361f38c1e66SStefano Zampini   PetscFunctionBegin;
362c1a4f3baSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
363c1a4f3baSJunchao Zhang   *array         = NULL;
364c1a4f3baSJunchao Zhang   PetscFunctionReturn(0);
365c1a4f3baSJunchao Zhang }
366c1a4f3baSJunchao Zhang 
3679371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[]) {
368c1a4f3baSJunchao Zhang   PetscFunctionBegin;
3699566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
370c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
371c1a4f3baSJunchao Zhang   PetscFunctionReturn(0);
372c1a4f3baSJunchao Zhang }
373c1a4f3baSJunchao Zhang 
3749371c9d4SSatish Balay static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[]) {
375c1a4f3baSJunchao Zhang   PetscFunctionBegin;
376c1a4f3baSJunchao Zhang   *array = NULL;
377c1a4f3baSJunchao Zhang   /* No A->offloadmask = PETSC_OFFLOAD_CPU since if A->offloadmask was PETSC_OFFLOAD_BOTH, it is still BOTH */
378c1a4f3baSJunchao Zhang   PetscFunctionReturn(0);
379c1a4f3baSJunchao Zhang }
380c1a4f3baSJunchao Zhang 
3819371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[]) {
382c1a4f3baSJunchao Zhang   PetscFunctionBegin;
383c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
384c1a4f3baSJunchao Zhang   PetscFunctionReturn(0);
385c1a4f3baSJunchao Zhang }
386c1a4f3baSJunchao Zhang 
3879371c9d4SSatish Balay static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[]) {
388c1a4f3baSJunchao Zhang   PetscFunctionBegin;
389c1a4f3baSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
390f38c1e66SStefano Zampini   *array         = NULL;
391f38c1e66SStefano Zampini   PetscFunctionReturn(0);
392f38c1e66SStefano Zampini }
393f38c1e66SStefano Zampini 
3949371c9d4SSatish Balay static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A, PetscBool flg) {
395c1a4f3baSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
396f38c1e66SStefano Zampini 
397e7e92044SBarry Smith   PetscFunctionBegin;
398b470e4b4SRichard Tran Mills   A->boundtocpu = flg;
399c1a4f3baSJunchao Zhang   if (flg && a->inode.size) {
400c1a4f3baSJunchao Zhang     a->inode.use = PETSC_TRUE;
401ea500dcfSRichard Tran Mills   } else {
402c1a4f3baSJunchao Zhang     a->inode.use = PETSC_FALSE;
403ea500dcfSRichard Tran Mills   }
404e7e92044SBarry Smith   if (flg) {
405f38c1e66SStefano Zampini     /* make sure we have an up-to-date copy on the CPU */
4069566063dSJacob Faibussowitsch     PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
407e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJ;
408e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJ;
409e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
410e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJ;
4119566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
412e7e92044SBarry Smith   } else {
413e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJViennaCL;
414e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
415e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
416e7e92044SBarry Smith     A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
417e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
418c1a4f3baSJunchao Zhang 
419c1a4f3baSJunchao Zhang     a->ops->getarray          = MatSeqAIJGetArray_SeqAIJViennaCL;
420c1a4f3baSJunchao Zhang     a->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJViennaCL;
421c1a4f3baSJunchao Zhang     a->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJViennaCL;
422c1a4f3baSJunchao Zhang     a->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJViennaCL;
423c1a4f3baSJunchao Zhang     a->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJViennaCL;
424c1a4f3baSJunchao Zhang     a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL;
425e7e92044SBarry Smith   }
426e7e92044SBarry Smith   PetscFunctionReturn(0);
427e7e92044SBarry Smith }
428e7e92044SBarry Smith 
4299371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat) {
4308713a8baSPatrick Sanan   Mat B;
4318713a8baSPatrick Sanan 
4328713a8baSPatrick Sanan   PetscFunctionBegin;
4335f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_REUSE_MATRIX, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
4348713a8baSPatrick Sanan 
4359566063dSJacob Faibussowitsch   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4368713a8baSPatrick Sanan 
4378713a8baSPatrick Sanan   B = *newmat;
4388713a8baSPatrick Sanan 
439e4a0ef16SKarl Rupp   B->spptr = new Mat_SeqAIJViennaCL();
440e4a0ef16SKarl Rupp 
441a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->tempvec        = NULL;
442a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->mat            = NULL;
443a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->compressed_mat = NULL;
444e4a0ef16SKarl Rupp 
4459566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
44692f9df4aSRichard Tran Mills   A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
447e4a0ef16SKarl Rupp 
4489566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJVIENNACL));
4499566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
4509566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECVIENNACL, &B->defaultvectype));
451e4a0ef16SKarl Rupp 
4529566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", MatConvert_SeqAIJ_SeqAIJViennaCL));
4539566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", MatProductSetFromOptions_SeqAIJ_SeqDense));
4549566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", MatProductSetFromOptions_SeqAIJ));
4558713a8baSPatrick Sanan 
456c70f7ee4SJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4578713a8baSPatrick Sanan 
4588713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
4599566063dSJacob Faibussowitsch   if (B->assembled) PetscCall(MatViennaCLCopyToGPU(B));
460e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
461e4a0ef16SKarl Rupp }
462e4a0ef16SKarl Rupp 
4633ca39a21SBarry Smith /*MC
464e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
465e4a0ef16SKarl Rupp 
466e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
467e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
468e4a0ef16SKarl Rupp 
469e4a0ef16SKarl Rupp    Options Database Keys:
470*11a5261eSBarry Smith +  -mat_type aijviennacl - sets the matrix type to `MATSEQAIJVIENNACL` during a call to `MatSetFromOptions()
471*11a5261eSBarry Smith .  -mat_viennacl_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
472*11a5261eSBarry Smith -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
473e4a0ef16SKarl Rupp 
474e4a0ef16SKarl Rupp   Level: beginner
475e4a0ef16SKarl Rupp 
476db781477SPatrick Sanan .seealso: `MatCreateSeqAIJViennaCL()`, `MATAIJVIENNACL`, `MatCreateAIJViennaCL()`
477e4a0ef16SKarl Rupp M*/
478e4a0ef16SKarl Rupp 
4799371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void) {
48072367587SKarl Rupp   PetscFunctionBegin;
4819566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU, MatGetFactor_seqaij_petsc));
4829566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaij_petsc));
4839566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU, MatGetFactor_seqaij_petsc));
4849566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC, MatGetFactor_seqaij_petsc));
48572367587SKarl Rupp   PetscFunctionReturn(0);
48672367587SKarl Rupp }
487