xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision d71ae5a4db6382e7f06317b8d368875286fe9008)
1e4a0ef16SKarl Rupp 
2e4a0ef16SKarl Rupp /*
3e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
4e4a0ef16SKarl Rupp   matrix storage format.
5e4a0ef16SKarl Rupp */
6e4a0ef16SKarl Rupp 
7aaa7dc30SBarry Smith #include <petscconf.h>
899acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10aaa7dc30SBarry Smith #include <petscbt.h>
11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
13e4a0ef16SKarl Rupp 
14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15e4a0ef16SKarl Rupp 
16e4a0ef16SKarl Rupp #include <algorithm>
17e4a0ef16SKarl Rupp #include <vector>
18e4a0ef16SKarl Rupp #include <string>
19e4a0ef16SKarl Rupp 
20e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
21e4a0ef16SKarl Rupp 
228713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
2372367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat, MatFactorType, Mat *);
244222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
258713a8baSPatrick Sanan 
26*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatViennaCLCopyToGPU(Mat A)
27*d71ae5a4SJacob Faibussowitsch {
28e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
29e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
30e4a0ef16SKarl Rupp 
31e4a0ef16SKarl Rupp   PetscFunctionBegin;
32bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
33c70f7ee4SJunchao Zhang     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
349566063dSJacob Faibussowitsch       PetscCall(PetscLogEventBegin(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
35e4a0ef16SKarl Rupp 
36e4a0ef16SKarl Rupp       try {
37e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
38a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
39e4a0ef16SKarl Rupp 
40a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
41a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
42e4a0ef16SKarl Rupp 
439371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
449371c9d4SSatish Balay           row_buffer.raw_resize(dummy, a->compressedrow.nrows + 1);
459371c9d4SSatish Balay           for (PetscInt i = 0; i <= a->compressedrow.nrows; ++i) row_buffer.set(i, (a->compressedrow.i)[i]);
46e4a0ef16SKarl Rupp 
479371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_indices;
489371c9d4SSatish Balay           row_indices.raw_resize(dummy, a->compressedrow.nrows);
499371c9d4SSatish Balay           for (PetscInt i = 0; i < a->compressedrow.nrows; ++i) row_indices.set(i, (a->compressedrow.rindex)[i]);
50a3430c56SKarl Rupp 
519371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
529371c9d4SSatish Balay           col_buffer.raw_resize(dummy, a->nz);
539371c9d4SSatish Balay           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
54a3430c56SKarl Rupp 
55a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
569566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu(((2 * a->compressedrow.nrows) + 1 + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
57e4a0ef16SKarl Rupp         } else {
58a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
59e4a0ef16SKarl Rupp 
60e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
61e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
62e4a0ef16SKarl Rupp 
639371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
649371c9d4SSatish Balay           row_buffer.raw_resize(dummy, A->rmap->n + 1);
659371c9d4SSatish Balay           for (PetscInt i = 0; i <= A->rmap->n; ++i) row_buffer.set(i, (a->i)[i]);
66e4a0ef16SKarl Rupp 
679371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
689371c9d4SSatish Balay           col_buffer.raw_resize(dummy, a->nz);
699371c9d4SSatish Balay           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
70e4a0ef16SKarl Rupp 
71e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
729566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu(((A->rmap->n + 1) + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
73e4a0ef16SKarl Rupp         }
744cf1874eSKarl Rupp         ViennaCLWaitForGPU();
75*d71ae5a4SJacob Faibussowitsch       } catch (std::exception const &ex) {
76*d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
77*d71ae5a4SJacob Faibussowitsch       }
78e4a0ef16SKarl Rupp 
79a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
80a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
819b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
82a3430c56SKarl Rupp           delete (ViennaCLVector *)viennaclstruct->tempvec;
839b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
84a3430c56SKarl Rupp         } else {
85a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
86a3430c56SKarl Rupp         }
87a3430c56SKarl Rupp       } else {
889b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
89a3430c56SKarl Rupp       }
90a3430c56SKarl Rupp 
91c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
92e4a0ef16SKarl Rupp 
939566063dSJacob Faibussowitsch       PetscCall(PetscLogEventEnd(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
94e4a0ef16SKarl Rupp     }
9567c87b7fSKarl Rupp   }
96e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
97e4a0ef16SKarl Rupp }
98e4a0ef16SKarl Rupp 
99*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
100*d71ae5a4SJacob Faibussowitsch {
101f38c1e66SStefano Zampini   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
102e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
103e4a0ef16SKarl Rupp   PetscInt            m              = A->rmap->n;
104e4a0ef16SKarl Rupp 
105e4a0ef16SKarl Rupp   PetscFunctionBegin;
106c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(0);
107c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) {
108e4a0ef16SKarl Rupp     try {
1092c71b3e2SJacob Faibussowitsch       PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1102c71b3e2SJacob Faibussowitsch       PetscCheck((PetscInt)Agpu->size1() == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "GPU matrix has %lu rows, should be %" PetscInt_FMT, Agpu->size1(), m);
111e4a0ef16SKarl Rupp       a->nz           = Agpu->nnz();
112e4a0ef16SKarl Rupp       a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
113e4a0ef16SKarl Rupp       A->preallocated = PETSC_TRUE;
114e4a0ef16SKarl Rupp       if (a->singlemalloc) {
1159566063dSJacob Faibussowitsch         if (a->a) PetscCall(PetscFree3(a->a, a->j, a->i));
116e4a0ef16SKarl Rupp       } else {
1179566063dSJacob Faibussowitsch         if (a->i) PetscCall(PetscFree(a->i));
1189566063dSJacob Faibussowitsch         if (a->j) PetscCall(PetscFree(a->j));
1199566063dSJacob Faibussowitsch         if (a->a) PetscCall(PetscFree(a->a));
120e4a0ef16SKarl Rupp       }
1219566063dSJacob Faibussowitsch       PetscCall(PetscMalloc3(a->nz, &a->a, a->nz, &a->j, m + 1, &a->i));
122e4a0ef16SKarl Rupp 
123e4a0ef16SKarl Rupp       a->singlemalloc = PETSC_TRUE;
124e4a0ef16SKarl Rupp 
125e4a0ef16SKarl Rupp       /* Setup row lengths */
1269566063dSJacob Faibussowitsch       PetscCall(PetscFree(a->imax));
1279566063dSJacob Faibussowitsch       PetscCall(PetscFree(a->ilen));
1289566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m, &a->imax));
1299566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m, &a->ilen));
130e4a0ef16SKarl Rupp 
131e4a0ef16SKarl Rupp       /* Copy data back from GPU */
1329371c9d4SSatish Balay       viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
1339371c9d4SSatish Balay       row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
134e4a0ef16SKarl Rupp 
135e4a0ef16SKarl Rupp       // copy row array
136e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
137e4a0ef16SKarl Rupp       (a->i)[0] = row_buffer[0];
138e4a0ef16SKarl Rupp       for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
139e4a0ef16SKarl Rupp         (a->i)[i + 1] = row_buffer[i + 1];
140e4a0ef16SKarl Rupp         a->imax[i] = a->ilen[i] = a->i[i + 1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
141e4a0ef16SKarl Rupp       }
142e4a0ef16SKarl Rupp 
143e4a0ef16SKarl Rupp       // copy column indices
1449371c9d4SSatish Balay       viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
1459371c9d4SSatish Balay       col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
146e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
1479371c9d4SSatish Balay       for (PetscInt i = 0; i < (PetscInt)Agpu->nnz(); ++i) (a->j)[i] = col_buffer[i];
148e4a0ef16SKarl Rupp 
149e4a0ef16SKarl Rupp       // copy nonzero entries directly to destination (no conversion required)
150e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
151e4a0ef16SKarl Rupp 
1529566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuToCpu(row_buffer.raw_size() + col_buffer.raw_size() + (Agpu->nnz() * sizeof(PetscScalar))));
1534cf1874eSKarl Rupp       ViennaCLWaitForGPU();
154023073b3SKarl Rupp       /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
155*d71ae5a4SJacob Faibussowitsch     } catch (std::exception const &ex) {
156*d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
157*d71ae5a4SJacob Faibussowitsch     }
158c70f7ee4SJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
159f38c1e66SStefano Zampini     PetscFunctionReturn(0);
160f38c1e66SStefano Zampini   } else {
161c70f7ee4SJunchao Zhang     if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
162e4a0ef16SKarl Rupp 
1632c71b3e2SJacob Faibussowitsch     PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
164f38c1e66SStefano Zampini     if (!Agpu) {
165f38c1e66SStefano Zampini       viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar) * viennaclstruct->mat->nnz(), a->a);
166f38c1e66SStefano Zampini     } else {
167f38c1e66SStefano Zampini       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
168f38c1e66SStefano Zampini     }
169f38c1e66SStefano Zampini   }
170c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
171b8ced49eSKarl Rupp   /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
1729566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1739566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
174e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
175e4a0ef16SKarl Rupp }
176e4a0ef16SKarl Rupp 
177*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJViennaCL(Mat A, Vec xx, Vec yy)
178*d71ae5a4SJacob Faibussowitsch {
179e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
180e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
1810d73d530SKarl Rupp   const ViennaCLVector *xgpu           = NULL;
1820d73d530SKarl Rupp   ViennaCLVector       *ygpu           = NULL;
183e4a0ef16SKarl Rupp 
184e4a0ef16SKarl Rupp   PetscFunctionBegin;
185837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1869566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyToGPU(A));
187bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
1889566063dSJacob Faibussowitsch     PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
1899566063dSJacob Faibussowitsch     PetscCall(VecViennaCLGetArrayWrite(yy, &ygpu));
1909566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
191e4a0ef16SKarl Rupp     try {
192b7832b47SStefano Zampini       if (a->compressedrow.use) {
193b7832b47SStefano Zampini         *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
194b7832b47SStefano Zampini       } else {
195e4a0ef16SKarl Rupp         *ygpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
196b7832b47SStefano Zampini       }
1974cf1874eSKarl Rupp       ViennaCLWaitForGPU();
198*d71ae5a4SJacob Faibussowitsch     } catch (std::exception const &ex) {
199*d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
200*d71ae5a4SJacob Faibussowitsch     }
2019566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
2029566063dSJacob Faibussowitsch     PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
2039566063dSJacob Faibussowitsch     PetscCall(VecViennaCLRestoreArrayWrite(yy, &ygpu));
2049566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
205bf1781e8SStefano Zampini   } else {
2069566063dSJacob Faibussowitsch     PetscCall(VecSet_SeqViennaCL(yy, 0));
20767c87b7fSKarl Rupp   }
208e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
209e4a0ef16SKarl Rupp }
210e4a0ef16SKarl Rupp 
211*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A, Vec xx, Vec yy, Vec zz)
212*d71ae5a4SJacob Faibussowitsch {
213e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
214e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
2150d73d530SKarl Rupp   const ViennaCLVector *xgpu = NULL, *ygpu = NULL;
2160d73d530SKarl Rupp   ViennaCLVector       *zgpu = NULL;
217e4a0ef16SKarl Rupp 
218e4a0ef16SKarl Rupp   PetscFunctionBegin;
219837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2209566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyToGPU(A));
221bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
222e4a0ef16SKarl Rupp     try {
2239566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
2249566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayRead(yy, &ygpu));
2259566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayWrite(zz, &zgpu));
2269566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
227f38c1e66SStefano Zampini       if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
228f38c1e66SStefano Zampini       else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
229f38c1e66SStefano Zampini       if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec;
230f38c1e66SStefano Zampini       else *zgpu += *viennaclstruct->tempvec;
2314cf1874eSKarl Rupp       ViennaCLWaitForGPU();
2329566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
2339566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
2349566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayRead(yy, &ygpu));
2359566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayWrite(zz, &zgpu));
236e4a0ef16SKarl Rupp 
237*d71ae5a4SJacob Faibussowitsch     } catch (std::exception const &ex) {
238*d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
239*d71ae5a4SJacob Faibussowitsch     }
2409566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
241bf1781e8SStefano Zampini   } else {
2429566063dSJacob Faibussowitsch     PetscCall(VecCopy_SeqViennaCL(yy, zz));
24367c87b7fSKarl Rupp   }
244e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
245e4a0ef16SKarl Rupp }
246e4a0ef16SKarl Rupp 
247*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A, MatAssemblyType mode)
248*d71ae5a4SJacob Faibussowitsch {
249e4a0ef16SKarl Rupp   PetscFunctionBegin;
2509566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
251489de41dSStefano Zampini   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
2529566063dSJacob Faibussowitsch   if (!A->boundtocpu) PetscCall(MatViennaCLCopyToGPU(A));
253e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
254e4a0ef16SKarl Rupp }
255e4a0ef16SKarl Rupp 
256e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
257cab5ea25SPierre Jolivet /*@C
25811a5261eSBarry Smith    MatCreateSeqAIJViennaCL - Creates a sparse matrix in `MATSEQAIJVIENNACL` (compressed row) format
25919fddfadSKarl Rupp    (the default parallel PETSc format).  This matrix will ultimately be pushed down
260e4a0ef16SKarl Rupp    to GPUs and use the ViennaCL library for calculations. For good matrix
261e4a0ef16SKarl Rupp    assembly performance the user should preallocate the matrix storage by setting
262e4a0ef16SKarl Rupp    the parameter nz (or the array nnz).  By setting these parameters accurately,
263e4a0ef16SKarl Rupp    performance during matrix assembly can be increased substantially.
264e4a0ef16SKarl Rupp 
265d083f849SBarry Smith    Collective
266e4a0ef16SKarl Rupp 
267e4a0ef16SKarl Rupp    Input Parameters:
26811a5261eSBarry Smith +  comm - MPI communicator, set to `PETSC_COMM_SELF`
269e4a0ef16SKarl Rupp .  m - number of rows
270e4a0ef16SKarl Rupp .  n - number of columns
271e4a0ef16SKarl Rupp .  nz - number of nonzeros per row (same for all rows)
272e4a0ef16SKarl Rupp -  nnz - array containing the number of nonzeros in the various rows
273e4a0ef16SKarl Rupp          (possibly different for each row) or NULL
274e4a0ef16SKarl Rupp 
275e4a0ef16SKarl Rupp    Output Parameter:
276e4a0ef16SKarl Rupp .  A - the matrix
277e4a0ef16SKarl Rupp 
27811a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
279e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
28011a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
281e4a0ef16SKarl Rupp 
282e4a0ef16SKarl Rupp    Notes:
283e4a0ef16SKarl Rupp    If nnz is given then nz is ignored
284e4a0ef16SKarl Rupp 
28511a5261eSBarry Smith    The AIJ format, also called
28611a5261eSBarry Smith    compressed row storage, is fully compatible with standard Fortran 77
287e4a0ef16SKarl Rupp    storage.  That is, the stored row and column indices can begin at
288e4a0ef16SKarl Rupp    either one (as in Fortran) or zero.  See the users' manual for details.
289e4a0ef16SKarl Rupp 
290e4a0ef16SKarl Rupp    Specify the preallocated storage with either nz or nnz (not both).
29111a5261eSBarry Smith    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
292e4a0ef16SKarl Rupp    allocation.  For large problems you MUST preallocate memory or you
293e4a0ef16SKarl Rupp    will get TERRIBLE performance, see the users' manual chapter on matrices.
294e4a0ef16SKarl Rupp 
295e4a0ef16SKarl Rupp    Level: intermediate
296e4a0ef16SKarl Rupp 
29711a5261eSBarry Smith .seealso: `MATSEQAIJVIENNACL`, `MatCreate()`, `MatCreateAIJ()`, `MatCreateAIJCUSPARSE()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
298e4a0ef16SKarl Rupp @*/
299*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
300*d71ae5a4SJacob Faibussowitsch {
301e4a0ef16SKarl Rupp   PetscFunctionBegin;
3029566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
3039566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
3049566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJVIENNACL));
3059566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
306e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
307e4a0ef16SKarl Rupp }
308e4a0ef16SKarl Rupp 
309*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
310*d71ae5a4SJacob Faibussowitsch {
311e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL *)A->spptr;
312e4a0ef16SKarl Rupp 
313e4a0ef16SKarl Rupp   PetscFunctionBegin;
314e4a0ef16SKarl Rupp   try {
3156447cd05SKarl Rupp     if (viennaclcontainer) {
3166447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3176447cd05SKarl Rupp       delete viennaclcontainer->mat;
3186447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
319e4a0ef16SKarl Rupp       delete viennaclcontainer;
3206447cd05SKarl Rupp     }
321c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
322*d71ae5a4SJacob Faibussowitsch   } catch (std::exception const &ex) {
323*d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
324*d71ae5a4SJacob Faibussowitsch   }
3258713a8baSPatrick Sanan 
3269566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", NULL));
3279566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", NULL));
3289566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", NULL));
3298713a8baSPatrick Sanan 
330e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
331e4a0ef16SKarl Rupp   A->spptr = 0;
3329566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
333e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
334e4a0ef16SKarl Rupp }
335e4a0ef16SKarl Rupp 
336*d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
337*d71ae5a4SJacob Faibussowitsch {
338e4a0ef16SKarl Rupp   PetscFunctionBegin;
3399566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
3409566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJViennaCL(B, MATSEQAIJVIENNACL, MAT_INPLACE_MATRIX, &B));
3418713a8baSPatrick Sanan   PetscFunctionReturn(0);
3428713a8baSPatrick Sanan }
3438713a8baSPatrick Sanan 
344b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat, PetscBool);
345*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A, MatDuplicateOption cpvalues, Mat *B)
346*d71ae5a4SJacob Faibussowitsch {
347c3cca76eSKarl Rupp   Mat C;
348c3cca76eSKarl Rupp 
349c3cca76eSKarl Rupp   PetscFunctionBegin;
3509566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
351c3cca76eSKarl Rupp   C = *B;
352c3cca76eSKarl Rupp 
3539566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
35492f9df4aSRichard Tran Mills   C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
355c3cca76eSKarl Rupp 
356c3cca76eSKarl Rupp   C->spptr                                         = new Mat_SeqAIJViennaCL();
357c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->tempvec        = NULL;
358c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->mat            = NULL;
359c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->compressed_mat = NULL;
360c3cca76eSKarl Rupp 
3619566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)C, MATSEQAIJVIENNACL));
362c3cca76eSKarl Rupp 
363c70f7ee4SJunchao Zhang   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
364c3cca76eSKarl Rupp 
365c3cca76eSKarl Rupp   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
3669566063dSJacob Faibussowitsch   if (C->assembled) PetscCall(MatViennaCLCopyToGPU(C));
367c3cca76eSKarl Rupp   PetscFunctionReturn(0);
368c3cca76eSKarl Rupp }
369c3cca76eSKarl Rupp 
370*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A, PetscScalar *array[])
371*d71ae5a4SJacob Faibussowitsch {
372f38c1e66SStefano Zampini   PetscFunctionBegin;
3739566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
374c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
375f38c1e66SStefano Zampini   PetscFunctionReturn(0);
376f38c1e66SStefano Zampini }
377f38c1e66SStefano Zampini 
378*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A, PetscScalar *array[])
379*d71ae5a4SJacob Faibussowitsch {
380f38c1e66SStefano Zampini   PetscFunctionBegin;
381c1a4f3baSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
382c1a4f3baSJunchao Zhang   *array         = NULL;
383c1a4f3baSJunchao Zhang   PetscFunctionReturn(0);
384c1a4f3baSJunchao Zhang }
385c1a4f3baSJunchao Zhang 
386*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[])
387*d71ae5a4SJacob Faibussowitsch {
388c1a4f3baSJunchao Zhang   PetscFunctionBegin;
3899566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
390c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
391c1a4f3baSJunchao Zhang   PetscFunctionReturn(0);
392c1a4f3baSJunchao Zhang }
393c1a4f3baSJunchao Zhang 
394*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[])
395*d71ae5a4SJacob Faibussowitsch {
396c1a4f3baSJunchao Zhang   PetscFunctionBegin;
397c1a4f3baSJunchao Zhang   *array = NULL;
398c1a4f3baSJunchao Zhang   /* No A->offloadmask = PETSC_OFFLOAD_CPU since if A->offloadmask was PETSC_OFFLOAD_BOTH, it is still BOTH */
399c1a4f3baSJunchao Zhang   PetscFunctionReturn(0);
400c1a4f3baSJunchao Zhang }
401c1a4f3baSJunchao Zhang 
402*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[])
403*d71ae5a4SJacob Faibussowitsch {
404c1a4f3baSJunchao Zhang   PetscFunctionBegin;
405c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
406c1a4f3baSJunchao Zhang   PetscFunctionReturn(0);
407c1a4f3baSJunchao Zhang }
408c1a4f3baSJunchao Zhang 
409*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[])
410*d71ae5a4SJacob Faibussowitsch {
411c1a4f3baSJunchao Zhang   PetscFunctionBegin;
412c1a4f3baSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
413f38c1e66SStefano Zampini   *array         = NULL;
414f38c1e66SStefano Zampini   PetscFunctionReturn(0);
415f38c1e66SStefano Zampini }
416f38c1e66SStefano Zampini 
417*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A, PetscBool flg)
418*d71ae5a4SJacob Faibussowitsch {
419c1a4f3baSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
420f38c1e66SStefano Zampini 
421e7e92044SBarry Smith   PetscFunctionBegin;
422b470e4b4SRichard Tran Mills   A->boundtocpu = flg;
423c1a4f3baSJunchao Zhang   if (flg && a->inode.size) {
424c1a4f3baSJunchao Zhang     a->inode.use = PETSC_TRUE;
425ea500dcfSRichard Tran Mills   } else {
426c1a4f3baSJunchao Zhang     a->inode.use = PETSC_FALSE;
427ea500dcfSRichard Tran Mills   }
428e7e92044SBarry Smith   if (flg) {
429f38c1e66SStefano Zampini     /* make sure we have an up-to-date copy on the CPU */
4309566063dSJacob Faibussowitsch     PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
431e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJ;
432e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJ;
433e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
434e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJ;
4359566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
436e7e92044SBarry Smith   } else {
437e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJViennaCL;
438e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
439e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
440e7e92044SBarry Smith     A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
441e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
442c1a4f3baSJunchao Zhang 
443c1a4f3baSJunchao Zhang     a->ops->getarray          = MatSeqAIJGetArray_SeqAIJViennaCL;
444c1a4f3baSJunchao Zhang     a->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJViennaCL;
445c1a4f3baSJunchao Zhang     a->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJViennaCL;
446c1a4f3baSJunchao Zhang     a->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJViennaCL;
447c1a4f3baSJunchao Zhang     a->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJViennaCL;
448c1a4f3baSJunchao Zhang     a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL;
449e7e92044SBarry Smith   }
450e7e92044SBarry Smith   PetscFunctionReturn(0);
451e7e92044SBarry Smith }
452e7e92044SBarry Smith 
453*d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat)
454*d71ae5a4SJacob Faibussowitsch {
4558713a8baSPatrick Sanan   Mat B;
4568713a8baSPatrick Sanan 
4578713a8baSPatrick Sanan   PetscFunctionBegin;
4585f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_REUSE_MATRIX, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
4598713a8baSPatrick Sanan 
4609566063dSJacob Faibussowitsch   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4618713a8baSPatrick Sanan 
4628713a8baSPatrick Sanan   B = *newmat;
4638713a8baSPatrick Sanan 
464e4a0ef16SKarl Rupp   B->spptr = new Mat_SeqAIJViennaCL();
465e4a0ef16SKarl Rupp 
466a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->tempvec        = NULL;
467a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->mat            = NULL;
468a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->compressed_mat = NULL;
469e4a0ef16SKarl Rupp 
4709566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
47192f9df4aSRichard Tran Mills   A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
472e4a0ef16SKarl Rupp 
4739566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJVIENNACL));
4749566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
4759566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECVIENNACL, &B->defaultvectype));
476e4a0ef16SKarl Rupp 
4779566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", MatConvert_SeqAIJ_SeqAIJViennaCL));
4789566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", MatProductSetFromOptions_SeqAIJ_SeqDense));
4799566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", MatProductSetFromOptions_SeqAIJ));
4808713a8baSPatrick Sanan 
481c70f7ee4SJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4828713a8baSPatrick Sanan 
4838713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
4849566063dSJacob Faibussowitsch   if (B->assembled) PetscCall(MatViennaCLCopyToGPU(B));
485e4a0ef16SKarl Rupp   PetscFunctionReturn(0);
486e4a0ef16SKarl Rupp }
487e4a0ef16SKarl Rupp 
4883ca39a21SBarry Smith /*MC
489e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
490e4a0ef16SKarl Rupp 
491e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
492e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
493e4a0ef16SKarl Rupp 
494e4a0ef16SKarl Rupp    Options Database Keys:
49511a5261eSBarry Smith +  -mat_type aijviennacl - sets the matrix type to `MATSEQAIJVIENNACL` during a call to `MatSetFromOptions()
49611a5261eSBarry Smith .  -mat_viennacl_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
49711a5261eSBarry Smith -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
498e4a0ef16SKarl Rupp 
499e4a0ef16SKarl Rupp   Level: beginner
500e4a0ef16SKarl Rupp 
501db781477SPatrick Sanan .seealso: `MatCreateSeqAIJViennaCL()`, `MATAIJVIENNACL`, `MatCreateAIJViennaCL()`
502e4a0ef16SKarl Rupp M*/
503e4a0ef16SKarl Rupp 
504*d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
505*d71ae5a4SJacob Faibussowitsch {
50672367587SKarl Rupp   PetscFunctionBegin;
5079566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU, MatGetFactor_seqaij_petsc));
5089566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaij_petsc));
5099566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU, MatGetFactor_seqaij_petsc));
5109566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC, MatGetFactor_seqaij_petsc));
51172367587SKarl Rupp   PetscFunctionReturn(0);
51272367587SKarl Rupp }
513