xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision fe59aa6d68c880d4014a5813129926ee5b21e858)
1e4a0ef16SKarl Rupp 
2e4a0ef16SKarl Rupp /*
3e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
4e4a0ef16SKarl Rupp   matrix storage format.
5e4a0ef16SKarl Rupp */
6e4a0ef16SKarl Rupp 
7aaa7dc30SBarry Smith #include <petscconf.h>
899acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10aaa7dc30SBarry Smith #include <petscbt.h>
11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
13e4a0ef16SKarl Rupp 
14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15e4a0ef16SKarl Rupp 
16e4a0ef16SKarl Rupp #include <algorithm>
17e4a0ef16SKarl Rupp #include <vector>
18e4a0ef16SKarl Rupp #include <string>
19e4a0ef16SKarl Rupp 
20e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
21e4a0ef16SKarl Rupp 
228713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
2372367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat, MatFactorType, Mat *);
244222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
258713a8baSPatrick Sanan 
26d71ae5a4SJacob Faibussowitsch PetscErrorCode MatViennaCLCopyToGPU(Mat A)
27d71ae5a4SJacob Faibussowitsch {
28e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
29e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
30e4a0ef16SKarl Rupp 
31e4a0ef16SKarl Rupp   PetscFunctionBegin;
32bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
33c70f7ee4SJunchao Zhang     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
349566063dSJacob Faibussowitsch       PetscCall(PetscLogEventBegin(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
35e4a0ef16SKarl Rupp 
36e4a0ef16SKarl Rupp       try {
37e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
38a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
39e4a0ef16SKarl Rupp 
40a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
41a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
42e4a0ef16SKarl Rupp 
439371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
449371c9d4SSatish Balay           row_buffer.raw_resize(dummy, a->compressedrow.nrows + 1);
459371c9d4SSatish Balay           for (PetscInt i = 0; i <= a->compressedrow.nrows; ++i) row_buffer.set(i, (a->compressedrow.i)[i]);
46e4a0ef16SKarl Rupp 
479371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_indices;
489371c9d4SSatish Balay           row_indices.raw_resize(dummy, a->compressedrow.nrows);
499371c9d4SSatish Balay           for (PetscInt i = 0; i < a->compressedrow.nrows; ++i) row_indices.set(i, (a->compressedrow.rindex)[i]);
50a3430c56SKarl Rupp 
519371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
529371c9d4SSatish Balay           col_buffer.raw_resize(dummy, a->nz);
539371c9d4SSatish Balay           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
54a3430c56SKarl Rupp 
55a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
569566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu(((2 * a->compressedrow.nrows) + 1 + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
57e4a0ef16SKarl Rupp         } else {
58a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
59e4a0ef16SKarl Rupp 
60e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
61e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
62e4a0ef16SKarl Rupp 
639371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
649371c9d4SSatish Balay           row_buffer.raw_resize(dummy, A->rmap->n + 1);
659371c9d4SSatish Balay           for (PetscInt i = 0; i <= A->rmap->n; ++i) row_buffer.set(i, (a->i)[i]);
66e4a0ef16SKarl Rupp 
679371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
689371c9d4SSatish Balay           col_buffer.raw_resize(dummy, a->nz);
699371c9d4SSatish Balay           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
70e4a0ef16SKarl Rupp 
71e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
729566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu(((A->rmap->n + 1) + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
73e4a0ef16SKarl Rupp         }
744cf1874eSKarl Rupp         ViennaCLWaitForGPU();
75d71ae5a4SJacob Faibussowitsch       } catch (std::exception const &ex) {
76d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
77d71ae5a4SJacob Faibussowitsch       }
78e4a0ef16SKarl Rupp 
79a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
80a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
819b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
82a3430c56SKarl Rupp           delete (ViennaCLVector *)viennaclstruct->tempvec;
839b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
84a3430c56SKarl Rupp         } else {
85a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
86a3430c56SKarl Rupp         }
87a3430c56SKarl Rupp       } else {
889b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
89a3430c56SKarl Rupp       }
90a3430c56SKarl Rupp 
91c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
92e4a0ef16SKarl Rupp 
939566063dSJacob Faibussowitsch       PetscCall(PetscLogEventEnd(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
94e4a0ef16SKarl Rupp     }
9567c87b7fSKarl Rupp   }
963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
97e4a0ef16SKarl Rupp }
98e4a0ef16SKarl Rupp 
99d71ae5a4SJacob Faibussowitsch PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
100d71ae5a4SJacob Faibussowitsch {
101f38c1e66SStefano Zampini   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
102e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
103e4a0ef16SKarl Rupp   PetscInt            m              = A->rmap->n;
104e4a0ef16SKarl Rupp 
105e4a0ef16SKarl Rupp   PetscFunctionBegin;
1063ba16761SJacob Faibussowitsch   if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(PETSC_SUCCESS);
107c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) {
108e4a0ef16SKarl Rupp     try {
1092c71b3e2SJacob Faibussowitsch       PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1102c71b3e2SJacob Faibussowitsch       PetscCheck((PetscInt)Agpu->size1() == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "GPU matrix has %lu rows, should be %" PetscInt_FMT, Agpu->size1(), m);
111e4a0ef16SKarl Rupp       a->nz           = Agpu->nnz();
112e4a0ef16SKarl Rupp       a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
113e4a0ef16SKarl Rupp       A->preallocated = PETSC_TRUE;
114e4a0ef16SKarl Rupp       if (a->singlemalloc) {
1159566063dSJacob Faibussowitsch         if (a->a) PetscCall(PetscFree3(a->a, a->j, a->i));
116e4a0ef16SKarl Rupp       } else {
1179566063dSJacob Faibussowitsch         if (a->i) PetscCall(PetscFree(a->i));
1189566063dSJacob Faibussowitsch         if (a->j) PetscCall(PetscFree(a->j));
1199566063dSJacob Faibussowitsch         if (a->a) PetscCall(PetscFree(a->a));
120e4a0ef16SKarl Rupp       }
1219566063dSJacob Faibussowitsch       PetscCall(PetscMalloc3(a->nz, &a->a, a->nz, &a->j, m + 1, &a->i));
122e4a0ef16SKarl Rupp 
123e4a0ef16SKarl Rupp       a->singlemalloc = PETSC_TRUE;
124e4a0ef16SKarl Rupp 
125e4a0ef16SKarl Rupp       /* Setup row lengths */
1269566063dSJacob Faibussowitsch       PetscCall(PetscFree(a->imax));
1279566063dSJacob Faibussowitsch       PetscCall(PetscFree(a->ilen));
1289566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m, &a->imax));
1299566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m, &a->ilen));
130e4a0ef16SKarl Rupp 
131e4a0ef16SKarl Rupp       /* Copy data back from GPU */
1329371c9d4SSatish Balay       viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
1339371c9d4SSatish Balay       row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
134e4a0ef16SKarl Rupp 
135e4a0ef16SKarl Rupp       // copy row array
136e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
137e4a0ef16SKarl Rupp       (a->i)[0] = row_buffer[0];
138e4a0ef16SKarl Rupp       for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
139e4a0ef16SKarl Rupp         (a->i)[i + 1] = row_buffer[i + 1];
140e4a0ef16SKarl Rupp         a->imax[i] = a->ilen[i] = a->i[i + 1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
141e4a0ef16SKarl Rupp       }
142e4a0ef16SKarl Rupp 
143e4a0ef16SKarl Rupp       // copy column indices
1449371c9d4SSatish Balay       viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
1459371c9d4SSatish Balay       col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
146e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
1479371c9d4SSatish Balay       for (PetscInt i = 0; i < (PetscInt)Agpu->nnz(); ++i) (a->j)[i] = col_buffer[i];
148e4a0ef16SKarl Rupp 
149e4a0ef16SKarl Rupp       // copy nonzero entries directly to destination (no conversion required)
150e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
151e4a0ef16SKarl Rupp 
1529566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuToCpu(row_buffer.raw_size() + col_buffer.raw_size() + (Agpu->nnz() * sizeof(PetscScalar))));
1534cf1874eSKarl Rupp       ViennaCLWaitForGPU();
154023073b3SKarl Rupp       /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
155d71ae5a4SJacob Faibussowitsch     } catch (std::exception const &ex) {
156d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
157d71ae5a4SJacob Faibussowitsch     }
158c70f7ee4SJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
1593ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
160f38c1e66SStefano Zampini   } else {
1613ba16761SJacob Faibussowitsch     if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(PETSC_SUCCESS);
162e4a0ef16SKarl Rupp 
1632c71b3e2SJacob Faibussowitsch     PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
164f38c1e66SStefano Zampini     if (!Agpu) {
165f38c1e66SStefano Zampini       viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar) * viennaclstruct->mat->nnz(), a->a);
166f38c1e66SStefano Zampini     } else {
167f38c1e66SStefano Zampini       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
168f38c1e66SStefano Zampini     }
169f38c1e66SStefano Zampini   }
170c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
171b8ced49eSKarl Rupp   /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
1729566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1739566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
175e4a0ef16SKarl Rupp }
176e4a0ef16SKarl Rupp 
177d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJViennaCL(Mat A, Vec xx, Vec yy)
178d71ae5a4SJacob Faibussowitsch {
179e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
180e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
1810d73d530SKarl Rupp   const ViennaCLVector *xgpu           = NULL;
1820d73d530SKarl Rupp   ViennaCLVector       *ygpu           = NULL;
183e4a0ef16SKarl Rupp 
184e4a0ef16SKarl Rupp   PetscFunctionBegin;
185837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1869566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyToGPU(A));
187bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
1889566063dSJacob Faibussowitsch     PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
1899566063dSJacob Faibussowitsch     PetscCall(VecViennaCLGetArrayWrite(yy, &ygpu));
1909566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
191e4a0ef16SKarl Rupp     try {
192b7832b47SStefano Zampini       if (a->compressedrow.use) {
193b7832b47SStefano Zampini         *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
194b7832b47SStefano Zampini       } else {
195e4a0ef16SKarl Rupp         *ygpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
196b7832b47SStefano Zampini       }
1974cf1874eSKarl Rupp       ViennaCLWaitForGPU();
198d71ae5a4SJacob Faibussowitsch     } catch (std::exception const &ex) {
199d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
200d71ae5a4SJacob Faibussowitsch     }
2019566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
2029566063dSJacob Faibussowitsch     PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
2039566063dSJacob Faibussowitsch     PetscCall(VecViennaCLRestoreArrayWrite(yy, &ygpu));
2049566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
205bf1781e8SStefano Zampini   } else {
2069566063dSJacob Faibussowitsch     PetscCall(VecSet_SeqViennaCL(yy, 0));
20767c87b7fSKarl Rupp   }
2083ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
209e4a0ef16SKarl Rupp }
210e4a0ef16SKarl Rupp 
211d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A, Vec xx, Vec yy, Vec zz)
212d71ae5a4SJacob Faibussowitsch {
213e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
214e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
2150d73d530SKarl Rupp   const ViennaCLVector *xgpu = NULL, *ygpu = NULL;
2160d73d530SKarl Rupp   ViennaCLVector       *zgpu = NULL;
217e4a0ef16SKarl Rupp 
218e4a0ef16SKarl Rupp   PetscFunctionBegin;
219837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2209566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyToGPU(A));
221bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
222e4a0ef16SKarl Rupp     try {
2239566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
2249566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayRead(yy, &ygpu));
2259566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayWrite(zz, &zgpu));
2269566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
227f38c1e66SStefano Zampini       if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
228f38c1e66SStefano Zampini       else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
229f38c1e66SStefano Zampini       if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec;
230f38c1e66SStefano Zampini       else *zgpu += *viennaclstruct->tempvec;
2314cf1874eSKarl Rupp       ViennaCLWaitForGPU();
2329566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
2339566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
2349566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayRead(yy, &ygpu));
2359566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayWrite(zz, &zgpu));
236e4a0ef16SKarl Rupp 
237d71ae5a4SJacob Faibussowitsch     } catch (std::exception const &ex) {
238d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
239d71ae5a4SJacob Faibussowitsch     }
2409566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
241bf1781e8SStefano Zampini   } else {
2429566063dSJacob Faibussowitsch     PetscCall(VecCopy_SeqViennaCL(yy, zz));
24367c87b7fSKarl Rupp   }
2443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
245e4a0ef16SKarl Rupp }
246e4a0ef16SKarl Rupp 
247d71ae5a4SJacob Faibussowitsch PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A, MatAssemblyType mode)
248d71ae5a4SJacob Faibussowitsch {
249e4a0ef16SKarl Rupp   PetscFunctionBegin;
2509566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
2513ba16761SJacob Faibussowitsch   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
2529566063dSJacob Faibussowitsch   if (!A->boundtocpu) PetscCall(MatViennaCLCopyToGPU(A));
2533ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
254e4a0ef16SKarl Rupp }
255e4a0ef16SKarl Rupp 
256e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
257cab5ea25SPierre Jolivet /*@C
25811a5261eSBarry Smith   MatCreateSeqAIJViennaCL - Creates a sparse matrix in `MATSEQAIJVIENNACL` (compressed row) format
25919fddfadSKarl Rupp   (the default parallel PETSc format).  This matrix will ultimately be pushed down
26020f4b53cSBarry Smith   to GPUs and use the ViennaCL library for calculations.
261e4a0ef16SKarl Rupp 
262d083f849SBarry Smith   Collective
263e4a0ef16SKarl Rupp 
264e4a0ef16SKarl Rupp   Input Parameters:
26511a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
266e4a0ef16SKarl Rupp . m    - number of rows
267e4a0ef16SKarl Rupp . n    - number of columns
26820f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is set
26920f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
270e4a0ef16SKarl Rupp 
271e4a0ef16SKarl Rupp   Output Parameter:
272e4a0ef16SKarl Rupp . A - the matrix
273e4a0ef16SKarl Rupp 
27411a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
275e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
27611a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
277e4a0ef16SKarl Rupp 
278e4a0ef16SKarl Rupp   Notes:
27911a5261eSBarry Smith   The AIJ format, also called
2802ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
281e4a0ef16SKarl Rupp   storage.  That is, the stored row and column indices can begin at
28220f4b53cSBarry Smith   either one (as in Fortran) or zero.
283e4a0ef16SKarl Rupp 
28420f4b53cSBarry Smith   Specify the preallocated storage with either `nz` or `nnz` (not both).
28520f4b53cSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
28620f4b53cSBarry Smith   allocation.
287e4a0ef16SKarl Rupp 
288e4a0ef16SKarl Rupp   Level: intermediate
289e4a0ef16SKarl Rupp 
290*fe59aa6dSJacob Faibussowitsch .seealso: `MATSEQAIJVIENNACL`, `MatCreate()`, `MatCreateAIJ()`, `MatCreateAIJCUSPARSE()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
291e4a0ef16SKarl Rupp @*/
292d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
293d71ae5a4SJacob Faibussowitsch {
294e4a0ef16SKarl Rupp   PetscFunctionBegin;
2959566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
2969566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
2979566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJVIENNACL));
2989566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
2993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
300e4a0ef16SKarl Rupp }
301e4a0ef16SKarl Rupp 
302d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
303d71ae5a4SJacob Faibussowitsch {
304e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL *)A->spptr;
305e4a0ef16SKarl Rupp 
306e4a0ef16SKarl Rupp   PetscFunctionBegin;
307e4a0ef16SKarl Rupp   try {
3086447cd05SKarl Rupp     if (viennaclcontainer) {
3096447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3106447cd05SKarl Rupp       delete viennaclcontainer->mat;
3116447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
312e4a0ef16SKarl Rupp       delete viennaclcontainer;
3136447cd05SKarl Rupp     }
314c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
315d71ae5a4SJacob Faibussowitsch   } catch (std::exception const &ex) {
316d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
317d71ae5a4SJacob Faibussowitsch   }
3188713a8baSPatrick Sanan 
3199566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", NULL));
3209566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", NULL));
3219566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", NULL));
3228713a8baSPatrick Sanan 
323e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
324e4a0ef16SKarl Rupp   A->spptr = 0;
3259566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
3263ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
327e4a0ef16SKarl Rupp }
328e4a0ef16SKarl Rupp 
329d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
330d71ae5a4SJacob Faibussowitsch {
331e4a0ef16SKarl Rupp   PetscFunctionBegin;
3329566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
3339566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJViennaCL(B, MATSEQAIJVIENNACL, MAT_INPLACE_MATRIX, &B));
3343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3358713a8baSPatrick Sanan }
3368713a8baSPatrick Sanan 
337b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat, PetscBool);
338d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A, MatDuplicateOption cpvalues, Mat *B)
339d71ae5a4SJacob Faibussowitsch {
340c3cca76eSKarl Rupp   Mat C;
341c3cca76eSKarl Rupp 
342c3cca76eSKarl Rupp   PetscFunctionBegin;
3439566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
344c3cca76eSKarl Rupp   C = *B;
345c3cca76eSKarl Rupp 
3469566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
34792f9df4aSRichard Tran Mills   C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
348c3cca76eSKarl Rupp 
349c3cca76eSKarl Rupp   C->spptr                                         = new Mat_SeqAIJViennaCL();
350c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->tempvec        = NULL;
351c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->mat            = NULL;
352c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->compressed_mat = NULL;
353c3cca76eSKarl Rupp 
3549566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)C, MATSEQAIJVIENNACL));
355c3cca76eSKarl Rupp 
356c70f7ee4SJunchao Zhang   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
357c3cca76eSKarl Rupp 
358c3cca76eSKarl Rupp   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
3599566063dSJacob Faibussowitsch   if (C->assembled) PetscCall(MatViennaCLCopyToGPU(C));
3603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
361c3cca76eSKarl Rupp }
362c3cca76eSKarl Rupp 
363d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A, PetscScalar *array[])
364d71ae5a4SJacob Faibussowitsch {
365f38c1e66SStefano Zampini   PetscFunctionBegin;
3669566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
367c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
3683ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
369f38c1e66SStefano Zampini }
370f38c1e66SStefano Zampini 
371d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A, PetscScalar *array[])
372d71ae5a4SJacob Faibussowitsch {
373f38c1e66SStefano Zampini   PetscFunctionBegin;
374c1a4f3baSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
375c1a4f3baSJunchao Zhang   *array         = NULL;
3763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
377c1a4f3baSJunchao Zhang }
378c1a4f3baSJunchao Zhang 
379d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[])
380d71ae5a4SJacob Faibussowitsch {
381c1a4f3baSJunchao Zhang   PetscFunctionBegin;
3829566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
383c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
3843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
385c1a4f3baSJunchao Zhang }
386c1a4f3baSJunchao Zhang 
387d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[])
388d71ae5a4SJacob Faibussowitsch {
389c1a4f3baSJunchao Zhang   PetscFunctionBegin;
390c1a4f3baSJunchao Zhang   *array = NULL;
391c1a4f3baSJunchao Zhang   /* No A->offloadmask = PETSC_OFFLOAD_CPU since if A->offloadmask was PETSC_OFFLOAD_BOTH, it is still BOTH */
3923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
393c1a4f3baSJunchao Zhang }
394c1a4f3baSJunchao Zhang 
395d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[])
396d71ae5a4SJacob Faibussowitsch {
397c1a4f3baSJunchao Zhang   PetscFunctionBegin;
398c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
3993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
400c1a4f3baSJunchao Zhang }
401c1a4f3baSJunchao Zhang 
402d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[])
403d71ae5a4SJacob Faibussowitsch {
404c1a4f3baSJunchao Zhang   PetscFunctionBegin;
405c1a4f3baSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
406f38c1e66SStefano Zampini   *array         = NULL;
4073ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
408f38c1e66SStefano Zampini }
409f38c1e66SStefano Zampini 
410d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A, PetscBool flg)
411d71ae5a4SJacob Faibussowitsch {
412c1a4f3baSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
413f38c1e66SStefano Zampini 
414e7e92044SBarry Smith   PetscFunctionBegin;
415b470e4b4SRichard Tran Mills   A->boundtocpu = flg;
416c1a4f3baSJunchao Zhang   if (flg && a->inode.size) {
417c1a4f3baSJunchao Zhang     a->inode.use = PETSC_TRUE;
418ea500dcfSRichard Tran Mills   } else {
419c1a4f3baSJunchao Zhang     a->inode.use = PETSC_FALSE;
420ea500dcfSRichard Tran Mills   }
421e7e92044SBarry Smith   if (flg) {
422f38c1e66SStefano Zampini     /* make sure we have an up-to-date copy on the CPU */
4239566063dSJacob Faibussowitsch     PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
424e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJ;
425e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJ;
426e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
427e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJ;
4289566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
429e7e92044SBarry Smith   } else {
430e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJViennaCL;
431e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
432e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
433e7e92044SBarry Smith     A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
434e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
435c1a4f3baSJunchao Zhang 
436c1a4f3baSJunchao Zhang     a->ops->getarray          = MatSeqAIJGetArray_SeqAIJViennaCL;
437c1a4f3baSJunchao Zhang     a->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJViennaCL;
438c1a4f3baSJunchao Zhang     a->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJViennaCL;
439c1a4f3baSJunchao Zhang     a->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJViennaCL;
440c1a4f3baSJunchao Zhang     a->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJViennaCL;
441c1a4f3baSJunchao Zhang     a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL;
442e7e92044SBarry Smith   }
4433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
444e7e92044SBarry Smith }
445e7e92044SBarry Smith 
446d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat)
447d71ae5a4SJacob Faibussowitsch {
4488713a8baSPatrick Sanan   Mat B;
4498713a8baSPatrick Sanan 
4508713a8baSPatrick Sanan   PetscFunctionBegin;
4515f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_REUSE_MATRIX, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
4528713a8baSPatrick Sanan 
4539566063dSJacob Faibussowitsch   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4548713a8baSPatrick Sanan 
4558713a8baSPatrick Sanan   B = *newmat;
4568713a8baSPatrick Sanan 
457e4a0ef16SKarl Rupp   B->spptr = new Mat_SeqAIJViennaCL();
458e4a0ef16SKarl Rupp 
459a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->tempvec        = NULL;
460a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->mat            = NULL;
461a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->compressed_mat = NULL;
462e4a0ef16SKarl Rupp 
4639566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
46492f9df4aSRichard Tran Mills   A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
465e4a0ef16SKarl Rupp 
4669566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJVIENNACL));
4679566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
4689566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECVIENNACL, &B->defaultvectype));
469e4a0ef16SKarl Rupp 
4709566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", MatConvert_SeqAIJ_SeqAIJViennaCL));
4719566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", MatProductSetFromOptions_SeqAIJ_SeqDense));
4729566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", MatProductSetFromOptions_SeqAIJ));
4738713a8baSPatrick Sanan 
474c70f7ee4SJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4758713a8baSPatrick Sanan 
4768713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
4779566063dSJacob Faibussowitsch   if (B->assembled) PetscCall(MatViennaCLCopyToGPU(B));
4783ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
479e4a0ef16SKarl Rupp }
480e4a0ef16SKarl Rupp 
4813ca39a21SBarry Smith /*MC
482e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
483e4a0ef16SKarl Rupp 
484e4a0ef16SKarl Rupp    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
485e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
486e4a0ef16SKarl Rupp 
487e4a0ef16SKarl Rupp    Options Database Keys:
48811a5261eSBarry Smith +  -mat_type aijviennacl - sets the matrix type to `MATSEQAIJVIENNACL` during a call to `MatSetFromOptions()
48911a5261eSBarry Smith .  -mat_viennacl_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
49011a5261eSBarry Smith -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
491e4a0ef16SKarl Rupp 
492e4a0ef16SKarl Rupp   Level: beginner
493e4a0ef16SKarl Rupp 
494db781477SPatrick Sanan .seealso: `MatCreateSeqAIJViennaCL()`, `MATAIJVIENNACL`, `MatCreateAIJViennaCL()`
495e4a0ef16SKarl Rupp M*/
496e4a0ef16SKarl Rupp 
497d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
498d71ae5a4SJacob Faibussowitsch {
49972367587SKarl Rupp   PetscFunctionBegin;
5009566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU, MatGetFactor_seqaij_petsc));
5019566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaij_petsc));
5029566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU, MatGetFactor_seqaij_petsc));
5039566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC, MatGetFactor_seqaij_petsc));
5043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
50572367587SKarl Rupp }
506