xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 15229ffc342989b2bf0590a733d92c152a3348fc)
1e4a0ef16SKarl Rupp /*
2e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
3e4a0ef16SKarl Rupp   matrix storage format.
4e4a0ef16SKarl Rupp */
5e4a0ef16SKarl Rupp 
6aaa7dc30SBarry Smith #include <petscconf.h>
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9aaa7dc30SBarry Smith #include <petscbt.h>
10aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
11af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
12e4a0ef16SKarl Rupp 
13aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
14e4a0ef16SKarl Rupp 
15e4a0ef16SKarl Rupp #include <algorithm>
16e4a0ef16SKarl Rupp #include <vector>
17e4a0ef16SKarl Rupp #include <string>
18e4a0ef16SKarl Rupp 
19e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
20e4a0ef16SKarl Rupp 
218713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
2272367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat, MatFactorType, Mat *);
234222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
248713a8baSPatrick Sanan 
25d71ae5a4SJacob Faibussowitsch PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26d71ae5a4SJacob Faibussowitsch {
27e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
28e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
29e4a0ef16SKarl Rupp 
30e4a0ef16SKarl Rupp   PetscFunctionBegin;
31bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
32c70f7ee4SJunchao Zhang     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
339566063dSJacob Faibussowitsch       PetscCall(PetscLogEventBegin(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
34e4a0ef16SKarl Rupp 
35e4a0ef16SKarl Rupp       try {
36e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
37a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
38e4a0ef16SKarl Rupp 
39a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
40a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
41e4a0ef16SKarl Rupp 
429371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
439371c9d4SSatish Balay           row_buffer.raw_resize(dummy, a->compressedrow.nrows + 1);
449371c9d4SSatish Balay           for (PetscInt i = 0; i <= a->compressedrow.nrows; ++i) row_buffer.set(i, (a->compressedrow.i)[i]);
45e4a0ef16SKarl Rupp 
469371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_indices;
479371c9d4SSatish Balay           row_indices.raw_resize(dummy, a->compressedrow.nrows);
489371c9d4SSatish Balay           for (PetscInt i = 0; i < a->compressedrow.nrows; ++i) row_indices.set(i, (a->compressedrow.rindex)[i]);
49a3430c56SKarl Rupp 
509371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
519371c9d4SSatish Balay           col_buffer.raw_resize(dummy, a->nz);
529371c9d4SSatish Balay           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
53a3430c56SKarl Rupp 
54a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
559566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu(((2 * a->compressedrow.nrows) + 1 + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
56e4a0ef16SKarl Rupp         } else {
57a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
58e4a0ef16SKarl Rupp 
59e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
60e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
61e4a0ef16SKarl Rupp 
629371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
639371c9d4SSatish Balay           row_buffer.raw_resize(dummy, A->rmap->n + 1);
649371c9d4SSatish Balay           for (PetscInt i = 0; i <= A->rmap->n; ++i) row_buffer.set(i, (a->i)[i]);
65e4a0ef16SKarl Rupp 
669371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
679371c9d4SSatish Balay           col_buffer.raw_resize(dummy, a->nz);
689371c9d4SSatish Balay           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
69e4a0ef16SKarl Rupp 
70e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
719566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu(((A->rmap->n + 1) + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
72e4a0ef16SKarl Rupp         }
734cf1874eSKarl Rupp         ViennaCLWaitForGPU();
74d71ae5a4SJacob Faibussowitsch       } catch (std::exception const &ex) {
75d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
76d71ae5a4SJacob Faibussowitsch       }
77e4a0ef16SKarl Rupp 
78a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
79a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
809b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
81a3430c56SKarl Rupp           delete (ViennaCLVector *)viennaclstruct->tempvec;
829b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
83a3430c56SKarl Rupp         } else {
84a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
85a3430c56SKarl Rupp         }
86a3430c56SKarl Rupp       } else {
879b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
88a3430c56SKarl Rupp       }
89a3430c56SKarl Rupp 
90c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
91e4a0ef16SKarl Rupp 
929566063dSJacob Faibussowitsch       PetscCall(PetscLogEventEnd(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
93e4a0ef16SKarl Rupp     }
9467c87b7fSKarl Rupp   }
953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
96e4a0ef16SKarl Rupp }
97e4a0ef16SKarl Rupp 
98d71ae5a4SJacob Faibussowitsch PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
99d71ae5a4SJacob Faibussowitsch {
100f38c1e66SStefano Zampini   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
101e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
102e4a0ef16SKarl Rupp   PetscInt            m              = A->rmap->n;
103e4a0ef16SKarl Rupp 
104e4a0ef16SKarl Rupp   PetscFunctionBegin;
1053ba16761SJacob Faibussowitsch   if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(PETSC_SUCCESS);
106c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) {
107e4a0ef16SKarl Rupp     try {
1082c71b3e2SJacob Faibussowitsch       PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1092c71b3e2SJacob Faibussowitsch       PetscCheck((PetscInt)Agpu->size1() == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "GPU matrix has %lu rows, should be %" PetscInt_FMT, Agpu->size1(), m);
110e4a0ef16SKarl Rupp       a->nz           = Agpu->nnz();
111e4a0ef16SKarl Rupp       a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
112e4a0ef16SKarl Rupp       A->preallocated = PETSC_TRUE;
113e4a0ef16SKarl Rupp       if (a->singlemalloc) {
1149566063dSJacob Faibussowitsch         if (a->a) PetscCall(PetscFree3(a->a, a->j, a->i));
115e4a0ef16SKarl Rupp       } else {
1169566063dSJacob Faibussowitsch         if (a->i) PetscCall(PetscFree(a->i));
1179566063dSJacob Faibussowitsch         if (a->j) PetscCall(PetscFree(a->j));
1189566063dSJacob Faibussowitsch         if (a->a) PetscCall(PetscFree(a->a));
119e4a0ef16SKarl Rupp       }
1209566063dSJacob Faibussowitsch       PetscCall(PetscMalloc3(a->nz, &a->a, a->nz, &a->j, m + 1, &a->i));
121e4a0ef16SKarl Rupp 
122e4a0ef16SKarl Rupp       a->singlemalloc = PETSC_TRUE;
123e4a0ef16SKarl Rupp 
124e4a0ef16SKarl Rupp       /* Setup row lengths */
1259566063dSJacob Faibussowitsch       PetscCall(PetscFree(a->imax));
1269566063dSJacob Faibussowitsch       PetscCall(PetscFree(a->ilen));
1279566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m, &a->imax));
1289566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m, &a->ilen));
129e4a0ef16SKarl Rupp 
130e4a0ef16SKarl Rupp       /* Copy data back from GPU */
1319371c9d4SSatish Balay       viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
1329371c9d4SSatish Balay       row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
133e4a0ef16SKarl Rupp 
134e4a0ef16SKarl Rupp       // copy row array
135e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
136e4a0ef16SKarl Rupp       (a->i)[0] = row_buffer[0];
137e4a0ef16SKarl Rupp       for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
138e4a0ef16SKarl Rupp         (a->i)[i + 1] = row_buffer[i + 1];
139e4a0ef16SKarl Rupp         a->imax[i] = a->ilen[i] = a->i[i + 1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
140e4a0ef16SKarl Rupp       }
141e4a0ef16SKarl Rupp 
142e4a0ef16SKarl Rupp       // copy column indices
1439371c9d4SSatish Balay       viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
1449371c9d4SSatish Balay       col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
145e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
1469371c9d4SSatish Balay       for (PetscInt i = 0; i < (PetscInt)Agpu->nnz(); ++i) (a->j)[i] = col_buffer[i];
147e4a0ef16SKarl Rupp 
148e4a0ef16SKarl Rupp       // copy nonzero entries directly to destination (no conversion required)
149e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
150e4a0ef16SKarl Rupp 
1519566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuToCpu(row_buffer.raw_size() + col_buffer.raw_size() + (Agpu->nnz() * sizeof(PetscScalar))));
1524cf1874eSKarl Rupp       ViennaCLWaitForGPU();
153023073b3SKarl Rupp       /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
154d71ae5a4SJacob Faibussowitsch     } catch (std::exception const &ex) {
155d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
156d71ae5a4SJacob Faibussowitsch     }
157c70f7ee4SJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
1583ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
159f38c1e66SStefano Zampini   } else {
1603ba16761SJacob Faibussowitsch     if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(PETSC_SUCCESS);
161e4a0ef16SKarl Rupp 
1622c71b3e2SJacob Faibussowitsch     PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
163f38c1e66SStefano Zampini     if (!Agpu) {
164f38c1e66SStefano Zampini       viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar) * viennaclstruct->mat->nnz(), a->a);
165f38c1e66SStefano Zampini     } else {
166f38c1e66SStefano Zampini       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
167f38c1e66SStefano Zampini     }
168f38c1e66SStefano Zampini   }
169c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
170b8ced49eSKarl Rupp   /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
1719566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1729566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
174e4a0ef16SKarl Rupp }
175e4a0ef16SKarl Rupp 
17666976f2fSJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJViennaCL(Mat A, Vec xx, Vec yy)
177d71ae5a4SJacob Faibussowitsch {
178e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
179e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
1800d73d530SKarl Rupp   const ViennaCLVector *xgpu           = NULL;
1810d73d530SKarl Rupp   ViennaCLVector       *ygpu           = NULL;
182e4a0ef16SKarl Rupp 
183e4a0ef16SKarl Rupp   PetscFunctionBegin;
184837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1859566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyToGPU(A));
186bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
1879566063dSJacob Faibussowitsch     PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
1889566063dSJacob Faibussowitsch     PetscCall(VecViennaCLGetArrayWrite(yy, &ygpu));
1899566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
190e4a0ef16SKarl Rupp     try {
191b7832b47SStefano Zampini       if (a->compressedrow.use) {
192b7832b47SStefano Zampini         *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
193b7832b47SStefano Zampini       } else {
194e4a0ef16SKarl Rupp         *ygpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
195b7832b47SStefano Zampini       }
1964cf1874eSKarl Rupp       ViennaCLWaitForGPU();
197d71ae5a4SJacob Faibussowitsch     } catch (std::exception const &ex) {
198d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
199d71ae5a4SJacob Faibussowitsch     }
2009566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
2019566063dSJacob Faibussowitsch     PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
2029566063dSJacob Faibussowitsch     PetscCall(VecViennaCLRestoreArrayWrite(yy, &ygpu));
2039566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
204bf1781e8SStefano Zampini   } else {
2059566063dSJacob Faibussowitsch     PetscCall(VecSet_SeqViennaCL(yy, 0));
20667c87b7fSKarl Rupp   }
2073ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
208e4a0ef16SKarl Rupp }
209e4a0ef16SKarl Rupp 
21066976f2fSJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A, Vec xx, Vec yy, Vec zz)
211d71ae5a4SJacob Faibussowitsch {
212e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
213e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
2140d73d530SKarl Rupp   const ViennaCLVector *xgpu = NULL, *ygpu = NULL;
2150d73d530SKarl Rupp   ViennaCLVector       *zgpu = NULL;
216e4a0ef16SKarl Rupp 
217e4a0ef16SKarl Rupp   PetscFunctionBegin;
218837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2199566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyToGPU(A));
220bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
221e4a0ef16SKarl Rupp     try {
2229566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
2239566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayRead(yy, &ygpu));
2249566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayWrite(zz, &zgpu));
2259566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
226f38c1e66SStefano Zampini       if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
227f38c1e66SStefano Zampini       else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
228f38c1e66SStefano Zampini       if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec;
229f38c1e66SStefano Zampini       else *zgpu += *viennaclstruct->tempvec;
2304cf1874eSKarl Rupp       ViennaCLWaitForGPU();
2319566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
2329566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
2339566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayRead(yy, &ygpu));
2349566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayWrite(zz, &zgpu));
235e4a0ef16SKarl Rupp 
236d71ae5a4SJacob Faibussowitsch     } catch (std::exception const &ex) {
237d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
238d71ae5a4SJacob Faibussowitsch     }
2399566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
240bf1781e8SStefano Zampini   } else {
2419566063dSJacob Faibussowitsch     PetscCall(VecCopy_SeqViennaCL(yy, zz));
24267c87b7fSKarl Rupp   }
2433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
244e4a0ef16SKarl Rupp }
245e4a0ef16SKarl Rupp 
24666976f2fSJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A, MatAssemblyType mode)
247d71ae5a4SJacob Faibussowitsch {
248e4a0ef16SKarl Rupp   PetscFunctionBegin;
2499566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
2503ba16761SJacob Faibussowitsch   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
2519566063dSJacob Faibussowitsch   if (!A->boundtocpu) PetscCall(MatViennaCLCopyToGPU(A));
2523ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
253e4a0ef16SKarl Rupp }
254e4a0ef16SKarl Rupp 
255e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
256cab5ea25SPierre Jolivet /*@C
25711a5261eSBarry Smith   MatCreateSeqAIJViennaCL - Creates a sparse matrix in `MATSEQAIJVIENNACL` (compressed row) format
25819fddfadSKarl Rupp   (the default parallel PETSc format).  This matrix will ultimately be pushed down
25920f4b53cSBarry Smith   to GPUs and use the ViennaCL library for calculations.
260e4a0ef16SKarl Rupp 
261d083f849SBarry Smith   Collective
262e4a0ef16SKarl Rupp 
263e4a0ef16SKarl Rupp   Input Parameters:
26411a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
265e4a0ef16SKarl Rupp . m    - number of rows
266e4a0ef16SKarl Rupp . n    - number of columns
26720f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is set
26820f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
269e4a0ef16SKarl Rupp 
270e4a0ef16SKarl Rupp   Output Parameter:
271e4a0ef16SKarl Rupp . A - the matrix
272e4a0ef16SKarl Rupp 
27311a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
274e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
27511a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
276e4a0ef16SKarl Rupp 
277e4a0ef16SKarl Rupp   Notes:
27811a5261eSBarry Smith   The AIJ format, also called
2792ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
280e4a0ef16SKarl Rupp   storage.  That is, the stored row and column indices can begin at
28120f4b53cSBarry Smith   either one (as in Fortran) or zero.
282e4a0ef16SKarl Rupp 
28320f4b53cSBarry Smith   Specify the preallocated storage with either `nz` or `nnz` (not both).
28420f4b53cSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
28520f4b53cSBarry Smith   allocation.
286e4a0ef16SKarl Rupp 
287e4a0ef16SKarl Rupp   Level: intermediate
288e4a0ef16SKarl Rupp 
289fe59aa6dSJacob Faibussowitsch .seealso: `MATSEQAIJVIENNACL`, `MatCreate()`, `MatCreateAIJ()`, `MatCreateAIJCUSPARSE()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
290e4a0ef16SKarl Rupp @*/
291d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
292d71ae5a4SJacob Faibussowitsch {
293e4a0ef16SKarl Rupp   PetscFunctionBegin;
2949566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
2959566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
2969566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJVIENNACL));
2979566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
2983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
299e4a0ef16SKarl Rupp }
300e4a0ef16SKarl Rupp 
30166976f2fSJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
302d71ae5a4SJacob Faibussowitsch {
303e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL *)A->spptr;
304e4a0ef16SKarl Rupp 
305e4a0ef16SKarl Rupp   PetscFunctionBegin;
306e4a0ef16SKarl Rupp   try {
3076447cd05SKarl Rupp     if (viennaclcontainer) {
3086447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3096447cd05SKarl Rupp       delete viennaclcontainer->mat;
3106447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
311e4a0ef16SKarl Rupp       delete viennaclcontainer;
3126447cd05SKarl Rupp     }
313c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
314d71ae5a4SJacob Faibussowitsch   } catch (std::exception const &ex) {
315d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
316d71ae5a4SJacob Faibussowitsch   }
3178713a8baSPatrick Sanan 
3189566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", NULL));
3199566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", NULL));
3209566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", NULL));
3218713a8baSPatrick Sanan 
322e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
323e4a0ef16SKarl Rupp   A->spptr = 0;
3249566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
3253ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
326e4a0ef16SKarl Rupp }
327e4a0ef16SKarl Rupp 
328d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
329d71ae5a4SJacob Faibussowitsch {
330e4a0ef16SKarl Rupp   PetscFunctionBegin;
3319566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
3329566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJViennaCL(B, MATSEQAIJVIENNACL, MAT_INPLACE_MATRIX, &B));
3333ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3348713a8baSPatrick Sanan }
3358713a8baSPatrick Sanan 
336b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat, PetscBool);
337d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A, MatDuplicateOption cpvalues, Mat *B)
338d71ae5a4SJacob Faibussowitsch {
339c3cca76eSKarl Rupp   Mat C;
340c3cca76eSKarl Rupp 
341c3cca76eSKarl Rupp   PetscFunctionBegin;
3429566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
343c3cca76eSKarl Rupp   C = *B;
344c3cca76eSKarl Rupp 
3459566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
34692f9df4aSRichard Tran Mills   C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
347c3cca76eSKarl Rupp 
348c3cca76eSKarl Rupp   C->spptr                                         = new Mat_SeqAIJViennaCL();
349c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->tempvec        = NULL;
350c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->mat            = NULL;
351c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->compressed_mat = NULL;
352c3cca76eSKarl Rupp 
3539566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)C, MATSEQAIJVIENNACL));
354c3cca76eSKarl Rupp 
355c70f7ee4SJunchao Zhang   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
356c3cca76eSKarl Rupp 
357c3cca76eSKarl Rupp   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
3589566063dSJacob Faibussowitsch   if (C->assembled) PetscCall(MatViennaCLCopyToGPU(C));
3593ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
360c3cca76eSKarl Rupp }
361c3cca76eSKarl Rupp 
362d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A, PetscScalar *array[])
363d71ae5a4SJacob Faibussowitsch {
364f38c1e66SStefano Zampini   PetscFunctionBegin;
3659566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
366c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
3673ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
368f38c1e66SStefano Zampini }
369f38c1e66SStefano Zampini 
370d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A, PetscScalar *array[])
371d71ae5a4SJacob Faibussowitsch {
372f38c1e66SStefano Zampini   PetscFunctionBegin;
373c1a4f3baSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
374c1a4f3baSJunchao Zhang   *array         = NULL;
3753ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
376c1a4f3baSJunchao Zhang }
377c1a4f3baSJunchao Zhang 
378d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[])
379d71ae5a4SJacob Faibussowitsch {
380c1a4f3baSJunchao Zhang   PetscFunctionBegin;
3819566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
382c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
3833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
384c1a4f3baSJunchao Zhang }
385c1a4f3baSJunchao Zhang 
386d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[])
387d71ae5a4SJacob Faibussowitsch {
388c1a4f3baSJunchao Zhang   PetscFunctionBegin;
389c1a4f3baSJunchao Zhang   *array = NULL;
390c1a4f3baSJunchao Zhang   /* No A->offloadmask = PETSC_OFFLOAD_CPU since if A->offloadmask was PETSC_OFFLOAD_BOTH, it is still BOTH */
3913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
392c1a4f3baSJunchao Zhang }
393c1a4f3baSJunchao Zhang 
394d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[])
395d71ae5a4SJacob Faibussowitsch {
396c1a4f3baSJunchao Zhang   PetscFunctionBegin;
397c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
3983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
399c1a4f3baSJunchao Zhang }
400c1a4f3baSJunchao Zhang 
401d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[])
402d71ae5a4SJacob Faibussowitsch {
403c1a4f3baSJunchao Zhang   PetscFunctionBegin;
404c1a4f3baSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
405f38c1e66SStefano Zampini   *array         = NULL;
4063ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
407f38c1e66SStefano Zampini }
408f38c1e66SStefano Zampini 
409d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A, PetscBool flg)
410d71ae5a4SJacob Faibussowitsch {
411c1a4f3baSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
412f38c1e66SStefano Zampini 
413e7e92044SBarry Smith   PetscFunctionBegin;
414b470e4b4SRichard Tran Mills   A->boundtocpu = flg;
415c1a4f3baSJunchao Zhang   if (flg && a->inode.size) {
416c1a4f3baSJunchao Zhang     a->inode.use = PETSC_TRUE;
417ea500dcfSRichard Tran Mills   } else {
418c1a4f3baSJunchao Zhang     a->inode.use = PETSC_FALSE;
419ea500dcfSRichard Tran Mills   }
420e7e92044SBarry Smith   if (flg) {
421f38c1e66SStefano Zampini     /* make sure we have an up-to-date copy on the CPU */
4229566063dSJacob Faibussowitsch     PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
423e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJ;
424e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJ;
425e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
426e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJ;
4279566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
428e7e92044SBarry Smith   } else {
429e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJViennaCL;
430e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
431e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
432e7e92044SBarry Smith     A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
433e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
434c1a4f3baSJunchao Zhang 
435c1a4f3baSJunchao Zhang     a->ops->getarray          = MatSeqAIJGetArray_SeqAIJViennaCL;
436c1a4f3baSJunchao Zhang     a->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJViennaCL;
437c1a4f3baSJunchao Zhang     a->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJViennaCL;
438c1a4f3baSJunchao Zhang     a->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJViennaCL;
439c1a4f3baSJunchao Zhang     a->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJViennaCL;
440c1a4f3baSJunchao Zhang     a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL;
441e7e92044SBarry Smith   }
4423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
443e7e92044SBarry Smith }
444e7e92044SBarry Smith 
445d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat)
446d71ae5a4SJacob Faibussowitsch {
4478713a8baSPatrick Sanan   Mat B;
4488713a8baSPatrick Sanan 
4498713a8baSPatrick Sanan   PetscFunctionBegin;
4505f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_REUSE_MATRIX, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
4518713a8baSPatrick Sanan 
4529566063dSJacob Faibussowitsch   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4538713a8baSPatrick Sanan 
4548713a8baSPatrick Sanan   B = *newmat;
4558713a8baSPatrick Sanan 
456e4a0ef16SKarl Rupp   B->spptr = new Mat_SeqAIJViennaCL();
457e4a0ef16SKarl Rupp 
458a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->tempvec        = NULL;
459a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->mat            = NULL;
460a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->compressed_mat = NULL;
461e4a0ef16SKarl Rupp 
4629566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
46392f9df4aSRichard Tran Mills   A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
464e4a0ef16SKarl Rupp 
4659566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJVIENNACL));
4669566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
4679566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECVIENNACL, &B->defaultvectype));
468e4a0ef16SKarl Rupp 
4699566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", MatConvert_SeqAIJ_SeqAIJViennaCL));
4709566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", MatProductSetFromOptions_SeqAIJ_SeqDense));
4719566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", MatProductSetFromOptions_SeqAIJ));
4728713a8baSPatrick Sanan 
473c70f7ee4SJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4748713a8baSPatrick Sanan 
4758713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
4769566063dSJacob Faibussowitsch   if (B->assembled) PetscCall(MatViennaCLCopyToGPU(B));
4773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
478e4a0ef16SKarl Rupp }
479e4a0ef16SKarl Rupp 
4803ca39a21SBarry Smith /*MC
481e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
482e4a0ef16SKarl Rupp 
483*15229ffcSPierre Jolivet    A matrix type whose data resides on GPUs. These matrices are in CSR format by
484e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
485e4a0ef16SKarl Rupp 
486e4a0ef16SKarl Rupp    Options Database Keys:
48711a5261eSBarry Smith +  -mat_type aijviennacl - sets the matrix type to `MATSEQAIJVIENNACL` during a call to `MatSetFromOptions()
48811a5261eSBarry Smith .  -mat_viennacl_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
48911a5261eSBarry Smith -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
490e4a0ef16SKarl Rupp 
491e4a0ef16SKarl Rupp   Level: beginner
492e4a0ef16SKarl Rupp 
493db781477SPatrick Sanan .seealso: `MatCreateSeqAIJViennaCL()`, `MATAIJVIENNACL`, `MatCreateAIJViennaCL()`
494e4a0ef16SKarl Rupp M*/
495e4a0ef16SKarl Rupp 
496d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
497d71ae5a4SJacob Faibussowitsch {
49872367587SKarl Rupp   PetscFunctionBegin;
4999566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU, MatGetFactor_seqaij_petsc));
5009566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaij_petsc));
5019566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU, MatGetFactor_seqaij_petsc));
5029566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC, MatGetFactor_seqaij_petsc));
5033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
50472367587SKarl Rupp }
505