xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 4d12350b4d1d562f24a95bd06d554a68a71c004f)
1e4a0ef16SKarl Rupp /*
2e4a0ef16SKarl Rupp     Defines the basic matrix operations for the AIJ (compressed row)
3e4a0ef16SKarl Rupp   matrix storage format.
4e4a0ef16SKarl Rupp */
5e4a0ef16SKarl Rupp 
6aaa7dc30SBarry Smith #include <petscconf.h>
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9aaa7dc30SBarry Smith #include <petscbt.h>
10aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h>
11af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
12e4a0ef16SKarl Rupp 
13aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
14e4a0ef16SKarl Rupp 
15e4a0ef16SKarl Rupp #include <algorithm>
16e4a0ef16SKarl Rupp #include <vector>
17e4a0ef16SKarl Rupp #include <string>
18e4a0ef16SKarl Rupp 
19e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp"
20e4a0ef16SKarl Rupp 
218713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
2272367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat, MatFactorType, Mat *);
234222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
248713a8baSPatrick Sanan 
25d71ae5a4SJacob Faibussowitsch PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26d71ae5a4SJacob Faibussowitsch {
27e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
28e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
29e4a0ef16SKarl Rupp 
30e4a0ef16SKarl Rupp   PetscFunctionBegin;
31bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
32c70f7ee4SJunchao Zhang     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
339566063dSJacob Faibussowitsch       PetscCall(PetscLogEventBegin(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
34e4a0ef16SKarl Rupp 
35e4a0ef16SKarl Rupp       try {
36e4a0ef16SKarl Rupp         if (a->compressedrow.use) {
37a3430c56SKarl Rupp           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
38e4a0ef16SKarl Rupp 
39a3430c56SKarl Rupp           // Since PetscInt is different from cl_uint, we have to convert:
40a3430c56SKarl Rupp           viennacl::backend::mem_handle dummy;
41e4a0ef16SKarl Rupp 
429371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
439371c9d4SSatish Balay           row_buffer.raw_resize(dummy, a->compressedrow.nrows + 1);
449371c9d4SSatish Balay           for (PetscInt i = 0; i <= a->compressedrow.nrows; ++i) row_buffer.set(i, (a->compressedrow.i)[i]);
45e4a0ef16SKarl Rupp 
469371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_indices;
479371c9d4SSatish Balay           row_indices.raw_resize(dummy, a->compressedrow.nrows);
489371c9d4SSatish Balay           for (PetscInt i = 0; i < a->compressedrow.nrows; ++i) row_indices.set(i, (a->compressedrow.rindex)[i]);
49a3430c56SKarl Rupp 
509371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
519371c9d4SSatish Balay           col_buffer.raw_resize(dummy, a->nz);
529371c9d4SSatish Balay           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
53a3430c56SKarl Rupp 
54a3430c56SKarl Rupp           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
559566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu(((2 * a->compressedrow.nrows) + 1 + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
56e4a0ef16SKarl Rupp         } else {
57a3430c56SKarl Rupp           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
58e4a0ef16SKarl Rupp 
59e4a0ef16SKarl Rupp           // Since PetscInt is in general different from cl_uint, we have to convert:
60e4a0ef16SKarl Rupp           viennacl::backend::mem_handle dummy;
61e4a0ef16SKarl Rupp 
629371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
639371c9d4SSatish Balay           row_buffer.raw_resize(dummy, A->rmap->n + 1);
649371c9d4SSatish Balay           for (PetscInt i = 0; i <= A->rmap->n; ++i) row_buffer.set(i, (a->i)[i]);
65e4a0ef16SKarl Rupp 
669371c9d4SSatish Balay           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
679371c9d4SSatish Balay           col_buffer.raw_resize(dummy, a->nz);
689371c9d4SSatish Balay           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
69e4a0ef16SKarl Rupp 
70e4a0ef16SKarl Rupp           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
719566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu(((A->rmap->n + 1) + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
72e4a0ef16SKarl Rupp         }
734cf1874eSKarl Rupp         ViennaCLWaitForGPU();
74d71ae5a4SJacob Faibussowitsch       } catch (std::exception const &ex) {
75d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
76d71ae5a4SJacob Faibussowitsch       }
77e4a0ef16SKarl Rupp 
78a3430c56SKarl Rupp       // Create temporary vector for v += A*x:
79a3430c56SKarl Rupp       if (viennaclstruct->tempvec) {
809b66742cSDave May         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
81a3430c56SKarl Rupp           delete (ViennaCLVector *)viennaclstruct->tempvec;
829b66742cSDave May           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
83a3430c56SKarl Rupp         } else {
84a3430c56SKarl Rupp           viennaclstruct->tempvec->clear();
85a3430c56SKarl Rupp         }
86a3430c56SKarl Rupp       } else {
879b66742cSDave May         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
88a3430c56SKarl Rupp       }
89a3430c56SKarl Rupp 
90c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
91e4a0ef16SKarl Rupp 
929566063dSJacob Faibussowitsch       PetscCall(PetscLogEventEnd(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
93e4a0ef16SKarl Rupp     }
9467c87b7fSKarl Rupp   }
953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
96e4a0ef16SKarl Rupp }
97e4a0ef16SKarl Rupp 
98d71ae5a4SJacob Faibussowitsch PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
99d71ae5a4SJacob Faibussowitsch {
100f38c1e66SStefano Zampini   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
101e4a0ef16SKarl Rupp   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
102e4a0ef16SKarl Rupp   PetscInt            m              = A->rmap->n;
103e4a0ef16SKarl Rupp 
104e4a0ef16SKarl Rupp   PetscFunctionBegin;
1053ba16761SJacob Faibussowitsch   if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(PETSC_SUCCESS);
106c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) {
107e4a0ef16SKarl Rupp     try {
1082c71b3e2SJacob Faibussowitsch       PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
1092c71b3e2SJacob Faibussowitsch       PetscCheck((PetscInt)Agpu->size1() == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "GPU matrix has %lu rows, should be %" PetscInt_FMT, Agpu->size1(), m);
110e4a0ef16SKarl Rupp       a->nz           = Agpu->nnz();
111e4a0ef16SKarl Rupp       a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
112e4a0ef16SKarl Rupp       A->preallocated = PETSC_TRUE;
1139f0612e4SBarry Smith       if (a->free_a) PetscCall(PetscShmgetDeallocateArray((void **)a->a));
1149f0612e4SBarry Smith       if (a->free_ij) PetscCall(PetscShmgetDeallocateArray((void **)a->j));
1159f0612e4SBarry Smith       if (a->free_ij) PetscCall(PetscShmgetDeallocateArray((void **)a->i));
1169f0612e4SBarry Smith       PetscCall(PetscShmgetAllocateArray(a->nz, sizeof(PetscScalar), (void **)&a->a));
1179f0612e4SBarry Smith       PetscCall(PetscShmgetAllocateArray(a->nz, sizeof(PetscInt), (void **)&a->j));
1189f0612e4SBarry Smith       PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&a->i));
1199f0612e4SBarry Smith       a->free_a  = PETSC_TRUE;
1209f0612e4SBarry Smith       a->free_ij = PETSC_TRUE;
121e4a0ef16SKarl Rupp 
122e4a0ef16SKarl Rupp       /* Setup row lengths */
1239566063dSJacob Faibussowitsch       PetscCall(PetscFree(a->imax));
1249566063dSJacob Faibussowitsch       PetscCall(PetscFree(a->ilen));
1259566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m, &a->imax));
1269566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m, &a->ilen));
127e4a0ef16SKarl Rupp 
128e4a0ef16SKarl Rupp       /* Copy data back from GPU */
1299371c9d4SSatish Balay       viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
1309371c9d4SSatish Balay       row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
131e4a0ef16SKarl Rupp 
132e4a0ef16SKarl Rupp       // copy row array
133e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
134e4a0ef16SKarl Rupp       (a->i)[0] = row_buffer[0];
135e4a0ef16SKarl Rupp       for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
136e4a0ef16SKarl Rupp         (a->i)[i + 1] = row_buffer[i + 1];
137e4a0ef16SKarl Rupp         a->imax[i] = a->ilen[i] = a->i[i + 1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
138e4a0ef16SKarl Rupp       }
139e4a0ef16SKarl Rupp 
140e4a0ef16SKarl Rupp       // copy column indices
1419371c9d4SSatish Balay       viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
1429371c9d4SSatish Balay       col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
143e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
1449371c9d4SSatish Balay       for (PetscInt i = 0; i < (PetscInt)Agpu->nnz(); ++i) (a->j)[i] = col_buffer[i];
145e4a0ef16SKarl Rupp 
146e4a0ef16SKarl Rupp       // copy nonzero entries directly to destination (no conversion required)
147e4a0ef16SKarl Rupp       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
148e4a0ef16SKarl Rupp 
1499566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuToCpu(row_buffer.raw_size() + col_buffer.raw_size() + (Agpu->nnz() * sizeof(PetscScalar))));
1504cf1874eSKarl Rupp       ViennaCLWaitForGPU();
151023073b3SKarl Rupp       /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
152d71ae5a4SJacob Faibussowitsch     } catch (std::exception const &ex) {
153d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
154d71ae5a4SJacob Faibussowitsch     }
155c70f7ee4SJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
1563ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
157f38c1e66SStefano Zampini   } else {
1583ba16761SJacob Faibussowitsch     if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(PETSC_SUCCESS);
159e4a0ef16SKarl Rupp 
1602c71b3e2SJacob Faibussowitsch     PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
161f38c1e66SStefano Zampini     if (!Agpu) {
162f38c1e66SStefano Zampini       viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar) * viennaclstruct->mat->nnz(), a->a);
163f38c1e66SStefano Zampini     } else {
164f38c1e66SStefano Zampini       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
165f38c1e66SStefano Zampini     }
166f38c1e66SStefano Zampini   }
167c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
168b8ced49eSKarl Rupp   /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
1699566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1709566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
172e4a0ef16SKarl Rupp }
173e4a0ef16SKarl Rupp 
17466976f2fSJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJViennaCL(Mat A, Vec xx, Vec yy)
175d71ae5a4SJacob Faibussowitsch {
176e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
177e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
1780d73d530SKarl Rupp   const ViennaCLVector *xgpu           = NULL;
1790d73d530SKarl Rupp   ViennaCLVector       *ygpu           = NULL;
180e4a0ef16SKarl Rupp 
181e4a0ef16SKarl Rupp   PetscFunctionBegin;
182837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1839566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyToGPU(A));
184bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
1859566063dSJacob Faibussowitsch     PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
1869566063dSJacob Faibussowitsch     PetscCall(VecViennaCLGetArrayWrite(yy, &ygpu));
1879566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
188e4a0ef16SKarl Rupp     try {
189b7832b47SStefano Zampini       if (a->compressedrow.use) {
190b7832b47SStefano Zampini         *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
191b7832b47SStefano Zampini       } else {
192e4a0ef16SKarl Rupp         *ygpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
193b7832b47SStefano Zampini       }
1944cf1874eSKarl Rupp       ViennaCLWaitForGPU();
195d71ae5a4SJacob Faibussowitsch     } catch (std::exception const &ex) {
196d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
197d71ae5a4SJacob Faibussowitsch     }
1989566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
1999566063dSJacob Faibussowitsch     PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
2009566063dSJacob Faibussowitsch     PetscCall(VecViennaCLRestoreArrayWrite(yy, &ygpu));
2019566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
202bf1781e8SStefano Zampini   } else {
2039566063dSJacob Faibussowitsch     PetscCall(VecSet_SeqViennaCL(yy, 0));
20467c87b7fSKarl Rupp   }
2053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
206e4a0ef16SKarl Rupp }
207e4a0ef16SKarl Rupp 
20866976f2fSJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A, Vec xx, Vec yy, Vec zz)
209d71ae5a4SJacob Faibussowitsch {
210e4a0ef16SKarl Rupp   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
211e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
2120d73d530SKarl Rupp   const ViennaCLVector *xgpu = NULL, *ygpu = NULL;
2130d73d530SKarl Rupp   ViennaCLVector       *zgpu = NULL;
214e4a0ef16SKarl Rupp 
215e4a0ef16SKarl Rupp   PetscFunctionBegin;
216837a59e1SRichard Tran Mills   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2179566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyToGPU(A));
218bf1781e8SStefano Zampini   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
219e4a0ef16SKarl Rupp     try {
2209566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
2219566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayRead(yy, &ygpu));
2229566063dSJacob Faibussowitsch       PetscCall(VecViennaCLGetArrayWrite(zz, &zgpu));
2239566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
224f38c1e66SStefano Zampini       if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
225f38c1e66SStefano Zampini       else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
226f38c1e66SStefano Zampini       if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec;
227f38c1e66SStefano Zampini       else *zgpu += *viennaclstruct->tempvec;
2284cf1874eSKarl Rupp       ViennaCLWaitForGPU();
2299566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
2309566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
2319566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayRead(yy, &ygpu));
2329566063dSJacob Faibussowitsch       PetscCall(VecViennaCLRestoreArrayWrite(zz, &zgpu));
233e4a0ef16SKarl Rupp 
234d71ae5a4SJacob Faibussowitsch     } catch (std::exception const &ex) {
235d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
236d71ae5a4SJacob Faibussowitsch     }
2379566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
238bf1781e8SStefano Zampini   } else {
2399566063dSJacob Faibussowitsch     PetscCall(VecCopy_SeqViennaCL(yy, zz));
24067c87b7fSKarl Rupp   }
2413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
242e4a0ef16SKarl Rupp }
243e4a0ef16SKarl Rupp 
24466976f2fSJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A, MatAssemblyType mode)
245d71ae5a4SJacob Faibussowitsch {
246e4a0ef16SKarl Rupp   PetscFunctionBegin;
2479566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
2483ba16761SJacob Faibussowitsch   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
2499566063dSJacob Faibussowitsch   if (!A->boundtocpu) PetscCall(MatViennaCLCopyToGPU(A));
2503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
251e4a0ef16SKarl Rupp }
252e4a0ef16SKarl Rupp 
253e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/
254cab5ea25SPierre Jolivet /*@C
25511a5261eSBarry Smith   MatCreateSeqAIJViennaCL - Creates a sparse matrix in `MATSEQAIJVIENNACL` (compressed row) format
25619fddfadSKarl Rupp   (the default parallel PETSc format).  This matrix will ultimately be pushed down
25720f4b53cSBarry Smith   to GPUs and use the ViennaCL library for calculations.
258e4a0ef16SKarl Rupp 
259d083f849SBarry Smith   Collective
260e4a0ef16SKarl Rupp 
261e4a0ef16SKarl Rupp   Input Parameters:
26211a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
263e4a0ef16SKarl Rupp . m    - number of rows
264e4a0ef16SKarl Rupp . n    - number of columns
26520f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is set
26620f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
267e4a0ef16SKarl Rupp 
268e4a0ef16SKarl Rupp   Output Parameter:
269e4a0ef16SKarl Rupp . A - the matrix
270e4a0ef16SKarl Rupp 
27111a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
272e4a0ef16SKarl Rupp    MatXXXXSetPreallocation() paradigm instead of this routine directly.
27311a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
274e4a0ef16SKarl Rupp 
275e4a0ef16SKarl Rupp   Notes:
27611a5261eSBarry Smith   The AIJ format, also called
2772ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
278e4a0ef16SKarl Rupp   storage.  That is, the stored row and column indices can begin at
27920f4b53cSBarry Smith   either one (as in Fortran) or zero.
280e4a0ef16SKarl Rupp 
28120f4b53cSBarry Smith   Specify the preallocated storage with either `nz` or `nnz` (not both).
28220f4b53cSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
28320f4b53cSBarry Smith   allocation.
284e4a0ef16SKarl Rupp 
285e4a0ef16SKarl Rupp   Level: intermediate
286e4a0ef16SKarl Rupp 
287fe59aa6dSJacob Faibussowitsch .seealso: `MATSEQAIJVIENNACL`, `MatCreate()`, `MatCreateAIJ()`, `MatCreateAIJCUSPARSE()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
288e4a0ef16SKarl Rupp @*/
289d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
290d71ae5a4SJacob Faibussowitsch {
291e4a0ef16SKarl Rupp   PetscFunctionBegin;
2929566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
2939566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
2949566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJVIENNACL));
2959566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
2963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
297e4a0ef16SKarl Rupp }
298e4a0ef16SKarl Rupp 
29966976f2fSJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
300d71ae5a4SJacob Faibussowitsch {
301e4a0ef16SKarl Rupp   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL *)A->spptr;
302e4a0ef16SKarl Rupp 
303e4a0ef16SKarl Rupp   PetscFunctionBegin;
304e4a0ef16SKarl Rupp   try {
3056447cd05SKarl Rupp     if (viennaclcontainer) {
3066447cd05SKarl Rupp       delete viennaclcontainer->tempvec;
3076447cd05SKarl Rupp       delete viennaclcontainer->mat;
3086447cd05SKarl Rupp       delete viennaclcontainer->compressed_mat;
309e4a0ef16SKarl Rupp       delete viennaclcontainer;
3106447cd05SKarl Rupp     }
311c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
312d71ae5a4SJacob Faibussowitsch   } catch (std::exception const &ex) {
313d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
314d71ae5a4SJacob Faibussowitsch   }
3158713a8baSPatrick Sanan 
3169566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", NULL));
3179566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", NULL));
3189566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", NULL));
3198713a8baSPatrick Sanan 
320e4a0ef16SKarl Rupp   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
321e4a0ef16SKarl Rupp   A->spptr = 0;
3229566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
3233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
324e4a0ef16SKarl Rupp }
325e4a0ef16SKarl Rupp 
326d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
327d71ae5a4SJacob Faibussowitsch {
328e4a0ef16SKarl Rupp   PetscFunctionBegin;
3299566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
3309566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJViennaCL(B, MATSEQAIJVIENNACL, MAT_INPLACE_MATRIX, &B));
3313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3328713a8baSPatrick Sanan }
3338713a8baSPatrick Sanan 
334b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat, PetscBool);
335d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A, MatDuplicateOption cpvalues, Mat *B)
336d71ae5a4SJacob Faibussowitsch {
337c3cca76eSKarl Rupp   Mat C;
338c3cca76eSKarl Rupp 
339c3cca76eSKarl Rupp   PetscFunctionBegin;
3409566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
341c3cca76eSKarl Rupp   C = *B;
342c3cca76eSKarl Rupp 
3439566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
34492f9df4aSRichard Tran Mills   C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
345c3cca76eSKarl Rupp 
346c3cca76eSKarl Rupp   C->spptr                                         = new Mat_SeqAIJViennaCL();
347c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->tempvec        = NULL;
348c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->mat            = NULL;
349c3cca76eSKarl Rupp   ((Mat_SeqAIJViennaCL *)C->spptr)->compressed_mat = NULL;
350c3cca76eSKarl Rupp 
3519566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)C, MATSEQAIJVIENNACL));
352c3cca76eSKarl Rupp 
353c70f7ee4SJunchao Zhang   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
354c3cca76eSKarl Rupp 
355c3cca76eSKarl Rupp   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
3569566063dSJacob Faibussowitsch   if (C->assembled) PetscCall(MatViennaCLCopyToGPU(C));
3573ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
358c3cca76eSKarl Rupp }
359c3cca76eSKarl Rupp 
360d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A, PetscScalar *array[])
361d71ae5a4SJacob Faibussowitsch {
362f38c1e66SStefano Zampini   PetscFunctionBegin;
3639566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
364c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
3653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
366f38c1e66SStefano Zampini }
367f38c1e66SStefano Zampini 
368d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A, PetscScalar *array[])
369d71ae5a4SJacob Faibussowitsch {
370f38c1e66SStefano Zampini   PetscFunctionBegin;
371c1a4f3baSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
372c1a4f3baSJunchao Zhang   *array         = NULL;
3733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
374c1a4f3baSJunchao Zhang }
375c1a4f3baSJunchao Zhang 
376d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[])
377d71ae5a4SJacob Faibussowitsch {
378c1a4f3baSJunchao Zhang   PetscFunctionBegin;
3799566063dSJacob Faibussowitsch   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
380c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
3813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
382c1a4f3baSJunchao Zhang }
383c1a4f3baSJunchao Zhang 
384d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[])
385d71ae5a4SJacob Faibussowitsch {
386c1a4f3baSJunchao Zhang   PetscFunctionBegin;
387c1a4f3baSJunchao Zhang   *array = NULL;
388c1a4f3baSJunchao Zhang   /* No A->offloadmask = PETSC_OFFLOAD_CPU since if A->offloadmask was PETSC_OFFLOAD_BOTH, it is still BOTH */
3893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
390c1a4f3baSJunchao Zhang }
391c1a4f3baSJunchao Zhang 
392d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[])
393d71ae5a4SJacob Faibussowitsch {
394c1a4f3baSJunchao Zhang   PetscFunctionBegin;
395c1a4f3baSJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
3963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
397c1a4f3baSJunchao Zhang }
398c1a4f3baSJunchao Zhang 
399d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[])
400d71ae5a4SJacob Faibussowitsch {
401c1a4f3baSJunchao Zhang   PetscFunctionBegin;
402c1a4f3baSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
403f38c1e66SStefano Zampini   *array         = NULL;
4043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
405f38c1e66SStefano Zampini }
406f38c1e66SStefano Zampini 
407d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A, PetscBool flg)
408d71ae5a4SJacob Faibussowitsch {
409c1a4f3baSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
410f38c1e66SStefano Zampini 
411e7e92044SBarry Smith   PetscFunctionBegin;
412b470e4b4SRichard Tran Mills   A->boundtocpu = flg;
413*4d12350bSJunchao Zhang   if (flg && a->inode.size_csr) {
414c1a4f3baSJunchao Zhang     a->inode.use = PETSC_TRUE;
415ea500dcfSRichard Tran Mills   } else {
416c1a4f3baSJunchao Zhang     a->inode.use = PETSC_FALSE;
417ea500dcfSRichard Tran Mills   }
418e7e92044SBarry Smith   if (flg) {
419f38c1e66SStefano Zampini     /* make sure we have an up-to-date copy on the CPU */
4209566063dSJacob Faibussowitsch     PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
421e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJ;
422e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJ;
423e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
424e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJ;
4259566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
426e7e92044SBarry Smith   } else {
427e7e92044SBarry Smith     A->ops->mult        = MatMult_SeqAIJViennaCL;
428e7e92044SBarry Smith     A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
429e7e92044SBarry Smith     A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
430e7e92044SBarry Smith     A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
431e7e92044SBarry Smith     A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
432c1a4f3baSJunchao Zhang 
433c1a4f3baSJunchao Zhang     a->ops->getarray          = MatSeqAIJGetArray_SeqAIJViennaCL;
434c1a4f3baSJunchao Zhang     a->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJViennaCL;
435c1a4f3baSJunchao Zhang     a->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJViennaCL;
436c1a4f3baSJunchao Zhang     a->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJViennaCL;
437c1a4f3baSJunchao Zhang     a->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJViennaCL;
438c1a4f3baSJunchao Zhang     a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL;
439e7e92044SBarry Smith   }
4403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
441e7e92044SBarry Smith }
442e7e92044SBarry Smith 
443d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat)
444d71ae5a4SJacob Faibussowitsch {
4458713a8baSPatrick Sanan   Mat B;
4468713a8baSPatrick Sanan 
4478713a8baSPatrick Sanan   PetscFunctionBegin;
4485f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_REUSE_MATRIX, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
4498713a8baSPatrick Sanan 
4509566063dSJacob Faibussowitsch   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4518713a8baSPatrick Sanan 
4528713a8baSPatrick Sanan   B = *newmat;
4538713a8baSPatrick Sanan 
454e4a0ef16SKarl Rupp   B->spptr = new Mat_SeqAIJViennaCL();
455e4a0ef16SKarl Rupp 
456a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->tempvec        = NULL;
457a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->mat            = NULL;
458a3430c56SKarl Rupp   ((Mat_SeqAIJViennaCL *)B->spptr)->compressed_mat = NULL;
459e4a0ef16SKarl Rupp 
4609566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
46192f9df4aSRichard Tran Mills   A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
462e4a0ef16SKarl Rupp 
4639566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJVIENNACL));
4649566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
4659566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECVIENNACL, &B->defaultvectype));
466e4a0ef16SKarl Rupp 
4679566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", MatConvert_SeqAIJ_SeqAIJViennaCL));
4689566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", MatProductSetFromOptions_SeqAIJ_SeqDense));
4699566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", MatProductSetFromOptions_SeqAIJ));
4708713a8baSPatrick Sanan 
471c70f7ee4SJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4728713a8baSPatrick Sanan 
4738713a8baSPatrick Sanan   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
4749566063dSJacob Faibussowitsch   if (B->assembled) PetscCall(MatViennaCLCopyToGPU(B));
4753ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
476e4a0ef16SKarl Rupp }
477e4a0ef16SKarl Rupp 
4783ca39a21SBarry Smith /*MC
479e4a0ef16SKarl Rupp    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
480e4a0ef16SKarl Rupp 
48115229ffcSPierre Jolivet    A matrix type whose data resides on GPUs. These matrices are in CSR format by
482e4a0ef16SKarl Rupp    default. All matrix calculations are performed using the ViennaCL library.
483e4a0ef16SKarl Rupp 
484e4a0ef16SKarl Rupp    Options Database Keys:
48511a5261eSBarry Smith +  -mat_type aijviennacl - sets the matrix type to `MATSEQAIJVIENNACL` during a call to `MatSetFromOptions()
48611a5261eSBarry Smith .  -mat_viennacl_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
48711a5261eSBarry Smith -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
488e4a0ef16SKarl Rupp 
489e4a0ef16SKarl Rupp   Level: beginner
490e4a0ef16SKarl Rupp 
491db781477SPatrick Sanan .seealso: `MatCreateSeqAIJViennaCL()`, `MATAIJVIENNACL`, `MatCreateAIJViennaCL()`
492e4a0ef16SKarl Rupp M*/
493e4a0ef16SKarl Rupp 
494d1f0640dSPierre Jolivet PETSC_INTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
495d71ae5a4SJacob Faibussowitsch {
49672367587SKarl Rupp   PetscFunctionBegin;
4979566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU, MatGetFactor_seqaij_petsc));
4989566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaij_petsc));
4999566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU, MatGetFactor_seqaij_petsc));
5009566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC, MatGetFactor_seqaij_petsc));
5013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
50272367587SKarl Rupp }
503