xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 7f296bb328fcd4c99f2da7bfe8ba7ed8a4ebceee)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include <petscconf.h>
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <petscbt.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 
13 #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
14 
15 #include <algorithm>
16 #include <vector>
17 #include <string>
18 
19 #include "viennacl/linalg/prod.hpp"
20 
21 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
22 PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat, MatFactorType, Mat *);
23 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
24 
25 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26 {
27   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
28   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
29 
30   PetscFunctionBegin;
31   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
32     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
33       PetscCall(PetscLogEventBegin(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
34 
35       try {
36         if (a->compressedrow.use) {
37           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
38 
39           // Since PetscInt is different from cl_uint, we have to convert:
40           viennacl::backend::mem_handle dummy;
41 
42           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
43           row_buffer.raw_resize(dummy, a->compressedrow.nrows + 1);
44           for (PetscInt i = 0; i <= a->compressedrow.nrows; ++i) row_buffer.set(i, (a->compressedrow.i)[i]);
45 
46           viennacl::backend::typesafe_host_array<unsigned int> row_indices;
47           row_indices.raw_resize(dummy, a->compressedrow.nrows);
48           for (PetscInt i = 0; i < a->compressedrow.nrows; ++i) row_indices.set(i, (a->compressedrow.rindex)[i]);
49 
50           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
51           col_buffer.raw_resize(dummy, a->nz);
52           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
53 
54           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
55           PetscCall(PetscLogCpuToGpu(((2 * a->compressedrow.nrows) + 1 + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
56         } else {
57           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
58 
59           // Since PetscInt is in general different from cl_uint, we have to convert:
60           viennacl::backend::mem_handle dummy;
61 
62           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
63           row_buffer.raw_resize(dummy, A->rmap->n + 1);
64           for (PetscInt i = 0; i <= A->rmap->n; ++i) row_buffer.set(i, (a->i)[i]);
65 
66           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
67           col_buffer.raw_resize(dummy, a->nz);
68           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
69 
70           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
71           PetscCall(PetscLogCpuToGpu(((A->rmap->n + 1) + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
72         }
73         ViennaCLWaitForGPU();
74       } catch (std::exception const &ex) {
75         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
76       }
77 
78       // Create temporary vector for v += A*x:
79       if (viennaclstruct->tempvec) {
80         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
81           delete (ViennaCLVector *)viennaclstruct->tempvec;
82           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
83         } else {
84           viennaclstruct->tempvec->clear();
85         }
86       } else {
87         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
88       }
89 
90       A->offloadmask = PETSC_OFFLOAD_BOTH;
91 
92       PetscCall(PetscLogEventEnd(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
93     }
94   }
95   PetscFunctionReturn(PETSC_SUCCESS);
96 }
97 
98 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
99 {
100   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
101   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
102   PetscInt            m              = A->rmap->n;
103 
104   PetscFunctionBegin;
105   if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(PETSC_SUCCESS);
106   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) {
107     try {
108       PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
109       PetscCheck((PetscInt)Agpu->size1() == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "GPU matrix has %lu rows, should be %" PetscInt_FMT, Agpu->size1(), m);
110       a->nz           = Agpu->nnz();
111       a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
112       A->preallocated = PETSC_TRUE;
113       if (a->free_a) PetscCall(PetscShmgetDeallocateArray((void **)a->a));
114       if (a->free_ij) PetscCall(PetscShmgetDeallocateArray((void **)a->j));
115       if (a->free_ij) PetscCall(PetscShmgetDeallocateArray((void **)a->i));
116       PetscCall(PetscShmgetAllocateArray(a->nz, sizeof(PetscScalar), (void **)&a->a));
117       PetscCall(PetscShmgetAllocateArray(a->nz, sizeof(PetscInt), (void **)&a->j));
118       PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&a->i));
119       a->free_a  = PETSC_TRUE;
120       a->free_ij = PETSC_TRUE;
121 
122       /* Setup row lengths */
123       PetscCall(PetscFree(a->imax));
124       PetscCall(PetscFree(a->ilen));
125       PetscCall(PetscMalloc1(m, &a->imax));
126       PetscCall(PetscMalloc1(m, &a->ilen));
127 
128       /* Copy data back from GPU */
129       viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
130       row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
131 
132       // copy row array
133       viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
134       (a->i)[0] = row_buffer[0];
135       for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
136         (a->i)[i + 1] = row_buffer[i + 1];
137         a->imax[i] = a->ilen[i] = a->i[i + 1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
138       }
139 
140       // copy column indices
141       viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
142       col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
143       viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
144       for (PetscInt i = 0; i < (PetscInt)Agpu->nnz(); ++i) (a->j)[i] = col_buffer[i];
145 
146       // copy nonzero entries directly to destination (no conversion required)
147       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
148 
149       PetscCall(PetscLogGpuToCpu(row_buffer.raw_size() + col_buffer.raw_size() + (Agpu->nnz() * sizeof(PetscScalar))));
150       ViennaCLWaitForGPU();
151       /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
152     } catch (std::exception const &ex) {
153       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
154     }
155   } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
156     PetscFunctionReturn(PETSC_SUCCESS);
157   } else {
158     if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(PETSC_SUCCESS);
159 
160     PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
161     if (!Agpu) {
162       viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar) * viennaclstruct->mat->nnz(), a->a);
163     } else {
164       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
165     }
166   }
167   A->offloadmask = PETSC_OFFLOAD_BOTH;
168   /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
169   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
170   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
171   PetscFunctionReturn(PETSC_SUCCESS);
172 }
173 
174 static PetscErrorCode MatMult_SeqAIJViennaCL(Mat A, Vec xx, Vec yy)
175 {
176   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
177   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
178   const ViennaCLVector *xgpu           = NULL;
179   ViennaCLVector       *ygpu           = NULL;
180 
181   PetscFunctionBegin;
182   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
183   PetscCall(MatViennaCLCopyToGPU(A));
184   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
185     PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
186     PetscCall(VecViennaCLGetArrayWrite(yy, &ygpu));
187     PetscCall(PetscLogGpuTimeBegin());
188     try {
189       if (a->compressedrow.use) {
190         *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
191       } else {
192         *ygpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
193       }
194       ViennaCLWaitForGPU();
195     } catch (std::exception const &ex) {
196       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
197     }
198     PetscCall(PetscLogGpuTimeEnd());
199     PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
200     PetscCall(VecViennaCLRestoreArrayWrite(yy, &ygpu));
201     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
202   } else {
203     PetscCall(VecSet_SeqViennaCL(yy, 0));
204   }
205   PetscFunctionReturn(PETSC_SUCCESS);
206 }
207 
208 static PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A, Vec xx, Vec yy, Vec zz)
209 {
210   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
211   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
212   const ViennaCLVector *xgpu = NULL, *ygpu = NULL;
213   ViennaCLVector       *zgpu = NULL;
214 
215   PetscFunctionBegin;
216   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
217   PetscCall(MatViennaCLCopyToGPU(A));
218   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
219     try {
220       PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
221       PetscCall(VecViennaCLGetArrayRead(yy, &ygpu));
222       PetscCall(VecViennaCLGetArrayWrite(zz, &zgpu));
223       PetscCall(PetscLogGpuTimeBegin());
224       if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
225       else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
226       if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec;
227       else *zgpu += *viennaclstruct->tempvec;
228       ViennaCLWaitForGPU();
229       PetscCall(PetscLogGpuTimeEnd());
230       PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
231       PetscCall(VecViennaCLRestoreArrayRead(yy, &ygpu));
232       PetscCall(VecViennaCLRestoreArrayWrite(zz, &zgpu));
233 
234     } catch (std::exception const &ex) {
235       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
236     }
237     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
238   } else {
239     PetscCall(VecCopy_SeqViennaCL(yy, zz));
240   }
241   PetscFunctionReturn(PETSC_SUCCESS);
242 }
243 
244 static PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A, MatAssemblyType mode)
245 {
246   PetscFunctionBegin;
247   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
248   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
249   if (!A->boundtocpu) PetscCall(MatViennaCLCopyToGPU(A));
250   PetscFunctionReturn(PETSC_SUCCESS);
251 }
252 
253 /* --------------------------------------------------------------------------------*/
254 /*@C
255   MatCreateSeqAIJViennaCL - Creates a sparse matrix in `MATSEQAIJVIENNACL` (compressed row) format
256   (the default parallel PETSc format).  This matrix will ultimately be pushed down
257   to GPUs and use the ViennaCL library for calculations.
258 
259   Collective
260 
261   Input Parameters:
262 + comm - MPI communicator, set to `PETSC_COMM_SELF`
263 . m    - number of rows
264 . n    - number of columns
265 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is set
266 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
267 
268   Output Parameter:
269 . A - the matrix
270 
271    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
272    MatXXXXSetPreallocation() paradigm instead of this routine directly.
273    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
274 
275   Notes:
276   The AIJ format, also called
277   compressed row storage, is fully compatible with standard Fortran
278   storage.  That is, the stored row and column indices can begin at
279   either one (as in Fortran) or zero.
280 
281   Specify the preallocated storage with either `nz` or `nnz` (not both).
282   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
283   allocation.
284 
285   Level: intermediate
286 
287 .seealso: `MATSEQAIJVIENNACL`, `MatCreate()`, `MatCreateAIJ()`, `MatCreateAIJCUSPARSE()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
288 @*/
289 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
290 {
291   PetscFunctionBegin;
292   PetscCall(MatCreate(comm, A));
293   PetscCall(MatSetSizes(*A, m, n, m, n));
294   PetscCall(MatSetType(*A, MATSEQAIJVIENNACL));
295   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
296   PetscFunctionReturn(PETSC_SUCCESS);
297 }
298 
299 static PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
300 {
301   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL *)A->spptr;
302 
303   PetscFunctionBegin;
304   try {
305     if (viennaclcontainer) {
306       delete viennaclcontainer->tempvec;
307       delete viennaclcontainer->mat;
308       delete viennaclcontainer->compressed_mat;
309       delete viennaclcontainer;
310     }
311     A->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
312   } catch (std::exception const &ex) {
313     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
314   }
315 
316   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", NULL));
317   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", NULL));
318   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", NULL));
319 
320   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
321   A->spptr = 0;
322   PetscCall(MatDestroy_SeqAIJ(A));
323   PetscFunctionReturn(PETSC_SUCCESS);
324 }
325 
326 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
327 {
328   PetscFunctionBegin;
329   PetscCall(MatCreate_SeqAIJ(B));
330   PetscCall(MatConvert_SeqAIJ_SeqAIJViennaCL(B, MATSEQAIJVIENNACL, MAT_INPLACE_MATRIX, &B));
331   PetscFunctionReturn(PETSC_SUCCESS);
332 }
333 
334 static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat, PetscBool);
335 static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A, MatDuplicateOption cpvalues, Mat *B)
336 {
337   Mat C;
338 
339   PetscFunctionBegin;
340   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
341   C = *B;
342 
343   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
344   C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
345 
346   C->spptr                                         = new Mat_SeqAIJViennaCL();
347   ((Mat_SeqAIJViennaCL *)C->spptr)->tempvec        = NULL;
348   ((Mat_SeqAIJViennaCL *)C->spptr)->mat            = NULL;
349   ((Mat_SeqAIJViennaCL *)C->spptr)->compressed_mat = NULL;
350 
351   PetscCall(PetscObjectChangeTypeName((PetscObject)C, MATSEQAIJVIENNACL));
352 
353   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
354 
355   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
356   if (C->assembled) PetscCall(MatViennaCLCopyToGPU(C));
357   PetscFunctionReturn(PETSC_SUCCESS);
358 }
359 
360 static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A, PetscScalar *array[])
361 {
362   PetscFunctionBegin;
363   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
364   *array = ((Mat_SeqAIJ *)A->data)->a;
365   PetscFunctionReturn(PETSC_SUCCESS);
366 }
367 
368 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A, PetscScalar *array[])
369 {
370   PetscFunctionBegin;
371   A->offloadmask = PETSC_OFFLOAD_CPU;
372   *array         = NULL;
373   PetscFunctionReturn(PETSC_SUCCESS);
374 }
375 
376 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[])
377 {
378   PetscFunctionBegin;
379   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
380   *array = ((Mat_SeqAIJ *)A->data)->a;
381   PetscFunctionReturn(PETSC_SUCCESS);
382 }
383 
384 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[])
385 {
386   PetscFunctionBegin;
387   *array = NULL;
388   /* No A->offloadmask = PETSC_OFFLOAD_CPU since if A->offloadmask was PETSC_OFFLOAD_BOTH, it is still BOTH */
389   PetscFunctionReturn(PETSC_SUCCESS);
390 }
391 
392 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[])
393 {
394   PetscFunctionBegin;
395   *array = ((Mat_SeqAIJ *)A->data)->a;
396   PetscFunctionReturn(PETSC_SUCCESS);
397 }
398 
399 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[])
400 {
401   PetscFunctionBegin;
402   A->offloadmask = PETSC_OFFLOAD_CPU;
403   *array         = NULL;
404   PetscFunctionReturn(PETSC_SUCCESS);
405 }
406 
407 static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A, PetscBool flg)
408 {
409   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
410 
411   PetscFunctionBegin;
412   A->boundtocpu = flg;
413   if (flg && a->inode.size) {
414     a->inode.use = PETSC_TRUE;
415   } else {
416     a->inode.use = PETSC_FALSE;
417   }
418   if (flg) {
419     /* make sure we have an up-to-date copy on the CPU */
420     PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
421     A->ops->mult        = MatMult_SeqAIJ;
422     A->ops->multadd     = MatMultAdd_SeqAIJ;
423     A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
424     A->ops->duplicate   = MatDuplicate_SeqAIJ;
425     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
426   } else {
427     A->ops->mult        = MatMult_SeqAIJViennaCL;
428     A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
429     A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
430     A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
431     A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
432 
433     a->ops->getarray          = MatSeqAIJGetArray_SeqAIJViennaCL;
434     a->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJViennaCL;
435     a->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJViennaCL;
436     a->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJViennaCL;
437     a->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJViennaCL;
438     a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL;
439   }
440   PetscFunctionReturn(PETSC_SUCCESS);
441 }
442 
443 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat)
444 {
445   Mat B;
446 
447   PetscFunctionBegin;
448   PetscCheck(reuse != MAT_REUSE_MATRIX, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
449 
450   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
451 
452   B = *newmat;
453 
454   B->spptr = new Mat_SeqAIJViennaCL();
455 
456   ((Mat_SeqAIJViennaCL *)B->spptr)->tempvec        = NULL;
457   ((Mat_SeqAIJViennaCL *)B->spptr)->mat            = NULL;
458   ((Mat_SeqAIJViennaCL *)B->spptr)->compressed_mat = NULL;
459 
460   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
461   A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
462 
463   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJVIENNACL));
464   PetscCall(PetscFree(B->defaultvectype));
465   PetscCall(PetscStrallocpy(VECVIENNACL, &B->defaultvectype));
466 
467   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", MatConvert_SeqAIJ_SeqAIJViennaCL));
468   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", MatProductSetFromOptions_SeqAIJ_SeqDense));
469   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", MatProductSetFromOptions_SeqAIJ));
470 
471   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
472 
473   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
474   if (B->assembled) PetscCall(MatViennaCLCopyToGPU(B));
475   PetscFunctionReturn(PETSC_SUCCESS);
476 }
477 
478 /*MC
479    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
480 
481    A matrix type whose data resides on GPUs. These matrices are in CSR format by
482    default. All matrix calculations are performed using the ViennaCL library.
483 
484    Options Database Keys:
485 +  -mat_type aijviennacl - sets the matrix type to `MATSEQAIJVIENNACL` during a call to `MatSetFromOptions()
486 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
487 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
488 
489   Level: beginner
490 
491 .seealso: `MatCreateSeqAIJViennaCL()`, `MATAIJVIENNACL`, `MatCreateAIJViennaCL()`
492 M*/
493 
494 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
495 {
496   PetscFunctionBegin;
497   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU, MatGetFactor_seqaij_petsc));
498   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaij_petsc));
499   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU, MatGetFactor_seqaij_petsc));
500   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC, MatGetFactor_seqaij_petsc));
501   PetscFunctionReturn(PETSC_SUCCESS);
502 }
503