xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 72a35d6d63ea98b60d8ef4c5aaefda5ff933c0fe)
1 
2 /*
3     Defines the basic matrix operations for the AIJ (compressed row)
4   matrix storage format.
5 */
6 
7 #include <petscconf.h>
8 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10 #include <petscbt.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 
14 #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15 
16 #include <algorithm>
17 #include <vector>
18 #include <string>
19 
20 #include "viennacl/linalg/prod.hpp"
21 
22 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
23 PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat, MatFactorType, Mat *);
24 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
25 
26 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
27 {
28   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
29   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
30 
31   PetscFunctionBegin;
32   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0
33     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
34       PetscCall(PetscLogEventBegin(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
35 
36       try {
37         if (a->compressedrow.use) {
38           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
39 
40           // Since PetscInt is different from cl_uint, we have to convert:
41           viennacl::backend::mem_handle dummy;
42 
43           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
44           row_buffer.raw_resize(dummy, a->compressedrow.nrows + 1);
45           for (PetscInt i = 0; i <= a->compressedrow.nrows; ++i) row_buffer.set(i, (a->compressedrow.i)[i]);
46 
47           viennacl::backend::typesafe_host_array<unsigned int> row_indices;
48           row_indices.raw_resize(dummy, a->compressedrow.nrows);
49           for (PetscInt i = 0; i < a->compressedrow.nrows; ++i) row_indices.set(i, (a->compressedrow.rindex)[i]);
50 
51           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
52           col_buffer.raw_resize(dummy, a->nz);
53           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
54 
55           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
56           PetscCall(PetscLogCpuToGpu(((2 * a->compressedrow.nrows) + 1 + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
57         } else {
58           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
59 
60           // Since PetscInt is in general different from cl_uint, we have to convert:
61           viennacl::backend::mem_handle dummy;
62 
63           viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
64           row_buffer.raw_resize(dummy, A->rmap->n + 1);
65           for (PetscInt i = 0; i <= A->rmap->n; ++i) row_buffer.set(i, (a->i)[i]);
66 
67           viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
68           col_buffer.raw_resize(dummy, a->nz);
69           for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]);
70 
71           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
72           PetscCall(PetscLogCpuToGpu(((A->rmap->n + 1) + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar)));
73         }
74         ViennaCLWaitForGPU();
75       } catch (std::exception const &ex) {
76         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
77       }
78 
79       // Create temporary vector for v += A*x:
80       if (viennaclstruct->tempvec) {
81         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
82           delete (ViennaCLVector *)viennaclstruct->tempvec;
83           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
84         } else {
85           viennaclstruct->tempvec->clear();
86         }
87       } else {
88         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
89       }
90 
91       A->offloadmask = PETSC_OFFLOAD_BOTH;
92 
93       PetscCall(PetscLogEventEnd(MAT_ViennaCLCopyToGPU, A, 0, 0, 0));
94     }
95   }
96   PetscFunctionReturn(PETSC_SUCCESS);
97 }
98 
99 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
100 {
101   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
102   Mat_SeqAIJ         *a              = (Mat_SeqAIJ *)A->data;
103   PetscInt            m              = A->rmap->n;
104 
105   PetscFunctionBegin;
106   if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(PETSC_SUCCESS);
107   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) {
108     try {
109       PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
110       PetscCheck((PetscInt)Agpu->size1() == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "GPU matrix has %lu rows, should be %" PetscInt_FMT, Agpu->size1(), m);
111       a->nz           = Agpu->nnz();
112       a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
113       A->preallocated = PETSC_TRUE;
114       if (a->singlemalloc) {
115         if (a->a) PetscCall(PetscFree3(a->a, a->j, a->i));
116       } else {
117         if (a->i) PetscCall(PetscFree(a->i));
118         if (a->j) PetscCall(PetscFree(a->j));
119         if (a->a) PetscCall(PetscFree(a->a));
120       }
121       PetscCall(PetscMalloc3(a->nz, &a->a, a->nz, &a->j, m + 1, &a->i));
122 
123       a->singlemalloc = PETSC_TRUE;
124 
125       /* Setup row lengths */
126       PetscCall(PetscFree(a->imax));
127       PetscCall(PetscFree(a->ilen));
128       PetscCall(PetscMalloc1(m, &a->imax));
129       PetscCall(PetscMalloc1(m, &a->ilen));
130 
131       /* Copy data back from GPU */
132       viennacl::backend::typesafe_host_array<unsigned int> row_buffer;
133       row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
134 
135       // copy row array
136       viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
137       (a->i)[0] = row_buffer[0];
138       for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
139         (a->i)[i + 1] = row_buffer[i + 1];
140         a->imax[i] = a->ilen[i] = a->i[i + 1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
141       }
142 
143       // copy column indices
144       viennacl::backend::typesafe_host_array<unsigned int> col_buffer;
145       col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
146       viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
147       for (PetscInt i = 0; i < (PetscInt)Agpu->nnz(); ++i) (a->j)[i] = col_buffer[i];
148 
149       // copy nonzero entries directly to destination (no conversion required)
150       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
151 
152       PetscCall(PetscLogGpuToCpu(row_buffer.raw_size() + col_buffer.raw_size() + (Agpu->nnz() * sizeof(PetscScalar))));
153       ViennaCLWaitForGPU();
154       /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
155     } catch (std::exception const &ex) {
156       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
157     }
158   } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
159     PetscFunctionReturn(PETSC_SUCCESS);
160   } else {
161     if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(PETSC_SUCCESS);
162 
163     PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
164     if (!Agpu) {
165       viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar) * viennaclstruct->mat->nnz(), a->a);
166     } else {
167       viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a);
168     }
169   }
170   A->offloadmask = PETSC_OFFLOAD_BOTH;
171   /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
172   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
173   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
174   PetscFunctionReturn(PETSC_SUCCESS);
175 }
176 
177 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A, Vec xx, Vec yy)
178 {
179   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
180   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
181   const ViennaCLVector *xgpu           = NULL;
182   ViennaCLVector       *ygpu           = NULL;
183 
184   PetscFunctionBegin;
185   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
186   PetscCall(MatViennaCLCopyToGPU(A));
187   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
188     PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
189     PetscCall(VecViennaCLGetArrayWrite(yy, &ygpu));
190     PetscCall(PetscLogGpuTimeBegin());
191     try {
192       if (a->compressedrow.use) {
193         *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
194       } else {
195         *ygpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
196       }
197       ViennaCLWaitForGPU();
198     } catch (std::exception const &ex) {
199       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
200     }
201     PetscCall(PetscLogGpuTimeEnd());
202     PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
203     PetscCall(VecViennaCLRestoreArrayWrite(yy, &ygpu));
204     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
205   } else {
206     PetscCall(VecSet_SeqViennaCL(yy, 0));
207   }
208   PetscFunctionReturn(PETSC_SUCCESS);
209 }
210 
211 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A, Vec xx, Vec yy, Vec zz)
212 {
213   Mat_SeqAIJ           *a              = (Mat_SeqAIJ *)A->data;
214   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr;
215   const ViennaCLVector *xgpu = NULL, *ygpu = NULL;
216   ViennaCLVector       *zgpu = NULL;
217 
218   PetscFunctionBegin;
219   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
220   PetscCall(MatViennaCLCopyToGPU(A));
221   if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
222     try {
223       PetscCall(VecViennaCLGetArrayRead(xx, &xgpu));
224       PetscCall(VecViennaCLGetArrayRead(yy, &ygpu));
225       PetscCall(VecViennaCLGetArrayWrite(zz, &zgpu));
226       PetscCall(PetscLogGpuTimeBegin());
227       if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
228       else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
229       if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec;
230       else *zgpu += *viennaclstruct->tempvec;
231       ViennaCLWaitForGPU();
232       PetscCall(PetscLogGpuTimeEnd());
233       PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu));
234       PetscCall(VecViennaCLRestoreArrayRead(yy, &ygpu));
235       PetscCall(VecViennaCLRestoreArrayWrite(zz, &zgpu));
236 
237     } catch (std::exception const &ex) {
238       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
239     }
240     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
241   } else {
242     PetscCall(VecCopy_SeqViennaCL(yy, zz));
243   }
244   PetscFunctionReturn(PETSC_SUCCESS);
245 }
246 
247 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A, MatAssemblyType mode)
248 {
249   PetscFunctionBegin;
250   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
251   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
252   if (!A->boundtocpu) PetscCall(MatViennaCLCopyToGPU(A));
253   PetscFunctionReturn(PETSC_SUCCESS);
254 }
255 
256 /* --------------------------------------------------------------------------------*/
257 /*@C
258    MatCreateSeqAIJViennaCL - Creates a sparse matrix in `MATSEQAIJVIENNACL` (compressed row) format
259    (the default parallel PETSc format).  This matrix will ultimately be pushed down
260    to GPUs and use the ViennaCL library for calculations. For good matrix
261    assembly performance the user should preallocate the matrix storage by setting
262    the parameter nz (or the array nnz).  By setting these parameters accurately,
263    performance during matrix assembly can be increased substantially.
264 
265    Collective
266 
267    Input Parameters:
268 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
269 .  m - number of rows
270 .  n - number of columns
271 .  nz - number of nonzeros per row (same for all rows)
272 -  nnz - array containing the number of nonzeros in the various rows
273          (possibly different for each row) or NULL
274 
275    Output Parameter:
276 .  A - the matrix
277 
278    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
279    MatXXXXSetPreallocation() paradigm instead of this routine directly.
280    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
281 
282    Notes:
283    If nnz is given then nz is ignored
284 
285    The AIJ format, also called
286    compressed row storage, is fully compatible with standard Fortran
287    storage.  That is, the stored row and column indices can begin at
288    either one (as in Fortran) or zero.  See the users' manual for details.
289 
290    Specify the preallocated storage with either nz or nnz (not both).
291    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
292    allocation.  For large problems you MUST preallocate memory or you
293    will get TERRIBLE performance, see the users' manual chapter on matrices.
294 
295    Level: intermediate
296 
297 .seealso: `MATSEQAIJVIENNACL`, `MatCreate()`, `MatCreateAIJ()`, `MatCreateAIJCUSPARSE()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
298 @*/
299 PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
300 {
301   PetscFunctionBegin;
302   PetscCall(MatCreate(comm, A));
303   PetscCall(MatSetSizes(*A, m, n, m, n));
304   PetscCall(MatSetType(*A, MATSEQAIJVIENNACL));
305   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
306   PetscFunctionReturn(PETSC_SUCCESS);
307 }
308 
309 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
310 {
311   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL *)A->spptr;
312 
313   PetscFunctionBegin;
314   try {
315     if (viennaclcontainer) {
316       delete viennaclcontainer->tempvec;
317       delete viennaclcontainer->mat;
318       delete viennaclcontainer->compressed_mat;
319       delete viennaclcontainer;
320     }
321     A->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
322   } catch (std::exception const &ex) {
323     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
324   }
325 
326   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", NULL));
327   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", NULL));
328   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", NULL));
329 
330   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
331   A->spptr = 0;
332   PetscCall(MatDestroy_SeqAIJ(A));
333   PetscFunctionReturn(PETSC_SUCCESS);
334 }
335 
336 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
337 {
338   PetscFunctionBegin;
339   PetscCall(MatCreate_SeqAIJ(B));
340   PetscCall(MatConvert_SeqAIJ_SeqAIJViennaCL(B, MATSEQAIJVIENNACL, MAT_INPLACE_MATRIX, &B));
341   PetscFunctionReturn(PETSC_SUCCESS);
342 }
343 
344 static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat, PetscBool);
345 static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A, MatDuplicateOption cpvalues, Mat *B)
346 {
347   Mat C;
348 
349   PetscFunctionBegin;
350   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
351   C = *B;
352 
353   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
354   C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
355 
356   C->spptr                                         = new Mat_SeqAIJViennaCL();
357   ((Mat_SeqAIJViennaCL *)C->spptr)->tempvec        = NULL;
358   ((Mat_SeqAIJViennaCL *)C->spptr)->mat            = NULL;
359   ((Mat_SeqAIJViennaCL *)C->spptr)->compressed_mat = NULL;
360 
361   PetscCall(PetscObjectChangeTypeName((PetscObject)C, MATSEQAIJVIENNACL));
362 
363   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
364 
365   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
366   if (C->assembled) PetscCall(MatViennaCLCopyToGPU(C));
367   PetscFunctionReturn(PETSC_SUCCESS);
368 }
369 
370 static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A, PetscScalar *array[])
371 {
372   PetscFunctionBegin;
373   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
374   *array = ((Mat_SeqAIJ *)A->data)->a;
375   PetscFunctionReturn(PETSC_SUCCESS);
376 }
377 
378 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A, PetscScalar *array[])
379 {
380   PetscFunctionBegin;
381   A->offloadmask = PETSC_OFFLOAD_CPU;
382   *array         = NULL;
383   PetscFunctionReturn(PETSC_SUCCESS);
384 }
385 
386 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[])
387 {
388   PetscFunctionBegin;
389   PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
390   *array = ((Mat_SeqAIJ *)A->data)->a;
391   PetscFunctionReturn(PETSC_SUCCESS);
392 }
393 
394 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[])
395 {
396   PetscFunctionBegin;
397   *array = NULL;
398   /* No A->offloadmask = PETSC_OFFLOAD_CPU since if A->offloadmask was PETSC_OFFLOAD_BOTH, it is still BOTH */
399   PetscFunctionReturn(PETSC_SUCCESS);
400 }
401 
402 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[])
403 {
404   PetscFunctionBegin;
405   *array = ((Mat_SeqAIJ *)A->data)->a;
406   PetscFunctionReturn(PETSC_SUCCESS);
407 }
408 
409 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[])
410 {
411   PetscFunctionBegin;
412   A->offloadmask = PETSC_OFFLOAD_CPU;
413   *array         = NULL;
414   PetscFunctionReturn(PETSC_SUCCESS);
415 }
416 
417 static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A, PetscBool flg)
418 {
419   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
420 
421   PetscFunctionBegin;
422   A->boundtocpu = flg;
423   if (flg && a->inode.size) {
424     a->inode.use = PETSC_TRUE;
425   } else {
426     a->inode.use = PETSC_FALSE;
427   }
428   if (flg) {
429     /* make sure we have an up-to-date copy on the CPU */
430     PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL));
431     A->ops->mult        = MatMult_SeqAIJ;
432     A->ops->multadd     = MatMultAdd_SeqAIJ;
433     A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
434     A->ops->duplicate   = MatDuplicate_SeqAIJ;
435     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
436   } else {
437     A->ops->mult        = MatMult_SeqAIJViennaCL;
438     A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
439     A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
440     A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
441     A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
442 
443     a->ops->getarray          = MatSeqAIJGetArray_SeqAIJViennaCL;
444     a->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJViennaCL;
445     a->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJViennaCL;
446     a->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJViennaCL;
447     a->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJViennaCL;
448     a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL;
449   }
450   PetscFunctionReturn(PETSC_SUCCESS);
451 }
452 
453 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat)
454 {
455   Mat B;
456 
457   PetscFunctionBegin;
458   PetscCheck(reuse != MAT_REUSE_MATRIX, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");
459 
460   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
461 
462   B = *newmat;
463 
464   B->spptr = new Mat_SeqAIJViennaCL();
465 
466   ((Mat_SeqAIJViennaCL *)B->spptr)->tempvec        = NULL;
467   ((Mat_SeqAIJViennaCL *)B->spptr)->mat            = NULL;
468   ((Mat_SeqAIJViennaCL *)B->spptr)->compressed_mat = NULL;
469 
470   PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE));
471   A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;
472 
473   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJVIENNACL));
474   PetscCall(PetscFree(B->defaultvectype));
475   PetscCall(PetscStrallocpy(VECVIENNACL, &B->defaultvectype));
476 
477   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", MatConvert_SeqAIJ_SeqAIJViennaCL));
478   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", MatProductSetFromOptions_SeqAIJ_SeqDense));
479   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", MatProductSetFromOptions_SeqAIJ));
480 
481   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
482 
483   /* If the source matrix is already assembled, copy the destination matrix to the GPU */
484   if (B->assembled) PetscCall(MatViennaCLCopyToGPU(B));
485   PetscFunctionReturn(PETSC_SUCCESS);
486 }
487 
488 /*MC
489    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
490 
491    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
492    default. All matrix calculations are performed using the ViennaCL library.
493 
494    Options Database Keys:
495 +  -mat_type aijviennacl - sets the matrix type to `MATSEQAIJVIENNACL` during a call to `MatSetFromOptions()
496 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
497 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`.
498 
499   Level: beginner
500 
501 .seealso: `MatCreateSeqAIJViennaCL()`, `MATAIJVIENNACL`, `MatCreateAIJViennaCL()`
502 M*/
503 
504 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
505 {
506   PetscFunctionBegin;
507   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU, MatGetFactor_seqaij_petsc));
508   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaij_petsc));
509   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU, MatGetFactor_seqaij_petsc));
510   PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC, MatGetFactor_seqaij_petsc));
511   PetscFunctionReturn(PETSC_SUCCESS);
512 }
513