xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 023073b3e9f5f67a417c10d95dd9f12e62bc1d2b)
1 
2 
3 /*
4     Defines the basic matrix operations for the AIJ (compressed row)
5   matrix storage format.
6 */
7 
8 #include "petscconf.h"
9 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
10 #include "petscbt.h"
11 #include "../src/vec/vec/impls/dvecimpl.h"
12 #include "petsc-private/vecimpl.h"
13 
14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h"
15 
16 
17 #include <algorithm>
18 #include <vector>
19 #include <string>
20 
21 #include "viennacl/linalg/prod.hpp"
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatViennaCLCopyToGPU"
25 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26 {
27 
28   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30   PetscInt           *ii;
31   PetscErrorCode     ierr;
32 
33 
34   PetscFunctionBegin;
35   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
36     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
37       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
38 
39       try {
40         viennaclstruct->mat = new ViennaCLAIJMatrix();
41         if (a->compressedrow.use) {
42           ii = a->compressedrow.i;
43 
44           viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz);
45 
46           // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications)
47           //       Cannot be reasonably supported in ViennaCL 1.4.x (custom kernels required), hence postponing until there is support for compressed CSR
48 
49           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
50         } else {
51 
52           // Since PetscInt is in general different from cl_uint, we have to convert:
53           viennacl::backend::mem_handle dummy;
54 
55           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
56           for (PetscInt i=0; i<=A->rmap->n; ++i)
57             row_buffer.set(i, (a->i)[i]);
58 
59           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
60           for (PetscInt i=0; i<a->nz; ++i)
61             col_buffer.set(i, (a->j)[i]);
62 
63           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
64         }
65       } catch(std::exception const & ex) {
66         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
67       }
68 
69       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
70 
71       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
72     }
73   }
74   PetscFunctionReturn(0);
75 }
76 
77 #undef __FUNCT__
78 #define __FUNCT__ "MatViennaCLCopyFromGPU"
79 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
80 {
81   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
82   PetscInt           m               = A->rmap->n;
83   PetscErrorCode     ierr;
84 
85 
86   PetscFunctionBegin;
87   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
88     try {
89       if (a->compressedrow.use) {
90         SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
91       } else {
92 
93         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
94         a->nz           = Agpu->nnz();
95         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
96         A->preallocated = PETSC_TRUE;
97         if (a->singlemalloc) {
98           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
99         } else {
100           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
101           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
102           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
103         }
104         ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
105         ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
106 
107         a->singlemalloc = PETSC_TRUE;
108 
109         /* Setup row lengths */
110         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
111         ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
112         ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
113 
114         /* Copy data back from GPU */
115         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
116 
117         // copy row array
118         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
119         (a->i)[0] = row_buffer[0];
120         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
121           (a->i)[i+1] = row_buffer[i+1];
122           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
123         }
124 
125         // copy column indices
126         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
127         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
128         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
129           (a->j)[i] = col_buffer[i];
130 
131         // copy nonzero entries directly to destination (no conversion required)
132         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
133 
134         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
135       }
136     } catch(std::exception const & ex) {
137       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
138     }
139 
140     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
141     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
142     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
143 
144     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
145   } else {
146     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
147   }
148   PetscFunctionReturn(0);
149 }
150 
151 #undef __FUNCT__
152 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL"
153 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
154 {
155   PetscErrorCode ierr;
156 
157   PetscFunctionBegin;
158   if (right) {
159     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
160     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
161     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
162     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
163     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
164   }
165   if (left) {
166     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
167     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
168     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
169     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
170     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
171   }
172   PetscFunctionReturn(0);
173 }
174 
175 #undef __FUNCT__
176 #define __FUNCT__ "MatMult_SeqAIJViennaCL"
177 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
178 {
179   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
180   PetscErrorCode       ierr;
181   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
182   const ViennaCLVector *xgpu=NULL;
183   ViennaCLVector       *ygpu=NULL;
184 
185   PetscFunctionBegin;
186   if (A->rmap->n > 0 && A->cmap->n > 0) {
187     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
188     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
189     try {
190       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
191     } catch (std::exception const & ex) {
192       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
193     }
194     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
195     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
196     ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
197   }
198   PetscFunctionReturn(0);
199 }
200 
201 
202 
203 #undef __FUNCT__
204 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
205 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
206 {
207   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
208   PetscErrorCode       ierr;
209   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
210   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
211   ViennaCLVector       *zgpu=NULL;
212 
213   PetscFunctionBegin;
214   if (A->rmap->n > 0 && A->cmap->n > 0) {
215     try {
216       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
217       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
218       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
219 
220       if (a->compressedrow.use) {
221           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
222       } else {
223         if (zz == xx || zz == yy) { //temporary required
224           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
225           *zgpu = *ygpu + temp;
226         } else {
227           *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
228           *zgpu += *ygpu;
229         }
230       }
231 
232       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
233       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
234       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
235 
236     } catch(std::exception const & ex) {
237       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
238     }
239     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
240   }
241   PetscFunctionReturn(0);
242 }
243 
244 #undef __FUNCT__
245 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
246 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
247 {
248   PetscErrorCode ierr;
249 
250   PetscFunctionBegin;
251   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
252   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
253   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
254   A->ops->mult    = MatMult_SeqAIJViennaCL;
255   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
256   PetscFunctionReturn(0);
257 }
258 
259 /* --------------------------------------------------------------------------------*/
260 #undef __FUNCT__
261 #define __FUNCT__ "MatCreateSeqAIJViennaCL"
262 /*@
263    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
264    (the default parallel PETSc format).  This matrix will ultimately be pushed down
265    to GPUs and use the ViennaCL library for calculations. For good matrix
266    assembly performance the user should preallocate the matrix storage by setting
267    the parameter nz (or the array nnz).  By setting these parameters accurately,
268    performance during matrix assembly can be increased substantially.
269 
270 
271    Collective on MPI_Comm
272 
273    Input Parameters:
274 +  comm - MPI communicator, set to PETSC_COMM_SELF
275 .  m - number of rows
276 .  n - number of columns
277 .  nz - number of nonzeros per row (same for all rows)
278 -  nnz - array containing the number of nonzeros in the various rows
279          (possibly different for each row) or NULL
280 
281    Output Parameter:
282 .  A - the matrix
283 
284    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
285    MatXXXXSetPreallocation() paradigm instead of this routine directly.
286    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
287 
288    Notes:
289    If nnz is given then nz is ignored
290 
291    The AIJ format (also called the Yale sparse matrix format or
292    compressed row storage), is fully compatible with standard Fortran 77
293    storage.  That is, the stored row and column indices can begin at
294    either one (as in Fortran) or zero.  See the users' manual for details.
295 
296    Specify the preallocated storage with either nz or nnz (not both).
297    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
298    allocation.  For large problems you MUST preallocate memory or you
299    will get TERRIBLE performance, see the users' manual chapter on matrices.
300 
301    Level: intermediate
302 
303 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
304 
305 @*/
306 PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
307 {
308   PetscErrorCode ierr;
309 
310   PetscFunctionBegin;
311   ierr = MatCreate(comm,A);CHKERRQ(ierr);
312   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
313   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
314   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
315   PetscFunctionReturn(0);
316 }
317 
318 
319 #undef __FUNCT__
320 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
321 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
322 {
323   PetscErrorCode ierr;
324   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
325 
326   PetscFunctionBegin;
327   try {
328     if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat);
329     delete viennaclcontainer;
330     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
331   } catch(std::exception const & ex) {
332     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
333   }
334   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
335   A->spptr = 0;
336   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
337   PetscFunctionReturn(0);
338 }
339 
340 
341 #undef __FUNCT__
342 #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
343 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
344 {
345   PetscErrorCode ierr;
346   Mat_SeqAIJ     *aij;
347 
348   PetscFunctionBegin;
349   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
350   aij             = (Mat_SeqAIJ*)B->data;
351   aij->inode.use  = PETSC_FALSE;
352   B->ops->mult    = MatMult_SeqAIJViennaCL;
353   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
354   B->spptr        = new Mat_SeqAIJViennaCL();
355 
356   ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0;
357 
358   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
359   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
360   B->ops->getvecs        = MatGetVecs_SeqAIJViennaCL;
361 
362   ierr = MatSetFromOptions_SeqViennaCL(B);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */
363   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
364 
365   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
366   PetscFunctionReturn(0);
367 }
368 
369 
370 /*M
371    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
372 
373    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
374    default. All matrix calculations are performed using the ViennaCL library.
375 
376    Options Database Keys:
377 +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
378 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
379 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
380 
381   Level: beginner
382 
383 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
384 M*/
385 
386