xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision ab6435e118e23a6371a2f5a89b07e5fa384571c8)
1 
2 
3 /*
4     Defines the basic matrix operations for the AIJ (compressed row)
5   matrix storage format.
6 */
7 
8 #include "petscconf.h"
9 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
10 #include "petscbt.h"
11 #include "../src/vec/vec/impls/dvecimpl.h"
12 #include "petsc-private/vecimpl.h"
13 
14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h"
15 
16 
17 #include <algorithm>
18 #include <vector>
19 #include <string>
20 
21 #include "viennacl/linalg/prod.hpp"
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatViennaCLCopyToGPU"
25 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26 {
27 
28   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30   PetscInt           *ii;
31   PetscErrorCode     ierr;
32 
33 
34   PetscFunctionBegin;
35   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
36     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
37       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
38 
39       try {
40         viennaclstruct->mat = new ViennaCLAIJMatrix();
41         if (a->compressedrow.use) {
42           ii = a->compressedrow.i;
43 
44           viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz);
45 
46           // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications)
47 
48           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
49         } else {
50 
51           // Since PetscInt is in general different from cl_uint, we have to convert:
52           viennacl::backend::mem_handle dummy;
53 
54           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
55           for (PetscInt i=0; i<=A->rmap->n; ++i)
56             row_buffer.set(i, (a->i)[i]);
57 
58           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
59           for (PetscInt i=0; i<a->nz; ++i)
60             col_buffer.set(i, (a->j)[i]);
61 
62           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
63         }
64       } catch(std::exception const & ex) {
65         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
66       }
67 
68       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
69 
70       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
71     }
72   }
73   PetscFunctionReturn(0);
74 }
75 
76 #undef __FUNCT__
77 #define __FUNCT__ "MatViennaCLCopyFromGPU"
78 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
79 {
80   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
81   PetscInt           m               = A->rmap->n;
82   PetscErrorCode     ierr;
83 
84 
85   PetscFunctionBegin;
86   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
87     try {
88       if (a->compressedrow.use) {
89         SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
90       } else {
91 
92         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
93         a->nz           = Agpu->nnz();
94         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
95         A->preallocated = PETSC_TRUE;
96         if (a->singlemalloc) {
97           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
98         } else {
99           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
100           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
101           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
102         }
103         ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
104         ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
105 
106         a->singlemalloc = PETSC_TRUE;
107 
108         /* Setup row lengths */
109         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
110         ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
111         ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
112 
113         /* Copy data back from GPU */
114         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
115 
116         // copy row array
117         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
118         (a->i)[0] = row_buffer[0];
119         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
120           (a->i)[i+1] = row_buffer[i+1];
121           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
122         }
123 
124         // copy column indices
125         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
126         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
127         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
128           (a->j)[i] = col_buffer[i];
129 
130         // copy nonzero entries directly to destination (no conversion required)
131         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
132 
133         /* TODO: What about a->diag? */
134       }
135     } catch(std::exception const & ex) {
136       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
137     }
138 
139     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
140     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
141     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
142 
143     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
144   } else {
145     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
146   }
147   PetscFunctionReturn(0);
148 }
149 
150 #undef __FUNCT__
151 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL"
152 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
153 {
154   PetscErrorCode ierr;
155 
156   PetscFunctionBegin;
157   if (right) {
158     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
159     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
160     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
161     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
162     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
163   }
164   if (left) {
165     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
166     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
167     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
168     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
169     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
170   }
171   PetscFunctionReturn(0);
172 }
173 
174 #undef __FUNCT__
175 #define __FUNCT__ "MatMult_SeqAIJViennaCL"
176 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
177 {
178   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
179   PetscErrorCode       ierr;
180   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
181   const ViennaCLVector *xgpu=NULL;
182   ViennaCLVector       *ygpu=NULL;
183 
184   PetscFunctionBegin;
185   if (A->rmap->n > 0 && A->cmap->n > 0) {
186     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
187     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
188     try {
189       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
190     } catch (std::exception const & ex) {
191       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
192     }
193     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
194     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
195     ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
196   }
197   PetscFunctionReturn(0);
198 }
199 
200 
201 
202 #undef __FUNCT__
203 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
204 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
205 {
206   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
207   PetscErrorCode       ierr;
208   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
209   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
210   ViennaCLVector       *zgpu=NULL;
211 
212   PetscFunctionBegin;
213   if (A->rmap->n > 0 && A->cmap->n > 0) {
214     try {
215       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
216       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
217       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
218 
219       if (a->compressedrow.use) {
220           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
221       } else {
222         if (zz == xx || zz == yy) { //temporary required
223           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
224           *zgpu = *ygpu + temp;
225         } else {
226           *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
227           *zgpu += *ygpu;
228         }
229       }
230 
231       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
232       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
233       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
234 
235     } catch(std::exception const & ex) {
236       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
237     }
238     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
239   }
240   PetscFunctionReturn(0);
241 }
242 
243 #undef __FUNCT__
244 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
245 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
246 {
247   PetscErrorCode ierr;
248 
249   PetscFunctionBegin;
250   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
251   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
252   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
253   A->ops->mult    = MatMult_SeqAIJViennaCL;
254   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
255   PetscFunctionReturn(0);
256 }
257 
258 /* --------------------------------------------------------------------------------*/
259 #undef __FUNCT__
260 #define __FUNCT__ "MatCreateSeqAIJViennaCL"
261 /*@
262    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
263    (the default parallel PETSc format).  This matrix will ultimately be pushed down
264    to GPUs and use the ViennaCL library for calculations. For good matrix
265    assembly performance the user should preallocate the matrix storage by setting
266    the parameter nz (or the array nnz).  By setting these parameters accurately,
267    performance during matrix assembly can be increased substantially.
268 
269 
270    Collective on MPI_Comm
271 
272    Input Parameters:
273 +  comm - MPI communicator, set to PETSC_COMM_SELF
274 .  m - number of rows
275 .  n - number of columns
276 .  nz - number of nonzeros per row (same for all rows)
277 -  nnz - array containing the number of nonzeros in the various rows
278          (possibly different for each row) or NULL
279 
280    Output Parameter:
281 .  A - the matrix
282 
283    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
284    MatXXXXSetPreallocation() paradigm instead of this routine directly.
285    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
286 
287    Notes:
288    If nnz is given then nz is ignored
289 
290    The AIJ format (also called the Yale sparse matrix format or
291    compressed row storage), is fully compatible with standard Fortran 77
292    storage.  That is, the stored row and column indices can begin at
293    either one (as in Fortran) or zero.  See the users' manual for details.
294 
295    Specify the preallocated storage with either nz or nnz (not both).
296    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
297    allocation.  For large problems you MUST preallocate memory or you
298    will get TERRIBLE performance, see the users' manual chapter on matrices.
299 
300    Level: intermediate
301 
302 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
303 
304 @*/
305 PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
306 {
307   PetscErrorCode ierr;
308 
309   PetscFunctionBegin;
310   ierr = MatCreate(comm,A);CHKERRQ(ierr);
311   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
312   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
313   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
314   PetscFunctionReturn(0);
315 }
316 
317 
318 #undef __FUNCT__
319 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
320 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
321 {
322   PetscErrorCode ierr;
323   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
324 
325   PetscFunctionBegin;
326   try {
327     if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat);
328     delete viennaclcontainer;
329     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
330   } catch(std::exception const & ex) {
331     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
332   }
333   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
334   A->spptr = 0;
335   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
336   PetscFunctionReturn(0);
337 }
338 
339 
340 #undef __FUNCT__
341 #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
342 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
343 {
344   PetscErrorCode ierr;
345   Mat_SeqAIJ     *aij;
346 
347   PetscFunctionBegin;
348   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
349   aij             = (Mat_SeqAIJ*)B->data;
350   aij->inode.use  = PETSC_FALSE;
351   B->ops->mult    = MatMult_SeqAIJViennaCL;
352   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
353   B->spptr        = new Mat_SeqAIJViennaCL();
354 
355   ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0;
356 
357   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
358   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
359   B->ops->getvecs        = MatGetVecs_SeqAIJViennaCL;
360 
361   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
362 
363   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
364   PetscFunctionReturn(0);
365 }
366 
367 
368 /*M
369    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
370 
371    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
372    default. All matrix calculations are performed using the ViennaCL library.
373 
374    Options Database Keys:
375 +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
376 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
377 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
378 
379   Level: beginner
380 
381 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
382 M*/
383 
384