xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision f9426fe092dba0ba2fdf65dfec8d938c4b10a31c)
1 
2 
3 /*
4     Defines the basic matrix operations for the AIJ (compressed row)
5   matrix storage format.
6 */
7 
8 #include "petscconf.h"
9 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
10 #include "petscbt.h"
11 #include "../src/vec/vec/impls/dvecimpl.h"
12 #include "petsc-private/vecimpl.h"
13 
14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h"
15 
16 
17 #include <algorithm>
18 #include <vector>
19 #include <string>
20 
21 #include "viennacl/linalg/prod.hpp"
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatViennaCLCopyToGPU"
25 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26 {
27 
28   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30   PetscInt           *ii;
31   PetscErrorCode     ierr;
32 
33 
34   PetscFunctionBegin;
35   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
36     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
37       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
38 
39       try {
40         viennaclstruct->mat = new ViennaCLAIJMatrix();
41         if (a->compressedrow.use) {
42           ii = a->compressedrow.i;
43 
44           viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz);
45 
46           // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications)
47           //       Cannot be reasonably supported in ViennaCL 1.4.x (custom kernels required), hence postponing until there is support for compressed CSR
48 
49           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
50         } else {
51 
52           // Since PetscInt is in general different from cl_uint, we have to convert:
53           viennacl::backend::mem_handle dummy;
54 
55           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
56           for (PetscInt i=0; i<=A->rmap->n; ++i)
57             row_buffer.set(i, (a->i)[i]);
58 
59           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
60           for (PetscInt i=0; i<a->nz; ++i)
61             col_buffer.set(i, (a->j)[i]);
62 
63           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
64         }
65         ViennaCLWaitForGPU();
66       } catch(std::exception const & ex) {
67         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
68       }
69 
70       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
71 
72       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
73     }
74   }
75   PetscFunctionReturn(0);
76 }
77 
78 #undef __FUNCT__
79 #define __FUNCT__ "MatViennaCLCopyFromGPU"
80 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
81 {
82   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
83   PetscInt           m               = A->rmap->n;
84   PetscErrorCode     ierr;
85 
86 
87   PetscFunctionBegin;
88   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
89     try {
90       if (a->compressedrow.use) {
91         SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
92       } else {
93 
94         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
95         a->nz           = Agpu->nnz();
96         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
97         A->preallocated = PETSC_TRUE;
98         if (a->singlemalloc) {
99           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
100         } else {
101           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
102           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
103           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
104         }
105         ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
106         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
107 
108         a->singlemalloc = PETSC_TRUE;
109 
110         /* Setup row lengths */
111         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
112         ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
113         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
114 
115         /* Copy data back from GPU */
116         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
117 
118         // copy row array
119         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
120         (a->i)[0] = row_buffer[0];
121         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
122           (a->i)[i+1] = row_buffer[i+1];
123           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
124         }
125 
126         // copy column indices
127         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
128         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
129         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
130           (a->j)[i] = col_buffer[i];
131 
132         // copy nonzero entries directly to destination (no conversion required)
133         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
134 
135         ViennaCLWaitForGPU();
136         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
137       }
138     } catch(std::exception const & ex) {
139       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
140     }
141 
142     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
143     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
144     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
145 
146     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
147   } else {
148     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
149   }
150   PetscFunctionReturn(0);
151 }
152 
153 #undef __FUNCT__
154 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL"
155 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
156 {
157   PetscErrorCode ierr;
158 
159   PetscFunctionBegin;
160   if (right) {
161     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
162     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
163     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
164     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
165     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
166   }
167   if (left) {
168     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
169     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
170     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
171     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
172     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
173   }
174   PetscFunctionReturn(0);
175 }
176 
177 #undef __FUNCT__
178 #define __FUNCT__ "MatMult_SeqAIJViennaCL"
179 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
180 {
181   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
182   PetscErrorCode       ierr;
183   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
184   const ViennaCLVector *xgpu=NULL;
185   ViennaCLVector       *ygpu=NULL;
186 
187   PetscFunctionBegin;
188   if (A->rmap->n > 0 && A->cmap->n > 0) {
189     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
190     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
191     try {
192       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
193       ViennaCLWaitForGPU();
194     } catch (std::exception const & ex) {
195       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
196     }
197     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
198     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
199     ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
200   }
201   PetscFunctionReturn(0);
202 }
203 
204 
205 
206 #undef __FUNCT__
207 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
208 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
209 {
210   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
211   PetscErrorCode       ierr;
212   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
213   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
214   ViennaCLVector       *zgpu=NULL;
215 
216   PetscFunctionBegin;
217   if (A->rmap->n > 0 && A->cmap->n > 0) {
218     try {
219       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
220       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
221       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
222 
223       if (a->compressedrow.use) {
224           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
225       } else {
226         if (zz == xx || zz == yy) { //temporary required
227           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
228           *zgpu = *ygpu + temp;
229           ViennaCLWaitForGPU();
230         } else {
231           *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
232           *zgpu += *ygpu;
233           ViennaCLWaitForGPU();
234         }
235       }
236 
237       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
238       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
239       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
240 
241     } catch(std::exception const & ex) {
242       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
243     }
244     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
245   }
246   PetscFunctionReturn(0);
247 }
248 
249 #undef __FUNCT__
250 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
251 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
252 {
253   PetscErrorCode ierr;
254 
255   PetscFunctionBegin;
256   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
257   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
258   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
259   A->ops->mult    = MatMult_SeqAIJViennaCL;
260   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
261   PetscFunctionReturn(0);
262 }
263 
264 /* --------------------------------------------------------------------------------*/
265 #undef __FUNCT__
266 #define __FUNCT__ "MatCreateSeqAIJViennaCL"
267 /*@
268    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
269    (the default parallel PETSc format).  This matrix will ultimately be pushed down
270    to GPUs and use the ViennaCL library for calculations. For good matrix
271    assembly performance the user should preallocate the matrix storage by setting
272    the parameter nz (or the array nnz).  By setting these parameters accurately,
273    performance during matrix assembly can be increased substantially.
274 
275 
276    Collective on MPI_Comm
277 
278    Input Parameters:
279 +  comm - MPI communicator, set to PETSC_COMM_SELF
280 .  m - number of rows
281 .  n - number of columns
282 .  nz - number of nonzeros per row (same for all rows)
283 -  nnz - array containing the number of nonzeros in the various rows
284          (possibly different for each row) or NULL
285 
286    Output Parameter:
287 .  A - the matrix
288 
289    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
290    MatXXXXSetPreallocation() paradigm instead of this routine directly.
291    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
292 
293    Notes:
294    If nnz is given then nz is ignored
295 
296    The AIJ format (also called the Yale sparse matrix format or
297    compressed row storage), is fully compatible with standard Fortran 77
298    storage.  That is, the stored row and column indices can begin at
299    either one (as in Fortran) or zero.  See the users' manual for details.
300 
301    Specify the preallocated storage with either nz or nnz (not both).
302    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
303    allocation.  For large problems you MUST preallocate memory or you
304    will get TERRIBLE performance, see the users' manual chapter on matrices.
305 
306    Level: intermediate
307 
308 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
309 
310 @*/
311 PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
312 {
313   PetscErrorCode ierr;
314 
315   PetscFunctionBegin;
316   ierr = MatCreate(comm,A);CHKERRQ(ierr);
317   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
318   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
319   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
320   PetscFunctionReturn(0);
321 }
322 
323 
324 #undef __FUNCT__
325 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
326 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
327 {
328   PetscErrorCode ierr;
329   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
330 
331   PetscFunctionBegin;
332   try {
333     if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat);
334     delete viennaclcontainer;
335     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
336   } catch(std::exception const & ex) {
337     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
338   }
339   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
340   A->spptr = 0;
341   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
342   PetscFunctionReturn(0);
343 }
344 
345 
346 #undef __FUNCT__
347 #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
348 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
349 {
350   PetscErrorCode ierr;
351   Mat_SeqAIJ     *aij;
352 
353   PetscFunctionBegin;
354   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
355   aij             = (Mat_SeqAIJ*)B->data;
356   aij->inode.use  = PETSC_FALSE;
357   B->ops->mult    = MatMult_SeqAIJViennaCL;
358   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
359   B->spptr        = new Mat_SeqAIJViennaCL();
360 
361   ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0;
362 
363   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
364   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
365   B->ops->getvecs        = MatGetVecs_SeqAIJViennaCL;
366 
367   ierr = MatSetFromOptions_SeqViennaCL(B);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */
368   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
369 
370   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
371   PetscFunctionReturn(0);
372 }
373 
374 
375 /*M
376    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
377 
378    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
379    default. All matrix calculations are performed using the ViennaCL library.
380 
381    Options Database Keys:
382 +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
383 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
384 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
385 
386   Level: beginner
387 
388 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
389 M*/
390 
391