xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 67c87b7f722561d11437735eb3bab5a6f62e8669)
1 
2 
3 /*
4     Defines the basic matrix operations for the AIJ (compressed row)
5   matrix storage format.
6 */
7 
8 #include "petscconf.h"
9 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
10 #include "petscbt.h"
11 #include "../src/vec/vec/impls/dvecimpl.h"
12 #include "petsc-private/vecimpl.h"
13 
14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h"
15 
16 
17 #include <algorithm>
18 #include <vector>
19 #include <string>
20 
21 #include "viennacl/linalg/prod.hpp"
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatViennaCLCopyToGPU"
25 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26 {
27 
28   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30   //PetscInt           m               = A->rmap->n,*ii,*ridx;
31   PetscInt           *ii;
32   PetscErrorCode     ierr;
33 
34 
35   PetscFunctionBegin;
36   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
37     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
38       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
39 
40       try {
41         //viennaclstruct->nonzerorow=0;
42         //for (PetscInt j = 0; j<m; j++) viennaclstruct->nonzerorow += (a->i[j+1] > a->i[j]);
43 
44         viennaclstruct->mat = new ViennaCLAIJMatrix();
45         if (a->compressedrow.use) {
46           //m    = a->compressedrow.nrows;
47           ii   = a->compressedrow.i;
48           //ridx = a->compressedrow.rindex;
49 
50           viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz);
51 
52           // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications)
53 
54           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
55         } else {
56 
57           // Since PetscInt is in general different from cl_uint, we have to convert:
58           viennacl::backend::mem_handle dummy;
59 
60           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
61           for (PetscInt i=0; i<=A->rmap->n; ++i)
62             row_buffer.set(i, (a->i)[i]);
63 
64           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
65           for (PetscInt i=0; i<a->nz; ++i)
66             col_buffer.set(i, (a->j)[i]);
67 
68           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
69         }
70       } catch(char *ex) {
71         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
72       }
73 
74       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
75 
76       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
77     }
78   }
79   PetscFunctionReturn(0);
80 }
81 
82 #undef __FUNCT__
83 #define __FUNCT__ "MatViennaCLCopyFromGPU"
84 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, ViennaCLAIJMatrix *Agpu)
85 {
86   //Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
87   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
88   PetscInt           m               = A->rmap->n;
89   PetscErrorCode     ierr;
90 
91 
92   PetscFunctionBegin;
93   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
94     try {
95       if (a->compressedrow.use) {
96         SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
97       } else {
98 
99         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
100         a->nz           = Agpu->nnz();
101         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
102         A->preallocated = PETSC_TRUE;
103         if (a->singlemalloc) {
104           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
105         } else {
106           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
107           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
108           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
109         }
110         ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
111         ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
112 
113         a->singlemalloc = PETSC_TRUE;
114 
115         /* Setup row lengths */
116         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
117         ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
118         ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
119 
120         /* Copy data back from GPU */
121         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
122 
123         // copy row array
124         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
125         (a->i)[0] = row_buffer[0];
126         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
127           (a->i)[i+1] = row_buffer[i+1];
128           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
129         }
130 
131         // copy column indices
132         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
133         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
134         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
135           (a->j)[i] = col_buffer[i];
136 
137         // copy nonzero entries directly to destination (no conversion required)
138         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
139 
140         /* TODO: What about a->diag? */
141       }
142     } catch(char *ex) {
143       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex);
144     }
145 
146     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
147     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
148     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
149 
150     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
151   } else {
152     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
153   }
154   PetscFunctionReturn(0);
155 }
156 
157 #undef __FUNCT__
158 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL"
159 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
160 {
161   PetscErrorCode ierr;
162 
163   PetscFunctionBegin;
164   if (right) {
165     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
166     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
167     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
168     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
169     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
170   }
171   if (left) {
172     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
173     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
174     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
175     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
176     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
177   }
178   PetscFunctionReturn(0);
179 }
180 
181 #undef __FUNCT__
182 #define __FUNCT__ "MatMult_SeqAIJViennaCL"
183 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
184 {
185   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
186   PetscErrorCode     ierr;
187   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
188   ViennaCLVector     *xgpu=NULL,*ygpu=NULL;
189 
190   PetscFunctionBegin;
191   if (A->rmap->n > 0 && A->cmap->n > 0) {
192     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
193     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
194     try {
195       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
196     } catch (char *ex) {
197       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
198     }
199     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
200     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
201     ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
202   }
203   PetscFunctionReturn(0);
204 }
205 
206 
207 
208 #undef __FUNCT__
209 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
210 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
211 {
212   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
213   PetscErrorCode     ierr;
214   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
215   ViennaCLVector     *xgpu=NULL,*ygpu=NULL,*zgpu=NULL;
216 
217   PetscFunctionBegin;
218   if (A->rmap->n > 0 && A->cmap->n > 0) {
219     try {
220       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
221       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
222       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
223 
224       if (a->compressedrow.use) {
225           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
226       } else {
227         if (zz == xx || zz == yy) { //temporary required
228           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
229           *zgpu = *ygpu + temp;
230         } else {
231           *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
232           *zgpu += *ygpu;
233         }
234       }
235 
236       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
237       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
238       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
239 
240     } catch(char *ex) {
241       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
242     }
243     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
250 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
256   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
257   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
258   A->ops->mult    = MatMult_SeqAIJViennaCL;
259   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
260   PetscFunctionReturn(0);
261 }
262 
263 /* --------------------------------------------------------------------------------*/
264 #undef __FUNCT__
265 #define __FUNCT__ "MatCreateSeqAIJViennaCL"
266 /*@
267    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
268    (the default parallel PETSc format).  This matrix will ultimately pushed down
269    to GPUs and use the ViennaCL library for calculations. For good matrix
270    assembly performance the user should preallocate the matrix storage by setting
271    the parameter nz (or the array nnz).  By setting these parameters accurately,
272    performance during matrix assembly can be increased substantially.
273 
274 
275    Collective on MPI_Comm
276 
277    Input Parameters:
278 +  comm - MPI communicator, set to PETSC_COMM_SELF
279 .  m - number of rows
280 .  n - number of columns
281 .  nz - number of nonzeros per row (same for all rows)
282 -  nnz - array containing the number of nonzeros in the various rows
283          (possibly different for each row) or NULL
284 
285    Output Parameter:
286 .  A - the matrix
287 
288    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
289    MatXXXXSetPreallocation() paradigm instead of this routine directly.
290    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
291 
292    Notes:
293    If nnz is given then nz is ignored
294 
295    The AIJ format (also called the Yale sparse matrix format or
296    compressed row storage), is fully compatible with standard Fortran 77
297    storage.  That is, the stored row and column indices can begin at
298    either one (as in Fortran) or zero.  See the users' manual for details.
299 
300    Specify the preallocated storage with either nz or nnz (not both).
301    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
302    allocation.  For large problems you MUST preallocate memory or you
303    will get TERRIBLE performance, see the users' manual chapter on matrices.
304 
305    By default, this format uses inodes (identical nodes) when possible, to
306    improve numerical efficiency of matrix-vector products and solves. We
307    search for consecutive rows with the same nonzero structure, thereby
308    reusing matrix information to achieve increased efficiency.
309 
310    Level: intermediate
311 
312 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
313 
314 @*/
315 PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
316 {
317   PetscErrorCode ierr;
318 
319   PetscFunctionBegin;
320   ierr = MatCreate(comm,A);CHKERRQ(ierr);
321   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
322   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
323   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
324   PetscFunctionReturn(0);
325 }
326 
327 
328 #undef __FUNCT__
329 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
330 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
331 {
332   PetscErrorCode ierr;
333   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
334 
335   PetscFunctionBegin;
336   try {
337     if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat);
338     delete viennaclcontainer;
339     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
340   } catch(char *ex) {
341     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
342   }
343   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
344   A->spptr = 0;
345   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
346   PetscFunctionReturn(0);
347 }
348 
349 
350 #undef __FUNCT__
351 #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
352 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
353 {
354   PetscErrorCode ierr;
355   Mat_SeqAIJ     *aij;
356 
357   PetscFunctionBegin;
358   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
359   aij             = (Mat_SeqAIJ*)B->data;
360   aij->inode.use  = PETSC_FALSE;
361   B->ops->mult    = MatMult_SeqAIJViennaCL;
362   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
363   B->spptr        = new Mat_SeqAIJViennaCL();
364 
365   ((Mat_SeqAIJViennaCL*)B->spptr)->mat     = 0;
366 
367   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
368   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
369   B->ops->getvecs        = MatGetVecs_SeqAIJViennaCL;
370 
371   //ierr = PetscObjectComposeFunction((PetscObject)B,"MatViennaCLSetFormat_C", "MatViennaCLSetFormat_SeqAIJViennaCL", MatViennaCLSetFormat_SeqAIJViennaCL);CHKERRQ(ierr);
372   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
373 
374   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
375   PetscFunctionReturn(0);
376 }
377 
378 
379 /*M
380    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
381 
382    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
383    default. All matrix calculations are performed using the ViennaCL library.
384 
385    Options Database Keys:
386 +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
387 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
388 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
389 
390   Level: beginner
391 
392 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
393 M*/
394 
395