xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 33d57670fcdbf57d9203d482728f549b81403a0e)
1 
2 
3 /*
4     Defines the basic matrix operations for the AIJ (compressed row)
5   matrix storage format.
6 */
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <petscbt.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc-private/vecimpl.h>
13 
14 #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15 
16 
17 #include <algorithm>
18 #include <vector>
19 #include <string>
20 
21 #include "viennacl/linalg/prod.hpp"
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatViennaCLCopyToGPU"
25 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26 {
27 
28   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30   PetscInt           *ii;
31   PetscErrorCode     ierr;
32 
33 
34   PetscFunctionBegin;
35   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
36     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
37       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
38 
39       try {
40         ierr = PetscObjectSetFromOptions_ViennaCL((PetscObject)A);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */
41         viennaclstruct->mat = new ViennaCLAIJMatrix();
42         if (a->compressedrow.use) {
43           ii = a->compressedrow.i;
44 
45           viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz);
46 
47           // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications)
48           //       Cannot be reasonably supported in ViennaCL 1.4.x (custom kernels required), hence postponing until there is support for compressed CSR
49 
50           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
51         } else {
52 
53           // Since PetscInt is in general different from cl_uint, we have to convert:
54           viennacl::backend::mem_handle dummy;
55 
56           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
57           for (PetscInt i=0; i<=A->rmap->n; ++i)
58             row_buffer.set(i, (a->i)[i]);
59 
60           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
61           for (PetscInt i=0; i<a->nz; ++i)
62             col_buffer.set(i, (a->j)[i]);
63 
64           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
65         }
66         ViennaCLWaitForGPU();
67       } catch(std::exception const & ex) {
68         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
69       }
70 
71       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
72 
73       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
74     }
75   }
76   PetscFunctionReturn(0);
77 }
78 
79 #undef __FUNCT__
80 #define __FUNCT__ "MatViennaCLCopyFromGPU"
81 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
82 {
83   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
84   PetscInt           m               = A->rmap->n;
85   PetscErrorCode     ierr;
86 
87 
88   PetscFunctionBegin;
89   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
90     try {
91       if (a->compressedrow.use) {
92         SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
93       } else {
94 
95         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
96         a->nz           = Agpu->nnz();
97         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
98         A->preallocated = PETSC_TRUE;
99         if (a->singlemalloc) {
100           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
101         } else {
102           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
103           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
104           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
105         }
106         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
107         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
108 
109         a->singlemalloc = PETSC_TRUE;
110 
111         /* Setup row lengths */
112         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
113         ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr);
114         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
115 
116         /* Copy data back from GPU */
117         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
118 
119         // copy row array
120         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
121         (a->i)[0] = row_buffer[0];
122         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
123           (a->i)[i+1] = row_buffer[i+1];
124           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
125         }
126 
127         // copy column indices
128         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
129         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
130         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
131           (a->j)[i] = col_buffer[i];
132 
133         // copy nonzero entries directly to destination (no conversion required)
134         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
135 
136         ViennaCLWaitForGPU();
137         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
138       }
139     } catch(std::exception const & ex) {
140       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
141     }
142 
143     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
144     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
145     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
146 
147     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
148   } else {
149     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
150   }
151   PetscFunctionReturn(0);
152 }
153 
154 #undef __FUNCT__
155 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL"
156 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
157 {
158   PetscErrorCode ierr;
159   PetscInt rbs,cbs;
160 
161   PetscFunctionBegin;
162   ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
163   if (right) {
164     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
165     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
166     ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
167     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
168     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
169   }
170   if (left) {
171     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
172     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
173     ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
174     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
175     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
176   }
177   PetscFunctionReturn(0);
178 }
179 
180 #undef __FUNCT__
181 #define __FUNCT__ "MatMult_SeqAIJViennaCL"
182 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
183 {
184   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
185   PetscErrorCode       ierr;
186   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
187   const ViennaCLVector *xgpu=NULL;
188   ViennaCLVector       *ygpu=NULL;
189 
190   PetscFunctionBegin;
191   if (A->rmap->n > 0 && A->cmap->n > 0) {
192     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
193     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
194     try {
195       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
196       ViennaCLWaitForGPU();
197     } catch (std::exception const & ex) {
198       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
199     }
200     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
201     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
202     ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
203   }
204   PetscFunctionReturn(0);
205 }
206 
207 
208 
209 #undef __FUNCT__
210 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
211 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
212 {
213   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
214   PetscErrorCode       ierr;
215   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
216   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
217   ViennaCLVector       *zgpu=NULL;
218 
219   PetscFunctionBegin;
220   if (A->rmap->n > 0 && A->cmap->n > 0) {
221     try {
222       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
223       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
224       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
225 
226       if (a->compressedrow.use) {
227           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
228       } else {
229         if (zz == xx || zz == yy) { //temporary required
230           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
231           *zgpu = *ygpu + temp;
232           ViennaCLWaitForGPU();
233         } else {
234           *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
235           *zgpu += *ygpu;
236           ViennaCLWaitForGPU();
237         }
238       }
239 
240       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
241       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
242       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
243 
244     } catch(std::exception const & ex) {
245       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
246     }
247     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
254 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
260   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
261   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
262   A->ops->mult    = MatMult_SeqAIJViennaCL;
263   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
264   PetscFunctionReturn(0);
265 }
266 
267 /* --------------------------------------------------------------------------------*/
268 #undef __FUNCT__
269 #define __FUNCT__ "MatCreateSeqAIJViennaCL"
270 /*@
271    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
272    (the default parallel PETSc format).  This matrix will ultimately be pushed down
273    to GPUs and use the ViennaCL library for calculations. For good matrix
274    assembly performance the user should preallocate the matrix storage by setting
275    the parameter nz (or the array nnz).  By setting these parameters accurately,
276    performance during matrix assembly can be increased substantially.
277 
278 
279    Collective on MPI_Comm
280 
281    Input Parameters:
282 +  comm - MPI communicator, set to PETSC_COMM_SELF
283 .  m - number of rows
284 .  n - number of columns
285 .  nz - number of nonzeros per row (same for all rows)
286 -  nnz - array containing the number of nonzeros in the various rows
287          (possibly different for each row) or NULL
288 
289    Output Parameter:
290 .  A - the matrix
291 
292    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
293    MatXXXXSetPreallocation() paradigm instead of this routine directly.
294    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
295 
296    Notes:
297    If nnz is given then nz is ignored
298 
299    The AIJ format (also called the Yale sparse matrix format or
300    compressed row storage), is fully compatible with standard Fortran 77
301    storage.  That is, the stored row and column indices can begin at
302    either one (as in Fortran) or zero.  See the users' manual for details.
303 
304    Specify the preallocated storage with either nz or nnz (not both).
305    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
306    allocation.  For large problems you MUST preallocate memory or you
307    will get TERRIBLE performance, see the users' manual chapter on matrices.
308 
309    Level: intermediate
310 
311 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
312 
313 @*/
314 PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
315 {
316   PetscErrorCode ierr;
317 
318   PetscFunctionBegin;
319   ierr = MatCreate(comm,A);CHKERRQ(ierr);
320   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
321   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
322   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
323   PetscFunctionReturn(0);
324 }
325 
326 
327 #undef __FUNCT__
328 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
329 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
330 {
331   PetscErrorCode ierr;
332   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
333 
334   PetscFunctionBegin;
335   try {
336     if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat);
337     delete viennaclcontainer;
338     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
339   } catch(std::exception const & ex) {
340     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
341   }
342   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
343   A->spptr = 0;
344   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 
349 #undef __FUNCT__
350 #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
351 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
352 {
353   PetscErrorCode ierr;
354   Mat_SeqAIJ     *aij;
355 
356   PetscFunctionBegin;
357   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
358   aij             = (Mat_SeqAIJ*)B->data;
359   aij->inode.use  = PETSC_FALSE;
360   B->ops->mult    = MatMult_SeqAIJViennaCL;
361   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
362   B->spptr        = new Mat_SeqAIJViennaCL();
363 
364   ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0;
365 
366   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
367   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
368   B->ops->getvecs        = MatGetVecs_SeqAIJViennaCL;
369 
370   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
371 
372   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
373   PetscFunctionReturn(0);
374 }
375 
376 
377 /*M
378    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
379 
380    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
381    default. All matrix calculations are performed using the ViennaCL library.
382 
383    Options Database Keys:
384 +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
385 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
386 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
387 
388   Level: beginner
389 
390 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
391 M*/
392 
393