xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 19fddfadf26cdedd36d5f24fa1f2bdedfcd3712b)
1 
2 
3 /*
4     Defines the basic matrix operations for the AIJ (compressed row)
5   matrix storage format.
6 */
7 
8 #include "petscconf.h"
9 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
10 #include "petscbt.h"
11 #include "../src/vec/vec/impls/dvecimpl.h"
12 #include "petsc-private/vecimpl.h"
13 
14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h"
15 
16 
17 #include <algorithm>
18 #include <vector>
19 #include <string>
20 
21 #include "viennacl/linalg/prod.hpp"
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatViennaCLCopyToGPU"
25 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26 {
27 
28   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30   PetscInt           *ii;
31   PetscErrorCode     ierr;
32 
33 
34   PetscFunctionBegin;
35   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
36     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
37       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
38 
39       try {
40         viennaclstruct->mat = new ViennaCLAIJMatrix();
41         if (a->compressedrow.use) {
42           ii = a->compressedrow.i;
43 
44           viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz);
45 
46           // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications)
47 
48           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
49         } else {
50 
51           // Since PetscInt is in general different from cl_uint, we have to convert:
52           viennacl::backend::mem_handle dummy;
53 
54           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
55           for (PetscInt i=0; i<=A->rmap->n; ++i)
56             row_buffer.set(i, (a->i)[i]);
57 
58           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
59           for (PetscInt i=0; i<a->nz; ++i)
60             col_buffer.set(i, (a->j)[i]);
61 
62           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
63         }
64       } catch(std::exception const & ex) {
65         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
66       }
67 
68       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
69 
70       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
71     }
72   }
73   PetscFunctionReturn(0);
74 }
75 
76 #undef __FUNCT__
77 #define __FUNCT__ "MatViennaCLCopyFromGPU"
78 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, ViennaCLAIJMatrix *Agpu)
79 {
80   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
81   PetscInt           m               = A->rmap->n;
82   PetscErrorCode     ierr;
83 
84 
85   PetscFunctionBegin;
86   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
87     try {
88       if (a->compressedrow.use) {
89         SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
90       } else {
91 
92         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
93         a->nz           = Agpu->nnz();
94         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
95         A->preallocated = PETSC_TRUE;
96         if (a->singlemalloc) {
97           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
98         } else {
99           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
100           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
101           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
102         }
103         ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
104         ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
105 
106         a->singlemalloc = PETSC_TRUE;
107 
108         /* Setup row lengths */
109         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
110         ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
111         ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
112 
113         /* Copy data back from GPU */
114         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
115 
116         // copy row array
117         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
118         (a->i)[0] = row_buffer[0];
119         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
120           (a->i)[i+1] = row_buffer[i+1];
121           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
122         }
123 
124         // copy column indices
125         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
126         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
127         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
128           (a->j)[i] = col_buffer[i];
129 
130         // copy nonzero entries directly to destination (no conversion required)
131         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
132 
133         /* TODO: What about a->diag? */
134       }
135     } catch(std::exception const & ex) {
136       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
137     }
138 
139     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
140     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
141     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
142 
143     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
144   } else {
145     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
146   }
147   PetscFunctionReturn(0);
148 }
149 
150 #undef __FUNCT__
151 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL"
152 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
153 {
154   PetscErrorCode ierr;
155 
156   PetscFunctionBegin;
157   if (right) {
158     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
159     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
160     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
161     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
162     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
163   }
164   if (left) {
165     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
166     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
167     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
168     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
169     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
170   }
171   PetscFunctionReturn(0);
172 }
173 
174 #undef __FUNCT__
175 #define __FUNCT__ "MatMult_SeqAIJViennaCL"
176 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
177 {
178   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
179   PetscErrorCode     ierr;
180   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
181   ViennaCLVector     *xgpu=NULL,*ygpu=NULL;
182 
183   PetscFunctionBegin;
184   if (A->rmap->n > 0 && A->cmap->n > 0) {
185     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
186     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
187     try {
188       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
189     } catch (std::exception const & ex) {
190       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
191     }
192     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
193     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
194     ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
195   }
196   PetscFunctionReturn(0);
197 }
198 
199 
200 
201 #undef __FUNCT__
202 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
203 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
204 {
205   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
206   PetscErrorCode     ierr;
207   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
208   ViennaCLVector     *xgpu=NULL,*ygpu=NULL,*zgpu=NULL;
209 
210   PetscFunctionBegin;
211   if (A->rmap->n > 0 && A->cmap->n > 0) {
212     try {
213       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
214       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
215       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
216 
217       if (a->compressedrow.use) {
218           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
219       } else {
220         if (zz == xx || zz == yy) { //temporary required
221           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
222           *zgpu = *ygpu + temp;
223         } else {
224           *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
225           *zgpu += *ygpu;
226         }
227       }
228 
229       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
230       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
231       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
232 
233     } catch(std::exception const & ex) {
234       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
235     }
236     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
237   }
238   PetscFunctionReturn(0);
239 }
240 
241 #undef __FUNCT__
242 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
243 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
244 {
245   PetscErrorCode ierr;
246 
247   PetscFunctionBegin;
248   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
249   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
250   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
251   A->ops->mult    = MatMult_SeqAIJViennaCL;
252   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
253   PetscFunctionReturn(0);
254 }
255 
256 /* --------------------------------------------------------------------------------*/
257 #undef __FUNCT__
258 #define __FUNCT__ "MatCreateSeqAIJViennaCL"
259 /*@
260    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
261    (the default parallel PETSc format).  This matrix will ultimately be pushed down
262    to GPUs and use the ViennaCL library for calculations. For good matrix
263    assembly performance the user should preallocate the matrix storage by setting
264    the parameter nz (or the array nnz).  By setting these parameters accurately,
265    performance during matrix assembly can be increased substantially.
266 
267 
268    Collective on MPI_Comm
269 
270    Input Parameters:
271 +  comm - MPI communicator, set to PETSC_COMM_SELF
272 .  m - number of rows
273 .  n - number of columns
274 .  nz - number of nonzeros per row (same for all rows)
275 -  nnz - array containing the number of nonzeros in the various rows
276          (possibly different for each row) or NULL
277 
278    Output Parameter:
279 .  A - the matrix
280 
281    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
282    MatXXXXSetPreallocation() paradigm instead of this routine directly.
283    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
284 
285    Notes:
286    If nnz is given then nz is ignored
287 
288    The AIJ format (also called the Yale sparse matrix format or
289    compressed row storage), is fully compatible with standard Fortran 77
290    storage.  That is, the stored row and column indices can begin at
291    either one (as in Fortran) or zero.  See the users' manual for details.
292 
293    Specify the preallocated storage with either nz or nnz (not both).
294    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
295    allocation.  For large problems you MUST preallocate memory or you
296    will get TERRIBLE performance, see the users' manual chapter on matrices.
297 
298    Level: intermediate
299 
300 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
301 
302 @*/
303 PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
304 {
305   PetscErrorCode ierr;
306 
307   PetscFunctionBegin;
308   ierr = MatCreate(comm,A);CHKERRQ(ierr);
309   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
310   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
311   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
312   PetscFunctionReturn(0);
313 }
314 
315 
316 #undef __FUNCT__
317 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
318 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
319 {
320   PetscErrorCode ierr;
321   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
322 
323   PetscFunctionBegin;
324   try {
325     if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat);
326     delete viennaclcontainer;
327     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
328   } catch(std::exception const & ex) {
329     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
330   }
331   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
332   A->spptr = 0;
333   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
334   PetscFunctionReturn(0);
335 }
336 
337 
338 #undef __FUNCT__
339 #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
340 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
341 {
342   PetscErrorCode ierr;
343   Mat_SeqAIJ     *aij;
344 
345   PetscFunctionBegin;
346   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
347   aij             = (Mat_SeqAIJ*)B->data;
348   aij->inode.use  = PETSC_FALSE;
349   B->ops->mult    = MatMult_SeqAIJViennaCL;
350   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
351   B->spptr        = new Mat_SeqAIJViennaCL();
352 
353   ((Mat_SeqAIJViennaCL*)B->spptr)->mat = 0;
354 
355   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
356   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
357   B->ops->getvecs        = MatGetVecs_SeqAIJViennaCL;
358 
359   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
360 
361   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
362   PetscFunctionReturn(0);
363 }
364 
365 
366 /*M
367    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
368 
369    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
370    default. All matrix calculations are performed using the ViennaCL library.
371 
372    Options Database Keys:
373 +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
374 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
375 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
376 
377   Level: beginner
378 
379 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
380 M*/
381 
382