xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 2b8d69ca7ea5fe9190df62c1dce3bbd66fce84dd)
1 
2 
3 /*
4     Defines the basic matrix operations for the AIJ (compressed row)
5   matrix storage format.
6 */
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <petscbt.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 
14 #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15 
16 
17 #include <algorithm>
18 #include <vector>
19 #include <string>
20 
21 #include "viennacl/linalg/prod.hpp"
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatViennaCLCopyToGPU"
25 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26 {
27 
28   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30   PetscErrorCode     ierr;
31 
32 
33   PetscFunctionBegin;
34   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
35     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
36       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
37 
38       try {
39         ierr = PetscObjectViennaCLSetFromOptions((PetscObject)A);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */
40         if (a->compressedrow.use) {
41           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
42 
43           // Since PetscInt is different from cl_uint, we have to convert:
44           viennacl::backend::mem_handle dummy;
45 
46           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
47           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
48             row_buffer.set(i, (a->compressedrow.i)[i]);
49 
50           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
51           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
52             row_indices.set(i, (a->compressedrow.rindex)[i]);
53 
54           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
55           for (PetscInt i=0; i<a->nz; ++i)
56             col_buffer.set(i, (a->j)[i]);
57 
58           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
59         } else {
60           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
61 
62           // Since PetscInt is in general different from cl_uint, we have to convert:
63           viennacl::backend::mem_handle dummy;
64 
65           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
66           for (PetscInt i=0; i<=A->rmap->n; ++i)
67             row_buffer.set(i, (a->i)[i]);
68 
69           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
70           for (PetscInt i=0; i<a->nz; ++i)
71             col_buffer.set(i, (a->j)[i]);
72 
73           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
74         }
75         ViennaCLWaitForGPU();
76       } catch(std::exception const & ex) {
77         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
78       }
79 
80       // Create temporary vector for v += A*x:
81       if (viennaclstruct->tempvec) {
82         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(a->nz)) {
83           delete (ViennaCLVector*)viennaclstruct->tempvec;
84           viennaclstruct->tempvec = new ViennaCLVector(a->nz);
85         } else {
86           viennaclstruct->tempvec->clear();
87         }
88       } else {
89         viennaclstruct->tempvec = new ViennaCLVector(a->nz);
90       }
91 
92       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
93 
94       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
95     }
96   }
97   PetscFunctionReturn(0);
98 }
99 
100 #undef __FUNCT__
101 #define __FUNCT__ "MatViennaCLCopyFromGPU"
102 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
103 {
104   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
105   PetscInt           m               = A->rmap->n;
106   PetscErrorCode     ierr;
107 
108 
109   PetscFunctionBegin;
110   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
111     try {
112       if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
113       else {
114 
115         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
116         a->nz           = Agpu->nnz();
117         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
118         A->preallocated = PETSC_TRUE;
119         if (a->singlemalloc) {
120           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
121         } else {
122           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
123           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
124           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
125         }
126         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
127         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
128 
129         a->singlemalloc = PETSC_TRUE;
130 
131         /* Setup row lengths */
132         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
133         ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr);
134         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
135 
136         /* Copy data back from GPU */
137         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
138 
139         // copy row array
140         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
141         (a->i)[0] = row_buffer[0];
142         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
143           (a->i)[i+1] = row_buffer[i+1];
144           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
145         }
146 
147         // copy column indices
148         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
149         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
150         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
151           (a->j)[i] = col_buffer[i];
152 
153         // copy nonzero entries directly to destination (no conversion required)
154         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
155 
156         ViennaCLWaitForGPU();
157         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
158       }
159     } catch(std::exception const & ex) {
160       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
161     }
162 
163     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
164     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
165     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
166 
167     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
168   } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
169   PetscFunctionReturn(0);
170 }
171 
172 #undef __FUNCT__
173 #define __FUNCT__ "MatCreateVecs_SeqAIJViennaCL"
174 PetscErrorCode MatCreateVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
175 {
176   PetscErrorCode ierr;
177   PetscInt rbs,cbs;
178 
179   PetscFunctionBegin;
180   ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
181   if (right) {
182     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
183     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
184     ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
185     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
186     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
187   }
188   if (left) {
189     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
190     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
191     ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
192     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
193     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
194   }
195   PetscFunctionReturn(0);
196 }
197 
198 #undef __FUNCT__
199 #define __FUNCT__ "MatMult_SeqAIJViennaCL"
200 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
201 {
202   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
203   PetscErrorCode       ierr;
204   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
205   const ViennaCLVector *xgpu=NULL;
206   ViennaCLVector       *ygpu=NULL;
207 
208   PetscFunctionBegin;
209   if (A->rmap->n > 0 && A->cmap->n > 0) {
210     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
211     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
212     try {
213       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
214       ViennaCLWaitForGPU();
215     } catch (std::exception const & ex) {
216       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
217     }
218     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
219     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
220     ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
221   }
222   PetscFunctionReturn(0);
223 }
224 
225 
226 
227 #undef __FUNCT__
228 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
229 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
230 {
231   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
232   PetscErrorCode       ierr;
233   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
234   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
235   ViennaCLVector       *zgpu=NULL;
236 
237   PetscFunctionBegin;
238   if (A->rmap->n > 0 && A->cmap->n > 0) {
239     try {
240       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
241       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
242       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
243 
244       if (a->compressedrow.use) {
245         ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
246         *zgpu = *ygpu + temp;
247         ViennaCLWaitForGPU();
248       } else {
249         if (zz == xx || zz == yy) { //temporary required
250           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
251           *zgpu = *ygpu;
252           *zgpu += temp;
253           ViennaCLWaitForGPU();
254         } else {
255           *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
256           *zgpu = *ygpu + *viennaclstruct->tempvec;
257           ViennaCLWaitForGPU();
258         }
259       }
260 
261       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
262       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
263       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
264 
265     } catch(std::exception const & ex) {
266       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
267     }
268     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
269   }
270   PetscFunctionReturn(0);
271 }
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
275 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
276 {
277   PetscErrorCode ierr;
278 
279   PetscFunctionBegin;
280   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
281   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
282   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
283   A->ops->mult    = MatMult_SeqAIJViennaCL;
284   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
285   PetscFunctionReturn(0);
286 }
287 
288 /* --------------------------------------------------------------------------------*/
289 #undef __FUNCT__
290 #define __FUNCT__ "MatCreateSeqAIJViennaCL"
291 /*@
292    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
293    (the default parallel PETSc format).  This matrix will ultimately be pushed down
294    to GPUs and use the ViennaCL library for calculations. For good matrix
295    assembly performance the user should preallocate the matrix storage by setting
296    the parameter nz (or the array nnz).  By setting these parameters accurately,
297    performance during matrix assembly can be increased substantially.
298 
299 
300    Collective on MPI_Comm
301 
302    Input Parameters:
303 +  comm - MPI communicator, set to PETSC_COMM_SELF
304 .  m - number of rows
305 .  n - number of columns
306 .  nz - number of nonzeros per row (same for all rows)
307 -  nnz - array containing the number of nonzeros in the various rows
308          (possibly different for each row) or NULL
309 
310    Output Parameter:
311 .  A - the matrix
312 
313    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
314    MatXXXXSetPreallocation() paradigm instead of this routine directly.
315    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
316 
317    Notes:
318    If nnz is given then nz is ignored
319 
320    The AIJ format (also called the Yale sparse matrix format or
321    compressed row storage), is fully compatible with standard Fortran 77
322    storage.  That is, the stored row and column indices can begin at
323    either one (as in Fortran) or zero.  See the users' manual for details.
324 
325    Specify the preallocated storage with either nz or nnz (not both).
326    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
327    allocation.  For large problems you MUST preallocate memory or you
328    will get TERRIBLE performance, see the users' manual chapter on matrices.
329 
330    Level: intermediate
331 
332 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
333 
334 @*/
335 PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
336 {
337   PetscErrorCode ierr;
338 
339   PetscFunctionBegin;
340   ierr = MatCreate(comm,A);CHKERRQ(ierr);
341   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
342   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
343   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
344   PetscFunctionReturn(0);
345 }
346 
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
350 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
351 {
352   PetscErrorCode ierr;
353   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
354 
355   PetscFunctionBegin;
356   try {
357     if (viennaclcontainer) {
358       delete viennaclcontainer->tempvec;
359       delete viennaclcontainer->mat;
360       delete viennaclcontainer->compressed_mat;
361       delete viennaclcontainer;
362     }
363     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
364   } catch(std::exception const & ex) {
365     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
366   }
367   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
368   A->spptr = 0;
369   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
370   PetscFunctionReturn(0);
371 }
372 
373 
374 #undef __FUNCT__
375 #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
376 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
377 {
378   PetscErrorCode ierr;
379   Mat_SeqAIJ     *aij;
380 
381   PetscFunctionBegin;
382   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
383   aij             = (Mat_SeqAIJ*)B->data;
384   aij->inode.use  = PETSC_FALSE;
385   B->ops->mult    = MatMult_SeqAIJViennaCL;
386   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
387   B->spptr        = new Mat_SeqAIJViennaCL();
388 
389   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
390   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
391   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
392 
393   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
394   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
395   B->ops->getvecs        = MatCreateVecs_SeqAIJViennaCL;
396 
397   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
398 
399   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
400   PetscFunctionReturn(0);
401 }
402 
403 
404 /*M
405    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
406 
407    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
408    default. All matrix calculations are performed using the ViennaCL library.
409 
410    Options Database Keys:
411 +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
412 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
413 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
414 
415   Level: beginner
416 
417 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
418 M*/
419 
420