xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision a3430c569f2f14ee3f6a1c44cd55773bcfd7d13e)
1 
2 
3 /*
4     Defines the basic matrix operations for the AIJ (compressed row)
5   matrix storage format.
6 */
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <petscbt.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc-private/vecimpl.h>
13 
14 #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15 
16 
17 #include <algorithm>
18 #include <vector>
19 #include <string>
20 
21 #include "viennacl/linalg/prod.hpp"
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatViennaCLCopyToGPU"
25 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26 {
27 
28   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30   PetscErrorCode     ierr;
31 
32 
33   PetscFunctionBegin;
34   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
35     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
36       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
37 
38       try {
39         ierr = PetscObjectSetFromOptions_ViennaCL((PetscObject)A);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */
40         if (a->compressedrow.use) {
41           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
42 
43           // Since PetscInt is different from cl_uint, we have to convert:
44           viennacl::backend::mem_handle dummy;
45 
46           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
47           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
48             row_buffer.set(i, (a->compressedrow.i)[i]);
49 
50           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
51           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
52             row_indices.set(i, (a->compressedrow.rindex)[i]);
53 
54           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
55           for (PetscInt i=0; i<a->nz; ++i)
56             col_buffer.set(i, (a->j)[i]);
57 
58           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
59         } else {
60           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
61 
62           // Since PetscInt is in general different from cl_uint, we have to convert:
63           viennacl::backend::mem_handle dummy;
64 
65           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
66           for (PetscInt i=0; i<=A->rmap->n; ++i)
67             row_buffer.set(i, (a->i)[i]);
68 
69           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
70           for (PetscInt i=0; i<a->nz; ++i)
71             col_buffer.set(i, (a->j)[i]);
72 
73           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
74         }
75         ViennaCLWaitForGPU();
76       } catch(std::exception const & ex) {
77         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
78       }
79 
80       // Create temporary vector for v += A*x:
81       if (viennaclstruct->tempvec) {
82         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(a->nz)) {
83           delete (ViennaCLVector*)viennaclstruct->tempvec;
84           viennaclstruct->tempvec = new ViennaCLVector(a->nz);
85         } else {
86           viennaclstruct->tempvec->clear();
87         }
88       } else {
89         viennaclstruct->tempvec = new ViennaCLVector(a->nz);
90       }
91 
92       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
93 
94       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
95     }
96   }
97   PetscFunctionReturn(0);
98 }
99 
100 #undef __FUNCT__
101 #define __FUNCT__ "MatViennaCLCopyFromGPU"
102 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
103 {
104   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
105   PetscInt           m               = A->rmap->n;
106   PetscErrorCode     ierr;
107 
108 
109   PetscFunctionBegin;
110   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
111     try {
112       if (a->compressedrow.use) {
113         SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
114       } else {
115 
116         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
117         a->nz           = Agpu->nnz();
118         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
119         A->preallocated = PETSC_TRUE;
120         if (a->singlemalloc) {
121           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
122         } else {
123           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
124           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
125           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
126         }
127         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
128         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
129 
130         a->singlemalloc = PETSC_TRUE;
131 
132         /* Setup row lengths */
133         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
134         ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr);
135         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
136 
137         /* Copy data back from GPU */
138         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
139 
140         // copy row array
141         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
142         (a->i)[0] = row_buffer[0];
143         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
144           (a->i)[i+1] = row_buffer[i+1];
145           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
146         }
147 
148         // copy column indices
149         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
150         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
151         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
152           (a->j)[i] = col_buffer[i];
153 
154         // copy nonzero entries directly to destination (no conversion required)
155         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
156 
157         ViennaCLWaitForGPU();
158         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
159       }
160     } catch(std::exception const & ex) {
161       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
162     }
163 
164     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
165     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
166     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
167 
168     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
169   } else {
170     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
171   }
172   PetscFunctionReturn(0);
173 }
174 
175 #undef __FUNCT__
176 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL"
177 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
178 {
179   PetscErrorCode ierr;
180 
181   PetscFunctionBegin;
182   if (right) {
183     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
184     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
185     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
186     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
187     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
188   }
189   if (left) {
190     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
191     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
192     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
193     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
194     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
195   }
196   PetscFunctionReturn(0);
197 }
198 
199 #undef __FUNCT__
200 #define __FUNCT__ "MatMult_SeqAIJViennaCL"
201 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
202 {
203   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
204   PetscErrorCode       ierr;
205   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
206   const ViennaCLVector *xgpu=NULL;
207   ViennaCLVector       *ygpu=NULL;
208 
209   PetscFunctionBegin;
210   if (A->rmap->n > 0 && A->cmap->n > 0) {
211     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
212     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
213     try {
214       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
215       ViennaCLWaitForGPU();
216     } catch (std::exception const & ex) {
217       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
218     }
219     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
220     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
221     ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
222   }
223   PetscFunctionReturn(0);
224 }
225 
226 
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
230 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
231 {
232   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
233   PetscErrorCode       ierr;
234   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
235   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
236   ViennaCLVector       *zgpu=NULL;
237 
238   PetscFunctionBegin;
239   if (A->rmap->n > 0 && A->cmap->n > 0) {
240     try {
241       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
242       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
243       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
244 
245       if (a->compressedrow.use) {
246         ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
247         *zgpu = *ygpu + temp;
248         ViennaCLWaitForGPU();
249       } else {
250         if (zz == xx || zz == yy) { //temporary required
251           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
252           *zgpu = *ygpu;
253           *zgpu += temp;
254           ViennaCLWaitForGPU();
255         } else {
256           *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
257           *zgpu = *ygpu + *viennaclstruct->tempvec;
258           ViennaCLWaitForGPU();
259         }
260       }
261 
262       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
263       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
264       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
265 
266     } catch(std::exception const & ex) {
267       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
268     }
269     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
270   }
271   PetscFunctionReturn(0);
272 }
273 
274 #undef __FUNCT__
275 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
276 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
277 {
278   PetscErrorCode ierr;
279 
280   PetscFunctionBegin;
281   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
282   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
283   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
284   A->ops->mult    = MatMult_SeqAIJViennaCL;
285   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
286   PetscFunctionReturn(0);
287 }
288 
289 /* --------------------------------------------------------------------------------*/
290 #undef __FUNCT__
291 #define __FUNCT__ "MatCreateSeqAIJViennaCL"
292 /*@
293    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
294    (the default parallel PETSc format).  This matrix will ultimately be pushed down
295    to GPUs and use the ViennaCL library for calculations. For good matrix
296    assembly performance the user should preallocate the matrix storage by setting
297    the parameter nz (or the array nnz).  By setting these parameters accurately,
298    performance during matrix assembly can be increased substantially.
299 
300 
301    Collective on MPI_Comm
302 
303    Input Parameters:
304 +  comm - MPI communicator, set to PETSC_COMM_SELF
305 .  m - number of rows
306 .  n - number of columns
307 .  nz - number of nonzeros per row (same for all rows)
308 -  nnz - array containing the number of nonzeros in the various rows
309          (possibly different for each row) or NULL
310 
311    Output Parameter:
312 .  A - the matrix
313 
314    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
315    MatXXXXSetPreallocation() paradigm instead of this routine directly.
316    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
317 
318    Notes:
319    If nnz is given then nz is ignored
320 
321    The AIJ format (also called the Yale sparse matrix format or
322    compressed row storage), is fully compatible with standard Fortran 77
323    storage.  That is, the stored row and column indices can begin at
324    either one (as in Fortran) or zero.  See the users' manual for details.
325 
326    Specify the preallocated storage with either nz or nnz (not both).
327    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
328    allocation.  For large problems you MUST preallocate memory or you
329    will get TERRIBLE performance, see the users' manual chapter on matrices.
330 
331    Level: intermediate
332 
333 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
334 
335 @*/
336 PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
337 {
338   PetscErrorCode ierr;
339 
340   PetscFunctionBegin;
341   ierr = MatCreate(comm,A);CHKERRQ(ierr);
342   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
343   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
344   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 
349 #undef __FUNCT__
350 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
351 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
352 {
353   PetscErrorCode ierr;
354   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
355 
356   PetscFunctionBegin;
357   try {
358     if (!viennaclcontainer->tempvec)        delete viennaclcontainer->tempvec;
359     if (!viennaclcontainer->mat)            delete viennaclcontainer->mat;
360     if (!viennaclcontainer->compressed_mat) delete viennaclcontainer->compressed_mat;
361     delete viennaclcontainer;
362     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
363   } catch(std::exception const & ex) {
364     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
365   }
366   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
367   A->spptr = 0;
368   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
369   PetscFunctionReturn(0);
370 }
371 
372 
373 #undef __FUNCT__
374 #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
375 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
376 {
377   PetscErrorCode ierr;
378   Mat_SeqAIJ     *aij;
379 
380   PetscFunctionBegin;
381   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
382   aij             = (Mat_SeqAIJ*)B->data;
383   aij->inode.use  = PETSC_FALSE;
384   B->ops->mult    = MatMult_SeqAIJViennaCL;
385   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
386   B->spptr        = new Mat_SeqAIJViennaCL();
387 
388   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
389   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
390   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
391 
392   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
393   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
394   B->ops->getvecs        = MatGetVecs_SeqAIJViennaCL;
395 
396   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
397 
398   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
399   PetscFunctionReturn(0);
400 }
401 
402 
403 /*M
404    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
405 
406    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
407    default. All matrix calculations are performed using the ViennaCL library.
408 
409    Options Database Keys:
410 +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
411 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
412 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
413 
414   Level: beginner
415 
416 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
417 M*/
418 
419