xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision bebe2cf65d55febe21a5af8db2bd2e168caaa2e7)
1 
2 
3 /*
4     Defines the basic matrix operations for the AIJ (compressed row)
5   matrix storage format.
6 */
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <petscbt.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 
14 #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15 
16 
17 #include <algorithm>
18 #include <vector>
19 #include <string>
20 
21 #include "viennacl/linalg/prod.hpp"
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatViennaCLCopyToGPU"
25 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26 {
27 
28   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30   PetscErrorCode     ierr;
31 
32 
33   PetscFunctionBegin;
34   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
35     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
36       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
37 
38       try {
39         ierr = PetscObjectViennaCLSetFromOptions((PetscObject)A);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */
40         if (a->compressedrow.use) {
41           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
42 
43           // Since PetscInt is different from cl_uint, we have to convert:
44           viennacl::backend::mem_handle dummy;
45 
46           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
47           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
48             row_buffer.set(i, (a->compressedrow.i)[i]);
49 
50           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
51           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
52             row_indices.set(i, (a->compressedrow.rindex)[i]);
53 
54           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
55           for (PetscInt i=0; i<a->nz; ++i)
56             col_buffer.set(i, (a->j)[i]);
57 
58           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
59         } else {
60           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
61 
62           // Since PetscInt is in general different from cl_uint, we have to convert:
63           viennacl::backend::mem_handle dummy;
64 
65           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
66           for (PetscInt i=0; i<=A->rmap->n; ++i)
67             row_buffer.set(i, (a->i)[i]);
68 
69           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
70           for (PetscInt i=0; i<a->nz; ++i)
71             col_buffer.set(i, (a->j)[i]);
72 
73           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
74         }
75         ViennaCLWaitForGPU();
76       } catch(std::exception const & ex) {
77         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
78       }
79 
80       // Create temporary vector for v += A*x:
81       if (viennaclstruct->tempvec) {
82         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(a->nz)) {
83           delete (ViennaCLVector*)viennaclstruct->tempvec;
84           viennaclstruct->tempvec = new ViennaCLVector(a->nz);
85         } else {
86           viennaclstruct->tempvec->clear();
87         }
88       } else {
89         viennaclstruct->tempvec = new ViennaCLVector(a->nz);
90       }
91 
92       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
93 
94       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
95     }
96   }
97   PetscFunctionReturn(0);
98 }
99 
100 #undef __FUNCT__
101 #define __FUNCT__ "MatViennaCLCopyFromGPU"
102 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
103 {
104   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
105   PetscInt           m               = A->rmap->n;
106   PetscErrorCode     ierr;
107 
108 
109   PetscFunctionBegin;
110   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
111     try {
112       if (a->compressedrow.use) {
113         SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
114       } else {
115 
116         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
117         a->nz           = Agpu->nnz();
118         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
119         A->preallocated = PETSC_TRUE;
120         if (a->singlemalloc) {
121           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
122         } else {
123           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
124           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
125           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
126         }
127         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
128         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
129 
130         a->singlemalloc = PETSC_TRUE;
131 
132         /* Setup row lengths */
133         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
134         ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr);
135         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
136 
137         /* Copy data back from GPU */
138         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
139 
140         // copy row array
141         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
142         (a->i)[0] = row_buffer[0];
143         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
144           (a->i)[i+1] = row_buffer[i+1];
145           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
146         }
147 
148         // copy column indices
149         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
150         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
151         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
152           (a->j)[i] = col_buffer[i];
153 
154         // copy nonzero entries directly to destination (no conversion required)
155         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
156 
157         ViennaCLWaitForGPU();
158         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
159       }
160     } catch(std::exception const & ex) {
161       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
162     }
163 
164     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
165     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
166     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
167 
168     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
169   } else {
170     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
171   }
172   PetscFunctionReturn(0);
173 }
174 
175 #undef __FUNCT__
176 #define __FUNCT__ "MatCreateVecs_SeqAIJViennaCL"
177 PetscErrorCode MatCreateVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
178 {
179   PetscErrorCode ierr;
180   PetscInt rbs,cbs;
181 
182   PetscFunctionBegin;
183   ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
184   if (right) {
185     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
186     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
187     ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
188     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
189     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
190   }
191   if (left) {
192     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
193     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
194     ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
195     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
196     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
197   }
198   PetscFunctionReturn(0);
199 }
200 
201 #undef __FUNCT__
202 #define __FUNCT__ "MatMult_SeqAIJViennaCL"
203 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
204 {
205   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
206   PetscErrorCode       ierr;
207   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
208   const ViennaCLVector *xgpu=NULL;
209   ViennaCLVector       *ygpu=NULL;
210 
211   PetscFunctionBegin;
212   if (A->rmap->n > 0 && A->cmap->n > 0) {
213     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
214     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
215     try {
216       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
217       ViennaCLWaitForGPU();
218     } catch (std::exception const & ex) {
219       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
220     }
221     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
222     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
223     ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
224   }
225   PetscFunctionReturn(0);
226 }
227 
228 
229 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
/*
    MatMultAdd_SeqAIJViennaCL - Computes zz = A*xx + yy on the GPU.

    Dispatches on a->compressedrow.use to pick the device matrix that
    MatViennaCLCopyToGPU() actually built (compressed_mat vs mat). When zz
    aliases xx or yy, an explicit temporary holds A*xx so the product does
    not overwrite an operand mid-computation.
*/
PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode       ierr;
  Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
  const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
  ViennaCLVector       *zgpu=NULL;

  PetscFunctionBegin;
  if (A->rmap->n > 0 && A->cmap->n > 0) { /* skip empty matrices: some OpenCL SDKs reject zero-sized buffers */
    try {
      ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
      ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
      ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);

      if (a->compressedrow.use) {
        /* fresh temporary each call; the preallocated tempvec is only used on the CSR path below */
        ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
        *zgpu = *ygpu + temp;
        ViennaCLWaitForGPU();
      } else {
        if (zz == xx || zz == yy) { //temporary required
          ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
          *zgpu = *ygpu;
          *zgpu += temp;
          ViennaCLWaitForGPU();
        } else {
          /* no aliasing: reuse tempvec allocated in MatViennaCLCopyToGPU to avoid a per-call device allocation */
          *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
          *zgpu = *ygpu + *viennaclstruct->tempvec;
          ViennaCLWaitForGPU();
        }
      }

      ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
      ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
      ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);

    } catch(std::exception const & ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
    }
    ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
275 
276 #undef __FUNCT__
277 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
278 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
279 {
280   PetscErrorCode ierr;
281 
282   PetscFunctionBegin;
283   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
284   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
285   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
286   A->ops->mult    = MatMult_SeqAIJViennaCL;
287   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
288   PetscFunctionReturn(0);
289 }
290 
291 /* --------------------------------------------------------------------------------*/
292 #undef __FUNCT__
293 #define __FUNCT__ "MatCreateSeqAIJViennaCL"
294 /*@
295    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
296    (the default parallel PETSc format).  This matrix will ultimately be pushed down
297    to GPUs and use the ViennaCL library for calculations. For good matrix
298    assembly performance the user should preallocate the matrix storage by setting
299    the parameter nz (or the array nnz).  By setting these parameters accurately,
300    performance during matrix assembly can be increased substantially.
301 
302 
303    Collective on MPI_Comm
304 
305    Input Parameters:
306 +  comm - MPI communicator, set to PETSC_COMM_SELF
307 .  m - number of rows
308 .  n - number of columns
309 .  nz - number of nonzeros per row (same for all rows)
310 -  nnz - array containing the number of nonzeros in the various rows
311          (possibly different for each row) or NULL
312 
313    Output Parameter:
314 .  A - the matrix
315 
316    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
317    MatXXXXSetPreallocation() paradigm instead of this routine directly.
318    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
319 
320    Notes:
321    If nnz is given then nz is ignored
322 
323    The AIJ format (also called the Yale sparse matrix format or
324    compressed row storage), is fully compatible with standard Fortran 77
325    storage.  That is, the stored row and column indices can begin at
326    either one (as in Fortran) or zero.  See the users' manual for details.
327 
328    Specify the preallocated storage with either nz or nnz (not both).
329    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
330    allocation.  For large problems you MUST preallocate memory or you
331    will get TERRIBLE performance, see the users' manual chapter on matrices.
332 
333    Level: intermediate
334 
335 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
336 
337 @*/
PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
  /* Call the SeqAIJ preallocation routine directly since the type is already fixed above */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
349 
350 
351 #undef __FUNCT__
352 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
353 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
354 {
355   PetscErrorCode ierr;
356   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
357 
358   PetscFunctionBegin;
359   try {
360     if (viennaclcontainer) {
361       delete viennaclcontainer->tempvec;
362       delete viennaclcontainer->mat;
363       delete viennaclcontainer->compressed_mat;
364       delete viennaclcontainer;
365     }
366     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
367   } catch(std::exception const & ex) {
368     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
369   }
370   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
371   A->spptr = 0;
372   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
373   PetscFunctionReturn(0);
374 }
375 
376 
377 #undef __FUNCT__
378 #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
379 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
380 {
381   PetscErrorCode ierr;
382   Mat_SeqAIJ     *aij;
383 
384   PetscFunctionBegin;
385   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
386   aij             = (Mat_SeqAIJ*)B->data;
387   aij->inode.use  = PETSC_FALSE;
388   B->ops->mult    = MatMult_SeqAIJViennaCL;
389   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
390   B->spptr        = new Mat_SeqAIJViennaCL();
391 
392   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
393   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
394   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
395 
396   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
397   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
398   B->ops->getvecs        = MatCreateVecs_SeqAIJViennaCL;
399 
400   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
401 
402   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
403   PetscFunctionReturn(0);
404 }
405 
406 
407 /*M
408    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
409 
410    A matrix type whose data resides on GPUs. These matrices are in CSR format by
411    default. All matrix calculations are performed using the ViennaCL library.
412 
413    Options Database Keys:
414 +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
415 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
416 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
417 
418   Level: beginner
419 
420 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
421 M*/
422 
423