xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision f0a7718cfdcbca79bb7cc7b37cdbc4ee9b2ffab8)
1 
2 
3 /*
4     Defines the basic matrix operations for the AIJ (compressed row)
5   matrix storage format.
6 */
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <petscbt.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 
14 #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
15 
16 
17 #include <algorithm>
18 #include <vector>
19 #include <string>
20 
21 #include "viennacl/linalg/prod.hpp"
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatViennaCLCopyToGPU"
25 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26 {
27 
28   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30   PetscErrorCode     ierr;
31 
32 
33   PetscFunctionBegin;
34   if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
35     if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
36       ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
37 
38       try {
39         if (a->compressedrow.use) {
40           if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
41 
42           // Since PetscInt is different from cl_uint, we have to convert:
43           viennacl::backend::mem_handle dummy;
44 
45           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
46           for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
47             row_buffer.set(i, (a->compressedrow.i)[i]);
48 
49           viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
50           for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
51             row_indices.set(i, (a->compressedrow.rindex)[i]);
52 
53           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
54           for (PetscInt i=0; i<a->nz; ++i)
55             col_buffer.set(i, (a->j)[i]);
56 
57           viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
58         } else {
59           if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
60 
61           // Since PetscInt is in general different from cl_uint, we have to convert:
62           viennacl::backend::mem_handle dummy;
63 
64           viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
65           for (PetscInt i=0; i<=A->rmap->n; ++i)
66             row_buffer.set(i, (a->i)[i]);
67 
68           viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
69           for (PetscInt i=0; i<a->nz; ++i)
70             col_buffer.set(i, (a->j)[i]);
71 
72           viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
73         }
74         ViennaCLWaitForGPU();
75       } catch(std::exception const & ex) {
76         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
77       }
78 
79       // Create temporary vector for v += A*x:
80       if (viennaclstruct->tempvec) {
81         if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
82           delete (ViennaCLVector*)viennaclstruct->tempvec;
83           viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
84         } else {
85           viennaclstruct->tempvec->clear();
86         }
87       } else {
88         viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
89       }
90 
91       A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
92 
93       ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
94     }
95   }
96   PetscFunctionReturn(0);
97 }
98 
99 #undef __FUNCT__
100 #define __FUNCT__ "MatViennaCLCopyFromGPU"
101 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
102 {
103   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
104   PetscInt           m               = A->rmap->n;
105   PetscErrorCode     ierr;
106 
107 
108   PetscFunctionBegin;
109   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
110     try {
111       if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
112       else {
113 
114         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
115         a->nz           = Agpu->nnz();
116         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
117         A->preallocated = PETSC_TRUE;
118         if (a->singlemalloc) {
119           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
120         } else {
121           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
122           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
123           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
124         }
125         ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
126         ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
127 
128         a->singlemalloc = PETSC_TRUE;
129 
130         /* Setup row lengths */
131         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
132         ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr);
133         ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
134 
135         /* Copy data back from GPU */
136         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
137 
138         // copy row array
139         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
140         (a->i)[0] = row_buffer[0];
141         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
142           (a->i)[i+1] = row_buffer[i+1];
143           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
144         }
145 
146         // copy column indices
147         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
148         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
149         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
150           (a->j)[i] = col_buffer[i];
151 
152         // copy nonzero entries directly to destination (no conversion required)
153         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
154 
155         ViennaCLWaitForGPU();
156         /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
157       }
158     } catch(std::exception const & ex) {
159       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
160     }
161 
162     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
163     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
164     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
165 
166     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
167   } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
168   PetscFunctionReturn(0);
169 }
170 
171 #undef __FUNCT__
172 #define __FUNCT__ "MatCreateVecs_SeqAIJViennaCL"
173 PetscErrorCode MatCreateVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
174 {
175   PetscErrorCode ierr;
176   PetscInt rbs,cbs;
177 
178   PetscFunctionBegin;
179   ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
180   if (right) {
181     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
182     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
183     ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
184     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
185     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
186   }
187   if (left) {
188     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
189     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
190     ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
191     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
192     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
193   }
194   PetscFunctionReturn(0);
195 }
196 
197 #undef __FUNCT__
198 #define __FUNCT__ "MatMult_SeqAIJViennaCL"
199 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
200 {
201   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
202   PetscErrorCode       ierr;
203   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
204   const ViennaCLVector *xgpu=NULL;
205   ViennaCLVector       *ygpu=NULL;
206 
207   PetscFunctionBegin;
208   if (A->rmap->n > 0 && A->cmap->n > 0) {
209     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
210     ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
211     try {
212       *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
213       ViennaCLWaitForGPU();
214     } catch (std::exception const & ex) {
215       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
216     }
217     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
218     ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
219     ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
220   }
221   PetscFunctionReturn(0);
222 }
223 
224 
225 
226 #undef __FUNCT__
227 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
228 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
229 {
230   Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
231   PetscErrorCode       ierr;
232   Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
233   const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
234   ViennaCLVector       *zgpu=NULL;
235 
236   PetscFunctionBegin;
237   if (A->rmap->n > 0 && A->cmap->n > 0) {
238     try {
239       ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
240       ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
241       ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
242 
243       if (a->compressedrow.use) {
244         ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
245         *zgpu = *ygpu + temp;
246         ViennaCLWaitForGPU();
247       } else {
248         if (zz == xx || zz == yy) { //temporary required
249           ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
250           *zgpu = *ygpu;
251           *zgpu += temp;
252           ViennaCLWaitForGPU();
253         } else {
254           *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
255           *zgpu = *ygpu + *viennaclstruct->tempvec;
256           ViennaCLWaitForGPU();
257         }
258       }
259 
260       ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
261       ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
262       ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
263 
264     } catch(std::exception const & ex) {
265       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
266     }
267     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
268   }
269   PetscFunctionReturn(0);
270 }
271 
272 #undef __FUNCT__
273 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
274 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
275 {
276   PetscErrorCode ierr;
277 
278   PetscFunctionBegin;
279   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
280   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
281   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
282   A->ops->mult    = MatMult_SeqAIJViennaCL;
283   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
284   PetscFunctionReturn(0);
285 }
286 
287 /* --------------------------------------------------------------------------------*/
288 #undef __FUNCT__
289 #define __FUNCT__ "MatCreateSeqAIJViennaCL"
290 /*@
291    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
292    (the default parallel PETSc format).  This matrix will ultimately be pushed down
293    to GPUs and use the ViennaCL library for calculations. For good matrix
294    assembly performance the user should preallocate the matrix storage by setting
295    the parameter nz (or the array nnz).  By setting these parameters accurately,
296    performance during matrix assembly can be increased substantially.
297 
298 
299    Collective on MPI_Comm
300 
301    Input Parameters:
302 +  comm - MPI communicator, set to PETSC_COMM_SELF
303 .  m - number of rows
304 .  n - number of columns
305 .  nz - number of nonzeros per row (same for all rows)
306 -  nnz - array containing the number of nonzeros in the various rows
307          (possibly different for each row) or NULL
308 
309    Output Parameter:
310 .  A - the matrix
311 
312    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
313    MatXXXXSetPreallocation() paradigm instead of this routine directly.
314    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
315 
316    Notes:
317    If nnz is given then nz is ignored
318 
319    The AIJ format (also called the Yale sparse matrix format or
320    compressed row storage), is fully compatible with standard Fortran 77
321    storage.  That is, the stored row and column indices can begin at
322    either one (as in Fortran) or zero.  See the users' manual for details.
323 
324    Specify the preallocated storage with either nz or nnz (not both).
325    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
326    allocation.  For large problems you MUST preallocate memory or you
327    will get TERRIBLE performance, see the users' manual chapter on matrices.
328 
329    Level: intermediate
330 
331 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
332 
333 @*/
334 PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
335 {
336   PetscErrorCode ierr;
337 
338   PetscFunctionBegin;
339   ierr = MatCreate(comm,A);CHKERRQ(ierr);
340   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
341   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
342   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
343   PetscFunctionReturn(0);
344 }
345 
346 
347 #undef __FUNCT__
348 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
349 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
350 {
351   PetscErrorCode ierr;
352   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
353 
354   PetscFunctionBegin;
355   try {
356     if (viennaclcontainer) {
357       delete viennaclcontainer->tempvec;
358       delete viennaclcontainer->mat;
359       delete viennaclcontainer->compressed_mat;
360       delete viennaclcontainer;
361     }
362     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
363   } catch(std::exception const & ex) {
364     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
365   }
366   /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
367   A->spptr = 0;
368   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
369   PetscFunctionReturn(0);
370 }
371 
372 
373 #undef __FUNCT__
374 #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
375 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
376 {
377   PetscErrorCode ierr;
378   Mat_SeqAIJ     *aij;
379 
380   PetscFunctionBegin;
381   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
382   aij             = (Mat_SeqAIJ*)B->data;
383   aij->inode.use  = PETSC_FALSE;
384   B->ops->mult    = MatMult_SeqAIJViennaCL;
385   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
386   B->spptr        = new Mat_SeqAIJViennaCL();
387 
388   ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
389   ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
390   ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
391 
392   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
393   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
394   B->ops->getvecs        = MatCreateVecs_SeqAIJViennaCL;
395 
396   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
397 
398   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
399   PetscFunctionReturn(0);
400 }
401 
402 
403 /*M
404    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
405 
406    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
407    default. All matrix calculations are performed using the ViennaCL library.
408 
409    Options Database Keys:
410 +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
411 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
412 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
413 
414   Level: beginner
415 
416 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
417 M*/
418 
419