xref: /petsc/src/mat/impls/aij/seq/seqviennacl/aijviennacl.cxx (revision 8f86e40f90bf3a380478a8a7e464cee7e32009cd)
1 
2 
3 /*
4     Defines the basic matrix operations for the AIJ (compressed row)
5   matrix storage format.
6 */
7 
8 #include "petscconf.h"
9 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
10 #include "petscbt.h"
11 #include "../src/vec/vec/impls/dvecimpl.h"
12 #include "petsc-private/vecimpl.h"
13 
14 #include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h"
15 
16 
17 #include <algorithm>
18 #include <vector>
19 #include <string>
20 
21 #include "viennacl/linalg/prod.hpp"
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "MatViennaCLCopyToGPU"
25 PetscErrorCode MatViennaCLCopyToGPU(Mat A)
26 {
27 
28   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
29   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
30   //PetscInt           m               = A->rmap->n,*ii,*ridx;
31   PetscInt           *ii;
32   PetscErrorCode     ierr;
33 
34 
35   PetscFunctionBegin;
36   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
37     ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
38 
39     try {
40       //viennaclstruct->nonzerorow=0;
41       //for (PetscInt j = 0; j<m; j++) viennaclstruct->nonzerorow += (a->i[j+1] > a->i[j]);
42 
43       viennaclstruct->mat = new ViennaCLAIJMatrix();
44       if (a->compressedrow.use) {
45         //m    = a->compressedrow.nrows;
46         ii   = a->compressedrow.i;
47         //ridx = a->compressedrow.rindex;
48 
49         viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz);
50 
51         // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications)
52 
53         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
54       } else {
55 
56         // Since PetscInt is in general different from cl_uint, we have to convert:
57         viennacl::backend::mem_handle dummy;
58 
59         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
60         for (PetscInt i=0; i<=A->rmap->n; ++i)
61           row_buffer.set(i, (a->i)[i]);
62 
63         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
64         for (PetscInt i=0; i<a->nz; ++i)
65           col_buffer.set(i, (a->j)[i]);
66 
67         viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
68       }
69     } catch(char *ex) {
70       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
71     }
72 
73     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
74 
75     ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
76   }
77   PetscFunctionReturn(0);
78 }
79 
80 #undef __FUNCT__
81 #define __FUNCT__ "MatViennaCLCopyFromGPU"
82 PetscErrorCode MatViennaCLCopyFromGPU(Mat A, ViennaCLAIJMatrix *Agpu)
83 {
84   //Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
85   Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
86   PetscInt           m               = A->rmap->n;
87   PetscErrorCode     ierr;
88 
89 
90   PetscFunctionBegin;
91   if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
92     try {
93       if (a->compressedrow.use) {
94         SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
95       } else {
96 
97         if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
98         a->nz           = Agpu->nnz();
99         a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
100         A->preallocated = PETSC_TRUE;
101         if (a->singlemalloc) {
102           if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
103         } else {
104           if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
105           if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
106           if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
107         }
108         ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
109         ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
110 
111         a->singlemalloc = PETSC_TRUE;
112 
113         /* Setup row lengths */
114         if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
115         ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
116         ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
117 
118         /* Copy data back from GPU */
119         viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
120 
121         // copy row array
122         viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
123         (a->i)[0] = row_buffer[0];
124         for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
125           (a->i)[i+1] = row_buffer[i+1];
126           a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
127         }
128 
129         // copy column indices
130         viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
131         viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
132         for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
133           (a->j)[i] = col_buffer[i];
134 
135         // copy nonzero entries directly to destination (no conversion required)
136         viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
137 
138         /* TODO: What about a->diag? */
139       }
140     } catch(char *ex) {
141       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex);
142     }
143 
144     /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
145     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
146     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
147 
148     A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
149   } else {
150     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
151   }
152   PetscFunctionReturn(0);
153 }
154 
155 #undef __FUNCT__
156 #define __FUNCT__ "MatGetVecs_SeqAIJViennaCL"
157 PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
158 {
159   PetscErrorCode ierr;
160 
161   PetscFunctionBegin;
162   if (right) {
163     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
164     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
165     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
166     ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
167     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
168   }
169   if (left) {
170     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
171     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
172     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
173     ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
174     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
175   }
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatMult_SeqAIJViennaCL"
181 PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
182 {
183   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
184   PetscErrorCode     ierr;
185   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
186   ViennaCLVector     *xgpu=NULL,*ygpu=NULL;
187 
188   PetscFunctionBegin;
189   ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
190   ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
191   try {
192     *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
193   } catch (char *ex) {
194     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
195   }
196   ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
197   ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
198   ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
199   PetscFunctionReturn(0);
200 }
201 
202 
203 
204 #undef __FUNCT__
205 #define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
206 PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
207 {
208   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
209   PetscErrorCode     ierr;
210   Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
211   ViennaCLVector     *xgpu=NULL,*ygpu=NULL,*zgpu=NULL;
212 
213   PetscFunctionBegin;
214   try {
215     ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
216     ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
217     ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
218 
219     if (a->compressedrow.use) {
220         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
221     } else {
222       if (zz == xx || zz == yy) { //temporary required
223         ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
224         *zgpu = *ygpu + temp;
225       } else {
226         *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
227         *zgpu += *ygpu;
228       }
229     }
230 
231     ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
232     ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
233     ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
234 
235   } catch(char *ex) {
236     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
237   }
238   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 #undef __FUNCT__
243 #define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
244 PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
245 {
246   PetscErrorCode ierr;
247 
248   PetscFunctionBegin;
249   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
250   ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
251   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
252   A->ops->mult    = MatMult_SeqAIJViennaCL;
253   A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
254   PetscFunctionReturn(0);
255 }
256 
257 /* --------------------------------------------------------------------------------*/
258 #undef __FUNCT__
259 #define __FUNCT__ "MatCreateSeqAIJViennaCL"
260 /*@
261    MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
262    (the default parallel PETSc format).  This matrix will ultimately pushed down
263    to GPUs and use the ViennaCL library for calculations. For good matrix
264    assembly performance the user should preallocate the matrix storage by setting
265    the parameter nz (or the array nnz).  By setting these parameters accurately,
266    performance during matrix assembly can be increased substantially.
267 
268 
269    Collective on MPI_Comm
270 
271    Input Parameters:
272 +  comm - MPI communicator, set to PETSC_COMM_SELF
273 .  m - number of rows
274 .  n - number of columns
275 .  nz - number of nonzeros per row (same for all rows)
276 -  nnz - array containing the number of nonzeros in the various rows
277          (possibly different for each row) or NULL
278 
279    Output Parameter:
280 .  A - the matrix
281 
282    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
283    MatXXXXSetPreallocation() paradigm instead of this routine directly.
284    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
285 
286    Notes:
287    If nnz is given then nz is ignored
288 
289    The AIJ format (also called the Yale sparse matrix format or
290    compressed row storage), is fully compatible with standard Fortran 77
291    storage.  That is, the stored row and column indices can begin at
292    either one (as in Fortran) or zero.  See the users' manual for details.
293 
294    Specify the preallocated storage with either nz or nnz (not both).
295    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
296    allocation.  For large problems you MUST preallocate memory or you
297    will get TERRIBLE performance, see the users' manual chapter on matrices.
298 
299    By default, this format uses inodes (identical nodes) when possible, to
300    improve numerical efficiency of matrix-vector products and solves. We
301    search for consecutive rows with the same nonzero structure, thereby
302    reusing matrix information to achieve increased efficiency.
303 
304    Level: intermediate
305 
306 .seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
307 
308 @*/
309 PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
310 {
311   PetscErrorCode ierr;
312 
313   PetscFunctionBegin;
314   ierr = MatCreate(comm,A);CHKERRQ(ierr);
315   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
316   ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
317   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
318   PetscFunctionReturn(0);
319 }
320 
321 
322 #undef __FUNCT__
323 #define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
324 PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
325 {
326   PetscErrorCode ierr;
327   Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
328 
329   PetscFunctionBegin;
330   try {
331     if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat);
332     delete viennaclcontainer;
333     A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
334   } catch(char *ex) {
335     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
336   }
337   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
338   A->spptr = 0;
339   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
340   PetscFunctionReturn(0);
341 }
342 
343 
344 #undef __FUNCT__
345 #define __FUNCT__ "MatCreate_SeqAIJViennaCL"
346 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
347 {
348   PetscErrorCode ierr;
349   Mat_SeqAIJ     *aij;
350 
351   PetscFunctionBegin;
352   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
353   aij             = (Mat_SeqAIJ*)B->data;
354   aij->inode.use  = PETSC_FALSE;
355   B->ops->mult    = MatMult_SeqAIJViennaCL;
356   B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
357   B->spptr        = new Mat_SeqAIJViennaCL();
358 
359   ((Mat_SeqAIJViennaCL*)B->spptr)->mat     = 0;
360 
361   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
362   B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
363   B->ops->getvecs        = MatGetVecs_SeqAIJViennaCL;
364 
365   //ierr = PetscObjectComposeFunction((PetscObject)B,"MatViennaCLSetFormat_C", "MatViennaCLSetFormat_SeqAIJViennaCL", MatViennaCLSetFormat_SeqAIJViennaCL);CHKERRQ(ierr);
366   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
367 
368   B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
369   PetscFunctionReturn(0);
370 }
371 
372 
373 /*M
374    MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
375 
376    A matrix type type whose data resides on GPUs. These matrices are in CSR format by
377    default. All matrix calculations are performed using the ViennaCL library.
378 
379    Options Database Keys:
380 +  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
381 .  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
382 -  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
383 
384   Level: beginner
385 
386 .seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
387 M*/
388 
389