xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision dc424cfae46ffd055acbb99fbc61fc85fc92f9ad)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool eager_inspection; /* If PETSC_TRUE, then call mkl_sparse_optimize() in MatDuplicate()/MatAssemblyEnd(). */
18   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
19 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
20   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
21   struct matrix_descr descr;
22 #endif
23 } Mat_SeqAIJMKL;
24 
25 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
26 
27 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
28 {
29   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
30   /* so we will ignore 'MatType type'. */
31   PetscErrorCode ierr;
32   Mat            B       = *newmat;
33 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
34   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
35 #endif
36 
37   PetscFunctionBegin;
38   if (reuse == MAT_INITIAL_MATRIX) {
39     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
40   }
41 
42   /* Reset the original function pointers. */
43   B->ops->duplicate        = MatDuplicate_SeqAIJ;
44   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
45   B->ops->destroy          = MatDestroy_SeqAIJ;
46   B->ops->mult             = MatMult_SeqAIJ;
47   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
48   B->ops->multadd          = MatMultAdd_SeqAIJ;
49   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
50   B->ops->scale            = MatScale_SeqAIJ;
51   B->ops->diagonalscale    = MatDiagonalScale_SeqAIJ;
52   B->ops->diagonalset      = MatDiagonalSet_SeqAIJ;
53   B->ops->axpy             = MatAXPY_SeqAIJ;
54 
55   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
56   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
57   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
58   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
59 
60   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
61    * simply involves destroying the MKL sparse matrix handle and then freeing
62    * the spptr pointer. */
63 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
64   if (reuse == MAT_INITIAL_MATRIX) aijmkl = (Mat_SeqAIJMKL*)B->spptr;
65 
66   if (aijmkl->sparse_optimized) {
67     sparse_status_t stat;
68     stat = mkl_sparse_destroy(aijmkl->csrA);
69     if (stat != SPARSE_STATUS_SUCCESS) {
70       PetscFunctionReturn(PETSC_ERR_LIB);
71     }
72   }
73 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
74   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
75 
76   /* Change the type of B to MATSEQAIJ. */
77   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
78 
79   *newmat = B;
80   PetscFunctionReturn(0);
81 }
82 
83 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
84 {
85   PetscErrorCode ierr;
86   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
87 
88   PetscFunctionBegin;
89 
90   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
91    * spptr pointer. */
92   if (aijmkl) {
93     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
94 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
95     if (aijmkl->sparse_optimized) {
96       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
97       stat = mkl_sparse_destroy(aijmkl->csrA);
98       if (stat != SPARSE_STATUS_SUCCESS) {
99         PetscFunctionReturn(PETSC_ERR_LIB);
100       }
101     }
102 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
103     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
104   }
105 
106   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
107    * to destroy everything that remains. */
108   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
109   /* Note that I don't call MatSetType().  I believe this is because that
110    * is only to be called when *building* a matrix.  I could be wrong, but
111    * that is how things work for the SuperLU matrix class. */
112   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
113   PetscFunctionReturn(0);
114 }
115 
116 /* MatSeqAIJKL_create_mkl_handle(), if called with an AIJMKL matrix that has not had mkl_sparse_optimize() called for it,
117  * creates an MKL sparse matrix handle from the AIJ arrays and calls mkl_sparse_optimize().
118  * If called with an AIJMKL matrix for which aijmkl->sparse_optimized == PETSC_TRUE, then it destroys the old matrix
119  * handle, creates a new one, and then calls mkl_sparse_optimize().
120  * Although in normal MKL usage it is possible to have a valid matrix handle on which mkl_sparse_optimize() has not been
121  * called, for AIJMKL the handle creation and optimization step always occur together, so we don't handle the case of
122  * an unoptimized matrix handle here. */
123 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
124 {
125 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
126   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
127    * does nothing. We make it callable anyway in this case because it cuts
128    * down on littering the code with #ifdefs. */
129   PetscFunctionReturn(0);
130 #else
131   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
132   Mat_SeqAIJMKL   *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
133   PetscInt        m,n;
134   MatScalar       *aa;
135   PetscInt        *aj,*ai;
136   sparse_status_t stat;
137 
138   PetscFunctionBegin;
139   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
140 
141   if (aijmkl->sparse_optimized) {
142     /* Matrix has been previously assembled and optimized. Must destroy old
143      * matrix handle before running the optimization step again. */
144     stat = mkl_sparse_destroy(aijmkl->csrA);
145     if (stat != SPARSE_STATUS_SUCCESS) {
146       PetscFunctionReturn(PETSC_ERR_LIB);
147     }
148   }
149   aijmkl->sparse_optimized = PETSC_FALSE;
150 
151   /* Now perform the SpMV2 setup and matrix optimization. */
152   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
153   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
154   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
155   m = A->rmap->n;
156   n = A->cmap->n;
157   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
158   aa   = a->a;  /* Nonzero elements stored row-by-row. */
159   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
160   if ((a->nz!=0) & !(A->structure_only)) {
161     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
162      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
163     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
164     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
165     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
166     stat = mkl_sparse_optimize(aijmkl->csrA);
167     if (stat != SPARSE_STATUS_SUCCESS) {
168       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
169       PetscFunctionReturn(PETSC_ERR_LIB);
170     }
171     aijmkl->sparse_optimized = PETSC_TRUE;
172   }
173 
174   PetscFunctionReturn(0);
175 #endif
176 }
177 
178 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
179 {
180   PetscErrorCode ierr;
181   Mat_SeqAIJMKL *aijmkl;
182   Mat_SeqAIJMKL *aijmkl_dest;
183 
184   PetscFunctionBegin;
185   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
186   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
187   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
188   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
189   aijmkl_dest->sparse_optimized = PETSC_FALSE;
190   if (aijmkl->eager_inspection) {
191     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
192   }
193   PetscFunctionReturn(0);
194 }
195 
196 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
197 {
198   PetscErrorCode  ierr;
199   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
200   Mat_SeqAIJMKL *aijmkl;
201 
202   PetscFunctionBegin;
203   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
204 
205   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
206    * extra information and some different methods, call the AssemblyEnd
207    * routine for a MATSEQAIJ.
208    * I'm not sure if this is the best way to do this, but it avoids
209    * a lot of code duplication. */
210   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
211   ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
212 
213   /* If the user has requested "eager" inspection, create the optimized MKL sparse handle (if needed; the function checks).
214    * (The default is to do "lazy" inspection, deferring this until something like MatMult() is called.) */
215   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
216   if (aijmkl->eager_inspection) {
217     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
218   }
219 
220   PetscFunctionReturn(0);
221 }
222 
223 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
224 {
225   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
226   const PetscScalar *x;
227   PetscScalar       *y;
228   const MatScalar   *aa;
229   PetscErrorCode    ierr;
230   PetscInt          m=A->rmap->n;
231   PetscInt          n=A->cmap->n;
232   PetscScalar       alpha = 1.0;
233   PetscScalar       beta = 0.0;
234   const PetscInt    *aj,*ai;
235   char              matdescra[6];
236 
237 
238   /* Variables not in MatMult_SeqAIJ. */
239   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
240 
241   PetscFunctionBegin;
242   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
243   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
244   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
245   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
246   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
247   aa   = a->a;  /* Nonzero elements stored row-by-row. */
248   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
249 
250   /* Call MKL sparse BLAS routine to do the MatMult. */
251   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
252 
253   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
254   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
255   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
256   PetscFunctionReturn(0);
257 }
258 
259 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
260 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
261 {
262   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
263   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
264   const PetscScalar *x;
265   PetscScalar       *y;
266   PetscErrorCode    ierr;
267   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
268 
269   PetscFunctionBegin;
270 
271   /* If there are no nonzero entries, zero yy and return immediately. */
272   if(!a->nz) {
273     PetscInt i;
274     PetscInt m=A->rmap->n;
275     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
276     for (i=0; i<m; i++) {
277       y[i] = 0.0;
278     }
279     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
280     PetscFunctionReturn(0);
281   }
282 
283   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
284   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
285 
286   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
287    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
288    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
289   if (!aijmkl->sparse_optimized) {
290     MatSeqAIJMKL_create_mkl_handle(A);
291   }
292 
293   /* Call MKL SpMV2 executor routine to do the MatMult. */
294   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
295 
296   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
297   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
298   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
299   if (stat != SPARSE_STATUS_SUCCESS) {
300     PetscFunctionReturn(PETSC_ERR_LIB);
301   }
302   PetscFunctionReturn(0);
303 }
304 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
305 
306 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
307 {
308   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
309   const PetscScalar *x;
310   PetscScalar       *y;
311   const MatScalar   *aa;
312   PetscErrorCode    ierr;
313   PetscInt          m=A->rmap->n;
314   PetscInt          n=A->cmap->n;
315   PetscScalar       alpha = 1.0;
316   PetscScalar       beta = 0.0;
317   const PetscInt    *aj,*ai;
318   char              matdescra[6];
319 
320   /* Variables not in MatMultTranspose_SeqAIJ. */
321   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
322 
323   PetscFunctionBegin;
324   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
325   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
326   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
327   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
328   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
329   aa   = a->a;  /* Nonzero elements stored row-by-row. */
330   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
331 
332   /* Call MKL sparse BLAS routine to do the MatMult. */
333   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
334 
335   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
336   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
337   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
338   PetscFunctionReturn(0);
339 }
340 
341 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
342 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
343 {
344   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
345   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
346   const PetscScalar *x;
347   PetscScalar       *y;
348   PetscErrorCode    ierr;
349   sparse_status_t   stat;
350 
351   PetscFunctionBegin;
352 
353   /* If there are no nonzero entries, zero yy and return immediately. */
354   if(!a->nz) {
355     PetscInt i;
356     PetscInt n=A->cmap->n;
357     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
358     for (i=0; i<n; i++) {
359       y[i] = 0.0;
360     }
361     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
362     PetscFunctionReturn(0);
363   }
364 
365   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
366   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
367 
368   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
369    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
370    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
371   if (!aijmkl->sparse_optimized) {
372     MatSeqAIJMKL_create_mkl_handle(A);
373   }
374 
375   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
376   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
377 
378   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
379   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
380   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
381   if (stat != SPARSE_STATUS_SUCCESS) {
382     PetscFunctionReturn(PETSC_ERR_LIB);
383   }
384   PetscFunctionReturn(0);
385 }
386 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
387 
388 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
389 {
390   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
391   const PetscScalar *x;
392   PetscScalar       *y,*z;
393   const MatScalar   *aa;
394   PetscErrorCode    ierr;
395   PetscInt          m=A->rmap->n;
396   PetscInt          n=A->cmap->n;
397   const PetscInt    *aj,*ai;
398   PetscInt          i;
399 
400   /* Variables not in MatMultAdd_SeqAIJ. */
401   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
402   PetscScalar       alpha = 1.0;
403   PetscScalar       beta;
404   char              matdescra[6];
405 
406   PetscFunctionBegin;
407   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
408   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
409 
410   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
411   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
412   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
413   aa   = a->a;  /* Nonzero elements stored row-by-row. */
414   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
415 
416   /* Call MKL sparse BLAS routine to do the MatMult. */
417   if (zz == yy) {
418     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
419     beta = 1.0;
420     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
421   } else {
422     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
423      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
424     beta = 0.0;
425     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
426     for (i=0; i<m; i++) {
427       z[i] += y[i];
428     }
429   }
430 
431   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
432   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
433   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
434   PetscFunctionReturn(0);
435 }
436 
437 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
438 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
439 {
440   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
441   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
442   const PetscScalar *x;
443   PetscScalar       *y,*z;
444   PetscErrorCode    ierr;
445   PetscInt          m=A->rmap->n;
446   PetscInt          i;
447 
448   /* Variables not in MatMultAdd_SeqAIJ. */
449   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
450 
451   PetscFunctionBegin;
452 
453   /* If there are no nonzero entries, set zz = yy and return immediately. */
454   if(!a->nz) {
455     PetscInt i;
456     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
457     for (i=0; i<m; i++) {
458       z[i] = y[i];
459     }
460     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
461     PetscFunctionReturn(0);
462   }
463 
464   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
465   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
466 
467   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
468    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
469    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
470   if (!aijmkl->sparse_optimized) {
471     MatSeqAIJMKL_create_mkl_handle(A);
472   }
473 
474   /* Call MKL sparse BLAS routine to do the MatMult. */
475   if (zz == yy) {
476     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
477      * with alpha and beta both set to 1.0. */
478     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
479   } else {
480     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
481      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
482     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
483     for (i=0; i<m; i++) {
484       z[i] += y[i];
485     }
486   }
487 
488   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
489   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
490   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
491   if (stat != SPARSE_STATUS_SUCCESS) {
492     PetscFunctionReturn(PETSC_ERR_LIB);
493   }
494   PetscFunctionReturn(0);
495 }
496 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
497 
498 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
499 {
500   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
501   const PetscScalar *x;
502   PetscScalar       *y,*z;
503   const MatScalar   *aa;
504   PetscErrorCode    ierr;
505   PetscInt          m=A->rmap->n;
506   PetscInt          n=A->cmap->n;
507   const PetscInt    *aj,*ai;
508   PetscInt          i;
509 
510   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
511   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
512   PetscScalar       alpha = 1.0;
513   PetscScalar       beta;
514   char              matdescra[6];
515 
516   PetscFunctionBegin;
517   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
518   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
519 
520   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
521   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
522   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
523   aa   = a->a;  /* Nonzero elements stored row-by-row. */
524   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
525 
526   /* Call MKL sparse BLAS routine to do the MatMult. */
527   if (zz == yy) {
528     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
529     beta = 1.0;
530     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
531   } else {
532     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
533      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
534     beta = 0.0;
535     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
536     for (i=0; i<n; i++) {
537       z[i] += y[i];
538     }
539   }
540 
541   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
542   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
543   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
544   PetscFunctionReturn(0);
545 }
546 
547 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
548 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
549 {
550   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
551   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
552   const PetscScalar *x;
553   PetscScalar       *y,*z;
554   PetscErrorCode    ierr;
555   PetscInt          n=A->cmap->n;
556   PetscInt          i;
557 
558   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
559   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
560 
561   PetscFunctionBegin;
562 
563   /* If there are no nonzero entries, set zz = yy and return immediately. */
564   if(!a->nz) {
565     PetscInt i;
566     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
567     for (i=0; i<n; i++) {
568       z[i] = y[i];
569     }
570     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
571     PetscFunctionReturn(0);
572   }
573 
574   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
575   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
576 
577   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
578    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
579    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
580   if (!aijmkl->sparse_optimized) {
581     MatSeqAIJMKL_create_mkl_handle(A);
582   }
583 
584   /* Call MKL sparse BLAS routine to do the MatMult. */
585   if (zz == yy) {
586     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
587      * with alpha and beta both set to 1.0. */
588     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
589   } else {
590     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
591      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
592     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
593     for (i=0; i<n; i++) {
594       z[i] += y[i];
595     }
596   }
597 
598   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
599   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
600   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
601   if (stat != SPARSE_STATUS_SUCCESS) {
602     PetscFunctionReturn(PETSC_ERR_LIB);
603   }
604   PetscFunctionReturn(0);
605 }
606 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
607 
608 PetscErrorCode MatScale_SeqAIJMKL(Mat inA,PetscScalar alpha)
609 {
610   PetscErrorCode ierr;
611 
612   PetscFunctionBegin;
613   ierr = MatScale_SeqAIJ(inA,alpha);CHKERRQ(ierr);
614   ierr = MatSeqAIJMKL_create_mkl_handle(inA);CHKERRQ(ierr);
615   PetscFunctionReturn(0);
616 }
617 
618 PetscErrorCode MatDiagonalScale_SeqAIJMKL(Mat A,Vec ll,Vec rr)
619 {
620   PetscErrorCode ierr;
621 
622   PetscFunctionBegin;
623   ierr = MatDiagonalScale_SeqAIJ(A,ll,rr);CHKERRQ(ierr);
624   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
625   PetscFunctionReturn(0);
626 }
627 
628 PetscErrorCode MatDiagonalSet_SeqAIJMKL(Mat Y,Vec D,InsertMode is)
629 {
630   PetscErrorCode ierr;
631 
632   PetscFunctionBegin;
633   ierr = MatDiagonalSet_SeqAIJ(Y,D,is);CHKERRQ(ierr);
634   ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
635   PetscFunctionReturn(0);
636 }
637 
638 PetscErrorCode MatAXPY_SeqAIJMKL(Mat Y,PetscScalar a,Mat X,MatStructure str)
639 {
640   PetscErrorCode ierr;
641 
642   PetscFunctionBegin;
643   ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
644   if (str == SAME_NONZERO_PATTERN) {
645     /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force update of the MKL matrix handle. */
646     ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
647   }
648   PetscFunctionReturn(0);
649 }
650 
651 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
652  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
653  * routine, but can also be used to convert an assembled SeqAIJ matrix
654  * into a SeqAIJMKL one. */
655 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
656 {
657   PetscErrorCode ierr;
658   Mat            B = *newmat;
659   Mat_SeqAIJMKL  *aijmkl;
660   PetscBool      set;
661   PetscBool      sametype;
662 
663   PetscFunctionBegin;
664   if (reuse == MAT_INITIAL_MATRIX) {
665     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
666   }
667 
668   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
669   if (sametype) PetscFunctionReturn(0);
670 
671   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
672   B->spptr = (void*) aijmkl;
673 
674   /* Set function pointers for methods that we inherit from AIJ but override.
675    * We also parse some command line options below, since those determine some of the methods we point to. */
676   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
677   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
678   B->ops->destroy          = MatDestroy_SeqAIJMKL;
679 
680   aijmkl->sparse_optimized = PETSC_FALSE;
681 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
682   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
683 #else
684   aijmkl->no_SpMV2 = PETSC_TRUE;
685 #endif
686   aijmkl->eager_inspection = PETSC_FALSE;
687 
688   /* Parse command line options. */
689   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
690   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
691   ierr = PetscOptionsBool("-mat_aijmkl_eager_inspection","Eager Inspection","None",(PetscBool)aijmkl->eager_inspection,(PetscBool*)&aijmkl->eager_inspection,&set);CHKERRQ(ierr);
692   ierr = PetscOptionsEnd();CHKERRQ(ierr);
693 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
694   if(!aijmkl->no_SpMV2) {
695     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
696     aijmkl->no_SpMV2 = PETSC_TRUE;
697   }
698 #endif
699 
700   if(!aijmkl->no_SpMV2) {
701 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
702     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
703     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
704     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
705     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
706 #endif
707   } else {
708     B->ops->mult             = MatMult_SeqAIJMKL;
709     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
710     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
711     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
712   }
713 
714   B->ops->scale              = MatScale_SeqAIJMKL;
715   B->ops->diagonalscale      = MatDiagonalScale_SeqAIJMKL;
716   B->ops->diagonalset        = MatDiagonalSet_SeqAIJMKL;
717   B->ops->axpy               = MatAXPY_SeqAIJMKL;
718 
719   ierr = PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqAIJMKL_C",MatScale_SeqAIJMKL);CHKERRQ(ierr);
720   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
721   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
722   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
723   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
724 
725   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
726   *newmat = B;
727   PetscFunctionReturn(0);
728 }
729 
730 /*@C
731    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
732    This type inherits from AIJ and is largely identical, but uses sparse BLAS
733    routines from Intel MKL whenever possible.
734    MatMult, MatMultAdd, MatMultTranspose, and MatMultTransposeAdd
735    operations are currently supported.
736    If the installed version of MKL supports the "SpMV2" sparse
737    inspector-executor routines, then those are used by default.
738 
739    Collective on MPI_Comm
740 
741    Input Parameters:
742 +  comm - MPI communicator, set to PETSC_COMM_SELF
743 .  m - number of rows
744 .  n - number of columns
745 .  nz - number of nonzeros per row (same for all rows)
746 -  nnz - array containing the number of nonzeros in the various rows
747          (possibly different for each row) or NULL
748 
749    Output Parameter:
750 .  A - the matrix
751 
752    Options Database Keys:
753 .  -mat_aijmkl_no_spmv2 - disables use of the SpMV2 inspector-executor routines
754 
755    Notes:
756    If nnz is given then nz is ignored
757 
758    Level: intermediate
759 
760 .keywords: matrix, MKL, sparse, parallel
761 
762 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
763 @*/
764 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
765 {
766   PetscErrorCode ierr;
767 
768   PetscFunctionBegin;
769   ierr = MatCreate(comm,A);CHKERRQ(ierr);
770   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
771   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
772   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
773   PetscFunctionReturn(0);
774 }
775 
776 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
777 {
778   PetscErrorCode ierr;
779 
780   PetscFunctionBegin;
781   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
782   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
783   PetscFunctionReturn(0);
784 }
785