xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision d96e85fed5c8182a229394a8f92211a7d60de233)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
18 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
19   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
20   struct matrix_descr descr;
21 #endif
22 } Mat_SeqAIJMKL;
23 
24 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
25 
26 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
27 {
28   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
29   /* so we will ignore 'MatType type'. */
30   PetscErrorCode ierr;
31   Mat            B       = *newmat;
32   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
33 
34   PetscFunctionBegin;
35   if (reuse == MAT_INITIAL_MATRIX) {
36     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
37     aijmkl = (Mat_SeqAIJMKL*)B->spptr;
38   }
39 
40   /* Reset the original function pointers. */
41   B->ops->duplicate        = MatDuplicate_SeqAIJ;
42   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
43   B->ops->destroy          = MatDestroy_SeqAIJ;
44   B->ops->mult             = MatMult_SeqAIJ;
45   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
46   B->ops->multadd          = MatMultAdd_SeqAIJ;
47   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
48   B->ops->scale            = MatScale_SeqAIJ;
49   B->ops->diagonalscale    = MatDiagonalScale_SeqAIJ;
50   B->ops->diagonalset      = MatDiagonalSet_SeqAIJ;
51   B->ops->axpy             = MatAXPY_SeqAIJ;
52 
53   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
54   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
55   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
56   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
57 
58   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
59    * simply involves destroying the MKL sparse matrix handle and then freeing
60    * the spptr pointer. */
61 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
62   if (aijmkl->sparse_optimized) {
63     sparse_status_t stat;
64     stat = mkl_sparse_destroy(aijmkl->csrA);
65     if (stat != SPARSE_STATUS_SUCCESS) {
66       PetscFunctionReturn(PETSC_ERR_LIB);
67     }
68   }
69 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
70   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
71 
72   /* Change the type of B to MATSEQAIJ. */
73   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
74 
75   *newmat = B;
76   PetscFunctionReturn(0);
77 }
78 
79 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
80 {
81   PetscErrorCode ierr;
82   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
83 
84   PetscFunctionBegin;
85 
86   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
87    * spptr pointer. */
88   if (aijmkl) {
89     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
90 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
91     if (aijmkl->sparse_optimized) {
92       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
93       stat = mkl_sparse_destroy(aijmkl->csrA);
94       if (stat != SPARSE_STATUS_SUCCESS) {
95         PetscFunctionReturn(PETSC_ERR_LIB);
96       }
97     }
98 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
99     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
100   }
101 
102   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
103    * to destroy everything that remains. */
104   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
105   /* Note that I don't call MatSetType().  I believe this is because that
106    * is only to be called when *building* a matrix.  I could be wrong, but
107    * that is how things work for the SuperLU matrix class. */
108   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
109   PetscFunctionReturn(0);
110 }
111 
112 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
113 {
114   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
115   Mat_SeqAIJMKL   *aijmkl;
116   PetscInt        m,n;
117   MatScalar       *aa;
118   PetscInt        *aj,*ai;
119 
120 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
121   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
122    * does nothing. We make it callable anyway in this case because it cuts
123    * down on littering the code with #ifdefs. */
124   PetscFunctionReturn(0);
125 #else
126 
127   sparse_status_t stat;
128 
129   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
130 
131   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
132 
133   if (aijmkl->sparse_optimized) {
134     /* Matrix has been previously assembled and optimized. Must destroy old
135      * matrix handle before running the optimization step again. */
136     stat = mkl_sparse_destroy(aijmkl->csrA);
137     if (stat != SPARSE_STATUS_SUCCESS) {
138       PetscFunctionReturn(PETSC_ERR_LIB);
139     }
140   }
141   aijmkl->sparse_optimized = PETSC_FALSE;
142 
143   /* Now perform the SpMV2 setup and matrix optimization. */
144   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
145   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
146   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
147   m = A->rmap->n;
148   n = A->cmap->n;
149   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
150   aa   = a->a;  /* Nonzero elements stored row-by-row. */
151   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
152   if (a->nz) {
153     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
154      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
155     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
156     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
157     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
158     stat = mkl_sparse_optimize(aijmkl->csrA);
159     if (stat != SPARSE_STATUS_SUCCESS) {
160       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
161       PetscFunctionReturn(PETSC_ERR_LIB);
162     }
163     aijmkl->sparse_optimized = PETSC_TRUE;
164   }
165 
166   PetscFunctionReturn(0);
167 #endif
168 }
169 
170 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
171 {
172   PetscErrorCode ierr;
173   Mat_SeqAIJMKL *aijmkl;
174   Mat_SeqAIJMKL *aijmkl_dest;
175 
176   PetscFunctionBegin;
177   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
178   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
179   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
180   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
181   aijmkl_dest->sparse_optimized = PETSC_FALSE;
182   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
183   PetscFunctionReturn(0);
184 }
185 
186 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
187 {
188   PetscErrorCode  ierr;
189   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
190 
191   PetscFunctionBegin;
192   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
193 
194   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
195    * extra information and some different methods, call the AssemblyEnd
196    * routine for a MATSEQAIJ.
197    * I'm not sure if this is the best way to do this, but it avoids
198    * a lot of code duplication. */
199   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
200   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
201 
202   /* Now create the MKL sparse handle (if needed; the function checks). */
203   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
204 
205   PetscFunctionReturn(0);
206 }
207 
208 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
209 {
210   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
211   const PetscScalar *x;
212   PetscScalar       *y;
213   const MatScalar   *aa;
214   PetscErrorCode    ierr;
215   PetscInt          m=A->rmap->n;
216   PetscInt          n=A->cmap->n;
217   PetscScalar       alpha = 1.0;
218   PetscScalar       beta = 0.0;
219   const PetscInt    *aj,*ai;
220   char              matdescra[6];
221 
222 
223   /* Variables not in MatMult_SeqAIJ. */
224   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
225 
226   PetscFunctionBegin;
227   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
228   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
229   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
230   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
231   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
232   aa   = a->a;  /* Nonzero elements stored row-by-row. */
233   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
234 
235   /* Call MKL sparse BLAS routine to do the MatMult. */
236   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
237 
238   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
239   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
240   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
241   PetscFunctionReturn(0);
242 }
243 
244 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
245 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
246 {
247   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
248   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
249   const PetscScalar *x;
250   PetscScalar       *y;
251   PetscErrorCode    ierr;
252   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
253 
254   PetscFunctionBegin;
255 
256   /* If there are no nonzero entries, this is a no-op: return immediately. */
257   if(!a->nz) PetscFunctionReturn(0);
258 
259   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
260   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
261 
262   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
263    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
264    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
265   if (!aijmkl->sparse_optimized) {
266     MatSeqAIJMKL_create_mkl_handle(A);
267   }
268 
269   /* Call MKL SpMV2 executor routine to do the MatMult. */
270   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
271 
272   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
273   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
274   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
275   if (stat != SPARSE_STATUS_SUCCESS) {
276     PetscFunctionReturn(PETSC_ERR_LIB);
277   }
278   PetscFunctionReturn(0);
279 }
280 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
281 
282 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
283 {
284   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
285   const PetscScalar *x;
286   PetscScalar       *y;
287   const MatScalar   *aa;
288   PetscErrorCode    ierr;
289   PetscInt          m=A->rmap->n;
290   PetscInt          n=A->cmap->n;
291   PetscScalar       alpha = 1.0;
292   PetscScalar       beta = 0.0;
293   const PetscInt    *aj,*ai;
294   char              matdescra[6];
295 
296   /* Variables not in MatMultTranspose_SeqAIJ. */
297   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
298 
299   PetscFunctionBegin;
300   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
301   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
302   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
303   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
304   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
305   aa   = a->a;  /* Nonzero elements stored row-by-row. */
306   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
307 
308   /* Call MKL sparse BLAS routine to do the MatMult. */
309   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
310 
311   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
312   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
313   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
314   PetscFunctionReturn(0);
315 }
316 
317 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
318 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
319 {
320   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
321   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
322   const PetscScalar *x;
323   PetscScalar       *y;
324   PetscErrorCode    ierr;
325   sparse_status_t   stat;
326 
327   PetscFunctionBegin;
328 
329   /* If there are no nonzero entries, this is a no-op: return immediately. */
330   if(!a->nz) PetscFunctionReturn(0);
331 
332   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
333   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
334 
335   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
336    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
337    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
338   if (!aijmkl->sparse_optimized) {
339     MatSeqAIJMKL_create_mkl_handle(A);
340   }
341 
342   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
343   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
344 
345   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
346   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
347   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
348   if (stat != SPARSE_STATUS_SUCCESS) {
349     PetscFunctionReturn(PETSC_ERR_LIB);
350   }
351   PetscFunctionReturn(0);
352 }
353 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
354 
355 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
356 {
357   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
358   const PetscScalar *x;
359   PetscScalar       *y,*z;
360   const MatScalar   *aa;
361   PetscErrorCode    ierr;
362   PetscInt          m=A->rmap->n;
363   PetscInt          n=A->cmap->n;
364   const PetscInt    *aj,*ai;
365   PetscInt          i;
366 
367   /* Variables not in MatMultAdd_SeqAIJ. */
368   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
369   PetscScalar       alpha = 1.0;
370   PetscScalar       beta;
371   char              matdescra[6];
372 
373   PetscFunctionBegin;
374   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
375   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
376 
377   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
378   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
379   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
380   aa   = a->a;  /* Nonzero elements stored row-by-row. */
381   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
382 
383   /* Call MKL sparse BLAS routine to do the MatMult. */
384   if (zz == yy) {
385     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
386     beta = 1.0;
387     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
388   } else {
389     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
390      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
391     beta = 0.0;
392     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
393     for (i=0; i<m; i++) {
394       z[i] += y[i];
395     }
396   }
397 
398   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
399   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
400   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
401   PetscFunctionReturn(0);
402 }
403 
404 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
405 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
406 {
407   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
408   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
409   const PetscScalar *x;
410   PetscScalar       *y,*z;
411   PetscErrorCode    ierr;
412   PetscInt          m=A->rmap->n;
413   PetscInt          i;
414 
415   /* Variables not in MatMultAdd_SeqAIJ. */
416   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
417 
418   PetscFunctionBegin;
419 
420   /* If there are no nonzero entries, this is a no-op: return immediately. */
421   if(!a->nz) PetscFunctionReturn(0);
422 
423   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
424   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
425 
426   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
427    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
428    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
429   if (!aijmkl->sparse_optimized) {
430     MatSeqAIJMKL_create_mkl_handle(A);
431   }
432 
433   /* Call MKL sparse BLAS routine to do the MatMult. */
434   if (zz == yy) {
435     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
436      * with alpha and beta both set to 1.0. */
437     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
438   } else {
439     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
440      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
441     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
442     for (i=0; i<m; i++) {
443       z[i] += y[i];
444     }
445   }
446 
447   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
448   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
449   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
450   if (stat != SPARSE_STATUS_SUCCESS) {
451     PetscFunctionReturn(PETSC_ERR_LIB);
452   }
453   PetscFunctionReturn(0);
454 }
455 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
456 
457 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
458 {
459   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
460   const PetscScalar *x;
461   PetscScalar       *y,*z;
462   const MatScalar   *aa;
463   PetscErrorCode    ierr;
464   PetscInt          m=A->rmap->n;
465   PetscInt          n=A->cmap->n;
466   const PetscInt    *aj,*ai;
467   PetscInt          i;
468 
469   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
470   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
471   PetscScalar       alpha = 1.0;
472   PetscScalar       beta;
473   char              matdescra[6];
474 
475   PetscFunctionBegin;
476   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
477   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
478 
479   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
480   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
481   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
482   aa   = a->a;  /* Nonzero elements stored row-by-row. */
483   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
484 
485   /* Call MKL sparse BLAS routine to do the MatMult. */
486   if (zz == yy) {
487     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
488     beta = 1.0;
489     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
490   } else {
491     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
492      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
493     beta = 0.0;
494     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
495     for (i=0; i<n; i++) {
496       z[i] += y[i];
497     }
498   }
499 
500   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
501   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
502   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
503   PetscFunctionReturn(0);
504 }
505 
506 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
507 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
508 {
509   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
510   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
511   const PetscScalar *x;
512   PetscScalar       *y,*z;
513   PetscErrorCode    ierr;
514   PetscInt          n=A->cmap->n;
515   PetscInt          i;
516 
517   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
518   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
519 
520   PetscFunctionBegin;
521 
522   /* If there are no nonzero entries, this is a no-op: return immediately. */
523   if(!a->nz) PetscFunctionReturn(0);
524 
525   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
526   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
527 
528   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
529    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
530    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
531   if (!aijmkl->sparse_optimized) {
532     MatSeqAIJMKL_create_mkl_handle(A);
533   }
534 
535   /* Call MKL sparse BLAS routine to do the MatMult. */
536   if (zz == yy) {
537     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
538      * with alpha and beta both set to 1.0. */
539     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
540   } else {
541     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
542      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
543     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
544     for (i=0; i<n; i++) {
545       z[i] += y[i];
546     }
547   }
548 
549   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
550   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
551   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
552   if (stat != SPARSE_STATUS_SUCCESS) {
553     PetscFunctionReturn(PETSC_ERR_LIB);
554   }
555   PetscFunctionReturn(0);
556 }
557 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
558 
559 PetscErrorCode MatScale_SeqAIJMKL(Mat inA,PetscScalar alpha)
560 {
561   PetscErrorCode ierr;
562 
563   PetscFunctionBegin;
564   ierr = MatScale_SeqAIJ(inA,alpha);CHKERRQ(ierr);
565   ierr = MatSeqAIJMKL_create_mkl_handle(inA);CHKERRQ(ierr);
566   PetscFunctionReturn(0);
567 }
568 
569 PetscErrorCode MatDiagonalScale_SeqAIJMKL(Mat A,Vec ll,Vec rr)
570 {
571   PetscErrorCode ierr;
572 
573   PetscFunctionBegin;
574   ierr = MatDiagonalScale_SeqAIJ(A,ll,rr);CHKERRQ(ierr);
575   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578 
579 PetscErrorCode MatDiagonalSet_SeqAIJMKL(Mat Y,Vec D,InsertMode is)
580 {
581   PetscErrorCode ierr;
582 
583   PetscFunctionBegin;
584   ierr = MatDiagonalSet_SeqAIJ(Y,D,is);CHKERRQ(ierr);
585   ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
586   PetscFunctionReturn(0);
587 }
588 
589 PetscErrorCode MatAXPY_SeqAIJMKL(Mat Y,PetscScalar a,Mat X,MatStructure str)
590 {
591   PetscErrorCode ierr;
592 
593   PetscFunctionBegin;
594   ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
595   if (str == SAME_NONZERO_PATTERN) {
596     /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force update of the MKL matrix handle. */
597     ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
598   }
599   PetscFunctionReturn(0);
600 }
601 
602 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
603  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
604  * routine, but can also be used to convert an assembled SeqAIJ matrix
605  * into a SeqAIJMKL one. */
606 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
607 {
608   PetscErrorCode ierr;
609   Mat            B = *newmat;
610   Mat_SeqAIJMKL  *aijmkl;
611   PetscBool      set;
612   PetscBool      sametype;
613 
614   PetscFunctionBegin;
615   if (reuse == MAT_INITIAL_MATRIX) {
616     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
617   }
618 
619   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
620   if (sametype) PetscFunctionReturn(0);
621 
622   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
623   B->spptr = (void*) aijmkl;
624 
625   /* Set function pointers for methods that we inherit from AIJ but override.
626    * We also parse some command line options below, since those determine some of the methods we point to. */
627   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
628   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
629   B->ops->destroy          = MatDestroy_SeqAIJMKL;
630 
631   aijmkl->sparse_optimized = PETSC_FALSE;
632 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
633   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
634 #elif
635   aijmkl->no_SpMV2 = PETSC_TRUE;
636 #endif
637 
638   /* Parse command line options. */
639   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
640   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
641   ierr = PetscOptionsEnd();CHKERRQ(ierr);
642 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
643   if(!aijmkl->no_SpMV2) {
644     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
645     aijmkl->no_SpMV2 = PETSC_TRUE;
646   }
647 #endif
648 
649   if(!aijmkl->no_SpMV2) {
650 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
651     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
652     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
653     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
654     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
655 #endif
656   } else {
657     B->ops->mult             = MatMult_SeqAIJMKL;
658     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
659     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
660     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
661   }
662 
663   B->ops->scale              = MatScale_SeqAIJMKL;
664   B->ops->diagonalscale      = MatDiagonalScale_SeqAIJMKL;
665   B->ops->diagonalset        = MatDiagonalSet_SeqAIJMKL;
666   B->ops->axpy               = MatAXPY_SeqAIJMKL;
667 
668   ierr = PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqAIJMKL_C",MatScale_SeqAIJMKL);CHKERRQ(ierr);
669   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
670   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
671   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
672   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
673 
674   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
675   *newmat = B;
676   PetscFunctionReturn(0);
677 }
678 
679 /*@C
680    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
681    This type inherits from AIJ and is largely identical, but uses sparse BLAS
682    routines from Intel MKL whenever possible.
683    MatMult, MatMultAdd, MatMultTranspose, and MatMultTransposeAdd
684    operations are currently supported.
685    If the installed version of MKL supports the "SpMV2" sparse
686    inspector-executor routines, then those are used by default.
687 
688    Collective on MPI_Comm
689 
690    Input Parameters:
691 +  comm - MPI communicator, set to PETSC_COMM_SELF
692 .  m - number of rows
693 .  n - number of columns
694 .  nz - number of nonzeros per row (same for all rows)
695 -  nnz - array containing the number of nonzeros in the various rows
696          (possibly different for each row) or NULL
697 
698    Output Parameter:
699 .  A - the matrix
700 
701    Options Database Keys:
702 .  -mat_aijmkl_no_spmv2 - disables use of the SpMV2 inspector-executor routines
703 
704    Notes:
705    If nnz is given then nz is ignored
706 
707    Level: intermediate
708 
709 .keywords: matrix, MKL, sparse, parallel
710 
711 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
712 @*/
713 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
714 {
715   PetscErrorCode ierr;
716 
717   PetscFunctionBegin;
718   ierr = MatCreate(comm,A);CHKERRQ(ierr);
719   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
720   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
721   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
722   PetscFunctionReturn(0);
723 }
724 
725 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
726 {
727   PetscErrorCode ierr;
728 
729   PetscFunctionBegin;
730   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
731   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
732   PetscFunctionReturn(0);
733 }
734