/*
  Defines basic operations for the MATSEQAIJMKL matrix class.
  This class is derived from the MATSEQAIJ class and retains the
  compressed row storage (aka Yale sparse matrix format) but uses
  sparse BLAS operations from the Intel Math Kernel Library (MKL)
  wherever possible.
*/
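
/* Illustrative usage sketch (an assumption about typical application code, not something
 * exercised in this file): the MATSEQAIJMKL class is normally selected either with the
 * runtime option '-mat_type seqaijmkl' or programmatically, e.g.
 *
 *   Mat            A;
 *   PetscErrorCode ierr;
 *   ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
 *   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
 *   ierr = MatSetType(A,MATSEQAIJMKL);CHKERRQ(ierr);
 *   ierr = MatSeqAIJSetPreallocation(A,nz,NULL);CHKERRQ(ierr);
 *   ... insert entries with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
 *
 * Here 'm', 'n', and 'nz' are placeholders for the problem dimensions and preallocation. */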

#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>

/* MKL include files. */
#include <mkl_spblas.h>  /* Sparse BLAS */

typedef struct {
  PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
  PetscBool eager_inspection; /* If PETSC_TRUE, then call mkl_sparse_optimize() in MatDuplicate()/MatAssemblyEnd(). */
  PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
  PetscObjectState state;     /* State of the matrix when the MKL handle was last created; used to detect stale handles. */
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
  struct matrix_descr descr;
#endif
} Mat_SeqAIJMKL;

extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
{
  /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
  /* so we will ignore 'MatType type'. */
  PetscErrorCode ierr;
  Mat            B       = *newmat;
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
#endif

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
  }

  /* Reset the original function pointers. */
  B->ops->duplicate        = MatDuplicate_SeqAIJ;
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
  B->ops->destroy          = MatDestroy_SeqAIJ;
  B->ops->mult             = MatMult_SeqAIJ;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
  B->ops->multadd          = MatMultAdd_SeqAIJ;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
  B->ops->matmult          = MatMatMult_SeqAIJ_SeqAIJ;
  B->ops->transposematmult = MatTransposeMatMult_SeqAIJ_SeqAIJ;
  B->ops->scale            = MatScale_SeqAIJ;
  B->ops->diagonalscale    = MatDiagonalScale_SeqAIJ;
  B->ops->diagonalset      = MatDiagonalSet_SeqAIJ;
  B->ops->axpy             = MatAXPY_SeqAIJ;

  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  if (!aijmkl->no_SpMV2) {
    ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
  }

  /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
   * simply involves destroying the MKL sparse matrix handle and then freeing
   * the spptr pointer. */
  if (reuse == MAT_INITIAL_MATRIX) aijmkl = (Mat_SeqAIJMKL*)B->spptr;

  if (aijmkl->sparse_optimized) {
    sparse_status_t stat;
    stat = mkl_sparse_destroy(aijmkl->csrA);
    if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
  }
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
  ierr = PetscFree(B->spptr);CHKERRQ(ierr);

  /* Change the type of B to MATSEQAIJ. */
  ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);

  *newmat = B;
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJMKL  *aijmkl = (Mat_SeqAIJMKL*) A->spptr;

  PetscFunctionBegin;

  /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
   * spptr pointer. */
  if (aijmkl) {
    /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
    if (aijmkl->sparse_optimized) {
      sparse_status_t stat = SPARSE_STATUS_SUCCESS;
      stat = mkl_sparse_destroy(aijmkl->csrA);
      if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
    }
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
    ierr = PetscFree(A->spptr);CHKERRQ(ierr);
  }

  /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
   * to destroy everything that remains. */
  ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
  /* Note that we do not call MatSetType() here: MatSetType() is intended for use
   * when a matrix is being built, not when one is being destroyed. This follows
   * the approach taken by the SuperLU matrix class. */
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatSeqAIJMKL_create_mkl_handle(), if called with an AIJMKL matrix that has not had mkl_sparse_optimize() called for it,
 * creates an MKL sparse matrix handle from the AIJ arrays and calls mkl_sparse_optimize().
 * If called with an AIJMKL matrix for which aijmkl->sparse_optimized == PETSC_TRUE, then it destroys the old matrix
 * handle, creates a new one, and then calls mkl_sparse_optimize().
 * Although in normal MKL usage it is possible to have a valid matrix handle on which mkl_sparse_optimize() has not been
 * called, for AIJMKL the handle creation and optimization step always occur together, so we don't handle the case of
 * an unoptimized matrix handle here. */
PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
{
#ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  /* If the MKL library does not have mkl_sparse_optimize(), then this routine
   * does nothing. We make it callable anyway in this case because it cuts
   * down on littering the code with #ifdefs. */
  PetscFunctionBegin;
  PetscFunctionReturn(0);
#else
  Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL    *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
  PetscInt         m,n;
  MatScalar        *aa;
  PetscInt         *aj,*ai;
  sparse_status_t  stat;
  PetscErrorCode   ierr;
  PetscObjectState state;

  PetscFunctionBegin;
  if (aijmkl->no_SpMV2) PetscFunctionReturn(0);

  if (aijmkl->sparse_optimized) {
    /* Matrix has been previously assembled and optimized. Must destroy old
     * matrix handle before running the optimization step again. */
    stat = mkl_sparse_destroy(aijmkl->csrA);
    if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
  }
  aijmkl->sparse_optimized = PETSC_FALSE;

  /* Now perform the SpMV2 setup and matrix optimization. */
  aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
  aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
  aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
  m = A->rmap->n;
  n = A->cmap->n;
  aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
  aa   = a->a;  /* Nonzero elements stored row-by-row. */
  ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
  if ((a->nz!=0) && !(A->structure_only)) {
    /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
     * The MKL sparse inspector-executor routines don't like being passed an empty matrix. */
    stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
    stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
    stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
    stat = mkl_sparse_optimize(aijmkl->csrA);
    if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
    aijmkl->sparse_optimized = PETSC_TRUE;
  }
  /* Record the matrix state so that routines using the handle can detect when it is out of date. */
  ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
  aijmkl->state = state;

  PetscFunctionReturn(0);
#endif
}

/* MatSeqAIJMKL_create_from_mkl_handle() creates a sequential AIJMKL matrix from an MKL sparse matrix handle.
 * We need this to implement MatMatMult() using the MKL inspector-executor routines, which return an (unoptimized)
 * matrix handle.
 * Note: This routine simply destroys and replaces the original matrix if MAT_REUSE_MATRIX has been specified, as
 * there is no good alternative. */
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_from_mkl_handle(MPI_Comm comm,sparse_matrix_t csrA,MatReuse reuse,Mat *mat)
{
  PetscErrorCode      ierr;
  sparse_status_t     stat;
  sparse_index_base_t indexing;
  PetscInt            nrows, ncols;
  PetscInt            *aj,*ai,*dummy;
  MatScalar           *aa;
  Mat                 A;
  Mat_SeqAIJMKL       *aijmkl;

  PetscFunctionBegin;
  /* Note: We must pass &dummy below because MKL does not accept NULL for this output array, even though we do not need it. */
  stat = mkl_sparse_x_export_csr(csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
  if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");

  if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatDestroy(mat);CHKERRQ(ierr);
  }
  ierr = MatCreate(comm,&A);CHKERRQ(ierr);
  ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
  ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,nrows,ncols);CHKERRQ(ierr);
  /* We use MatSeqAIJSetPreallocationCSR() instead of MatCreateSeqAIJWithArrays() because we must copy the arrays exported
   * from MKL; MKL developers tell us that modifying the arrays may cause unexpected results when using the MKL handle, and
   * they will be destroyed when the MKL handle is destroyed.
   * (In the interest of reducing memory consumption in future, can we figure out good ways to deal with this?) */
  ierr = MatSeqAIJSetPreallocationCSR(A,ai,aj,aa);CHKERRQ(ierr);

  /* We now have an assembled sequential AIJ matrix created from copies of the exported arrays from the MKL matrix handle.
   * Now turn it into a MATSEQAIJMKL. */
  ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);

  aijmkl = (Mat_SeqAIJMKL*) A->spptr;
  aijmkl->csrA = csrA;

  /* The code below duplicates much of what is in MatSeqAIJMKL_create_mkl_handle(). This code duplication is unfortunate, but
   * MatSeqAIJMKL_create_mkl_handle() cannot be used because we don't need to create a handle -- we've already got one,
   * and just need to be able to run the MKL optimization step. */
  aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
  aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
  aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
  stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
  stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
  stat = mkl_sparse_optimize(aijmkl->csrA);
  if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set hints/complete mkl_sparse_optimize");
  aijmkl->sparse_optimized = PETSC_TRUE;

  *mat = A;
  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */

PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode ierr;
  Mat_SeqAIJMKL  *aijmkl;
  Mat_SeqAIJMKL  *aijmkl_dest;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
  aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
  aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
  ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
  aijmkl_dest->sparse_optimized = PETSC_FALSE;
  if (aijmkl->eager_inspection) {
    /* "Eager" inspection was requested, so build the MKL handle for the newly created duplicate. */
    ierr = MatSeqAIJMKL_create_mkl_handle(*M);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
{
  PetscErrorCode  ierr;
  Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL   *aijmkl;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);

  /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
   * extra information and some different methods, call the AssemblyEnd
   * routine for a MATSEQAIJ.
   * This may not be the best way to do this, but it avoids
   * a lot of code duplication. */
  a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
  ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);

  /* If the user has requested "eager" inspection, create the optimized MKL sparse handle (if needed; the function checks).
   * (The default is to do "lazy" inspection, deferring this until something like MatMult() is called.) */
  aijmkl = (Mat_SeqAIJMKL*) A->spptr;
  if (aijmkl->eager_inspection) {
    ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  } else if (aijmkl->sparse_optimized) {
    /* If doing lazy inspection and there is an optimized MKL handle, we need to destroy it, so that it will be
     * rebuilt later when needed. Otherwise, some SeqAIJ implementations that we depend on for some operations
     * (such as MatMatMultNumeric()) can modify the result matrix without the matrix handle being rebuilt.
     * (The SeqAIJ version of MatMatMultNumeric() knows nothing about matrix handles, but it *does* call MatAssemblyEnd().) */
    sparse_status_t stat = mkl_sparse_destroy(aijmkl->csrA);
    if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
    aijmkl->sparse_optimized = PETSC_FALSE;
#endif
  }

  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y;
  const MatScalar   *aa;
  PetscErrorCode    ierr;
  PetscInt          m=A->rmap->n;
  PetscInt          n=A->cmap->n;
  PetscScalar       alpha = 1.0;
  PetscScalar       beta = 0.0;
  const PetscInt    *aj,*ai;
  char              matdescra[6];

  /* Variables not in MatMult_SeqAIJ. */
  char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */

  PetscFunctionBegin;
  matdescra[0] = 'g';  /* Indicates to MKL that we are using a general CSR matrix. */
  matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
  aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
  aa   = a->a;  /* Nonzero elements stored row-by-row. */
  ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

  /* Call MKL sparse BLAS routine to do the MatMult. */
  mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);

  ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
  const PetscScalar *x;
  PetscScalar       *y;
  PetscErrorCode    ierr;
  sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
  PetscObjectState  state;

  PetscFunctionBegin;

  /* If there are no nonzero entries, zero yy and return immediately. */
  if (!a->nz) {
    PetscInt i;
    PetscInt m=A->rmap->n;
    ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
    for (i=0; i<m; i++) {
      y[i] = 0.0;
    }
    ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArray(yy,&y);CHKERRQ(ierr);

  /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
   * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
   * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
  ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
  if (!aijmkl->sparse_optimized || aijmkl->state != state) {
    ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
  }

  /* Call MKL SpMV2 executor routine to do the MatMult. */
  stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
  if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");

  ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */

PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y;
  const MatScalar   *aa;
  PetscErrorCode    ierr;
  PetscInt          m=A->rmap->n;
  PetscInt          n=A->cmap->n;
  PetscScalar       alpha = 1.0;
  PetscScalar       beta = 0.0;
  const PetscInt    *aj,*ai;
  char              matdescra[6];

  /* Variables not in MatMultTranspose_SeqAIJ. */
  char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */

  PetscFunctionBegin;
  matdescra[0] = 'g';  /* Indicates to MKL that we are using a general CSR matrix. */
  matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
  aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
  aa   = a->a;  /* Nonzero elements stored row-by-row. */
  ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

  /* Call MKL sparse BLAS routine to do the MatMultTranspose. */
  mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);

  ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
  const PetscScalar *x;
  PetscScalar       *y;
  PetscErrorCode    ierr;
  sparse_status_t   stat;
  PetscObjectState  state;

  PetscFunctionBegin;

  /* If there are no nonzero entries, zero yy and return immediately. */
  if (!a->nz) {
    PetscInt i;
    PetscInt n=A->cmap->n;
    ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
    for (i=0; i<n; i++) {
      y[i] = 0.0;
    }
    ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArray(yy,&y);CHKERRQ(ierr);

  /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
   * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
   * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
  ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
  if (!aijmkl->sparse_optimized || aijmkl->state != state) {
    ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
  }

  /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
  stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
  if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");

  ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */

PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y,*z;
  const MatScalar   *aa;
  PetscErrorCode    ierr;
  PetscInt          m=A->rmap->n;
  PetscInt          n=A->cmap->n;
  const PetscInt    *aj,*ai;
  PetscInt          i;

  /* Variables not in MatMultAdd_SeqAIJ. */
  char              transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
  PetscScalar       alpha = 1.0;
  PetscScalar       beta;
  char              matdescra[6];

  PetscFunctionBegin;
  matdescra[0] = 'g';  /* Indicates to MKL that we are using a general CSR matrix. */
  matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */

  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
  aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
  aa   = a->a;  /* Nonzero elements stored row-by-row. */
  ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

  /* Call MKL sparse BLAS routine to do the MatMult. */
  if (zz == yy) {
    /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which computes y = alpha*A*x + beta*y, with beta = 1. */
    beta = 1.0;
    mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
  } else {
    /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add yy to the result;
     * MKL sparse BLAS does not have a MatMultAdd equivalent. */
    beta = 0.0;
    mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
    for (i=0; i<m; i++) {
      z[i] += y[i];
    }
  }

  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
  const PetscScalar *x;
  PetscScalar       *y,*z;
  PetscErrorCode    ierr;
  PetscInt          m=A->rmap->n;
  PetscInt          i;

  /* Variables not in MatMultAdd_SeqAIJ. */
  sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
  PetscObjectState  state;

  PetscFunctionBegin;

  /* If there are no nonzero entries, set zz = yy and return immediately. */
  if (!a->nz) {
    ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
    for (i=0; i<m; i++) {
      z[i] = y[i];
    }
    ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);

  /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
   * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
   * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
  ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
  if (!aijmkl->sparse_optimized || aijmkl->state != state) {
    ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
  }

  /* Call MKL sparse BLAS routine to do the MatMult. */
  if (zz == yy) {
    /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which computes y = alpha*A*x + beta*y,
     * with alpha and beta both set to 1.0. */
    stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
    if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
  } else {
    /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
     * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
    stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
    if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
    for (i=0; i<m; i++) {
      z[i] += y[i];
    }
  }

  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */

PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y,*z;
  const MatScalar   *aa;
  PetscErrorCode    ierr;
  PetscInt          m=A->rmap->n;
  PetscInt          n=A->cmap->n;
  const PetscInt    *aj,*ai;
  PetscInt          i;

  /* Variables not in MatMultTransposeAdd_SeqAIJ. */
  char              transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
  PetscScalar       alpha = 1.0;
  PetscScalar       beta;
  char              matdescra[6];

  PetscFunctionBegin;
  matdescra[0] = 'g';  /* Indicates to MKL that we are using a general CSR matrix. */
  matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */

  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
  aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
  aa   = a->a;  /* Nonzero elements stored row-by-row. */
  ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

  /* Call MKL sparse BLAS routine to do the MatMultTransposeAdd. */
  if (zz == yy) {
    /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which computes y = alpha*A^T*x + beta*y, with beta = 1. */
    beta = 1.0;
    mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
  } else {
    /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add yy to the result;
     * MKL sparse BLAS does not have a MatMultTransposeAdd equivalent. */
    beta = 0.0;
    mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
    for (i=0; i<n; i++) {
      z[i] += y[i];
    }
  }

  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
  const PetscScalar *x;
  PetscScalar       *y,*z;
  PetscErrorCode    ierr;
  PetscInt          n=A->cmap->n;
  PetscInt          i;
  PetscObjectState  state;

  /* Variables not in MatMultTransposeAdd_SeqAIJ. */
  sparse_status_t stat = SPARSE_STATUS_SUCCESS;

  PetscFunctionBegin;

  /* If there are no nonzero entries, set zz = yy and return immediately. */
  if (!a->nz) {
    ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
    for (i=0; i<n; i++) {
      z[i] = y[i];
    }
    ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);

  /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
   * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
   * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
  ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
  if (!aijmkl->sparse_optimized || aijmkl->state != state) {
    ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
  }

  /* Call MKL sparse BLAS routine to do the MatMultTransposeAdd. */
  if (zz == yy) {
    /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which computes y = alpha*A^T*x + beta*y,
     * with alpha and beta both set to 1.0. */
    stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
    if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
  } else {
    /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
     * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultTransposeAdd equivalent. */
    stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
    if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
    for (i=0; i<n; i++) {
      z[i] += y[i];
    }
  }

  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */

#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
/* Note that this code currently doesn't actually get used when MatMatMult() is called with MAT_REUSE_MATRIX, because
 * the MatMatMult() interface code calls MatMatMultNumeric() in this case.
 * MKL has no notion of separately callable symbolic vs. numeric phases of sparse matrix-matrix multiply, so in the
 * MAT_REUSE_MATRIX case, the SeqAIJ routines end up being used. Even though this means that the (hopefully more
 * optimized) MKL routines do not get used, this is probably for the best, because the MKL routines would waste time
 * re-computing the symbolic portion, whereas the native PETSc SeqAIJ routines will avoid this. */
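/* For reference, a hedged illustration of how this kernel is reached (not code exercised in
 * this file): at the application level one goes through the usual PETSc interface, e.g.
 *
 *   ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
 *
 * where A and B have type MATSEQAIJMKL; the 'fill' argument is not used by this routine. */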
PetscErrorCode MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  Mat_SeqAIJMKL    *a, *b;
  sparse_matrix_t  csrA, csrB, csrC;
  PetscErrorCode   ierr;
  sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
  PetscObjectState state;

  PetscFunctionBegin;
  a = (Mat_SeqAIJMKL*)A->spptr;
  b = (Mat_SeqAIJMKL*)B->spptr;
  ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
  if (!a->sparse_optimized || a->state != state) {
    ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
  if (!b->sparse_optimized || b->state != state) {
    ierr = MatSeqAIJMKL_create_mkl_handle(B);CHKERRQ(ierr);
  }
  csrA = a->csrA;
  csrB = b->csrA;

  stat = mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE,csrA,csrB,&csrC);
  if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");

  ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */

#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
PetscErrorCode MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  Mat_SeqAIJMKL    *a, *b;
  sparse_matrix_t  csrA, csrB, csrC;
  PetscErrorCode   ierr;
  sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
  PetscObjectState state;

  PetscFunctionBegin;
  a = (Mat_SeqAIJMKL*)A->spptr;
  b = (Mat_SeqAIJMKL*)B->spptr;
  ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
  if (!a->sparse_optimized || a->state != state) {
    ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
  if (!b->sparse_optimized || b->state != state) {
    ierr = MatSeqAIJMKL_create_mkl_handle(B);CHKERRQ(ierr);
  }
  csrA = a->csrA;
  csrB = b->csrA;

  stat = mkl_sparse_spmm(SPARSE_OPERATION_TRANSPOSE,csrA,csrB,&csrC);
  if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");

  ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */

PetscErrorCode MatScale_SeqAIJMKL(Mat inA,PetscScalar alpha)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatScale_SeqAIJ(inA,alpha);CHKERRQ(ierr);
  ierr = MatSeqAIJMKL_create_mkl_handle(inA);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDiagonalScale_SeqAIJMKL(Mat A,Vec ll,Vec rr)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDiagonalScale_SeqAIJ(A,ll,rr);CHKERRQ(ierr);
  ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDiagonalSet_SeqAIJMKL(Mat Y,Vec D,InsertMode is)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDiagonalSet_SeqAIJ(Y,D,is);CHKERRQ(ierr);
  ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAXPY_SeqAIJMKL(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  if (str == SAME_NONZERO_PATTERN) {
    /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force update of the MKL matrix handle. */
    ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* MatConvert_SeqAIJ_SeqAIJMKL() converts a SeqAIJ matrix into a
 * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqAIJMKL()
 * routine, but can also be used to convert an assembled SeqAIJ matrix
 * into a SeqAIJMKL one. */
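/* Illustrative conversion sketch (an assumption about typical usage, not code exercised in
 * this file): an application holding an assembled MATSEQAIJ matrix would normally reach the
 * routine below through the public MatConvert() interface, e.g.
 *
 *   ierr = MatConvert(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
 *
 * rather than calling MatConvert_SeqAIJ_SeqAIJMKL() directly. */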
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
{
  PetscErrorCode ierr;
  Mat            B = *newmat;
  Mat_SeqAIJMKL  *aijmkl;
  PetscBool      set;
  PetscBool      sametype;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
  }

  ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
  if (sametype) PetscFunctionReturn(0);

  ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
  B->spptr = (void*) aijmkl;

  /* Set function pointers for methods that we inherit from AIJ but override.
   * We also parse some command line options below, since those determine some of the methods we point to. */
  B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
  B->ops->destroy          = MatDestroy_SeqAIJMKL;

  aijmkl->sparse_optimized = PETSC_FALSE;
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
#else
  aijmkl->no_SpMV2 = PETSC_TRUE;
#endif
  aijmkl->eager_inspection = PETSC_FALSE;

  /* Parse command line options. */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
  ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-mat_aijmkl_eager_inspection","Eager Inspection","None",(PetscBool)aijmkl->eager_inspection,(PetscBool*)&aijmkl->eager_inspection,&set);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
#ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  if (!aijmkl->no_SpMV2) {
    ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but the MKL version being used does not support mkl_sparse_optimize(); defaulting to non-SpMV2 routines.\n");CHKERRQ(ierr);
    aijmkl->no_SpMV2 = PETSC_TRUE;
  }
#endif

  if (!aijmkl->no_SpMV2) {
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
    B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
    B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
    B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
    B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
    B->ops->matmult          = MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
    B->ops->transposematmult = MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
#endif
  } else {
    B->ops->mult             = MatMult_SeqAIJMKL;
    B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
    B->ops->multadd          = MatMultAdd_SeqAIJMKL;
    B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
  }

  B->ops->scale              = MatScale_SeqAIJMKL;
  B->ops->diagonalscale      = MatDiagonalScale_SeqAIJMKL;
  B->ops->diagonalset        = MatDiagonalSet_SeqAIJMKL;
  B->ops->axpy               = MatAXPY_SeqAIJMKL;

  ierr = PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqAIJMKL_C",MatScale_SeqAIJMKL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
  if (!aijmkl->no_SpMV2) {
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
    ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
#endif
  }

  ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
  *newmat = B;
  PetscFunctionReturn(0);
}

/*@C
   MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
   This type inherits from AIJ and is largely identical, but uses sparse BLAS
   routines from Intel MKL whenever possible.
   MatMult, MatMultAdd, MatMultTranspose, MatMultTransposeAdd, MatMatMult, and MatTransposeMatMult
   operations are currently supported.
   If the installed version of MKL supports the "SpMV2" sparse
   inspector-executor routines, then those are used by default.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   Options Database Keys:
+  -mat_aijmkl_no_spmv2 - disable use of the SpMV2 inspector-executor routines
-  -mat_aijmkl_eager_inspection - perform MKL "inspection" phase upon matrix assembly; default is to do "lazy" inspection, performing this step the first time the matrix is applied

   Notes:
   If nnz is given then nz is ignored

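   Example usage (an illustrative sketch; 'm', 'n', and the preallocation value are placeholders):
.vb
   Mat A;
   ierr = MatCreateSeqAIJMKL(PETSC_COMM_SELF,m,n,5,NULL,&A);CHKERRQ(ierr);
   /* ... insert entries with MatSetValues(), then assemble ... */
   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
.ve
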
   Level: intermediate

.keywords: matrix, MKL, sparse, parallel

.seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
@*/
PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
935