xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision 551aa5c87513b3dd66216044ca44a3df6d732447)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool eager_inspection; /* If PETSC_TRUE, then call mkl_sparse_optimize() in MatDuplicate()/MatAssemblyEnd(). */
18   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
19   PetscObjectState state;
20 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
21   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
22   struct matrix_descr descr;
23 #endif
24 } Mat_SeqAIJMKL;
25 
26 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
27 
28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
29 {
30   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
31   /* so we will ignore 'MatType type'. */
32   PetscErrorCode ierr;
33   Mat            B       = *newmat;
34 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
35   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
36 #endif
37 
38   PetscFunctionBegin;
39   if (reuse == MAT_INITIAL_MATRIX) {
40     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
41   }
42 
43   /* Reset the original function pointers. */
44   B->ops->duplicate        = MatDuplicate_SeqAIJ;
45   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
46   B->ops->destroy          = MatDestroy_SeqAIJ;
47   B->ops->mult             = MatMult_SeqAIJ;
48   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
49   B->ops->multadd          = MatMultAdd_SeqAIJ;
50   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
51   B->ops->matmult          = MatMatMult_SeqAIJ_SeqAIJ;
52   B->ops->transposematmult = MatTransposeMatMult_SeqAIJ_SeqAIJ;
53   B->ops->scale            = MatScale_SeqAIJ;
54   B->ops->diagonalscale    = MatDiagonalScale_SeqAIJ;
55   B->ops->diagonalset      = MatDiagonalSet_SeqAIJ;
56   B->ops->axpy             = MatAXPY_SeqAIJ;
57 
58   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
59   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
60   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
61   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
62 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
63   if(!aijmkl->no_SpMV2) {
64     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
65     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
66   }
67 
68   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
69    * simply involves destroying the MKL sparse matrix handle and then freeing
70    * the spptr pointer. */
71   if (reuse == MAT_INITIAL_MATRIX) aijmkl = (Mat_SeqAIJMKL*)B->spptr;
72 
73   if (aijmkl->sparse_optimized) {
74     sparse_status_t stat;
75     stat = mkl_sparse_destroy(aijmkl->csrA);
76     if (stat != SPARSE_STATUS_SUCCESS) {
77       PetscFunctionReturn(PETSC_ERR_LIB);
78     }
79   }
80 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
81   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
82 
83   /* Change the type of B to MATSEQAIJ. */
84   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
85 
86   *newmat = B;
87   PetscFunctionReturn(0);
88 }
89 
90 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
91 {
92   PetscErrorCode ierr;
93   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
94 
95   PetscFunctionBegin;
96 
97   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
98    * spptr pointer. */
99   if (aijmkl) {
100     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
101 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
102     if (aijmkl->sparse_optimized) {
103       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
104       stat = mkl_sparse_destroy(aijmkl->csrA);
105       if (stat != SPARSE_STATUS_SUCCESS) {
106         PetscFunctionReturn(PETSC_ERR_LIB);
107       }
108     }
109 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
110     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
111   }
112 
113   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
114    * to destroy everything that remains. */
115   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
116   /* Note that I don't call MatSetType().  I believe this is because that
117    * is only to be called when *building* a matrix.  I could be wrong, but
118    * that is how things work for the SuperLU matrix class. */
119   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
120   PetscFunctionReturn(0);
121 }
122 
123 /* MatSeqAIJKL_create_mkl_handle(), if called with an AIJMKL matrix that has not had mkl_sparse_optimize() called for it,
124  * creates an MKL sparse matrix handle from the AIJ arrays and calls mkl_sparse_optimize().
125  * If called with an AIJMKL matrix for which aijmkl->sparse_optimized == PETSC_TRUE, then it destroys the old matrix
126  * handle, creates a new one, and then calls mkl_sparse_optimize().
127  * Although in normal MKL usage it is possible to have a valid matrix handle on which mkl_sparse_optimize() has not been
128  * called, for AIJMKL the handle creation and optimization step always occur together, so we don't handle the case of
129  * an unoptimized matrix handle here. */
130 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
131 {
132 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
133   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
134    * does nothing. We make it callable anyway in this case because it cuts
135    * down on littering the code with #ifdefs. */
136   PetscFunctionBegin;
137   PetscFunctionReturn(0);
138 #else
139   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
140   Mat_SeqAIJMKL    *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
141   PetscInt         m,n;
142   MatScalar        *aa;
143   PetscInt         *aj,*ai;
144   sparse_status_t  stat;
145   PetscErrorCode   ierr;
146   PetscObjectState state;
147 
148   PetscFunctionBegin;
149   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
150 
151   if (aijmkl->sparse_optimized) {
152     /* Matrix has been previously assembled and optimized. Must destroy old
153      * matrix handle before running the optimization step again. */
154     stat = mkl_sparse_destroy(aijmkl->csrA);
155     if (stat != SPARSE_STATUS_SUCCESS) {
156       PetscFunctionReturn(PETSC_ERR_LIB);
157     }
158   }
159   aijmkl->sparse_optimized = PETSC_FALSE;
160 
161   /* Now perform the SpMV2 setup and matrix optimization. */
162   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
163   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
164   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
165   m = A->rmap->n;
166   n = A->cmap->n;
167   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
168   aa   = a->a;  /* Nonzero elements stored row-by-row. */
169   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
170   if ((a->nz!=0) & !(A->structure_only)) {
171     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
172      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
173     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
174     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
175     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
176     stat = mkl_sparse_optimize(aijmkl->csrA);
177     if (stat != SPARSE_STATUS_SUCCESS) {
178       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
179       PetscFunctionReturn(PETSC_ERR_LIB);
180     }
181     aijmkl->sparse_optimized = PETSC_TRUE;
182   }
183   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
184 
185   PetscFunctionReturn(0);
186 #endif
187 }
188 
189 /* MatSeqAIJMKL_create_from_mkl_handle() creates a sequential AIJMKL matrix from an MKL sparse matrix handle.
190  * We need this to implement MatMatMult() using the MKL inspector-executor routines, which return an (unoptimized)
191  * matrix handle.
192  * Note: This routine simply destroys and replaces the original matrix if MAT_REUSE_MATRIX has been specified, as
193  * there is no good alternative. */
194 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
195 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_from_mkl_handle(MPI_Comm comm,sparse_matrix_t csrA,MatReuse reuse,Mat *mat)
196 {
197   PetscErrorCode ierr;
198   sparse_status_t stat;
199   sparse_index_base_t indexing;
200   PetscInt nrows, ncols;
201   PetscInt *aj,*ai,*dummy;
202   MatScalar *aa;
203   Mat A;
204   Mat_SeqAIJ *a;
205   Mat_SeqAIJMKL *aijmkl;
206 
207   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
208   stat = mkl_sparse_x_export_csr(csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
209   if (stat != SPARSE_STATUS_SUCCESS) {
210     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
211     PetscFunctionReturn(PETSC_ERR_LIB);
212   }
213 
214   if (reuse == MAT_REUSE_MATRIX) {
215     ierr = MatDestroy(mat);CHKERRQ(ierr);
216   }
217   ierr = MatCreate(comm,&A);CHKERRQ(ierr);
218   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
219   ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,nrows,ncols);CHKERRQ(ierr);
220   /* We use MatSeqAIJSetPreallocationCSR() instead of MatCreateSeqAIJWithArrays() because we must copy the arrays exported
221    * from MKL; MKL developers tell us that modifying the arrays may cause unexpected results when using the MKL handle, and
222    * they will be destroyed when the MKL handle is destroyed.
223    * (In the interest of reducing memory consumption in future, can we figure out good ways to deal with this?) */
224   ierr = MatSeqAIJSetPreallocationCSR(A,ai,aj,aa);CHKERRQ(ierr);
225 
226   /* We now have an assembled sequential AIJ matrix created from copies of the exported arrays from the MKL matrix handle.
227    * Now turn it into a MATSEQAIJMKL. */
228   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
229 
230   a = (Mat_SeqAIJ*)A->data;
231   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
232   aijmkl->csrA = csrA;
233 
234   /* The below code duplicates much of what is in MatSeqAIJKL_create_mkl_handle(). I dislike this code duplication, but
235    * MatSeqAIJMKL_create_mkl_handle() cannot be used because we don't need to create a handle -- we've already got one,
236    * and just need to be able to run the MKL optimization step. */
237   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
238   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
239   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
240   stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
241   stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
242   stat = mkl_sparse_optimize(aijmkl->csrA);
243   if (stat != SPARSE_STATUS_SUCCESS) {
244     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set hints/complete mkl_sparse_optimize");
245     PetscFunctionReturn(PETSC_ERR_LIB);
246   }
247   aijmkl->sparse_optimized = PETSC_TRUE;
248 
249   *mat = A;
250   PetscFunctionReturn(0);
251 }
252 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
253 
254 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
255 {
256   PetscErrorCode ierr;
257   Mat_SeqAIJMKL *aijmkl;
258   Mat_SeqAIJMKL *aijmkl_dest;
259 
260   PetscFunctionBegin;
261   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
262   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
263   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
264   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
265   aijmkl_dest->sparse_optimized = PETSC_FALSE;
266   if (aijmkl->eager_inspection) {
267     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
268   }
269   PetscFunctionReturn(0);
270 }
271 
272 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
273 {
274   PetscErrorCode  ierr;
275   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
276   Mat_SeqAIJMKL *aijmkl;
277 
278   PetscFunctionBegin;
279   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
280 
281   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
282    * extra information and some different methods, call the AssemblyEnd
283    * routine for a MATSEQAIJ.
284    * I'm not sure if this is the best way to do this, but it avoids
285    * a lot of code duplication. */
286   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
287   ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
288 
289   /* If the user has requested "eager" inspection, create the optimized MKL sparse handle (if needed; the function checks).
290    * (The default is to do "lazy" inspection, deferring this until something like MatMult() is called.) */
291   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
292   if (aijmkl->eager_inspection) {
293     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
294 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
295   } else if (aijmkl->sparse_optimized) {
296     /* If doing lazy inspection and there is an optimized MKL handle, we need to destroy it, so that it will be
297      * rebuilt later when needed. Otherwise, some SeqAIJ implementations that we depend on for some operations
298      * (such as MatMatMultNumeric()) can modify the result matrix without the matrix handle being rebuilt.
299      * (The SeqAIJ version MatMatMultNumeric() knows nothing about matrix handles, but it *does* call MatAssemblyEnd().) */
300     sparse_status_t stat = mkl_sparse_destroy(aijmkl->csrA);
301     if (stat != SPARSE_STATUS_SUCCESS) {
302       PetscFunctionReturn(PETSC_ERR_LIB);
303     }
304     aijmkl->sparse_optimized = PETSC_FALSE;
305 #endif
306   }
307 
308   PetscFunctionReturn(0);
309 }
310 
311 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
312 {
313   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
314   const PetscScalar *x;
315   PetscScalar       *y;
316   const MatScalar   *aa;
317   PetscErrorCode    ierr;
318   PetscInt          m=A->rmap->n;
319   PetscInt          n=A->cmap->n;
320   PetscScalar       alpha = 1.0;
321   PetscScalar       beta = 0.0;
322   const PetscInt    *aj,*ai;
323   char              matdescra[6];
324 
325 
326   /* Variables not in MatMult_SeqAIJ. */
327   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
328 
329   PetscFunctionBegin;
330   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
331   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
332   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
333   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
334   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
335   aa   = a->a;  /* Nonzero elements stored row-by-row. */
336   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
337 
338   /* Call MKL sparse BLAS routine to do the MatMult. */
339   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
340 
341   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
342   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
343   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
344   PetscFunctionReturn(0);
345 }
346 
347 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
348 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
349 {
350   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
351   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
352   const PetscScalar *x;
353   PetscScalar       *y;
354   PetscErrorCode    ierr;
355   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
356   PetscObjectState  state;
357 
358   PetscFunctionBegin;
359 
360   /* If there are no nonzero entries, zero yy and return immediately. */
361   if(!a->nz) {
362     PetscInt i;
363     PetscInt m=A->rmap->n;
364     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
365     for (i=0; i<m; i++) {
366       y[i] = 0.0;
367     }
368     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
369     PetscFunctionReturn(0);
370   }
371 
372   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
373   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
374 
375   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
376    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
377    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
378   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
379   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
380     MatSeqAIJMKL_create_mkl_handle(A);
381   }
382 
383   /* Call MKL SpMV2 executor routine to do the MatMult. */
384   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
385 
386   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
387   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
388   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
389   if (stat != SPARSE_STATUS_SUCCESS) {
390     PetscFunctionReturn(PETSC_ERR_LIB);
391   }
392   PetscFunctionReturn(0);
393 }
394 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
395 
396 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
397 {
398   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
399   const PetscScalar *x;
400   PetscScalar       *y;
401   const MatScalar   *aa;
402   PetscErrorCode    ierr;
403   PetscInt          m=A->rmap->n;
404   PetscInt          n=A->cmap->n;
405   PetscScalar       alpha = 1.0;
406   PetscScalar       beta = 0.0;
407   const PetscInt    *aj,*ai;
408   char              matdescra[6];
409 
410   /* Variables not in MatMultTranspose_SeqAIJ. */
411   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
412 
413   PetscFunctionBegin;
414   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
415   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
416   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
417   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
418   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
419   aa   = a->a;  /* Nonzero elements stored row-by-row. */
420   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
421 
422   /* Call MKL sparse BLAS routine to do the MatMult. */
423   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
424 
425   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
426   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
427   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
432 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
433 {
434   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
435   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
436   const PetscScalar *x;
437   PetscScalar       *y;
438   PetscErrorCode    ierr;
439   sparse_status_t   stat;
440   PetscObjectState  state;
441 
442   PetscFunctionBegin;
443 
444   /* If there are no nonzero entries, zero yy and return immediately. */
445   if(!a->nz) {
446     PetscInt i;
447     PetscInt n=A->cmap->n;
448     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
449     for (i=0; i<n; i++) {
450       y[i] = 0.0;
451     }
452     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
453     PetscFunctionReturn(0);
454   }
455 
456   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
457   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
458 
459   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
460    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
461    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
462   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
463   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
464     MatSeqAIJMKL_create_mkl_handle(A);
465   }
466 
467   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
468   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
469 
470   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
471   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
472   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
473   if (stat != SPARSE_STATUS_SUCCESS) {
474     PetscFunctionReturn(PETSC_ERR_LIB);
475   }
476   PetscFunctionReturn(0);
477 }
478 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
479 
480 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
481 {
482   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
483   const PetscScalar *x;
484   PetscScalar       *y,*z;
485   const MatScalar   *aa;
486   PetscErrorCode    ierr;
487   PetscInt          m=A->rmap->n;
488   PetscInt          n=A->cmap->n;
489   const PetscInt    *aj,*ai;
490   PetscInt          i;
491 
492   /* Variables not in MatMultAdd_SeqAIJ. */
493   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
494   PetscScalar       alpha = 1.0;
495   PetscScalar       beta;
496   char              matdescra[6];
497 
498   PetscFunctionBegin;
499   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
500   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
501 
502   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
503   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
504   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
505   aa   = a->a;  /* Nonzero elements stored row-by-row. */
506   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
507 
508   /* Call MKL sparse BLAS routine to do the MatMult. */
509   if (zz == yy) {
510     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
511     beta = 1.0;
512     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
513   } else {
514     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
515      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
516     beta = 0.0;
517     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
518     for (i=0; i<m; i++) {
519       z[i] += y[i];
520     }
521   }
522 
523   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
524   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
525   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
526   PetscFunctionReturn(0);
527 }
528 
529 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
530 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
531 {
532   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
533   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
534   const PetscScalar *x;
535   PetscScalar       *y,*z;
536   PetscErrorCode    ierr;
537   PetscInt          m=A->rmap->n;
538   PetscInt          i;
539 
540   /* Variables not in MatMultAdd_SeqAIJ. */
541   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
542   PetscObjectState  state;
543 
544   PetscFunctionBegin;
545 
546   /* If there are no nonzero entries, set zz = yy and return immediately. */
547   if(!a->nz) {
548     PetscInt i;
549     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
550     for (i=0; i<m; i++) {
551       z[i] = y[i];
552     }
553     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
554     PetscFunctionReturn(0);
555   }
556 
557   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
558   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
559 
560   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
561    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
562    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
563   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
564   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
565     MatSeqAIJMKL_create_mkl_handle(A);
566   }
567 
568   /* Call MKL sparse BLAS routine to do the MatMult. */
569   if (zz == yy) {
570     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
571      * with alpha and beta both set to 1.0. */
572     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
573   } else {
574     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
575      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
576     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
577     for (i=0; i<m; i++) {
578       z[i] += y[i];
579     }
580   }
581 
582   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
583   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
584   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
585   if (stat != SPARSE_STATUS_SUCCESS) {
586     PetscFunctionReturn(PETSC_ERR_LIB);
587   }
588   PetscFunctionReturn(0);
589 }
590 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
591 
592 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
593 {
594   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
595   const PetscScalar *x;
596   PetscScalar       *y,*z;
597   const MatScalar   *aa;
598   PetscErrorCode    ierr;
599   PetscInt          m=A->rmap->n;
600   PetscInt          n=A->cmap->n;
601   const PetscInt    *aj,*ai;
602   PetscInt          i;
603 
604   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
605   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
606   PetscScalar       alpha = 1.0;
607   PetscScalar       beta;
608   char              matdescra[6];
609 
610   PetscFunctionBegin;
611   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
612   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
613 
614   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
615   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
616   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
617   aa   = a->a;  /* Nonzero elements stored row-by-row. */
618   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
619 
620   /* Call MKL sparse BLAS routine to do the MatMult. */
621   if (zz == yy) {
622     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
623     beta = 1.0;
624     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
625   } else {
626     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
627      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
628     beta = 0.0;
629     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
630     for (i=0; i<n; i++) {
631       z[i] += y[i];
632     }
633   }
634 
635   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
636   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
637   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
638   PetscFunctionReturn(0);
639 }
640 
641 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
642 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
643 {
644   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
645   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
646   const PetscScalar *x;
647   PetscScalar       *y,*z;
648   PetscErrorCode    ierr;
649   PetscInt          n=A->cmap->n;
650   PetscInt          i;
651   PetscObjectState  state;
652 
653   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
654   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
655 
656   PetscFunctionBegin;
657 
658   /* If there are no nonzero entries, set zz = yy and return immediately. */
659   if(!a->nz) {
660     PetscInt i;
661     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
662     for (i=0; i<n; i++) {
663       z[i] = y[i];
664     }
665     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
666     PetscFunctionReturn(0);
667   }
668 
669   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
670   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
671 
672   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
673    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
674    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
675   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
676   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
677     MatSeqAIJMKL_create_mkl_handle(A);
678   }
679 
680   /* Call MKL sparse BLAS routine to do the MatMult. */
681   if (zz == yy) {
682     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
683      * with alpha and beta both set to 1.0. */
684     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
685   } else {
686     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
687      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
688     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
689     for (i=0; i<n; i++) {
690       z[i] += y[i];
691     }
692   }
693 
694   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
695   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
696   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
697   if (stat != SPARSE_STATUS_SUCCESS) {
698     PetscFunctionReturn(PETSC_ERR_LIB);
699   }
700   PetscFunctionReturn(0);
701 }
702 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
703 
704 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
705 /* Note that this code currently doesn't actually get used when MatMatMult() is called with MAT_REUSE_MATRIX, because
706  * the MatMatMult() interface code calls MatMatMultNumeric() in this case.
707  * MKL has no notion of separately callable symbolic vs. numeric phases of sparse matrix-matrix multiply, so in the
708  * MAT_REUSE_MATRIX case, the SeqAIJ routines end up being used. Even though this means that the (hopefully more
709  * optimized) MKL routines do not get used, this probably is best because the MKL routines would waste time re-computing
710  * the symbolic portion, whereas the native PETSc SeqAIJ routines will avoid this. */
711 PetscErrorCode MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
712 {
713   Mat_SeqAIJMKL    *a, *b;
714   sparse_matrix_t  csrA, csrB, csrC;
715   PetscErrorCode   ierr;
716   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
717   PetscObjectState state;
718 
719   PetscFunctionBegin;
720   a = (Mat_SeqAIJMKL*)A->spptr;
721   b = (Mat_SeqAIJMKL*)B->spptr;
722   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
723   if (!a->sparse_optimized || a->state != state) {
724     MatSeqAIJMKL_create_mkl_handle(A);
725   }
726   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
727   if (!b->sparse_optimized || b->state != state) {
728     MatSeqAIJMKL_create_mkl_handle(B);
729   }
730   csrA = a->csrA;
731   csrB = b->csrA;
732 
733   stat = mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE,csrA,csrB,&csrC);
734   if (stat != SPARSE_STATUS_SUCCESS) {
735     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
736     PetscFunctionReturn(PETSC_ERR_LIB);
737   }
738 
739   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
740 
741   PetscFunctionReturn(0);
742 }
743 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
744 
745 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
746 PetscErrorCode MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
747 {
748   Mat_SeqAIJMKL    *a, *b;
749   sparse_matrix_t  csrA, csrB, csrC;
750   PetscErrorCode   ierr;
751   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
752   PetscObjectState state;
753 
754   PetscFunctionBegin;
755   a = (Mat_SeqAIJMKL*)A->spptr;
756   b = (Mat_SeqAIJMKL*)B->spptr;
757   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
758   if (!a->sparse_optimized || a->state != state) {
759     MatSeqAIJMKL_create_mkl_handle(A);
760   }
761   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
762   if (!b->sparse_optimized || b->state != state) {
763     MatSeqAIJMKL_create_mkl_handle(B);
764   }
765   csrA = a->csrA;
766   csrB = b->csrA;
767 
768   stat = mkl_sparse_spmm(SPARSE_OPERATION_TRANSPOSE,csrA,csrB,&csrC);
769   if (stat != SPARSE_STATUS_SUCCESS) {
770     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
771     PetscFunctionReturn(PETSC_ERR_LIB);
772   }
773 
774   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
775 
776   PetscFunctionReturn(0);
777 }
778 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
779 
780 PetscErrorCode MatScale_SeqAIJMKL(Mat inA,PetscScalar alpha)
781 {
782   PetscErrorCode ierr;
783 
784   PetscFunctionBegin;
785   ierr = MatScale_SeqAIJ(inA,alpha);CHKERRQ(ierr);
786   ierr = MatSeqAIJMKL_create_mkl_handle(inA);CHKERRQ(ierr);
787   PetscFunctionReturn(0);
788 }
789 
790 PetscErrorCode MatDiagonalScale_SeqAIJMKL(Mat A,Vec ll,Vec rr)
791 {
792   PetscErrorCode ierr;
793 
794   PetscFunctionBegin;
795   ierr = MatDiagonalScale_SeqAIJ(A,ll,rr);CHKERRQ(ierr);
796   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
797   PetscFunctionReturn(0);
798 }
799 
800 PetscErrorCode MatDiagonalSet_SeqAIJMKL(Mat Y,Vec D,InsertMode is)
801 {
802   PetscErrorCode ierr;
803 
804   PetscFunctionBegin;
805   ierr = MatDiagonalSet_SeqAIJ(Y,D,is);CHKERRQ(ierr);
806   ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
807   PetscFunctionReturn(0);
808 }
809 
810 PetscErrorCode MatAXPY_SeqAIJMKL(Mat Y,PetscScalar a,Mat X,MatStructure str)
811 {
812   PetscErrorCode ierr;
813 
814   PetscFunctionBegin;
815   ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
816   if (str == SAME_NONZERO_PATTERN) {
817     /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force update of the MKL matrix handle. */
818     ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
819   }
820   PetscFunctionReturn(0);
821 }
822 
823 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
824  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
825  * routine, but can also be used to convert an assembled SeqAIJ matrix
826  * into a SeqAIJMKL one. */
827 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
828 {
829   PetscErrorCode ierr;
830   Mat            B = *newmat;
831   Mat_SeqAIJMKL  *aijmkl;
832   PetscBool      set;
833   PetscBool      sametype;
834 
835   PetscFunctionBegin;
836   if (reuse == MAT_INITIAL_MATRIX) {
837     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
838   }
839 
840   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
841   if (sametype) PetscFunctionReturn(0);
842 
843   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
844   B->spptr = (void*) aijmkl;
845 
846   /* Set function pointers for methods that we inherit from AIJ but override.
847    * We also parse some command line options below, since those determine some of the methods we point to. */
848   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
849   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
850   B->ops->destroy          = MatDestroy_SeqAIJMKL;
851 
852   aijmkl->sparse_optimized = PETSC_FALSE;
853 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
854   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
855 #else
856   aijmkl->no_SpMV2 = PETSC_TRUE;
857 #endif
858   aijmkl->eager_inspection = PETSC_FALSE;
859 
860   /* Parse command line options. */
861   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
862   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
863   ierr = PetscOptionsBool("-mat_aijmkl_eager_inspection","Eager Inspection","None",(PetscBool)aijmkl->eager_inspection,(PetscBool*)&aijmkl->eager_inspection,&set);CHKERRQ(ierr);
864   ierr = PetscOptionsEnd();CHKERRQ(ierr);
865 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
866   if(!aijmkl->no_SpMV2) {
867     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
868     aijmkl->no_SpMV2 = PETSC_TRUE;
869   }
870 #endif
871 
872   if(!aijmkl->no_SpMV2) {
873 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
874     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
875     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
876     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
877     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
878     B->ops->matmult          = MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
879     B->ops->transposematmult = MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
880 #endif
881   } else {
882     B->ops->mult             = MatMult_SeqAIJMKL;
883     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
884     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
885     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
886   }
887 
888   B->ops->scale              = MatScale_SeqAIJMKL;
889   B->ops->diagonalscale      = MatDiagonalScale_SeqAIJMKL;
890   B->ops->diagonalset        = MatDiagonalSet_SeqAIJMKL;
891   B->ops->axpy               = MatAXPY_SeqAIJMKL;
892 
893   ierr = PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqAIJMKL_C",MatScale_SeqAIJMKL);CHKERRQ(ierr);
894   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
895   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
896   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
897   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
898   if(!aijmkl->no_SpMV2) {
899 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
900     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
901     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
902 #endif
903   }
904 
905   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
906   *newmat = B;
907   PetscFunctionReturn(0);
908 }
909 
910 /*@C
911    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
912    This type inherits from AIJ and is largely identical, but uses sparse BLAS
913    routines from Intel MKL whenever possible.
914    MatMult, MatMultAdd, MatMultTranspose, MatMultTransposeAdd, MatMatMult, and MatTransposeMatMult
915    operations are currently supported.
916    If the installed version of MKL supports the "SpMV2" sparse
917    inspector-executor routines, then those are used by default.
918 
919    Collective on MPI_Comm
920 
921    Input Parameters:
922 +  comm - MPI communicator, set to PETSC_COMM_SELF
923 .  m - number of rows
924 .  n - number of columns
925 .  nz - number of nonzeros per row (same for all rows)
926 -  nnz - array containing the number of nonzeros in the various rows
927          (possibly different for each row) or NULL
928 
929    Output Parameter:
930 .  A - the matrix
931 
932    Options Database Keys:
933 +  -mat_aijmkl_no_spmv2 - disable use of the SpMV2 inspector-executor routines
934 -  -mat_aijmkl_eager_inspection - perform MKL "inspection" phase upon matrix assembly; default is to do "lazy" inspection, performing this step the first time the matrix is applied
935 
936    Notes:
937    If nnz is given then nz is ignored
938 
939    Level: intermediate
940 
941 .keywords: matrix, MKL, sparse, parallel
942 
943 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
944 @*/
945 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
946 {
947   PetscErrorCode ierr;
948 
949   PetscFunctionBegin;
950   ierr = MatCreate(comm,A);CHKERRQ(ierr);
951   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
952   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
953   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
954   PetscFunctionReturn(0);
955 }
956 
957 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
958 {
959   PetscErrorCode ierr;
960 
961   PetscFunctionBegin;
962   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
963   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
964   PetscFunctionReturn(0);
965 }
966