xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision f8990b4a39a56ca2debaee2c8be1da38fe13b9f8)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool eager_inspection; /* If PETSC_TRUE, then call mkl_sparse_optimize() in MatDuplicate()/MatAssemblyEnd(). */
18   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
19   PetscObjectState state;
20 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
21   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
22   struct matrix_descr descr;
23 #endif
24 } Mat_SeqAIJMKL;
25 
26 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
27 
28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
29 {
30   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
31   /* so we will ignore 'MatType type'. */
32   PetscErrorCode ierr;
33   Mat            B       = *newmat;
34 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
35   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
36 #endif
37 
38   PetscFunctionBegin;
39   if (reuse == MAT_INITIAL_MATRIX) {
40     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
41   }
42 
43   /* Reset the original function pointers. */
44   B->ops->duplicate        = MatDuplicate_SeqAIJ;
45   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
46   B->ops->destroy          = MatDestroy_SeqAIJ;
47   B->ops->mult             = MatMult_SeqAIJ;
48   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
49   B->ops->multadd          = MatMultAdd_SeqAIJ;
50   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
51   B->ops->matmult          = MatMatMult_SeqAIJ_SeqAIJ;
52   B->ops->matmultnumeric   = MatMatMultNumeric_SeqAIJ_SeqAIJ;
53   B->ops->ptap             = MatPtAP_SeqAIJ_SeqAIJ;
54   B->ops->ptapnumeric      = MatPtAPNumeric_SeqAIJ_SeqAIJ;
55   B->ops->transposematmult = MatTransposeMatMult_SeqAIJ_SeqAIJ;
56 
57   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
58   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
59   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
60   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
61 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
62   if(!aijmkl->no_SpMV2) {
63     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
64 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
65     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
66 #endif
67     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
68   }
69 
70   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
71    * simply involves destroying the MKL sparse matrix handle and then freeing
72    * the spptr pointer. */
73   if (reuse == MAT_INITIAL_MATRIX) aijmkl = (Mat_SeqAIJMKL*)B->spptr;
74 
75   if (aijmkl->sparse_optimized) {
76     sparse_status_t stat;
77     stat = mkl_sparse_destroy(aijmkl->csrA);
78     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set hints/complete mkl_sparse_optimize");
79   }
80 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
81   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
82 
83   /* Change the type of B to MATSEQAIJ. */
84   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
85 
86   *newmat = B;
87   PetscFunctionReturn(0);
88 }
89 
90 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
91 {
92   PetscErrorCode ierr;
93   Mat_SeqAIJMKL  *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
94 
95   PetscFunctionBegin;
96 
97   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
98    * spptr pointer. */
99   if (aijmkl) {
100     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
101 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
102     if (aijmkl->sparse_optimized) {
103       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
104       stat = mkl_sparse_destroy(aijmkl->csrA);
105       if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
106     }
107 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
108     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
109   }
110 
111   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
112    * to destroy everything that remains. */
113   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
114   /* Note that I don't call MatSetType().  I believe this is because that
115    * is only to be called when *building* a matrix.  I could be wrong, but
116    * that is how things work for the SuperLU matrix class. */
117   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
118   PetscFunctionReturn(0);
119 }
120 
121 /* MatSeqAIJKL_create_mkl_handle(), if called with an AIJMKL matrix that has not had mkl_sparse_optimize() called for it,
122  * creates an MKL sparse matrix handle from the AIJ arrays and calls mkl_sparse_optimize().
123  * If called with an AIJMKL matrix for which aijmkl->sparse_optimized == PETSC_TRUE, then it destroys the old matrix
124  * handle, creates a new one, and then calls mkl_sparse_optimize().
125  * Although in normal MKL usage it is possible to have a valid matrix handle on which mkl_sparse_optimize() has not been
126  * called, for AIJMKL the handle creation and optimization step always occur together, so we don't handle the case of
127  * an unoptimized matrix handle here. */
128 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
129 {
130 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
131   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
132    * does nothing. We make it callable anyway in this case because it cuts
133    * down on littering the code with #ifdefs. */
134   PetscFunctionBegin;
135   PetscFunctionReturn(0);
136 #else
137   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
138   Mat_SeqAIJMKL    *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
139   PetscInt         m,n;
140   MatScalar        *aa;
141   PetscInt         *aj,*ai;
142   sparse_status_t  stat;
143   PetscErrorCode   ierr;
144 
145   PetscFunctionBegin;
146   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
147 
148   if (aijmkl->sparse_optimized) {
149     /* Matrix has been previously assembled and optimized. Must destroy old
150      * matrix handle before running the optimization step again. */
151     stat = mkl_sparse_destroy(aijmkl->csrA);
152     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
153   }
154   aijmkl->sparse_optimized = PETSC_FALSE;
155 
156   /* Now perform the SpMV2 setup and matrix optimization. */
157   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
158   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
159   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
160   m = A->rmap->n;
161   n = A->cmap->n;
162   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
163   aa   = a->a;  /* Nonzero elements stored row-by-row. */
164   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
165   if ((a->nz!=0) & !(A->structure_only)) {
166     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
167      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
168     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
169     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle");
170     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
171     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set mv_hint");
172     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
173     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set memory_hint");
174     stat = mkl_sparse_optimize(aijmkl->csrA);
175     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
176     aijmkl->sparse_optimized = PETSC_TRUE;
177     ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
178   }
179 
180   PetscFunctionReturn(0);
181 #endif
182 }
183 
184 /* MatSeqAIJMKL_create_from_mkl_handle() creates a sequential AIJMKL matrix from an MKL sparse matrix handle.
185  * We need this to implement MatMatMult() using the MKL inspector-executor routines, which return an (unoptimized)
186  * matrix handle.
187  * Note: This routine simply destroys and replaces the original matrix if MAT_REUSE_MATRIX has been specified, as
188  * there is no good alternative. */
189 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
190 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_from_mkl_handle(MPI_Comm comm,sparse_matrix_t csrA,MatReuse reuse,Mat *mat)
191 {
192   PetscErrorCode      ierr;
193   sparse_status_t     stat;
194   sparse_index_base_t indexing;
195   PetscInt            nrows, ncols;
196   PetscInt            *aj,*ai,*dummy;
197   MatScalar           *aa;
198   Mat                 A;
199   Mat_SeqAIJMKL       *aijmkl;
200 
201   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
202   stat = mkl_sparse_x_export_csr(csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
203   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
204 
205   if (reuse == MAT_REUSE_MATRIX) {
206     ierr = MatDestroy(mat);CHKERRQ(ierr);
207   }
208   ierr = MatCreate(comm,&A);CHKERRQ(ierr);
209   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
210   ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,nrows,ncols);CHKERRQ(ierr);
211   /* We use MatSeqAIJSetPreallocationCSR() instead of MatCreateSeqAIJWithArrays() because we must copy the arrays exported
212    * from MKL; MKL developers tell us that modifying the arrays may cause unexpected results when using the MKL handle, and
213    * they will be destroyed when the MKL handle is destroyed.
214    * (In the interest of reducing memory consumption in future, can we figure out good ways to deal with this?) */
215   ierr = MatSeqAIJSetPreallocationCSR(A,ai,aj,aa);CHKERRQ(ierr);
216 
217   /* We now have an assembled sequential AIJ matrix created from copies of the exported arrays from the MKL matrix handle.
218    * Now turn it into a MATSEQAIJMKL. */
219   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
220 
221   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
222   aijmkl->csrA = csrA;
223 
224   /* The below code duplicates much of what is in MatSeqAIJKL_create_mkl_handle(). I dislike this code duplication, but
225    * MatSeqAIJMKL_create_mkl_handle() cannot be used because we don't need to create a handle -- we've already got one,
226    * and just need to be able to run the MKL optimization step. */
227   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
228   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
229   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
230   stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
231   stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
232   stat = mkl_sparse_optimize(aijmkl->csrA);
233   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set hints/complete mkl_sparse_optimize");
234   aijmkl->sparse_optimized = PETSC_TRUE;
235   ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
236 
237   *mat = A;
238   PetscFunctionReturn(0);
239 }
240 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
241 
242 /* MatSeqAIJMKL_update_from_mkl_handle() updates the matrix values array from the contents of the associated MKL sparse matrix handle.
243  * This is needed after mkl_sparse_sp2m() with SPARSE_STAGE_FINALIZE_MULT has been used to compute new values of the matrix in
244  * MatMatMultNumeric(). */
245 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
246 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_update_from_mkl_handle(Mat A)
247 {
248   PetscInt            i;
249   PetscInt            nrows,ncols;
250   PetscInt            nz;
251   PetscInt            *ai,*aj,*dummy;
252   PetscScalar         *aa;
253   PetscErrorCode      ierr;
254   Mat_SeqAIJMKL       *aijmkl;
255   sparse_status_t     stat;
256   sparse_index_base_t indexing;
257 
258   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
259 
260   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
261   stat = mkl_sparse_x_export_csr(aijmkl->csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
262   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
263 
264   /* We can't just do a copy from the arrays exported by MKL to those used for the PETSc AIJ storage, because the MKL and PETSc
265    * representations differ in small ways (e.g., more explicit nonzeros per row due to preallocation). */
266   for (i=0; i<nrows; i++) {
267     nz = ai[i+1] - ai[i];
268     ierr = MatSetValues_SeqAIJ(A, 1, &i, nz, aj+ai[i], aa+ai[i], INSERT_VALUES);CHKERRQ(ierr);
269   }
270 
271   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
272   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
273 
274   ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
275   /* We mark our matrix as having a valid, optimized MKL handle.
276    * TODO: It is valid, but I am not sure if it is optimized. Need to ask MKL developers. */
277   aijmkl->sparse_optimized = PETSC_TRUE;
278 
279   PetscFunctionReturn(0);
280 }
281 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
282 
283 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
284 {
285   PetscErrorCode ierr;
286   Mat_SeqAIJMKL  *aijmkl;
287   Mat_SeqAIJMKL  *aijmkl_dest;
288 
289   PetscFunctionBegin;
290   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
291   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
292   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
293   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
294   aijmkl_dest->sparse_optimized = PETSC_FALSE;
295   if (aijmkl->eager_inspection) {
296     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
297   }
298   PetscFunctionReturn(0);
299 }
300 
301 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
302 {
303   PetscErrorCode  ierr;
304   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
305   Mat_SeqAIJMKL   *aijmkl;
306 
307   PetscFunctionBegin;
308   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
309 
310   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
311    * extra information and some different methods, call the AssemblyEnd
312    * routine for a MATSEQAIJ.
313    * I'm not sure if this is the best way to do this, but it avoids
314    * a lot of code duplication. */
315   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
316   ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
317 
318   /* If the user has requested "eager" inspection, create the optimized MKL sparse handle (if needed; the function checks).
319    * (The default is to do "lazy" inspection, deferring this until something like MatMult() is called.) */
320   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
321   if (aijmkl->eager_inspection) {
322     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
323   }
324 
325   PetscFunctionReturn(0);
326 }
327 
328 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
329 {
330   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
331   const PetscScalar *x;
332   PetscScalar       *y;
333   const MatScalar   *aa;
334   PetscErrorCode    ierr;
335   PetscInt          m=A->rmap->n;
336   PetscInt          n=A->cmap->n;
337   PetscScalar       alpha = 1.0;
338   PetscScalar       beta = 0.0;
339   const PetscInt    *aj,*ai;
340   char              matdescra[6];
341 
342 
343   /* Variables not in MatMult_SeqAIJ. */
344   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
345 
346   PetscFunctionBegin;
347   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
348   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
349   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
350   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
351   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
352   aa   = a->a;  /* Nonzero elements stored row-by-row. */
353   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
354 
355   /* Call MKL sparse BLAS routine to do the MatMult. */
356   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
357 
358   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
359   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
360   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
361   PetscFunctionReturn(0);
362 }
363 
364 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
365 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
366 {
367   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
368   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
369   const PetscScalar *x;
370   PetscScalar       *y;
371   PetscErrorCode    ierr;
372   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
373   PetscObjectState  state;
374 
375   PetscFunctionBegin;
376 
377   /* If there are no nonzero entries, zero yy and return immediately. */
378   if(!a->nz) {
379     PetscInt i;
380     PetscInt m=A->rmap->n;
381     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
382     for (i=0; i<m; i++) {
383       y[i] = 0.0;
384     }
385     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
386     PetscFunctionReturn(0);
387   }
388 
389   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
390   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
391 
392   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
393    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
394    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
395   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
396   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
397     MatSeqAIJMKL_create_mkl_handle(A);
398   }
399 
400   /* Call MKL SpMV2 executor routine to do the MatMult. */
401   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
402   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
403 
404   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
405   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
406   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
407   PetscFunctionReturn(0);
408 }
409 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
410 
411 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
412 {
413   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
414   const PetscScalar *x;
415   PetscScalar       *y;
416   const MatScalar   *aa;
417   PetscErrorCode    ierr;
418   PetscInt          m=A->rmap->n;
419   PetscInt          n=A->cmap->n;
420   PetscScalar       alpha = 1.0;
421   PetscScalar       beta = 0.0;
422   const PetscInt    *aj,*ai;
423   char              matdescra[6];
424 
425   /* Variables not in MatMultTranspose_SeqAIJ. */
426   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
427 
428   PetscFunctionBegin;
429   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
430   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
431   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
432   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
433   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
434   aa   = a->a;  /* Nonzero elements stored row-by-row. */
435   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
436 
437   /* Call MKL sparse BLAS routine to do the MatMult. */
438   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
439 
440   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
441   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
442   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
443   PetscFunctionReturn(0);
444 }
445 
446 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
447 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
448 {
449   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
450   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
451   const PetscScalar *x;
452   PetscScalar       *y;
453   PetscErrorCode    ierr;
454   sparse_status_t   stat;
455   PetscObjectState  state;
456 
457   PetscFunctionBegin;
458 
459   /* If there are no nonzero entries, zero yy and return immediately. */
460   if(!a->nz) {
461     PetscInt i;
462     PetscInt n=A->cmap->n;
463     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
464     for (i=0; i<n; i++) {
465       y[i] = 0.0;
466     }
467     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
468     PetscFunctionReturn(0);
469   }
470 
471   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
472   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
473 
474   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
475    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
476    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
477   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
478   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
479     MatSeqAIJMKL_create_mkl_handle(A);
480   }
481 
482   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
483   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
484   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
485 
486   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
487   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
488   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
489   PetscFunctionReturn(0);
490 }
491 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
492 
493 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
494 {
495   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
496   const PetscScalar *x;
497   PetscScalar       *y,*z;
498   const MatScalar   *aa;
499   PetscErrorCode    ierr;
500   PetscInt          m=A->rmap->n;
501   PetscInt          n=A->cmap->n;
502   const PetscInt    *aj,*ai;
503   PetscInt          i;
504 
505   /* Variables not in MatMultAdd_SeqAIJ. */
506   char              transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
507   PetscScalar       alpha = 1.0;
508   PetscScalar       beta;
509   char              matdescra[6];
510 
511   PetscFunctionBegin;
512   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
513   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
514 
515   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
516   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
517   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
518   aa   = a->a;  /* Nonzero elements stored row-by-row. */
519   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
520 
521   /* Call MKL sparse BLAS routine to do the MatMult. */
522   if (zz == yy) {
523     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
524     beta = 1.0;
525     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
526   } else {
527     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
528      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
529     beta = 0.0;
530     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
531     for (i=0; i<m; i++) {
532       z[i] += y[i];
533     }
534   }
535 
536   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
537   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
538   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
539   PetscFunctionReturn(0);
540 }
541 
542 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
543 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
544 {
545   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
546   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
547   const PetscScalar *x;
548   PetscScalar       *y,*z;
549   PetscErrorCode    ierr;
550   PetscInt          m=A->rmap->n;
551   PetscInt          i;
552 
553   /* Variables not in MatMultAdd_SeqAIJ. */
554   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
555   PetscObjectState  state;
556 
557   PetscFunctionBegin;
558 
559   /* If there are no nonzero entries, set zz = yy and return immediately. */
560   if(!a->nz) {
561     PetscInt i;
562     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
563     for (i=0; i<m; i++) {
564       z[i] = y[i];
565     }
566     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
567     PetscFunctionReturn(0);
568   }
569 
570   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
571   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
572 
573   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
574    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
575    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
576   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
577   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
578     MatSeqAIJMKL_create_mkl_handle(A);
579   }
580 
581   /* Call MKL sparse BLAS routine to do the MatMult. */
582   if (zz == yy) {
583     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
584      * with alpha and beta both set to 1.0. */
585     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
586     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
587   } else {
588     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
589      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
590     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
591     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
592     for (i=0; i<m; i++) {
593       z[i] += y[i];
594     }
595   }
596 
597   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
598   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
599   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
600   PetscFunctionReturn(0);
601 }
602 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
603 
604 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
605 {
606   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
607   const PetscScalar *x;
608   PetscScalar       *y,*z;
609   const MatScalar   *aa;
610   PetscErrorCode    ierr;
611   PetscInt          m=A->rmap->n;
612   PetscInt          n=A->cmap->n;
613   const PetscInt    *aj,*ai;
614   PetscInt          i;
615 
616   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
617   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
618   PetscScalar       alpha = 1.0;
619   PetscScalar       beta;
620   char              matdescra[6];
621 
622   PetscFunctionBegin;
623   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
624   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
625 
626   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
627   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
628   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
629   aa   = a->a;  /* Nonzero elements stored row-by-row. */
630   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
631 
632   /* Call MKL sparse BLAS routine to do the MatMult. */
633   if (zz == yy) {
634     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
635     beta = 1.0;
636     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
637   } else {
638     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
639      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
640     beta = 0.0;
641     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
642     for (i=0; i<n; i++) {
643       z[i] += y[i];
644     }
645   }
646 
647   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
648   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
649   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
650   PetscFunctionReturn(0);
651 }
652 
653 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
654 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
655 {
656   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
657   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
658   const PetscScalar *x;
659   PetscScalar       *y,*z;
660   PetscErrorCode    ierr;
661   PetscInt          n=A->cmap->n;
662   PetscInt          i;
663   PetscObjectState  state;
664 
665   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
666   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
667 
668   PetscFunctionBegin;
669 
670   /* If there are no nonzero entries, set zz = yy and return immediately. */
671   if(!a->nz) {
672     PetscInt i;
673     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
674     for (i=0; i<n; i++) {
675       z[i] = y[i];
676     }
677     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
678     PetscFunctionReturn(0);
679   }
680 
681   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
682   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
683 
684   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
685    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
686    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
687   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
688   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
689     MatSeqAIJMKL_create_mkl_handle(A);
690   }
691 
692   /* Call MKL sparse BLAS routine to do the MatMult. */
693   if (zz == yy) {
694     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
695      * with alpha and beta both set to 1.0. */
696     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
697     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
698   } else {
699     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
700      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
701     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
702     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
703     for (i=0; i<n; i++) {
704       z[i] += y[i];
705     }
706   }
707 
708   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
709   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
710   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
711   PetscFunctionReturn(0);
712 }
713 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
714 
715 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
716 /* Note that this code currently doesn't actually get used when MatMatMult() is called with MAT_REUSE_MATRIX, because
717  * the MatMatMult() interface code calls MatMatMultNumeric() in this case.
718  * For releases of MKL prior to version 18, update 2:
719  * MKL has no notion of separately callable symbolic vs. numeric phases of sparse matrix-matrix multiply, so in the
720  * MAT_REUSE_MATRIX case, the SeqAIJ routines end up being used. Even though this means that the (hopefully more
721  * optimized) MKL routines do not get used, this probably is best because the MKL routines would waste time re-computing
722  * the symbolic portion, whereas the native PETSc SeqAIJ routines will avoid this. */
723 PetscErrorCode MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
724 {
725   Mat_SeqAIJMKL    *a, *b;
726   sparse_matrix_t  csrA, csrB, csrC;
727   PetscErrorCode   ierr;
728   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
729   PetscObjectState state;
730 
731   PetscFunctionBegin;
732   a = (Mat_SeqAIJMKL*)A->spptr;
733   b = (Mat_SeqAIJMKL*)B->spptr;
734   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
735   if (!a->sparse_optimized || a->state != state) {
736     MatSeqAIJMKL_create_mkl_handle(A);
737   }
738   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
739   if (!b->sparse_optimized || b->state != state) {
740     MatSeqAIJMKL_create_mkl_handle(B);
741   }
742   csrA = a->csrA;
743   csrB = b->csrA;
744 
745   stat = mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE,csrA,csrB,&csrC);
746   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
747 
748   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
749 
750   PetscFunctionReturn(0);
751 }
752 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
753 
754 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
755 PetscErrorCode MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,Mat C)
756 {
757   Mat_SeqAIJMKL       *a, *b, *c;
758   sparse_matrix_t     csrA, csrB, csrC;
759   PetscErrorCode      ierr;
760   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
761   struct matrix_descr descr_type_gen;
762   PetscObjectState    state;
763 
764   PetscFunctionBegin;
765   a = (Mat_SeqAIJMKL*)A->spptr;
766   b = (Mat_SeqAIJMKL*)B->spptr;
767   c = (Mat_SeqAIJMKL*)C->spptr;
768   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
769   if (!a->sparse_optimized || a->state != state) {
770     MatSeqAIJMKL_create_mkl_handle(A);
771   }
772   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
773   if (!b->sparse_optimized || b->state != state) {
774     MatSeqAIJMKL_create_mkl_handle(B);
775   }
776   csrA = a->csrA;
777   csrB = b->csrA;
778   csrC = c->csrA;
779   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
780 
781   stat = mkl_sparse_sp2m(SPARSE_OPERATION_NON_TRANSPOSE,descr_type_gen,csrA,
782                          SPARSE_OPERATION_NON_TRANSPOSE,descr_type_gen,csrB,
783                          SPARSE_STAGE_FINALIZE_MULT,&csrC);
784 
785   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete numerical stage of sparse matrix-matrix multiply");
786 
787   /* Have to update the PETSc AIJ representation for matrix C from contents of MKL handle. */
788   ierr = MatSeqAIJMKL_update_from_mkl_handle(C);CHKERRQ(ierr);
789 
790   PetscFunctionReturn(0);
791 }
792 #endif /* PETSC_HAVE_MKL_SPARSE_SP2M */
793 
794 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
795 PetscErrorCode MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
796 {
797   Mat_SeqAIJMKL    *a, *b;
798   sparse_matrix_t  csrA, csrB, csrC;
799   PetscErrorCode   ierr;
800   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
801   PetscObjectState state;
802 
803   PetscFunctionBegin;
804   a = (Mat_SeqAIJMKL*)A->spptr;
805   b = (Mat_SeqAIJMKL*)B->spptr;
806   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
807   if (!a->sparse_optimized || a->state != state) {
808     MatSeqAIJMKL_create_mkl_handle(A);
809   }
810   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
811   if (!b->sparse_optimized || b->state != state) {
812     MatSeqAIJMKL_create_mkl_handle(B);
813   }
814   csrA = a->csrA;
815   csrB = b->csrA;
816 
817   stat = mkl_sparse_spmm(SPARSE_OPERATION_TRANSPOSE,csrA,csrB,&csrC);
818   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
819 
820   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
821 
822   PetscFunctionReturn(0);
823 }
824 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
825 
826 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
827 PetscErrorCode MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat P,Mat C)
828 {
829   Mat_SeqAIJMKL       *a, *p, *c;
830   sparse_matrix_t     csrA, csrP, csrC;
831   PetscBool           set, flag;
832   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
833   struct matrix_descr descr_type_gen;
834   PetscObjectState    state;
835   PetscErrorCode      ierr;
836 
837   PetscFunctionBegin;
838   ierr = MatIsSymmetricKnown(A,&set,&flag);
839   if (!set || (set && !flag)) {
840     ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,C);CHKERRQ(ierr);
841     PetscFunctionReturn(0);
842   }
843 
844   a = (Mat_SeqAIJMKL*)A->spptr;
845   p = (Mat_SeqAIJMKL*)P->spptr;
846   c = (Mat_SeqAIJMKL*)C->spptr;
847   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
848   if (!a->sparse_optimized || a->state != state) {
849     MatSeqAIJMKL_create_mkl_handle(A);
850   }
851   ierr = PetscObjectStateGet((PetscObject)P,&state);CHKERRQ(ierr);
852   if (!p->sparse_optimized || p->state != state) {
853     MatSeqAIJMKL_create_mkl_handle(P);
854   }
855   csrA = a->csrA;
856   csrP = p->csrA;
857   csrC = c->csrA;
858   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
859 
860   /* Note that the call below won't work for complex matrices. (We protect this when pointers are assigned in MatConvert.) */
861   stat = mkl_sparse_sypr(SPARSE_OPERATION_TRANSPOSE,csrP,csrA,descr_type_gen,&csrC,SPARSE_STAGE_FINALIZE_MULT);
862   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to finalize mkl_sparse_sypr");
863 
864   /* Have to update the PETSc AIJ representation for matrix C from contents of MKL handle. */
865   ierr = MatSeqAIJMKL_update_from_mkl_handle(C);CHKERRQ(ierr);
866 
867   PetscFunctionReturn(0);
868 }
869 #endif
870 
871 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
872 PetscErrorCode MatPtAP_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
873 {
874   Mat_SeqAIJMKL       *a, *p;
875   sparse_matrix_t     csrA, csrP, csrC;
876   PetscBool           set, flag;
877   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
878   struct matrix_descr descr_type_gen;
879   PetscObjectState    state;
880   PetscErrorCode      ierr;
881 
882   PetscFunctionBegin;
883   ierr = MatIsSymmetricKnown(A,&set,&flag);
884   if (!set || (set && !flag)) {
885     ierr = MatPtAP_SeqAIJ_SeqAIJ(A,P,scall,fill,C);CHKERRQ(ierr);
886     PetscFunctionReturn(0);
887   }
888 
889   if (scall == MAT_REUSE_MATRIX) {
890     ierr = MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(A,P,*C);CHKERRQ(ierr);
891     PetscFunctionReturn(0);
892   }
893 
894   a = (Mat_SeqAIJMKL*)A->spptr;
895   p = (Mat_SeqAIJMKL*)P->spptr;
896   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
897   if (!a->sparse_optimized || a->state != state) {
898     MatSeqAIJMKL_create_mkl_handle(A);
899   }
900   ierr = PetscObjectStateGet((PetscObject)P,&state);CHKERRQ(ierr);
901   if (!p->sparse_optimized || p->state != state) {
902     MatSeqAIJMKL_create_mkl_handle(P);
903   }
904   csrA = a->csrA;
905   csrP = p->csrA;
906   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
907 
908   /* Note that the call below won't work for complex matrices. (We protect this when pointers are assigned in MatConvert.) */
909   stat = mkl_sparse_sypr(SPARSE_OPERATION_TRANSPOSE,csrP,csrA,descr_type_gen,&csrC,SPARSE_STAGE_FULL_MULT);
910   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete full mkl_sparse_sypr");
911 
912   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
913   ierr = MatSetOption(*C,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
914 
915   PetscFunctionReturn(0);
916 }
917 #endif
918 
919 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
920  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
921  * routine, but can also be used to convert an assembled SeqAIJ matrix
922  * into a SeqAIJMKL one. */
923 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
924 {
925   PetscErrorCode ierr;
926   Mat            B = *newmat;
927   Mat_SeqAIJMKL  *aijmkl;
928   PetscBool      set;
929   PetscBool      sametype;
930 
931   PetscFunctionBegin;
932   if (reuse == MAT_INITIAL_MATRIX) {
933     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
934   }
935 
936   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
937   if (sametype) PetscFunctionReturn(0);
938 
939   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
940   B->spptr = (void*) aijmkl;
941 
942   /* Set function pointers for methods that we inherit from AIJ but override.
943    * We also parse some command line options below, since those determine some of the methods we point to. */
944   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
945   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
946   B->ops->destroy          = MatDestroy_SeqAIJMKL;
947 
948   aijmkl->sparse_optimized = PETSC_FALSE;
949 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
950   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
951 #else
952   aijmkl->no_SpMV2 = PETSC_TRUE;
953 #endif
954   aijmkl->eager_inspection = PETSC_FALSE;
955 
956   /* Parse command line options. */
957   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
958   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
959   ierr = PetscOptionsBool("-mat_aijmkl_eager_inspection","Eager Inspection","None",(PetscBool)aijmkl->eager_inspection,(PetscBool*)&aijmkl->eager_inspection,&set);CHKERRQ(ierr);
960   ierr = PetscOptionsEnd();CHKERRQ(ierr);
961 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
962   if(!aijmkl->no_SpMV2) {
963     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
964     aijmkl->no_SpMV2 = PETSC_TRUE;
965   }
966 #endif
967 
968   if(!aijmkl->no_SpMV2) {
969 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
970     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
971     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
972     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
973     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
974     B->ops->matmult          = MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
975 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
976     B->ops->matmultnumeric   = MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2;
977 #ifndef PETSC_USE_COMPLEX
978     B->ops->ptap             = MatPtAP_SeqAIJMKL_SeqAIJMKL_SpMV2;
979     B->ops->ptapnumeric      = MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2;
980 #endif
981 #endif
982     B->ops->transposematmult = MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
983 #endif
984   } else {
985     B->ops->mult             = MatMult_SeqAIJMKL;
986     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
987     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
988     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
989   }
990 
991   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
992   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
993   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
994   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
995   if(!aijmkl->no_SpMV2) {
996 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
997     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
998 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
999     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqaijmkl_seqaijmkl_C",MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1000 #endif
1001     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1002 #endif
1003   }
1004 
1005   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
1006   *newmat = B;
1007   PetscFunctionReturn(0);
1008 }
1009 
1010 /*@C
1011    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
1012    This type inherits from AIJ and is largely identical, but uses sparse BLAS
1013    routines from Intel MKL whenever possible.
1014    MatMult, MatMultAdd, MatMultTranspose, MatMultTransposeAdd, MatMatMult, and MatTransposeMatMult
1015    operations are currently supported.
1016    If the installed version of MKL supports the "SpMV2" sparse
1017    inspector-executor routines, then those are used by default.
1018 
1019    Collective on MPI_Comm
1020 
1021    Input Parameters:
1022 +  comm - MPI communicator, set to PETSC_COMM_SELF
1023 .  m - number of rows
1024 .  n - number of columns
1025 .  nz - number of nonzeros per row (same for all rows)
1026 -  nnz - array containing the number of nonzeros in the various rows
1027          (possibly different for each row) or NULL
1028 
1029    Output Parameter:
1030 .  A - the matrix
1031 
1032    Options Database Keys:
1033 +  -mat_aijmkl_no_spmv2 - disable use of the SpMV2 inspector-executor routines
1034 -  -mat_aijmkl_eager_inspection - perform MKL "inspection" phase upon matrix assembly; default is to do "lazy" inspection, performing this step the first time the matrix is applied
1035 
1036    Notes:
1037    If nnz is given then nz is ignored
1038 
1039    Level: intermediate
1040 
1041 .keywords: matrix, MKL, sparse, parallel
1042 
1043 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
1044 @*/
1045 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1046 {
1047   PetscErrorCode ierr;
1048 
1049   PetscFunctionBegin;
1050   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1051   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1052   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
1053   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
1054   PetscFunctionReturn(0);
1055 }
1056 
1057 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
1058 {
1059   PetscErrorCode ierr;
1060 
1061   PetscFunctionBegin;
1062   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
1063   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
1064   PetscFunctionReturn(0);
1065 }
1066