xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision feff33ee0b5b037fa8f9f294dede656a2f85cc47)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool eager_inspection; /* If PETSC_TRUE, then call mkl_sparse_optimize() in MatDuplicate()/MatAssemblyEnd(). */
18   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
19   PetscObjectState state;
20 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
21   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
22   struct matrix_descr descr;
23 #endif
24 } Mat_SeqAIJMKL;
25 
26 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
27 
28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
29 {
30   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
31   /* so we will ignore 'MatType type'. */
32   PetscErrorCode ierr;
33   Mat            B       = *newmat;
34 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
35   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
36 #endif
37 
38   PetscFunctionBegin;
39   if (reuse == MAT_INITIAL_MATRIX) {
40     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
41   }
42 
43   /* Reset the original function pointers. */
44   B->ops->duplicate        = MatDuplicate_SeqAIJ;
45   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
46   B->ops->destroy          = MatDestroy_SeqAIJ;
47   B->ops->mult             = MatMult_SeqAIJ;
48   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
49   B->ops->multadd          = MatMultAdd_SeqAIJ;
50   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
51   B->ops->matmult          = MatMatMult_SeqAIJ_SeqAIJ;
52   B->ops->matmultnumeric   = MatMatMultNumeric_SeqAIJ_SeqAIJ;
53   B->ops->ptap             = MatPtAP_SeqAIJ_SeqAIJ;
54   B->ops->ptapnumeric      = MatPtAPNumeric_SeqAIJ_SeqAIJ;
55   B->ops->transposematmult = MatTransposeMatMult_SeqAIJ_SeqAIJ;
56 
57   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
58   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
59   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
60   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
61 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
62   if(!aijmkl->no_SpMV2) {
63     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
64 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
65     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
66 #endif
67     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
68   }
69 
70   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
71    * simply involves destroying the MKL sparse matrix handle and then freeing
72    * the spptr pointer. */
73   if (reuse == MAT_INITIAL_MATRIX) aijmkl = (Mat_SeqAIJMKL*)B->spptr;
74 
75   if (aijmkl->sparse_optimized) {
76     sparse_status_t stat;
77     stat = mkl_sparse_destroy(aijmkl->csrA);
78     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set hints/complete mkl_sparse_optimize");
79   }
80 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
81   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
82 
83   /* Change the type of B to MATSEQAIJ. */
84   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
85 
86   *newmat = B;
87   PetscFunctionReturn(0);
88 }
89 
90 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
91 {
92   PetscErrorCode ierr;
93   Mat_SeqAIJMKL  *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
94 
95   PetscFunctionBegin;
96 
97   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
98    * spptr pointer. */
99   if (aijmkl) {
100     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
101 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
102     if (aijmkl->sparse_optimized) {
103       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
104       stat = mkl_sparse_destroy(aijmkl->csrA);
105       if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
106     }
107 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
108     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
109   }
110 
111   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
112    * to destroy everything that remains. */
113   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
114   /* Note that I don't call MatSetType().  I believe this is because that
115    * is only to be called when *building* a matrix.  I could be wrong, but
116    * that is how things work for the SuperLU matrix class. */
117   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
118   PetscFunctionReturn(0);
119 }
120 
121 /* MatSeqAIJKL_create_mkl_handle(), if called with an AIJMKL matrix that has not had mkl_sparse_optimize() called for it,
122  * creates an MKL sparse matrix handle from the AIJ arrays and calls mkl_sparse_optimize().
123  * If called with an AIJMKL matrix for which aijmkl->sparse_optimized == PETSC_TRUE, then it destroys the old matrix
124  * handle, creates a new one, and then calls mkl_sparse_optimize().
125  * Although in normal MKL usage it is possible to have a valid matrix handle on which mkl_sparse_optimize() has not been
126  * called, for AIJMKL the handle creation and optimization step always occur together, so we don't handle the case of
127  * an unoptimized matrix handle here. */
128 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
129 {
130 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
131   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
132    * does nothing. We make it callable anyway in this case because it cuts
133    * down on littering the code with #ifdefs. */
134   PetscFunctionBegin;
135   PetscFunctionReturn(0);
136 #else
137   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
138   Mat_SeqAIJMKL    *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
139   PetscInt         m,n;
140   MatScalar        *aa;
141   PetscInt         *aj,*ai;
142   sparse_status_t  stat;
143   PetscErrorCode   ierr;
144 
145   PetscFunctionBegin;
146   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
147 
148   if (aijmkl->sparse_optimized) {
149     /* Matrix has been previously assembled and optimized. Must destroy old
150      * matrix handle before running the optimization step again. */
151     stat = mkl_sparse_destroy(aijmkl->csrA);
152     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
153   }
154   aijmkl->sparse_optimized = PETSC_FALSE;
155 
156   /* Now perform the SpMV2 setup and matrix optimization. */
157   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
158   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
159   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
160   m = A->rmap->n;
161   n = A->cmap->n;
162   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
163   aa   = a->a;  /* Nonzero elements stored row-by-row. */
164   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
165   if ((a->nz!=0) & !(A->structure_only)) {
166     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
167      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
168     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
169     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle");
170     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
171     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set mv_hint");
172     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
173     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set memory_hint");
174     stat = mkl_sparse_optimize(aijmkl->csrA);
175     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
176     aijmkl->sparse_optimized = PETSC_TRUE;
177     ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
178   }
179 
180   PetscFunctionReturn(0);
181 #endif
182 }
183 
184 /* MatSeqAIJMKL_create_from_mkl_handle() creates a sequential AIJMKL matrix from an MKL sparse matrix handle.
185  * We need this to implement MatMatMult() using the MKL inspector-executor routines, which return an (unoptimized)
186  * matrix handle.
187  * Note: This routine simply destroys and replaces the original matrix if MAT_REUSE_MATRIX has been specified, as
188  * there is no good alternative. */
189 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
190 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_from_mkl_handle(MPI_Comm comm,sparse_matrix_t csrA,MatReuse reuse,Mat *mat)
191 {
192   PetscErrorCode      ierr;
193   sparse_status_t     stat;
194   sparse_index_base_t indexing;
195   PetscInt            nrows, ncols;
196   PetscInt            *aj,*ai,*dummy;
197   MatScalar           *aa;
198   Mat                 A;
199   Mat_SeqAIJMKL       *aijmkl;
200 
201   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
202   stat = mkl_sparse_x_export_csr(csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
203   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
204 
205   if (reuse == MAT_REUSE_MATRIX) {
206     ierr = MatDestroy(mat);CHKERRQ(ierr);
207   }
208   ierr = MatCreate(comm,&A);CHKERRQ(ierr);
209   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
210   ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,nrows,ncols);CHKERRQ(ierr);
211   /* We use MatSeqAIJSetPreallocationCSR() instead of MatCreateSeqAIJWithArrays() because we must copy the arrays exported
212    * from MKL; MKL developers tell us that modifying the arrays may cause unexpected results when using the MKL handle, and
213    * they will be destroyed when the MKL handle is destroyed.
214    * (In the interest of reducing memory consumption in future, can we figure out good ways to deal with this?) */
215   ierr = MatSeqAIJSetPreallocationCSR(A,ai,aj,aa);CHKERRQ(ierr);
216 
217   /* We now have an assembled sequential AIJ matrix created from copies of the exported arrays from the MKL matrix handle.
218    * Now turn it into a MATSEQAIJMKL. */
219   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
220 
221   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
222   aijmkl->csrA = csrA;
223 
224   /* The below code duplicates much of what is in MatSeqAIJKL_create_mkl_handle(). I dislike this code duplication, but
225    * MatSeqAIJMKL_create_mkl_handle() cannot be used because we don't need to create a handle -- we've already got one,
226    * and just need to be able to run the MKL optimization step. */
227   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
228   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
229   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
230   stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
231   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set mv_hint");
232   stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
233   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set memory_hint");
234   stat = mkl_sparse_optimize(aijmkl->csrA);
235   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
236   aijmkl->sparse_optimized = PETSC_TRUE;
237   ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
238 
239   *mat = A;
240   PetscFunctionReturn(0);
241 }
242 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
243 
244 /* MatSeqAIJMKL_update_from_mkl_handle() updates the matrix values array from the contents of the associated MKL sparse matrix handle.
245  * This is needed after mkl_sparse_sp2m() with SPARSE_STAGE_FINALIZE_MULT has been used to compute new values of the matrix in
246  * MatMatMultNumeric(). */
247 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
248 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_update_from_mkl_handle(Mat A)
249 {
250   PetscInt            i;
251   PetscInt            nrows,ncols;
252   PetscInt            nz;
253   PetscInt            *ai,*aj,*dummy;
254   PetscScalar         *aa;
255   PetscErrorCode      ierr;
256   Mat_SeqAIJMKL       *aijmkl;
257   sparse_status_t     stat;
258   sparse_index_base_t indexing;
259 
260   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
261 
262   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
263   stat = mkl_sparse_x_export_csr(aijmkl->csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
264   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
265 
266   /* We can't just do a copy from the arrays exported by MKL to those used for the PETSc AIJ storage, because the MKL and PETSc
267    * representations differ in small ways (e.g., more explicit nonzeros per row due to preallocation). */
268   for (i=0; i<nrows; i++) {
269     nz = ai[i+1] - ai[i];
270     ierr = MatSetValues_SeqAIJ(A, 1, &i, nz, aj+ai[i], aa+ai[i], INSERT_VALUES);CHKERRQ(ierr);
271   }
272 
273   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
274   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
275 
276   ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
277   /* We mark our matrix as having a valid, optimized MKL handle.
278    * TODO: It is valid, but I am not sure if it is optimized. Need to ask MKL developers. */
279   aijmkl->sparse_optimized = PETSC_TRUE;
280 
281   PetscFunctionReturn(0);
282 }
283 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
284 
285 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
286 {
287   PetscErrorCode ierr;
288   Mat_SeqAIJMKL  *aijmkl;
289   Mat_SeqAIJMKL  *aijmkl_dest;
290 
291   PetscFunctionBegin;
292   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
293   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
294   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
295   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
296   aijmkl_dest->sparse_optimized = PETSC_FALSE;
297   if (aijmkl->eager_inspection) {
298     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
299   }
300   PetscFunctionReturn(0);
301 }
302 
303 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
304 {
305   PetscErrorCode  ierr;
306   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
307   Mat_SeqAIJMKL   *aijmkl;
308 
309   PetscFunctionBegin;
310   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
311 
312   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
313    * extra information and some different methods, call the AssemblyEnd
314    * routine for a MATSEQAIJ.
315    * I'm not sure if this is the best way to do this, but it avoids
316    * a lot of code duplication. */
317   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
318   ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
319 
320   /* If the user has requested "eager" inspection, create the optimized MKL sparse handle (if needed; the function checks).
321    * (The default is to do "lazy" inspection, deferring this until something like MatMult() is called.) */
322   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
323   if (aijmkl->eager_inspection) {
324     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
325   }
326 
327   PetscFunctionReturn(0);
328 }
329 
330 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
331 {
332   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
333   const PetscScalar *x;
334   PetscScalar       *y;
335   const MatScalar   *aa;
336   PetscErrorCode    ierr;
337   PetscInt          m=A->rmap->n;
338   PetscInt          n=A->cmap->n;
339   PetscScalar       alpha = 1.0;
340   PetscScalar       beta = 0.0;
341   const PetscInt    *aj,*ai;
342   char              matdescra[6];
343 
344 
345   /* Variables not in MatMult_SeqAIJ. */
346   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
347 
348   PetscFunctionBegin;
349   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
350   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
351   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
352   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
353   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
354   aa   = a->a;  /* Nonzero elements stored row-by-row. */
355   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
356 
357   /* Call MKL sparse BLAS routine to do the MatMult. */
358   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
359 
360   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
361   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
362   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
363   PetscFunctionReturn(0);
364 }
365 
366 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
367 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
368 {
369   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
370   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
371   const PetscScalar *x;
372   PetscScalar       *y;
373   PetscErrorCode    ierr;
374   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
375   PetscObjectState  state;
376 
377   PetscFunctionBegin;
378 
379   /* If there are no nonzero entries, zero yy and return immediately. */
380   if(!a->nz) {
381     PetscInt i;
382     PetscInt m=A->rmap->n;
383     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
384     for (i=0; i<m; i++) {
385       y[i] = 0.0;
386     }
387     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
388     PetscFunctionReturn(0);
389   }
390 
391   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
392   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
393 
394   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
395    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
396    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
397   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
398   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
399     MatSeqAIJMKL_create_mkl_handle(A);
400   }
401 
402   /* Call MKL SpMV2 executor routine to do the MatMult. */
403   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
404   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
405 
406   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
407   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
408   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
409   PetscFunctionReturn(0);
410 }
411 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
412 
413 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
414 {
415   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
416   const PetscScalar *x;
417   PetscScalar       *y;
418   const MatScalar   *aa;
419   PetscErrorCode    ierr;
420   PetscInt          m=A->rmap->n;
421   PetscInt          n=A->cmap->n;
422   PetscScalar       alpha = 1.0;
423   PetscScalar       beta = 0.0;
424   const PetscInt    *aj,*ai;
425   char              matdescra[6];
426 
427   /* Variables not in MatMultTranspose_SeqAIJ. */
428   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
429 
430   PetscFunctionBegin;
431   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
432   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
433   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
434   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
435   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
436   aa   = a->a;  /* Nonzero elements stored row-by-row. */
437   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
438 
439   /* Call MKL sparse BLAS routine to do the MatMult. */
440   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
441 
442   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
443   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
444   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
445   PetscFunctionReturn(0);
446 }
447 
448 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
449 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
450 {
451   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
452   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
453   const PetscScalar *x;
454   PetscScalar       *y;
455   PetscErrorCode    ierr;
456   sparse_status_t   stat;
457   PetscObjectState  state;
458 
459   PetscFunctionBegin;
460 
461   /* If there are no nonzero entries, zero yy and return immediately. */
462   if(!a->nz) {
463     PetscInt i;
464     PetscInt n=A->cmap->n;
465     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
466     for (i=0; i<n; i++) {
467       y[i] = 0.0;
468     }
469     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
470     PetscFunctionReturn(0);
471   }
472 
473   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
474   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
475 
476   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
477    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
478    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
479   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
480   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
481     MatSeqAIJMKL_create_mkl_handle(A);
482   }
483 
484   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
485   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
486   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
487 
488   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
489   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
490   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
491   PetscFunctionReturn(0);
492 }
493 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
494 
495 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
496 {
497   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
498   const PetscScalar *x;
499   PetscScalar       *y,*z;
500   const MatScalar   *aa;
501   PetscErrorCode    ierr;
502   PetscInt          m=A->rmap->n;
503   PetscInt          n=A->cmap->n;
504   const PetscInt    *aj,*ai;
505   PetscInt          i;
506 
507   /* Variables not in MatMultAdd_SeqAIJ. */
508   char              transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
509   PetscScalar       alpha = 1.0;
510   PetscScalar       beta;
511   char              matdescra[6];
512 
513   PetscFunctionBegin;
514   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
515   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
516 
517   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
518   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
519   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
520   aa   = a->a;  /* Nonzero elements stored row-by-row. */
521   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
522 
523   /* Call MKL sparse BLAS routine to do the MatMult. */
524   if (zz == yy) {
525     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
526     beta = 1.0;
527     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
528   } else {
529     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
530      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
531     beta = 0.0;
532     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
533     for (i=0; i<m; i++) {
534       z[i] += y[i];
535     }
536   }
537 
538   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
539   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
540   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
541   PetscFunctionReturn(0);
542 }
543 
544 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
545 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
546 {
547   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
548   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
549   const PetscScalar *x;
550   PetscScalar       *y,*z;
551   PetscErrorCode    ierr;
552   PetscInt          m=A->rmap->n;
553   PetscInt          i;
554 
555   /* Variables not in MatMultAdd_SeqAIJ. */
556   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
557   PetscObjectState  state;
558 
559   PetscFunctionBegin;
560 
561   /* If there are no nonzero entries, set zz = yy and return immediately. */
562   if(!a->nz) {
563     PetscInt i;
564     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
565     for (i=0; i<m; i++) {
566       z[i] = y[i];
567     }
568     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
569     PetscFunctionReturn(0);
570   }
571 
572   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
573   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
574 
575   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
576    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
577    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
578   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
579   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
580     MatSeqAIJMKL_create_mkl_handle(A);
581   }
582 
583   /* Call MKL sparse BLAS routine to do the MatMult. */
584   if (zz == yy) {
585     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
586      * with alpha and beta both set to 1.0. */
587     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
588     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
589   } else {
590     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
591      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
592     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
593     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
594     for (i=0; i<m; i++) {
595       z[i] += y[i];
596     }
597   }
598 
599   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
600   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
601   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
602   PetscFunctionReturn(0);
603 }
604 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
605 
606 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
607 {
608   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
609   const PetscScalar *x;
610   PetscScalar       *y,*z;
611   const MatScalar   *aa;
612   PetscErrorCode    ierr;
613   PetscInt          m=A->rmap->n;
614   PetscInt          n=A->cmap->n;
615   const PetscInt    *aj,*ai;
616   PetscInt          i;
617 
618   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
619   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
620   PetscScalar       alpha = 1.0;
621   PetscScalar       beta;
622   char              matdescra[6];
623 
624   PetscFunctionBegin;
625   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
626   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
627 
628   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
629   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
630   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
631   aa   = a->a;  /* Nonzero elements stored row-by-row. */
632   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
633 
634   /* Call MKL sparse BLAS routine to do the MatMult. */
635   if (zz == yy) {
636     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
637     beta = 1.0;
638     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
639   } else {
640     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
641      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
642     beta = 0.0;
643     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
644     for (i=0; i<n; i++) {
645       z[i] += y[i];
646     }
647   }
648 
649   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
650   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
651   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
652   PetscFunctionReturn(0);
653 }
654 
655 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
656 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
657 {
658   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
659   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
660   const PetscScalar *x;
661   PetscScalar       *y,*z;
662   PetscErrorCode    ierr;
663   PetscInt          n=A->cmap->n;
664   PetscInt          i;
665   PetscObjectState  state;
666 
667   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
668   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
669 
670   PetscFunctionBegin;
671 
672   /* If there are no nonzero entries, set zz = yy and return immediately. */
673   if(!a->nz) {
674     PetscInt i;
675     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
676     for (i=0; i<n; i++) {
677       z[i] = y[i];
678     }
679     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
680     PetscFunctionReturn(0);
681   }
682 
683   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
684   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
685 
686   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
687    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
688    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
689   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
690   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
691     MatSeqAIJMKL_create_mkl_handle(A);
692   }
693 
694   /* Call MKL sparse BLAS routine to do the MatMult. */
695   if (zz == yy) {
696     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
697      * with alpha and beta both set to 1.0. */
698     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
699     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
700   } else {
701     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
702      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
703     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
704     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
705     for (i=0; i<n; i++) {
706       z[i] += y[i];
707     }
708   }
709 
710   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
711   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
712   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
713   PetscFunctionReturn(0);
714 }
715 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
716 
717 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
718 /* Note that this code currently doesn't actually get used when MatMatMult() is called with MAT_REUSE_MATRIX, because
719  * the MatMatMult() interface code calls MatMatMultNumeric() in this case.
720  * For releases of MKL prior to version 18, update 2:
721  * MKL has no notion of separately callable symbolic vs. numeric phases of sparse matrix-matrix multiply, so in the
722  * MAT_REUSE_MATRIX case, the SeqAIJ routines end up being used. Even though this means that the (hopefully more
723  * optimized) MKL routines do not get used, this probably is best because the MKL routines would waste time re-computing
724  * the symbolic portion, whereas the native PETSc SeqAIJ routines will avoid this. */
725 PetscErrorCode MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
726 {
727   Mat_SeqAIJMKL    *a, *b;
728   sparse_matrix_t  csrA, csrB, csrC;
729   PetscErrorCode   ierr;
730   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
731   PetscObjectState state;
732 
733   PetscFunctionBegin;
734   a = (Mat_SeqAIJMKL*)A->spptr;
735   b = (Mat_SeqAIJMKL*)B->spptr;
736   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
737   if (!a->sparse_optimized || a->state != state) {
738     MatSeqAIJMKL_create_mkl_handle(A);
739   }
740   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
741   if (!b->sparse_optimized || b->state != state) {
742     MatSeqAIJMKL_create_mkl_handle(B);
743   }
744   csrA = a->csrA;
745   csrB = b->csrA;
746 
747   stat = mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE,csrA,csrB,&csrC);
748   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
749 
750   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
751 
752   PetscFunctionReturn(0);
753 }
754 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
755 
756 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
757 PetscErrorCode MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,Mat C)
758 {
759   Mat_SeqAIJMKL       *a, *b, *c;
760   sparse_matrix_t     csrA, csrB, csrC;
761   PetscErrorCode      ierr;
762   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
763   struct matrix_descr descr_type_gen;
764   PetscObjectState    state;
765 
766   PetscFunctionBegin;
767   a = (Mat_SeqAIJMKL*)A->spptr;
768   b = (Mat_SeqAIJMKL*)B->spptr;
769   c = (Mat_SeqAIJMKL*)C->spptr;
770   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
771   if (!a->sparse_optimized || a->state != state) {
772     MatSeqAIJMKL_create_mkl_handle(A);
773   }
774   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
775   if (!b->sparse_optimized || b->state != state) {
776     MatSeqAIJMKL_create_mkl_handle(B);
777   }
778   csrA = a->csrA;
779   csrB = b->csrA;
780   csrC = c->csrA;
781   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
782 
783   stat = mkl_sparse_sp2m(SPARSE_OPERATION_NON_TRANSPOSE,descr_type_gen,csrA,
784                          SPARSE_OPERATION_NON_TRANSPOSE,descr_type_gen,csrB,
785                          SPARSE_STAGE_FINALIZE_MULT,&csrC);
786 
787   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete numerical stage of sparse matrix-matrix multiply");
788 
789   /* Have to update the PETSc AIJ representation for matrix C from contents of MKL handle. */
790   ierr = MatSeqAIJMKL_update_from_mkl_handle(C);CHKERRQ(ierr);
791 
792   PetscFunctionReturn(0);
793 }
794 #endif /* PETSC_HAVE_MKL_SPARSE_SP2M */
795 
796 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
797 PetscErrorCode MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
798 {
799   Mat_SeqAIJMKL    *a, *b;
800   sparse_matrix_t  csrA, csrB, csrC;
801   PetscErrorCode   ierr;
802   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
803   PetscObjectState state;
804 
805   PetscFunctionBegin;
806   a = (Mat_SeqAIJMKL*)A->spptr;
807   b = (Mat_SeqAIJMKL*)B->spptr;
808   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
809   if (!a->sparse_optimized || a->state != state) {
810     MatSeqAIJMKL_create_mkl_handle(A);
811   }
812   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
813   if (!b->sparse_optimized || b->state != state) {
814     MatSeqAIJMKL_create_mkl_handle(B);
815   }
816   csrA = a->csrA;
817   csrB = b->csrA;
818 
819   stat = mkl_sparse_spmm(SPARSE_OPERATION_TRANSPOSE,csrA,csrB,&csrC);
820   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
821 
822   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
823 
824   PetscFunctionReturn(0);
825 }
826 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
827 
828 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
829 PetscErrorCode MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat P,Mat C)
830 {
831   Mat_SeqAIJMKL       *a, *p, *c;
832   sparse_matrix_t     csrA, csrP, csrC;
833   PetscBool           set, flag;
834   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
835   struct matrix_descr descr_type_gen;
836   PetscObjectState    state;
837   PetscErrorCode      ierr;
838 
839   PetscFunctionBegin;
840   ierr = MatIsSymmetricKnown(A,&set,&flag);
841   if (!set || (set && !flag)) {
842     ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,C);CHKERRQ(ierr);
843     PetscFunctionReturn(0);
844   }
845 
846   a = (Mat_SeqAIJMKL*)A->spptr;
847   p = (Mat_SeqAIJMKL*)P->spptr;
848   c = (Mat_SeqAIJMKL*)C->spptr;
849   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
850   if (!a->sparse_optimized || a->state != state) {
851     MatSeqAIJMKL_create_mkl_handle(A);
852   }
853   ierr = PetscObjectStateGet((PetscObject)P,&state);CHKERRQ(ierr);
854   if (!p->sparse_optimized || p->state != state) {
855     MatSeqAIJMKL_create_mkl_handle(P);
856   }
857   csrA = a->csrA;
858   csrP = p->csrA;
859   csrC = c->csrA;
860   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
861 
862   /* Note that the call below won't work for complex matrices. (We protect this when pointers are assigned in MatConvert.) */
863   stat = mkl_sparse_sypr(SPARSE_OPERATION_TRANSPOSE,csrP,csrA,descr_type_gen,&csrC,SPARSE_STAGE_FINALIZE_MULT);
864   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to finalize mkl_sparse_sypr");
865 
866   /* Have to update the PETSc AIJ representation for matrix C from contents of MKL handle. */
867   ierr = MatSeqAIJMKL_update_from_mkl_handle(C);CHKERRQ(ierr);
868 
869   PetscFunctionReturn(0);
870 }
871 #endif
872 
873 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
874 PetscErrorCode MatPtAP_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
875 {
876   Mat_SeqAIJMKL       *a, *p;
877   sparse_matrix_t     csrA, csrP, csrC;
878   PetscBool           set, flag;
879   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
880   struct matrix_descr descr_type_gen;
881   PetscObjectState    state;
882   PetscErrorCode      ierr;
883 
884   PetscFunctionBegin;
885   ierr = MatIsSymmetricKnown(A,&set,&flag);
886   if (!set || (set && !flag)) {
887     ierr = MatPtAP_SeqAIJ_SeqAIJ(A,P,scall,fill,C);CHKERRQ(ierr);
888     PetscFunctionReturn(0);
889   }
890 
891   if (scall == MAT_REUSE_MATRIX) {
892     ierr = MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(A,P,*C);CHKERRQ(ierr);
893     PetscFunctionReturn(0);
894   }
895 
896   a = (Mat_SeqAIJMKL*)A->spptr;
897   p = (Mat_SeqAIJMKL*)P->spptr;
898   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
899   if (!a->sparse_optimized || a->state != state) {
900     MatSeqAIJMKL_create_mkl_handle(A);
901   }
902   ierr = PetscObjectStateGet((PetscObject)P,&state);CHKERRQ(ierr);
903   if (!p->sparse_optimized || p->state != state) {
904     MatSeqAIJMKL_create_mkl_handle(P);
905   }
906   csrA = a->csrA;
907   csrP = p->csrA;
908   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
909 
910   /* Note that the call below won't work for complex matrices. (We protect this when pointers are assigned in MatConvert.) */
911   stat = mkl_sparse_sypr(SPARSE_OPERATION_TRANSPOSE,csrP,csrA,descr_type_gen,&csrC,SPARSE_STAGE_FULL_MULT);
912   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete full mkl_sparse_sypr");
913 
914   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
915   ierr = MatSetOption(*C,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
916 
917   PetscFunctionReturn(0);
918 }
919 #endif
920 
921 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
922  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
923  * routine, but can also be used to convert an assembled SeqAIJ matrix
924  * into a SeqAIJMKL one. */
925 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
926 {
927   PetscErrorCode ierr;
928   Mat            B = *newmat;
929   Mat_SeqAIJMKL  *aijmkl;
930   PetscBool      set;
931   PetscBool      sametype;
932 
933   PetscFunctionBegin;
934   if (reuse == MAT_INITIAL_MATRIX) {
935     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
936   }
937 
938   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
939   if (sametype) PetscFunctionReturn(0);
940 
941   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
942   B->spptr = (void*) aijmkl;
943 
944   /* Set function pointers for methods that we inherit from AIJ but override.
945    * We also parse some command line options below, since those determine some of the methods we point to. */
946   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
947   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
948   B->ops->destroy          = MatDestroy_SeqAIJMKL;
949 
950   aijmkl->sparse_optimized = PETSC_FALSE;
951 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
952   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
953 #else
954   aijmkl->no_SpMV2 = PETSC_TRUE;
955 #endif
956   aijmkl->eager_inspection = PETSC_FALSE;
957 
958   /* Parse command line options. */
959   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
960   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
961   ierr = PetscOptionsBool("-mat_aijmkl_eager_inspection","Eager Inspection","None",(PetscBool)aijmkl->eager_inspection,(PetscBool*)&aijmkl->eager_inspection,&set);CHKERRQ(ierr);
962   ierr = PetscOptionsEnd();CHKERRQ(ierr);
963 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
964   if(!aijmkl->no_SpMV2) {
965     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
966     aijmkl->no_SpMV2 = PETSC_TRUE;
967   }
968 #endif
969 
970   if(!aijmkl->no_SpMV2) {
971 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
972     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
973     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
974     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
975     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
976     B->ops->matmult          = MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
977 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
978     B->ops->matmultnumeric   = MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2;
979 #ifndef PETSC_USE_COMPLEX
980     B->ops->ptap             = MatPtAP_SeqAIJMKL_SeqAIJMKL_SpMV2;
981     B->ops->ptapnumeric      = MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2;
982 #endif
983 #endif
984     B->ops->transposematmult = MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
985 #endif
986   } else {
987     B->ops->mult             = MatMult_SeqAIJMKL;
988     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
989     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
990     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
991   }
992 
993   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
994   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
995   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
996   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
997   if(!aijmkl->no_SpMV2) {
998 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
999     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1000 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
1001     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqaijmkl_seqaijmkl_C",MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1002 #endif
1003     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1004 #endif
1005   }
1006 
1007   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
1008   *newmat = B;
1009   PetscFunctionReturn(0);
1010 }
1011 
1012 /*@C
1013    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
1014    This type inherits from AIJ and is largely identical, but uses sparse BLAS
1015    routines from Intel MKL whenever possible.
1016    If the installed version of MKL supports the "SpMV2" sparse
1017    inspector-executor routines, then those are used by default.
1018    MatMult, MatMultAdd, MatMultTranspose, MatMultTransposeAdd, MatMatMult, MatTransposeMatMult, and MatPtAP (for
1019    symmetric A) operations are currently supported.
1020    Note that MKL version 18, update 2 or later is required for MatPtAP/MatPtAPNumeric and MatMatMultNumeric.
1021 
1022    Collective on MPI_Comm
1023 
1024    Input Parameters:
1025 +  comm - MPI communicator, set to PETSC_COMM_SELF
1026 .  m - number of rows
1027 .  n - number of columns
1028 .  nz - number of nonzeros per row (same for all rows)
1029 -  nnz - array containing the number of nonzeros in the various rows
1030          (possibly different for each row) or NULL
1031 
1032    Output Parameter:
1033 .  A - the matrix
1034 
1035    Options Database Keys:
1036 +  -mat_aijmkl_no_spmv2 - disable use of the SpMV2 inspector-executor routines
1037 -  -mat_aijmkl_eager_inspection - perform MKL "inspection" phase upon matrix assembly; default is to do "lazy" inspection, performing this step the first time the matrix is applied
1038 
1039    Notes:
1040    If nnz is given then nz is ignored
1041 
1042    Level: intermediate
1043 
1044 .keywords: matrix, MKL, sparse, parallel
1045 
1046 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
1047 @*/
1048 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1049 {
1050   PetscErrorCode ierr;
1051 
1052   PetscFunctionBegin;
1053   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1054   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1055   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
1056   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
1057   PetscFunctionReturn(0);
1058 }
1059 
1060 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
1061 {
1062   PetscErrorCode ierr;
1063 
1064   PetscFunctionBegin;
1065   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
1066   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
1067   PetscFunctionReturn(0);
1068 }
1069