xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision cf019ec6f319a509de7602f53cdadf853cfc8a83)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool eager_inspection; /* If PETSC_TRUE, then call mkl_sparse_optimize() in MatDuplicate()/MatAssemblyEnd(). */
18   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
19   PetscObjectState state;
20 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
21   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
22   struct matrix_descr descr;
23 #endif
24 } Mat_SeqAIJMKL;
25 
26 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
27 
28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
29 {
30   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
31   /* so we will ignore 'MatType type'. */
32   PetscErrorCode ierr;
33   Mat            B       = *newmat;
34 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
35   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
36 #endif
37 
38   PetscFunctionBegin;
39   if (reuse == MAT_INITIAL_MATRIX) {
40     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
41   }
42 
43   /* Reset the original function pointers. */
44   B->ops->duplicate        = MatDuplicate_SeqAIJ;
45   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
46   B->ops->destroy          = MatDestroy_SeqAIJ;
47   B->ops->mult             = MatMult_SeqAIJ;
48   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
49   B->ops->multadd          = MatMultAdd_SeqAIJ;
50   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
51   B->ops->matmult          = MatMatMult_SeqAIJ_SeqAIJ;
52   B->ops->matmultnumeric   = MatMatMultNumeric_SeqAIJ_SeqAIJ;
53   B->ops->ptap             = MatPtAP_SeqAIJ_SeqAIJ;
54   B->ops->ptapnumeric      = MatPtAPNumeric_SeqAIJ_SeqAIJ;
55   B->ops->transposematmult = MatTransposeMatMult_SeqAIJ_SeqAIJ;
56 
57   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
58   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
59   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
60   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
61 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
62   if(!aijmkl->no_SpMV2) {
63     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
64 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
65     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
66 #endif
67     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
68   }
69 
70   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
71    * simply involves destroying the MKL sparse matrix handle and then freeing
72    * the spptr pointer. */
73   if (reuse == MAT_INITIAL_MATRIX) aijmkl = (Mat_SeqAIJMKL*)B->spptr;
74 
75   if (aijmkl->sparse_optimized) {
76     sparse_status_t stat;
77     stat = mkl_sparse_destroy(aijmkl->csrA);
78     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set hints/complete mkl_sparse_optimize");
79   }
80 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
81   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
82 
83   /* Change the type of B to MATSEQAIJ. */
84   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
85 
86   *newmat = B;
87   PetscFunctionReturn(0);
88 }
89 
90 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
91 {
92   PetscErrorCode ierr;
93   Mat_SeqAIJMKL  *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
94 
95   PetscFunctionBegin;
96 
97   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
98    * spptr pointer. */
99   if (aijmkl) {
100     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
101 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
102     if (aijmkl->sparse_optimized) {
103       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
104       stat = mkl_sparse_destroy(aijmkl->csrA);
105       if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
106     }
107 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
108     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
109   }
110 
111   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
112    * to destroy everything that remains. */
113   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
114   /* Note that I don't call MatSetType().  I believe this is because that
115    * is only to be called when *building* a matrix.  I could be wrong, but
116    * that is how things work for the SuperLU matrix class. */
117   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
118   PetscFunctionReturn(0);
119 }
120 
121 /* MatSeqAIJKL_create_mkl_handle(), if called with an AIJMKL matrix that has not had mkl_sparse_optimize() called for it,
122  * creates an MKL sparse matrix handle from the AIJ arrays and calls mkl_sparse_optimize().
123  * If called with an AIJMKL matrix for which aijmkl->sparse_optimized == PETSC_TRUE, then it destroys the old matrix
124  * handle, creates a new one, and then calls mkl_sparse_optimize().
125  * Although in normal MKL usage it is possible to have a valid matrix handle on which mkl_sparse_optimize() has not been
126  * called, for AIJMKL the handle creation and optimization step always occur together, so we don't handle the case of
127  * an unoptimized matrix handle here. */
128 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
129 {
130 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
131   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
132    * does nothing. We make it callable anyway in this case because it cuts
133    * down on littering the code with #ifdefs. */
134   PetscFunctionBegin;
135   PetscFunctionReturn(0);
136 #else
137   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
138   Mat_SeqAIJMKL    *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
139   PetscInt         m,n;
140   MatScalar        *aa;
141   PetscInt         *aj,*ai;
142   sparse_status_t  stat;
143   PetscErrorCode   ierr;
144 
145   PetscFunctionBegin;
146   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
147 
148   if (aijmkl->sparse_optimized) {
149     /* Matrix has been previously assembled and optimized. Must destroy old
150      * matrix handle before running the optimization step again. */
151     stat = mkl_sparse_destroy(aijmkl->csrA);
152     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
153   }
154   aijmkl->sparse_optimized = PETSC_FALSE;
155 
156   /* Now perform the SpMV2 setup and matrix optimization. */
157   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
158   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
159   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
160   m = A->rmap->n;
161   n = A->cmap->n;
162   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
163   aa   = a->a;  /* Nonzero elements stored row-by-row. */
164   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
165   if ((a->nz!=0) & !(A->structure_only)) {
166     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
167      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
168     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
169     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle");
170     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
171     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set mv_hint");
172     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
173     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set memory_hint");
174     if (!aijmkl->no_SpMV2) {
175       stat = mkl_sparse_optimize(aijmkl->csrA);
176       if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
177     }
178     aijmkl->sparse_optimized = PETSC_TRUE;
179     ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
180   }
181 
182   PetscFunctionReturn(0);
183 #endif
184 }
185 
186 /* MatSeqAIJMKL_create_from_mkl_handle() creates a sequential AIJMKL matrix from an MKL sparse matrix handle.
187  * We need this to implement MatMatMult() using the MKL inspector-executor routines, which return an (unoptimized)
188  * matrix handle.
189  * Note: This routine simply destroys and replaces the original matrix if MAT_REUSE_MATRIX has been specified, as
190  * there is no good alternative. */
191 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
192 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_from_mkl_handle(MPI_Comm comm,sparse_matrix_t csrA,MatReuse reuse,Mat *mat)
193 {
194   PetscErrorCode      ierr;
195   sparse_status_t     stat;
196   sparse_index_base_t indexing;
197   PetscInt            nrows, ncols;
198   PetscInt            *aj,*ai,*dummy;
199   MatScalar           *aa;
200   Mat                 A;
201   Mat_SeqAIJMKL       *aijmkl;
202 
203   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
204   stat = mkl_sparse_x_export_csr(csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
205   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
206 
207   if (reuse == MAT_REUSE_MATRIX) {
208     ierr = MatDestroy(mat);CHKERRQ(ierr);
209   }
210   ierr = MatCreate(comm,&A);CHKERRQ(ierr);
211   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
212   ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,nrows,ncols);CHKERRQ(ierr);
213   /* We use MatSeqAIJSetPreallocationCSR() instead of MatCreateSeqAIJWithArrays() because we must copy the arrays exported
214    * from MKL; MKL developers tell us that modifying the arrays may cause unexpected results when using the MKL handle, and
215    * they will be destroyed when the MKL handle is destroyed.
216    * (In the interest of reducing memory consumption in future, can we figure out good ways to deal with this?) */
217   ierr = MatSeqAIJSetPreallocationCSR(A,ai,aj,aa);CHKERRQ(ierr);
218 
219   /* We now have an assembled sequential AIJ matrix created from copies of the exported arrays from the MKL matrix handle.
220    * Now turn it into a MATSEQAIJMKL. */
221   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
222 
223   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
224   aijmkl->csrA = csrA;
225 
226   /* The below code duplicates much of what is in MatSeqAIJKL_create_mkl_handle(). I dislike this code duplication, but
227    * MatSeqAIJMKL_create_mkl_handle() cannot be used because we don't need to create a handle -- we've already got one,
228    * and just need to be able to run the MKL optimization step. */
229   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
230   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
231   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
232   stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
233   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set mv_hint");
234   stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
235   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set memory_hint");
236   if (!aijmkl->no_SpMV2) {
237     stat = mkl_sparse_optimize(aijmkl->csrA);
238     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
239   }
240   aijmkl->sparse_optimized = PETSC_TRUE;
241   ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
242 
243   *mat = A;
244   PetscFunctionReturn(0);
245 }
246 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
247 
248 /* MatSeqAIJMKL_update_from_mkl_handle() updates the matrix values array from the contents of the associated MKL sparse matrix handle.
249  * This is needed after mkl_sparse_sp2m() with SPARSE_STAGE_FINALIZE_MULT has been used to compute new values of the matrix in
250  * MatMatMultNumeric(). */
251 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
252 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_update_from_mkl_handle(Mat A)
253 {
254   PetscInt            i;
255   PetscInt            nrows,ncols;
256   PetscInt            nz;
257   PetscInt            *ai,*aj,*dummy;
258   PetscScalar         *aa;
259   PetscErrorCode      ierr;
260   Mat_SeqAIJMKL       *aijmkl;
261   sparse_status_t     stat;
262   sparse_index_base_t indexing;
263 
264   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
265 
266   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
267   stat = mkl_sparse_x_export_csr(aijmkl->csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
268   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
269 
270   /* We can't just do a copy from the arrays exported by MKL to those used for the PETSc AIJ storage, because the MKL and PETSc
271    * representations differ in small ways (e.g., more explicit nonzeros per row due to preallocation). */
272   for (i=0; i<nrows; i++) {
273     nz = ai[i+1] - ai[i];
274     ierr = MatSetValues_SeqAIJ(A, 1, &i, nz, aj+ai[i], aa+ai[i], INSERT_VALUES);CHKERRQ(ierr);
275   }
276 
277   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
278   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
279 
280   ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
281   /* We mark our matrix as having a valid, optimized MKL handle.
282    * TODO: It is valid, but I am not sure if it is optimized. Need to ask MKL developers. */
283   aijmkl->sparse_optimized = PETSC_TRUE;
284 
285   PetscFunctionReturn(0);
286 }
287 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
288 
289 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
290 {
291   PetscErrorCode ierr;
292   Mat_SeqAIJMKL  *aijmkl;
293   Mat_SeqAIJMKL  *aijmkl_dest;
294 
295   PetscFunctionBegin;
296   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
297   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
298   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
299   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
300   aijmkl_dest->sparse_optimized = PETSC_FALSE;
301   if (aijmkl->eager_inspection) {
302     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
303   }
304   PetscFunctionReturn(0);
305 }
306 
307 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
308 {
309   PetscErrorCode  ierr;
310   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
311   Mat_SeqAIJMKL   *aijmkl;
312 
313   PetscFunctionBegin;
314   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
315 
316   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
317    * extra information and some different methods, call the AssemblyEnd
318    * routine for a MATSEQAIJ.
319    * I'm not sure if this is the best way to do this, but it avoids
320    * a lot of code duplication. */
321   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
322   ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
323 
324   /* If the user has requested "eager" inspection, create the optimized MKL sparse handle (if needed; the function checks).
325    * (The default is to do "lazy" inspection, deferring this until something like MatMult() is called.) */
326   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
327   if (aijmkl->eager_inspection) {
328     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
329   }
330 
331   PetscFunctionReturn(0);
332 }
333 
334 #ifndef PETSC_HAVE_MKL_SPARSE_SP2M
335 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
336 {
337   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
338   const PetscScalar *x;
339   PetscScalar       *y;
340   const MatScalar   *aa;
341   PetscErrorCode    ierr;
342   PetscInt          m=A->rmap->n;
343   PetscInt          n=A->cmap->n;
344   PetscScalar       alpha = 1.0;
345   PetscScalar       beta = 0.0;
346   const PetscInt    *aj,*ai;
347   char              matdescra[6];
348 
349 
350   /* Variables not in MatMult_SeqAIJ. */
351   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
352 
353   PetscFunctionBegin;
354   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
355   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
356   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
357   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
358   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
359   aa   = a->a;  /* Nonzero elements stored row-by-row. */
360   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
361 
362   /* Call MKL sparse BLAS routine to do the MatMult. */
363   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
364 
365   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
366   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
367   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
368   PetscFunctionReturn(0);
369 }
370 #endif
371 
372 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
373 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
374 {
375   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
376   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
377   const PetscScalar *x;
378   PetscScalar       *y;
379   PetscErrorCode    ierr;
380   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
381   PetscObjectState  state;
382 
383   PetscFunctionBegin;
384 
385   /* If there are no nonzero entries, zero yy and return immediately. */
386   if(!a->nz) {
387     PetscInt i;
388     PetscInt m=A->rmap->n;
389     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
390     for (i=0; i<m; i++) {
391       y[i] = 0.0;
392     }
393     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
394     PetscFunctionReturn(0);
395   }
396 
397   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
398   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
399 
400   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
401    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
402    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
403   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
404   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
405     MatSeqAIJMKL_create_mkl_handle(A);
406   }
407 
408   /* Call MKL SpMV2 executor routine to do the MatMult. */
409   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
410   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
411 
412   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
413   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
414   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
415   PetscFunctionReturn(0);
416 }
417 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
418 
419 #ifndef PETSC_HAVE_MKL_SPARSE_SP2M
420 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
421 {
422   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
423   const PetscScalar *x;
424   PetscScalar       *y;
425   const MatScalar   *aa;
426   PetscErrorCode    ierr;
427   PetscInt          m=A->rmap->n;
428   PetscInt          n=A->cmap->n;
429   PetscScalar       alpha = 1.0;
430   PetscScalar       beta = 0.0;
431   const PetscInt    *aj,*ai;
432   char              matdescra[6];
433 
434   /* Variables not in MatMultTranspose_SeqAIJ. */
435   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
436 
437   PetscFunctionBegin;
438   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
439   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
440   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
441   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
442   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
443   aa   = a->a;  /* Nonzero elements stored row-by-row. */
444   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
445 
446   /* Call MKL sparse BLAS routine to do the MatMult. */
447   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
448 
449   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
450   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
451   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
452   PetscFunctionReturn(0);
453 }
454 #endif
455 
456 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
457 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
458 {
459   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
460   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
461   const PetscScalar *x;
462   PetscScalar       *y;
463   PetscErrorCode    ierr;
464   sparse_status_t   stat;
465   PetscObjectState  state;
466 
467   PetscFunctionBegin;
468 
469   /* If there are no nonzero entries, zero yy and return immediately. */
470   if(!a->nz) {
471     PetscInt i;
472     PetscInt n=A->cmap->n;
473     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
474     for (i=0; i<n; i++) {
475       y[i] = 0.0;
476     }
477     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
478     PetscFunctionReturn(0);
479   }
480 
481   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
482   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
483 
484   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
485    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
486    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
487   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
488   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
489     MatSeqAIJMKL_create_mkl_handle(A);
490   }
491 
492   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
493   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
494   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
495 
496   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
497   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
498   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
499   PetscFunctionReturn(0);
500 }
501 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
502 
503 #ifndef PETSC_HAVE_MKL_SPARSE_SP2M
504 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
505 {
506   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
507   const PetscScalar *x;
508   PetscScalar       *y,*z;
509   const MatScalar   *aa;
510   PetscErrorCode    ierr;
511   PetscInt          m=A->rmap->n;
512   PetscInt          n=A->cmap->n;
513   const PetscInt    *aj,*ai;
514   PetscInt          i;
515 
516   /* Variables not in MatMultAdd_SeqAIJ. */
517   char              transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
518   PetscScalar       alpha = 1.0;
519   PetscScalar       beta;
520   char              matdescra[6];
521 
522   PetscFunctionBegin;
523   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
524   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
525 
526   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
527   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
528   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
529   aa   = a->a;  /* Nonzero elements stored row-by-row. */
530   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
531 
532   /* Call MKL sparse BLAS routine to do the MatMult. */
533   if (zz == yy) {
534     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
535     beta = 1.0;
536     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
537   } else {
538     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
539      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
540     beta = 0.0;
541     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
542     for (i=0; i<m; i++) {
543       z[i] += y[i];
544     }
545   }
546 
547   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
548   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
549   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
550   PetscFunctionReturn(0);
551 }
552 #endif
553 
554 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
555 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
556 {
557   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
558   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
559   const PetscScalar *x;
560   PetscScalar       *y,*z;
561   PetscErrorCode    ierr;
562   PetscInt          m=A->rmap->n;
563   PetscInt          i;
564 
565   /* Variables not in MatMultAdd_SeqAIJ. */
566   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
567   PetscObjectState  state;
568 
569   PetscFunctionBegin;
570 
571   /* If there are no nonzero entries, set zz = yy and return immediately. */
572   if(!a->nz) {
573     PetscInt i;
574     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
575     for (i=0; i<m; i++) {
576       z[i] = y[i];
577     }
578     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
579     PetscFunctionReturn(0);
580   }
581 
582   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
583   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
584 
585   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
586    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
587    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
588   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
589   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
590     MatSeqAIJMKL_create_mkl_handle(A);
591   }
592 
593   /* Call MKL sparse BLAS routine to do the MatMult. */
594   if (zz == yy) {
595     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
596      * with alpha and beta both set to 1.0. */
597     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
598     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
599   } else {
600     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
601      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
602     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
603     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
604     for (i=0; i<m; i++) {
605       z[i] += y[i];
606     }
607   }
608 
609   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
610   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
611   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
612   PetscFunctionReturn(0);
613 }
614 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
615 
616 #ifndef PETSC_HAVE_MKL_SPARSE_SP2M
617 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
618 {
619   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
620   const PetscScalar *x;
621   PetscScalar       *y,*z;
622   const MatScalar   *aa;
623   PetscErrorCode    ierr;
624   PetscInt          m=A->rmap->n;
625   PetscInt          n=A->cmap->n;
626   const PetscInt    *aj,*ai;
627   PetscInt          i;
628 
629   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
630   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
631   PetscScalar       alpha = 1.0;
632   PetscScalar       beta;
633   char              matdescra[6];
634 
635   PetscFunctionBegin;
636   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
637   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
638 
639   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
640   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
641   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
642   aa   = a->a;  /* Nonzero elements stored row-by-row. */
643   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
644 
645   /* Call MKL sparse BLAS routine to do the MatMult. */
646   if (zz == yy) {
647     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
648     beta = 1.0;
649     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
650   } else {
651     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
652      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
653     beta = 0.0;
654     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
655     for (i=0; i<n; i++) {
656       z[i] += y[i];
657     }
658   }
659 
660   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
661   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
662   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
663   PetscFunctionReturn(0);
664 }
665 #endif
666 
667 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
668 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
669 {
670   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
671   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
672   const PetscScalar *x;
673   PetscScalar       *y,*z;
674   PetscErrorCode    ierr;
675   PetscInt          n=A->cmap->n;
676   PetscInt          i;
677   PetscObjectState  state;
678 
679   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
680   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
681 
682   PetscFunctionBegin;
683 
684   /* If there are no nonzero entries, set zz = yy and return immediately. */
685   if(!a->nz) {
686     PetscInt i;
687     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
688     for (i=0; i<n; i++) {
689       z[i] = y[i];
690     }
691     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
692     PetscFunctionReturn(0);
693   }
694 
695   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
696   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
697 
698   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
699    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
700    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
701   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
702   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
703     MatSeqAIJMKL_create_mkl_handle(A);
704   }
705 
706   /* Call MKL sparse BLAS routine to do the MatMult. */
707   if (zz == yy) {
708     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
709      * with alpha and beta both set to 1.0. */
710     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
711     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
712   } else {
713     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
714      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
715     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
716     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
717     for (i=0; i<n; i++) {
718       z[i] += y[i];
719     }
720   }
721 
722   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
723   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
724   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
725   PetscFunctionReturn(0);
726 }
727 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
728 
729 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
730 /* Note that this code currently doesn't actually get used when MatMatMult() is called with MAT_REUSE_MATRIX, because
731  * the MatMatMult() interface code calls MatMatMultNumeric() in this case.
732  * For releases of MKL prior to version 18, update 2:
733  * MKL has no notion of separately callable symbolic vs. numeric phases of sparse matrix-matrix multiply, so in the
734  * MAT_REUSE_MATRIX case, the SeqAIJ routines end up being used. Even though this means that the (hopefully more
735  * optimized) MKL routines do not get used, this probably is best because the MKL routines would waste time re-computing
736  * the symbolic portion, whereas the native PETSc SeqAIJ routines will avoid this. */
737 PetscErrorCode MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
738 {
739   Mat_SeqAIJMKL    *a, *b;
740   sparse_matrix_t  csrA, csrB, csrC;
741   PetscErrorCode   ierr;
742   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
743   PetscObjectState state;
744 
745   PetscFunctionBegin;
746   a = (Mat_SeqAIJMKL*)A->spptr;
747   b = (Mat_SeqAIJMKL*)B->spptr;
748   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
749   if (!a->sparse_optimized || a->state != state) {
750     MatSeqAIJMKL_create_mkl_handle(A);
751   }
752   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
753   if (!b->sparse_optimized || b->state != state) {
754     MatSeqAIJMKL_create_mkl_handle(B);
755   }
756   csrA = a->csrA;
757   csrB = b->csrA;
758 
759   stat = mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE,csrA,csrB,&csrC);
760   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
761 
762   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
763 
764   PetscFunctionReturn(0);
765 }
766 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
767 
768 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
769 PetscErrorCode MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,Mat C)
770 {
771   Mat_SeqAIJMKL       *a, *b, *c;
772   sparse_matrix_t     csrA, csrB, csrC;
773   PetscErrorCode      ierr;
774   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
775   struct matrix_descr descr_type_gen;
776   PetscObjectState    state;
777 
778   PetscFunctionBegin;
779   a = (Mat_SeqAIJMKL*)A->spptr;
780   b = (Mat_SeqAIJMKL*)B->spptr;
781   c = (Mat_SeqAIJMKL*)C->spptr;
782   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
783   if (!a->sparse_optimized || a->state != state) {
784     MatSeqAIJMKL_create_mkl_handle(A);
785   }
786   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
787   if (!b->sparse_optimized || b->state != state) {
788     MatSeqAIJMKL_create_mkl_handle(B);
789   }
790   csrA = a->csrA;
791   csrB = b->csrA;
792   csrC = c->csrA;
793   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
794 
795   stat = mkl_sparse_sp2m(SPARSE_OPERATION_NON_TRANSPOSE,descr_type_gen,csrA,
796                          SPARSE_OPERATION_NON_TRANSPOSE,descr_type_gen,csrB,
797                          SPARSE_STAGE_FINALIZE_MULT,&csrC);
798 
799   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete numerical stage of sparse matrix-matrix multiply");
800 
801   /* Have to update the PETSc AIJ representation for matrix C from contents of MKL handle. */
802   ierr = MatSeqAIJMKL_update_from_mkl_handle(C);CHKERRQ(ierr);
803 
804   PetscFunctionReturn(0);
805 }
806 #endif /* PETSC_HAVE_MKL_SPARSE_SP2M */
807 
808 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
809 PetscErrorCode MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
810 {
811   Mat_SeqAIJMKL    *a, *b;
812   sparse_matrix_t  csrA, csrB, csrC;
813   PetscErrorCode   ierr;
814   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
815   PetscObjectState state;
816 
817   PetscFunctionBegin;
818   a = (Mat_SeqAIJMKL*)A->spptr;
819   b = (Mat_SeqAIJMKL*)B->spptr;
820   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
821   if (!a->sparse_optimized || a->state != state) {
822     MatSeqAIJMKL_create_mkl_handle(A);
823   }
824   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
825   if (!b->sparse_optimized || b->state != state) {
826     MatSeqAIJMKL_create_mkl_handle(B);
827   }
828   csrA = a->csrA;
829   csrB = b->csrA;
830 
831   stat = mkl_sparse_spmm(SPARSE_OPERATION_TRANSPOSE,csrA,csrB,&csrC);
832   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
833 
834   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
835 
836   PetscFunctionReturn(0);
837 }
838 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
839 
840 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
841 PetscErrorCode MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat P,Mat C)
842 {
843   Mat_SeqAIJMKL       *a, *p, *c;
844   sparse_matrix_t     csrA, csrP, csrC;
845   PetscBool           set, flag;
846   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
847   struct matrix_descr descr_type_sym;
848   PetscObjectState    state;
849   PetscErrorCode      ierr;
850 
851   PetscFunctionBegin;
852   ierr = MatIsSymmetricKnown(A,&set,&flag);
853   if (!set || (set && !flag)) {
854     ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,C);CHKERRQ(ierr);
855     PetscFunctionReturn(0);
856   }
857 
858   a = (Mat_SeqAIJMKL*)A->spptr;
859   p = (Mat_SeqAIJMKL*)P->spptr;
860   c = (Mat_SeqAIJMKL*)C->spptr;
861   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
862   if (!a->sparse_optimized || a->state != state) {
863     MatSeqAIJMKL_create_mkl_handle(A);
864   }
865   ierr = PetscObjectStateGet((PetscObject)P,&state);CHKERRQ(ierr);
866   if (!p->sparse_optimized || p->state != state) {
867     MatSeqAIJMKL_create_mkl_handle(P);
868   }
869   csrA = a->csrA;
870   csrP = p->csrA;
871   csrC = c->csrA;
872   descr_type_sym.type = SPARSE_MATRIX_TYPE_SYMMETRIC;
873   descr_type_sym.mode = SPARSE_FILL_MODE_LOWER;
874   descr_type_sym.diag = SPARSE_DIAG_NON_UNIT;
875 
876   /* Note that the call below won't work for complex matrices. (We protect this when pointers are assigned in MatConvert.) */
877   stat = mkl_sparse_sypr(SPARSE_OPERATION_TRANSPOSE,csrP,csrA,descr_type_sym,&csrC,SPARSE_STAGE_FINALIZE_MULT);
878   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to finalize mkl_sparse_sypr");
879 
880   /* Have to update the PETSc AIJ representation for matrix C from contents of MKL handle. */
881   ierr = MatSeqAIJMKL_update_from_mkl_handle(C);CHKERRQ(ierr);
882 
883   PetscFunctionReturn(0);
884 }
885 #endif
886 
887 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
888 PetscErrorCode MatPtAP_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
889 {
890   Mat_SeqAIJMKL       *a, *p;
891   sparse_matrix_t     csrA, csrP, csrC;
892   PetscBool           set, flag;
893   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
894   struct matrix_descr descr_type_sym;
895   PetscObjectState    state;
896   PetscErrorCode      ierr;
897 
898   PetscFunctionBegin;
899   ierr = MatIsSymmetricKnown(A,&set,&flag);
900   if (!set || (set && !flag)) {
901     ierr = MatPtAP_SeqAIJ_SeqAIJ(A,P,scall,fill,C);CHKERRQ(ierr);
902     PetscFunctionReturn(0);
903   }
904 
905   if (scall == MAT_REUSE_MATRIX) {
906     ierr = MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(A,P,*C);CHKERRQ(ierr);
907     PetscFunctionReturn(0);
908   }
909 
910   a = (Mat_SeqAIJMKL*)A->spptr;
911   p = (Mat_SeqAIJMKL*)P->spptr;
912   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
913   if (!a->sparse_optimized || a->state != state) {
914     MatSeqAIJMKL_create_mkl_handle(A);
915   }
916   ierr = PetscObjectStateGet((PetscObject)P,&state);CHKERRQ(ierr);
917   if (!p->sparse_optimized || p->state != state) {
918     MatSeqAIJMKL_create_mkl_handle(P);
919   }
920   csrA = a->csrA;
921   csrP = p->csrA;
922   descr_type_sym.type = SPARSE_MATRIX_TYPE_SYMMETRIC;
923   descr_type_sym.mode = SPARSE_FILL_MODE_LOWER;
924   descr_type_sym.diag = SPARSE_DIAG_NON_UNIT;
925 
926   /* Note that the call below won't work for complex matrices. (We protect this when pointers are assigned in MatConvert.) */
927   stat = mkl_sparse_sypr(SPARSE_OPERATION_TRANSPOSE,csrP,csrA,descr_type_sym,&csrC,SPARSE_STAGE_FULL_MULT);
928   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete full mkl_sparse_sypr");
929 
930   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
931   ierr = MatSetOption(*C,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
932 
933   PetscFunctionReturn(0);
934 }
935 #endif
936 
937 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
938  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
939  * routine, but can also be used to convert an assembled SeqAIJ matrix
940  * into a SeqAIJMKL one. */
941 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
942 {
943   PetscErrorCode ierr;
944   Mat            B = *newmat;
945   Mat_SeqAIJMKL  *aijmkl;
946   PetscBool      set;
947   PetscBool      sametype;
948 
949   PetscFunctionBegin;
950   if (reuse == MAT_INITIAL_MATRIX) {
951     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
952   }
953 
954   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
955   if (sametype) PetscFunctionReturn(0);
956 
957   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
958   B->spptr = (void*) aijmkl;
959 
960   /* Set function pointers for methods that we inherit from AIJ but override.
961    * We also parse some command line options below, since those determine some of the methods we point to. */
962   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
963   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
964   B->ops->destroy          = MatDestroy_SeqAIJMKL;
965 
966   aijmkl->sparse_optimized = PETSC_FALSE;
967 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
968   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
969 #else
970   aijmkl->no_SpMV2 = PETSC_TRUE;
971 #endif
972   aijmkl->eager_inspection = PETSC_FALSE;
973 
974   /* Parse command line options. */
975   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
976   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
977   ierr = PetscOptionsBool("-mat_aijmkl_eager_inspection","Eager Inspection","None",(PetscBool)aijmkl->eager_inspection,(PetscBool*)&aijmkl->eager_inspection,&set);CHKERRQ(ierr);
978   ierr = PetscOptionsEnd();CHKERRQ(ierr);
979 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
980   if(!aijmkl->no_SpMV2) {
981     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
982     aijmkl->no_SpMV2 = PETSC_TRUE;
983   }
984 #endif
985 
986 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
987   B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
988   B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
989   B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
990   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
991   B->ops->matmult          = MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
992 # ifdef PETSC_HAVE_MKL_SPARSE_SP2M
993   B->ops->matmultnumeric   = MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2;
994 #   ifndef PETSC_USE_COMPLEX
995   B->ops->ptap             = MatPtAP_SeqAIJMKL_SeqAIJMKL_SpMV2;
996   B->ops->ptapnumeric      = MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2;
997 #   endif
998 # endif
999   B->ops->transposematmult = MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
1000 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
1001 
1002 #ifndef PETSC_HAVE_MKL_SPARSE_SP2M
1003   /* In the same release in which MKL introduced mkl_sparse_sp2m() (version 18, update 2), the old sparse BLAS interfaces were
1004    * marked as deprecated. If "no_SpMV2" has been specified by the user and MKL 18u2 or later is being used, we use the new
1005    * _SpMV2 routines (set above), but do not call mkl_sparse_optimize(), which results in the old numerical kernels (without the
1006    * inspector-executor model) being used. For versions in which the older interface has not been deprecated, we use the old
1007    * interface. */
1008   if (aijmkl->no_SpMV2) {
1009     B->ops->mult             = MatMult_SeqAIJMKL;
1010     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
1011     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
1012     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
1013   }
1014 #endif
1015 
1016   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
1017   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
1018   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
1019   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
1020   if(!aijmkl->no_SpMV2) {
1021 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
1022     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1023 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
1024     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqaijmkl_seqaijmkl_C",MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1025 #endif
1026     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1027 #endif
1028   }
1029 
1030   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
1031   *newmat = B;
1032   PetscFunctionReturn(0);
1033 }
1034 
1035 /*@C
1036    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
1037    This type inherits from AIJ and is largely identical, but uses sparse BLAS
1038    routines from Intel MKL whenever possible.
1039    If the installed version of MKL supports the "SpMV2" sparse
1040    inspector-executor routines, then those are used by default.
1041    MatMult, MatMultAdd, MatMultTranspose, MatMultTransposeAdd, MatMatMult, MatTransposeMatMult, and MatPtAP (for
1042    symmetric A) operations are currently supported.
1043    Note that MKL version 18, update 2 or later is required for MatPtAP/MatPtAPNumeric and MatMatMultNumeric.
1044 
1045    Collective on MPI_Comm
1046 
1047    Input Parameters:
1048 +  comm - MPI communicator, set to PETSC_COMM_SELF
1049 .  m - number of rows
1050 .  n - number of columns
1051 .  nz - number of nonzeros per row (same for all rows)
1052 -  nnz - array containing the number of nonzeros in the various rows
1053          (possibly different for each row) or NULL
1054 
1055    Output Parameter:
1056 .  A - the matrix
1057 
1058    Options Database Keys:
1059 +  -mat_aijmkl_no_spmv2 - disable use of the SpMV2 inspector-executor routines
1060 -  -mat_aijmkl_eager_inspection - perform MKL "inspection" phase upon matrix assembly; default is to do "lazy" inspection, performing this step the first time the matrix is applied
1061 
1062    Notes:
1063    If nnz is given then nz is ignored
1064 
1065    Level: intermediate
1066 
1067 .keywords: matrix, MKL, sparse, parallel
1068 
1069 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
1070 @*/
1071 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1072 {
1073   PetscErrorCode ierr;
1074 
1075   PetscFunctionBegin;
1076   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1077   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1078   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
1079   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
1080   PetscFunctionReturn(0);
1081 }
1082 
1083 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
1084 {
1085   PetscErrorCode ierr;
1086 
1087   PetscFunctionBegin;
1088   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
1089   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
1090   PetscFunctionReturn(0);
1091 }
1092