xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision 50a5026b66e4ae6d6116d3e4dbeba8d3ca3a2c8d)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool eager_inspection; /* If PETSC_TRUE, then call mkl_sparse_optimize() in MatDuplicate()/MatAssemblyEnd(). */
18   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
19   PetscObjectState state;
20 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
21   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
22   struct matrix_descr descr;
23 #endif
24 } Mat_SeqAIJMKL;
25 
26 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
27 
28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
29 {
30   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
31   /* so we will ignore 'MatType type'. */
32   PetscErrorCode ierr;
33   Mat            B       = *newmat;
34 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
35   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
36 #endif
37 
38   PetscFunctionBegin;
39   if (reuse == MAT_INITIAL_MATRIX) {
40     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
41   }
42 
43   /* Reset the original function pointers. */
44   B->ops->duplicate        = MatDuplicate_SeqAIJ;
45   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
46   B->ops->destroy          = MatDestroy_SeqAIJ;
47   B->ops->mult             = MatMult_SeqAIJ;
48   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
49   B->ops->multadd          = MatMultAdd_SeqAIJ;
50   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
51   B->ops->matmult          = MatMatMult_SeqAIJ_SeqAIJ;
52   B->ops->matmultnumeric   = MatMatMultNumeric_SeqAIJ_SeqAIJ;
53   B->ops->ptap             = MatPtAP_SeqAIJ_SeqAIJ;
54   B->ops->ptapnumeric      = MatPtAPNumeric_SeqAIJ_SeqAIJ;
55   B->ops->transposematmult = MatTransposeMatMult_SeqAIJ_SeqAIJ;
56 
57   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
58   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
59   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
60   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
61 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
62   if(!aijmkl->no_SpMV2) {
63     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
64 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
65     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
66 #endif
67     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
68   }
69 
70   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
71    * simply involves destroying the MKL sparse matrix handle and then freeing
72    * the spptr pointer. */
73   if (reuse == MAT_INITIAL_MATRIX) aijmkl = (Mat_SeqAIJMKL*)B->spptr;
74 
75   if (aijmkl->sparse_optimized) {
76     sparse_status_t stat;
77     stat = mkl_sparse_destroy(aijmkl->csrA);
78     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set hints/complete mkl_sparse_optimize");
79   }
80 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
81   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
82 
83   /* Change the type of B to MATSEQAIJ. */
84   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
85 
86   *newmat = B;
87   PetscFunctionReturn(0);
88 }
89 
90 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
91 {
92   PetscErrorCode ierr;
93   Mat_SeqAIJMKL  *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
94 
95   PetscFunctionBegin;
96 
97   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
98    * spptr pointer. */
99   if (aijmkl) {
100     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
101 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
102     if (aijmkl->sparse_optimized) {
103       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
104       stat = mkl_sparse_destroy(aijmkl->csrA);
105       if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
106     }
107 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
108     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
109   }
110 
111   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
112    * to destroy everything that remains. */
113   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
114   /* Note that I don't call MatSetType().  I believe this is because that
115    * is only to be called when *building* a matrix.  I could be wrong, but
116    * that is how things work for the SuperLU matrix class. */
117   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
118   PetscFunctionReturn(0);
119 }
120 
121 /* MatSeqAIJKL_create_mkl_handle(), if called with an AIJMKL matrix that has not had mkl_sparse_optimize() called for it,
122  * creates an MKL sparse matrix handle from the AIJ arrays and calls mkl_sparse_optimize().
123  * If called with an AIJMKL matrix for which aijmkl->sparse_optimized == PETSC_TRUE, then it destroys the old matrix
124  * handle, creates a new one, and then calls mkl_sparse_optimize().
125  * Although in normal MKL usage it is possible to have a valid matrix handle on which mkl_sparse_optimize() has not been
126  * called, for AIJMKL the handle creation and optimization step always occur together, so we don't handle the case of
127  * an unoptimized matrix handle here. */
128 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
129 {
130 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
131   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
132    * does nothing. We make it callable anyway in this case because it cuts
133    * down on littering the code with #ifdefs. */
134   PetscFunctionBegin;
135   PetscFunctionReturn(0);
136 #else
137   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
138   Mat_SeqAIJMKL    *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
139   PetscInt         m,n;
140   MatScalar        *aa;
141   PetscInt         *aj,*ai;
142   sparse_status_t  stat;
143   PetscErrorCode   ierr;
144 
145   PetscFunctionBegin;
146   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
147 
148   if (aijmkl->sparse_optimized) {
149     /* Matrix has been previously assembled and optimized. Must destroy old
150      * matrix handle before running the optimization step again. */
151     stat = mkl_sparse_destroy(aijmkl->csrA);
152     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
153   }
154   aijmkl->sparse_optimized = PETSC_FALSE;
155 
156   /* Now perform the SpMV2 setup and matrix optimization. */
157   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
158   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
159   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
160   m = A->rmap->n;
161   n = A->cmap->n;
162   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
163   aa   = a->a;  /* Nonzero elements stored row-by-row. */
164   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
165   if ((a->nz!=0) & !(A->structure_only)) {
166     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
167      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
168     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
169     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle");
170     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
171     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set mv_hint");
172     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
173     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set memory_hint");
174     if (!aijmkl->no_SpMV2) {
175       stat = mkl_sparse_optimize(aijmkl->csrA);
176       if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
177     }
178     aijmkl->sparse_optimized = PETSC_TRUE;
179     ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
180   }
181 
182   PetscFunctionReturn(0);
183 #endif
184 }
185 
186 /* MatSeqAIJMKL_create_from_mkl_handle() creates a sequential AIJMKL matrix from an MKL sparse matrix handle.
187  * We need this to implement MatMatMult() using the MKL inspector-executor routines, which return an (unoptimized)
188  * matrix handle.
189  * Note: This routine simply destroys and replaces the original matrix if MAT_REUSE_MATRIX has been specified, as
190  * there is no good alternative. */
191 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
192 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_from_mkl_handle(MPI_Comm comm,sparse_matrix_t csrA,MatReuse reuse,Mat *mat)
193 {
194   PetscErrorCode      ierr;
195   sparse_status_t     stat;
196   sparse_index_base_t indexing;
197   PetscInt            nrows, ncols;
198   PetscInt            *aj,*ai,*dummy;
199   MatScalar           *aa;
200   Mat                 A;
201   Mat_SeqAIJMKL       *aijmkl;
202 
203   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
204   stat = mkl_sparse_x_export_csr(csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
205   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
206 
207   if (reuse == MAT_REUSE_MATRIX) {
208     ierr = MatDestroy(mat);CHKERRQ(ierr);
209   }
210   ierr = MatCreate(comm,&A);CHKERRQ(ierr);
211   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
212   ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,nrows,ncols);CHKERRQ(ierr);
213   /* We use MatSeqAIJSetPreallocationCSR() instead of MatCreateSeqAIJWithArrays() because we must copy the arrays exported
214    * from MKL; MKL developers tell us that modifying the arrays may cause unexpected results when using the MKL handle, and
215    * they will be destroyed when the MKL handle is destroyed.
216    * (In the interest of reducing memory consumption in future, can we figure out good ways to deal with this?) */
217   ierr = MatSeqAIJSetPreallocationCSR(A,ai,aj,aa);CHKERRQ(ierr);
218 
219   /* We now have an assembled sequential AIJ matrix created from copies of the exported arrays from the MKL matrix handle.
220    * Now turn it into a MATSEQAIJMKL. */
221   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
222 
223   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
224   aijmkl->csrA = csrA;
225 
226   /* The below code duplicates much of what is in MatSeqAIJKL_create_mkl_handle(). I dislike this code duplication, but
227    * MatSeqAIJMKL_create_mkl_handle() cannot be used because we don't need to create a handle -- we've already got one,
228    * and just need to be able to run the MKL optimization step. */
229   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
230   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
231   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
232   stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
233   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set mv_hint");
234   stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
235   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set memory_hint");
236   if (!aijmkl->no_SpMV2) {
237     stat = mkl_sparse_optimize(aijmkl->csrA);
238     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
239   }
240   aijmkl->sparse_optimized = PETSC_TRUE;
241   ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
242 
243   *mat = A;
244   PetscFunctionReturn(0);
245 }
246 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
247 
248 /* MatSeqAIJMKL_update_from_mkl_handle() updates the matrix values array from the contents of the associated MKL sparse matrix handle.
249  * This is needed after mkl_sparse_sp2m() with SPARSE_STAGE_FINALIZE_MULT has been used to compute new values of the matrix in
250  * MatMatMultNumeric(). */
251 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
252 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_update_from_mkl_handle(Mat A)
253 {
254   PetscInt            i;
255   PetscInt            nrows,ncols;
256   PetscInt            nz;
257   PetscInt            *ai,*aj,*dummy;
258   PetscScalar         *aa;
259   PetscErrorCode      ierr;
260   Mat_SeqAIJMKL       *aijmkl;
261   sparse_status_t     stat;
262   sparse_index_base_t indexing;
263 
264   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
265 
266   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
267   stat = mkl_sparse_x_export_csr(aijmkl->csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
268   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
269 
270   /* We can't just do a copy from the arrays exported by MKL to those used for the PETSc AIJ storage, because the MKL and PETSc
271    * representations differ in small ways (e.g., more explicit nonzeros per row due to preallocation). */
272   for (i=0; i<nrows; i++) {
273     nz = ai[i+1] - ai[i];
274     ierr = MatSetValues_SeqAIJ(A, 1, &i, nz, aj+ai[i], aa+ai[i], INSERT_VALUES);CHKERRQ(ierr);
275   }
276 
277   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
278   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
279 
280   ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
281   /* We mark our matrix as having a valid, optimized MKL handle.
282    * TODO: It is valid, but I am not sure if it is optimized. Need to ask MKL developers. */
283   aijmkl->sparse_optimized = PETSC_TRUE;
284 
285   PetscFunctionReturn(0);
286 }
287 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
288 
289 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
290 {
291   PetscErrorCode ierr;
292   Mat_SeqAIJMKL  *aijmkl;
293   Mat_SeqAIJMKL  *aijmkl_dest;
294 
295   PetscFunctionBegin;
296   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
297   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
298   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
299   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
300   aijmkl_dest->sparse_optimized = PETSC_FALSE;
301   if (aijmkl->eager_inspection) {
302     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
303   }
304   PetscFunctionReturn(0);
305 }
306 
307 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
308 {
309   PetscErrorCode  ierr;
310   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
311   Mat_SeqAIJMKL   *aijmkl;
312 
313   PetscFunctionBegin;
314   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
315 
316   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
317    * extra information and some different methods, call the AssemblyEnd
318    * routine for a MATSEQAIJ.
319    * I'm not sure if this is the best way to do this, but it avoids
320    * a lot of code duplication. */
321   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
322   ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
323 
324   /* If the user has requested "eager" inspection, create the optimized MKL sparse handle (if needed; the function checks).
325    * (The default is to do "lazy" inspection, deferring this until something like MatMult() is called.) */
326   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
327   if (aijmkl->eager_inspection) {
328     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
329   }
330 
331   PetscFunctionReturn(0);
332 }
333 
334 #ifndef PETSC_HAVE_MKL_SPARSE_SP2M
335 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
336 {
337   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
338   const PetscScalar *x;
339   PetscScalar       *y;
340   const MatScalar   *aa;
341   PetscErrorCode    ierr;
342   PetscInt          m=A->rmap->n;
343   PetscInt          n=A->cmap->n;
344   PetscScalar       alpha = 1.0;
345   PetscScalar       beta = 0.0;
346   const PetscInt    *aj,*ai;
347   char              matdescra[6];
348 
349 
350   /* Variables not in MatMult_SeqAIJ. */
351   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
352 
353   PetscFunctionBegin;
354   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
355   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
356   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
357   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
358   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
359   aa   = a->a;  /* Nonzero elements stored row-by-row. */
360   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
361 
362   /* Call MKL sparse BLAS routine to do the MatMult. */
363   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
364 
365   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
366   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
367   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
368   PetscFunctionReturn(0);
369 }
370 #endif
371 
372 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
373 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
374 {
375   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
376   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
377   const PetscScalar *x;
378   PetscScalar       *y;
379   PetscErrorCode    ierr;
380   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
381   PetscObjectState  state;
382 
383   PetscFunctionBegin;
384 
385   /* If there are no nonzero entries, zero yy and return immediately. */
386   if(!a->nz) {
387     PetscInt i;
388     PetscInt m=A->rmap->n;
389     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
390     for (i=0; i<m; i++) {
391       y[i] = 0.0;
392     }
393     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
394     PetscFunctionReturn(0);
395   }
396 
397   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
398   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
399 
400   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
401    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
402    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
403   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
404   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
405     MatSeqAIJMKL_create_mkl_handle(A);
406   }
407 
408   /* Call MKL SpMV2 executor routine to do the MatMult. */
409   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
410   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
411 
412   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
413   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
414   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
415   PetscFunctionReturn(0);
416 }
417 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
418 
419 #ifndef PETSC_HAVE_MKL_SPARSE_SP2M
420 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
421 {
422   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
423   const PetscScalar *x;
424   PetscScalar       *y;
425   const MatScalar   *aa;
426   PetscErrorCode    ierr;
427   PetscInt          m=A->rmap->n;
428   PetscInt          n=A->cmap->n;
429   PetscScalar       alpha = 1.0;
430   PetscScalar       beta = 0.0;
431   const PetscInt    *aj,*ai;
432   char              matdescra[6];
433 
434   /* Variables not in MatMultTranspose_SeqAIJ. */
435   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
436 
437   PetscFunctionBegin;
438   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
439   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
440   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
441   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
442   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
443   aa   = a->a;  /* Nonzero elements stored row-by-row. */
444   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
445 
446   /* Call MKL sparse BLAS routine to do the MatMult. */
447   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
448 
449   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
450   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
451   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
452   PetscFunctionReturn(0);
453 }
454 #endif
455 
456 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
457 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
458 {
459   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
460   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
461   const PetscScalar *x;
462   PetscScalar       *y;
463   PetscErrorCode    ierr;
464   sparse_status_t   stat;
465   PetscObjectState  state;
466 
467   PetscFunctionBegin;
468 
469   /* If there are no nonzero entries, zero yy and return immediately. */
470   if(!a->nz) {
471     PetscInt i;
472     PetscInt n=A->cmap->n;
473     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
474     for (i=0; i<n; i++) {
475       y[i] = 0.0;
476     }
477     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
478     PetscFunctionReturn(0);
479   }
480 
481   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
482   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
483 
484   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
485    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
486    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
487   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
488   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
489     MatSeqAIJMKL_create_mkl_handle(A);
490   }
491 
492   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
493   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
494   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
495 
496   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
497   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
498   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
499   PetscFunctionReturn(0);
500 }
501 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
502 
503 #ifndef PETSC_HAVE_MKL_SPARSE_SP2M
504 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
505 {
506   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
507   const PetscScalar *x;
508   PetscScalar       *y,*z;
509   const MatScalar   *aa;
510   PetscErrorCode    ierr;
511   PetscInt          m=A->rmap->n;
512   PetscInt          n=A->cmap->n;
513   const PetscInt    *aj,*ai;
514   PetscInt          i;
515 
516   /* Variables not in MatMultAdd_SeqAIJ. */
517   char              transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
518   PetscScalar       alpha = 1.0;
519   PetscScalar       beta;
520   char              matdescra[6];
521 
522   PetscFunctionBegin;
523   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
524   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
525 
526   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
527   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
528   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
529   aa   = a->a;  /* Nonzero elements stored row-by-row. */
530   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
531 
532   /* Call MKL sparse BLAS routine to do the MatMult. */
533   if (zz == yy) {
534     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
535     beta = 1.0;
536     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
537   } else {
538     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
539      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
540     beta = 0.0;
541     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
542     for (i=0; i<m; i++) {
543       z[i] += y[i];
544     }
545   }
546 
547   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
548   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
549   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
550   PetscFunctionReturn(0);
551 }
552 #endif
553 
554 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
555 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
556 {
557   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
558   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
559   const PetscScalar *x;
560   PetscScalar       *y,*z;
561   PetscErrorCode    ierr;
562   PetscInt          m=A->rmap->n;
563   PetscInt          i;
564 
565   /* Variables not in MatMultAdd_SeqAIJ. */
566   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
567   PetscObjectState  state;
568 
569   PetscFunctionBegin;
570 
571   /* If there are no nonzero entries, set zz = yy and return immediately. */
572   if(!a->nz) {
573     PetscInt i;
574     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
575     for (i=0; i<m; i++) {
576       z[i] = y[i];
577     }
578     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
579     PetscFunctionReturn(0);
580   }
581 
582   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
583   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
584 
585   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
586    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
587    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
588   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
589   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
590     MatSeqAIJMKL_create_mkl_handle(A);
591   }
592 
593   /* Call MKL sparse BLAS routine to do the MatMult. */
594   if (zz == yy) {
595     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
596      * with alpha and beta both set to 1.0. */
597     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
598     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
599   } else {
600     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
601      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
602     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
603     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
604     for (i=0; i<m; i++) {
605       z[i] += y[i];
606     }
607   }
608 
609   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
610   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
611   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
612   PetscFunctionReturn(0);
613 }
614 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
615 
616 #ifndef PETSC_HAVE_MKL_SPARSE_SP2M
617 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
618 {
619   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
620   const PetscScalar *x;
621   PetscScalar       *y,*z;
622   const MatScalar   *aa;
623   PetscErrorCode    ierr;
624   PetscInt          m=A->rmap->n;
625   PetscInt          n=A->cmap->n;
626   const PetscInt    *aj,*ai;
627   PetscInt          i;
628 
629   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
630   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
631   PetscScalar       alpha = 1.0;
632   PetscScalar       beta;
633   char              matdescra[6];
634 
635   PetscFunctionBegin;
636   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
637   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
638 
639   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
640   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
641   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
642   aa   = a->a;  /* Nonzero elements stored row-by-row. */
643   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
644 
645   /* Call MKL sparse BLAS routine to do the MatMult. */
646   if (zz == yy) {
647     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
648     beta = 1.0;
649     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
650   } else {
651     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
652      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
653     beta = 0.0;
654     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
655     for (i=0; i<n; i++) {
656       z[i] += y[i];
657     }
658   }
659 
660   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
661   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
662   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
663   PetscFunctionReturn(0);
664 }
665 #endif
666 
667 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
668 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
669 {
670   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
671   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
672   const PetscScalar *x;
673   PetscScalar       *y,*z;
674   PetscErrorCode    ierr;
675   PetscInt          n=A->cmap->n;
676   PetscInt          i;
677   PetscObjectState  state;
678 
679   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
680   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
681 
682   PetscFunctionBegin;
683 
684   /* If there are no nonzero entries, set zz = yy and return immediately. */
685   if(!a->nz) {
686     PetscInt i;
687     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
688     for (i=0; i<n; i++) {
689       z[i] = y[i];
690     }
691     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
692     PetscFunctionReturn(0);
693   }
694 
695   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
696   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
697 
698   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
699    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
700    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
701   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
702   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
703     MatSeqAIJMKL_create_mkl_handle(A);
704   }
705 
706   /* Call MKL sparse BLAS routine to do the MatMult. */
707   if (zz == yy) {
708     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
709      * with alpha and beta both set to 1.0. */
710     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
711     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
712   } else {
713     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
714      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
715     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
716     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
717     for (i=0; i<n; i++) {
718       z[i] += y[i];
719     }
720   }
721 
722   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
723   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
724   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
725   PetscFunctionReturn(0);
726 }
727 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
728 
729 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
730 /* Note that this code currently doesn't actually get used when MatMatMult() is called with MAT_REUSE_MATRIX, because
731  * the MatMatMult() interface code calls MatMatMultNumeric() in this case.
732  * For releases of MKL prior to version 18, update 2:
733  * MKL has no notion of separately callable symbolic vs. numeric phases of sparse matrix-matrix multiply, so in the
734  * MAT_REUSE_MATRIX case, the SeqAIJ routines end up being used. Even though this means that the (hopefully more
735  * optimized) MKL routines do not get used, this probably is best because the MKL routines would waste time re-computing
736  * the symbolic portion, whereas the native PETSc SeqAIJ routines will avoid this. */
737 PetscErrorCode MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
738 {
739   Mat_SeqAIJMKL    *a, *b;
740   sparse_matrix_t  csrA, csrB, csrC;
741   PetscErrorCode   ierr;
742   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
743   PetscObjectState state;
744 
745   PetscFunctionBegin;
746   a = (Mat_SeqAIJMKL*)A->spptr;
747   b = (Mat_SeqAIJMKL*)B->spptr;
748   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
749   if (!a->sparse_optimized || a->state != state) {
750     MatSeqAIJMKL_create_mkl_handle(A);
751   }
752   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
753   if (!b->sparse_optimized || b->state != state) {
754     MatSeqAIJMKL_create_mkl_handle(B);
755   }
756   csrA = a->csrA;
757   csrB = b->csrA;
758 
759   stat = mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE,csrA,csrB,&csrC);
760   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
761 
762   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
763 
764   PetscFunctionReturn(0);
765 }
766 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
767 
768 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
769 PetscErrorCode MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,Mat C)
770 {
771   Mat_SeqAIJMKL       *a, *b, *c;
772   sparse_matrix_t     csrA, csrB, csrC;
773   PetscErrorCode      ierr;
774   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
775   struct matrix_descr descr_type_gen;
776   PetscObjectState    state;
777 
778   PetscFunctionBegin;
779   a = (Mat_SeqAIJMKL*)A->spptr;
780   b = (Mat_SeqAIJMKL*)B->spptr;
781   c = (Mat_SeqAIJMKL*)C->spptr;
782   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
783   if (!a->sparse_optimized || a->state != state) {
784     MatSeqAIJMKL_create_mkl_handle(A);
785   }
786   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
787   if (!b->sparse_optimized || b->state != state) {
788     MatSeqAIJMKL_create_mkl_handle(B);
789   }
790   csrA = a->csrA;
791   csrB = b->csrA;
792   csrC = c->csrA;
793   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
794 
795   stat = mkl_sparse_sp2m(SPARSE_OPERATION_NON_TRANSPOSE,descr_type_gen,csrA,
796                          SPARSE_OPERATION_NON_TRANSPOSE,descr_type_gen,csrB,
797                          SPARSE_STAGE_FINALIZE_MULT,&csrC);
798 
799   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete numerical stage of sparse matrix-matrix multiply");
800 
801   /* Have to update the PETSc AIJ representation for matrix C from contents of MKL handle. */
802   ierr = MatSeqAIJMKL_update_from_mkl_handle(C);CHKERRQ(ierr);
803 
804   PetscFunctionReturn(0);
805 }
806 #endif /* PETSC_HAVE_MKL_SPARSE_SP2M */
807 
808 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
809 PetscErrorCode MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
810 {
811   Mat_SeqAIJMKL    *a, *b;
812   sparse_matrix_t  csrA, csrB, csrC;
813   PetscErrorCode   ierr;
814   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
815   PetscObjectState state;
816 
817   PetscFunctionBegin;
818   a = (Mat_SeqAIJMKL*)A->spptr;
819   b = (Mat_SeqAIJMKL*)B->spptr;
820   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
821   if (!a->sparse_optimized || a->state != state) {
822     MatSeqAIJMKL_create_mkl_handle(A);
823   }
824   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
825   if (!b->sparse_optimized || b->state != state) {
826     MatSeqAIJMKL_create_mkl_handle(B);
827   }
828   csrA = a->csrA;
829   csrB = b->csrA;
830 
831   stat = mkl_sparse_spmm(SPARSE_OPERATION_TRANSPOSE,csrA,csrB,&csrC);
832   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
833 
834   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
835 
836   PetscFunctionReturn(0);
837 }
838 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
839 
840 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
841 PetscErrorCode MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat P,Mat C)
842 {
843   Mat_SeqAIJMKL       *a, *p, *c;
844   sparse_matrix_t     csrA, csrP, csrC;
845   PetscBool           set, flag;
846   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
847   struct matrix_descr descr_type_gen;
848   PetscObjectState    state;
849   PetscErrorCode      ierr;
850 
851   PetscFunctionBegin;
852   ierr = MatIsSymmetricKnown(A,&set,&flag);
853   if (!set || (set && !flag)) {
854     ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,C);CHKERRQ(ierr);
855     PetscFunctionReturn(0);
856   }
857 
858   a = (Mat_SeqAIJMKL*)A->spptr;
859   p = (Mat_SeqAIJMKL*)P->spptr;
860   c = (Mat_SeqAIJMKL*)C->spptr;
861   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
862   if (!a->sparse_optimized || a->state != state) {
863     MatSeqAIJMKL_create_mkl_handle(A);
864   }
865   ierr = PetscObjectStateGet((PetscObject)P,&state);CHKERRQ(ierr);
866   if (!p->sparse_optimized || p->state != state) {
867     MatSeqAIJMKL_create_mkl_handle(P);
868   }
869   csrA = a->csrA;
870   csrP = p->csrA;
871   csrC = c->csrA;
872   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
873 
874   /* Note that the call below won't work for complex matrices. (We protect this when pointers are assigned in MatConvert.) */
875   stat = mkl_sparse_sypr(SPARSE_OPERATION_TRANSPOSE,csrP,csrA,descr_type_gen,&csrC,SPARSE_STAGE_FINALIZE_MULT);
876   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to finalize mkl_sparse_sypr");
877 
878   /* Have to update the PETSc AIJ representation for matrix C from contents of MKL handle. */
879   ierr = MatSeqAIJMKL_update_from_mkl_handle(C);CHKERRQ(ierr);
880 
881   PetscFunctionReturn(0);
882 }
883 #endif
884 
885 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
886 PetscErrorCode MatPtAP_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
887 {
888   Mat_SeqAIJMKL       *a, *p;
889   sparse_matrix_t     csrA, csrP, csrC;
890   PetscBool           set, flag;
891   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
892   struct matrix_descr descr_type_gen;
893   PetscObjectState    state;
894   PetscErrorCode      ierr;
895 
896   PetscFunctionBegin;
897   ierr = MatIsSymmetricKnown(A,&set,&flag);
898   if (!set || (set && !flag)) {
899     ierr = MatPtAP_SeqAIJ_SeqAIJ(A,P,scall,fill,C);CHKERRQ(ierr);
900     PetscFunctionReturn(0);
901   }
902 
903   if (scall == MAT_REUSE_MATRIX) {
904     ierr = MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(A,P,*C);CHKERRQ(ierr);
905     PetscFunctionReturn(0);
906   }
907 
908   a = (Mat_SeqAIJMKL*)A->spptr;
909   p = (Mat_SeqAIJMKL*)P->spptr;
910   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
911   if (!a->sparse_optimized || a->state != state) {
912     MatSeqAIJMKL_create_mkl_handle(A);
913   }
914   ierr = PetscObjectStateGet((PetscObject)P,&state);CHKERRQ(ierr);
915   if (!p->sparse_optimized || p->state != state) {
916     MatSeqAIJMKL_create_mkl_handle(P);
917   }
918   csrA = a->csrA;
919   csrP = p->csrA;
920   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
921 
922   /* Note that the call below won't work for complex matrices. (We protect this when pointers are assigned in MatConvert.) */
923   stat = mkl_sparse_sypr(SPARSE_OPERATION_TRANSPOSE,csrP,csrA,descr_type_gen,&csrC,SPARSE_STAGE_FULL_MULT);
924   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete full mkl_sparse_sypr");
925 
926   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
927   ierr = MatSetOption(*C,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
928 
929   PetscFunctionReturn(0);
930 }
931 #endif
932 
933 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
934  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
935  * routine, but can also be used to convert an assembled SeqAIJ matrix
936  * into a SeqAIJMKL one. */
937 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
938 {
939   PetscErrorCode ierr;
940   Mat            B = *newmat;
941   Mat_SeqAIJMKL  *aijmkl;
942   PetscBool      set;
943   PetscBool      sametype;
944 
945   PetscFunctionBegin;
946   if (reuse == MAT_INITIAL_MATRIX) {
947     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
948   }
949 
950   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
951   if (sametype) PetscFunctionReturn(0);
952 
953   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
954   B->spptr = (void*) aijmkl;
955 
956   /* Set function pointers for methods that we inherit from AIJ but override.
957    * We also parse some command line options below, since those determine some of the methods we point to. */
958   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
959   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
960   B->ops->destroy          = MatDestroy_SeqAIJMKL;
961 
962   aijmkl->sparse_optimized = PETSC_FALSE;
963 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
964   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
965 #else
966   aijmkl->no_SpMV2 = PETSC_TRUE;
967 #endif
968   aijmkl->eager_inspection = PETSC_FALSE;
969 
970   /* Parse command line options. */
971   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
972   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
973   ierr = PetscOptionsBool("-mat_aijmkl_eager_inspection","Eager Inspection","None",(PetscBool)aijmkl->eager_inspection,(PetscBool*)&aijmkl->eager_inspection,&set);CHKERRQ(ierr);
974   ierr = PetscOptionsEnd();CHKERRQ(ierr);
975 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
976   if(!aijmkl->no_SpMV2) {
977     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
978     aijmkl->no_SpMV2 = PETSC_TRUE;
979   }
980 #endif
981 
982 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
983   B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
984   B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
985   B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
986   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
987   B->ops->matmult          = MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
988 # ifdef PETSC_HAVE_MKL_SPARSE_SP2M
989   B->ops->matmultnumeric   = MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2;
990 #   ifndef PETSC_USE_COMPLEX
991   B->ops->ptap             = MatPtAP_SeqAIJMKL_SeqAIJMKL_SpMV2;
992   B->ops->ptapnumeric      = MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2;
993 #   endif
994 # endif
995   B->ops->transposematmult = MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
996 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
997 
998 #ifndef PETSC_HAVE_MKL_SPARSE_SP2M
999   /* In the same release in which MKL introduced mkl_sparse_sp2m() (version 18, update 2), the old sparse BLAS interfaces were
1000    * marked as deprecated. If "no_SpMV2" has been specified by the user and MKL 18u2 or later is being used, we use the new
1001    * _SpMV2 routines (set above), but do not call mkl_sparse_optimize(), which results in the old numerical kernels (without the
1002    * inspector-executor model) being used. For versions in which the older interface has not been deprecated, we use the old
1003    * interface. */
1004   if (aijmkl->no_SpMV2) {
1005     B->ops->mult             = MatMult_SeqAIJMKL;
1006     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
1007     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
1008     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
1009   }
1010 #endif
1011 
1012   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
1013   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
1014   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
1015   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
1016   if(!aijmkl->no_SpMV2) {
1017 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
1018     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1019 #ifdef PETSC_HAVE_MKL_SPARSE_SP2M
1020     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqaijmkl_seqaijmkl_C",MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1021 #endif
1022     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1023 #endif
1024   }
1025 
1026   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
1027   *newmat = B;
1028   PetscFunctionReturn(0);
1029 }
1030 
1031 /*@C
1032    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
1033    This type inherits from AIJ and is largely identical, but uses sparse BLAS
1034    routines from Intel MKL whenever possible.
1035    If the installed version of MKL supports the "SpMV2" sparse
1036    inspector-executor routines, then those are used by default.
1037    MatMult, MatMultAdd, MatMultTranspose, MatMultTransposeAdd, MatMatMult, MatTransposeMatMult, and MatPtAP (for
1038    symmetric A) operations are currently supported.
1039    Note that MKL version 18, update 2 or later is required for MatPtAP/MatPtAPNumeric and MatMatMultNumeric.
1040 
1041    Collective on MPI_Comm
1042 
1043    Input Parameters:
1044 +  comm - MPI communicator, set to PETSC_COMM_SELF
1045 .  m - number of rows
1046 .  n - number of columns
1047 .  nz - number of nonzeros per row (same for all rows)
1048 -  nnz - array containing the number of nonzeros in the various rows
1049          (possibly different for each row) or NULL
1050 
1051    Output Parameter:
1052 .  A - the matrix
1053 
1054    Options Database Keys:
1055 +  -mat_aijmkl_no_spmv2 - disable use of the SpMV2 inspector-executor routines
1056 -  -mat_aijmkl_eager_inspection - perform MKL "inspection" phase upon matrix assembly; default is to do "lazy" inspection, performing this step the first time the matrix is applied
1057 
1058    Notes:
1059    If nnz is given then nz is ignored
1060 
1061    Level: intermediate
1062 
1063 .keywords: matrix, MKL, sparse, parallel
1064 
1065 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
1066 @*/
1067 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1068 {
1069   PetscErrorCode ierr;
1070 
1071   PetscFunctionBegin;
1072   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1073   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1074   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
1075   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
1076   PetscFunctionReturn(0);
1077 }
1078 
1079 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
1080 {
1081   PetscErrorCode ierr;
1082 
1083   PetscFunctionBegin;
1084   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
1085   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
1086   PetscFunctionReturn(0);
1087 }
1088