xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision d875aeafb16f00a1010d1b59f3666f78a04e796e)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool eager_inspection; /* If PETSC_TRUE, then call mkl_sparse_optimize() in MatDuplicate()/MatAssemblyEnd(). */
18   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
19   PetscObjectState state;
20 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
21   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
22   struct matrix_descr descr;
23 #endif
24 } Mat_SeqAIJMKL;
25 
26 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
27 
28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
29 {
30   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
31   /* so we will ignore 'MatType type'. */
32   PetscErrorCode ierr;
33   Mat            B       = *newmat;
34 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
35   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
36 #endif
37 
38   PetscFunctionBegin;
39   if (reuse == MAT_INITIAL_MATRIX) {
40     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
41   }
42 
43   /* Reset the original function pointers. */
44   B->ops->duplicate        = MatDuplicate_SeqAIJ;
45   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
46   B->ops->destroy          = MatDestroy_SeqAIJ;
47   B->ops->mult             = MatMult_SeqAIJ;
48   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
49   B->ops->multadd          = MatMultAdd_SeqAIJ;
50   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
51   B->ops->matmult          = MatMatMult_SeqAIJ_SeqAIJ;
52   B->ops->matmultnumeric   = MatMatMultNumeric_SeqAIJ_SeqAIJ;
53   B->ops->ptap             = MatPtAP_SeqAIJ_SeqAIJ;
54   B->ops->ptapnumeric      = MatPtAPNumeric_SeqAIJ_SeqAIJ;
55   B->ops->transposematmult = MatTransposeMatMult_SeqAIJ_SeqAIJ;
56 
57   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
58   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
59   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
60   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
61   ierr = PetscObjectComposeFunction((PetscObject)B,"MatPtAP_is_seqaijmkl_C",NULL);CHKERRQ(ierr);
62 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
63   if(!aijmkl->no_SpMV2) {
64     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
65 #if defined(PETSC_HAVE_MKL_SPARSE_SP2M)
66     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
67 #endif
68     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
69   }
70 
71   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
72    * simply involves destroying the MKL sparse matrix handle and then freeing
73    * the spptr pointer. */
74   if (reuse == MAT_INITIAL_MATRIX) aijmkl = (Mat_SeqAIJMKL*)B->spptr;
75 
76   if (aijmkl->sparse_optimized) {
77     sparse_status_t stat;
78     stat = mkl_sparse_destroy(aijmkl->csrA);
79     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set hints/complete mkl_sparse_optimize");
80   }
81 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
82   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
83 
84   /* Change the type of B to MATSEQAIJ. */
85   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
86 
87   *newmat = B;
88   PetscFunctionReturn(0);
89 }
90 
91 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
92 {
93   PetscErrorCode ierr;
94   Mat_SeqAIJMKL  *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
95 
96   PetscFunctionBegin;
97 
98   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
99    * spptr pointer. */
100   if (aijmkl) {
101     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
102 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
103     if (aijmkl->sparse_optimized) {
104       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
105       stat = mkl_sparse_destroy(aijmkl->csrA);
106       if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
107     }
108 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
109     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
110   }
111 
112   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
113    * to destroy everything that remains. */
114   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
115   /* Note that I don't call MatSetType().  I believe this is because that
116    * is only to be called when *building* a matrix.  I could be wrong, but
117    * that is how things work for the SuperLU matrix class. */
118   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
119   PetscFunctionReturn(0);
120 }
121 
122 /* MatSeqAIJKL_create_mkl_handle(), if called with an AIJMKL matrix that has not had mkl_sparse_optimize() called for it,
123  * creates an MKL sparse matrix handle from the AIJ arrays and calls mkl_sparse_optimize().
124  * If called with an AIJMKL matrix for which aijmkl->sparse_optimized == PETSC_TRUE, then it destroys the old matrix
125  * handle, creates a new one, and then calls mkl_sparse_optimize().
126  * Although in normal MKL usage it is possible to have a valid matrix handle on which mkl_sparse_optimize() has not been
127  * called, for AIJMKL the handle creation and optimization step always occur together, so we don't handle the case of
128  * an unoptimized matrix handle here. */
129 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
130 {
131 #if !defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
132   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
133    * does nothing. We make it callable anyway in this case because it cuts
134    * down on littering the code with #ifdefs. */
135   PetscFunctionBegin;
136   PetscFunctionReturn(0);
137 #else
138   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
139   Mat_SeqAIJMKL    *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
140   PetscInt         m,n;
141   MatScalar        *aa;
142   PetscInt         *aj,*ai;
143   sparse_status_t  stat;
144   PetscErrorCode   ierr;
145 
146   PetscFunctionBegin;
147   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
148 
149   if (aijmkl->sparse_optimized) {
150     /* Matrix has been previously assembled and optimized. Must destroy old
151      * matrix handle before running the optimization step again. */
152     stat = mkl_sparse_destroy(aijmkl->csrA);
153     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_destroy");
154   }
155   aijmkl->sparse_optimized = PETSC_FALSE;
156 
157   /* Now perform the SpMV2 setup and matrix optimization. */
158   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
159   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
160   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
161   m = A->rmap->n;
162   n = A->cmap->n;
163   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
164   aa   = a->a;  /* Nonzero elements stored row-by-row. */
165   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
166   if ((a->nz!=0) & !(A->structure_only)) {
167     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
168      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
169     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
170     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle");
171     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
172     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set mv_hint");
173     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
174     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set memory_hint");
175     if (!aijmkl->no_SpMV2) {
176       stat = mkl_sparse_optimize(aijmkl->csrA);
177       if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
178     }
179     aijmkl->sparse_optimized = PETSC_TRUE;
180     ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
181   }
182 
183   PetscFunctionReturn(0);
184 #endif
185 }
186 
187 /* MatSeqAIJMKL_create_from_mkl_handle() creates a sequential AIJMKL matrix from an MKL sparse matrix handle.
188  * We need this to implement MatMatMult() using the MKL inspector-executor routines, which return an (unoptimized)
189  * matrix handle.
190  * Note: This routine simply destroys and replaces the original matrix if MAT_REUSE_MATRIX has been specified, as
191  * there is no good alternative. */
192 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
193 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_from_mkl_handle(MPI_Comm comm,sparse_matrix_t csrA,MatReuse reuse,Mat *mat)
194 {
195   PetscErrorCode      ierr;
196   sparse_status_t     stat;
197   sparse_index_base_t indexing;
198   PetscInt            nrows, ncols;
199   PetscInt            *aj,*ai,*dummy;
200   MatScalar           *aa;
201   Mat                 A;
202   Mat_SeqAIJMKL       *aijmkl;
203 
204   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
205   stat = mkl_sparse_x_export_csr(csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
206   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
207 
208   if (reuse == MAT_REUSE_MATRIX) {
209     ierr = MatDestroy(mat);CHKERRQ(ierr);
210   }
211   ierr = MatCreate(comm,&A);CHKERRQ(ierr);
212   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
213   ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,nrows,ncols);CHKERRQ(ierr);
214   /* We use MatSeqAIJSetPreallocationCSR() instead of MatCreateSeqAIJWithArrays() because we must copy the arrays exported
215    * from MKL; MKL developers tell us that modifying the arrays may cause unexpected results when using the MKL handle, and
216    * they will be destroyed when the MKL handle is destroyed.
217    * (In the interest of reducing memory consumption in future, can we figure out good ways to deal with this?) */
218   ierr = MatSeqAIJSetPreallocationCSR(A,ai,aj,aa);CHKERRQ(ierr);
219 
220   /* We now have an assembled sequential AIJ matrix created from copies of the exported arrays from the MKL matrix handle.
221    * Now turn it into a MATSEQAIJMKL. */
222   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
223 
224   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
225   aijmkl->csrA = csrA;
226 
227   /* The below code duplicates much of what is in MatSeqAIJKL_create_mkl_handle(). I dislike this code duplication, but
228    * MatSeqAIJMKL_create_mkl_handle() cannot be used because we don't need to create a handle -- we've already got one,
229    * and just need to be able to run the MKL optimization step. */
230   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
231   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
232   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
233   stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
234   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set mv_hint");
235   stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
236   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set memory_hint");
237   if (!aijmkl->no_SpMV2) {
238     stat = mkl_sparse_optimize(aijmkl->csrA);
239     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
240   }
241   aijmkl->sparse_optimized = PETSC_TRUE;
242   ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
243 
244   *mat = A;
245   PetscFunctionReturn(0);
246 }
247 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
248 
249 /* MatSeqAIJMKL_update_from_mkl_handle() updates the matrix values array from the contents of the associated MKL sparse matrix handle.
250  * This is needed after mkl_sparse_sp2m() with SPARSE_STAGE_FINALIZE_MULT has been used to compute new values of the matrix in
251  * MatMatMultNumeric(). */
252 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
253 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_update_from_mkl_handle(Mat A)
254 {
255   PetscInt            i;
256   PetscInt            nrows,ncols;
257   PetscInt            nz;
258   PetscInt            *ai,*aj,*dummy;
259   PetscScalar         *aa;
260   PetscErrorCode      ierr;
261   Mat_SeqAIJMKL       *aijmkl;
262   sparse_status_t     stat;
263   sparse_index_base_t indexing;
264 
265   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
266 
267   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
268   stat = mkl_sparse_x_export_csr(aijmkl->csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
269   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
270 
271   /* We can't just do a copy from the arrays exported by MKL to those used for the PETSc AIJ storage, because the MKL and PETSc
272    * representations differ in small ways (e.g., more explicit nonzeros per row due to preallocation). */
273   for (i=0; i<nrows; i++) {
274     nz = ai[i+1] - ai[i];
275     ierr = MatSetValues_SeqAIJ(A, 1, &i, nz, aj+ai[i], aa+ai[i], INSERT_VALUES);CHKERRQ(ierr);
276   }
277 
278   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
279   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
280 
281   ierr = PetscObjectStateGet((PetscObject)A,&(aijmkl->state));CHKERRQ(ierr);
282   /* We mark our matrix as having a valid, optimized MKL handle.
283    * TODO: It is valid, but I am not sure if it is optimized. Need to ask MKL developers. */
284   aijmkl->sparse_optimized = PETSC_TRUE;
285 
286   PetscFunctionReturn(0);
287 }
288 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
289 
290 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
291 {
292   PetscErrorCode ierr;
293   Mat_SeqAIJMKL  *aijmkl;
294   Mat_SeqAIJMKL  *aijmkl_dest;
295 
296   PetscFunctionBegin;
297   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
298   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
299   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
300   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
301   aijmkl_dest->sparse_optimized = PETSC_FALSE;
302   if (aijmkl->eager_inspection) {
303     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
304   }
305   PetscFunctionReturn(0);
306 }
307 
308 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
309 {
310   PetscErrorCode  ierr;
311   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
312   Mat_SeqAIJMKL   *aijmkl;
313 
314   PetscFunctionBegin;
315   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
316 
317   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
318    * extra information and some different methods, call the AssemblyEnd
319    * routine for a MATSEQAIJ.
320    * I'm not sure if this is the best way to do this, but it avoids
321    * a lot of code duplication. */
322   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
323   ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
324 
325   /* If the user has requested "eager" inspection, create the optimized MKL sparse handle (if needed; the function checks).
326    * (The default is to do "lazy" inspection, deferring this until something like MatMult() is called.) */
327   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
328   if (aijmkl->eager_inspection) {
329     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
330   }
331 
332   PetscFunctionReturn(0);
333 }
334 
335 #if !defined(PETSC_HAVE_MKL_SPARSE_SP2M)
336 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
337 {
338   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
339   const PetscScalar *x;
340   PetscScalar       *y;
341   const MatScalar   *aa;
342   PetscErrorCode    ierr;
343   PetscInt          m=A->rmap->n;
344   PetscInt          n=A->cmap->n;
345   PetscScalar       alpha = 1.0;
346   PetscScalar       beta = 0.0;
347   const PetscInt    *aj,*ai;
348   char              matdescra[6];
349 
350 
351   /* Variables not in MatMult_SeqAIJ. */
352   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
353 
354   PetscFunctionBegin;
355   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
356   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
357   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
358   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
359   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
360   aa   = a->a;  /* Nonzero elements stored row-by-row. */
361   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
362 
363   /* Call MKL sparse BLAS routine to do the MatMult. */
364   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
365 
366   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
367   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
368   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
369   PetscFunctionReturn(0);
370 }
371 #endif
372 
373 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
374 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
375 {
376   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
377   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
378   const PetscScalar *x;
379   PetscScalar       *y;
380   PetscErrorCode    ierr;
381   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
382   PetscObjectState  state;
383 
384   PetscFunctionBegin;
385 
386   /* If there are no nonzero entries, zero yy and return immediately. */
387   if(!a->nz) {
388     PetscInt i;
389     PetscInt m=A->rmap->n;
390     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
391     for (i=0; i<m; i++) {
392       y[i] = 0.0;
393     }
394     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
395     PetscFunctionReturn(0);
396   }
397 
398   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
399   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
400 
401   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
402    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
403    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
404   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
405   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
406     MatSeqAIJMKL_create_mkl_handle(A);
407   }
408 
409   /* Call MKL SpMV2 executor routine to do the MatMult. */
410   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
411   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
412 
413   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
414   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
415   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
416   PetscFunctionReturn(0);
417 }
418 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
419 
420 #if !defined(PETSC_HAVE_MKL_SPARSE_SP2M)
421 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
422 {
423   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
424   const PetscScalar *x;
425   PetscScalar       *y;
426   const MatScalar   *aa;
427   PetscErrorCode    ierr;
428   PetscInt          m=A->rmap->n;
429   PetscInt          n=A->cmap->n;
430   PetscScalar       alpha = 1.0;
431   PetscScalar       beta = 0.0;
432   const PetscInt    *aj,*ai;
433   char              matdescra[6];
434 
435   /* Variables not in MatMultTranspose_SeqAIJ. */
436   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
437 
438   PetscFunctionBegin;
439   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
440   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
441   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
442   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
443   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
444   aa   = a->a;  /* Nonzero elements stored row-by-row. */
445   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
446 
447   /* Call MKL sparse BLAS routine to do the MatMult. */
448   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
449 
450   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
451   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
452   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
453   PetscFunctionReturn(0);
454 }
455 #endif
456 
457 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
458 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
459 {
460   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
461   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
462   const PetscScalar *x;
463   PetscScalar       *y;
464   PetscErrorCode    ierr;
465   sparse_status_t   stat;
466   PetscObjectState  state;
467 
468   PetscFunctionBegin;
469 
470   /* If there are no nonzero entries, zero yy and return immediately. */
471   if(!a->nz) {
472     PetscInt i;
473     PetscInt n=A->cmap->n;
474     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
475     for (i=0; i<n; i++) {
476       y[i] = 0.0;
477     }
478     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
479     PetscFunctionReturn(0);
480   }
481 
482   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
483   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
484 
485   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
486    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
487    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
488   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
489   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
490     MatSeqAIJMKL_create_mkl_handle(A);
491   }
492 
493   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
494   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
495   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
496 
497   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
498   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
499   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
500   PetscFunctionReturn(0);
501 }
502 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
503 
504 #if !defined(PETSC_HAVE_MKL_SPARSE_SP2M)
505 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
506 {
507   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
508   const PetscScalar *x;
509   PetscScalar       *y,*z;
510   const MatScalar   *aa;
511   PetscErrorCode    ierr;
512   PetscInt          m=A->rmap->n;
513   PetscInt          n=A->cmap->n;
514   const PetscInt    *aj,*ai;
515   PetscInt          i;
516 
517   /* Variables not in MatMultAdd_SeqAIJ. */
518   char              transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
519   PetscScalar       alpha = 1.0;
520   PetscScalar       beta;
521   char              matdescra[6];
522 
523   PetscFunctionBegin;
524   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
525   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
526 
527   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
528   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
529   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
530   aa   = a->a;  /* Nonzero elements stored row-by-row. */
531   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
532 
533   /* Call MKL sparse BLAS routine to do the MatMult. */
534   if (zz == yy) {
535     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
536     beta = 1.0;
537     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
538   } else {
539     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
540      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
541     beta = 0.0;
542     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
543     for (i=0; i<m; i++) {
544       z[i] += y[i];
545     }
546   }
547 
548   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
549   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
550   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
551   PetscFunctionReturn(0);
552 }
553 #endif
554 
555 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
556 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
557 {
558   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
559   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
560   const PetscScalar *x;
561   PetscScalar       *y,*z;
562   PetscErrorCode    ierr;
563   PetscInt          m=A->rmap->n;
564   PetscInt          i;
565 
566   /* Variables not in MatMultAdd_SeqAIJ. */
567   sparse_status_t   stat = SPARSE_STATUS_SUCCESS;
568   PetscObjectState  state;
569 
570   PetscFunctionBegin;
571 
572   /* If there are no nonzero entries, set zz = yy and return immediately. */
573   if(!a->nz) {
574     PetscInt i;
575     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
576     for (i=0; i<m; i++) {
577       z[i] = y[i];
578     }
579     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
580     PetscFunctionReturn(0);
581   }
582 
583   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
584   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
585 
586   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
587    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
588    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
589   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
590   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
591     MatSeqAIJMKL_create_mkl_handle(A);
592   }
593 
594   /* Call MKL sparse BLAS routine to do the MatMult. */
595   if (zz == yy) {
596     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
597      * with alpha and beta both set to 1.0. */
598     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
599     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
600   } else {
601     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
602      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
603     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
604     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
605     for (i=0; i<m; i++) {
606       z[i] += y[i];
607     }
608   }
609 
610   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
611   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
612   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
613   PetscFunctionReturn(0);
614 }
615 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
616 
617 #if !defined(PETSC_HAVE_MKL_SPARSE_SP2M)
618 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
619 {
620   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
621   const PetscScalar *x;
622   PetscScalar       *y,*z;
623   const MatScalar   *aa;
624   PetscErrorCode    ierr;
625   PetscInt          m=A->rmap->n;
626   PetscInt          n=A->cmap->n;
627   const PetscInt    *aj,*ai;
628   PetscInt          i;
629 
630   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
631   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
632   PetscScalar       alpha = 1.0;
633   PetscScalar       beta;
634   char              matdescra[6];
635 
636   PetscFunctionBegin;
637   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
638   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
639 
640   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
641   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
642   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
643   aa   = a->a;  /* Nonzero elements stored row-by-row. */
644   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
645 
646   /* Call MKL sparse BLAS routine to do the MatMult. */
647   if (zz == yy) {
648     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
649     beta = 1.0;
650     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
651   } else {
652     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
653      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
654     beta = 0.0;
655     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
656     for (i=0; i<n; i++) {
657       z[i] += y[i];
658     }
659   }
660 
661   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
662   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
663   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
664   PetscFunctionReturn(0);
665 }
666 #endif
667 
668 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
669 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
670 {
671   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
672   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
673   const PetscScalar *x;
674   PetscScalar       *y,*z;
675   PetscErrorCode    ierr;
676   PetscInt          n=A->cmap->n;
677   PetscInt          i;
678   PetscObjectState  state;
679 
680   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
681   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
682 
683   PetscFunctionBegin;
684 
685   /* If there are no nonzero entries, set zz = yy and return immediately. */
686   if(!a->nz) {
687     PetscInt i;
688     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
689     for (i=0; i<n; i++) {
690       z[i] = y[i];
691     }
692     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
693     PetscFunctionReturn(0);
694   }
695 
696   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
697   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
698 
699   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
700    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
701    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
702   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
703   if (!aijmkl->sparse_optimized || aijmkl->state != state) {
704     MatSeqAIJMKL_create_mkl_handle(A);
705   }
706 
707   /* Call MKL sparse BLAS routine to do the MatMult. */
708   if (zz == yy) {
709     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
710      * with alpha and beta both set to 1.0. */
711     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
712     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
713   } else {
714     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
715      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
716     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
717     if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: error in mkl_sparse_x_mv");
718     for (i=0; i<n; i++) {
719       z[i] += y[i];
720     }
721   }
722 
723   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
724   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
725   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
726   PetscFunctionReturn(0);
727 }
728 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
729 
730 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
731 /* Note that this code currently doesn't actually get used when MatMatMult() is called with MAT_REUSE_MATRIX, because
732  * the MatMatMult() interface code calls MatMatMultNumeric() in this case.
733  * For releases of MKL prior to version 18, update 2:
734  * MKL has no notion of separately callable symbolic vs. numeric phases of sparse matrix-matrix multiply, so in the
735  * MAT_REUSE_MATRIX case, the SeqAIJ routines end up being used. Even though this means that the (hopefully more
736  * optimized) MKL routines do not get used, this probably is best because the MKL routines would waste time re-computing
737  * the symbolic portion, whereas the native PETSc SeqAIJ routines will avoid this. */
738 PetscErrorCode MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
739 {
740   Mat_SeqAIJMKL    *a, *b;
741   sparse_matrix_t  csrA, csrB, csrC;
742   PetscErrorCode   ierr;
743   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
744   PetscObjectState state;
745 
746   PetscFunctionBegin;
747   a = (Mat_SeqAIJMKL*)A->spptr;
748   b = (Mat_SeqAIJMKL*)B->spptr;
749   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
750   if (!a->sparse_optimized || a->state != state) {
751     MatSeqAIJMKL_create_mkl_handle(A);
752   }
753   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
754   if (!b->sparse_optimized || b->state != state) {
755     MatSeqAIJMKL_create_mkl_handle(B);
756   }
757   csrA = a->csrA;
758   csrB = b->csrA;
759 
760   stat = mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE,csrA,csrB,&csrC);
761   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
762 
763   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
764 
765   PetscFunctionReturn(0);
766 }
767 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
768 
769 #if defined(PETSC_HAVE_MKL_SPARSE_SP2M)
770 PetscErrorCode MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,Mat C)
771 {
772   Mat_SeqAIJMKL       *a, *b, *c;
773   sparse_matrix_t     csrA, csrB, csrC;
774   PetscErrorCode      ierr;
775   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
776   struct matrix_descr descr_type_gen;
777   PetscObjectState    state;
778 
779   PetscFunctionBegin;
780   a = (Mat_SeqAIJMKL*)A->spptr;
781   b = (Mat_SeqAIJMKL*)B->spptr;
782   c = (Mat_SeqAIJMKL*)C->spptr;
783   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
784   if (!a->sparse_optimized || a->state != state) {
785     MatSeqAIJMKL_create_mkl_handle(A);
786   }
787   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
788   if (!b->sparse_optimized || b->state != state) {
789     MatSeqAIJMKL_create_mkl_handle(B);
790   }
791   csrA = a->csrA;
792   csrB = b->csrA;
793   csrC = c->csrA;
794   descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
795 
796   stat = mkl_sparse_sp2m(SPARSE_OPERATION_NON_TRANSPOSE,descr_type_gen,csrA,
797                          SPARSE_OPERATION_NON_TRANSPOSE,descr_type_gen,csrB,
798                          SPARSE_STAGE_FINALIZE_MULT,&csrC);
799 
800   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete numerical stage of sparse matrix-matrix multiply");
801 
802   /* Have to update the PETSc AIJ representation for matrix C from contents of MKL handle. */
803   ierr = MatSeqAIJMKL_update_from_mkl_handle(C);CHKERRQ(ierr);
804 
805   PetscFunctionReturn(0);
806 }
807 #endif /* PETSC_HAVE_MKL_SPARSE_SP2M */
808 
809 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
810 PetscErrorCode MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
811 {
812   Mat_SeqAIJMKL    *a, *b;
813   sparse_matrix_t  csrA, csrB, csrC;
814   PetscErrorCode   ierr;
815   sparse_status_t  stat = SPARSE_STATUS_SUCCESS;
816   PetscObjectState state;
817 
818   PetscFunctionBegin;
819   a = (Mat_SeqAIJMKL*)A->spptr;
820   b = (Mat_SeqAIJMKL*)B->spptr;
821   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
822   if (!a->sparse_optimized || a->state != state) {
823     MatSeqAIJMKL_create_mkl_handle(A);
824   }
825   ierr = PetscObjectStateGet((PetscObject)B,&state);CHKERRQ(ierr);
826   if (!b->sparse_optimized || b->state != state) {
827     MatSeqAIJMKL_create_mkl_handle(B);
828   }
829   csrA = a->csrA;
830   csrB = b->csrA;
831 
832   stat = mkl_sparse_spmm(SPARSE_OPERATION_TRANSPOSE,csrA,csrB,&csrC);
833   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");
834 
835   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
836 
837   PetscFunctionReturn(0);
838 }
839 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
840 
841 #if defined(PETSC_HAVE_MKL_SPARSE_SP2M)
842 PetscErrorCode MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat P,Mat C)
843 {
844   Mat_SeqAIJMKL       *a, *p, *c;
845   sparse_matrix_t     csrA, csrP, csrC;
846   PetscBool           set, flag;
847   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
848   struct matrix_descr descr_type_sym;
849   PetscObjectState    state;
850   PetscErrorCode      ierr;
851 
852   PetscFunctionBegin;
853   ierr = MatIsSymmetricKnown(A,&set,&flag);
854   if (!set || (set && !flag)) {
855     ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,C);CHKERRQ(ierr);
856     PetscFunctionReturn(0);
857   }
858 
859   a = (Mat_SeqAIJMKL*)A->spptr;
860   p = (Mat_SeqAIJMKL*)P->spptr;
861   c = (Mat_SeqAIJMKL*)C->spptr;
862   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
863   if (!a->sparse_optimized || a->state != state) {
864     MatSeqAIJMKL_create_mkl_handle(A);
865   }
866   ierr = PetscObjectStateGet((PetscObject)P,&state);CHKERRQ(ierr);
867   if (!p->sparse_optimized || p->state != state) {
868     MatSeqAIJMKL_create_mkl_handle(P);
869   }
870   csrA = a->csrA;
871   csrP = p->csrA;
872   csrC = c->csrA;
873   descr_type_sym.type = SPARSE_MATRIX_TYPE_SYMMETRIC;
874   descr_type_sym.mode = SPARSE_FILL_MODE_LOWER;
875   descr_type_sym.diag = SPARSE_DIAG_NON_UNIT;
876 
877   /* Note that the call below won't work for complex matrices. (We protect this when pointers are assigned in MatConvert.) */
878   stat = mkl_sparse_sypr(SPARSE_OPERATION_TRANSPOSE,csrP,csrA,descr_type_sym,&csrC,SPARSE_STAGE_FINALIZE_MULT);
879   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to finalize mkl_sparse_sypr");
880 
881   /* Have to update the PETSc AIJ representation for matrix C from contents of MKL handle. */
882   ierr = MatSeqAIJMKL_update_from_mkl_handle(C);CHKERRQ(ierr);
883 
884   PetscFunctionReturn(0);
885 }
886 #endif
887 
888 #if defined(PETSC_HAVE_MKL_SPARSE_SP2M)
889 PetscErrorCode MatPtAP_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
890 {
891   Mat_SeqAIJMKL       *a, *p;
892   sparse_matrix_t     csrA, csrP, csrC;
893   PetscBool           set, flag;
894   sparse_status_t     stat = SPARSE_STATUS_SUCCESS;
895   struct matrix_descr descr_type_sym;
896   PetscObjectState    state;
897   PetscErrorCode      ierr;
898 
899   PetscFunctionBegin;
900   ierr = MatIsSymmetricKnown(A,&set,&flag);
901   if (!set || (set && !flag)) {
902     ierr = MatPtAP_SeqAIJ_SeqAIJ(A,P,scall,fill,C);CHKERRQ(ierr);
903     PetscFunctionReturn(0);
904   }
905 
906   if (scall == MAT_REUSE_MATRIX) {
907     ierr = MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2(A,P,*C);CHKERRQ(ierr);
908     PetscFunctionReturn(0);
909   }
910 
911   a = (Mat_SeqAIJMKL*)A->spptr;
912   p = (Mat_SeqAIJMKL*)P->spptr;
913   ierr = PetscObjectStateGet((PetscObject)A,&state);CHKERRQ(ierr);
914   if (!a->sparse_optimized || a->state != state) {
915     MatSeqAIJMKL_create_mkl_handle(A);
916   }
917   ierr = PetscObjectStateGet((PetscObject)P,&state);CHKERRQ(ierr);
918   if (!p->sparse_optimized || p->state != state) {
919     MatSeqAIJMKL_create_mkl_handle(P);
920   }
921   csrA = a->csrA;
922   csrP = p->csrA;
923   descr_type_sym.type = SPARSE_MATRIX_TYPE_SYMMETRIC;
924   descr_type_sym.mode = SPARSE_FILL_MODE_LOWER;
925   descr_type_sym.diag = SPARSE_DIAG_NON_UNIT;
926 
927   /* Note that the call below won't work for complex matrices. (We protect this when pointers are assigned in MatConvert.) */
928   stat = mkl_sparse_sypr(SPARSE_OPERATION_TRANSPOSE,csrP,csrA,descr_type_sym,&csrC,SPARSE_STAGE_FULL_MULT);
929   if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete full mkl_sparse_sypr");
930 
931   ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);
932   ierr = MatSetOption(*C,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
933 
934   PetscFunctionReturn(0);
935 }
936 #endif
937 
938 /* This function prototype is needed in MatConvert_SeqAIJ_SeqAIJMKL(), below. */
939 PETSC_INTERN PetscErrorCode MatPtAP_IS_XAIJ(Mat,Mat,MatReuse,PetscReal,Mat*);
940 
941 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
942  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
943  * routine, but can also be used to convert an assembled SeqAIJ matrix
944  * into a SeqAIJMKL one. */
945 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
946 {
947   PetscErrorCode ierr;
948   Mat            B = *newmat;
949   Mat_SeqAIJMKL  *aijmkl;
950   PetscBool      set;
951   PetscBool      sametype;
952 
953   PetscFunctionBegin;
954   if (reuse == MAT_INITIAL_MATRIX) {
955     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
956   }
957 
958   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
959   if (sametype) PetscFunctionReturn(0);
960 
961   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
962   B->spptr = (void*) aijmkl;
963 
964   /* Set function pointers for methods that we inherit from AIJ but override.
965    * We also parse some command line options below, since those determine some of the methods we point to. */
966   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
967   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
968   B->ops->destroy          = MatDestroy_SeqAIJMKL;
969 
970   aijmkl->sparse_optimized = PETSC_FALSE;
971 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
972   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
973 #else
974   aijmkl->no_SpMV2 = PETSC_TRUE;
975 #endif
976   aijmkl->eager_inspection = PETSC_FALSE;
977 
978   /* Parse command line options. */
979   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
980   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
981   ierr = PetscOptionsBool("-mat_aijmkl_eager_inspection","Eager Inspection","None",(PetscBool)aijmkl->eager_inspection,(PetscBool*)&aijmkl->eager_inspection,&set);CHKERRQ(ierr);
982   ierr = PetscOptionsEnd();CHKERRQ(ierr);
983 #if !defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
984   if(!aijmkl->no_SpMV2) {
985     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
986     aijmkl->no_SpMV2 = PETSC_TRUE;
987   }
988 #endif
989 
990 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
991   B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
992   B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
993   B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
994   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
995   B->ops->matmult          = MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
996 # if defined(PETSC_HAVE_MKL_SPARSE_SP2M)
997   B->ops->matmultnumeric   = MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2;
998 #   if !defined(PETSC_USE_COMPLEX)
999   B->ops->ptap             = MatPtAP_SeqAIJMKL_SeqAIJMKL_SpMV2;
1000   B->ops->ptapnumeric      = MatPtAPNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2;
1001 #   endif
1002 # endif
1003   B->ops->transposematmult = MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
1004 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
1005 
1006 #if !defined(PETSC_HAVE_MKL_SPARSE_SP2M)
1007   /* In the same release in which MKL introduced mkl_sparse_sp2m() (version 18, update 2), the old sparse BLAS interfaces were
1008    * marked as deprecated. If "no_SpMV2" has been specified by the user and MKL 18u2 or later is being used, we use the new
1009    * _SpMV2 routines (set above), but do not call mkl_sparse_optimize(), which results in the old numerical kernels (without the
1010    * inspector-executor model) being used. For versions in which the older interface has not been deprecated, we use the old
1011    * interface. */
1012   if (aijmkl->no_SpMV2) {
1013     B->ops->mult             = MatMult_SeqAIJMKL;
1014     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
1015     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
1016     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
1017   }
1018 #endif
1019 
1020   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
1021   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
1022   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
1023   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
1024   ierr = PetscObjectComposeFunction((PetscObject)B,"MatPtAP_is_seqaijmkl_C",MatPtAP_IS_XAIJ);CHKERRQ(ierr);
1025   if(!aijmkl->no_SpMV2) {
1026 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
1027     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1028 #if defined(PETSC_HAVE_MKL_SPARSE_SP2M)
1029     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqaijmkl_seqaijmkl_C",MatMatMultNumeric_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1030 #endif
1031     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
1032 #endif
1033   }
1034 
1035   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
1036   *newmat = B;
1037   PetscFunctionReturn(0);
1038 }
1039 
1040 /*@C
1041    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
1042    This type inherits from AIJ and is largely identical, but uses sparse BLAS
1043    routines from Intel MKL whenever possible.
1044    If the installed version of MKL supports the "SpMV2" sparse
1045    inspector-executor routines, then those are used by default.
1046    MatMult, MatMultAdd, MatMultTranspose, MatMultTransposeAdd, MatMatMult, MatTransposeMatMult, and MatPtAP (for
1047    symmetric A) operations are currently supported.
1048    Note that MKL version 18, update 2 or later is required for MatPtAP/MatPtAPNumeric and MatMatMultNumeric.
1049 
1050    Collective on MPI_Comm
1051 
1052    Input Parameters:
1053 +  comm - MPI communicator, set to PETSC_COMM_SELF
1054 .  m - number of rows
1055 .  n - number of columns
1056 .  nz - number of nonzeros per row (same for all rows)
1057 -  nnz - array containing the number of nonzeros in the various rows
1058          (possibly different for each row) or NULL
1059 
1060    Output Parameter:
1061 .  A - the matrix
1062 
1063    Options Database Keys:
1064 +  -mat_aijmkl_no_spmv2 - disable use of the SpMV2 inspector-executor routines
1065 -  -mat_aijmkl_eager_inspection - perform MKL "inspection" phase upon matrix assembly; default is to do "lazy" inspection, performing this step the first time the matrix is applied
1066 
1067    Notes:
1068    If nnz is given then nz is ignored
1069 
1070    Level: intermediate
1071 
1072 .keywords: matrix, MKL, sparse, parallel
1073 
1074 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
1075 @*/
1076 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1077 {
1078   PetscErrorCode ierr;
1079 
1080   PetscFunctionBegin;
1081   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1082   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1083   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
1084   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
1085   PetscFunctionReturn(0);
1086 }
1087 
1088 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
1089 {
1090   PetscErrorCode ierr;
1091 
1092   PetscFunctionBegin;
1093   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
1094   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
1095   PetscFunctionReturn(0);
1096 }
1097