xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision 19e8fbdabb07dcd9104f2d10b4dec626a2e1a063)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool eager_inspection; /* If PETSC_TRUE, then call mkl_sparse_optimize() in MatDuplicate()/MatAssemblyEnd(). */
18   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
19 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
20   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
21   struct matrix_descr descr;
22 #endif
23 } Mat_SeqAIJMKL;
24 
25 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
26 
27 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
28 {
29   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
30   /* so we will ignore 'MatType type'. */
31   PetscErrorCode ierr;
32   Mat            B       = *newmat;
33 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
34   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
35 #endif
36 
37   PetscFunctionBegin;
38   if (reuse == MAT_INITIAL_MATRIX) {
39     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
40   }
41 
42   /* Reset the original function pointers. */
43   B->ops->duplicate        = MatDuplicate_SeqAIJ;
44   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
45   B->ops->destroy          = MatDestroy_SeqAIJ;
46   B->ops->mult             = MatMult_SeqAIJ;
47   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
48   B->ops->multadd          = MatMultAdd_SeqAIJ;
49   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
50   B->ops->matmult          = MatMatMult_SeqAIJ_SeqAIJ;
51   B->ops->transposematmult = MatTransposeMatMult_SeqAIJ_SeqAIJ;
52   B->ops->scale            = MatScale_SeqAIJ;
53   B->ops->diagonalscale    = MatDiagonalScale_SeqAIJ;
54   B->ops->diagonalset      = MatDiagonalSet_SeqAIJ;
55   B->ops->axpy             = MatAXPY_SeqAIJ;
56 
57   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
58   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
59   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
60   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
61   if(!aijmkl->no_SpMV2) {
62 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
63     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
64     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",NULL);CHKERRQ(ierr);
65 #endif
66   }
67 
68   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
69    * simply involves destroying the MKL sparse matrix handle and then freeing
70    * the spptr pointer. */
71 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
72   if (reuse == MAT_INITIAL_MATRIX) aijmkl = (Mat_SeqAIJMKL*)B->spptr;
73 
74   if (aijmkl->sparse_optimized) {
75     sparse_status_t stat;
76     stat = mkl_sparse_destroy(aijmkl->csrA);
77     if (stat != SPARSE_STATUS_SUCCESS) {
78       PetscFunctionReturn(PETSC_ERR_LIB);
79     }
80   }
81 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
82   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
83 
84   /* Change the type of B to MATSEQAIJ. */
85   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
86 
87   *newmat = B;
88   PetscFunctionReturn(0);
89 }
90 
/* Destructor for MATSEQAIJMKL: destroys the MKL sparse handle if one was
 * created, frees the type-specific data in A->spptr, and then hands the rest
 * of the cleanup to the base SeqAIJ destructor. */
PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;

  PetscFunctionBegin;

  /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
   * spptr pointer. */
  if (aijmkl) {
    /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
    /* The csrA handle only exists (and sparse_optimized can only be true) when
     * the SpMV2 inspector-executor interface is available. */
    if (aijmkl->sparse_optimized) {
      sparse_status_t stat = SPARSE_STATUS_SUCCESS;
      stat = mkl_sparse_destroy(aijmkl->csrA);
      if (stat != SPARSE_STATUS_SUCCESS) {
        /* NOTE(review): returning a bare error code here bypasses SETERRQ, so
         * the failure carries no message/traceback — consider SETERRQ. */
        PetscFunctionReturn(PETSC_ERR_LIB);
      }
    }
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
    ierr = PetscFree(A->spptr);CHKERRQ(ierr);
  }

  /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
   * to destroy everything that remains. */
  ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
  /* Note that I don't call MatSetType().  I believe this is because that
   * is only to be called when *building* a matrix.  I could be wrong, but
   * that is how things work for the SuperLU matrix class. */
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
123 
124 /* MatSeqAIJKL_create_mkl_handle(), if called with an AIJMKL matrix that has not had mkl_sparse_optimize() called for it,
125  * creates an MKL sparse matrix handle from the AIJ arrays and calls mkl_sparse_optimize().
126  * If called with an AIJMKL matrix for which aijmkl->sparse_optimized == PETSC_TRUE, then it destroys the old matrix
127  * handle, creates a new one, and then calls mkl_sparse_optimize().
128  * Although in normal MKL usage it is possible to have a valid matrix handle on which mkl_sparse_optimize() has not been
129  * called, for AIJMKL the handle creation and optimization step always occur together, so we don't handle the case of
130  * an unoptimized matrix handle here. */
131 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
132 {
133 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
134   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
135    * does nothing. We make it callable anyway in this case because it cuts
136    * down on littering the code with #ifdefs. */
137   PetscFunctionBegin;
138   PetscFunctionReturn(0);
139 #else
140   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
141   Mat_SeqAIJMKL   *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
142   PetscInt        m,n;
143   MatScalar       *aa;
144   PetscInt        *aj,*ai;
145   sparse_status_t stat;
146 
147   PetscFunctionBegin;
148   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
149 
150   if (aijmkl->sparse_optimized) {
151     /* Matrix has been previously assembled and optimized. Must destroy old
152      * matrix handle before running the optimization step again. */
153     stat = mkl_sparse_destroy(aijmkl->csrA);
154     if (stat != SPARSE_STATUS_SUCCESS) {
155       PetscFunctionReturn(PETSC_ERR_LIB);
156     }
157   }
158   aijmkl->sparse_optimized = PETSC_FALSE;
159 
160   /* Now perform the SpMV2 setup and matrix optimization. */
161   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
162   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
163   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
164   m = A->rmap->n;
165   n = A->cmap->n;
166   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
167   aa   = a->a;  /* Nonzero elements stored row-by-row. */
168   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
169   if ((a->nz!=0) & !(A->structure_only)) {
170     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
171      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
172     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
173     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
174     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
175     stat = mkl_sparse_optimize(aijmkl->csrA);
176     if (stat != SPARSE_STATUS_SUCCESS) {
177       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
178       PetscFunctionReturn(PETSC_ERR_LIB);
179     }
180     aijmkl->sparse_optimized = PETSC_TRUE;
181   }
182 
183   PetscFunctionReturn(0);
184 #endif
185 }
186 
187 /* MatSeqAIJMKL_create_from_mkl_handle() creates a sequential AIJMKL matrix from an MKL sparse matrix handle.
188  * We need this to implement MatMatMult() using the MKL inspector-executor routines, which return an (unoptimized)
189  * matrix handle.
190  * Note: This routine simply destroys and replaces the original matrix if MAT_REUSE_MATRIX has been specified, as
191  * there is no good alternative. */
192 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
193 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_from_mkl_handle(MPI_Comm comm,sparse_matrix_t csrA,MatReuse reuse,Mat *mat)
194 {
195   PetscErrorCode ierr;
196   sparse_status_t stat;
197   sparse_index_base_t indexing;
198   PetscInt nrows, ncols;
199   PetscInt *aj,*ai,*dummy;
200   MatScalar *aa;
201   Mat A;
202   Mat_SeqAIJ *a;
203   Mat_SeqAIJMKL *aijmkl;
204 
205   /* Note: Must pass in &dummy below since MKL can't accept NULL for this output array we don't actually want. */
206   stat = mkl_sparse_x_export_csr(csrA,&indexing,&nrows,&ncols,&ai,&dummy,&aj,&aa);
207   if (stat != SPARSE_STATUS_SUCCESS) {
208     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_x_export_csr()");
209     PetscFunctionReturn(PETSC_ERR_LIB);
210   }
211 
212   if (reuse == MAT_REUSE_MATRIX) {
213     ierr = MatDestroy(mat);CHKERRQ(ierr);
214   }
215   ierr = MatCreate(comm,&A);CHKERRQ(ierr);
216   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
217   ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,nrows,ncols);CHKERRQ(ierr);
218   /* We use MatSeqAIJSetPreallocationCSR() instead of MatCreateSeqAIJWithArrays() because we must copy the arrays exported
219    * from MKL; MKL developers tell us that modifying the arrays may cause unexpected results when using the MKL handle, and
220    * they will be destroyed when the MKL handle is destroyed.
221    * (In the interest of reducing memory consumption in future, can we figure out good ways to deal with this?) */
222   ierr = MatSeqAIJSetPreallocationCSR(A,ai,aj,aa);CHKERRQ(ierr);
223 
224   /* We now have an assembled sequential AIJ matrix created from copies of the exported arrays from the MKL matrix handle.
225    * Now turn it into a MATSEQAIJMKL. */
226   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
227 
228   a = (Mat_SeqAIJ*)A->data;
229   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
230   aijmkl->csrA = csrA;
231 
232   /* The below code duplicates much of what is in MatSeqAIJKL_create_mkl_handle(). I dislike this code duplication, but
233    * MatSeqAIJMKL_create_mkl_handle() cannot be used because we don't need to create a handle -- we've already got one,
234    * and just need to be able to run the MKL optimization step. */
235   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
236   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
237   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
238   stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
239   stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
240   stat = mkl_sparse_optimize(aijmkl->csrA);
241   if (stat != SPARSE_STATUS_SUCCESS) {
242     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to set hints/complete mkl_sparse_optimize");
243     PetscFunctionReturn(PETSC_ERR_LIB);
244   }
245   aijmkl->sparse_optimized = PETSC_TRUE;
246 
247   *mat = A;
248   PetscFunctionReturn(0);
249 }
250 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
251 
252 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
253 {
254   PetscErrorCode ierr;
255   Mat_SeqAIJMKL *aijmkl;
256   Mat_SeqAIJMKL *aijmkl_dest;
257 
258   PetscFunctionBegin;
259   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
260   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
261   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
262   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
263   aijmkl_dest->sparse_optimized = PETSC_FALSE;
264   if (aijmkl->eager_inspection) {
265     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
266   }
267   PetscFunctionReturn(0);
268 }
269 
270 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
271 {
272   PetscErrorCode  ierr;
273   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
274   Mat_SeqAIJMKL *aijmkl;
275 
276   PetscFunctionBegin;
277   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
278 
279   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
280    * extra information and some different methods, call the AssemblyEnd
281    * routine for a MATSEQAIJ.
282    * I'm not sure if this is the best way to do this, but it avoids
283    * a lot of code duplication. */
284   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
285   ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
286 
287   /* If the user has requested "eager" inspection, create the optimized MKL sparse handle (if needed; the function checks).
288    * (The default is to do "lazy" inspection, deferring this until something like MatMult() is called.) */
289   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
290   if (aijmkl->eager_inspection) {
291     ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
292   } else if (aijmkl->sparse_optimized) {
293     /* If doing lazy inspection and there is an optimized MKL handle, we need to destroy it, so that it will be
294      * rebuilt later when needed. Otherwise, some SeqAIJ implementations that we depend on for some operations
295      * (such as MatMatMultNumeric()) can modify the result matrix without the matrix handle being rebuilt.
296      * (The SeqAIJ version MatMatMultNumeric() knows nothing about matrix handles, but it *does* call MatAssemblyEnd().) */
297     sparse_status_t stat = mkl_sparse_destroy(aijmkl->csrA);
298     if (stat != SPARSE_STATUS_SUCCESS) {
299       PetscFunctionReturn(PETSC_ERR_LIB);
300     }
301     aijmkl->sparse_optimized = PETSC_FALSE;
302   }
303 
304   PetscFunctionReturn(0);
305 }
306 
307 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
308 {
309   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
310   const PetscScalar *x;
311   PetscScalar       *y;
312   const MatScalar   *aa;
313   PetscErrorCode    ierr;
314   PetscInt          m=A->rmap->n;
315   PetscInt          n=A->cmap->n;
316   PetscScalar       alpha = 1.0;
317   PetscScalar       beta = 0.0;
318   const PetscInt    *aj,*ai;
319   char              matdescra[6];
320 
321 
322   /* Variables not in MatMult_SeqAIJ. */
323   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
324 
325   PetscFunctionBegin;
326   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
327   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
328   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
329   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
330   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
331   aa   = a->a;  /* Nonzero elements stored row-by-row. */
332   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
333 
334   /* Call MKL sparse BLAS routine to do the MatMult. */
335   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
336 
337   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
338   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
339   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
340   PetscFunctionReturn(0);
341 }
342 
343 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
344 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
345 {
346   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
347   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
348   const PetscScalar *x;
349   PetscScalar       *y;
350   PetscErrorCode    ierr;
351   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
352 
353   PetscFunctionBegin;
354 
355   /* If there are no nonzero entries, zero yy and return immediately. */
356   if(!a->nz) {
357     PetscInt i;
358     PetscInt m=A->rmap->n;
359     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
360     for (i=0; i<m; i++) {
361       y[i] = 0.0;
362     }
363     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
364     PetscFunctionReturn(0);
365   }
366 
367   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
368   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
369 
370   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
371    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
372    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
373   if (!aijmkl->sparse_optimized) {
374     MatSeqAIJMKL_create_mkl_handle(A);
375   }
376 
377   /* Call MKL SpMV2 executor routine to do the MatMult. */
378   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
379 
380   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
381   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
382   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
383   if (stat != SPARSE_STATUS_SUCCESS) {
384     PetscFunctionReturn(PETSC_ERR_LIB);
385   }
386   PetscFunctionReturn(0);
387 }
388 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
389 
390 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
391 {
392   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
393   const PetscScalar *x;
394   PetscScalar       *y;
395   const MatScalar   *aa;
396   PetscErrorCode    ierr;
397   PetscInt          m=A->rmap->n;
398   PetscInt          n=A->cmap->n;
399   PetscScalar       alpha = 1.0;
400   PetscScalar       beta = 0.0;
401   const PetscInt    *aj,*ai;
402   char              matdescra[6];
403 
404   /* Variables not in MatMultTranspose_SeqAIJ. */
405   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
406 
407   PetscFunctionBegin;
408   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
409   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
410   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
411   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
412   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
413   aa   = a->a;  /* Nonzero elements stored row-by-row. */
414   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
415 
416   /* Call MKL sparse BLAS routine to do the MatMult. */
417   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
418 
419   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
420   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
421   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
422   PetscFunctionReturn(0);
423 }
424 
425 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
426 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
427 {
428   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
429   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
430   const PetscScalar *x;
431   PetscScalar       *y;
432   PetscErrorCode    ierr;
433   sparse_status_t   stat;
434 
435   PetscFunctionBegin;
436 
437   /* If there are no nonzero entries, zero yy and return immediately. */
438   if(!a->nz) {
439     PetscInt i;
440     PetscInt n=A->cmap->n;
441     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
442     for (i=0; i<n; i++) {
443       y[i] = 0.0;
444     }
445     ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
446     PetscFunctionReturn(0);
447   }
448 
449   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
450   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
451 
452   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
453    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
454    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
455   if (!aijmkl->sparse_optimized) {
456     MatSeqAIJMKL_create_mkl_handle(A);
457   }
458 
459   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
460   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
461 
462   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
463   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
464   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
465   if (stat != SPARSE_STATUS_SUCCESS) {
466     PetscFunctionReturn(PETSC_ERR_LIB);
467   }
468   PetscFunctionReturn(0);
469 }
470 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
471 
472 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
473 {
474   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
475   const PetscScalar *x;
476   PetscScalar       *y,*z;
477   const MatScalar   *aa;
478   PetscErrorCode    ierr;
479   PetscInt          m=A->rmap->n;
480   PetscInt          n=A->cmap->n;
481   const PetscInt    *aj,*ai;
482   PetscInt          i;
483 
484   /* Variables not in MatMultAdd_SeqAIJ. */
485   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
486   PetscScalar       alpha = 1.0;
487   PetscScalar       beta;
488   char              matdescra[6];
489 
490   PetscFunctionBegin;
491   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
492   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
493 
494   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
495   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
496   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
497   aa   = a->a;  /* Nonzero elements stored row-by-row. */
498   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
499 
500   /* Call MKL sparse BLAS routine to do the MatMult. */
501   if (zz == yy) {
502     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
503     beta = 1.0;
504     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
505   } else {
506     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
507      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
508     beta = 0.0;
509     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
510     for (i=0; i<m; i++) {
511       z[i] += y[i];
512     }
513   }
514 
515   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
516   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
517   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
518   PetscFunctionReturn(0);
519 }
520 
521 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
522 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
523 {
524   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
525   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
526   const PetscScalar *x;
527   PetscScalar       *y,*z;
528   PetscErrorCode    ierr;
529   PetscInt          m=A->rmap->n;
530   PetscInt          i;
531 
532   /* Variables not in MatMultAdd_SeqAIJ. */
533   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
534 
535   PetscFunctionBegin;
536 
537   /* If there are no nonzero entries, set zz = yy and return immediately. */
538   if(!a->nz) {
539     PetscInt i;
540     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
541     for (i=0; i<m; i++) {
542       z[i] = y[i];
543     }
544     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
545     PetscFunctionReturn(0);
546   }
547 
548   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
549   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
550 
551   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
552    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
553    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
554   if (!aijmkl->sparse_optimized) {
555     MatSeqAIJMKL_create_mkl_handle(A);
556   }
557 
558   /* Call MKL sparse BLAS routine to do the MatMult. */
559   if (zz == yy) {
560     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
561      * with alpha and beta both set to 1.0. */
562     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
563   } else {
564     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
565      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
566     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
567     for (i=0; i<m; i++) {
568       z[i] += y[i];
569     }
570   }
571 
572   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
573   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
575   if (stat != SPARSE_STATUS_SUCCESS) {
576     PetscFunctionReturn(PETSC_ERR_LIB);
577   }
578   PetscFunctionReturn(0);
579 }
580 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
581 
582 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
583 {
584   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
585   const PetscScalar *x;
586   PetscScalar       *y,*z;
587   const MatScalar   *aa;
588   PetscErrorCode    ierr;
589   PetscInt          m=A->rmap->n;
590   PetscInt          n=A->cmap->n;
591   const PetscInt    *aj,*ai;
592   PetscInt          i;
593 
594   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
595   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
596   PetscScalar       alpha = 1.0;
597   PetscScalar       beta;
598   char              matdescra[6];
599 
600   PetscFunctionBegin;
601   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
602   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
603 
604   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
605   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
606   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
607   aa   = a->a;  /* Nonzero elements stored row-by-row. */
608   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
609 
610   /* Call MKL sparse BLAS routine to do the MatMult. */
611   if (zz == yy) {
612     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
613     beta = 1.0;
614     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
615   } else {
616     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
617      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
618     beta = 0.0;
619     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
620     for (i=0; i<n; i++) {
621       z[i] += y[i];
622     }
623   }
624 
625   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
626   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
627   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
628   PetscFunctionReturn(0);
629 }
630 
631 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
632 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
633 {
634   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
635   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
636   const PetscScalar *x;
637   PetscScalar       *y,*z;
638   PetscErrorCode    ierr;
639   PetscInt          n=A->cmap->n;
640   PetscInt          i;
641 
642   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
643   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
644 
645   PetscFunctionBegin;
646 
647   /* If there are no nonzero entries, set zz = yy and return immediately. */
648   if(!a->nz) {
649     PetscInt i;
650     ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
651     for (i=0; i<n; i++) {
652       z[i] = y[i];
653     }
654     ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
655     PetscFunctionReturn(0);
656   }
657 
658   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
659   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
660 
661   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
662    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
663    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
664   if (!aijmkl->sparse_optimized) {
665     MatSeqAIJMKL_create_mkl_handle(A);
666   }
667 
668   /* Call MKL sparse BLAS routine to do the MatMult. */
669   if (zz == yy) {
670     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
671      * with alpha and beta both set to 1.0. */
672     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
673   } else {
674     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
675      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
676     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
677     for (i=0; i<n; i++) {
678       z[i] += y[i];
679     }
680   }
681 
682   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
683   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
684   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
685   if (stat != SPARSE_STATUS_SUCCESS) {
686     PetscFunctionReturn(PETSC_ERR_LIB);
687   }
688   PetscFunctionReturn(0);
689 }
690 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
691 
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
/* Note that this code currently doesn't actually get used when MatMatMult() is called with MAT_REUSE_MATRIX, because
 * the MatMatMult() interface code calls MatMatMultNumeric() in this case.
 * MKL has no notion of separately callable symbolic vs. numeric phases of sparse matrix-matrix multiply, so in the
 * MAT_REUSE_MATRIX case, the SeqAIJ routines end up being used. Even though this means that the (hopefully more
 * optimized) MKL routines do not get used, this probably is best because the MKL routines would waste time re-computing
 * the symbolic portion, whereas the native PETSc SeqAIJ routines will avoid this. */
/* Computes C = A * B using the MKL SpMV2 inspector-executor sparse matrix-matrix multiply.
 * The 'fill' estimate is unused because MKL performs its own symbolic phase internally. */
PetscErrorCode MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
{
  Mat_SeqAIJMKL   *a, *b;
  sparse_matrix_t csrA, csrB, csrC;
  PetscErrorCode  ierr;
  sparse_status_t stat = SPARSE_STATUS_SUCCESS;

  PetscFunctionBegin;
  a = (Mat_SeqAIJMKL*)A->spptr;
  b = (Mat_SeqAIJMKL*)B->spptr;
  /* Make sure both operands have current MKL handles; check the error codes,
   * which the previous version silently discarded. */
  if (!a->sparse_optimized) {
    ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
  }
  if (!b->sparse_optimized) {
    ierr = MatSeqAIJMKL_create_mkl_handle(B);CHKERRQ(ierr);
  }
  csrA = a->csrA;
  csrB = b->csrA;

  stat = mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE,csrA,csrB,&csrC);
  /* SETERRQ returns from the function, so no additional return statement is needed. */
  if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");

  /* Wrap the MKL result handle in a PETSc Mat (new or reused per 'scall'). */
  ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
729 
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
/* Computes C = A^T * B using the MKL SpMV2 inspector-executor sparse matrix-matrix multiply.
 * The 'fill' estimate is unused because MKL performs its own symbolic phase internally. */
PetscErrorCode MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat*C)
{
  Mat_SeqAIJMKL   *a, *b;
  sparse_matrix_t csrA, csrB, csrC;
  PetscErrorCode  ierr;
  sparse_status_t stat = SPARSE_STATUS_SUCCESS;

  PetscFunctionBegin;
  a = (Mat_SeqAIJMKL*)A->spptr;
  b = (Mat_SeqAIJMKL*)B->spptr;
  /* Make sure both operands have current MKL handles; check the error codes,
   * which the previous version silently discarded. */
  if (!a->sparse_optimized) {
    ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
  }
  if (!b->sparse_optimized) {
    ierr = MatSeqAIJMKL_create_mkl_handle(B);CHKERRQ(ierr);
  }
  csrA = a->csrA;
  csrB = b->csrA;

  /* SPARSE_OPERATION_TRANSPOSE applies to the first operand, giving A^T * B. */
  stat = mkl_sparse_spmm(SPARSE_OPERATION_TRANSPOSE,csrA,csrB,&csrC);
  /* SETERRQ returns from the function, so no additional return statement is needed. */
  if (stat != SPARSE_STATUS_SUCCESS) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete sparse matrix-matrix multiply");

  /* Wrap the MKL result handle in a PETSc Mat (new or reused per 'scall'). */
  ierr = MatSeqAIJMKL_create_from_mkl_handle(PETSC_COMM_SELF,csrC,scall,C);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
761 
762 PetscErrorCode MatScale_SeqAIJMKL(Mat inA,PetscScalar alpha)
763 {
764   PetscErrorCode ierr;
765 
766   PetscFunctionBegin;
767   ierr = MatScale_SeqAIJ(inA,alpha);CHKERRQ(ierr);
768   ierr = MatSeqAIJMKL_create_mkl_handle(inA);CHKERRQ(ierr);
769   PetscFunctionReturn(0);
770 }
771 
772 PetscErrorCode MatDiagonalScale_SeqAIJMKL(Mat A,Vec ll,Vec rr)
773 {
774   PetscErrorCode ierr;
775 
776   PetscFunctionBegin;
777   ierr = MatDiagonalScale_SeqAIJ(A,ll,rr);CHKERRQ(ierr);
778   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
779   PetscFunctionReturn(0);
780 }
781 
782 PetscErrorCode MatDiagonalSet_SeqAIJMKL(Mat Y,Vec D,InsertMode is)
783 {
784   PetscErrorCode ierr;
785 
786   PetscFunctionBegin;
787   ierr = MatDiagonalSet_SeqAIJ(Y,D,is);CHKERRQ(ierr);
788   ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
789   PetscFunctionReturn(0);
790 }
791 
792 PetscErrorCode MatAXPY_SeqAIJMKL(Mat Y,PetscScalar a,Mat X,MatStructure str)
793 {
794   PetscErrorCode ierr;
795 
796   PetscFunctionBegin;
797   ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
798   if (str == SAME_NONZERO_PATTERN) {
799     /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force update of the MKL matrix handle. */
800     ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
801   }
802   PetscFunctionReturn(0);
803 }
804 
805 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
806  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
807  * routine, but can also be used to convert an assembled SeqAIJ matrix
808  * into a SeqAIJMKL one. */
809 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
810 {
811   PetscErrorCode ierr;
812   Mat            B = *newmat;
813   Mat_SeqAIJMKL  *aijmkl;
814   PetscBool      set;
815   PetscBool      sametype;
816 
817   PetscFunctionBegin;
818   if (reuse == MAT_INITIAL_MATRIX) {
819     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
820   }
821 
822   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
823   if (sametype) PetscFunctionReturn(0);
824 
825   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
826   B->spptr = (void*) aijmkl;
827 
828   /* Set function pointers for methods that we inherit from AIJ but override.
829    * We also parse some command line options below, since those determine some of the methods we point to. */
830   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
831   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
832   B->ops->destroy          = MatDestroy_SeqAIJMKL;
833 
834   aijmkl->sparse_optimized = PETSC_FALSE;
835 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
836   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
837 #else
838   aijmkl->no_SpMV2 = PETSC_TRUE;
839 #endif
840   aijmkl->eager_inspection = PETSC_FALSE;
841 
842   /* Parse command line options. */
843   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
844   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
845   ierr = PetscOptionsBool("-mat_aijmkl_eager_inspection","Eager Inspection","None",(PetscBool)aijmkl->eager_inspection,(PetscBool*)&aijmkl->eager_inspection,&set);CHKERRQ(ierr);
846   ierr = PetscOptionsEnd();CHKERRQ(ierr);
847 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
848   if(!aijmkl->no_SpMV2) {
849     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
850     aijmkl->no_SpMV2 = PETSC_TRUE;
851   }
852 #endif
853 
854   if(!aijmkl->no_SpMV2) {
855 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
856     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
857     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
858     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
859     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
860     B->ops->matmult          = MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
861     B->ops->transposematmult = MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2;
862 #endif
863   } else {
864     B->ops->mult             = MatMult_SeqAIJMKL;
865     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
866     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
867     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
868   }
869 
870   B->ops->scale              = MatScale_SeqAIJMKL;
871   B->ops->diagonalscale      = MatDiagonalScale_SeqAIJMKL;
872   B->ops->diagonalset        = MatDiagonalSet_SeqAIJMKL;
873   B->ops->axpy               = MatAXPY_SeqAIJMKL;
874 
875   ierr = PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqAIJMKL_C",MatScale_SeqAIJMKL);CHKERRQ(ierr);
876   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
877   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
878   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
879   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
880   if(!aijmkl->no_SpMV2) {
881 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
882     ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqaijmkl_seqaijmkl_C",MatMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
883     ierr = PetscObjectComposeFunction((PetscObject)B,"MatTransposeMatMult_seqaijmkl_seqaijmkl_C",MatTransposeMatMult_SeqAIJMKL_SeqAIJMKL_SpMV2);CHKERRQ(ierr);
884 #endif
885   }
886 
887   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
888   *newmat = B;
889   PetscFunctionReturn(0);
890 }
891 
892 /*@C
893    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
894    This type inherits from AIJ and is largely identical, but uses sparse BLAS
895    routines from Intel MKL whenever possible.
896    MatMult, MatMultAdd, MatMultTranspose, MatMultTransposeAdd, MatMatMult, and MatTransposeMatMult
897    operations are currently supported.
898    If the installed version of MKL supports the "SpMV2" sparse
899    inspector-executor routines, then those are used by default.
900 
901    Collective on MPI_Comm
902 
903    Input Parameters:
904 +  comm - MPI communicator, set to PETSC_COMM_SELF
905 .  m - number of rows
906 .  n - number of columns
907 .  nz - number of nonzeros per row (same for all rows)
908 -  nnz - array containing the number of nonzeros in the various rows
909          (possibly different for each row) or NULL
910 
911    Output Parameter:
912 .  A - the matrix
913 
914    Options Database Keys:
915 +  -mat_aijmkl_no_spmv2 - disable use of the SpMV2 inspector-executor routines
916 -  -mat_aijmkl_eager_inspection - perform MKL "inspection" phase upon matrix assembly; default is to do "lazy" inspection, performing this step the first time the matrix is applied
917 
918    Notes:
919    If nnz is given then nz is ignored
920 
921    Level: intermediate
922 
923 .keywords: matrix, MKL, sparse, parallel
924 
925 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
926 @*/
927 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
928 {
929   PetscErrorCode ierr;
930 
931   PetscFunctionBegin;
932   ierr = MatCreate(comm,A);CHKERRQ(ierr);
933   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
934   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
935   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
936   PetscFunctionReturn(0);
937 }
938 
939 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
940 {
941   PetscErrorCode ierr;
942 
943   PetscFunctionBegin;
944   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
945   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
946   PetscFunctionReturn(0);
947 }
948