xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision 87c2a1d7fa67ebe3ae3506c155f9d34b3f33b963)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
18 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
19   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
20   struct matrix_descr descr;
21 #endif
22 } Mat_SeqAIJMKL;
23 
24 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
25 
26 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
27 {
28   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
29   /* so we will ignore 'MatType type'. */
30   PetscErrorCode ierr;
31   Mat            B       = *newmat;
32   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
33 
34   PetscFunctionBegin;
35   if (reuse == MAT_INITIAL_MATRIX) {
36     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
37     aijmkl = (Mat_SeqAIJMKL*)B->spptr;
38   }
39 
40   /* Reset the original function pointers. */
41   B->ops->duplicate        = MatDuplicate_SeqAIJ;
42   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
43   B->ops->destroy          = MatDestroy_SeqAIJ;
44   B->ops->mult             = MatMult_SeqAIJ;
45   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
46   B->ops->multadd          = MatMultAdd_SeqAIJ;
47   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
48   B->ops->scale            = MatScale_SeqAIJ;
49   B->ops->diagonalscale    = MatDiagonalScale_SeqAIJ;
50   B->ops->diagonalset      = MatDiagonalSet_SeqAIJ;
51   B->ops->axpy             = MatAXPY_SeqAIJ;
52 
53   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
54   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
55   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
56   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
57 
58   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
59    * simply involves destroying the MKL sparse matrix handle and then freeing
60    * the spptr pointer. */
61 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
62   if (aijmkl->sparse_optimized) {
63     sparse_status_t stat;
64     stat = mkl_sparse_destroy(aijmkl->csrA);
65     if (stat != SPARSE_STATUS_SUCCESS) {
66       PetscFunctionReturn(PETSC_ERR_LIB);
67     }
68   }
69 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
70   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
71 
72   /* Change the type of B to MATSEQAIJ. */
73   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
74 
75   *newmat = B;
76   PetscFunctionReturn(0);
77 }
78 
79 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
80 {
81   PetscErrorCode ierr;
82   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
83 
84   PetscFunctionBegin;
85 
86   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
87    * spptr pointer. */
88   if (aijmkl) {
89     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
90 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
91     if (aijmkl->sparse_optimized) {
92       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
93       stat = mkl_sparse_destroy(aijmkl->csrA);
94       if (stat != SPARSE_STATUS_SUCCESS) {
95         PetscFunctionReturn(PETSC_ERR_LIB);
96       }
97     }
98 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
99     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
100   }
101 
102   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
103    * to destroy everything that remains. */
104   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
105   /* Note that I don't call MatSetType().  I believe this is because that
106    * is only to be called when *building* a matrix.  I could be wrong, but
107    * that is how things work for the SuperLU matrix class. */
108   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
109   PetscFunctionReturn(0);
110 }
111 
112 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
113 {
114   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
115   Mat_SeqAIJMKL   *aijmkl;
116   PetscInt        m,n;
117   MatScalar       *aa;
118   PetscInt        *aj,*ai;
119 
120 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
121   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
122    * does nothing. We make it callable anyway in this case because it cuts
123    * down on littering the code with #ifdefs. */
124   PetscFunctionReturn(0);
125 #else
126 
127   sparse_status_t stat;
128 
129   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
130 
131   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
132 
133   if (aijmkl->sparse_optimized) {
134     /* Matrix has been previously assembled and optimized. Must destroy old
135      * matrix handle before running the optimization step again. */
136     stat = mkl_sparse_destroy(aijmkl->csrA);
137     if (stat != SPARSE_STATUS_SUCCESS) {
138       PetscFunctionReturn(PETSC_ERR_LIB);
139     }
140   }
141   aijmkl->sparse_optimized = PETSC_FALSE;
142 
143   /* Now perform the SpMV2 setup and matrix optimization. */
144   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
145   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
146   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
147   m = A->rmap->n;
148   n = A->cmap->n;
149   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
150   aa   = a->a;  /* Nonzero elements stored row-by-row. */
151   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
152   if (a->nz) {
153     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
154      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
155     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
156     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
157     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
158     stat = mkl_sparse_optimize(aijmkl->csrA);
159     if (stat != SPARSE_STATUS_SUCCESS) {
160       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
161       PetscFunctionReturn(PETSC_ERR_LIB);
162     }
163     aijmkl->sparse_optimized = PETSC_TRUE;
164   }
165 
166   PetscFunctionReturn(0);
167 #endif
168 }
169 
170 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
171 {
172   PetscErrorCode ierr;
173   Mat_SeqAIJMKL *aijmkl;
174   Mat_SeqAIJMKL *aijmkl_dest;
175 
176   PetscFunctionBegin;
177   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
178   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
179   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
180   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
181   aijmkl_dest->sparse_optimized = PETSC_FALSE;
182   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
183   PetscFunctionReturn(0);
184 }
185 
186 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
187 {
188   PetscErrorCode  ierr;
189   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
190 
191   PetscFunctionBegin;
192   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
193 
194   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
195    * extra information and some different methods, call the AssemblyEnd
196    * routine for a MATSEQAIJ.
197    * I'm not sure if this is the best way to do this, but it avoids
198    * a lot of code duplication.
199    * I also note that currently MATSEQAIJMKL doesn't know anything about
200    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
201    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
202    * this, this may break things.  (Don't know... haven't looked at it.
203    * Do I need to disable this somehow?) */
204   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
205   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
206 
207   /* Now create the MKL sparse handle (if needed; the function checks). */
208   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
209 
210   PetscFunctionReturn(0);
211 }
212 
213 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
214 {
215   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
216   const PetscScalar *x;
217   PetscScalar       *y;
218   const MatScalar   *aa;
219   PetscErrorCode    ierr;
220   PetscInt          m=A->rmap->n;
221   PetscInt          n=A->cmap->n;
222   PetscScalar       alpha = 1.0;
223   PetscScalar       beta = 0.0;
224   const PetscInt    *aj,*ai;
225   char              matdescra[6];
226 
227 
228   /* Variables not in MatMult_SeqAIJ. */
229   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
230 
231   PetscFunctionBegin;
232   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
233   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
234   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
235   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
236   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
237   aa   = a->a;  /* Nonzero elements stored row-by-row. */
238   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
239 
240   /* Call MKL sparse BLAS routine to do the MatMult. */
241   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
242 
243   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
244   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
245   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
246   PetscFunctionReturn(0);
247 }
248 
249 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
250 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
251 {
252   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
253   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
254   const PetscScalar *x;
255   PetscScalar       *y;
256   PetscErrorCode    ierr;
257   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
258 
259   PetscFunctionBegin;
260 
261   /* If there are no nonzero entries, this is a no-op: return immediately. */
262   if(!a->nz) PetscFunctionReturn(0);
263 
264   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
265   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
266 
267   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
268    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
269    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
270   if (!aijmkl->sparse_optimized) {
271     MatSeqAIJMKL_create_mkl_handle(A);
272   }
273 
274   /* Call MKL SpMV2 executor routine to do the MatMult. */
275   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
276 
277   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
278   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
279   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
280   if (stat != SPARSE_STATUS_SUCCESS) {
281     PetscFunctionReturn(PETSC_ERR_LIB);
282   }
283   PetscFunctionReturn(0);
284 }
285 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
286 
287 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
288 {
289   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
290   const PetscScalar *x;
291   PetscScalar       *y;
292   const MatScalar   *aa;
293   PetscErrorCode    ierr;
294   PetscInt          m=A->rmap->n;
295   PetscInt          n=A->cmap->n;
296   PetscScalar       alpha = 1.0;
297   PetscScalar       beta = 0.0;
298   const PetscInt    *aj,*ai;
299   char              matdescra[6];
300 
301   /* Variables not in MatMultTranspose_SeqAIJ. */
302   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
303 
304   PetscFunctionBegin;
305   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
306   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
307   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
308   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
309   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
310   aa   = a->a;  /* Nonzero elements stored row-by-row. */
311   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
312 
313   /* Call MKL sparse BLAS routine to do the MatMult. */
314   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
315 
316   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
317   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
318   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
319   PetscFunctionReturn(0);
320 }
321 
322 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
323 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
324 {
325   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
326   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
327   const PetscScalar *x;
328   PetscScalar       *y;
329   PetscErrorCode    ierr;
330   sparse_status_t   stat;
331 
332   PetscFunctionBegin;
333 
334   /* If there are no nonzero entries, this is a no-op: return immediately. */
335   if(!a->nz) PetscFunctionReturn(0);
336 
337   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
338   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
339 
340   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
341    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
342    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
343   if (!aijmkl->sparse_optimized) {
344     MatSeqAIJMKL_create_mkl_handle(A);
345   }
346 
347   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
348   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
349 
350   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
351   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
352   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
353   if (stat != SPARSE_STATUS_SUCCESS) {
354     PetscFunctionReturn(PETSC_ERR_LIB);
355   }
356   PetscFunctionReturn(0);
357 }
358 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
359 
360 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
361 {
362   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
363   const PetscScalar *x;
364   PetscScalar       *y,*z;
365   const MatScalar   *aa;
366   PetscErrorCode    ierr;
367   PetscInt          m=A->rmap->n;
368   PetscInt          n=A->cmap->n;
369   const PetscInt    *aj,*ai;
370   PetscInt          i;
371 
372   /* Variables not in MatMultAdd_SeqAIJ. */
373   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
374   PetscScalar       alpha = 1.0;
375   PetscScalar       beta;
376   char              matdescra[6];
377 
378   PetscFunctionBegin;
379   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
380   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
381 
382   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
383   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
384   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
385   aa   = a->a;  /* Nonzero elements stored row-by-row. */
386   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
387 
388   /* Call MKL sparse BLAS routine to do the MatMult. */
389   if (zz == yy) {
390     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
391     beta = 1.0;
392     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
393   } else {
394     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
395      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
396     beta = 0.0;
397     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
398     for (i=0; i<m; i++) {
399       z[i] += y[i];
400     }
401   }
402 
403   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
404   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
405   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
406   PetscFunctionReturn(0);
407 }
408 
409 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
410 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
411 {
412   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
413   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
414   const PetscScalar *x;
415   PetscScalar       *y,*z;
416   PetscErrorCode    ierr;
417   PetscInt          m=A->rmap->n;
418   PetscInt          i;
419 
420   /* Variables not in MatMultAdd_SeqAIJ. */
421   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
422 
423   PetscFunctionBegin;
424 
425   /* If there are no nonzero entries, this is a no-op: return immediately. */
426   if(!a->nz) PetscFunctionReturn(0);
427 
428   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
429   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
430 
431   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
432    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
433    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
434   if (!aijmkl->sparse_optimized) {
435     MatSeqAIJMKL_create_mkl_handle(A);
436   }
437 
438   /* Call MKL sparse BLAS routine to do the MatMult. */
439   if (zz == yy) {
440     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
441      * with alpha and beta both set to 1.0. */
442     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
443   } else {
444     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
445      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
446     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
447     for (i=0; i<m; i++) {
448       z[i] += y[i];
449     }
450   }
451 
452   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
453   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
454   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
455   if (stat != SPARSE_STATUS_SUCCESS) {
456     PetscFunctionReturn(PETSC_ERR_LIB);
457   }
458   PetscFunctionReturn(0);
459 }
460 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
461 
462 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
463 {
464   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
465   const PetscScalar *x;
466   PetscScalar       *y,*z;
467   const MatScalar   *aa;
468   PetscErrorCode    ierr;
469   PetscInt          m=A->rmap->n;
470   PetscInt          n=A->cmap->n;
471   const PetscInt    *aj,*ai;
472   PetscInt          i;
473 
474   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
475   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
476   PetscScalar       alpha = 1.0;
477   PetscScalar       beta;
478   char              matdescra[6];
479 
480   PetscFunctionBegin;
481   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
482   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
483 
484   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
485   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
486   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
487   aa   = a->a;  /* Nonzero elements stored row-by-row. */
488   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
489 
490   /* Call MKL sparse BLAS routine to do the MatMult. */
491   if (zz == yy) {
492     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
493     beta = 1.0;
494     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
495   } else {
496     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
497      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
498     beta = 0.0;
499     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
500     for (i=0; i<n; i++) {
501       z[i] += y[i];
502     }
503   }
504 
505   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
506   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
507   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
508   PetscFunctionReturn(0);
509 }
510 
511 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
512 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
513 {
514   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
515   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
516   const PetscScalar *x;
517   PetscScalar       *y,*z;
518   PetscErrorCode    ierr;
519   PetscInt          n=A->cmap->n;
520   PetscInt          i;
521 
522   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
523   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
524 
525   PetscFunctionBegin;
526 
527   /* If there are no nonzero entries, this is a no-op: return immediately. */
528   if(!a->nz) PetscFunctionReturn(0);
529 
530   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
531   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
532 
533   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
534    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
535    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
536   if (!aijmkl->sparse_optimized) {
537     MatSeqAIJMKL_create_mkl_handle(A);
538   }
539 
540   /* Call MKL sparse BLAS routine to do the MatMult. */
541   if (zz == yy) {
542     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
543      * with alpha and beta both set to 1.0. */
544     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
545   } else {
546     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
547      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
548     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
549     for (i=0; i<n; i++) {
550       z[i] += y[i];
551     }
552   }
553 
554   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
555   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
556   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
557   if (stat != SPARSE_STATUS_SUCCESS) {
558     PetscFunctionReturn(PETSC_ERR_LIB);
559   }
560   PetscFunctionReturn(0);
561 }
562 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
563 
564 PetscErrorCode MatScale_SeqAIJMKL(Mat inA,PetscScalar alpha)
565 {
566   PetscErrorCode ierr;
567 
568   PetscFunctionBegin;
569   ierr = MatScale_SeqAIJ(inA,alpha);CHKERRQ(ierr);
570   ierr = MatSeqAIJMKL_create_mkl_handle(inA);CHKERRQ(ierr);
571   PetscFunctionReturn(0);
572 }
573 
574 PetscErrorCode MatDiagonalScale_SeqAIJMKL(Mat A,Vec ll,Vec rr)
575 {
576   PetscErrorCode ierr;
577 
578   PetscFunctionBegin;
579   ierr = MatDiagonalScale_SeqAIJ(A,ll,rr);CHKERRQ(ierr);
580   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
581   PetscFunctionReturn(0);
582 }
583 
584 PetscErrorCode MatDiagonalSet_SeqAIJMKL(Mat Y,Vec D,InsertMode is)
585 {
586   PetscErrorCode ierr;
587 
588   PetscFunctionBegin;
589   ierr = MatDiagonalSet_SeqAIJ(Y,D,is);CHKERRQ(ierr);
590   ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
591   PetscFunctionReturn(0);
592 }
593 
594 PetscErrorCode MatAXPY_SeqAIJMKL(Mat Y,PetscScalar a,Mat X,MatStructure str)
595 {
596   PetscErrorCode ierr;
597 
598   PetscFunctionBegin;
599   ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
600   if (str == SAME_NONZERO_PATTERN) {
601     /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force update of the MKL matrix handle. */
602     ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
603   }
604   PetscFunctionReturn(0);
605 }
606 
607 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
608  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
609  * routine, but can also be used to convert an assembled SeqAIJ matrix
610  * into a SeqAIJMKL one. */
611 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
612 {
613   PetscErrorCode ierr;
614   Mat            B = *newmat;
615   Mat_SeqAIJMKL  *aijmkl;
616   PetscBool      set;
617   PetscBool      sametype;
618 
619   PetscFunctionBegin;
620   if (reuse == MAT_INITIAL_MATRIX) {
621     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
622   }
623 
624   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
625   if (sametype) PetscFunctionReturn(0);
626 
627   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
628   B->spptr = (void*) aijmkl;
629 
630   /* Set function pointers for methods that we inherit from AIJ but override.
631    * We also parse some command line options below, since those determine some of the methods we point to. */
632   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
633   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
634   B->ops->destroy          = MatDestroy_SeqAIJMKL;
635 
636   aijmkl->sparse_optimized = PETSC_FALSE;
637 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
638   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
639 #elif
640   aijmkl->no_SpMV2 = PETSC_TRUE;
641 #endif
642 
643   /* Parse command line options. */
644   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
645   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
646   ierr = PetscOptionsEnd();CHKERRQ(ierr);
647 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
648   if(!aijmkl->no_SpMV2) {
649     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
650     aijmkl->no_SpMV2 = PETSC_TRUE;
651   }
652 #endif
653 
654   if(!aijmkl->no_SpMV2) {
655 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
656     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
657     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
658     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
659     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
660 #endif
661   } else {
662     B->ops->mult             = MatMult_SeqAIJMKL;
663     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
664     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
665     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
666   }
667 
668   B->ops->scale              = MatScale_SeqAIJMKL;
669   B->ops->diagonalscale      = MatDiagonalScale_SeqAIJMKL;
670   B->ops->diagonalset        = MatDiagonalSet_SeqAIJMKL;
671   B->ops->axpy               = MatAXPY_SeqAIJMKL;
672 
673   ierr = PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqAIJMKL_C",MatScale_SeqAIJMKL);CHKERRQ(ierr);
674   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
675   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
676   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
677   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
678 
679   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
680   *newmat = B;
681   PetscFunctionReturn(0);
682 }
683 
684 /*@C
685    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
686    This type inherits from AIJ and is largely identical, but uses sparse BLAS
687    routines from Intel MKL whenever possible.
688    Collective on MPI_Comm
689 
690    Input Parameters:
691 +  comm - MPI communicator, set to PETSC_COMM_SELF
692 .  m - number of rows
693 .  n - number of columns
694 .  nz - number of nonzeros per row (same for all rows)
695 -  nnz - array containing the number of nonzeros in the various rows
696          (possibly different for each row) or NULL
697 
698    Output Parameter:
699 .  A - the matrix
700 
701    Notes:
702    If nnz is given then nz is ignored
703 
704    Level: intermediate
705 
706 .keywords: matrix, cray, sparse, parallel
707 
708 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
709 @*/
710 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
711 {
712   PetscErrorCode ierr;
713 
714   PetscFunctionBegin;
715   ierr = MatCreate(comm,A);CHKERRQ(ierr);
716   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
717   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
718   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
719   PetscFunctionReturn(0);
720 }
721 
722 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
723 {
724   PetscErrorCode ierr;
725 
726   PetscFunctionBegin;
727   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
728   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
729   PetscFunctionReturn(0);
730 }
731