xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision 3fa157625e098f83cace929ef7467bc3afc3f531)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
18 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
19   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
20   struct matrix_descr descr;
21 #endif
22 } Mat_SeqAIJMKL;
23 
24 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
25 
26 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
27 {
28   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
29   /* so we will ignore 'MatType type'. */
30   PetscErrorCode ierr;
31   Mat            B       = *newmat;
32   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
33 
34   PetscFunctionBegin;
35   if (reuse == MAT_INITIAL_MATRIX) {
36     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
37     aijmkl = (Mat_SeqAIJMKL*)B->spptr;
38   }
39 
40   /* Reset the original function pointers. */
41   B->ops->duplicate        = MatDuplicate_SeqAIJ;
42   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
43   B->ops->destroy          = MatDestroy_SeqAIJ;
44   B->ops->mult             = MatMult_SeqAIJ;
45   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
46   B->ops->multadd          = MatMultAdd_SeqAIJ;
47   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
48 
49   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
50   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
51   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
52   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
53 
54   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
55    * simply involves destroying the MKL sparse matrix handle and then freeing
56    * the spptr pointer. */
57 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
58   if (aijmkl->sparse_optimized) {
59     sparse_status_t stat;
60     stat = mkl_sparse_destroy(aijmkl->csrA);
61     if (stat != SPARSE_STATUS_SUCCESS) {
62       PetscFunctionReturn(PETSC_ERR_LIB);
63     }
64   }
65 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
66   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
67 
68   /* Change the type of B to MATSEQAIJ. */
69   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
70 
71   *newmat = B;
72   PetscFunctionReturn(0);
73 }
74 
75 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
76 {
77   PetscErrorCode ierr;
78   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
79 
80   PetscFunctionBegin;
81 
82   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
83    * spptr pointer. */
84   if (aijmkl) {
85     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
86 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
87     if (aijmkl->sparse_optimized) {
88       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
89       stat = mkl_sparse_destroy(aijmkl->csrA);
90       if (stat != SPARSE_STATUS_SUCCESS) {
91         PetscFunctionReturn(PETSC_ERR_LIB);
92       }
93     }
94 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
95     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
96   }
97 
98   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
99    * to destroy everything that remains. */
100   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
101   /* Note that I don't call MatSetType().  I believe this is because that
102    * is only to be called when *building* a matrix.  I could be wrong, but
103    * that is how things work for the SuperLU matrix class. */
104   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
105   PetscFunctionReturn(0);
106 }
107 
108 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
109 {
110   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
111   Mat_SeqAIJMKL   *aijmkl;
112   PetscInt        m,n;
113   MatScalar       *aa;
114   PetscInt        *aj,*ai;
115 
116 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
117   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
118    * does nothing. We make it callable anyway in this case because it cuts
119    * down on littering the code with #ifdefs. */
120   PetscFunctionReturn(0);
121 #else
122 
123   sparse_status_t stat;
124 
125   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
126 
127   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
128 
129   if (aijmkl->sparse_optimized) {
130     /* Matrix has been previously assembled and optimized. Must destroy old
131      * matrix handle before running the optimization step again. */
132     stat = mkl_sparse_destroy(aijmkl->csrA);
133     if (stat != SPARSE_STATUS_SUCCESS) {
134       PetscFunctionReturn(PETSC_ERR_LIB);
135     }
136   }
137   aijmkl->sparse_optimized = PETSC_FALSE;
138 
139   /* Now perform the SpMV2 setup and matrix optimization. */
140   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
141   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
142   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
143   m = A->rmap->n;
144   n = A->cmap->n;
145   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
146   aa   = a->a;  /* Nonzero elements stored row-by-row. */
147   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
148   if (a->nz) {
149     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
150      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
151     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
152     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
153     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
154     stat = mkl_sparse_optimize(aijmkl->csrA);
155     if (stat != SPARSE_STATUS_SUCCESS) {
156       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
157       PetscFunctionReturn(PETSC_ERR_LIB);
158     }
159     aijmkl->sparse_optimized = PETSC_TRUE;
160   }
161 
162   PetscFunctionReturn(0);
163 #endif
164 }
165 
166 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
167 {
168   PetscErrorCode ierr;
169   Mat_SeqAIJMKL *aijmkl;
170   Mat_SeqAIJMKL *aijmkl_dest;
171 
172   PetscFunctionBegin;
173   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
174   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
175   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
176   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
177   aijmkl_dest->sparse_optimized = PETSC_FALSE;
178   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
179   PetscFunctionReturn(0);
180 }
181 
182 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
183 {
184   PetscErrorCode  ierr;
185   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
186 
187   PetscFunctionBegin;
188   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
189 
190   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
191    * extra information and some different methods, call the AssemblyEnd
192    * routine for a MATSEQAIJ.
193    * I'm not sure if this is the best way to do this, but it avoids
194    * a lot of code duplication.
195    * I also note that currently MATSEQAIJMKL doesn't know anything about
196    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
197    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
198    * this, this may break things.  (Don't know... haven't looked at it.
199    * Do I need to disable this somehow?) */
200   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
201   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
202 
203   /* Now create the MKL sparse handle (if needed; the function checks). */
204   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
205 
206   PetscFunctionReturn(0);
207 }
208 
209 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
210 {
211   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
212   const PetscScalar *x;
213   PetscScalar       *y;
214   const MatScalar   *aa;
215   PetscErrorCode    ierr;
216   PetscInt          m=A->rmap->n;
217   PetscInt          n=A->cmap->n;
218   PetscScalar       alpha = 1.0;
219   PetscScalar       beta = 0.0;
220   const PetscInt    *aj,*ai;
221   char              matdescra[6];
222 
223 
224   /* Variables not in MatMult_SeqAIJ. */
225   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
226 
227   PetscFunctionBegin;
228   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
229   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
230   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
231   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
232   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
233   aa   = a->a;  /* Nonzero elements stored row-by-row. */
234   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
235 
236   /* Call MKL sparse BLAS routine to do the MatMult. */
237   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
238 
239   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
240   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
241   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
242   PetscFunctionReturn(0);
243 }
244 
245 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
246 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
247 {
248   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
249   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
250   const PetscScalar *x;
251   PetscScalar       *y;
252   PetscErrorCode    ierr;
253   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
254 
255   PetscFunctionBegin;
256 
257   /* If there are no nonzero entries, this is a no-op: return immediately. */
258   if(!a->nz) PetscFunctionReturn(0);
259 
260   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
261   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
262 
263   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
264    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
265    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
266   if (!aijmkl->sparse_optimized) {
267     MatSeqAIJMKL_create_mkl_handle(A);
268   }
269 
270   /* Call MKL SpMV2 executor routine to do the MatMult. */
271   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
272 
273   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
274   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
275   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
276   if (stat != SPARSE_STATUS_SUCCESS) {
277     PetscFunctionReturn(PETSC_ERR_LIB);
278   }
279   PetscFunctionReturn(0);
280 }
281 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
282 
283 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
284 {
285   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
286   const PetscScalar *x;
287   PetscScalar       *y;
288   const MatScalar   *aa;
289   PetscErrorCode    ierr;
290   PetscInt          m=A->rmap->n;
291   PetscInt          n=A->cmap->n;
292   PetscScalar       alpha = 1.0;
293   PetscScalar       beta = 0.0;
294   const PetscInt    *aj,*ai;
295   char              matdescra[6];
296 
297   /* Variables not in MatMultTranspose_SeqAIJ. */
298   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
299 
300   PetscFunctionBegin;
301   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
302   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
303   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
304   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
305   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
306   aa   = a->a;  /* Nonzero elements stored row-by-row. */
307   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
308 
309   /* Call MKL sparse BLAS routine to do the MatMult. */
310   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
311 
312   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
313   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
314   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
315   PetscFunctionReturn(0);
316 }
317 
318 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
319 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
320 {
321   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
322   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
323   const PetscScalar *x;
324   PetscScalar       *y;
325   PetscErrorCode    ierr;
326   sparse_status_t   stat;
327 
328   PetscFunctionBegin;
329 
330   /* If there are no nonzero entries, this is a no-op: return immediately. */
331   if(!a->nz) PetscFunctionReturn(0);
332 
333   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
334   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
335 
336   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
337    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
338    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
339   if (!aijmkl->sparse_optimized) {
340     MatSeqAIJMKL_create_mkl_handle(A);
341   }
342 
343   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
344   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
345 
346   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
347   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
348   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
349   if (stat != SPARSE_STATUS_SUCCESS) {
350     PetscFunctionReturn(PETSC_ERR_LIB);
351   }
352   PetscFunctionReturn(0);
353 }
354 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
355 
356 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
357 {
358   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
359   const PetscScalar *x;
360   PetscScalar       *y,*z;
361   const MatScalar   *aa;
362   PetscErrorCode    ierr;
363   PetscInt          m=A->rmap->n;
364   PetscInt          n=A->cmap->n;
365   const PetscInt    *aj,*ai;
366   PetscInt          i;
367 
368   /* Variables not in MatMultAdd_SeqAIJ. */
369   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
370   PetscScalar       alpha = 1.0;
371   PetscScalar       beta;
372   char              matdescra[6];
373 
374   PetscFunctionBegin;
375   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
376   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
377 
378   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
379   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
380   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
381   aa   = a->a;  /* Nonzero elements stored row-by-row. */
382   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
383 
384   /* Call MKL sparse BLAS routine to do the MatMult. */
385   if (zz == yy) {
386     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
387     beta = 1.0;
388     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
389   } else {
390     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
391      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
392     beta = 0.0;
393     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
394     for (i=0; i<m; i++) {
395       z[i] += y[i];
396     }
397   }
398 
399   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
400   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
401   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
402   PetscFunctionReturn(0);
403 }
404 
405 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
406 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
407 {
408   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
409   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
410   const PetscScalar *x;
411   PetscScalar       *y,*z;
412   PetscErrorCode    ierr;
413   PetscInt          m=A->rmap->n;
414   PetscInt          i;
415 
416   /* Variables not in MatMultAdd_SeqAIJ. */
417   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
418 
419   PetscFunctionBegin;
420 
421   /* If there are no nonzero entries, this is a no-op: return immediately. */
422   if(!a->nz) PetscFunctionReturn(0);
423 
424   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
425   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
426 
427   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
428    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
429    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
430   if (!aijmkl->sparse_optimized) {
431     MatSeqAIJMKL_create_mkl_handle(A);
432   }
433 
434   /* Call MKL sparse BLAS routine to do the MatMult. */
435   if (zz == yy) {
436     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
437      * with alpha and beta both set to 1.0. */
438     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
439   } else {
440     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
441      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
442     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
443     for (i=0; i<m; i++) {
444       z[i] += y[i];
445     }
446   }
447 
448   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
449   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
450   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
451   if (stat != SPARSE_STATUS_SUCCESS) {
452     PetscFunctionReturn(PETSC_ERR_LIB);
453   }
454   PetscFunctionReturn(0);
455 }
456 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
457 
458 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
459 {
460   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
461   const PetscScalar *x;
462   PetscScalar       *y,*z;
463   const MatScalar   *aa;
464   PetscErrorCode    ierr;
465   PetscInt          m=A->rmap->n;
466   PetscInt          n=A->cmap->n;
467   const PetscInt    *aj,*ai;
468   PetscInt          i;
469 
470   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
471   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
472   PetscScalar       alpha = 1.0;
473   PetscScalar       beta;
474   char              matdescra[6];
475 
476   PetscFunctionBegin;
477   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
478   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
479 
480   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
481   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
482   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
483   aa   = a->a;  /* Nonzero elements stored row-by-row. */
484   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
485 
486   /* Call MKL sparse BLAS routine to do the MatMult. */
487   if (zz == yy) {
488     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
489     beta = 1.0;
490     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
491   } else {
492     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
493      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
494     beta = 0.0;
495     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
496     for (i=0; i<n; i++) {
497       z[i] += y[i];
498     }
499   }
500 
501   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
502   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
503   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
504   PetscFunctionReturn(0);
505 }
506 
507 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
508 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
509 {
510   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
511   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
512   const PetscScalar *x;
513   PetscScalar       *y,*z;
514   PetscErrorCode    ierr;
515   PetscInt          n=A->cmap->n;
516   PetscInt          i;
517 
518   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
519   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
520 
521   PetscFunctionBegin;
522 
523   /* If there are no nonzero entries, this is a no-op: return immediately. */
524   if(!a->nz) PetscFunctionReturn(0);
525 
526   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
527   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
528 
529   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
530    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
531    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
532   if (!aijmkl->sparse_optimized) {
533     MatSeqAIJMKL_create_mkl_handle(A);
534   }
535 
536   /* Call MKL sparse BLAS routine to do the MatMult. */
537   if (zz == yy) {
538     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
539      * with alpha and beta both set to 1.0. */
540     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
541   } else {
542     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
543      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
544     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
545     for (i=0; i<n; i++) {
546       z[i] += y[i];
547     }
548   }
549 
550   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
551   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
552   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
553   if (stat != SPARSE_STATUS_SUCCESS) {
554     PetscFunctionReturn(PETSC_ERR_LIB);
555   }
556   PetscFunctionReturn(0);
557 }
558 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
559 
560 PETSC_INTERN PetscErrorCode MatScale_SeqAIJMKL(Mat inA,PetscScalar alpha)
561 {
562   PetscErrorCode ierr;
563 
564   ierr = MatScale_SeqAIJ(inA,alpha);CHKERRQ(ierr);
565   ierr = MatSeqAIJMKL_create_mkl_handle(inA);CHKERRQ(ierr);
566   PetscFunctionReturn(0);
567 }
568 
569 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
570  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
571  * routine, but can also be used to convert an assembled SeqAIJ matrix
572  * into a SeqAIJMKL one. */
573 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
574 {
575   PetscErrorCode ierr;
576   Mat            B = *newmat;
577   Mat_SeqAIJMKL  *aijmkl;
578   PetscBool      set;
579   PetscBool      sametype;
580 
581   PetscFunctionBegin;
582   if (reuse == MAT_INITIAL_MATRIX) {
583     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
584   }
585 
586   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
587   if (sametype) PetscFunctionReturn(0);
588 
589   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
590   B->spptr = (void*) aijmkl;
591 
592   /* Set function pointers for methods that we inherit from AIJ but override.
593    * We also parse some command line options below, since those determine some of the methods we point to. */
594   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
595   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
596   B->ops->destroy          = MatDestroy_SeqAIJMKL;
597 
598   aijmkl->sparse_optimized = PETSC_FALSE;
599 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
600   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
601 #elif
602   aijmkl->no_SpMV2 = PETSC_TRUE;
603 #endif
604 
605   /* Parse command line options. */
606   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
607   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
608   ierr = PetscOptionsEnd();CHKERRQ(ierr);
609 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
610   if(!aijmkl->no_SpMV2) {
611     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
612     aijmkl->no_SpMV2 = PETSC_TRUE;
613   }
614 #endif
615 
616   if(!aijmkl->no_SpMV2) {
617 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
618     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
619     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
620     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
621     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
622 #endif
623   } else {
624     B->ops->mult             = MatMult_SeqAIJMKL;
625     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
626     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
627     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
628   }
629 
630   B->ops->scale = MatScale_SeqAIJMKL;
631 
632   ierr = PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqAIJMKL_C",MatScale_SeqAIJMKL);CHKERRQ(ierr);
633   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
634   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
635   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
636   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
637 
638   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
639   *newmat = B;
640   PetscFunctionReturn(0);
641 }
642 
643 /*@C
644    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
645    This type inherits from AIJ and is largely identical, but uses sparse BLAS
646    routines from Intel MKL whenever possible.
647    Collective on MPI_Comm
648 
649    Input Parameters:
650 +  comm - MPI communicator, set to PETSC_COMM_SELF
651 .  m - number of rows
652 .  n - number of columns
653 .  nz - number of nonzeros per row (same for all rows)
654 -  nnz - array containing the number of nonzeros in the various rows
655          (possibly different for each row) or NULL
656 
657    Output Parameter:
658 .  A - the matrix
659 
660    Notes:
661    If nnz is given then nz is ignored
662 
663    Level: intermediate
664 
665 .keywords: matrix, cray, sparse, parallel
666 
667 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
668 @*/
669 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
670 {
671   PetscErrorCode ierr;
672 
673   PetscFunctionBegin;
674   ierr = MatCreate(comm,A);CHKERRQ(ierr);
675   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
676   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
677   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
678   PetscFunctionReturn(0);
679 }
680 
681 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
682 {
683   PetscErrorCode ierr;
684 
685   PetscFunctionBegin;
686   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
687   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
688   PetscFunctionReturn(0);
689 }
690