xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision 90147e499865435888bfe4e6622d57285fd4ffc4)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
18 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
19   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
20   struct matrix_descr descr;
21 #endif
22 } Mat_SeqAIJMKL;
23 
24 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
25 
26 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
27 {
28   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
29   /* so we will ignore 'MatType type'. */
30   PetscErrorCode ierr;
31   Mat            B       = *newmat;
32   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
33 
34   PetscFunctionBegin;
35   if (reuse == MAT_INITIAL_MATRIX) {
36     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
37     aijmkl = (Mat_SeqAIJMKL*)B->spptr;
38   }
39 
40   /* Reset the original function pointers. */
41   B->ops->duplicate        = MatDuplicate_SeqAIJ;
42   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
43   B->ops->destroy          = MatDestroy_SeqAIJ;
44   B->ops->mult             = MatMult_SeqAIJ;
45   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
46   B->ops->multadd          = MatMultAdd_SeqAIJ;
47   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
48   B->ops->scale            = MatScale_SeqAIJ;
49   B->ops->diagonalscale    = MatDiagonalScale_SeqAIJ;
50   B->ops->diagonalset      = MatDiagonalSet_SeqAIJ;
51   B->ops->axpy             = MatAXPY_SeqAIJ;
52 
53   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
54   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
55   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
56   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
57 
58   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
59    * simply involves destroying the MKL sparse matrix handle and then freeing
60    * the spptr pointer. */
61 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
62   if (aijmkl->sparse_optimized) {
63     sparse_status_t stat;
64     stat = mkl_sparse_destroy(aijmkl->csrA);
65     if (stat != SPARSE_STATUS_SUCCESS) {
66       PetscFunctionReturn(PETSC_ERR_LIB);
67     }
68   }
69 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
70   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
71 
72   /* Change the type of B to MATSEQAIJ. */
73   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
74 
75   *newmat = B;
76   PetscFunctionReturn(0);
77 }
78 
79 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
80 {
81   PetscErrorCode ierr;
82   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
83 
84   PetscFunctionBegin;
85 
86   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
87    * spptr pointer. */
88   if (aijmkl) {
89     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
90 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
91     if (aijmkl->sparse_optimized) {
92       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
93       stat = mkl_sparse_destroy(aijmkl->csrA);
94       if (stat != SPARSE_STATUS_SUCCESS) {
95         PetscFunctionReturn(PETSC_ERR_LIB);
96       }
97     }
98 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
99     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
100   }
101 
102   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
103    * to destroy everything that remains. */
104   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
105   /* Note that I don't call MatSetType().  I believe this is because that
106    * is only to be called when *building* a matrix.  I could be wrong, but
107    * that is how things work for the SuperLU matrix class. */
108   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
109   PetscFunctionReturn(0);
110 }
111 
112 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
113 {
114   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
115   Mat_SeqAIJMKL   *aijmkl;
116   PetscInt        m,n;
117   MatScalar       *aa;
118   PetscInt        *aj,*ai;
119 
120 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
121   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
122    * does nothing. We make it callable anyway in this case because it cuts
123    * down on littering the code with #ifdefs. */
124   PetscFunctionReturn(0);
125 #else
126 
127   sparse_status_t stat;
128 
129   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
130 
131   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
132 
133   if (aijmkl->sparse_optimized) {
134     /* Matrix has been previously assembled and optimized. Must destroy old
135      * matrix handle before running the optimization step again. */
136     stat = mkl_sparse_destroy(aijmkl->csrA);
137     if (stat != SPARSE_STATUS_SUCCESS) {
138       PetscFunctionReturn(PETSC_ERR_LIB);
139     }
140   }
141   aijmkl->sparse_optimized = PETSC_FALSE;
142 
143   /* Now perform the SpMV2 setup and matrix optimization. */
144   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
145   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
146   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
147   m = A->rmap->n;
148   n = A->cmap->n;
149   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
150   aa   = a->a;  /* Nonzero elements stored row-by-row. */
151   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
152   if (a->nz) {
153     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
154      * The MKL sparse-inspector executor routines don't like being passed an empty matrix. */
155     stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
156     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
157     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
158     stat = mkl_sparse_optimize(aijmkl->csrA);
159     if (stat != SPARSE_STATUS_SUCCESS) {
160       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
161       PetscFunctionReturn(PETSC_ERR_LIB);
162     }
163     aijmkl->sparse_optimized = PETSC_TRUE;
164   }
165 
166   PetscFunctionReturn(0);
167 #endif
168 }
169 
170 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
171 {
172   PetscErrorCode ierr;
173   Mat_SeqAIJMKL *aijmkl;
174   Mat_SeqAIJMKL *aijmkl_dest;
175 
176   PetscFunctionBegin;
177   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
178   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
179   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
180   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
181   aijmkl_dest->sparse_optimized = PETSC_FALSE;
182   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
183   PetscFunctionReturn(0);
184 }
185 
186 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
187 {
188   PetscErrorCode  ierr;
189   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
190 
191   PetscFunctionBegin;
192   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
193 
194   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
195    * extra information and some different methods, call the AssemblyEnd
196    * routine for a MATSEQAIJ.
197    * I'm not sure if this is the best way to do this, but it avoids
198    * a lot of code duplication.
199    * I also note that currently MATSEQAIJMKL doesn't know anything about
200    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
201    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
202    * this, this may break things.  (Don't know... haven't looked at it.
203    * Do I need to disable this somehow?) */
204   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
205   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
206 
207   /* Now create the MKL sparse handle (if needed; the function checks). */
208   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
209 
210   PetscFunctionReturn(0);
211 }
212 
213 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
214 {
215   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
216   const PetscScalar *x;
217   PetscScalar       *y;
218   const MatScalar   *aa;
219   PetscErrorCode    ierr;
220   PetscInt          m=A->rmap->n;
221   PetscInt          n=A->cmap->n;
222   PetscScalar       alpha = 1.0;
223   PetscScalar       beta = 0.0;
224   const PetscInt    *aj,*ai;
225   char              matdescra[6];
226 
227 
228   /* Variables not in MatMult_SeqAIJ. */
229   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
230 
231   PetscFunctionBegin;
232   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
233   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
234   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
235   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
236   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
237   aa   = a->a;  /* Nonzero elements stored row-by-row. */
238   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
239 
240   /* Call MKL sparse BLAS routine to do the MatMult. */
241   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
242 
243   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
244   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
245   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
246   PetscFunctionReturn(0);
247 }
248 
249 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
250 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
251 {
252   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
253   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
254   const PetscScalar *x;
255   PetscScalar       *y;
256   PetscErrorCode    ierr;
257   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
258 
259   PetscFunctionBegin;
260 
261   /* If there are no nonzero entries, this is a no-op: return immediately. */
262   if(!a->nz) PetscFunctionReturn(0);
263 
264   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
265   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
266 
267   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
268    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
269    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
270   if (!aijmkl->sparse_optimized) {
271     MatSeqAIJMKL_create_mkl_handle(A);
272   }
273 
274   /* Call MKL SpMV2 executor routine to do the MatMult. */
275   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
276 
277   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
278   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
279   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
280   if (stat != SPARSE_STATUS_SUCCESS) {
281     PetscFunctionReturn(PETSC_ERR_LIB);
282   }
283   PetscFunctionReturn(0);
284 }
285 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
286 
287 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
288 {
289   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
290   const PetscScalar *x;
291   PetscScalar       *y;
292   const MatScalar   *aa;
293   PetscErrorCode    ierr;
294   PetscInt          m=A->rmap->n;
295   PetscInt          n=A->cmap->n;
296   PetscScalar       alpha = 1.0;
297   PetscScalar       beta = 0.0;
298   const PetscInt    *aj,*ai;
299   char              matdescra[6];
300 
301   /* Variables not in MatMultTranspose_SeqAIJ. */
302   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
303 
304   PetscFunctionBegin;
305   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
306   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
307   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
308   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
309   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
310   aa   = a->a;  /* Nonzero elements stored row-by-row. */
311   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
312 
313   /* Call MKL sparse BLAS routine to do the MatMult. */
314   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
315 
316   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
317   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
318   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
319   PetscFunctionReturn(0);
320 }
321 
322 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
323 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
324 {
325   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
326   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
327   const PetscScalar *x;
328   PetscScalar       *y;
329   PetscErrorCode    ierr;
330   sparse_status_t   stat;
331 
332   PetscFunctionBegin;
333 
334   /* If there are no nonzero entries, this is a no-op: return immediately. */
335   if(!a->nz) PetscFunctionReturn(0);
336 
337   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
338   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
339 
340   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
341    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
342    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
343   if (!aijmkl->sparse_optimized) {
344     MatSeqAIJMKL_create_mkl_handle(A);
345   }
346 
347   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
348   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
349 
350   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
351   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
352   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
353   if (stat != SPARSE_STATUS_SUCCESS) {
354     PetscFunctionReturn(PETSC_ERR_LIB);
355   }
356   PetscFunctionReturn(0);
357 }
358 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
359 
360 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
361 {
362   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
363   const PetscScalar *x;
364   PetscScalar       *y,*z;
365   const MatScalar   *aa;
366   PetscErrorCode    ierr;
367   PetscInt          m=A->rmap->n;
368   PetscInt          n=A->cmap->n;
369   const PetscInt    *aj,*ai;
370   PetscInt          i;
371 
372   /* Variables not in MatMultAdd_SeqAIJ. */
373   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
374   PetscScalar       alpha = 1.0;
375   PetscScalar       beta;
376   char              matdescra[6];
377 
378   PetscFunctionBegin;
379   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
380   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
381 
382   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
383   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
384   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
385   aa   = a->a;  /* Nonzero elements stored row-by-row. */
386   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
387 
388   /* Call MKL sparse BLAS routine to do the MatMult. */
389   if (zz == yy) {
390     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
391     beta = 1.0;
392     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
393   } else {
394     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
395      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
396     beta = 0.0;
397     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
398     for (i=0; i<m; i++) {
399       z[i] += y[i];
400     }
401   }
402 
403   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
404   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
405   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
406   PetscFunctionReturn(0);
407 }
408 
409 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
410 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
411 {
412   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
413   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
414   const PetscScalar *x;
415   PetscScalar       *y,*z;
416   PetscErrorCode    ierr;
417   PetscInt          m=A->rmap->n;
418   PetscInt          i;
419 
420   /* Variables not in MatMultAdd_SeqAIJ. */
421   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
422 
423   PetscFunctionBegin;
424 
425   /* If there are no nonzero entries, this is a no-op: return immediately. */
426   if(!a->nz) PetscFunctionReturn(0);
427 
428   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
429   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
430 
431   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
432    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
433    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
434   if (!aijmkl->sparse_optimized) {
435     MatSeqAIJMKL_create_mkl_handle(A);
436   }
437 
438   /* Call MKL sparse BLAS routine to do the MatMult. */
439   if (zz == yy) {
440     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
441      * with alpha and beta both set to 1.0. */
442     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
443   } else {
444     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
445      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
446     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
447     for (i=0; i<m; i++) {
448       z[i] += y[i];
449     }
450   }
451 
452   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
453   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
454   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
455   if (stat != SPARSE_STATUS_SUCCESS) {
456     PetscFunctionReturn(PETSC_ERR_LIB);
457   }
458   PetscFunctionReturn(0);
459 }
460 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
461 
462 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
463 {
464   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
465   const PetscScalar *x;
466   PetscScalar       *y,*z;
467   const MatScalar   *aa;
468   PetscErrorCode    ierr;
469   PetscInt          m=A->rmap->n;
470   PetscInt          n=A->cmap->n;
471   const PetscInt    *aj,*ai;
472   PetscInt          i;
473 
474   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
475   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
476   PetscScalar       alpha = 1.0;
477   PetscScalar       beta;
478   char              matdescra[6];
479 
480   PetscFunctionBegin;
481   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
482   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
483 
484   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
485   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
486   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
487   aa   = a->a;  /* Nonzero elements stored row-by-row. */
488   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
489 
490   /* Call MKL sparse BLAS routine to do the MatMult. */
491   if (zz == yy) {
492     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
493     beta = 1.0;
494     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
495   } else {
496     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
497      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
498     beta = 0.0;
499     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
500     for (i=0; i<n; i++) {
501       z[i] += y[i];
502     }
503   }
504 
505   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
506   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
507   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
508   PetscFunctionReturn(0);
509 }
510 
511 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
512 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
513 {
514   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
515   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
516   const PetscScalar *x;
517   PetscScalar       *y,*z;
518   PetscErrorCode    ierr;
519   PetscInt          n=A->cmap->n;
520   PetscInt          i;
521 
522   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
523   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
524 
525   PetscFunctionBegin;
526 
527   /* If there are no nonzero entries, this is a no-op: return immediately. */
528   if(!a->nz) PetscFunctionReturn(0);
529 
530   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
531   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
532 
533   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
534    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
535    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
536   if (!aijmkl->sparse_optimized) {
537     MatSeqAIJMKL_create_mkl_handle(A);
538   }
539 
540   /* Call MKL sparse BLAS routine to do the MatMult. */
541   if (zz == yy) {
542     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
543      * with alpha and beta both set to 1.0. */
544     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
545   } else {
546     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
547      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
548     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
549     for (i=0; i<n; i++) {
550       z[i] += y[i];
551     }
552   }
553 
554   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
555   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
556   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
557   if (stat != SPARSE_STATUS_SUCCESS) {
558     PetscFunctionReturn(PETSC_ERR_LIB);
559   }
560   PetscFunctionReturn(0);
561 }
562 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
563 
564 PetscErrorCode MatScale_SeqAIJMKL(Mat inA,PetscScalar alpha)
565 {
566   PetscErrorCode ierr;
567 
568   PetscFunctionBegin;
569   ierr = MatScale_SeqAIJ(inA,alpha);CHKERRQ(ierr);
570   ierr = MatSeqAIJMKL_create_mkl_handle(inA);CHKERRQ(ierr);
571   PetscFunctionReturn(0);
572 }
573 
574 PetscErrorCode MatDiagonalScale_SeqAIJMKL(Mat A,Vec ll,Vec rr)
575 {
576   PetscErrorCode ierr;
577 
578   PetscFunctionBegin;
579   ierr = MatDiagonalScale_SeqAIJ(A,ll,rr);CHKERRQ(ierr);
580   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
581   PetscFunctionReturn(0);
582 }
583 
584 PetscErrorCode MatDiagonalSet_SeqAIJMKL(Mat Y,Vec D,InsertMode is)
585 {
586   PetscErrorCode ierr;
587 
588   PetscFunctionBegin;
589   ierr = MatDiagonalSet_SeqAIJ(Y,D,is);CHKERRQ(ierr);
590   ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
591   PetscFunctionReturn(0);
592 }
593 
594 PetscErrorCode MatAXPY_SeqAIJMKL(Mat Y,PetscScalar a,Mat X,MatStructure str)
595 {
596   PetscErrorCode ierr;
597 
598   PetscFunctionBegin;
599   ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
600   if (str == SAME_NONZERO_PATTERN) {
601     /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force update of the MKL matrix handle. */
602     ierr = MatSeqAIJMKL_create_mkl_handle(Y);CHKERRQ(ierr);
603   }
604   PetscFunctionReturn(0);
605 }
606 
607 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
608  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
609  * routine, but can also be used to convert an assembled SeqAIJ matrix
610  * into a SeqAIJMKL one. */
611 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
612 {
613   PetscErrorCode ierr;
614   Mat            B = *newmat;
615   Mat_SeqAIJMKL  *aijmkl;
616   PetscBool      set;
617   PetscBool      sametype;
618 
619   PetscFunctionBegin;
620   if (reuse == MAT_INITIAL_MATRIX) {
621     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
622   }
623 
624   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
625   if (sametype) PetscFunctionReturn(0);
626 
627   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
628   B->spptr = (void*) aijmkl;
629 
630   /* Set function pointers for methods that we inherit from AIJ but override.
631    * We also parse some command line options below, since those determine some of the methods we point to. */
632   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
633   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
634   B->ops->destroy          = MatDestroy_SeqAIJMKL;
635 
636   aijmkl->sparse_optimized = PETSC_FALSE;
637 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
638   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
639 #elif
640   aijmkl->no_SpMV2 = PETSC_TRUE;
641 #endif
642 
643   /* Parse command line options. */
644   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
645   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
646   ierr = PetscOptionsEnd();CHKERRQ(ierr);
647 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
648   if(!aijmkl->no_SpMV2) {
649     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
650     aijmkl->no_SpMV2 = PETSC_TRUE;
651   }
652 #endif
653 
654   if(!aijmkl->no_SpMV2) {
655 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
656     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
657     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
658     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
659     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
660 #endif
661   } else {
662     B->ops->mult             = MatMult_SeqAIJMKL;
663     B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
664     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
665     B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
666   }
667 
668   B->ops->scale              = MatScale_SeqAIJMKL;
669   B->ops->diagonalscale      = MatDiagonalScale_SeqAIJMKL;
670   B->ops->diagonalset        = MatDiagonalSet_SeqAIJMKL;
671   B->ops->axpy               = MatAXPY_SeqAIJMKL;
672 
673   ierr = PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqAIJMKL_C",MatScale_SeqAIJMKL);CHKERRQ(ierr);
674   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
675   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
676   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
677   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
678 
679   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
680   *newmat = B;
681   PetscFunctionReturn(0);
682 }
683 
684 /*@C
685    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
686    This type inherits from AIJ and is largely identical, but uses sparse BLAS
687    routines from Intel MKL whenever possible.
688    MatMult, MatMultAdd, MatMultTranspose, and MatMultTransposeAdd
689    operations are currently supported.
690    If the installed version of MKL supports the "SpMV2" sparse
691    inspector-executor routines, then those are used by default.
692 
693    Collective on MPI_Comm
694 
695    Input Parameters:
696 +  comm - MPI communicator, set to PETSC_COMM_SELF
697 .  m - number of rows
698 .  n - number of columns
699 .  nz - number of nonzeros per row (same for all rows)
700 -  nnz - array containing the number of nonzeros in the various rows
701          (possibly different for each row) or NULL
702 
703    Output Parameter:
704 .  A - the matrix
705 
706    Options Database Keys:
707 .  -mat_aijmkl_no_spmv2 - disables use of the SpMV2 inspector-executor routines
708 
709    Notes:
710    If nnz is given then nz is ignored
711 
712    Level: intermediate
713 
714 .keywords: matrix, MKL, sparse, parallel
715 
716 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
717 @*/
718 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
719 {
720   PetscErrorCode ierr;
721 
722   PetscFunctionBegin;
723   ierr = MatCreate(comm,A);CHKERRQ(ierr);
724   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
725   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
726   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
727   PetscFunctionReturn(0);
728 }
729 
730 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
731 {
732   PetscErrorCode ierr;
733 
734   PetscFunctionBegin;
735   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
736   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
737   PetscFunctionReturn(0);
738 }
739