xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision 58678438215255af79c88dccdbe587bcfb6313c3)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
18 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
19   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
20   struct matrix_descr descr;
21 #endif
22 } Mat_SeqAIJMKL;
23 
24 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
25 
26 #undef __FUNCT__
27 #define __FUNCT__ "MatConvert_SeqAIJMKL_SeqAIJ"
28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
29 {
30   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
31   /* so we will ignore 'MatType type'. */
32   PetscErrorCode ierr;
33   Mat            B       = *newmat;
34   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
35 
36   PetscFunctionBegin;
37   if (reuse == MAT_INITIAL_MATRIX) {
38     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
39     aijmkl = (Mat_SeqAIJMKL*)B->spptr;
40   }
41 
42   /* Reset the original function pointers. */
43   B->ops->duplicate        = MatDuplicate_SeqAIJ;
44   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
45   B->ops->destroy          = MatDestroy_SeqAIJ;
46   B->ops->mult             = MatMult_SeqAIJ;
47   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
48   B->ops->multadd          = MatMultAdd_SeqAIJ;
49   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
50 
51   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
52   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
53   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
54   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
55 
56   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
57    * simply involves destroying the MKL sparse matrix handle and then freeing
58    * the spptr pointer. */
59 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
60   if (aijmkl->sparse_optimized) {
61     sparse_status_t stat;
62     stat = mkl_sparse_destroy(aijmkl->csrA);
63     if (stat != SPARSE_STATUS_SUCCESS) {
64       PetscFunctionReturn(PETSC_ERR_LIB);
65     }
66   }
67 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
68   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
69 
70   /* Change the type of B to MATSEQAIJ. */
71   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
72 
73   *newmat = B;
74   PetscFunctionReturn(0);
75 }
76 
77 #undef __FUNCT__
78 #define __FUNCT__ "MatDestroy_SeqAIJMKL"
79 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
80 {
81   PetscErrorCode ierr;
82   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
83 
84   PetscFunctionBegin;
85 
86   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
87    * spptr pointer. */
88   if (aijmkl) {
89     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
90 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
91     if (aijmkl->sparse_optimized) {
92       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
93       stat = mkl_sparse_destroy(aijmkl->csrA);
94       if (stat != SPARSE_STATUS_SUCCESS) {
95         PetscFunctionReturn(PETSC_ERR_LIB);
96       }
97     }
98 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
99     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
100   }
101 
102   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
103    * to destroy everything that remains. */
104   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
105   /* Note that I don't call MatSetType().  I believe this is because that
106    * is only to be called when *building* a matrix.  I could be wrong, but
107    * that is how things work for the SuperLU matrix class. */
108   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
109   PetscFunctionReturn(0);
110 }
111 
112 #undef __FUNCT__
113 #define __FUNCT__ "MatDuplicate_SeqAIJMKL"
114 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
115 {
116   PetscErrorCode ierr;
117   Mat_SeqAIJMKL *aijmkl;
118   Mat_SeqAIJMKL *aijmkl_dest;
119 
120   PetscFunctionBegin;
121   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
122   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
123   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
124   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
125   aijmkl_dest->sparse_optimized = PETSC_FALSE;
126 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
127   aijmkl_dest->csrA = NULL;
128   if (!aijmkl->no_SpMV2) {
129     sparse_status_t stat;
130     stat = mkl_sparse_copy(aijmkl->csrA,aijmkl->descr,&aijmkl_dest->csrA);
131     if (stat != SPARSE_STATUS_SUCCESS) {
132       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_copy");
133       PetscFunctionReturn(PETSC_ERR_LIB);
134     }
135     stat = mkl_sparse_optimize(aijmkl_dest->csrA);
136     if (stat != SPARSE_STATUS_SUCCESS) {
137       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
138       PetscFunctionReturn(PETSC_ERR_LIB);
139     }
140     aijmkl_dest->sparse_optimized = PETSC_TRUE;
141   }
142 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
143   PetscFunctionReturn(0);
144 }
145 
146 #undef __FUNCT__
147 #define __FUNCT__ "MatAssemblyEnd_SeqAIJMKL"
148 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
149 {
150   PetscErrorCode  ierr;
151   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
152   Mat_SeqAIJMKL   *aijmkl;
153 
154   MatScalar       *aa;
155   PetscInt        m,n;
156   PetscInt        *aj,*ai;
157 
158   PetscFunctionBegin;
159   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
160 
161   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
162    * extra information and some different methods, call the AssemblyEnd
163    * routine for a MATSEQAIJ.
164    * I'm not sure if this is the best way to do this, but it avoids
165    * a lot of code duplication.
166    * I also note that currently MATSEQAIJMKL doesn't know anything about
167    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
168    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
169    * this, this may break things.  (Don't know... haven't looked at it.
170    * Do I need to disable this somehow?) */
171   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
172   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
173 
174   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
175 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
176   if (!aijmkl->no_SpMV2) {
177     sparse_status_t stat;
178     if (aijmkl->sparse_optimized) {
179       /* Matrix has been previously assembled and optimized. Must destroy old
180        * matrix handle before running the optimization step again. */
181       sparse_status_t stat;
182       stat = mkl_sparse_destroy(aijmkl->csrA);
183       if (stat != SPARSE_STATUS_SUCCESS) {
184         PetscFunctionReturn(PETSC_ERR_LIB);
185       }
186     }
187     /* Now perform the SpMV2 setup and matrix optimization. */
188     aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
189     aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
190     aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
191     m = A->rmap->n;
192     n = A->cmap->n;
193     aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
194     aa   = a->a;  /* Nonzero elements stored row-by-row. */
195     ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
196     stat = mkl_sparse_x_create_csr (&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
197     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
198     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
199     stat = mkl_sparse_optimize(aijmkl->csrA);
200     if (stat != SPARSE_STATUS_SUCCESS) {
201       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
202       PetscFunctionReturn(PETSC_ERR_LIB);
203     }
204     aijmkl->sparse_optimized = PETSC_TRUE;
205   }
206 #endif
207 
208   PetscFunctionReturn(0);
209 }
210 
211 #undef __FUNCT__
212 #define __FUNCT__ "MatMult_SeqAIJMKL"
213 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
214 {
215   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
216   const PetscScalar *x;
217   PetscScalar       *y;
218   const MatScalar   *aa;
219   PetscErrorCode    ierr;
220   PetscInt          m=A->rmap->n;
221   const PetscInt    *aj,*ai;
222 
223   /* Variables not in MatMult_SeqAIJ. */
224   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
225 
226   PetscFunctionBegin;
227   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
228   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
229   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
230   aa   = a->a;  /* Nonzero elements stored row-by-row. */
231   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
232 
233   /* Call MKL sparse BLAS routine to do the MatMult. */
234   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
235 
236   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
237   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
238   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
243 #undef __FUNCT__
244 #define __FUNCT__ "MatMult_SeqAIJMKL_SpMV2"
245 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
246 {
247   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
248   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
249   const PetscScalar *x;
250   PetscScalar       *y;
251   PetscErrorCode    ierr;
252   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
253 
254   PetscFunctionBegin;
255 
256   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
257   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
258 
259   /* Call MKL SpMV2 executor routine to do the MatMult. */
260   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
261 
262   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
263   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
264   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
265   if (stat != SPARSE_STATUS_SUCCESS) {
266     PetscFunctionReturn(PETSC_ERR_LIB);
267   }
268   PetscFunctionReturn(0);
269 }
270 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
271 
272 #undef __FUNCT__
273 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL"
274 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
275 {
276   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
277   const PetscScalar *x;
278   PetscScalar       *y;
279   const MatScalar   *aa;
280   PetscErrorCode    ierr;
281   PetscInt          m=A->rmap->n;
282   const PetscInt    *aj,*ai;
283 
284   /* Variables not in MatMultTranspose_SeqAIJ. */
285   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
286 
287   PetscFunctionBegin;
288   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
289   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
290   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
291   aa   = a->a;  /* Nonzero elements stored row-by-row. */
292   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
293 
294   /* Call MKL sparse BLAS routine to do the MatMult. */
295   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
296 
297   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
298   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
299   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
300   PetscFunctionReturn(0);
301 }
302 
303 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
304 #undef __FUNCT__
305 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL_SpMV2"
306 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
307 {
308   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
309   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
310   const PetscScalar *x;
311   PetscScalar       *y;
312   PetscErrorCode    ierr;
313   sparse_status_t   stat;
314 
315   PetscFunctionBegin;
316 
317   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
318   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
319 
320   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
321   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
322 
323   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
324   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
325   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
326   if (stat != SPARSE_STATUS_SUCCESS) {
327     PetscFunctionReturn(PETSC_ERR_LIB);
328   }
329   PetscFunctionReturn(0);
330 }
331 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
332 
333 #undef __FUNCT__
334 #define __FUNCT__ "MatMultAdd_SeqAIJMKL"
335 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
336 {
337   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
338   const PetscScalar *x;
339   PetscScalar       *y,*z;
340   const MatScalar   *aa;
341   PetscErrorCode    ierr;
342   PetscInt          m=A->rmap->n;
343   const PetscInt    *aj,*ai;
344   PetscInt          i;
345 
346   /* Variables not in MatMultAdd_SeqAIJ. */
347   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
348   PetscScalar       alpha = 1.0;
349   PetscScalar       beta = 1.0;
350   char              matdescra[6];
351 
352   PetscFunctionBegin;
353   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
354   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
355 
356   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
357   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
358   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
359   aa   = a->a;  /* Nonzero elements stored row-by-row. */
360   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
361 
362   /* Call MKL sparse BLAS routine to do the MatMult. */
363   if (zz == yy) {
364     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
365     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
366   } else {
367     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
368      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
369     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
370     for (i=0; i<m; i++) {
371       z[i] += y[i];
372     }
373   }
374 
375   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
376   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
377   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
378   PetscFunctionReturn(0);
379 }
380 
381 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
382 #undef __FUNCT__
383 #define __FUNCT__ "MatMultAdd_SeqAIJMKL_SpMV2"
384 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
385 {
386   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
387   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
388   const PetscScalar *x;
389   PetscScalar       *y,*z;
390   PetscErrorCode    ierr;
391   PetscInt          m=A->rmap->n;
392   PetscInt          i;
393 
394   /* Variables not in MatMultAdd_SeqAIJ. */
395   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
396 
397   PetscFunctionBegin;
398 
399 
400   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
401   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
402 
403   /* Call MKL sparse BLAS routine to do the MatMult. */
404   if (zz == yy) {
405     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
406      * with alpha and beta both set to 1.0. */
407     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
408   } else {
409     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
410      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
411     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
412     for (i=0; i<m; i++) {
413       z[i] += y[i];
414     }
415   }
416 
417   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
418   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
419   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
420   if (stat != SPARSE_STATUS_SUCCESS) {
421     PetscFunctionReturn(PETSC_ERR_LIB);
422   }
423   PetscFunctionReturn(0);
424 }
425 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
426 
427 #undef __FUNCT__
428 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL"
429 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
430 {
431   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
432   const PetscScalar *x;
433   PetscScalar       *y,*z;
434   const MatScalar   *aa;
435   PetscErrorCode    ierr;
436   PetscInt          m=A->rmap->n;
437   const PetscInt    *aj,*ai;
438   PetscInt          i;
439 
440   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
441   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
442   PetscScalar       alpha = 1.0;
443   PetscScalar       beta = 1.0;
444   char              matdescra[6];
445 
446   PetscFunctionBegin;
447   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
448   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
449 
450   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
451   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
452   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
453   aa   = a->a;  /* Nonzero elements stored row-by-row. */
454   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
455 
456   /* Call MKL sparse BLAS routine to do the MatMult. */
457   if (zz == yy) {
458     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
459     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
460   } else {
461     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
462      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
463     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
464     for (i=0; i<m; i++) {
465       z[i] += y[i];
466     }
467   }
468 
469   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
470   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
471   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
472   PetscFunctionReturn(0);
473 }
474 
475 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
476 #undef __FUNCT__
477 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL_SpMV2"
478 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
479 {
480   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
481   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
482   const PetscScalar *x;
483   PetscScalar       *y,*z;
484   PetscErrorCode    ierr;
485   PetscInt          m=A->rmap->n;
486   PetscInt          i;
487 
488   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
489   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
490 
491   PetscFunctionBegin;
492 
493   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
494   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
495 
496   /* Call MKL sparse BLAS routine to do the MatMult. */
497   if (zz == yy) {
498     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
499      * with alpha and beta both set to 1.0. */
500     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
501   } else {
502     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
503      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
504     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
505     for (i=0; i<m; i++) {
506       z[i] += y[i];
507     }
508   }
509 
510   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
511   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
512   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
513   if (stat != SPARSE_STATUS_SUCCESS) {
514     PetscFunctionReturn(PETSC_ERR_LIB);
515   }
516   PetscFunctionReturn(0);
517 }
518 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
519 
520 
521 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
522  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
523  * routine, but can also be used to convert an assembled SeqAIJ matrix
524  * into a SeqAIJMKL one. */
525 #undef __FUNCT__
526 #define __FUNCT__ "MatConvert_SeqAIJ_SeqAIJMKL"
527 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
528 {
529   PetscErrorCode ierr;
530   Mat            B = *newmat;
531   Mat_SeqAIJMKL  *aijmkl;
532   PetscBool      set;
533   PetscBool      sametype;
534 
535   PetscFunctionBegin;
536   if (reuse == MAT_INITIAL_MATRIX) {
537     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
538   }
539 
540   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
541   if (sametype) PetscFunctionReturn(0);
542 
543   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
544   B->spptr = (void*) aijmkl;
545 
546   /* Set function pointers for methods that we inherit from AIJ but override.
547    * We also parse some command line options below, since those determine some of the methods we point to.
548    * Note: Currently the transposed operations are not being set because I encounter memory corruption
549    * when these are enabled.  Need to look at this with Valgrind or similar. --RTM */
550   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
551   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
552   B->ops->destroy          = MatDestroy_SeqAIJMKL;
553 
554   aijmkl->sparse_optimized = PETSC_FALSE;
555 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
556   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
557 #elif
558   aijmkl->no_SpMV2 = PETSC_TRUE;
559 #endif
560 
561   /* Parse command line options. */
562   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
563   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
564   ierr = PetscOptionsEnd();CHKERRQ(ierr);
565 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
566   if(!aijmkl->no_SpMV2) {
567     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
568     aijmkl->no_SpMV2 = PETSC_TRUE;
569   }
570 #endif
571 
572   if(!aijmkl->no_SpMV2) {
573 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
574     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
575     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2; */
576     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
577     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2; */
578 #endif
579   } else {
580     B->ops->mult             = MatMult_SeqAIJMKL;
581     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL; */
582     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
583     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL; */
584   }
585 
586   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
587   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
588   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
589   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
590 
591   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
592   *newmat = B;
593   PetscFunctionReturn(0);
594 }
595 
596 #undef __FUNCT__
597 #define __FUNCT__ "MatCreateSeqAIJMKL"
598 /*@C
599    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
600    This type inherits from AIJ and is largely identical, but uses sparse BLAS
601    routines from Intel MKL whenever possible.
602    Collective on MPI_Comm
603 
604    Input Parameters:
605 +  comm - MPI communicator, set to PETSC_COMM_SELF
606 .  m - number of rows
607 .  n - number of columns
608 .  nz - number of nonzeros per row (same for all rows)
609 -  nnz - array containing the number of nonzeros in the various rows
610          (possibly different for each row) or NULL
611 
612    Output Parameter:
613 .  A - the matrix
614 
615    Notes:
616    If nnz is given then nz is ignored
617 
618    Level: intermediate
619 
620 .keywords: matrix, cray, sparse, parallel
621 
622 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
623 @*/
624 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
625 {
626   PetscErrorCode ierr;
627 
628   PetscFunctionBegin;
629   ierr = MatCreate(comm,A);CHKERRQ(ierr);
630   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
631   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
632   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
633   PetscFunctionReturn(0);
634 }
635 
636 #undef __FUNCT__
637 #define __FUNCT__ "MatCreate_SeqAIJMKL"
638 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
639 {
640   PetscErrorCode ierr;
641 
642   PetscFunctionBegin;
643   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
644   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
645   PetscFunctionReturn(0);
646 }
647