xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision 4abfa3b389e8175bd8b584da0b85b12435119091)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
18   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
19   struct matrix_descr descr;
20 } Mat_SeqAIJMKL;
21 
22 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
23 
24 #undef __FUNCT__
25 #define __FUNCT__ "MatConvert_SeqAIJMKL_SeqAIJ"
26 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
27 {
28   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
29   /* so we will ignore 'MatType type'. */
30   PetscErrorCode ierr;
31   Mat            B       = *newmat;
32   Mat_SeqAIJMKL *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
33 
34   PetscFunctionBegin;
35   if (reuse == MAT_INITIAL_MATRIX) {
36     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
37   }
38 
39   /* Reset the original function pointers. */
40   B->ops->duplicate        = MatDuplicate_SeqAIJ;
41   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
42   B->ops->destroy          = MatDestroy_SeqAIJ;
43   B->ops->mult             = MatMult_SeqAIJ;
44   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
45   B->ops->multadd          = MatMultAdd_SeqAIJ;
46   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
47 
48   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
49    * simply involves destroying the MKL sparse matrix handle.
50    * We don't free the Mat_SeqAIJMKL struct itself, as this will
51    * cause problems later when MatDestroy() tries to free it. */
52 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
53   if (aijmkl->sparse_optimized) {
54     sparse_status_t stat = SPARSE_STATUS_SUCCESS;
55     stat = mkl_sparse_destroy(aijmkl->csrA);
56     if (stat != SPARSE_STATUS_SUCCESS) {
57       PetscFunctionReturn(PETSC_ERR_LIB);
58     }
59   }
60 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
61 
62   /* Change the type of B to MATSEQAIJ. */
63   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
64 
65   *newmat = B;
66   PetscFunctionReturn(0);
67 }
68 
69 #undef __FUNCT__
70 #define __FUNCT__ "MatDestroy_SeqAIJMKL"
71 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
72 {
73   PetscErrorCode ierr;
74   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
75 
76   PetscFunctionBegin;
77   /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
78 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
79   if (aijmkl->sparse_optimized) {
80     sparse_status_t stat = SPARSE_STATUS_SUCCESS;
81     stat = mkl_sparse_destroy(aijmkl->csrA);
82     if (stat != SPARSE_STATUS_SUCCESS) {
83       PetscFunctionReturn(PETSC_ERR_LIB);
84     }
85   }
86 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
87   ierr = PetscFree(A->spptr);CHKERRQ(ierr);
88 
89   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
90    * to destroy everything that remains. */
91   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
92   /* Note that I don't call MatSetType().  I believe this is because that
93    * is only to be called when *building* a matrix.  I could be wrong, but
94    * that is how things work for the SuperLU matrix class. */
95   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
96   PetscFunctionReturn(0);
97 }
98 
99 #undef __FUNCT__
100 #define __FUNCT__ "MatDuplicate_SeqAIJMKL"
101 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
102 {
103   PetscErrorCode ierr;
104   Mat_SeqAIJMKL *aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
105   Mat_SeqAIJMKL *aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
106 
107   PetscFunctionBegin;
108   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
109   ierr = PetscMemcpy((*M)->spptr,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
110   PetscFunctionReturn(0);
111 }
112 
113 #undef __FUNCT__
114 #define __FUNCT__ "MatAssemblyEnd_SeqAIJMKL"
115 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
116 {
117   PetscErrorCode  ierr;
118   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
119   Mat_SeqAIJMKL   *aijmkl;
120 
121   MatScalar       *aa;
122   PetscInt        n;
123   PetscInt        *aj,*ai;
124   PetscBool       set;
125 
126   PetscFunctionBegin;
127   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
128 
129   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
130    * extra information and some different methods, call the AssemblyEnd
131    * routine for a MATSEQAIJ.
132    * I'm not sure if this is the best way to do this, but it avoids
133    * a lot of code duplication.
134    * I also note that currently MATSEQAIJMKL doesn't know anything about
135    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
136    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
137    * this, this may break things.  (Don't know... haven't looked at it.
138    * Do I need to disable this somehow?) */
139   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
140   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
141 
142   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
143 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
144   if (!aijmkl->no_SpMV2) {
145     sparse_status_t stat = SPARSE_STATUS_SUCCESS;
146     /* Now perform the SpMV2 setup and matrix optimization. */
147     aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
148     aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
149     aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
150     n = A->rmap->n;
151     aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
152     aa   = a->a;  /* Nonzero elements stored row-by-row. */
153     ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
154     stat = mkl_sparse_x_create_csr (&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,n,n,ai,ai+1,aj,aa);
155     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
156     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
157     stat = mkl_sparse_optimize(aijmkl->csrA);
158     if (stat != SPARSE_STATUS_SUCCESS) {
159       PetscFunctionReturn(PETSC_ERR_LIB);
160     }
161     aijmkl->sparse_optimized = PETSC_TRUE;
162   }
163 #endif
164 
165   PetscFunctionReturn(0);
166 }
167 
168 #undef __FUNCT__
169 #define __FUNCT__ "MatMult_SeqAIJMKL"
170 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
171 {
172   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
173   const PetscScalar *x;
174   PetscScalar       *y;
175   const MatScalar   *aa;
176   PetscErrorCode    ierr;
177   PetscInt          m=A->rmap->n;
178   const PetscInt    *aj,*ai;
179   PetscInt          i;
180 
181   /* Variables not in MatMult_SeqAIJ. */
182   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
183 
184   PetscFunctionBegin;
185   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
186   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
187   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
188   aa   = a->a;  /* Nonzero elements stored row-by-row. */
189   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
190 
191   /* Call MKL sparse BLAS routine to do the MatMult. */
192   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
193 
194   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
195   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
196   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
197   PetscFunctionReturn(0);
198 }
199 
200 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
201 #undef __FUNCT__
202 #define __FUNCT__ "MatMult_SeqAIJMKL_SpMV2"
203 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
204 {
205   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
206   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
207   const PetscScalar *x;
208   PetscScalar       *y;
209   const MatScalar   *aa;
210   PetscErrorCode    ierr;
211   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
212 
213   PetscFunctionBegin;
214 
215 #ifdef DEBUG
216   printf("DEBUG: In MatMult_SeqAIJMKL_SpMV2\n");
217 #endif
218   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
219   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
220 
221   /* Call MKL SpMV2 executor routine to do the MatMult. */
222   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
223 
224   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
225   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
226   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
227   if (stat != SPARSE_STATUS_SUCCESS) {
228     PetscFunctionReturn(PETSC_ERR_LIB);
229   }
230   PetscFunctionReturn(0);
231 }
232 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
233 
234 #undef __FUNCT__
235 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL"
236 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
237 {
238   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
239   const PetscScalar *x;
240   PetscScalar       *y;
241   const MatScalar   *aa;
242   PetscErrorCode    ierr;
243   PetscInt          m=A->rmap->n;
244   const PetscInt    *aj,*ai;
245   PetscInt          i;
246 
247   /* Variables not in MatMultTranspose_SeqAIJ. */
248   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
249 
250   PetscFunctionBegin;
251   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
252   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
253   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
254   aa   = a->a;  /* Nonzero elements stored row-by-row. */
255   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
256 
257   /* Call MKL sparse BLAS routine to do the MatMult. */
258   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
259 
260   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
261   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
262   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
263   PetscFunctionReturn(0);
264 }
265 
266 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
267 #undef __FUNCT__
268 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL_SpMV2"
269 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
270 {
271   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
272   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
273   const PetscScalar *x;
274   PetscScalar       *y;
275   const MatScalar   *aa;
276   PetscErrorCode    ierr;
277   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
278 
279   PetscFunctionBegin;
280 
281 #ifdef DEBUG
282   printf("DEBUG: In MatMultTranspose_SeqAIJMKL_SpMV2\n");
283 #endif
284   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
285   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
286 
287   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
288   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
289 
290   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
291   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
292   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
293   if (stat != SPARSE_STATUS_SUCCESS) {
294     PetscFunctionReturn(PETSC_ERR_LIB);
295   }
296   PetscFunctionReturn(0);
297 }
298 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatMultAdd_SeqAIJMKL"
302 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
303 {
304   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
305   const PetscScalar *x;
306   PetscScalar       *y,*z;
307   const MatScalar   *aa;
308   PetscErrorCode    ierr;
309   PetscInt          m=A->rmap->n;
310   const PetscInt    *aj,*ai;
311   PetscInt          i;
312 
313   /* Variables not in MatMultAdd_SeqAIJ. */
314   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
315   PetscScalar       alpha = 1.0;
316   PetscScalar       beta = 1.0;
317   char              matdescra[6];
318 
319   PetscFunctionBegin;
320   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
321   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
322 
323   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
324   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
325   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
326   aa   = a->a;  /* Nonzero elements stored row-by-row. */
327   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
328 
329   /* Call MKL sparse BLAS routine to do the MatMult. */
330   if (zz == yy) {
331     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
332     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
333   } else {
334     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
335      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
336     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
337     for (i=0; i<m; i++) {
338       z[i] += y[i];
339     }
340   }
341 
342   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
343   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
344   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
349 #undef __FUNCT__
350 #define __FUNCT__ "MatMultAdd_SeqAIJMKL_SpMV2"
351 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
352 {
353   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
354   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
355   const PetscScalar *x;
356   PetscScalar       *y,*z;
357   const MatScalar   *aa;
358   PetscErrorCode    ierr;
359   PetscInt          m=A->rmap->n;
360   const PetscInt    *aj,*ai;
361   PetscInt          i;
362 
363   /* Variables not in MatMultAdd_SeqAIJ. */
364   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
365 
366   PetscFunctionBegin;
367 
368 #ifdef DEBUG
369   printf("DEBUG: In MatMultAdd_SeqAIJMKL_SpMV2\n");
370 #endif
371 
372   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
373   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
374   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
375   aa   = a->a;  /* Nonzero elements stored row-by-row. */
376   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
377 
378   /* Call MKL sparse BLAS routine to do the MatMult. */
379   if (zz == yy) {
380     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
381      * with alpha and beta both set to 1.0. */
382     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
383   } else {
384     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
385      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
386     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
387     for (i=0; i<m; i++) {
388       z[i] += y[i];
389     }
390   }
391 
392   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
393   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
394   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
395   if (stat != SPARSE_STATUS_SUCCESS) {
396     PetscFunctionReturn(PETSC_ERR_LIB);
397   }
398   PetscFunctionReturn(0);
399 }
400 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
401 
402 #undef __FUNCT__
403 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL"
404 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
405 {
406   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
407   const PetscScalar *x;
408   PetscScalar       *y,*z;
409   const MatScalar   *aa;
410   PetscErrorCode    ierr;
411   PetscInt          m=A->rmap->n;
412   const PetscInt    *aj,*ai;
413   PetscInt          i;
414 
415   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
416   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
417   PetscScalar       alpha = 1.0;
418   PetscScalar       beta = 1.0;
419   char              matdescra[6];
420 
421   PetscFunctionBegin;
422   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
423   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
424 
425   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
426   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
427   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
428   aa   = a->a;  /* Nonzero elements stored row-by-row. */
429   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
430 
431   /* Call MKL sparse BLAS routine to do the MatMult. */
432   if (zz == yy) {
433     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
434     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
435   } else {
436     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
437      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
438     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
439     for (i=0; i<m; i++) {
440       z[i] += y[i];
441     }
442   }
443 
444   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
445   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
446   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
447   PetscFunctionReturn(0);
448 }
449 
450 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
451 #undef __FUNCT__
452 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL_SpMV2"
453 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
454 {
455   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
456   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
457   const PetscScalar *x;
458   PetscScalar       *y,*z;
459   const MatScalar   *aa;
460   PetscErrorCode    ierr;
461   PetscInt          m=A->rmap->n;
462   const PetscInt    *aj,*ai;
463   PetscInt          i;
464 
465   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
466   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
467 
468   PetscFunctionBegin;
469 
470 #ifdef DEBUG
471   printf("DEBUG: In MatMultTransposeAdd_SeqAIJMKL_SpMV2\n");
472 #endif
473 
474   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
475   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
476   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
477   aa   = a->a;  /* Nonzero elements stored row-by-row. */
478   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
479 
480   /* Call MKL sparse BLAS routine to do the MatMult. */
481   if (zz == yy) {
482     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
483      * with alpha and beta both set to 1.0. */
484     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
485   } else {
486     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
487      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
488     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
489     for (i=0; i<m; i++) {
490       z[i] += y[i];
491     }
492   }
493 
494   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
495   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
496   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
497   if (stat != SPARSE_STATUS_SUCCESS) {
498     PetscFunctionReturn(PETSC_ERR_LIB);
499   }
500   PetscFunctionReturn(0);
501 }
502 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
503 
504 
505 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
506  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
507  * routine, but can also be used to convert an assembled SeqAIJ matrix
508  * into a SeqAIJMKL one. */
509 #undef __FUNCT__
510 #define __FUNCT__ "MatConvert_SeqAIJ_SeqAIJMKL"
511 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
512 {
513   PetscErrorCode ierr;
514   Mat            B = *newmat;
515   Mat_SeqAIJMKL *aijmkl;
516   PetscBool       set;
517 
518   PetscFunctionBegin;
519   if (reuse == MAT_INITIAL_MATRIX) {
520     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
521   }
522 
523   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
524   B->spptr = (void*) aijmkl;
525 
526   /* Set function pointers for methods that we inherit from AIJ but override.
527    * Currently the transposed operations are not being set because I encounter memory corruption
528    * when these are enabled.  Need to look at this with Valgrind or similar. --RTM */
529   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
530   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
531   B->ops->destroy          = MatDestroy_SeqAIJMKL;
532 
533   aijmkl->sparse_optimized = PETSC_FALSE;
534 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
535   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
536 #elif
537   aijmkl->no_SpMV2 = PETSC_TRUE;
538 #endif
539 
540   /* Parse command line options. */
541   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
542   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
543   ierr = PetscOptionsEnd();CHKERRQ(ierr);
544 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
545   if(!aijmkl->no_SpMV2) {
546     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
547     aijmkl->no_SpMV2 = PETSC_TRUE;
548   }
549 #endif
550 
551   if(!aijmkl->no_SpMV2) {
552 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
553     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
554     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2; */
555     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
556     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2; */
557 #endif
558   } else {
559     B->ops->mult             = MatMult_SeqAIJMKL;
560     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL; */
561     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
562     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL; */
563   }
564 
565   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
566 
567   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
568   *newmat = B;
569   PetscFunctionReturn(0);
570 }
571 
572 #undef __FUNCT__
573 #define __FUNCT__ "MatCreateSeqAIJMKL"
574 /*@C
575    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
576    This type inherits from AIJ and is largely identical, but uses sparse BLAS
577    routines from Intel MKL whenever possible.
578    Collective on MPI_Comm
579 
580    Input Parameters:
581 +  comm - MPI communicator, set to PETSC_COMM_SELF
582 .  m - number of rows
583 .  n - number of columns
584 .  nz - number of nonzeros per row (same for all rows)
585 -  nnz - array containing the number of nonzeros in the various rows
586          (possibly different for each row) or NULL
587 
588    Output Parameter:
589 .  A - the matrix
590 
591    Notes:
592    If nnz is given then nz is ignored
593 
594    Level: intermediate
595 
596 .keywords: matrix, cray, sparse, parallel
597 
598 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
599 @*/
600 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
601 {
602   PetscErrorCode ierr;
603 
604   PetscFunctionBegin;
605   ierr = MatCreate(comm,A);CHKERRQ(ierr);
606   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
607   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
608   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
609   PetscFunctionReturn(0);
610 }
611 
612 #undef __FUNCT__
613 #define __FUNCT__ "MatCreate_SeqAIJMKL"
614 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
615 {
616   PetscErrorCode ierr;
617 
618   PetscFunctionBegin;
619   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
620   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
621   PetscFunctionReturn(0);
622 }
623