xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision b8cbc1fbbe9b331625828766e7152f3c64a6f997)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
18 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
19   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
20   struct matrix_descr descr;
21 #endif
22 } Mat_SeqAIJMKL;
23 
24 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
25 
26 #undef __FUNCT__
27 #define __FUNCT__ "MatConvert_SeqAIJMKL_SeqAIJ"
28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
29 {
30   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
31   /* so we will ignore 'MatType type'. */
32   PetscErrorCode ierr;
33   Mat            B       = *newmat;
34   Mat_SeqAIJMKL *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
35 
36   PetscFunctionBegin;
37   if (reuse == MAT_INITIAL_MATRIX) {
38     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
39   }
40 
41   /* Reset the original function pointers. */
42   B->ops->duplicate        = MatDuplicate_SeqAIJ;
43   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
44   B->ops->destroy          = MatDestroy_SeqAIJ;
45   B->ops->mult             = MatMult_SeqAIJ;
46   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
47   B->ops->multadd          = MatMultAdd_SeqAIJ;
48   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
49 
50   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
51    * simply involves destroying the MKL sparse matrix handle.
52    * We don't free the Mat_SeqAIJMKL struct itself, as this will
53    * cause problems later when MatDestroy() tries to free it. */
54 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
55   if (aijmkl->sparse_optimized) {
56     sparse_status_t stat = SPARSE_STATUS_SUCCESS;
57     stat = mkl_sparse_destroy(aijmkl->csrA);
58     if (stat != SPARSE_STATUS_SUCCESS) {
59       PetscFunctionReturn(PETSC_ERR_LIB);
60     }
61   }
62 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
63 
64   /* Change the type of B to MATSEQAIJ. */
65   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
66 
67   *newmat = B;
68   PetscFunctionReturn(0);
69 }
70 
71 #undef __FUNCT__
72 #define __FUNCT__ "MatDestroy_SeqAIJMKL"
73 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
74 {
75   PetscErrorCode ierr;
76   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
77 
78   PetscFunctionBegin;
79   /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
80 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
81   if (aijmkl->sparse_optimized) {
82     sparse_status_t stat = SPARSE_STATUS_SUCCESS;
83     stat = mkl_sparse_destroy(aijmkl->csrA);
84     if (stat != SPARSE_STATUS_SUCCESS) {
85       PetscFunctionReturn(PETSC_ERR_LIB);
86     }
87   }
88 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
89   ierr = PetscFree(A->spptr);CHKERRQ(ierr);
90 
91   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
92    * to destroy everything that remains. */
93   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
94   /* Note that I don't call MatSetType().  I believe this is because that
95    * is only to be called when *building* a matrix.  I could be wrong, but
96    * that is how things work for the SuperLU matrix class. */
97   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
98   PetscFunctionReturn(0);
99 }
100 
101 #undef __FUNCT__
102 #define __FUNCT__ "MatDuplicate_SeqAIJMKL"
103 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
104 {
105   PetscErrorCode ierr;
106   Mat_SeqAIJMKL *aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
107   Mat_SeqAIJMKL *aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
108 
109   PetscFunctionBegin;
110   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
111   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
112   PetscFunctionReturn(0);
113 }
114 
115 #undef __FUNCT__
116 #define __FUNCT__ "MatAssemblyEnd_SeqAIJMKL"
117 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
118 {
119   PetscErrorCode  ierr;
120   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
121   Mat_SeqAIJMKL   *aijmkl;
122 
123   MatScalar       *aa;
124   PetscInt        n;
125   PetscInt        *aj,*ai;
126 
127   PetscFunctionBegin;
128   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
129 
130   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
131    * extra information and some different methods, call the AssemblyEnd
132    * routine for a MATSEQAIJ.
133    * I'm not sure if this is the best way to do this, but it avoids
134    * a lot of code duplication.
135    * I also note that currently MATSEQAIJMKL doesn't know anything about
136    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
137    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
138    * this, this may break things.  (Don't know... haven't looked at it.
139    * Do I need to disable this somehow?) */
140   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
141   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
142 
143   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
144 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
145   if (!aijmkl->no_SpMV2) {
146     sparse_status_t stat = SPARSE_STATUS_SUCCESS;
147     /* Now perform the SpMV2 setup and matrix optimization. */
148     aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
149     aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
150     aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
151     n = A->rmap->n;
152     aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
153     aa   = a->a;  /* Nonzero elements stored row-by-row. */
154     ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
155     stat = mkl_sparse_x_create_csr (&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,n,n,ai,ai+1,aj,aa);
156     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
157     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
158     stat = mkl_sparse_optimize(aijmkl->csrA);
159     if (stat != SPARSE_STATUS_SUCCESS) {
160       PetscFunctionReturn(PETSC_ERR_LIB);
161     }
162     aijmkl->sparse_optimized = PETSC_TRUE;
163   }
164 #endif
165 
166   PetscFunctionReturn(0);
167 }
168 
169 #undef __FUNCT__
170 #define __FUNCT__ "MatMult_SeqAIJMKL"
171 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
172 {
173   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
174   const PetscScalar *x;
175   PetscScalar       *y;
176   const MatScalar   *aa;
177   PetscErrorCode    ierr;
178   PetscInt          m=A->rmap->n;
179   const PetscInt    *aj,*ai;
180 
181   /* Variables not in MatMult_SeqAIJ. */
182   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
183 
184   PetscFunctionBegin;
185   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
186   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
187   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
188   aa   = a->a;  /* Nonzero elements stored row-by-row. */
189   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
190 
191   /* Call MKL sparse BLAS routine to do the MatMult. */
192   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
193 
194   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
195   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
196   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
197   PetscFunctionReturn(0);
198 }
199 
200 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
201 #undef __FUNCT__
202 #define __FUNCT__ "MatMult_SeqAIJMKL_SpMV2"
203 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
204 {
205   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
206   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
207   const PetscScalar *x;
208   PetscScalar       *y;
209   PetscErrorCode    ierr;
210   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
211 
212   PetscFunctionBegin;
213 
214 #ifdef DEBUG
215   printf("DEBUG: In MatMult_SeqAIJMKL_SpMV2\n");
216 #endif
217   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
218   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
219 
220   /* Call MKL SpMV2 executor routine to do the MatMult. */
221   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
222 
223   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
224   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
225   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
226   if (stat != SPARSE_STATUS_SUCCESS) {
227     PetscFunctionReturn(PETSC_ERR_LIB);
228   }
229   PetscFunctionReturn(0);
230 }
231 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
232 
233 #undef __FUNCT__
234 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL"
235 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
236 {
237   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
238   const PetscScalar *x;
239   PetscScalar       *y;
240   const MatScalar   *aa;
241   PetscErrorCode    ierr;
242   PetscInt          m=A->rmap->n;
243   const PetscInt    *aj,*ai;
244 
245   /* Variables not in MatMultTranspose_SeqAIJ. */
246   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
247 
248   PetscFunctionBegin;
249   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
250   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
251   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
252   aa   = a->a;  /* Nonzero elements stored row-by-row. */
253   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
254 
255   /* Call MKL sparse BLAS routine to do the MatMult. */
256   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
257 
258   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
259   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
260   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
261   PetscFunctionReturn(0);
262 }
263 
264 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
265 #undef __FUNCT__
266 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL_SpMV2"
267 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
268 {
269   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
270   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
271   const PetscScalar *x;
272   PetscScalar       *y;
273   PetscErrorCode    ierr;
274   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
275 
276   PetscFunctionBegin;
277 
278 #ifdef DEBUG
279   printf("DEBUG: In MatMultTranspose_SeqAIJMKL_SpMV2\n");
280 #endif
281   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
282   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
283 
284   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
285   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
286 
287   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
288   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
289   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
290   if (stat != SPARSE_STATUS_SUCCESS) {
291     PetscFunctionReturn(PETSC_ERR_LIB);
292   }
293   PetscFunctionReturn(0);
294 }
295 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
296 
297 #undef __FUNCT__
298 #define __FUNCT__ "MatMultAdd_SeqAIJMKL"
299 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
300 {
301   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
302   const PetscScalar *x;
303   PetscScalar       *y,*z;
304   const MatScalar   *aa;
305   PetscErrorCode    ierr;
306   PetscInt          m=A->rmap->n;
307   const PetscInt    *aj,*ai;
308   PetscInt          i;
309 
310   /* Variables not in MatMultAdd_SeqAIJ. */
311   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
312   PetscScalar       alpha = 1.0;
313   PetscScalar       beta = 1.0;
314   char              matdescra[6];
315 
316   PetscFunctionBegin;
317   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
318   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
319 
320   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
321   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
322   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
323   aa   = a->a;  /* Nonzero elements stored row-by-row. */
324   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
325 
326   /* Call MKL sparse BLAS routine to do the MatMult. */
327   if (zz == yy) {
328     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
329     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
330   } else {
331     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
332      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
333     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
334     for (i=0; i<m; i++) {
335       z[i] += y[i];
336     }
337   }
338 
339   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
340   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
341   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
342   PetscFunctionReturn(0);
343 }
344 
345 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
346 #undef __FUNCT__
347 #define __FUNCT__ "MatMultAdd_SeqAIJMKL_SpMV2"
348 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
349 {
350   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
351   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
352   const PetscScalar *x;
353   PetscScalar       *y,*z;
354   PetscErrorCode    ierr;
355   PetscInt          m=A->rmap->n;
356   PetscInt          i;
357 
358   /* Variables not in MatMultAdd_SeqAIJ. */
359   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
360 
361   PetscFunctionBegin;
362 
363 #ifdef DEBUG
364   printf("DEBUG: In MatMultAdd_SeqAIJMKL_SpMV2\n");
365 #endif
366 
367   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
368   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
369 
370   /* Call MKL sparse BLAS routine to do the MatMult. */
371   if (zz == yy) {
372     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
373      * with alpha and beta both set to 1.0. */
374     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
375   } else {
376     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
377      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
378     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
379     for (i=0; i<m; i++) {
380       z[i] += y[i];
381     }
382   }
383 
384   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
385   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
386   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
387   if (stat != SPARSE_STATUS_SUCCESS) {
388     PetscFunctionReturn(PETSC_ERR_LIB);
389   }
390   PetscFunctionReturn(0);
391 }
392 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
393 
394 #undef __FUNCT__
395 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL"
396 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
397 {
398   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
399   const PetscScalar *x;
400   PetscScalar       *y,*z;
401   const MatScalar   *aa;
402   PetscErrorCode    ierr;
403   PetscInt          m=A->rmap->n;
404   const PetscInt    *aj,*ai;
405   PetscInt          i;
406 
407   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
408   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
409   PetscScalar       alpha = 1.0;
410   PetscScalar       beta = 1.0;
411   char              matdescra[6];
412 
413   PetscFunctionBegin;
414   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
415   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
416 
417   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
418   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
419   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
420   aa   = a->a;  /* Nonzero elements stored row-by-row. */
421   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
422 
423   /* Call MKL sparse BLAS routine to do the MatMult. */
424   if (zz == yy) {
425     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
426     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
427   } else {
428     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
429      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
430     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
431     for (i=0; i<m; i++) {
432       z[i] += y[i];
433     }
434   }
435 
436   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
437   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
438   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
439   PetscFunctionReturn(0);
440 }
441 
442 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
443 #undef __FUNCT__
444 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL_SpMV2"
445 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
446 {
447   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
448   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
449   const PetscScalar *x;
450   PetscScalar       *y,*z;
451   PetscErrorCode    ierr;
452   PetscInt          m=A->rmap->n;
453   PetscInt          i;
454 
455   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
456   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
457 
458   PetscFunctionBegin;
459 
460 #ifdef DEBUG
461   printf("DEBUG: In MatMultTransposeAdd_SeqAIJMKL_SpMV2\n");
462 #endif
463 
464   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
465   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
466 
467   /* Call MKL sparse BLAS routine to do the MatMult. */
468   if (zz == yy) {
469     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
470      * with alpha and beta both set to 1.0. */
471     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
472   } else {
473     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
474      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
475     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
476     for (i=0; i<m; i++) {
477       z[i] += y[i];
478     }
479   }
480 
481   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
482   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
483   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
484   if (stat != SPARSE_STATUS_SUCCESS) {
485     PetscFunctionReturn(PETSC_ERR_LIB);
486   }
487   PetscFunctionReturn(0);
488 }
489 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
490 
491 
492 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
493  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
494  * routine, but can also be used to convert an assembled SeqAIJ matrix
495  * into a SeqAIJMKL one. */
496 #undef __FUNCT__
497 #define __FUNCT__ "MatConvert_SeqAIJ_SeqAIJMKL"
498 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
499 {
500   PetscErrorCode ierr;
501   Mat            B = *newmat;
502   Mat_SeqAIJMKL *aijmkl;
503   PetscBool       set;
504 
505   PetscFunctionBegin;
506   if (reuse == MAT_INITIAL_MATRIX) {
507     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
508   }
509 
510   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
511   B->spptr = (void*) aijmkl;
512 
513   /* Set function pointers for methods that we inherit from AIJ but override.
514    * Currently the transposed operations are not being set because I encounter memory corruption
515    * when these are enabled.  Need to look at this with Valgrind or similar. --RTM */
516   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
517   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
518   B->ops->destroy          = MatDestroy_SeqAIJMKL;
519 
520   aijmkl->sparse_optimized = PETSC_FALSE;
521 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
522   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
523 #elif
524   aijmkl->no_SpMV2 = PETSC_TRUE;
525 #endif
526 
527   /* Parse command line options. */
528   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
529   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
530   ierr = PetscOptionsEnd();CHKERRQ(ierr);
531 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
532   if(!aijmkl->no_SpMV2) {
533     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
534     aijmkl->no_SpMV2 = PETSC_TRUE;
535   }
536 #endif
537 
538   if(!aijmkl->no_SpMV2) {
539 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
540     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
541     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2; */
542     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
543     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2; */
544 #endif
545   } else {
546     B->ops->mult             = MatMult_SeqAIJMKL;
547     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL; */
548     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
549     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL; */
550   }
551 
552   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
553 
554   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
555   *newmat = B;
556   PetscFunctionReturn(0);
557 }
558 
559 #undef __FUNCT__
560 #define __FUNCT__ "MatCreateSeqAIJMKL"
561 /*@C
562    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
563    This type inherits from AIJ and is largely identical, but uses sparse BLAS
564    routines from Intel MKL whenever possible.
565    Collective on MPI_Comm
566 
567    Input Parameters:
568 +  comm - MPI communicator, set to PETSC_COMM_SELF
569 .  m - number of rows
570 .  n - number of columns
571 .  nz - number of nonzeros per row (same for all rows)
572 -  nnz - array containing the number of nonzeros in the various rows
573          (possibly different for each row) or NULL
574 
575    Output Parameter:
576 .  A - the matrix
577 
578    Notes:
579    If nnz is given then nz is ignored
580 
581    Level: intermediate
582 
583 .keywords: matrix, cray, sparse, parallel
584 
585 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
586 @*/
587 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
588 {
589   PetscErrorCode ierr;
590 
591   PetscFunctionBegin;
592   ierr = MatCreate(comm,A);CHKERRQ(ierr);
593   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
594   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
595   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
596   PetscFunctionReturn(0);
597 }
598 
599 #undef __FUNCT__
600 #define __FUNCT__ "MatCreate_SeqAIJMKL"
601 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
602 {
603   PetscErrorCode ierr;
604 
605   PetscFunctionBegin;
606   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
607   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
608   PetscFunctionReturn(0);
609 }
610