xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision 0632b357eb3bdb30d1732d9f6ccb5de673198085)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
18 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
19   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
20   struct matrix_descr descr;
21 #endif
22 } Mat_SeqAIJMKL;
23 
24 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
25 
26 #undef __FUNCT__
27 #define __FUNCT__ "MatConvert_SeqAIJMKL_SeqAIJ"
28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
29 {
30   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
31   /* so we will ignore 'MatType type'. */
32   PetscErrorCode ierr;
33   Mat            B       = *newmat;
34   Mat_SeqAIJMKL *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
35 
36   PetscFunctionBegin;
37   if (reuse == MAT_INITIAL_MATRIX) {
38     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
39   }
40 
41   /* Reset the original function pointers. */
42   B->ops->duplicate        = MatDuplicate_SeqAIJ;
43   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
44   B->ops->destroy          = MatDestroy_SeqAIJ;
45   B->ops->mult             = MatMult_SeqAIJ;
46   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
47   B->ops->multadd          = MatMultAdd_SeqAIJ;
48   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
49 
50   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
51    * simply involves destroying the MKL sparse matrix handle.
52    * We don't free the Mat_SeqAIJMKL struct itself, as this will
53    * cause problems later when MatDestroy() tries to free it. */
54 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
55   if (aijmkl->sparse_optimized) {
56     sparse_status_t stat;
57     stat = mkl_sparse_destroy(aijmkl->csrA);
58     if (stat != SPARSE_STATUS_SUCCESS) {
59       PetscFunctionReturn(PETSC_ERR_LIB);
60     }
61   }
62 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
63 
64   /* Change the type of B to MATSEQAIJ. */
65   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
66 
67   *newmat = B;
68   PetscFunctionReturn(0);
69 }
70 
71 #undef __FUNCT__
72 #define __FUNCT__ "MatDestroy_SeqAIJMKL"
73 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
74 {
75   PetscErrorCode ierr;
76   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
77 
78   PetscFunctionBegin;
79   /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
80 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
81   if (aijmkl->sparse_optimized) {
82     sparse_status_t stat = SPARSE_STATUS_SUCCESS;
83     stat = mkl_sparse_destroy(aijmkl->csrA);
84     if (stat != SPARSE_STATUS_SUCCESS) {
85       PetscFunctionReturn(PETSC_ERR_LIB);
86     }
87   }
88 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
89   ierr = PetscFree(A->spptr);CHKERRQ(ierr);
90 
91   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
92    * to destroy everything that remains. */
93   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
94   /* Note that I don't call MatSetType().  I believe this is because that
95    * is only to be called when *building* a matrix.  I could be wrong, but
96    * that is how things work for the SuperLU matrix class. */
97   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
98   PetscFunctionReturn(0);
99 }
100 
101 #undef __FUNCT__
102 #define __FUNCT__ "MatDuplicate_SeqAIJMKL"
103 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
104 {
105   PetscErrorCode ierr;
106   Mat_SeqAIJMKL *aijmkl;
107   Mat_SeqAIJMKL *aijmkl_dest;
108 
109   PetscFunctionBegin;
110   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
111   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
112   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
113   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
114   aijmkl_dest->sparse_optimized = PETSC_FALSE;
115 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
116   aijmkl_dest->csrA = NULL;
117   if (!aijmkl->no_SpMV2) {
118     sparse_status_t stat;
119     stat = mkl_sparse_copy(aijmkl->csrA,aijmkl->descr,&aijmkl_dest->csrA);
120     stat = mkl_sparse_optimize(aijmkl_dest->csrA);
121     if (stat != SPARSE_STATUS_SUCCESS) {
122       PetscFunctionReturn(PETSC_ERR_LIB);
123     }
124     aijmkl_dest->sparse_optimized = PETSC_TRUE;
125   }
126 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
127   PetscFunctionReturn(0);
128 }
129 
130 #undef __FUNCT__
131 #define __FUNCT__ "MatAssemblyEnd_SeqAIJMKL"
132 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
133 {
134   PetscErrorCode  ierr;
135   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
136   Mat_SeqAIJMKL   *aijmkl;
137 
138   MatScalar       *aa;
139   PetscInt        n;
140   PetscInt        *aj,*ai;
141 
142   PetscFunctionBegin;
143   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
144 
145   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
146    * extra information and some different methods, call the AssemblyEnd
147    * routine for a MATSEQAIJ.
148    * I'm not sure if this is the best way to do this, but it avoids
149    * a lot of code duplication.
150    * I also note that currently MATSEQAIJMKL doesn't know anything about
151    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
152    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
153    * this, this may break things.  (Don't know... haven't looked at it.
154    * Do I need to disable this somehow?) */
155   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
156   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
157 
158   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
159 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
160   if (!aijmkl->no_SpMV2) {
161     sparse_status_t stat;
162     if (aijmkl->sparse_optimized) {
163       /* Matrix has been previously assembled and optimized. Must destroy old
164        * matrix handle before running the optimization step again. */
165       sparse_status_t stat;
166       stat = mkl_sparse_destroy(aijmkl->csrA);
167       if (stat != SPARSE_STATUS_SUCCESS) {
168         PetscFunctionReturn(PETSC_ERR_LIB);
169       }
170     }
171     /* Now perform the SpMV2 setup and matrix optimization. */
172     aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
173     aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
174     aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
175     n = A->rmap->n;
176     aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
177     aa   = a->a;  /* Nonzero elements stored row-by-row. */
178     ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
179     stat = mkl_sparse_x_create_csr (&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,n,n,ai,ai+1,aj,aa);
180     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
181     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
182     stat = mkl_sparse_optimize(aijmkl->csrA);
183     if (stat != SPARSE_STATUS_SUCCESS) {
184       PetscFunctionReturn(PETSC_ERR_LIB);
185     }
186     aijmkl->sparse_optimized = PETSC_TRUE;
187   }
188 #endif
189 
190   PetscFunctionReturn(0);
191 }
192 
193 #undef __FUNCT__
194 #define __FUNCT__ "MatMult_SeqAIJMKL"
195 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
196 {
197   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
198   const PetscScalar *x;
199   PetscScalar       *y;
200   const MatScalar   *aa;
201   PetscErrorCode    ierr;
202   PetscInt          m=A->rmap->n;
203   const PetscInt    *aj,*ai;
204 
205   /* Variables not in MatMult_SeqAIJ. */
206   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
207 
208   PetscFunctionBegin;
209   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
210   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
211   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
212   aa   = a->a;  /* Nonzero elements stored row-by-row. */
213   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
214 
215   /* Call MKL sparse BLAS routine to do the MatMult. */
216   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
217 
218   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
219   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
220   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
221   PetscFunctionReturn(0);
222 }
223 
224 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
225 #undef __FUNCT__
226 #define __FUNCT__ "MatMult_SeqAIJMKL_SpMV2"
227 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
228 {
229   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
230   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
231   const PetscScalar *x;
232   PetscScalar       *y;
233   PetscErrorCode    ierr;
234   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
235 
236   PetscFunctionBegin;
237 
238 #ifdef DEBUG
239   printf("DEBUG: In MatMult_SeqAIJMKL_SpMV2\n");
240 #endif
241   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
242   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
243 
244   /* Call MKL SpMV2 executor routine to do the MatMult. */
245   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
246 
247   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
248   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
249   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
250   if (stat != SPARSE_STATUS_SUCCESS) {
251     PetscFunctionReturn(PETSC_ERR_LIB);
252   }
253   PetscFunctionReturn(0);
254 }
255 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
256 
257 #undef __FUNCT__
258 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL"
259 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
260 {
261   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
262   const PetscScalar *x;
263   PetscScalar       *y;
264   const MatScalar   *aa;
265   PetscErrorCode    ierr;
266   PetscInt          m=A->rmap->n;
267   const PetscInt    *aj,*ai;
268 
269   /* Variables not in MatMultTranspose_SeqAIJ. */
270   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
271 
272   PetscFunctionBegin;
273   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
274   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
275   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
276   aa   = a->a;  /* Nonzero elements stored row-by-row. */
277   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
278 
279   /* Call MKL sparse BLAS routine to do the MatMult. */
280   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
281 
282   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
283   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
284   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
285   PetscFunctionReturn(0);
286 }
287 
288 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
289 #undef __FUNCT__
290 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL_SpMV2"
291 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
292 {
293   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
294   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
295   const PetscScalar *x;
296   PetscScalar       *y;
297   PetscErrorCode    ierr;
298   sparse_status_t   stat;
299 
300   PetscFunctionBegin;
301 
302 #ifdef DEBUG
303   printf("DEBUG: In MatMultTranspose_SeqAIJMKL_SpMV2\n");
304 #endif
305   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
306   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
307 
308   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
309   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
310 
311   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
312   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
313   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
314   if (stat != SPARSE_STATUS_SUCCESS) {
315     PetscFunctionReturn(PETSC_ERR_LIB);
316   }
317   PetscFunctionReturn(0);
318 }
319 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
320 
321 #undef __FUNCT__
322 #define __FUNCT__ "MatMultAdd_SeqAIJMKL"
323 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
324 {
325   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
326   const PetscScalar *x;
327   PetscScalar       *y,*z;
328   const MatScalar   *aa;
329   PetscErrorCode    ierr;
330   PetscInt          m=A->rmap->n;
331   const PetscInt    *aj,*ai;
332   PetscInt          i;
333 
334   /* Variables not in MatMultAdd_SeqAIJ. */
335   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
336   PetscScalar       alpha = 1.0;
337   PetscScalar       beta = 1.0;
338   char              matdescra[6];
339 
340   PetscFunctionBegin;
341   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
342   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
343 
344   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
345   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
346   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
347   aa   = a->a;  /* Nonzero elements stored row-by-row. */
348   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
349 
350   /* Call MKL sparse BLAS routine to do the MatMult. */
351   if (zz == yy) {
352     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
353     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
354   } else {
355     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
356      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
357     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
358     for (i=0; i<m; i++) {
359       z[i] += y[i];
360     }
361   }
362 
363   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
364   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
365   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
366   PetscFunctionReturn(0);
367 }
368 
369 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
370 #undef __FUNCT__
371 #define __FUNCT__ "MatMultAdd_SeqAIJMKL_SpMV2"
372 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
373 {
374   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
375   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
376   const PetscScalar *x;
377   PetscScalar       *y,*z;
378   PetscErrorCode    ierr;
379   PetscInt          m=A->rmap->n;
380   PetscInt          i;
381 
382   /* Variables not in MatMultAdd_SeqAIJ. */
383   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
384 
385   PetscFunctionBegin;
386 
387 #ifdef DEBUG
388   printf("DEBUG: In MatMultAdd_SeqAIJMKL_SpMV2\n");
389 #endif
390 
391   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
392   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
393 
394   /* Call MKL sparse BLAS routine to do the MatMult. */
395   if (zz == yy) {
396     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
397      * with alpha and beta both set to 1.0. */
398     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
399   } else {
400     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
401      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
402     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
403     for (i=0; i<m; i++) {
404       z[i] += y[i];
405     }
406   }
407 
408   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
409   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
411   if (stat != SPARSE_STATUS_SUCCESS) {
412     PetscFunctionReturn(PETSC_ERR_LIB);
413   }
414   PetscFunctionReturn(0);
415 }
416 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
417 
418 #undef __FUNCT__
419 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL"
420 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
421 {
422   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
423   const PetscScalar *x;
424   PetscScalar       *y,*z;
425   const MatScalar   *aa;
426   PetscErrorCode    ierr;
427   PetscInt          m=A->rmap->n;
428   const PetscInt    *aj,*ai;
429   PetscInt          i;
430 
431   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
432   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
433   PetscScalar       alpha = 1.0;
434   PetscScalar       beta = 1.0;
435   char              matdescra[6];
436 
437   PetscFunctionBegin;
438   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
439   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
440 
441   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
442   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
443   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
444   aa   = a->a;  /* Nonzero elements stored row-by-row. */
445   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
446 
447   /* Call MKL sparse BLAS routine to do the MatMult. */
448   if (zz == yy) {
449     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
450     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
451   } else {
452     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
453      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
454     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
455     for (i=0; i<m; i++) {
456       z[i] += y[i];
457     }
458   }
459 
460   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
461   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
462   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
463   PetscFunctionReturn(0);
464 }
465 
466 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
467 #undef __FUNCT__
468 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL_SpMV2"
469 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
470 {
471   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
472   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
473   const PetscScalar *x;
474   PetscScalar       *y,*z;
475   PetscErrorCode    ierr;
476   PetscInt          m=A->rmap->n;
477   PetscInt          i;
478 
479   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
480   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
481 
482   PetscFunctionBegin;
483 
484 #ifdef DEBUG
485   printf("DEBUG: In MatMultTransposeAdd_SeqAIJMKL_SpMV2\n");
486 #endif
487 
488   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
489   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
490 
491   /* Call MKL sparse BLAS routine to do the MatMult. */
492   if (zz == yy) {
493     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
494      * with alpha and beta both set to 1.0. */
495     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
496   } else {
497     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
498      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
499     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
500     for (i=0; i<m; i++) {
501       z[i] += y[i];
502     }
503   }
504 
505   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
506   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
507   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
508   if (stat != SPARSE_STATUS_SUCCESS) {
509     PetscFunctionReturn(PETSC_ERR_LIB);
510   }
511   PetscFunctionReturn(0);
512 }
513 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
514 
515 
516 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
517  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
518  * routine, but can also be used to convert an assembled SeqAIJ matrix
519  * into a SeqAIJMKL one. */
520 #undef __FUNCT__
521 #define __FUNCT__ "MatConvert_SeqAIJ_SeqAIJMKL"
522 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
523 {
524   PetscErrorCode ierr;
525   Mat            B = *newmat;
526   Mat_SeqAIJMKL *aijmkl;
527   PetscBool       set;
528 
529   PetscFunctionBegin;
530   if (reuse == MAT_INITIAL_MATRIX) {
531     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
532   }
533 
534   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
535   B->spptr = (void*) aijmkl;
536 
537   /* Set function pointers for methods that we inherit from AIJ but override.
538    * Currently the transposed operations are not being set because I encounter memory corruption
539    * when these are enabled.  Need to look at this with Valgrind or similar. --RTM */
540   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
541   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
542   B->ops->destroy          = MatDestroy_SeqAIJMKL;
543 
544   aijmkl->sparse_optimized = PETSC_FALSE;
545 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
546   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
547 #elif
548   aijmkl->no_SpMV2 = PETSC_TRUE;
549 #endif
550 
551   /* Parse command line options. */
552   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
553   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
554   ierr = PetscOptionsEnd();CHKERRQ(ierr);
555 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
556   if(!aijmkl->no_SpMV2) {
557     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
558     aijmkl->no_SpMV2 = PETSC_TRUE;
559   }
560 #endif
561 
562   if(!aijmkl->no_SpMV2) {
563 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
564     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
565     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2; */
566     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
567     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2; */
568 #endif
569   } else {
570     B->ops->mult             = MatMult_SeqAIJMKL;
571     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL; */
572     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
573     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL; */
574   }
575 
576   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
577 
578   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
579   *newmat = B;
580   PetscFunctionReturn(0);
581 }
582 
583 #undef __FUNCT__
584 #define __FUNCT__ "MatCreateSeqAIJMKL"
585 /*@C
586    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
587    This type inherits from AIJ and is largely identical, but uses sparse BLAS
588    routines from Intel MKL whenever possible.
589    Collective on MPI_Comm
590 
591    Input Parameters:
592 +  comm - MPI communicator, set to PETSC_COMM_SELF
593 .  m - number of rows
594 .  n - number of columns
595 .  nz - number of nonzeros per row (same for all rows)
596 -  nnz - array containing the number of nonzeros in the various rows
597          (possibly different for each row) or NULL
598 
599    Output Parameter:
600 .  A - the matrix
601 
602    Notes:
603    If nnz is given then nz is ignored
604 
605    Level: intermediate
606 
607 .keywords: matrix, cray, sparse, parallel
608 
609 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
610 @*/
611 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
612 {
613   PetscErrorCode ierr;
614 
615   PetscFunctionBegin;
616   ierr = MatCreate(comm,A);CHKERRQ(ierr);
617   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
618   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
619   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
620   PetscFunctionReturn(0);
621 }
622 
623 #undef __FUNCT__
624 #define __FUNCT__ "MatCreate_SeqAIJMKL"
625 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
626 {
627   PetscErrorCode ierr;
628 
629   PetscFunctionBegin;
630   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
631   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
632   PetscFunctionReturn(0);
633 }
634