xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision db63039fb01984a1f226b30ca0b0b82e7facf18c)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
18 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
19   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
20   struct matrix_descr descr;
21 #endif
22 } Mat_SeqAIJMKL;
23 
24 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
25 
26 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
27 {
28   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
29   /* so we will ignore 'MatType type'. */
30   PetscErrorCode ierr;
31   Mat            B       = *newmat;
32   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
33 
34   PetscFunctionBegin;
35   if (reuse == MAT_INITIAL_MATRIX) {
36     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
37     aijmkl = (Mat_SeqAIJMKL*)B->spptr;
38   }
39 
40   /* Reset the original function pointers. */
41   B->ops->duplicate        = MatDuplicate_SeqAIJ;
42   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
43   B->ops->destroy          = MatDestroy_SeqAIJ;
44   B->ops->mult             = MatMult_SeqAIJ;
45   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
46   B->ops->multadd          = MatMultAdd_SeqAIJ;
47   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
48 
49   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
50   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
51   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
52   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
53 
54   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
55    * simply involves destroying the MKL sparse matrix handle and then freeing
56    * the spptr pointer. */
57 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
58   if (aijmkl->sparse_optimized) {
59     sparse_status_t stat;
60     stat = mkl_sparse_destroy(aijmkl->csrA);
61     if (stat != SPARSE_STATUS_SUCCESS) {
62       PetscFunctionReturn(PETSC_ERR_LIB);
63     }
64   }
65 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
66   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
67 
68   /* Change the type of B to MATSEQAIJ. */
69   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
70 
71   *newmat = B;
72   PetscFunctionReturn(0);
73 }
74 
75 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
76 {
77   PetscErrorCode ierr;
78   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
79 
80   PetscFunctionBegin;
81 
82   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
83    * spptr pointer. */
84   if (aijmkl) {
85     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
86 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
87     if (aijmkl->sparse_optimized) {
88       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
89       stat = mkl_sparse_destroy(aijmkl->csrA);
90       if (stat != SPARSE_STATUS_SUCCESS) {
91         PetscFunctionReturn(PETSC_ERR_LIB);
92       }
93     }
94 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
95     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
96   }
97 
98   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
99    * to destroy everything that remains. */
100   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
101   /* Note that I don't call MatSetType().  I believe this is because that
102    * is only to be called when *building* a matrix.  I could be wrong, but
103    * that is how things work for the SuperLU matrix class. */
104   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
105   PetscFunctionReturn(0);
106 }
107 
108 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
109 {
110   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
111   Mat_SeqAIJMKL   *aijmkl;
112   PetscInt        m,n;
113   MatScalar       *aa;
114   PetscInt        *aj,*ai;
115 
116 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
117   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
118    * does nothing. We make it callable anyway in this case because it cuts
119    * down on littering the code with #ifdefs. */
120   PetscFunctionReturn(0);
121 #else
122 
123   sparse_status_t stat;
124 
125   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
126 
127   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
128 
129   if (aijmkl->sparse_optimized) {
130     /* Matrix has been previously assembled and optimized. Must destroy old
131      * matrix handle before running the optimization step again. */
132     stat = mkl_sparse_destroy(aijmkl->csrA);
133     if (stat != SPARSE_STATUS_SUCCESS) {
134       PetscFunctionReturn(PETSC_ERR_LIB);
135     }
136   }
137 
138   /* Now perform the SpMV2 setup and matrix optimization. */
139   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
140   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
141   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
142   m = A->rmap->n;
143   n = A->cmap->n;
144   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
145   aa   = a->a;  /* Nonzero elements stored row-by-row. */
146   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
147   if (n>0) {
148     stat = mkl_sparse_x_create_csr (&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
149     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
150     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
151     stat = mkl_sparse_optimize(aijmkl->csrA);
152     if (stat != SPARSE_STATUS_SUCCESS) {
153       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
154       PetscFunctionReturn(PETSC_ERR_LIB);
155     }
156     aijmkl->sparse_optimized = PETSC_TRUE;
157   }
158 
159   PetscFunctionReturn(0);
160 #endif
161 }
162 
163 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
164 {
165   PetscErrorCode ierr;
166   Mat_SeqAIJMKL *aijmkl;
167   Mat_SeqAIJMKL *aijmkl_dest;
168 
169   PetscFunctionBegin;
170   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
171   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
172   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
173   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
174   aijmkl_dest->sparse_optimized = PETSC_FALSE;
175   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
176   PetscFunctionReturn(0);
177 }
178 
179 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
180 {
181   PetscErrorCode  ierr;
182   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
183 
184   PetscFunctionBegin;
185   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
186 
187   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
188    * extra information and some different methods, call the AssemblyEnd
189    * routine for a MATSEQAIJ.
190    * I'm not sure if this is the best way to do this, but it avoids
191    * a lot of code duplication.
192    * I also note that currently MATSEQAIJMKL doesn't know anything about
193    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
194    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
195    * this, this may break things.  (Don't know... haven't looked at it.
196    * Do I need to disable this somehow?) */
197   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
198   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
199 
200   /* Now create the MKL sparse handle (if needed; the function checks). */
201   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
202 
203   PetscFunctionReturn(0);
204 }
205 
206 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
207 {
208   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
209   const PetscScalar *x;
210   PetscScalar       *y;
211   const MatScalar   *aa;
212   PetscErrorCode    ierr;
213   PetscInt          m=A->rmap->n;
214   PetscInt          n=A->cmap->n;
215   PetscScalar       alpha = 1.0;
216   PetscScalar       beta = 0.0;
217   const PetscInt    *aj,*ai;
218   char              matdescra[6];
219 
220 
221   /* Variables not in MatMult_SeqAIJ. */
222   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
223 
224   PetscFunctionBegin;
225   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
226   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
227   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
228   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
229   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
230   aa   = a->a;  /* Nonzero elements stored row-by-row. */
231   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
232 
233   /* Call MKL sparse BLAS routine to do the MatMult. */
234   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
235 
236   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
237   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
238   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
243 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
244 {
245   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
246   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
247   const PetscScalar *x;
248   PetscScalar       *y;
249   PetscErrorCode    ierr;
250   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
251 
252   PetscFunctionBegin;
253 
254   /* If there are no rows, this is a no-op: return immediately. */
255   if(A->cmap->n < 1) PetscFunctionReturn(0);
256 
257   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
258   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
259 
260   /* Call MKL SpMV2 executor routine to do the MatMult. */
261   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
262 
263   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
264   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
265   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
266   if (stat != SPARSE_STATUS_SUCCESS) {
267     PetscFunctionReturn(PETSC_ERR_LIB);
268   }
269   PetscFunctionReturn(0);
270 }
271 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
272 
273 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
274 {
275   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
276   const PetscScalar *x;
277   PetscScalar       *y;
278   const MatScalar   *aa;
279   PetscErrorCode    ierr;
280   PetscInt          m=A->rmap->n;
281   PetscInt          n=A->cmap->n;
282   PetscScalar       alpha = 1.0;
283   PetscScalar       beta = 0.0;
284   const PetscInt    *aj,*ai;
285   char              matdescra[6];
286 
287   /* Variables not in MatMultTranspose_SeqAIJ. */
288   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
289 
290   PetscFunctionBegin;
291   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
292   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
293   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
294   aa   = a->a;  /* Nonzero elements stored row-by-row. */
295   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
296 
297   /* Call MKL sparse BLAS routine to do the MatMult. */
298   mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
299 
300   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
301   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
302   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
303   PetscFunctionReturn(0);
304 }
305 
306 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
307 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
308 {
309   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
310   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
311   const PetscScalar *x;
312   PetscScalar       *y;
313   PetscErrorCode    ierr;
314   sparse_status_t   stat;
315 
316   PetscFunctionBegin;
317 
318   /* If there are no rows, this is a no-op: return immediately. */
319   if(A->cmap->n < 1) PetscFunctionReturn(0);
320 
321   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
322   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
323 
324   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
325   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
326 
327   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
328   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
329   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
330   if (stat != SPARSE_STATUS_SUCCESS) {
331     PetscFunctionReturn(PETSC_ERR_LIB);
332   }
333   PetscFunctionReturn(0);
334 }
335 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
336 
337 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
338 {
339   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
340   const PetscScalar *x;
341   PetscScalar       *y,*z;
342   const MatScalar   *aa;
343   PetscErrorCode    ierr;
344   PetscInt          m=A->rmap->n;
345   PetscInt          n=A->cmap->n;
346   const PetscInt    *aj,*ai;
347   PetscInt          i;
348 
349   /* Variables not in MatMultAdd_SeqAIJ. */
350   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
351   PetscScalar       alpha = 1.0;
352   PetscScalar       beta;
353   char              matdescra[6];
354 
355   PetscFunctionBegin;
356   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
357   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
358 
359   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
360   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
361   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
362   aa   = a->a;  /* Nonzero elements stored row-by-row. */
363   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
364 
365   /* Call MKL sparse BLAS routine to do the MatMult. */
366   if (zz == yy) {
367     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
368     beta = 1.0;
369     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
370   } else {
371     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
372      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
373     beta = 0.0;
374     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
375     for (i=0; i<m; i++) {
376       z[i] += y[i];
377     }
378   }
379 
380   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
381   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
382   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
387 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
388 {
389   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
390   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
391   const PetscScalar *x;
392   PetscScalar       *y,*z;
393   PetscErrorCode    ierr;
394   PetscInt          m=A->rmap->n;
395   PetscInt          i;
396 
397   /* Variables not in MatMultAdd_SeqAIJ. */
398   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
399 
400   PetscFunctionBegin;
401 
402   /* If there are no rows, this is a no-op: return immediately. */
403   if(A->cmap->n < 1) PetscFunctionReturn(0);
404 
405   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
406   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
407 
408   /* Call MKL sparse BLAS routine to do the MatMult. */
409   if (zz == yy) {
410     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
411      * with alpha and beta both set to 1.0. */
412     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
413   } else {
414     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
415      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
416     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
417     for (i=0; i<m; i++) {
418       z[i] += y[i];
419     }
420   }
421 
422   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
423   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
424   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
425   if (stat != SPARSE_STATUS_SUCCESS) {
426     PetscFunctionReturn(PETSC_ERR_LIB);
427   }
428   PetscFunctionReturn(0);
429 }
430 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
431 
432 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
433 {
434   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
435   const PetscScalar *x;
436   PetscScalar       *y,*z;
437   const MatScalar   *aa;
438   PetscErrorCode    ierr;
439   PetscInt          m=A->rmap->n;
440   PetscInt          n=A->cmap->n;
441   const PetscInt    *aj,*ai;
442   PetscInt          i;
443 
444   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
445   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
446   PetscScalar       alpha = 1.0;
447   PetscScalar       beta;
448   char              matdescra[6];
449 
450   PetscFunctionBegin;
451   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
452   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
453 
454   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
455   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
456   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
457   aa   = a->a;  /* Nonzero elements stored row-by-row. */
458   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
459 
460   /* Call MKL sparse BLAS routine to do the MatMult. */
461   if (zz == yy) {
462     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
463     beta = 1.0;
464     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
465   } else {
466     /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta=0, then add the result to z.
467      * MKL sparse BLAS does not have a MatMultAdd equivalent. */
468     beta = 0.0;
469     mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
470     for (i=0; i<m; i++) {
471       z[i] += y[i];
472     }
473   }
474 
475   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
476   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
477   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
478   PetscFunctionReturn(0);
479 }
480 
481 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
482 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
483 {
484   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
485   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
486   const PetscScalar *x;
487   PetscScalar       *y,*z;
488   PetscErrorCode    ierr;
489   PetscInt          m=A->rmap->n;
490   PetscInt          i;
491 
492   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
493   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
494 
495   PetscFunctionBegin;
496 
497   /* If there are no rows, this is a no-op: return immediately. */
498   if(A->cmap->n < 1) PetscFunctionReturn(0);
499 
500   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
501   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
502 
503   /* Call MKL sparse BLAS routine to do the MatMult. */
504   if (zz == yy) {
505     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
506      * with alpha and beta both set to 1.0. */
507     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
508   } else {
509     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
510      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
511     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
512     for (i=0; i<m; i++) {
513       z[i] += y[i];
514     }
515   }
516 
517   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
518   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
519   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
520   if (stat != SPARSE_STATUS_SUCCESS) {
521     PetscFunctionReturn(PETSC_ERR_LIB);
522   }
523   PetscFunctionReturn(0);
524 }
525 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
526 
527 PETSC_INTERN PetscErrorCode MatScale_SeqAIJMKL(Mat inA,PetscScalar alpha)
528 {
529   PetscErrorCode ierr;
530 
531   ierr = MatScale_SeqAIJ(inA,alpha);CHKERRQ(ierr);
532   ierr = MatSeqAIJMKL_create_mkl_handle(inA);CHKERRQ(ierr);
533   PetscFunctionReturn(0);
534 }
535 
536 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
537  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
538  * routine, but can also be used to convert an assembled SeqAIJ matrix
539  * into a SeqAIJMKL one. */
540 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
541 {
542   PetscErrorCode ierr;
543   Mat            B = *newmat;
544   Mat_SeqAIJMKL  *aijmkl;
545   PetscBool      set;
546   PetscBool      sametype;
547 
548   PetscFunctionBegin;
549   if (reuse == MAT_INITIAL_MATRIX) {
550     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
551   }
552 
553   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
554   if (sametype) PetscFunctionReturn(0);
555 
556   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
557   B->spptr = (void*) aijmkl;
558 
559   /* Set function pointers for methods that we inherit from AIJ but override.
560    * We also parse some command line options below, since those determine some of the methods we point to.
561    * Note: Currently the transposed operations are not being set because I encounter memory corruption
562    * when these are enabled.  Need to look at this with Valgrind or similar. --RTM */
563   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
564   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
565   B->ops->destroy          = MatDestroy_SeqAIJMKL;
566 
567   aijmkl->sparse_optimized = PETSC_FALSE;
568 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
569   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
570 #elif
571   aijmkl->no_SpMV2 = PETSC_TRUE;
572 #endif
573 
574   /* Parse command line options. */
575   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
576   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
577   ierr = PetscOptionsEnd();CHKERRQ(ierr);
578 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
579   if(!aijmkl->no_SpMV2) {
580     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
581     aijmkl->no_SpMV2 = PETSC_TRUE;
582   }
583 #endif
584 
585   if(!aijmkl->no_SpMV2) {
586 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
587     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
588     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2; */
589     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
590     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2; */
591 #endif
592   } else {
593     B->ops->mult             = MatMult_SeqAIJMKL;
594     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL; */
595     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
596     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL; */
597   }
598 
599   B->ops->scale = MatScale_SeqAIJMKL;
600 
601   ierr = PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqAIJMKL_C",MatScale_SeqAIJMKL);CHKERRQ(ierr);
602   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
603   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
604   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
605   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
606 
607   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
608   *newmat = B;
609   PetscFunctionReturn(0);
610 }
611 
612 /*@C
613    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
614    This type inherits from AIJ and is largely identical, but uses sparse BLAS
615    routines from Intel MKL whenever possible.
616    Collective on MPI_Comm
617 
618    Input Parameters:
619 +  comm - MPI communicator, set to PETSC_COMM_SELF
620 .  m - number of rows
621 .  n - number of columns
622 .  nz - number of nonzeros per row (same for all rows)
623 -  nnz - array containing the number of nonzeros in the various rows
624          (possibly different for each row) or NULL
625 
626    Output Parameter:
627 .  A - the matrix
628 
629    Notes:
630    If nnz is given then nz is ignored
631 
632    Level: intermediate
633 
634 .keywords: matrix, cray, sparse, parallel
635 
636 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
637 @*/
638 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
639 {
640   PetscErrorCode ierr;
641 
642   PetscFunctionBegin;
643   ierr = MatCreate(comm,A);CHKERRQ(ierr);
644   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
645   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
646   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
647   PetscFunctionReturn(0);
648 }
649 
650 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
651 {
652   PetscErrorCode ierr;
653 
654   PetscFunctionBegin;
655   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
656   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
657   PetscFunctionReturn(0);
658 }
659