xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision 9b6041cc2373647947d4eddddc70abc040a88f7b)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
18 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
19   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
20   struct matrix_descr descr;
21 #endif
22 } Mat_SeqAIJMKL;
23 
24 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
25 
26 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
27 {
28   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
29   /* so we will ignore 'MatType type'. */
30   PetscErrorCode ierr;
31   Mat            B       = *newmat;
32   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
33 
34   PetscFunctionBegin;
35   if (reuse == MAT_INITIAL_MATRIX) {
36     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
37     aijmkl = (Mat_SeqAIJMKL*)B->spptr;
38   }
39 
40   /* Reset the original function pointers. */
41   B->ops->duplicate        = MatDuplicate_SeqAIJ;
42   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
43   B->ops->destroy          = MatDestroy_SeqAIJ;
44   B->ops->mult             = MatMult_SeqAIJ;
45   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
46   B->ops->multadd          = MatMultAdd_SeqAIJ;
47   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
48 
49   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
50   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
51   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
52   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
53 
54   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
55    * simply involves destroying the MKL sparse matrix handle and then freeing
56    * the spptr pointer. */
57 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
58   if (aijmkl->sparse_optimized) {
59     sparse_status_t stat;
60     stat = mkl_sparse_destroy(aijmkl->csrA);
61     if (stat != SPARSE_STATUS_SUCCESS) {
62       PetscFunctionReturn(PETSC_ERR_LIB);
63     }
64   }
65 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
66   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
67 
68   /* Change the type of B to MATSEQAIJ. */
69   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
70 
71   *newmat = B;
72   PetscFunctionReturn(0);
73 }
74 
75 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
76 {
77   PetscErrorCode ierr;
78   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
79 
80   PetscFunctionBegin;
81 
82   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
83    * spptr pointer. */
84   if (aijmkl) {
85     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
86 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
87     if (aijmkl->sparse_optimized) {
88       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
89       stat = mkl_sparse_destroy(aijmkl->csrA);
90       if (stat != SPARSE_STATUS_SUCCESS) {
91         PetscFunctionReturn(PETSC_ERR_LIB);
92       }
93     }
94 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
95     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
96   }
97 
98   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
99    * to destroy everything that remains. */
100   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
101   /* Note that I don't call MatSetType().  I believe this is because that
102    * is only to be called when *building* a matrix.  I could be wrong, but
103    * that is how things work for the SuperLU matrix class. */
104   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
105   PetscFunctionReturn(0);
106 }
107 
108 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
109 {
110   PetscErrorCode ierr;
111   Mat_SeqAIJMKL *aijmkl;
112   Mat_SeqAIJMKL *aijmkl_dest;
113 
114   PetscFunctionBegin;
115   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
116   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
117   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
118   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
119   aijmkl_dest->sparse_optimized = PETSC_FALSE;
120 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
121   aijmkl_dest->csrA = NULL;
122   if (!aijmkl->no_SpMV2 && A->cmap->n > 0) {
123     sparse_status_t stat;
124     stat = mkl_sparse_copy(aijmkl->csrA,aijmkl->descr,&aijmkl_dest->csrA);
125     if (stat != SPARSE_STATUS_SUCCESS) {
126       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_copy");
127       PetscFunctionReturn(PETSC_ERR_LIB);
128     }
129     stat = mkl_sparse_optimize(aijmkl_dest->csrA);
130     if (stat != SPARSE_STATUS_SUCCESS) {
131       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize");
132       PetscFunctionReturn(PETSC_ERR_LIB);
133     }
134     aijmkl_dest->sparse_optimized = PETSC_TRUE;
135   }
136 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
137   PetscFunctionReturn(0);
138 }
139 
140 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
141 {
142   PetscErrorCode  ierr;
143   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
144   Mat_SeqAIJMKL   *aijmkl;
145 
146   MatScalar       *aa;
147   PetscInt        m,n;
148   PetscInt        *aj,*ai;
149 
150   PetscFunctionBegin;
151   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
152 
153   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
154    * extra information and some different methods, call the AssemblyEnd
155    * routine for a MATSEQAIJ.
156    * I'm not sure if this is the best way to do this, but it avoids
157    * a lot of code duplication.
158    * I also note that currently MATSEQAIJMKL doesn't know anything about
159    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
160    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
161    * this, this may break things.  (Don't know... haven't looked at it.
162    * Do I need to disable this somehow?) */
163   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
164   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
165 
166   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
167 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
168   if (!aijmkl->no_SpMV2) {
169     sparse_status_t stat;
170     if (aijmkl->sparse_optimized) {
171       /* Matrix has been previously assembled and optimized. Must destroy old
172        * matrix handle before running the optimization step again. */
173       sparse_status_t stat;
174       stat = mkl_sparse_destroy(aijmkl->csrA);
175       if (stat != SPARSE_STATUS_SUCCESS) {
176         PetscFunctionReturn(PETSC_ERR_LIB);
177       }
178     }
179     /* Now perform the SpMV2 setup and matrix optimization. */
180     aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
181     aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
182     aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
183     m = A->rmap->n;
184     n = A->cmap->n;
185     aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
186     aa   = a->a;  /* Nonzero elements stored row-by-row. */
187     ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
188     if (n>0) {
189       stat = mkl_sparse_x_create_csr (&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
190       stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
191       stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
192       stat = mkl_sparse_optimize(aijmkl->csrA);
193       if (stat != SPARSE_STATUS_SUCCESS) {
194         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
195         PetscFunctionReturn(PETSC_ERR_LIB);
196       }
197       aijmkl->sparse_optimized = PETSC_TRUE;
198     }
199   }
200 #endif
201 
202   PetscFunctionReturn(0);
203 }
204 
205 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
206 {
207   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
208   const PetscScalar *x;
209   PetscScalar       *y;
210   const MatScalar   *aa;
211   PetscErrorCode    ierr;
212   PetscInt          m=A->rmap->n;
213   const PetscInt    *aj,*ai;
214 
215   /* Variables not in MatMult_SeqAIJ. */
216   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
217 
218   PetscFunctionBegin;
219   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
220   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
221   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
222   aa   = a->a;  /* Nonzero elements stored row-by-row. */
223   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
224 
225   /* Call MKL sparse BLAS routine to do the MatMult. */
226   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
227 
228   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
229   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
230   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
231   PetscFunctionReturn(0);
232 }
233 
234 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
235 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
236 {
237   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
238   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
239   const PetscScalar *x;
240   PetscScalar       *y;
241   PetscErrorCode    ierr;
242   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
243 
244   PetscFunctionBegin;
245 
246   /* If there are no rows, this is a no-op: return immediately. */
247   if(A->cmap->n < 1) PetscFunctionReturn(0);
248 
249   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
250   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
251 
252   /* Call MKL SpMV2 executor routine to do the MatMult. */
253   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
254 
255   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
256   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
257   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
258   if (stat != SPARSE_STATUS_SUCCESS) {
259     PetscFunctionReturn(PETSC_ERR_LIB);
260   }
261   PetscFunctionReturn(0);
262 }
263 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
264 
265 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
266 {
267   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
268   const PetscScalar *x;
269   PetscScalar       *y;
270   const MatScalar   *aa;
271   PetscErrorCode    ierr;
272   PetscInt          m=A->rmap->n;
273   const PetscInt    *aj,*ai;
274 
275   /* Variables not in MatMultTranspose_SeqAIJ. */
276   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
277 
278   PetscFunctionBegin;
279   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
280   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
281   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
282   aa   = a->a;  /* Nonzero elements stored row-by-row. */
283   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
284 
285   /* Call MKL sparse BLAS routine to do the MatMult. */
286   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
287 
288   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
289   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
290   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
291   PetscFunctionReturn(0);
292 }
293 
294 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
295 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
296 {
297   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
298   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
299   const PetscScalar *x;
300   PetscScalar       *y;
301   PetscErrorCode    ierr;
302   sparse_status_t   stat;
303 
304   PetscFunctionBegin;
305 
306   /* If there are no rows, this is a no-op: return immediately. */
307   if(A->cmap->n < 1) PetscFunctionReturn(0);
308 
309   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
310   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
311 
312   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
313   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
314 
315   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
316   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
317   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
318   if (stat != SPARSE_STATUS_SUCCESS) {
319     PetscFunctionReturn(PETSC_ERR_LIB);
320   }
321   PetscFunctionReturn(0);
322 }
323 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
324 
325 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
326 {
327   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
328   const PetscScalar *x;
329   PetscScalar       *y,*z;
330   const MatScalar   *aa;
331   PetscErrorCode    ierr;
332   PetscInt          m=A->rmap->n;
333   const PetscInt    *aj,*ai;
334   PetscInt          i;
335 
336   /* Variables not in MatMultAdd_SeqAIJ. */
337   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
338   PetscScalar       alpha = 1.0;
339   PetscScalar       beta = 1.0;
340   char              matdescra[6];
341 
342   PetscFunctionBegin;
343   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
344   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
345 
346   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
347   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
348   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
349   aa   = a->a;  /* Nonzero elements stored row-by-row. */
350   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
351 
352   /* Call MKL sparse BLAS routine to do the MatMult. */
353   if (zz == yy) {
354     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
355     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
356   } else {
357     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
358      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
359     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
360     for (i=0; i<m; i++) {
361       z[i] += y[i];
362     }
363   }
364 
365   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
366   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
367   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
368   PetscFunctionReturn(0);
369 }
370 
371 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
372 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
373 {
374   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
375   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
376   const PetscScalar *x;
377   PetscScalar       *y,*z;
378   PetscErrorCode    ierr;
379   PetscInt          m=A->rmap->n;
380   PetscInt          i;
381 
382   /* Variables not in MatMultAdd_SeqAIJ. */
383   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
384 
385   PetscFunctionBegin;
386 
387   /* If there are no rows, this is a no-op: return immediately. */
388   if(A->cmap->n < 1) PetscFunctionReturn(0);
389 
390   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
391   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
392 
393   /* Call MKL sparse BLAS routine to do the MatMult. */
394   if (zz == yy) {
395     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
396      * with alpha and beta both set to 1.0. */
397     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
398   } else {
399     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
400      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
401     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
402     for (i=0; i<m; i++) {
403       z[i] += y[i];
404     }
405   }
406 
407   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
408   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
409   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
410   if (stat != SPARSE_STATUS_SUCCESS) {
411     PetscFunctionReturn(PETSC_ERR_LIB);
412   }
413   PetscFunctionReturn(0);
414 }
415 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
416 
417 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
418 {
419   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
420   const PetscScalar *x;
421   PetscScalar       *y,*z;
422   const MatScalar   *aa;
423   PetscErrorCode    ierr;
424   PetscInt          m=A->rmap->n;
425   const PetscInt    *aj,*ai;
426   PetscInt          i;
427 
428   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
429   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
430   PetscScalar       alpha = 1.0;
431   PetscScalar       beta = 1.0;
432   char              matdescra[6];
433 
434   PetscFunctionBegin;
435   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
436   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
437 
438   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
439   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
440   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
441   aa   = a->a;  /* Nonzero elements stored row-by-row. */
442   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
443 
444   /* Call MKL sparse BLAS routine to do the MatMult. */
445   if (zz == yy) {
446     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
447     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
448   } else {
449     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
450      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
451     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
452     for (i=0; i<m; i++) {
453       z[i] += y[i];
454     }
455   }
456 
457   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
458   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
459   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
460   PetscFunctionReturn(0);
461 }
462 
463 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
464 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
465 {
466   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
467   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
468   const PetscScalar *x;
469   PetscScalar       *y,*z;
470   PetscErrorCode    ierr;
471   PetscInt          m=A->rmap->n;
472   PetscInt          i;
473 
474   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
475   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
476 
477   PetscFunctionBegin;
478 
479   /* If there are no rows, this is a no-op: return immediately. */
480   if(A->cmap->n < 1) PetscFunctionReturn(0);
481 
482   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
483   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
484 
485   /* Call MKL sparse BLAS routine to do the MatMult. */
486   if (zz == yy) {
487     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
488      * with alpha and beta both set to 1.0. */
489     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
490   } else {
491     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
492      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
493     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
494     for (i=0; i<m; i++) {
495       z[i] += y[i];
496     }
497   }
498 
499   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
500   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
501   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
502   if (stat != SPARSE_STATUS_SUCCESS) {
503     PetscFunctionReturn(PETSC_ERR_LIB);
504   }
505   PetscFunctionReturn(0);
506 }
507 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
508 
509 
510 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
511  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
512  * routine, but can also be used to convert an assembled SeqAIJ matrix
513  * into a SeqAIJMKL one. */
514 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
515 {
516   PetscErrorCode ierr;
517   Mat            B = *newmat;
518   Mat_SeqAIJMKL  *aijmkl;
519   PetscBool      set;
520   PetscBool      sametype;
521 
522   PetscFunctionBegin;
523   if (reuse == MAT_INITIAL_MATRIX) {
524     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
525   }
526 
527   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
528   if (sametype) PetscFunctionReturn(0);
529 
530   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
531   B->spptr = (void*) aijmkl;
532 
533   /* Set function pointers for methods that we inherit from AIJ but override.
534    * We also parse some command line options below, since those determine some of the methods we point to.
535    * Note: Currently the transposed operations are not being set because I encounter memory corruption
536    * when these are enabled.  Need to look at this with Valgrind or similar. --RTM */
537   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
538   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
539   B->ops->destroy          = MatDestroy_SeqAIJMKL;
540 
541   aijmkl->sparse_optimized = PETSC_FALSE;
542 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
543   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
544 #elif
545   aijmkl->no_SpMV2 = PETSC_TRUE;
546 #endif
547 
548   /* Parse command line options. */
549   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
550   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
551   ierr = PetscOptionsEnd();CHKERRQ(ierr);
552 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
553   if(!aijmkl->no_SpMV2) {
554     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
555     aijmkl->no_SpMV2 = PETSC_TRUE;
556   }
557 #endif
558 
559   if(!aijmkl->no_SpMV2) {
560 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
561     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
562     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2; */
563     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
564     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2; */
565 #endif
566   } else {
567     B->ops->mult             = MatMult_SeqAIJMKL;
568     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL; */
569     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
570     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL; */
571   }
572 
573   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
574   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
575   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
576   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
577 
578   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
579   *newmat = B;
580   PetscFunctionReturn(0);
581 }
582 
583 /*@C
584    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
585    This type inherits from AIJ and is largely identical, but uses sparse BLAS
586    routines from Intel MKL whenever possible.
587    Collective on MPI_Comm
588 
589    Input Parameters:
590 +  comm - MPI communicator, set to PETSC_COMM_SELF
591 .  m - number of rows
592 .  n - number of columns
593 .  nz - number of nonzeros per row (same for all rows)
594 -  nnz - array containing the number of nonzeros in the various rows
595          (possibly different for each row) or NULL
596 
597    Output Parameter:
598 .  A - the matrix
599 
600    Notes:
601    If nnz is given then nz is ignored
602 
603    Level: intermediate
604 
605 .keywords: matrix, cray, sparse, parallel
606 
607 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
608 @*/
609 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
610 {
611   PetscErrorCode ierr;
612 
613   PetscFunctionBegin;
614   ierr = MatCreate(comm,A);CHKERRQ(ierr);
615   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
616   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
617   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
618   PetscFunctionReturn(0);
619 }
620 
621 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
622 {
623   PetscErrorCode ierr;
624 
625   PetscFunctionBegin;
626   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
627   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
628   PetscFunctionReturn(0);
629 }
630