xref: /petsc/src/mat/impls/aij/seq/aijmkl/aijmkl.c (revision 6e369cd56c19f466a165a36e147eb4df4120aa3d)
1 /*
2   Defines basic operations for the MATSEQAIJMKL matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but uses
5   sparse BLAS operations from the Intel Math Kernel Library (MKL)
6   wherever possible.
7 */
8 
9 #include <../src/mat/impls/aij/seq/aij.h>
10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>
11 
12 /* MKL include files. */
13 #include <mkl_spblas.h>  /* Sparse BLAS */
14 
15 typedef struct {
16   PetscBool no_SpMV2;  /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
17   PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
18 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
19   sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */
20   struct matrix_descr descr;
21 #endif
22 } Mat_SeqAIJMKL;
23 
24 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
25 
26 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
27 {
28   /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */
29   /* so we will ignore 'MatType type'. */
30   PetscErrorCode ierr;
31   Mat            B       = *newmat;
32   Mat_SeqAIJMKL  *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
33 
34   PetscFunctionBegin;
35   if (reuse == MAT_INITIAL_MATRIX) {
36     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
37     aijmkl = (Mat_SeqAIJMKL*)B->spptr;
38   }
39 
40   /* Reset the original function pointers. */
41   B->ops->duplicate        = MatDuplicate_SeqAIJ;
42   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
43   B->ops->destroy          = MatDestroy_SeqAIJ;
44   B->ops->mult             = MatMult_SeqAIJ;
45   B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
46   B->ops->multadd          = MatMultAdd_SeqAIJ;
47   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
48 
49   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr);
50   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
51   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
52   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr);
53 
54   /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
55    * simply involves destroying the MKL sparse matrix handle and then freeing
56    * the spptr pointer. */
57 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
58   if (aijmkl->sparse_optimized) {
59     sparse_status_t stat;
60     stat = mkl_sparse_destroy(aijmkl->csrA);
61     if (stat != SPARSE_STATUS_SUCCESS) {
62       PetscFunctionReturn(PETSC_ERR_LIB);
63     }
64   }
65 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
66   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
67 
68   /* Change the type of B to MATSEQAIJ. */
69   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
70 
71   *newmat = B;
72   PetscFunctionReturn(0);
73 }
74 
75 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
76 {
77   PetscErrorCode ierr;
78   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr;
79 
80   PetscFunctionBegin;
81 
82   /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
83    * spptr pointer. */
84   if (aijmkl) {
85     /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
86 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
87     if (aijmkl->sparse_optimized) {
88       sparse_status_t stat = SPARSE_STATUS_SUCCESS;
89       stat = mkl_sparse_destroy(aijmkl->csrA);
90       if (stat != SPARSE_STATUS_SUCCESS) {
91         PetscFunctionReturn(PETSC_ERR_LIB);
92       }
93     }
94 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
95     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
96   }
97 
98   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
99    * to destroy everything that remains. */
100   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
101   /* Note that I don't call MatSetType().  I believe this is because that
102    * is only to be called when *building* a matrix.  I could be wrong, but
103    * that is how things work for the SuperLU matrix class. */
104   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
105   PetscFunctionReturn(0);
106 }
107 
108 PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
109 {
110   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
111   Mat_SeqAIJMKL   *aijmkl;
112   PetscInt        m,n;
113   MatScalar       *aa;
114   PetscInt        *aj,*ai;
115 
116 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
117   /* If the MKL library does not have mkl_sparse_optimize(), then this routine
118    * does nothing. We make it callable anyway in this case because it cuts
119    * down on littering the code with #ifdefs. */
120   PetscFunctionReturn(0);
121 #else
122 
123   sparse_status_t stat;
124 
125   aijmkl = (Mat_SeqAIJMKL*) A->spptr;
126 
127   if (aijmkl->no_SpMV2) PetscFunctionReturn(0);
128 
129   if (aijmkl->sparse_optimized) {
130     /* Matrix has been previously assembled and optimized. Must destroy old
131      * matrix handle before running the optimization step again. */
132     stat = mkl_sparse_destroy(aijmkl->csrA);
133     if (stat != SPARSE_STATUS_SUCCESS) {
134       PetscFunctionReturn(PETSC_ERR_LIB);
135     }
136   }
137 
138   /* Now perform the SpMV2 setup and matrix optimization. */
139   aijmkl->descr.type        = SPARSE_MATRIX_TYPE_GENERAL;
140   aijmkl->descr.mode        = SPARSE_FILL_MODE_LOWER;
141   aijmkl->descr.diag        = SPARSE_DIAG_NON_UNIT;
142   m = A->rmap->n;
143   n = A->cmap->n;
144   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
145   aa   = a->a;  /* Nonzero elements stored row-by-row. */
146   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
147   if (n>0) {
148     stat = mkl_sparse_x_create_csr (&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
149     stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
150     stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
151     stat = mkl_sparse_optimize(aijmkl->csrA);
152     if (stat != SPARSE_STATUS_SUCCESS) {
153       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
154       PetscFunctionReturn(PETSC_ERR_LIB);
155     }
156     aijmkl->sparse_optimized = PETSC_TRUE;
157   }
158 
159   PetscFunctionReturn(0);
160 #endif
161 }
162 
163 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
164 {
165   PetscErrorCode ierr;
166   Mat_SeqAIJMKL *aijmkl;
167   Mat_SeqAIJMKL *aijmkl_dest;
168 
169   PetscFunctionBegin;
170   ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
171   aijmkl      = (Mat_SeqAIJMKL*) A->spptr;
172   aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr;
173   ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr);
174   aijmkl_dest->sparse_optimized = PETSC_FALSE;
175   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
176   PetscFunctionReturn(0);
177 }
178 
179 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
180 {
181   PetscErrorCode  ierr;
182   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
183 
184   PetscFunctionBegin;
185   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
186 
187   /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
188    * extra information and some different methods, call the AssemblyEnd
189    * routine for a MATSEQAIJ.
190    * I'm not sure if this is the best way to do this, but it avoids
191    * a lot of code duplication.
192    * I also note that currently MATSEQAIJMKL doesn't know anything about
193    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
194    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
195    * this, this may break things.  (Don't know... haven't looked at it.
196    * Do I need to disable this somehow?) */
197   a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
198   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
199 
200   /* Now create the MKL sparse handle (if needed; the function checks). */
201   ierr = MatSeqAIJMKL_create_mkl_handle(A);CHKERRQ(ierr);
202 
203   PetscFunctionReturn(0);
204 }
205 
206 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
207 {
208   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
209   const PetscScalar *x;
210   PetscScalar       *y;
211   const MatScalar   *aa;
212   PetscErrorCode    ierr;
213   PetscInt          m=A->rmap->n;
214   const PetscInt    *aj,*ai;
215 
216   /* Variables not in MatMult_SeqAIJ. */
217   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
218 
219   PetscFunctionBegin;
220   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
221   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
222   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
223   aa   = a->a;  /* Nonzero elements stored row-by-row. */
224   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
225 
226   /* Call MKL sparse BLAS routine to do the MatMult. */
227   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
228 
229   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
230   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
231   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
232   PetscFunctionReturn(0);
233 }
234 
235 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
236 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
237 {
238   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
239   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
240   const PetscScalar *x;
241   PetscScalar       *y;
242   PetscErrorCode    ierr;
243   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
244 
245   PetscFunctionBegin;
246 
247   /* If there are no rows, this is a no-op: return immediately. */
248   if(A->cmap->n < 1) PetscFunctionReturn(0);
249 
250   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
251   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
252 
253   /* Call MKL SpMV2 executor routine to do the MatMult. */
254   stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
255 
256   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
257   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
258   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
259   if (stat != SPARSE_STATUS_SUCCESS) {
260     PetscFunctionReturn(PETSC_ERR_LIB);
261   }
262   PetscFunctionReturn(0);
263 }
264 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
265 
266 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
267 {
268   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
269   const PetscScalar *x;
270   PetscScalar       *y;
271   const MatScalar   *aa;
272   PetscErrorCode    ierr;
273   PetscInt          m=A->rmap->n;
274   const PetscInt    *aj,*ai;
275 
276   /* Variables not in MatMultTranspose_SeqAIJ. */
277   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
278 
279   PetscFunctionBegin;
280   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
281   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
282   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
283   aa   = a->a;  /* Nonzero elements stored row-by-row. */
284   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
285 
286   /* Call MKL sparse BLAS routine to do the MatMult. */
287   mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y);
288 
289   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
290   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
291   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
292   PetscFunctionReturn(0);
293 }
294 
295 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
296 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
297 {
298   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
299   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
300   const PetscScalar *x;
301   PetscScalar       *y;
302   PetscErrorCode    ierr;
303   sparse_status_t   stat;
304 
305   PetscFunctionBegin;
306 
307   /* If there are no rows, this is a no-op: return immediately. */
308   if(A->cmap->n < 1) PetscFunctionReturn(0);
309 
310   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
311   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
312 
313   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
314   stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
315 
316   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
317   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
318   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
319   if (stat != SPARSE_STATUS_SUCCESS) {
320     PetscFunctionReturn(PETSC_ERR_LIB);
321   }
322   PetscFunctionReturn(0);
323 }
324 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
325 
326 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
327 {
328   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
329   const PetscScalar *x;
330   PetscScalar       *y,*z;
331   const MatScalar   *aa;
332   PetscErrorCode    ierr;
333   PetscInt          m=A->rmap->n;
334   const PetscInt    *aj,*ai;
335   PetscInt          i;
336 
337   /* Variables not in MatMultAdd_SeqAIJ. */
338   char transa = 'n';  /* Used to indicate to MKL that we are not computing the transpose product. */
339   PetscScalar       alpha = 1.0;
340   PetscScalar       beta = 1.0;
341   char              matdescra[6];
342 
343   PetscFunctionBegin;
344   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
345   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
346 
347   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
348   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
349   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
350   aa   = a->a;  /* Nonzero elements stored row-by-row. */
351   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
352 
353   /* Call MKL sparse BLAS routine to do the MatMult. */
354   if (zz == yy) {
355     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
356     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
357   } else {
358     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
359      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
360     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
361     for (i=0; i<m; i++) {
362       z[i] += y[i];
363     }
364   }
365 
366   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
367   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
368   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
369   PetscFunctionReturn(0);
370 }
371 
372 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
373 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
374 {
375   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
376   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
377   const PetscScalar *x;
378   PetscScalar       *y,*z;
379   PetscErrorCode    ierr;
380   PetscInt          m=A->rmap->n;
381   PetscInt          i;
382 
383   /* Variables not in MatMultAdd_SeqAIJ. */
384   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
385 
386   PetscFunctionBegin;
387 
388   /* If there are no rows, this is a no-op: return immediately. */
389   if(A->cmap->n < 1) PetscFunctionReturn(0);
390 
391   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
392   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
393 
394   /* Call MKL sparse BLAS routine to do the MatMult. */
395   if (zz == yy) {
396     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
397      * with alpha and beta both set to 1.0. */
398     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
399   } else {
400     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
401      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
402     stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
403     for (i=0; i<m; i++) {
404       z[i] += y[i];
405     }
406   }
407 
408   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
409   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
411   if (stat != SPARSE_STATUS_SUCCESS) {
412     PetscFunctionReturn(PETSC_ERR_LIB);
413   }
414   PetscFunctionReturn(0);
415 }
416 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
417 
418 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
419 {
420   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
421   const PetscScalar *x;
422   PetscScalar       *y,*z;
423   const MatScalar   *aa;
424   PetscErrorCode    ierr;
425   PetscInt          m=A->rmap->n;
426   const PetscInt    *aj,*ai;
427   PetscInt          i;
428 
429   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
430   char transa = 't';  /* Used to indicate to MKL that we are computing the transpose product. */
431   PetscScalar       alpha = 1.0;
432   PetscScalar       beta = 1.0;
433   char              matdescra[6];
434 
435   PetscFunctionBegin;
436   matdescra[0] = 'g';  /* Indicates to MKL that we using a general CSR matrix. */
437   matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
438 
439   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
440   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
441   aj   = a->j;  /* aj[k] gives column index for element aa[k]. */
442   aa   = a->a;  /* Nonzero elements stored row-by-row. */
443   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
444 
445   /* Call MKL sparse BLAS routine to do the MatMult. */
446   if (zz == yy) {
447     /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */
448     mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
449   } else {
450     /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then
451      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
452     mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z);
453     for (i=0; i<m; i++) {
454       z[i] += y[i];
455     }
456   }
457 
458   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
459   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
460   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
461   PetscFunctionReturn(0);
462 }
463 
464 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
465 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
466 {
467   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
468   Mat_SeqAIJMKL     *aijmkl=(Mat_SeqAIJMKL*)A->spptr;
469   const PetscScalar *x;
470   PetscScalar       *y,*z;
471   PetscErrorCode    ierr;
472   PetscInt          m=A->rmap->n;
473   PetscInt          i;
474 
475   /* Variables not in MatMultTransposeAdd_SeqAIJ. */
476   sparse_status_t stat = SPARSE_STATUS_SUCCESS;
477 
478   PetscFunctionBegin;
479 
480   /* If there are no rows, this is a no-op: return immediately. */
481   if(A->cmap->n < 1) PetscFunctionReturn(0);
482 
483   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
484   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
485 
486   /* Call MKL sparse BLAS routine to do the MatMult. */
487   if (zz == yy) {
488     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
489      * with alpha and beta both set to 1.0. */
490     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y);
491   } else {
492     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
493      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
494     stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);
495     for (i=0; i<m; i++) {
496       z[i] += y[i];
497     }
498   }
499 
500   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
501   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
502   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
503   if (stat != SPARSE_STATUS_SUCCESS) {
504     PetscFunctionReturn(PETSC_ERR_LIB);
505   }
506   PetscFunctionReturn(0);
507 }
508 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
509 
510 
511 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a
512  * SeqAIJMKL matrix.  This routine is called by the MatCreate_SeqMKLAIJ()
513  * routine, but can also be used to convert an assembled SeqAIJ matrix
514  * into a SeqAIJMKL one. */
515 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
516 {
517   PetscErrorCode ierr;
518   Mat            B = *newmat;
519   Mat_SeqAIJMKL  *aijmkl;
520   PetscBool      set;
521   PetscBool      sametype;
522 
523   PetscFunctionBegin;
524   if (reuse == MAT_INITIAL_MATRIX) {
525     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
526   }
527 
528   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
529   if (sametype) PetscFunctionReturn(0);
530 
531   ierr     = PetscNewLog(B,&aijmkl);CHKERRQ(ierr);
532   B->spptr = (void*) aijmkl;
533 
534   /* Set function pointers for methods that we inherit from AIJ but override.
535    * We also parse some command line options below, since those determine some of the methods we point to.
536    * Note: Currently the transposed operations are not being set because I encounter memory corruption
537    * when these are enabled.  Need to look at this with Valgrind or similar. --RTM */
538   B->ops->duplicate        = MatDuplicate_SeqAIJMKL;
539   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJMKL;
540   B->ops->destroy          = MatDestroy_SeqAIJMKL;
541 
542   aijmkl->sparse_optimized = PETSC_FALSE;
543 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
544   aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
545 #elif
546   aijmkl->no_SpMV2 = PETSC_TRUE;
547 #endif
548 
549   /* Parse command line options. */
550   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr);
551   ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr);
552   ierr = PetscOptionsEnd();CHKERRQ(ierr);
553 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
554   if(!aijmkl->no_SpMV2) {
555     ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize();  defaulting to non-SpMV2 routines.\n");
556     aijmkl->no_SpMV2 = PETSC_TRUE;
557   }
558 #endif
559 
560   if(!aijmkl->no_SpMV2) {
561 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
562     B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
563     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2; */
564     B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
565     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2; */
566 #endif
567   } else {
568     B->ops->mult             = MatMult_SeqAIJMKL;
569     /* B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL; */
570     B->ops->multadd          = MatMultAdd_SeqAIJMKL;
571     /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL; */
572   }
573 
574   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr);
575   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
576   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
577   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
578 
579   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr);
580   *newmat = B;
581   PetscFunctionReturn(0);
582 }
583 
584 /*@C
585    MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
586    This type inherits from AIJ and is largely identical, but uses sparse BLAS
587    routines from Intel MKL whenever possible.
588    Collective on MPI_Comm
589 
590    Input Parameters:
591 +  comm - MPI communicator, set to PETSC_COMM_SELF
592 .  m - number of rows
593 .  n - number of columns
594 .  nz - number of nonzeros per row (same for all rows)
595 -  nnz - array containing the number of nonzeros in the various rows
596          (possibly different for each row) or NULL
597 
598    Output Parameter:
599 .  A - the matrix
600 
601    Notes:
602    If nnz is given then nz is ignored
603 
604    Level: intermediate
605 
606 .keywords: matrix, cray, sparse, parallel
607 
608 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
609 @*/
610 PetscErrorCode  MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
611 {
612   PetscErrorCode ierr;
613 
614   PetscFunctionBegin;
615   ierr = MatCreate(comm,A);CHKERRQ(ierr);
616   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
617   ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr);
618   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
619   PetscFunctionReturn(0);
620 }
621 
622 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
623 {
624   PetscErrorCode ierr;
625 
626   PetscFunctionBegin;
627   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
628   ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
629   PetscFunctionReturn(0);
630 }
631